forked from doomdagadiggiedahdah/SimpleStories
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathprocess_dataset.py
33 lines (27 loc) · 984 Bytes
/
process_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# Marked for deletion: This only contains code that is not needed for a well-prompted run.
import json
import os
def separate_stories(stories, end_string, *, min_length=5, verbose=True):
    """Split entries whose "story" text holds several stories into one entry each.

    Each entry's "story" string is split on ``end_string``. Every fragment
    that is long enough becomes a new dict: a copy of the entry with "story"
    replaced by that fragment. The input entries are not modified.

    Args:
        stories: Iterable of dicts, each containing a "story" string.
        end_string: Delimiter that marks the end of one story.
        min_length: A fragment is kept only if it has more than this many
            characters once surrounding whitespace is ignored.
        verbose: When true, print the running output count and the source
            entry for every entry whose story contained ``end_string``.

    Returns:
        A list of new entry dicts, in input order.

    Raises:
        KeyError: If an entry has no "story" key.
    """
    out = []
    for entry in stories:
        story = entry["story"]
        # Measure the stripped text so that whitespace-only padding around
        # the delimiter (e.g. a run of trailing newlines) is not emitted as
        # a story. The kept fragment itself is stored unmodified.
        out.extend(
            entry | {"story": fragment}
            for fragment in story.split(end_string)
            if len(fragment.strip()) > min_length
        )
        if verbose and end_string in story:
            print(len(out))
            print(entry)
    return out
# Input JSONL file. Each non-blank line is parsed as JSON and passed to
# separate_stories, which iterates it and reads a "story" key from every item.
filename = r"D:\simple_stories_generate\data\stories-claude-3-5-sonnet-20240620-2024-08-25-18-06-54.jsonl"

# Build "<root>_processed<ext>" with splitext rather than splitting on ".",
# so a dot anywhere else in the path cannot truncate the output name and a
# path without an extension does not raise.
root, ext = os.path.splitext(filename)
new_filename = f"{root}_processed{ext}"

print(filename)

# The output is opened once in "w" mode: that discards the result of any
# previous run (no separate delete step needed) and avoids reopening the file
# for every story. The encoding is stated explicitly so reading does not
# depend on the platform's locale default.
# NOTE(review): assumes the input file is UTF-8 (or pure ASCII) - confirm.
with open(filename, encoding="utf-8") as fp, \
        open(new_filename, "w", encoding="utf-8") as f:
    for line in fp:
        # Skip blank lines (e.g. a trailing newline at end of file), which
        # json.loads would reject.
        if not line.strip():
            continue
        data = json.loads(line)
        out = separate_stories(data, "THE END")
        f.writelines(json.dumps(new_line) + "\n" for new_line in out)