-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfilter_text.py
171 lines (135 loc) · 4.28 KB
/
filter_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
""" Given a JSON file with transcript information, this tool can perform
manipulations including generating word lists.
Optionally provide the output JSON file name with -j.
Usage: python filter_text.py sample.json wordlist.txt
"""
import argparse
import json
import string
import os
import re
def save_wordlist(wordlist, filename):
    """ Write a list of strings to a UTF-8 text file, one word per line.

    wordlist: iterable of strings to write, in the order given.
    filename: path of the file to create or overwrite.

    If the file cannot be opened or written, a message is printed and the
    program is terminated by raising SystemExit with status 1.
    """
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            f.writelines(word + '\n' for word in wordlist)
    except OSError:
        # Only I/O failures are reported here; a bare except would also
        # swallow KeyboardInterrupt and genuine programming errors.
        print("Could not write out to file " + filename)
        # Non-zero status so a calling shell or script can detect the failure.
        raise SystemExit(1)
def extract_wordlist(data):
    """ Collect the distinct words found in the utterances' transcripts.

    Each utterance's 'transcript' value is split on whitespace; the unique
    words across all utterances are returned as a sorted list of strings.
    """
    vocabulary = set()
    for utterance in data:
        vocabulary.update(utterance.get('transcript').split())
    return sorted(vocabulary)
def filter_data(data):
    """ Given a data object remove any transcriptions with undesirable features

    data: iterable of dicts, each holding a string under the 'transcript' key.

    For every utterance the transcript is lowercased and split on whitespace,
    then:
      * an utterance whose whole transcript is a special case ("<silence>")
        is dropped;
      * everything from the "@ENG@" translation marker onwards is discarded;
      * an utterance containing a word that mixes digits with other
        characters is dropped entirely (purely numeric words are kept);
      * punctuation and typographic marks are stripped from each word, and
        words that become empty are omitted;
      * an utterance with no words left is dropped.

    Returns a new list of the surviving utterance dicts. The surviving dicts
    are the same objects as in the input, with 'transcript' overwritten by
    the cleaned text.
    """
    # ASCII punctuation plus the typographic marks seen in the transcripts.
    to_remove = string.punctuation + "…" + "’" + "“" + "–" + "”" + "‘"
    strip_table = str.maketrans('', '', to_remove)
    special_cases = ["<silence>"]
    # The transcript is lowercased before tokenising, so the marker has to be
    # matched in lowercase; comparing against "@ENG@" would never succeed.
    translation_marker = '@eng@'

    cleaned_data = []
    for utt in data:
        trans = utt.get('transcript').lower()
        if trans in special_cases:
            continue  # Ignore

        clean_words = []
        valid_utterance = True
        for word in trans.split():
            # Words after the marker are a translation; keep only what
            # came before it.
            if word == translation_marker:
                break
            # If partial digit, throw out whole utterance
            if re.search(r'\d', word) and not word.isdigit():
                valid_utterance = False
                break
            # Remove punctuation and bad chars in a single pass
            word = word.translate(strip_table)
            # A word made only of punctuation is now empty; skipping it
            # avoids doubled spaces in the joined transcript.
            if word:
                clean_words.append(word)

        if not valid_utterance or not clean_words:
            continue

        # Should be a clean valid utterance
        utt['transcript'] = ' '.join(clean_words)
        cleaned_data.append(utt)
    return cleaned_data
def _filter_data(data):
    """ Deprecated transcript filter; every call raises.

    The function is retired: it raises Exception("Deprecated") before doing
    any work, so ``data`` is never read or modified. The former in-place
    implementation that followed the raise was unreachable and has been
    removed. filter_data in this module performs the transcript cleaning.

    Raises:
        Exception: always, with the message "Deprecated".
    """
    raise Exception("Deprecated")
def load_file(filename):
    """ Given a filename load and return the object

    filename: path of a UTF-8 encoded JSON file.

    Returns the object decoded by json.load.

    If the file cannot be opened, decoded as UTF-8 or parsed as JSON, a
    message is printed and the program is terminated by raising SystemExit
    with status 1.
    """
    try:
        with open(filename, "r", encoding="utf-8") as f:
            return json.load(f)
    except (OSError, ValueError):
        # OSError: file missing or unreadable. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        print("Could not read file " + filename)
        # Non-zero status so a calling shell or script can detect the failure.
        raise SystemExit(1)
def write_json(data, filename):
    """ Write a data object in json format

    data: object to serialise with json.dump.
    filename: path of the file to create or overwrite (UTF-8).

    Non-ASCII characters are written as-is (ensure_ascii=False) and the
    output is indented by four spaces.

    If the file cannot be written or the data cannot be serialised, a
    message is printed and the program is terminated by raising SystemExit
    with status 1.
    """
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4)
    except (OSError, TypeError, ValueError):
        # OSError: the file could not be opened or written. TypeError and
        # ValueError: json.dump rejected the data (e.g. an unserialisable
        # object or a circular reference).
        print("Could not write out json file " + filename)
        # Non-zero status so a calling shell or script can detect the failure.
        raise SystemExit(1)
def main():
    """ Command-line entry point: clean a transcript file and write results.

    Positional arguments are the input JSON file and the word list output
    file. The cleaned JSON is written to the name given with -j/--jsonOutput,
    or by default to the input name with a trailing ".json" replaced by
    "_clean.json" (for example sample.json -> sample_clean.json).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("infile", type=str,
                        help="The input file to clean.")
    parser.add_argument("wordlistfile", type=str,
                        help="Output word list.")
    parser.add_argument("-j", "--jsonOutput", type=str,
                        help="Name of json file to use for cleaned data")
    args = parser.parse_args()

    if args.jsonOutput:
        json_outfile = args.jsonOutput
    else:
        # Remove the ".json" suffix explicitly. str.rstrip('.json') strips
        # any trailing run of the characters '.', 'j', 's', 'o', 'n', which
        # mangles names such as "sessions.json" into "sessi".
        suffix = '.json'
        base = args.infile
        if base.endswith(suffix):
            base = base[:-len(suffix)]
        json_outfile = "{0}_clean.json".format(base)

    data = load_file(args.infile)

    print("Filtering...", end='', flush=True)
    # filter_data rewrites the 'transcript' of the utterances it keeps.
    data = filter_data(data)
    print("Done.")

    print("Wordlist...", end='', flush=True)
    wordlist = extract_wordlist(data)
    print("Done.")

    print("Write out wordlist and json...", end='', flush=True)
    save_wordlist(wordlist, args.wordlistfile)
    write_json(data, json_outfile)
    print("Done.")


# Run the command-line interface only when executed as a script.
if __name__ == '__main__':
    main()