# data_manager.py
import json
import random
import re

import nltk


class DataManager(object):
    """
    This class is responsible for the JSON data operations:
    loading, preprocessing, and splitting.
    """

    def __init__(self, config, augmenter):
        self.config = config
        self.augmenter = augmenter
        self.data_test = []
        self.data_train = []
        self.word_frequencies = {}
        self.vocab_size = -1
        self.data = None
        self.read_json(self.config.json_path)

    def preprocess_data(self):
        self.shuffle()
        self.apply_regex()
        # self.remove_bad_data()  # disabled
        self.calculate_word_frequencies()
        self.calculate_vocabulary_size()
        self.tokenize_sentences()
        self.split_test_train(self.config.train_ratio, self.config.split_method)
        self.pad_sentences(self.config.title_pad_length, self.config.abstract_pad_length)
        if self.config.augment:
            self.augment_train_data()
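
    # Note on ordering: word frequencies (and hence vocab_size) are computed
    # before padding, so the "<pad>" token is never counted in the vocabulary,
    # and the split happens before augmentation, so only the training half is
    # ever augmented.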

    def read_json(self, path):
        with open(path) as json_file:
            self.data = json.load(json_file)

    def shuffle(self):
        # shuffles self.data in place
        random.shuffle(self.data)

    def apply_regex(self):
        """
        Remove punctuation ("," and "."), convert to lowercase,
        and remove English stopwords.
        """
        nltk.download('stopwords', quiet=True)
        stopwords = nltk.corpus.stopwords.words('english')
        # compile the patterns once instead of once per record
        pattern = re.compile(r"([.,])")
        stopword_pattern = re.compile(r'\b(' + r'|'.join(map(re.escape, stopwords)) + r')\b\s*')
        for i in range(len(self.data)):
            # get title and abstract
            current_title = self.get_title_from_idx(i)
            current_abstract = self.get_abstract_from_idx(i)
            # strip punctuation and lowercase
            current_title = pattern.sub("", current_title).lower()
            current_abstract = pattern.sub("", current_abstract).lower()
            # remove stopwords using regex
            current_title = stopword_pattern.sub('', current_title)
            current_abstract = stopword_pattern.sub('', current_abstract)
            # assign
            self.data[i]["Title"] = current_title
            self.data[i]["Abstract"] = current_abstract

    def augment_train_data(self):
        """
        The augmenter is slow, which becomes a problem on large datasets.
        Only the training split is augmented; the test set is left untouched.
        """
        original_len = len(self.data_train)
        for i in range(original_len):
            if i % 100 == 0:
                print(f"Augmenting train data, progress: {i} / {original_len}")
            title = self.data_train[i]["Title"]
            abstract = self.data_train[i]["Abstract"]
            label = self.data_train[i]["Label"]
            title = self.augmenter.augment(title)
            abstract = self.augmenter.augment(abstract)
            self.data_train.append({"Title": title, "Abstract": abstract, "Label": label})
        print(f"Train data amount after augmenting: {len(self.data_train)}")

    def split_test_train(self, train_ratio, split_method):
        """
        :param train_ratio: train data / all data ratio
        :param split_method: 0: split randomly,
                             1: split each label individually (stratified) so the
                                per-label counts match the numbers given on GitHub
        :return: None
        """
        if split_method == 0:
            split_index = int(len(self.data) * train_ratio)
            self.data_train = self.data[:split_index]
            self.data_test = self.data[split_index:]
            assert len(self.data_train) + len(self.data_test) == len(self.data)
        elif split_method == 1:
            # one bucket per label
            label_split_data = [[] for _ in range(len(self.config.label_dict))]
            # split into labels
            for data in self.data:
                label_idx = self.config.label_dict[data["Label"]]
                label_split_data[label_idx].append(data)
            # split each bucket into train/test
            for data in label_split_data:
                split_index = int(len(data) * train_ratio)
                self.data_train.extend(data[:split_index])
                self.data_test.extend(data[split_index:])
            # shuffle so the labels are interleaved again
            random.shuffle(self.data_train)
            random.shuffle(self.data_test)
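
    # Example: with train_ratio = 0.8 and label_dict = {"Material Science": 0,
    # "Chemistry": 1} (mapping illustrative), each label's records are split
    # 80/20 independently and then recombined, so both splits keep the
    # original class balance.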

    def create_train_test_jsonfile(self):
        with open(self.config.train_data_path, 'w') as outfile:
            json.dump(self.data_train, outfile)
        with open(self.config.test_data_path, 'w') as outfile:
            json.dump(self.data_test, outfile)

    def remove_bad_data(self):
        """
        Remove data instances whose title or abstract field is empty
        (the dataset does contain such instances).
        """
        min_title_len = float("inf")
        min_abstract_len = float("inf")
        indexes_to_remove = []
        for i in range(len(self.data)):
            title_len = len(self.data[i]["Title"])
            if title_len < min_title_len:
                min_title_len = title_len
            abstract_len = len(self.data[i]["Abstract"])
            if abstract_len < min_abstract_len:
                min_abstract_len = abstract_len
            if title_len == 0 or abstract_len == 0:
                indexes_to_remove.append(i)
        # delete from the end of the list to avoid index shifting
        for index in reversed(indexes_to_remove):
            print("Deleting index:", index)
            del self.data[index]
        print("Min Title len:", min_title_len)
        print("Min Abstract len:", min_abstract_len)

    def tokenize_sentences(self):
        # whitespace tokenization: turn each title/abstract into a word list
        for i in range(len(self.data)):
            self.data[i]["Title"] = self.get_title_from_idx(i).split(" ")
            self.data[i]["Abstract"] = self.get_abstract_from_idx(i).split(" ")

    def get_title_from_idx(self, idx):
        return self.data[idx]["Title"]

    def get_abstract_from_idx(self, idx):
        return self.data[idx]["Abstract"]

    def pad_sentences(self, desired_length_title, desired_length_abstract):
        # data is assumed to be tokenized at this point;
        # longer sequences are truncated, shorter ones padded with "<pad>"
        pad_word = ["<pad>"]
        for i in range(len(self.data)):
            title = self.get_title_from_idx(i)
            abstract = self.get_abstract_from_idx(i)
            # pad/truncate title
            if len(title) > desired_length_title:
                title = title[:desired_length_title]
            elif len(title) < desired_length_title:
                title += pad_word * (desired_length_title - len(title))
            # pad/truncate abstract
            if len(abstract) > desired_length_abstract:
                abstract = abstract[:desired_length_abstract]
            elif len(abstract) < desired_length_abstract:
                abstract += pad_word * (desired_length_abstract - len(abstract))
            self.data[i]["Title"] = title
            self.data[i]["Abstract"] = abstract

    def calculate_word_frequencies(self):
        self.word_frequencies = {}
        for i in range(len(self.data)):
            title = self.get_title_from_idx(i).split(' ')
            abstract = self.get_abstract_from_idx(i).split(' ')
            # count every token across both fields
            for word in title + abstract:
                self.word_frequencies[word] = self.word_frequencies.get(word, 0) + 1

    def calculate_vocabulary_size(self):
        self.vocab_size = len(self.word_frequencies)

    def count_labels(self):
        counts_train = [0, 0]
        counts_test = [0, 0]
        for i in range(len(self.data_train)):
            if self.data_train[i]["Label"] == "Material Science":
                counts_train[0] += 1
            elif self.data_train[i]["Label"] == "Chemistry":
                counts_train[1] += 1
        for i in range(len(self.data_test)):
            if self.data_test[i]["Label"] == "Material Science":
                counts_test[0] += 1
            elif self.data_test[i]["Label"] == "Chemistry":
                counts_test[1] += 1
        print(f"Train Num Material: {counts_train[0]}")
        print(f"Train Num Chemistry: {counts_train[1]}")
        print(f"Test Num Material: {counts_test[0]}")
        print(f"Test Num Chemistry: {counts_test[1]}")