convert_data.py
# -*- coding: utf8 -*-
"""
Script to convert multi-document summarization inputs into TensorFlow examples that can be fed to the PG-MMR model.
"""
import glob
import struct
import shutil
from tensorflow.core.example import example_pb2
import nltk
import os
from bs4 import BeautifulSoup
import io
from absl import flags
from absl import app
import sys
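# Python 2 only: reload sys to re-expose setdefaultencoding, then make UTF-8
# the default encoding for implicit str/unicode conversions.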
reload(sys)
sys.setdefaultencoding('utf8')
FLAGS = flags.FLAGS
p_start_tag = '<P>'
p_end_tag = '</P>'
def fix_bracket_token(token):
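    """Replace bracket characters with their PTB-style tokens (e.g. '(' -> '-lrb-')."""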
if token == '(':
return '-lrb-'
elif token == ')':
return '-rrb-'
elif token == '[':
return '-lsb-'
elif token == ']':
return '-rsb-'
else:
return token
def is_quote(tokens):
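    """Heuristic: flag a tokenized sentence for skipping if it is wrapped in
    ``/'' quotation tokens or does not end with a period."""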
contains_quotation_marks = "''" in tokens and len(tokens) > 0 and tokens[0] == "``"
doesnt_end_with_period = len(tokens) > 0 and tokens[-1] != "."
# contains_says = "says" in tokens or "said" in tokens
decision = contains_quotation_marks or doesnt_end_with_period
if decision:
print "Skipping quote: ", ' '.join(tokens)
return decision
def process_sent(sent):
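    """Lowercase, tokenize, and bracket-normalize a sentence; return the token list."""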
line = sent.lower()
tokenized_sent = nltk.word_tokenize(line)
tokenized_sent = [fix_bracket_token(token) for token in tokenized_sent]
return tokenized_sent
def process_dataset(dataset_name, out_data_path, TAC_path='', DUC_path='', custom_dataset_path=''):
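    """Convert one dataset (TAC, DUC, or a custom directory) into length-delimited
    tf.Example binaries, one file per topic, under out_data_path/dataset_name."""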
data_dirs = {
'tac_2011': {
'article_dir': os.path.join(TAC_path, 'summary_data/s11/test_doc_files'),
'abstract_dir': os.path.join(TAC_path, 'summary_data/s11/models')
},
'tac_2010': {
'article_dir': os.path.join(TAC_path, 'summary_data/s10/test_doc_files'),
'abstract_dir': os.path.join(TAC_path, 'summary_data/s10/models')
},
'tac_2008': {
'article_dir': os.path.join(TAC_path, 'summary_data/s08/test_doc_files'),
'abstract_dir': os.path.join(TAC_path, 'summary_data/s08/models')
},
'duc_2004': {
'article_dir': os.path.join(DUC_path, 'Original/DUC2004_Summarization_Documents/duc2004_testdata/tasks1and2/duc2004_tasks1and2_docs/docs'),
'abstract_dir': os.path.join(DUC_path, 'past_duc/duc2004/duc2004_results/ROUGE/eval/models/2')
},
'duc_2003': {
'article_dir': os.path.join(DUC_path, 'Original/DUC2003_Summarization_Documents/duc2003_testdata/task2/docs'),
'abstract_dir': os.path.join(DUC_path, 'past_duc/duc2003/results/detagged.duc2003.abstracts/models')
}
}
if dataset_name == 'duc_tac':
combine_duc_2003_tac_2008_tac_2010(out_data_path)
return
if dataset_name in data_dirs:
article_dir = data_dirs[dataset_name]['article_dir']
abstract_dir = data_dirs[dataset_name]['abstract_dir']
is_tac = 'tac' in dataset_name
is_custom_dataset = False
else:
article_dir = custom_dataset_path
abstract_dir = custom_dataset_path
is_tac = False
is_custom_dataset = True
out_dir = os.path.join(out_data_path, dataset_name)
if not os.path.exists(out_dir):
os.makedirs(out_dir)
multidoc_dirnames = sorted(os.listdir(article_dir))
out_idx = 1
for multidoc_dirname in multidoc_dirnames:
article, abstracts, doc_indices, raw_article_sents = get_article_abstract(multidoc_dirname, article_dir, abstract_dir, is_tac, is_custom_dataset)
with open(os.path.join(out_dir, 'test_{:03d}.bin'.format(out_idx)), 'wb') as writer:
write_example(article, abstracts, doc_indices, raw_article_sents, writer)
out_idx += 1
def combine_duc_2003_tac_2008_tac_2010(out_data_path):
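    """Merge the converted duc_2003, tac_2008, and tac_2010 examples into a single
    duc_tac set, converting any dataset that has not been processed yet."""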
out_dir = os.path.join(out_data_path, 'duc_tac')
if not os.path.exists(out_dir):
os.makedirs(out_dir)
out_idx = 1
for dataset_name in ['duc_2003', 'tac_2008', 'tac_2010']:
example_dir = os.path.join(out_data_path, dataset_name)
if not os.path.exists(example_dir) or len(os.listdir(example_dir)) == 0:
            process_dataset(dataset_name, out_data_path)
example_files = glob.glob(os.path.join(example_dir,'*'))
for old_file in example_files:
new_file = os.path.join(out_dir, 'test_{:03d}.bin'.format(out_idx))
shutil.copyfile(old_file, new_file)
out_idx += 1
def concatenate_p_tags(soup):
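    """Join the rendered contents of all <P> tags into a single string, collapsing newlines."""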
lines = []
for tag in soup.findAll('p'):
lines.append(tag.renderContents().replace('\n', ' ').strip())
contents = ' '.join(lines)
return contents
def has_p_tags(soup):
return soup.find('p')
def fix_exceptions(sentences):
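    """Split sentences that NLTK fails to break after the name 'Hun Sen.' (the
    trailing 'Sen.' looks like an abbreviation to the sentence tokenizer)."""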
new_sentences = []
for sent in sentences:
split_sents = sent.split('Hun Sen. ')
if len(split_sents) == 2:
sent1 = split_sents[0] + 'Hun Sen.'
sent2 = split_sents[1].strip()
new_sentences.append(sent1)
if sent2 != '':
new_sentences.append(sent2)
else:
new_sentences.append(sent)
return new_sentences
def add_sents_to_article(sentences, article, raw_article_sents, doc_indices, doc_idx):
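    """Tokenize each sentence, skip likely quotes, and append the tokens to the
    running article string, recording a document index for every token and
    keeping the raw sentence text."""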
for orig_sent in sentences:
tokenized_sent = process_sent(orig_sent)
if is_quote(tokenized_sent):
continue
sent = ' '.join(tokenized_sent)
article += sent + ' '
doc_indices_for_tokens = [doc_idx] * len(tokenized_sent)
doc_indices_str = ' '.join(str(x) for x in doc_indices_for_tokens)
doc_indices += doc_indices_str + ' '
raw_article_sents.append(orig_sent)
return article, raw_article_sents, doc_indices
def get_article(article_dir, multidoc_dirname, is_tac):
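    """Read every document in a multi-document directory, extract its text from
    <P> or <TEXT> tags, sentence-split it, and accumulate everything into one
    article string with per-token document indices."""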
if is_tac:
multidoc_dir = os.path.join(article_dir, multidoc_dirname, multidoc_dirname + '-A')
else:
multidoc_dir = os.path.join(article_dir, multidoc_dirname)
doc_names = os.listdir(multidoc_dir)
doc_names = sorted([f for f in doc_names if os.path.isfile(os.path.join(multidoc_dir, f)) and '.py' not in f])
article = ''
doc_indices = ''
raw_article_sents = []
for doc_idx, doc_name in enumerate(doc_names):
doc_path = os.path.join(multidoc_dir, doc_name)
with open(doc_path) as f:
article_text = f.read()
soup = BeautifulSoup(article_text, 'html.parser')
if is_tac:
contents = concatenate_p_tags(soup)
sentences = nltk.tokenize.sent_tokenize(contents)
article, raw_article_sents, doc_indices = add_sents_to_article(sentences, article, raw_article_sents, doc_indices, doc_idx)
else:
if has_p_tags(soup):
contents = concatenate_p_tags(soup)
else:
contents = soup.findAll('text')[0].renderContents().replace('\n', ' ').strip()
contents = ' '.join(contents.split())
sentences = nltk.tokenize.sent_tokenize(contents)
            fixed_sentences = fix_exceptions(sentences)
            article, raw_article_sents, doc_indices = add_sents_to_article(fixed_sentences, article, raw_article_sents, doc_indices, doc_idx)
article = article.encode('utf-8').strip()
return article, doc_indices, raw_article_sents
def process_abstract(abstract_lines):
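    """Lowercase, tokenize, and bracket-normalize each abstract line, wrapping
    every sentence in <s> ... </s> tags."""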
abstract = ''
for line in abstract_lines:
line = line.lower()
line = line.replace(u'\x92', "'")
tokenized_sent = nltk.word_tokenize(line)
tokenized_sent = [fix_bracket_token(token) for token in tokenized_sent]
sent = ' '.join(tokenized_sent)
abstract += '<s> ' + sent + ' </s> '
    abstract = abstract.encode('utf-8').strip()
return abstract
def get_abstract(multidoc_dirname, abstract_dir, is_tac):
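    """Locate the model (reference) summaries whose filenames match this topic's
    numeric ID and return each as a processed abstract string."""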
abstracts = []
doc_num = ''.join([s for s in multidoc_dirname if s.isdigit()])
all_doc_names = os.listdir(abstract_dir)
if is_tac:
abstract_doc_name = 'D' + doc_num + '-A'
else:
abstract_doc_name = 'D' + doc_num + '.M'
selected_doc_names = [doc_name for doc_name in all_doc_names if abstract_doc_name in doc_name]
if len(selected_doc_names) == 0:
raise Exception('no docs found for doc ' + doc_num)
for selected_doc_name in selected_doc_names:
with io.open(os.path.join(abstract_dir, selected_doc_name), encoding='utf-8', errors='ignore') as f:
abstract_lines = f.readlines()
abstract = process_abstract(abstract_lines)
abstracts.append(abstract)
return abstracts
def get_article_abstract(multidoc_dirname, article_dir, abstract_dir, is_tac, is_custom_dataset):
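    """Dispatch to custom-format or TAC/DUC parsing and return the tuple
    (article, abstracts, doc_indices, raw_article_sents)."""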
if is_custom_dataset:
article, abstracts, doc_indices, raw_article_sents = get_custom_article_abstract(multidoc_dirname, article_dir)
else:
        article, doc_indices, raw_article_sents = get_article(article_dir, multidoc_dirname, is_tac)
abstracts = get_abstract(multidoc_dirname, abstract_dir, is_tac)
return article, abstracts, doc_indices, raw_article_sents
def write_example(article, abstracts, doc_indices, raw_article_sents, writer):
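    """Serialize the fields into a tf.Example and write it length-delimited:
    an 8-byte length (struct 'q') followed by the serialized proto bytes."""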
tf_example = example_pb2.Example()
tf_example.features.feature['article'].bytes_list.value.extend([article])
for abstract in abstracts:
tf_example.features.feature['abstract'].bytes_list.value.extend([abstract])
if doc_indices is not None:
tf_example.features.feature['doc_indices'].bytes_list.value.extend([doc_indices])
for sent in raw_article_sents:
tf_example.features.feature['raw_article_sents'].bytes_list.value.extend([sent])
tf_example_str = tf_example.SerializeToString()
str_len = len(tf_example_str)
writer.write(struct.pack('q', str_len))
writer.write(struct.pack('%ds' % str_len, tf_example_str))
def get_custom_article_abstract(multidoc_dirname, article_dir):
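    """Parse one custom-format topic file. Expected layout (see the
    example_custom_dataset directory; the sentences shown are illustrative):

        First sentence of article 1.
        Second sentence of article 1.

        First sentence of article 2.

        <SUMMARIES>

        First sentence of summary 1.
    """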
with open(os.path.join(article_dir, multidoc_dirname)) as f:
text = f.read()
    docs = [[sent.strip() for sent in doc.strip().split('\n')]
            for doc in text.split('<SUMMARIES>')[0].strip().split('\n\n')]
article = ''
doc_indices = ''
raw_article_sents = []
for doc_idx, sentences in enumerate(docs):
article, raw_article_sents, doc_indices = add_sents_to_article(sentences, article, raw_article_sents, doc_indices, doc_idx)
article = article.encode('utf-8').strip()
if '<SUMMARIES>' in text:
        abstracts_unprocessed = [[sent.strip() for sent in summary.strip().split('\n')]
                                 for summary in text.split('<SUMMARIES>')[1].strip().split('\n\n')]
abstracts = []
for abstract_lines in abstracts_unprocessed:
abstract = process_abstract(abstract_lines)
abstracts.append(abstract)
else:
abstracts = []
return article, abstracts, doc_indices, raw_article_sents
def main(unused_argv):
    if len(unused_argv) != 1:  # raises if extra command-line arguments were passed
raise Exception("Problem with flags: %s" % unused_argv)
if FLAGS.dataset_name == '':
raise Exception('Must specify which dataset to convert.')
process_dataset(FLAGS.dataset_name, FLAGS.out_data_path, FLAGS.TAC_path, FLAGS.DUC_path, FLAGS.custom_dataset_path)
if __name__ == '__main__':
flags.DEFINE_string('dataset_name', 'example_custom_dataset', 'Which dataset to convert from raw data to tf examples')
flags.DEFINE_string('out_data_path', 'tf_data', 'Where to put output tf examples')
flags.DEFINE_string('TAC_path', '', 'Path to raw TAC data.')
flags.DEFINE_string('DUC_path', '', 'Path to raw DUC data.')
    flags.DEFINE_string('custom_dataset_path', 'example_custom_dataset/', 'Path to custom dataset. Format of custom dataset must be:\n'
                        + 'One file for each topic...\n'
                        + 'Distinct articles will be separated by one blank line (two consecutive newlines \\n\\n)...\n'
                        + 'Each sentence of the article will be on its own line...\n'
                        + 'After all articles, there will be one blank line, followed by \'<SUMMARIES>\' without the quotes...\n'
                        + 'Distinct summaries will be separated by one blank line...\n'
                        + 'Each sentence of the summary will be on its own line.\n'
                        + 'See the directory example_custom_dataset for an example.')
app.run(main)