# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: CC-BY-NC-4.0
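"""Expand the evaluation splits with article text recovered from CC-News.

Reads url -> HTML records from ``<mapping_dir>/*-filter.jsonl``, extracts
each article body with newspaper3k, and writes ``./data/expanded_dev.json``
and ``./data/expanded_test.json`` with document text, sentence offsets, and
statement spans filled in.

Usage (inferred from the argv handling below):
    python expand_eval.py <mapping_dir>
"""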
import glob
import json
from copy import deepcopy
from sys import argv

from allennlp.data.tokenizers.spacy_tokenizer import SpacyTokenizer
from newspaper import Article
from tqdm import tqdm

from utils import process_text, get_offsets

# newspaper3k marks an Article as downloaded when download_state == 2;
# setting it manually lets us parse pre-fetched HTML without re-fetching.
SUCCESS_DOWNLOAD_STATE = 2


def expand_data(data, mapping):
    """Fill in document text, sentence offsets, and statement spans using
    the url -> article-text mapping recovered from CC-News."""
    tokenizer = SpacyTokenizer()
    found = 0
    all_urls = list()
    for item_index, item in enumerate(tqdm(data)):
        segments = dict()
        for doc_index, doc in enumerate(item["documents"]):
            all_urls.append(doc["url"])
            if doc["url"] in mapping:
                raw_text = mapping[doc["url"]]
                data[item_index]["documents"][doc_index]["text"] = process_text(raw_text)
                data[item_index]["documents"][doc_index]["sentences"] = get_offsets(raw_text, tokenizer, doc["text_id"])
                for segment in data[item_index]["documents"][doc_index]["sentences"]:
                    segments[segment["segment_id"]] = segment["text"]
                found += 1
            else:
                print("Document not found for ID:", doc["text_id"])
        for statement_index, statement in enumerate(item["statements"]):
            # Statements are character spans inside a segment; skip those
            # whose source document could not be recovered above.
            if statement["segment_id"] not in segments:
                continue
            data[item_index]["statements"][statement_index]["text"] = segments[statement["segment_id"]][statement["start_char"]:statement["end_char"]]
    all_urls = list(set(all_urls))
    print("Total mapped =", found)
    print("Total corpus =", len(all_urls))
    return deepcopy(data)
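
# Expected item schema, reconstructed from the field accesses above (the
# actual files may carry additional keys):
# {
#     "year": ...,
#     "documents":  [{"url": "...", "text_id": "..."}],
#     "statements": [{"segment_id": "...", "start_char": 0, "end_char": 10}]
# }
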
if __name__ == '__main__':
    mapping_dir = argv[1]
    with open("./data/dev.json") as f:
        dev_data = json.load(f)
    with open("./data/test.json") as f:
        test_data = json.load(f)

    # Rebuild the url -> article-text mapping from the filtered CC-News dumps.
    mapping_names = glob.glob("{}/*-filter.jsonl".format(mapping_dir))
    mapping = dict()
    for fname in tqdm(mapping_names):
        with open(fname) as f:
            for line in f:
                item = json.loads(line)
                if item["html"].strip() != "":
                    # Parse the stored HTML without re-downloading the page.
                    cc_article = Article('')
                    cc_article.download_state = SUCCESS_DOWNLOAD_STATE
                    cc_article.html = item["html"]
                    cc_article.parse()
                    if cc_article.text.strip() != "":
                        mapping[item["url"]] = cc_article.text.strip()

    # Report per-year coverage of the corpus against the recovered mapping.
    inp_data = deepcopy(dev_data)
    inp_data.extend(deepcopy(test_data))
    all_urls = dict()
    for item in inp_data:
        if item["year"] not in all_urls:
            all_urls[item["year"]] = list()
        for doc in item["documents"]:
            all_urls[item["year"]].append(doc["url"])
    for year in all_urls:
        all_urls[year] = list(set(all_urls[year]))
    print("Total mappings found in CC News =", len(mapping))
    print("Total documents in Corpus =", sum(len(urls) for urls in all_urls.values()))
    for year in all_urls:
        print("Year =", year)
        not_found = sum(1 for url in all_urls[year] if url not in mapping)
        print("Total =", len(all_urls[year]), "Not found =", not_found)

    print("Expanding Dev Set")
    dev_data = expand_data(dev_data, mapping)
    with open("./data/expanded_dev.json", "w") as f:
        json.dump(dev_data, f, indent=4)
    print("Expanding Test Set")
    test_data = expand_data(test_data, mapping)
    with open("./data/expanded_test.json", "w") as f:
        json.dump(test_data, f, indent=4)