import os
import json
import re
import hashlib
from collections import defaultdict
article_to_parent = {}
markup_link_pattern = re.compile(r'\[\[([^|\]]{1,256})(\|[^\]]{1,256})?\]\]', re.I)
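# Example (hypothetical markup): '[[Paris|the capital]]' yields group(1) == 'Paris'
# and group(2) == '|the capital'; for a plain '[[Paris]]', group(2) is None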
# List items can start with 1., -, +, or * (possibly repeated) and may have leading spaces.
# How much of the text do we take? All of it?
# Taking everything up to the newline seems like a reasonable strategy, capped at 256 chars.
list_item_pattern = re.compile(r'^\s*([*+-]+|\d+\.)\s*([^\n]{4,256})$', re.I | re.M)
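# Example (hypothetical): '* List of lists' yields ('*', 'List of lists');
# '12. Some entry' yields ('12.', 'Some entry')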
# Table markup parser; the goal is to extract only the table cell contents.
# Do the easy thing and just look for |- row separators.
# A row can either be a single line (|| delimited) or one line per cell (| start);
# this pattern only captures the first cell line after each |-.
table_row_pattern = re.compile(r'^\|-\n((\|\s+[^\n]+)+)$', re.I | re.M)
style_eliminator_pattern = re.compile(r'^\|\s+style=[^|]+\|(.+)$', re.I)
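# Example (hypothetical): on '|-\n| Total || 42', table_row_pattern captures
# group(1) == '| Total || 42'; on '| style="text-align:center" | Foo',
# style_eliminator_pattern captures group(1) == ' Foo' (leading space kept)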
# NOTE: flag to control whether we generate "norm" indexes or full indexes,
# where a norm index skips free-form list items and keeps only table items,
# which are more reliable and structured
IS_NORM = False
def sha256digest(string):
sha256 = hashlib.sha256()
sha256.update(string.encode('utf-8'))
return sha256.hexdigest()
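# e.g. sha256digest('anarchism') returns a 64-char hex digest; its first hex
# character determines which of the 16 passes below indexes the record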
title_dict = defaultdict(set)
item_dict = defaultdict(set)
longest_title = ''
longest_record = None
total_list_items = 0
def check_record(record, cc):
global title_dict
global item_dict
global total_list_items
# title_sha = sha256digest(record['title'])
    # record['page'] is still JSON-encoded at this point, hence the leading quote
    if record['page'].startswith('"{{Not English'):
        # Skip non-English articles
        return
title = json.loads(record['title']).strip().lower()
page = json.loads(record['page'])
    # Hash the title; each pass of the outer loop only indexes titles whose
    # hash starts with cc, splitting the work into 16 shards
    title_hash = sha256digest(title)
if not title_hash.startswith(cc):
return
    # print('title: {}'.format(title))
    # Parse all [[subreferences]] contained in the article
    for match in re.finditer(markup_link_pattern, page):
title_dict[match.group(1).strip().lower()].add(title)
    # Norm indexes exclude free-form list items ("- anything"), which can be
    # entire paragraphs and are extremely noisy
if not IS_NORM:
for match in re.finditer(list_item_pattern, page):
            # Skip list items that also contain a markup link; those links are
            # already captured by the markup_link_pattern pass above
            if markup_link_pattern.search(match.group(2)):
continue
# print('li: {}'.format(match.group(2).encode('utf-8')))
item_dict[match.group(2)].add(title)
total_list_items += 1
for match in re.finditer(table_row_pattern, page):
raw_row = match.group(1)
# could start with '| style="'
style_clause_match = style_eliminator_pattern.match(raw_row)
if style_clause_match is not None:
raw_row = style_clause_match.group(1)
        # raw_row starts with '|' plus whitespace; strip only the leading pipes
        # here, since each component is whitespace-stripped below
        trimmed_row = raw_row.lstrip('|')
# split by ||
components = trimmed_row.split('||')
# ignore special stuff? |-
# | Total || {{bartable|100||2||background:grey}}
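        # Caveat: split('||') also splits inside templates, so the example row
        # above fragments into pieces like '{{bartable|100' and 'background:grey}}'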
normalized_components = []
for comp in components:
norm = comp.strip().lower()
if len(norm) == 0:
continue
item_dict[norm].add(title)
normalized_components.append(norm)
# print(normalized_components)
fields = ['title', 'categories', 'page']
count = 0
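# condensed.csv is assumed to hold one JSON-encoded field per line,
# three lines per record, in the order: title, categories, page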
for cc in '0123456789abcdef':
    title_dict = defaultdict(set)
    item_dict = defaultdict(set)
print("processing: {}".format(cc))
with open('condensed.csv', 'r') as ff:
i = 0
record = {}
for line in ff:
record[fields[i]] = line.rstrip()
i += 1
if i == 3:
check_record(record, cc)
record = {}
i = 0
count += 1
if count % 1000000 == 0 and i == 0:
print('checked: {}'.format(count))
print('total list items: {}'.format(total_list_items))
index_folder = 'norm_table_indexes' if IS_NORM else 'table_indexes'
if not os.path.isdir(index_folder):
os.mkdir(index_folder)
with open('{}/{}.txt'.format(index_folder, cc), 'w') as index_file:
for t_title in sorted(item_dict):
index_file.write('{}\n'.format(json.dumps({'t': t_title, 'as': list(item_dict[t_title])})))
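# Each line of {cc}.txt is one JSON object: {"t": "<item text>", "as": ["<article title>", ...]}
# NOTE: title_dict (the [[markup link]] index) is populated but never written out here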