-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex_searcher.py
35 lines (29 loc) · 1.31 KB
/
index_searcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import json
from collections import defaultdict
#search_set = ['manning', 'brady', 'dalton']
#search_set = ['sun tzu', 'hannibal', 'rommel', 'patton']
#search_set = ['sun tzu', 'hannibal', 'rommel', 'patton']
search_set = ['tom bowler','red devil','agate','cats eye']
#search_set = ['drugstore', 'urban']
#search_set = ['okinawa', 'tinian']
def collect_associations(lines, search_terms):
    """Map each search term to the articles associated with matching titles.

    Args:
        lines: Iterable of text lines. Each non-blank line must be a JSON
            array of two elements: ``[title, associated_articles]``.
        search_terms: Iterable of strings to look for inside each title.

    Returns:
        A ``defaultdict`` mapping ``term -> {article: title}``. When several
        matching titles list the same article, the last title read wins.

    Raises:
        ValueError: If a non-blank line is not valid JSON or does not unpack
            into exactly two elements.
    """
    associations = defaultdict(dict)
    for processed, raw_line in enumerate(lines, 1):
        record = raw_line.strip()
        # Blank lines (e.g. a trailing newline at end of file) carry no
        # record; skip them instead of letting json.loads fail.
        if record:
            title, associated_articles = json.loads(record)
            for term in search_terms:
                # TODO: fix this search as there are a ton of false positives
                # (plain substring match, so 'agate' also hits 'propagate').
                if term in title:
                    for article in associated_articles:
                        associations[term][article] = title
        # Progress output for very large index files.
        if processed % 1000000 == 0:
            print('processed: {}'.format(processed))
    return associations


def find_common_associations(associations, search_terms):
    """Find the articles that are associated with every search term.

    Args:
        associations: Mapping ``term -> {article: title}`` as produced by
            ``collect_associations``.
        search_terms: Sequence of the search terms that must all be present.

    Returns:
        A list of ``(article, titles)`` tuples, where ``titles`` holds the
        matching title for each term, in ``search_terms`` order.
    """
    hit_counts = defaultdict(int)
    for term in search_terms:
        # .get avoids inserting empty entries into a defaultdict on lookup.
        for article in associations.get(term, {}):
            hit_counts[article] += 1

    required = len(search_terms)
    # .items() works on both Python 2 and 3; .iteritems() is Python 2 only.
    return [
        (article, [associations.get(term, {}).get(article) for term in search_terms])
        for article, count in hit_counts.items()
        if count >= required
    ]


if __name__ == '__main__':
    with open('big_table_index.txt', 'r') as ff:
        association_dict = collect_associations(ff, search_set)

    # Look for thingies in common
    for k, titles in find_common_associations(association_dict, search_set):
        print(k, titles)