-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.py
76 lines (61 loc) · 2.13 KB
/
index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import sys
import os
import re
import errno
import json
from contextlib import closing
import xapian as _x
from fetch_data import fetch_movies
# Keys of a movie record that main() copies into each document's stored
# JSON data blob.
FIELDS = ["Title", "Plot", "Actors", "Director", "Year", "Rated"]

# Ratings that _format_rated() passes through unchanged; every other
# value is collapsed to "N/A".
RATED = ["G", "PG", "PG-13", "R", "N/A"]

# Xapian document value slots written by main().
SLOT_YEAR = 0   # year, stored with sortable_serialise()
SLOT_RATED = 1  # normalised rating string
def _format_rated(rated):
    """Normalise a rating value.

    Returns ``rated`` unchanged when it is one of the entries of ``RATED``;
    any other value (including ``None``) becomes ``"N/A"``.
    """
    return rated if rated in RATED else "N/A"
def main():
    """Build or refresh the Xapian movie index at ``./xdb/movies.db``.

    Each ``(rank, movie)`` pair produced by ``fetch_movies()`` becomes one
    document, stored under document id ``rank`` (an existing document with
    that id is replaced):

    * plot, title, actors and director are indexed as free text with the
      English stemmer; the title is additionally indexed under the ``S``
      prefix;
    * the year is written to value slot ``SLOT_YEAR`` in
      ``sortable_serialise`` form so it can be used for range queries;
    * the rating, normalised by ``_format_rated``, is added as the boolean
      term ``XRATED:<rating>`` (filtering) and written to value slot
      ``SLOT_RATED`` (faceting);
    * the ``FIELDS`` entries of the movie plus its ``rank`` are stored as
      the document's JSON data blob.

    Returns:
        None, so ``sys.exit(main())`` exits with status 0 on success.

    Raises:
        OSError: ``./xdb/`` could not be created for a reason other than
            already existing.
        ValueError, TypeError: a movie's ``Year`` is missing or is not a
            plain integer string (raised by ``int()``).
        KeyError: a movie lacks one of the ``FIELDS`` keys.
    """
    movies = fetch_movies()

    # Create the database directory under the current working directory.
    # An already-existing directory is fine; anything else is re-raised.
    try:
        os.mkdir('./xdb/')
    except (OSError, IOError) as e:
        if e.errno != errno.EEXIST:
            raise

    # The term generator and its stemmer do not depend on the movie being
    # indexed, so build them once and only re-point them at each new
    # document inside the loop.
    indexer = _x.TermGenerator()
    indexer.set_stemmer(_x.Stem("english"))

    with closing(_x.WritableDatabase('./xdb/movies.db',
                                     _x.DB_CREATE_OR_OPEN)) as x_db:
        for rank, mov in movies:
            x_doc = _x.Document()
            indexer.set_document(x_doc)

            title = mov.get(u"Title")
            plot = mov.get(u"Plot")
            actors = mov.get(u'Actors')
            directors = mov.get(u"Director")
            year = mov.get(u'Year')
            rated = mov.get(u'Rated')

            # Free-text terms. The title is indexed twice: once under the
            # "S" prefix and once unprefixed alongside the other fields.
            indexer.index_text(plot)
            indexer.index_text(title, 1, "S")
            indexer.index_text(title)
            indexer.index_text(actors)
            indexer.index_text(directors)

            # Year as a serialised value for range queries.
            x_doc.add_value(SLOT_YEAR, _x.sortable_serialise(int(year)))

            rated_value = _format_rated(rated)
            # Boolean term for filtering on rating.
            x_doc.add_boolean_term("XRATED:{}".format(rated_value))
            # Rating as a value for faceting.
            x_doc.add_value(SLOT_RATED, rated_value)

            # Store the displayable fields with the document.
            data = {sel: mov[sel] for sel in FIELDS}
            data['rank'] = rank
            # json.dumps() has no ``encoding`` keyword on Python 3 (passing
            # it raises TypeError); on Python 2 the default was already
            # UTF-8, so omitting it yields the same output there.
            x_doc.set_data(json.dumps(data))

            # Using the rank as the document id makes re-runs overwrite
            # the previous entry instead of adding a duplicate.
            x_db.replace_document(rank, x_doc)

    print("indexing done")
# Script entry point: run the indexer and hand main()'s return value to
# sys.exit() as the process exit status.
if __name__ == "__main__":
    sys.exit(main())