my_retriever.py
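"""Simple vector-space retrieval model.

Supports three term-weighting schemes ('tfidf', 'tf', and binary) and ranks
documents against a query by cosine similarity, returning the top 10 docids.
"""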
import math
from collections import defaultdict


class Retrieve:
    # Create a new Retrieve object storing the index and termWeighting scheme
    def __init__(self, index, termWeighting):
        self.index = index
        self.termWeighting = termWeighting
        # total number of documents in the collection
        self.collection_size = 0
        # total number of terms in a document -- {docid: count}
        self.n_terms = defaultdict(int)
        # inverse document frequency for each term -- {term: idf_value}
        self.idf = dict()
        # document weight vector -- {docid: {term: weight}}
        self.document_weight_vector = defaultdict(dict)
        # length of the document weight vector -- {docid: length}
        self.len_document_weight_vector = defaultdict(float)
        ##############################################
        # Calculate the collection size
        all_documentIds = set()
        for term in self.index:
            for docid in self.index[term]:
                all_documentIds.add(docid)
        self.collection_size = len(all_documentIds)

        ##############################################
        # TF-IDF scheme
        if self.termWeighting == 'tfidf':
            # inverse document frequency for each term -- self.idf = {term: idf_value}
            # term frequency in a document -- self.index[term][docid]
            # Calculate the inverse document frequency
            for term in self.index:
                # total number of documents: self.collection_size
                # number of documents containing the term: len(self.index[term])
                self.idf[term] = math.log(self.collection_size / len(self.index[term]), 10)
            # Calculate the TF-IDF weights
            for term in self.index:
                for docid in self.index[term]:
                    self.document_weight_vector[docid][term] = self.index[term][docid] * self.idf[term]
        # Term frequency scheme
        elif self.termWeighting == 'tf':
            for term in self.index:
                for docid in self.index[term]:
                    # number of times the term appears in the document: self.index[term][docid]
                    self.document_weight_vector[docid][term] = self.index[term][docid]
        # Binary scheme
        else:
            for term in self.index:
                for docid in self.index[term]:
                    # binary weights for index terms in a document
                    self.document_weight_vector[docid][term] = 1

        # 1. Calculate the length of each document weight vector
        for docid, weights in self.document_weight_vector.items():
            d = list(weights.values())
            self.len_document_weight_vector[docid] = math.sqrt(sum(a * a for a in d))
    ##############################################
    # Method performing retrieval for the specified query
    def forQuery(self, query):
        scores = defaultdict(float)

        # 2. Calculate the length of the query vector
        query_vector = []
        # tf-idf
        if self.termWeighting == 'tfidf':
            for term in query:
                # skip query terms that never occur in the collection
                if term in self.idf:
                    query_vector.append(query[term] * self.idf[term])
            len_query = math.sqrt(sum(a * a for a in query_vector))
        # term frequency
        elif self.termWeighting == 'tf':
            for term in query:
                query_vector.append(query[term])
            len_query = math.sqrt(sum(a * a for a in query_vector))
        # binary
        else:
            len_query = math.sqrt(len(query))
        ##############################################
        # Calculate the cosine similarity between the query and every document
        for docid in range(1, self.collection_size + 1):
            numerator = 0  # dot product of the query and document vectors
            for term in query:
                # Consider only query terms that also occur in the document
                if term in self.document_weight_vector[docid]:
                    # 3. numerator = dot product of query and document vectors
                    if self.termWeighting == 'tfidf':
                        q = query[term] * self.idf[term]
                        d = self.index[term][docid] * self.idf[term]
                        numerator += q * d
                    elif self.termWeighting == 'tf':
                        q = query[term]
                        d = self.index[term][docid]
                        numerator += q * d
                    else:
                        # binary: each shared term contributes 1 to the dot product
                        numerator += 1
            # Cosine similarity (guard against a zero-length vector)
            denominator = len_query * self.len_document_weight_vector[docid]
            scores[docid] = numerator / denominator if denominator else 0.0
        # Return the ids of the 10 highest-scoring documents
        best_rank_10 = sorted(scores, key=scores.get, reverse=True)[:10]
        return best_rank_10
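

# A minimal usage sketch, not part of the original module: it assumes the index
# is a nested mapping {term: {docid: count}} with docids numbered 1..N, and that
# a query is a mapping {term: count}. The toy data and names below are
# hypothetical and only illustrate how the class might be driven.
if __name__ == '__main__':
    toy_index = {
        'cat': {1: 2, 2: 1},
        'dog': {2: 3},
        'fish': {1: 1, 3: 4},
    }
    retriever = Retrieve(toy_index, 'tfidf')
    # Rank the three toy documents for a two-term query; prints docids in
    # descending order of cosine similarity
    print(retriever.forQuery({'cat': 1, 'fish': 1}))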