-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathwand.cpp
219 lines (199 loc) · 6.53 KB
/
wand.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
#include "wand.h"
using namespace std;
/**
* Initializes the WAND object.
* Initializes the InvertedIndex to query from and also the UB
* of each posting list.
*/
WAND::WAND(unordered_map<string, vector<pair<int, int>>> & postingList) {
InvertedIndex = postingList;
//Initialize UB for each term.
for (auto & pair : postingList) {
int maxScore = 0;
for (auto & document : pair.second) {
if (document.second > maxScore) {
maxScore = document.second;
}
}
UB[pair.first] = maxScore;
}
}
/**
* Initializes the WAND iterator, sets all the variables to default values,
* clears currentInvertedIndex, sortedTerms, top_k heap. Initializes
* currentInvertedIndex iterators to first document in the InvertedIndex.
*/
void WAND::init(vector<string> & query_terms, int k, int F, vector<string> & must_include) {
curDoc = -1;
threshold = 0;
thresholdFactor = F;
fullEvaluationCount = 0;
result_num = k;
top_k = priority_queue<pair<int, int>, vector<pair<int, int>>, greater<pair<int, int>>>();
currentInvertedIndex.clear();
sortedTerms.clear();
for (auto & term : query_terms) {
currentInvertedIndex[term] = InvertedIndex[term].begin();
}
if (!must_include.empty()) {
for (auto & term : must_include) {
for (auto & document : InvertedIndex[term]) {
document.second += 500;
}
UB[term] += 500;
threshold += UB[term];
}
}
}
//Returns the full evaluation count for this particular WAND run.
int WAND::getFullEvaluationCount() {
return fullEvaluationCount;
}
/**
* Sorts the current DIDs in the increasing order.
* Sorts [DID, term] pairs in the sortedTerms vector.
*/
void WAND::sortTerms() {
//If sortedTerms is empty initialize it. I believe this can go in init method.
if (sortedTerms.size() == 0) {
for (auto & elem : currentInvertedIndex) {
sortedTerms.push_back(make_pair(elem.second->first, elem.first)); //[DID, term]
}
}
sort(sortedTerms.begin(), sortedTerms.end()); //sort based on DID.
}
/**
* Finds pivot term by summing the UB of terms in the sortedTerms order.
* Returns the index of the pivot term in sortedTerms.
* Returns -1 if pivot term was not found.
*/
int WAND::findPivotTerm() {
int UBscore = 0;
for (size_t i = 0; i < sortedTerms.size(); i++) {
UBscore += UB[sortedTerms[i].second];
if (UBscore >= threshold) {
return i;
}
}
return -1;
}
/**
* Advances the iterator of a term to >= docID.
* change_index is the index of the term in sortedTerms.
*/
void WAND::nextID(int change_index, int docID) {
string change_term = sortedTerms[change_index].second;
while (currentInvertedIndex[change_term]->first < docID) {
currentInvertedIndex[change_term]++;
}
//Update the DID of the change_term in sortedTerms list.
sortedTerms[change_index].first = currentInvertedIndex[change_term]->first;
}
/**
* WAND iterator that returns the next valid candidate to be in top-k.
* Returns next document that has UB(d, q) >= threshold.
*/
int WAND::next() {
while (true) {
//sorts the term lists based on current DID in the lists.
sortTerms();
//Finds the pivot term in sortedTerms and returns the index.
int pivot_index = findPivotTerm();
if (pivot_index == -1) {
return -1;
}
string pivot_term = sortedTerms[pivot_index].second;
int pivot_DID = currentInvertedIndex[pivot_term]->first; //or sortedTerms[pivot_index].first
if (pivot_DID == lastID) {
return -1;
}
if (pivot_DID <= curDoc) {
int change_index = 0;
nextID(change_index, curDoc + 1);
}
else {
int first_DID = sortedTerms[0].first;
if (first_DID == pivot_DID) {
curDoc = pivot_DID;
return curDoc;
}
else {
//Advance all of the iterators above pivot term to >= pivotID
for (int i = 0; i < pivot_index; i++) {
nextID(i, pivot_DID);
}
}
}
}
}
/**
* Inserts the docID and its score as pair into the heap.
* If heap is not full, insert it without comparing with
* minimum in top-k. If heap is full, compare with minimum and
* update threshold if inserted.
*/
void WAND::insertHeap(int docID, int score) {
if (top_k.size() < result_num) {
top_k.push(make_pair(score, docID));
//once the heap is full, update threshold from -1.
if (top_k.size() == result_num) {
threshold = thresholdFactor * top_k.top().first;
}
}
else {
if (score > top_k.top().first) {
top_k.pop();
top_k.push(make_pair(score, docID));
threshold = thresholdFactor * top_k.top().first;
}
}
}
/**
* Computes full score of a document.
* Makes use of the fact that the first couple DID of sortedTerms are the
* same.
*/
int WAND::fullScore(int docID) {
int score = 0;
fullEvaluationCount++; //Keep track of full evaluation count.
for (auto & p : sortedTerms) {
if (p.first != docID) {
break;
}
score += currentInvertedIndex[p.second]->second;
}
return score;
}
/**
* The overall process. Initializes the query and calls
* next() repeatedly to get candidates to be in top-k and then
* computes their full score to see if they belong in top-k.
* You can specify must include terms in the resulting documents
* You can also specify the threshold factor F.
*/
vector<pair<int, int>> WAND::DoQuery(vector<string> & query_terms, int k, int F, vector<string> must_include) {
init(query_terms, k, F, must_include);
while (true) {
int candidate_DID = next();
if (candidate_DID == -1) {
break;
}
int full_doc_score = fullScore(candidate_DID);
insertHeap(candidate_DID, full_doc_score);
}
if (!must_include.empty()) {
for (auto & term : must_include) {
for(auto & document : InvertedIndex[term]) {
document.second -= 500;
}
UB[term] -= 500;
}
}
//Returns the top-k documents as vector of pairs of [DID, score].
vector<pair<int, int>> resultList;
while (!top_k.empty()) {
resultList.push_back(make_pair(top_k.top().second, top_k.top().first));
top_k.pop();
}
return resultList;
}