Skip to content

Commit

Permalink
workaround for issue #375 (#378)
Browse files Browse the repository at this point in the history
* workaround for issue #375

* Add a test to validate docCount changes

* Clean up deprecated method usage

* Nitpick cleaning

* Add test to verify fix of issue #356

Co-authored-by: = <[email protected]>
  • Loading branch information
ndkmath1 and worleydl authored Jun 22, 2021
1 parent 41d4e71 commit 7e413c2
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 7 deletions.
10 changes: 4 additions & 6 deletions src/main/java/com/o19s/es/explore/ExplorerQuery.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.ConstantScoreWeight;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.Scorer;
Expand Down Expand Up @@ -105,12 +104,11 @@ public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float bo

for (Term term : terms) {
TermStates ctx = TermStates.build(searcher.getTopReaderContext(), term, scoreMode.needsScores());
TermStatistics tStats = searcher.termStatistics(term, ctx);
if(tStats != null){
df_stats.add(tStats.docFreq());
idf_stats.add(sim.idf(tStats.docFreq(), searcher.getIndexReader().numDocs()));
ttf_stats.add(tStats.totalTermFreq());

if(ctx != null){
df_stats.add(ctx.docFreq());
idf_stats.add(sim.idf(ctx.docFreq(), searcher.collectionStatistics(term.field()).docCount()));
ttf_stats.add(ctx.totalTermFreq());
}
}

Expand Down
2 changes: 1 addition & 1 deletion src/main/java/com/o19s/es/termstat/TermStatSupplier.java
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ public void bump (IndexSearcher searcher, LeafReaderContext context,

// Collection Statistics
df_stats.add(termStates.docFreq());
idf_stats.add(sim.idf(termStates.docFreq(), searcher.getIndexReader().numDocs()));
idf_stats.add(sim.idf(termStates.docFreq(), searcher.collectionStatistics(term.field()).docCount()));
ttf_stats.add(termStates.totalTermFreq());

// Doc specifics
Expand Down
33 changes: 33 additions & 0 deletions src/test/java/com/o19s/es/explore/ExplorerQueryTests.java
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import org.junit.After;
import org.junit.Before;

import static org.hamcrest.Matchers.closeTo;
import static org.hamcrest.Matchers.equalTo;

public class ExplorerQueryTests extends LuceneTestCase {
Expand Down Expand Up @@ -65,6 +66,11 @@ public void setupIndex() throws Exception {
doc.add(newTextField("text", docs[i], Field.Store.YES));
indexWriter.addDocument(doc);
}

// Add a junk doc to validate IDF doc count
Document doc = new Document();
doc.add(new Field("_id", Integer.toString(docs.length + 1), StoredField.TYPE));
indexWriter.addDocument(doc);
}

reader = DirectoryReader.open(dir);
Expand Down Expand Up @@ -287,4 +293,31 @@ public void testInvalidStat() throws Exception {

expectThrows(RuntimeException.class, () -> searcher.search(eq, 4));
}

/**
 * Checks the top hit's score for the {@code mean_classic_idf} statistic on the
 * term {@code text:cow}.
 *
 * <p>Regression guard for PR #378: before that change the wrong doc count was
 * supplied to the idf computation and the first document scored over 1.98;
 * the computed idf for {@code text:cow} is 1.8472.
 *
 * @throws Exception if the search fails
 */
public void testIdfComputation() throws Exception {
    final Term cowTerm = new Term("text", "cow");
    final ExplorerQuery idfQuery = new ExplorerQuery(new TermQuery(cowTerm), "mean_classic_idf");

    final TopDocs hits = searcher.search(idfQuery, 4);

    // Widen the float score so it can be compared with the double-based closeTo matcher.
    final double topScore = hits.scoreDocs[0].score;

    // Expected idf is 1.8472, checked with a tolerance of .01.
    assertThat(topScore, closeTo(1.8472, .01));
}

/**
 * Checks that the {@code min_raw_df} statistic for {@code text:chicken} gives
 * the top hit a score of exactly {@code 0.0f}.
 *
 * <p>NOTE(review): per the test name, "chicken" is presumably absent from the
 * fixture documents; confirm against the fixture set up for this class.
 *
 * @throws Exception if the search fails
 */
public void testNonExistentVocab() throws Exception {
    final Query missingTermQuery = new TermQuery(new Term("text", "chicken"));
    final ExplorerQuery dfQuery = new ExplorerQuery(missingTermQuery, "min_raw_df");

    final TopDocs hits = searcher.search(dfQuery, 4);
    final float topScore = hits.scoreDocs[0].score;

    assertThat(topScore, equalTo(0.0f));
}
}

0 comments on commit 7e413c2

Please sign in to comment.