diff --git a/src/main/java/com/o19s/es/explore/ExplorerQuery.java b/src/main/java/com/o19s/es/explore/ExplorerQuery.java index 5b47e07e..a64fb3e5 100644 --- a/src/main/java/com/o19s/es/explore/ExplorerQuery.java +++ b/src/main/java/com/o19s/es/explore/ExplorerQuery.java @@ -24,7 +24,6 @@ import org.apache.lucene.search.ScoreMode; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Weight; -import org.apache.lucene.search.TermStatistics; import org.apache.lucene.search.ConstantScoreWeight; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.Scorer; @@ -105,12 +104,11 @@ public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float bo for (Term term : terms) { TermStates ctx = TermStates.build(searcher.getTopReaderContext(), term, scoreMode.needsScores()); - TermStatistics tStats = searcher.termStatistics(term, ctx); - if(tStats != null){ - df_stats.add(tStats.docFreq()); - idf_stats.add(sim.idf(tStats.docFreq(), searcher.getIndexReader().numDocs())); - ttf_stats.add(tStats.totalTermFreq()); + if(ctx != null){ /* NOTE(review): TermStates.build(...) looks like it never returns null, so this guard may always pass and terms missing from the index would then be recorded with df 0 — confirm */ + df_stats.add(ctx.docFreq()); + idf_stats.add(sim.idf(ctx.docFreq(), searcher.collectionStatistics(term.field()).docCount())); /* idf now uses the doc count of the term's own field instead of the reader-wide numDocs(). NOTE(review): collectionStatistics(field) may return null for a field with no indexed terms in some Lucene versions — confirm and guard before calling docCount() */ + ttf_stats.add(ctx.totalTermFreq()); } } diff --git a/src/main/java/com/o19s/es/termstat/TermStatSupplier.java b/src/main/java/com/o19s/es/termstat/TermStatSupplier.java index 2f1c704e..04de9342 100644 --- a/src/main/java/com/o19s/es/termstat/TermStatSupplier.java +++ b/src/main/java/com/o19s/es/termstat/TermStatSupplier.java @@ -75,7 +75,7 @@ public void bump (IndexSearcher searcher, LeafReaderContext context, // Collection Statistics df_stats.add(termStates.docFreq()); - idf_stats.add(sim.idf(termStates.docFreq(), searcher.getIndexReader().numDocs())); + idf_stats.add(sim.idf(termStates.docFreq(), searcher.collectionStatistics(term.field()).docCount())); /* same per-field doc count as the ExplorerQuery change. NOTE(review): collectionStatistics(field) may return null for a field with no indexed terms — confirm */ ttf_stats.add(termStates.totalTermFreq()); // Doc specifics diff --git 
a/src/test/java/com/o19s/es/explore/ExplorerQueryTests.java b/src/test/java/com/o19s/es/explore/ExplorerQueryTests.java index 9537c632..22c4d531 100644 --- a/src/test/java/com/o19s/es/explore/ExplorerQueryTests.java +++ b/src/test/java/com/o19s/es/explore/ExplorerQueryTests.java @@ -37,6 +37,7 @@ import org.junit.After; import org.junit.Before; +import static org.hamcrest.Matchers.closeTo; import static org.hamcrest.Matchers.equalTo; public class ExplorerQueryTests extends LuceneTestCase { @@ -65,6 +66,11 @@ public void setupIndex() throws Exception { doc.add(newTextField("text", docs[i], Field.Store.YES)); indexWriter.addDocument(doc); } + + // Add a junk doc that carries only an "_id" field (no "text" field) to validate the IDF doc count + Document doc = new Document(); + doc.add(new Field("_id", Integer.toString(docs.length + 1), StoredField.TYPE)); + indexWriter.addDocument(doc); } reader = DirectoryReader.open(dir); @@ -287,4 +293,31 @@ public void testInvalidStat() throws Exception { expectThrows(RuntimeException.class, () -> searcher.search(eq, 4)); } + + public void testIdfComputation() throws Exception { + Query q = new TermQuery(new Term("text", "cow")); + String statsType = "mean_classic_idf"; + + ExplorerQuery eq = new ExplorerQuery(q, statsType); + + TopDocs docs = searcher.search(eq, 4); + + /* + Prior to PR #378, idf was computed from the reader-wide numDocs() rather than the doc count of the term's field, so the junk doc added in setupIndex (which has no "text" field) presumably inflated it. + + Before the fix, the idf for the first document would be over 1.98. + Expected idf for text:cow after the fix = 1.8472 (checked to within .01). + */ + assertThat((double) docs.scoreDocs[0].score, closeTo(1.8472, .01)); + } + + public void testNonExistentVocab() throws Exception { + Query q = new TermQuery(new Term("text", "chicken")); + String statsType = "min_raw_df"; + + ExplorerQuery eq = new ExplorerQuery(q, statsType); + TopDocs docs = searcher.search(eq, 4); + + assertThat(docs.scoreDocs[0].score, equalTo(0.0f)); /* "chicken" is presumably absent from the indexed docs, so the minimum raw df is expected to be 0 */ + } }