Skip to content

Commit

Permalink
MB-61640: Toy: Fuzzy and Wildcard dynamic scoring
Browse files Browse the repository at this point in the history
 - Added Levenshtein distance calculation for fuzzy and wildcard searchers
 - Added new implementations of certain functions to allow passing of edit distances per term
 - Multiplied boosts by inverse of edit distance for score calculation
  • Loading branch information
Likith101 committed Jul 31, 2024
1 parent 25027cc commit 43b9e50
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 8 deletions.
18 changes: 11 additions & 7 deletions search/searcher/search_fuzzy.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,13 +67,14 @@ func NewFuzzySearcher(ctx context.Context, indexReader index.IndexReader, term s
}
}

return NewMultiTermSearcher(ctx, indexReader, candidates, field,
boost, options, true)
return NewMultiTermSearcherBoosted(ctx, indexReader, candidates, field,
boost, fuzzyCandidates.editDistances, options, true)
}

type fuzzyCandidates struct {
candidates []string
bytesRead uint64
candidates []string
editDistances []int
bytesRead uint64
}

func reportIOStats(ctx context.Context, bytesRead uint64) {
Expand All @@ -91,9 +92,10 @@ func reportIOStats(ctx context.Context, bytesRead uint64) {
func findFuzzyCandidateTerms(indexReader index.IndexReader, term string,
fuzziness int, field, prefixTerm string) (rv *fuzzyCandidates, err error) {
rv = &fuzzyCandidates{
candidates: make([]string, 0),
candidates: make([]string, 0),
editDistances: make([]int, 0),
}

var reuse []int
// in case of advanced reader implementations directly call
// the levenshtein automaton based iterator to collect the
// candidate terms
Expand All @@ -110,6 +112,8 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string,
tfd, err := fieldDict.Next()
for err == nil && tfd != nil {
rv.candidates = append(rv.candidates, tfd.Term)
ld, _, _ := search.LevenshteinDistanceMaxReuseSlice(term, tfd.Term, fuzziness, reuse)
rv.editDistances = append(rv.editDistances, ld)
if tooManyClauses(len(rv.candidates)) {
return nil, tooManyClausesErr(field, len(rv.candidates))
}
Expand All @@ -136,14 +140,14 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string,
}()

// enumerate terms and check levenshtein distance
var reuse []int
tfd, err := fieldDict.Next()
for err == nil && tfd != nil {
var ld int
var exceeded bool
ld, exceeded, reuse = search.LevenshteinDistanceMaxReuseSlice(term, tfd.Term, fuzziness, reuse)
if !exceeded && ld <= fuzziness {
rv.candidates = append(rv.candidates, tfd.Term)
rv.editDistances = append(rv.editDistances, ld)
if tooManyClauses(len(rv.candidates)) {
return nil, tooManyClausesErr(field, len(rv.candidates))
}
Expand Down
49 changes: 49 additions & 0 deletions search/searcher/search_multi_term.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,29 @@ func NewMultiTermSearcher(ctx context.Context, indexReader index.IndexReader, te
options, limit)
}

// NewMultiTermSearcherBoosted builds a searcher over terms in which each
// per-term searcher is created by makeBatchSearchersBoosted, which receives
// editDistances alongside the base boost.
//
// When the number of terms trips tooManyClauses, two outcomes are possible:
//   - if the options allow the disjunction optimization, the work is handed to
//     optimizeMultiTermSearcher; note that editDistances is not forwarded on
//     this path, so only the plain boost applies;
//   - otherwise, if limit is true, a tooManyClausesErr is returned.
//
// If neither applies, construction falls through to the regular path below.
func NewMultiTermSearcherBoosted(ctx context.Context, indexReader index.IndexReader, terms []string,
	field string, boost float64, editDistances []int, options search.SearcherOptions, limit bool) (
	search.Searcher, error) {

	if numTerms := len(terms); tooManyClauses(numTerms) {
		switch {
		case optionsDisjunctionOptimizable(options):
			return optimizeMultiTermSearcher(ctx, indexReader, terms, field, boost, options)
		case limit:
			return nil, tooManyClausesErr(field, numTerms)
		}
	}

	// one searcher per term, each with its own edit-distance-adjusted boost
	termSearchers, err := makeBatchSearchersBoosted(ctx, indexReader, terms, field, boost, editDistances, options)
	if err != nil {
		return nil, err
	}

	// combine the per-term searchers into the final multi-term searcher
	return newMultiTermSearcherInternal(ctx, indexReader, termSearchers, field, boost, options, limit)
}

func NewMultiTermSearcherBytes(ctx context.Context, indexReader index.IndexReader, terms [][]byte,
field string, boost float64, options search.SearcherOptions, limit bool) (
search.Searcher, error) {
Expand Down Expand Up @@ -151,6 +174,32 @@ func makeBatchSearchers(ctx context.Context, indexReader index.IndexReader, term
return qsearchers, nil
}

// makeBatchSearchersBoosted creates one term searcher per entry in terms.
//
// The boost given to the searcher for terms[i] is boost scaled by
// 1/(editDistances[i]+1), so a term at edit distance 0 keeps the full boost
// and terms further away contribute progressively less.
//
// editDistances is optional: when it is nil, shorter than terms, or holds a
// non-positive value for a term, that term is searched with the unscaled
// boost. (Previously a nil slice left the multiplier at its zero value, which
// silently zeroed every boost, and a short slice caused an index panic.)
//
// On any error, all searchers created so far are closed and nil is returned.
func makeBatchSearchersBoosted(ctx context.Context, indexReader index.IndexReader, terms []string, field string,
	boost float64, editDistances []int, options search.SearcherOptions) ([]search.Searcher, error) {

	qsearchers := make([]search.Searcher, len(terms))
	qsearchersClose := func() {
		for _, searcher := range qsearchers {
			if searcher != nil {
				// best-effort cleanup; the construction error is what gets reported
				_ = searcher.Close()
			}
		}
	}
	for i, term := range terms {
		// Default to a neutral multiplier so that a missing edit distance
		// leaves the boost untouched instead of multiplying it by zero.
		editMultiplier := 1.0
		// Only positive distances scale the boost: 0 already yields 1, and a
		// negative value would divide by zero (-1) or flip the sign.
		if i < len(editDistances) && editDistances[i] > 0 {
			editMultiplier = 1 / float64(editDistances[i]+1)
		}

		var err error
		qsearchers[i], err = NewTermSearcher(ctx, indexReader, term, field, boost*editMultiplier, options)
		if err != nil {
			qsearchersClose()
			return nil, err
		}
	}
	return qsearchers, nil
}

func optimizeMultiTermSearcherBytes(ctx context.Context, indexReader index.IndexReader, terms [][]byte,
field string, boost float64, options search.SearcherOptions) (
search.Searcher, error) {
Expand Down
14 changes: 13 additions & 1 deletion search/searcher/search_regexp.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,10 @@
package searcher

import (
"bytes"
"context"
"regexp"
"unicode"

"github.com/blevesearch/bleve/v2/search"
index "github.com/blevesearch/bleve_index_api"
Expand Down Expand Up @@ -59,17 +61,27 @@ func NewRegexpStringSearcher(ctx context.Context, indexReader index.IndexReader,
}()

var candidateTerms []string
var editDistances []int
var baseBuf bytes.Buffer

for _, char := range pattern {
if unicode.IsLetter(char) {
baseBuf.WriteRune(char)
}
}
baseStr := baseBuf.String()
tfd, err := fieldDict.Next()
for err == nil && tfd != nil {
candidateTerms = append(candidateTerms, tfd.Term)
ld := search.LevenshteinDistance(tfd.Term, baseStr)
editDistances = append(editDistances, ld)
tfd, err = fieldDict.Next()
}
if err != nil {
return nil, err
}

return NewMultiTermSearcher(ctx, indexReader, candidateTerms, field, boost,
return NewMultiTermSearcherBoosted(ctx, indexReader, candidateTerms, field, boost, editDistances,
options, true)
}

Expand Down

0 comments on commit 43b9e50

Please sign in to comment.