diff --git a/analysis/synonym/synonym.go b/analysis/synonym/synonym.go
new file mode 100644
index 000000000..547e5c9ca
--- /dev/null
+++ b/analysis/synonym/synonym.go
@@ -0,0 +1,65 @@
+// Copyright (c) 2023 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package synonym
+
+import (
+	"fmt"
+
+	"github.com/blevesearch/bleve/v2/analysis"
+	"github.com/blevesearch/bleve/v2/registry"
+	index "github.com/blevesearch/bleve_index_api"
+)
+
+const Name = "textsynonym"
+
+type SynonymSource struct {
+	collection string
+	analyzer   string
+}
+
+func New(collection string, analyzer string) *SynonymSource {
+	return &SynonymSource{
+		collection: collection,
+		analyzer:   analyzer,
+	}
+}
+
+func (p *SynonymSource) Collection() string {
+	return p.collection
+}
+
+func (p *SynonymSource) Analyzer() string {
+	return p.analyzer
+}
+
+func (p *SynonymSource) MetadataKey() string {
+	return p.collection + string(index.SynonymKeySeparator) + p.analyzer
+}
+
+func SynonymSourceConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.SynonymSource, error) {
+	collection, ok := config["collection"].(string)
+	if !ok {
+		return nil, fmt.Errorf("must specify synonym collection")
+	}
+	analyzer, ok := config["analyzer"].(string)
+	if !ok {
+		return nil, fmt.Errorf("must specify synonym analyzer")
+	}
+	return New(collection, analyzer), nil
+}
+
+func init() {
+	registry.RegisterSynonymSource(Name, SynonymSourceConstructor)
+}
diff --git a/analysis/token/synonym/synonym.go b/analysis/token/synonym/synonym.go
new file mode 100644
index 000000000..35bbedc19
--- /dev/null
+++ b/analysis/token/synonym/synonym.go
@@ -0,0 +1,452 @@
+// Copyright (c) 2023 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package synonym
+
+import (
+	"fmt"
+	"strings"
+	"sync"
+
+	"github.com/blevesearch/bleve/v2/analysis"
+	"github.com/blevesearch/bleve/v2/registry"
+	"github.com/blevesearch/bleve/v2/search"
+	index "github.com/blevesearch/bleve_index_api"
+	"github.com/blevesearch/vellum"
+)
+
+const Name = "synonym"
+
+const SeparatingCharacter = ' '
+
+type SynonymFilter struct {
+	metadata  []*index.SynonymMetadata
+	fuzziness int
+	prefix    int
+}
+
+// fstPath stores the data associated with a particular path in the FST.
+// It is used to enable fuzzy search in the synonym filter.
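+//
+// As a rough illustration (the phrase is made up, not taken from this
+// change): matching "big apple" through an FST built from that phrase
+// walks one state per character, and the resulting fstPath holds the
+// final state reached, the accumulated transition outputs (later used
+// as the key into HashToSynonyms), and the bytes of the current word
+// matched so far.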
+type fstPath struct {
+	// output is the sum of the outputs of all the states seen in the path
+	output uint64
+	// state is the state at the end of the path
+	state int
+	// word is the sequence of characters that were matched in the path
+	word []byte
+}
+
+func resetPath(path *fstPath) *fstPath {
+	path.output = 0
+	path.state = 0
+	path.word = path.word[:0]
+	return path
+}
+
+var pathPool = sync.Pool{
+	New: func() interface{} { return new(fstPath) },
+}
+
+// pathEndsWithWord checks if the input FST path ends with a word.
+// A word in the FST is defined as a sequence of characters that is either not
+// followed by any other character or is followed by a space.
+// This function is used to enable fuzzy search in the synonym filter.
+func pathEndsWithWord(path *fstPath, fst *vellum.FST) (bool, error) {
+	spaceAccept := fst.Accept(path.state, SeparatingCharacter)
+	numEdges, err := fst.GetNumTransitionsForState(path.state)
+	if err != nil {
+		return false, err
+	}
+	// the path ends with a word if, at the current state of the input FST path,
+	// there is a transition available for a space (a returned state of 1 means
+	// no transition), or if there is no transition available at all.
+	return spaceAccept != 1 || numEdges == 0, nil
+}
+
+// fuzzyMatch performs a DFS from the state present in the input FST path.
+// matchTry is the word that must be fuzzily matched in the FST, starting from path.
+// validPaths is the list of valid paths found in the FST so far; whenever matchTry
+// is fuzzily matched, the newly found path is appended to it.
+// fuzziness is the maximum Levenshtein distance allowed between matchTry and the
+// word fuzzily matched in the FST.
+//
+// The basic logic is as follows:
+//   - at the state in the FST where a character of the input string fails to match,
+//     start a DFS searching for states that end a word,
+//   - which are states that either have no outgoing transition or
+//     have a transition for a space,
+//   - since it is an FST, multiple such paths will be found, and each path's word is
+//     checked against the edit distance criterion before it is added to validPaths.
+func fuzzyMatch(path *fstPath, matchTry string, validPaths []*fstPath, fst *vellum.FST, fuzziness int) ([]*fstPath, error) {
+	// stack for iterative DFS
+	pathStack := []*fstPath{path}
+	var pathIsValid bool
+	for len(pathStack) > 0 {
+		path = pathStack[len(pathStack)-1]
+		pathStack = pathStack[:len(pathStack)-1]
+		pathIsValid = false
+		// if the input FST path ends with a word and the word fuzzily matches
+		// matchTry, append the path to validPaths.
+		endsWithWord, err := pathEndsWithWord(path, fst)
+		if err != nil {
+			return nil, err
+		}
+		if endsWithWord {
+			if search.LevenshteinDistance(string(path.word), matchTry) <= fuzziness {
+				validPaths = append(validPaths, path)
+				pathIsValid = true
+			}
+		}
+		// perform a DFS from the current state of the input FST path.
+		// even if the path ends with a word, the DFS is performed since there may be
+		// another transition apart from a space at the state in the path.
+		//
+		// this condition is a shortcut to discard certain FST paths where it is
+		// already known that the word will have an edit distance > fuzziness to
+		// matchTry: the path does not end with a word and path.word already
+		// exceeds len(matchTry) by 'fuzziness' characters, so any further
+		// exploration along this path would need more than 'fuzziness' removals.
+		if len(path.word)-len(matchTry) <= fuzziness {
+			transitions, err := fst.GetTransitionsForState(path.state)
+			if err != nil {
+				return nil, err
+			}
+			// for each transition from the current state of the input FST path,
+			// perform a DFS. if the transition is not a space, get a new fstPath
+			// from the pool and push it onto the stack with the updated parameters.
+			for _, character := range transitions {
+				if character != SeparatingCharacter {
+					newState, newOutput := fst.AcceptWithVal(path.state, character)
+					newPath := pathPool.Get().(*fstPath)
+					// each branch needs its own copy of the word; appending to
+					// path.word directly would let sibling branches share (and
+					// overwrite) the same backing array.
+					newWord := make([]byte, len(path.word), len(path.word)+1)
+					copy(newWord, path.word)
+					newPath.state, newPath.output, newPath.word = newState, path.output+newOutput, append(newWord, character)
+					pathStack = append(pathStack, newPath)
+				}
+			}
+		}
+		// only discard the fstPath if it was not added to validPaths
+		if !pathIsValid {
+			pathPool.Put(resetPath(path))
+		}
+	}
+	return validPaths, nil
+}
+
+// acceptSpaces is a helper that consumes a sequence of spaces along the input
+// FST path. if the desired number of spaces can be accepted, the state and
+// output of the path are updated and the path is returned; otherwise nil is
+// returned.
+func acceptSpaces(numSpaces int, path *fstPath, fst *vellum.FST) *fstPath {
+	numAcceptedSpaces := 0
+	state := path.state
+	var tmp uint64
+	for numAcceptedSpaces < numSpaces {
+		state, tmp = fst.AcceptWithVal(state, SeparatingCharacter)
+		if state == 1 {
+			return nil
+		}
+		path.output += tmp
+		numAcceptedSpaces++
+	}
+	path.state = state
+	return path
+}
+
+// checkForMatch checks if a sequence of tokens starting from inputIndex in the
+// input token stream matches a phrase in the FST.
+// it returns the hashes of the synonyms found for the longest matched phrase,
+// along with the number of tokens that phrase consumed from the input stream
+// (or 1 if no phrase matched, so that the caller advances one token).
+// Greedy matching is used to find the longest sequence of input tokens that
+// matches a phrase in the FST.
+// Two types of matches are possible for a word in the FST:
+//  1. Exact match: the word in the FST matches the word in the input token stream exactly.
+//  2. Fuzzy match: the word in the FST matches the word in the input token stream with a
+//     Levenshtein distance less than or equal to the fuzziness parameter.
+func (s *SynonymFilter) checkForMatch(input analysis.TokenStream, inputIndex int, fst *vellum.FST,
+	metadata *index.SynonymMetadata) ([]uint64, int, error) {
+
+	numTokensConsumed := 0
+	var rv []uint64
+	var validPaths []*fstPath
+	var tokenLen, matchLen, pathIndex int
+	var err error
+	// number of tokens consumed by the longest phrase matched so far
+	var consumedAtLastMatch int
+	// since a token can fuzzily match multiple words in the FST, a slice of valid
+	// paths is maintained, each valid path representing a sequence of transitions
+	// in the FST that starts from the start state.
+	// for every input token, all the valid paths are checked for an exact match
+	// or a fuzzy match.
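+	//
+	// rough illustration (phrases made up, not taken from this change): with an
+	// FST built from "big apple" and "big apple pie", the tokens
+	// ["big", "apple", "tart"] keep a valid path alive through "big" and
+	// "apple"; "big apple" reaches a matching state so its synonyms are
+	// recorded, "tart" extends no path, and two consumed tokens are reported.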
+	startPath := pathPool.Get().(*fstPath)
+	startPath.state, startPath.output, startPath.word = fst.Start(), 0, nil
+	validPaths = append(validPaths, startPath)
+	numValidPaths := len(validPaths)
+	prevPos := input[inputIndex].Position
+	for inputIndex < len(input) {
+		// check how many stop words are present between the current token and the
+		// previous token. the number of stop words is the number of times an FST
+		// path must accept the space character.
+		// this block takes effect only after the first token has been exactly or
+		// fuzzily matched.
+		numStopWords := input[inputIndex].Position - prevPos - 1
+		if numStopWords > 0 {
+			pathIndex = 0
+			for _, path := range validPaths {
+				updatedPath := acceptSpaces(numStopWords, path, fst)
+				if updatedPath != nil {
+					validPaths[pathIndex] = updatedPath
+					pathIndex++
+				} else {
+					pathPool.Put(resetPath(path))
+				}
+			}
+			validPaths = validPaths[:pathIndex]
+			numValidPaths = len(validPaths)
+		}
+		pathIndex = 0
+		// for each valid path, check if the current token is exactly or fuzzily matched.
+		for pathIndex < numValidPaths {
+			tokenLen = len(input[inputIndex].Term)
+			matchLen = 0
+			path := validPaths[pathIndex]
+			for _, character := range input[inputIndex].Term {
+				newState, output := fst.AcceptWithVal(path.state, character)
+				if newState == 1 {
+					// the current token is not exactly matched, so check if it is
+					// fuzzily matched, provided fuzziness is not 0 and the match
+					// length is at least the prefix length. if the token is fuzzily
+					// matched, fuzzyMatch appends the new paths it finds to validPaths.
+					if s.fuzziness != 0 && matchLen >= s.prefix {
+						validPaths, err = fuzzyMatch(path, string(input[inputIndex].Term),
+							validPaths, fst, s.fuzziness)
+						if err != nil {
+							return nil, 0, err
+						}
+					}
+					break
+				} else {
+					matchLen++
+					path.state = newState
+					path.output += output
+					path.word = append(path.word, character)
+				}
+			}
+			// if the current token is exactly matched, check whether the path ends
+			// with a word, so that the path is not considered valid when the current
+			// token is merely a prefix of a word in the FST.
+			if matchLen == tokenLen {
+				endsWithWord, err := pathEndsWithWord(path, fst)
+				if err != nil {
+					return nil, 0, err
+				}
+				if endsWithWord {
+					validPaths = append(validPaths, path)
+				} else {
+					// since the path does not end with a word, try to auto-complete
+					// the token by calling fuzzyMatch.
+					// for example, if the token is "foo" and the FST contains
+					// "foobar" and "foobaz", the token "foo" can be auto-completed
+					// to "foobar" and "foobaz", provided fuzziness is at least 3
+					// (the edit distance between "foo" and each completion).
+					if s.fuzziness != 0 && matchLen >= s.prefix {
+						validPaths, err = fuzzyMatch(path, string(input[inputIndex].Term),
+							validPaths, fst, s.fuzziness)
+						if err != nil {
+							return nil, 0, err
+						}
+					}
+				}
+			}
+			pathIndex++
+		}
+		// drop the valid paths carried over from the previous iteration.
+		validPaths = validPaths[numValidPaths:]
+		prevPos = input[inputIndex].Position
+		numTokensConsumed++
+		inputIndex++
+		var seenSynonyms []uint64
+		matchFound := false
+		pathIndex = 0
+		// for each valid path whose state is a matching state, append the synonyms
+		// to seenSynonyms, using the output of the path as the key into the
+		// HashToSynonyms map.
+		// then filter out all paths that have no transition for a space from the
+		// current state; since each valid path ends with a word in the FST, this
+		// removes the "dead ends", the paths with no transition at all from the
+		// current state.
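+		// note: once a path crosses a word boundary (space), its word buffer is
+		// cleared below, so fuzzy comparisons always run against the current
+		// word in the FST phrase rather than the whole phrase matched so far.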
+		for _, path := range validPaths {
+			isMatch, finalOutput := fst.IsMatchWithVal(path.state)
+			hashedLHS := path.output + finalOutput
+			if isMatch && !metadata.HashToPhrase[hashedLHS].IsInvalid {
+				for _, RHS := range metadata.HashToSynonyms[hashedLHS] {
+					if !RHS.IsInvalid {
+						seenSynonyms = append(seenSynonyms, RHS.Hash)
+					}
+				}
+				seenSynonyms = append(seenSynonyms, hashedLHS)
+				matchFound = true
+			}
+			pathAfterSpace := acceptSpaces(1, path, fst)
+			if pathAfterSpace != nil {
+				validPaths[pathIndex] = pathAfterSpace
+				validPaths[pathIndex].word = nil
+				pathIndex++
+			} else {
+				pathPool.Put(resetPath(path))
+			}
+		}
+		validPaths = validPaths[:pathIndex]
+		// if a match is found, record the synonyms seen so far and the number of
+		// tokens consumed up to this match; a longer match found in a later
+		// iteration will overwrite both.
+		if matchFound {
+			rv = seenSynonyms
+			consumedAtLastMatch = numTokensConsumed
+		}
+		numValidPaths = len(validPaths)
+		if numValidPaths == 0 {
+			break
+		}
+	}
+	if rv == nil {
+		// no phrase matched; report a single consumed token so that the caller
+		// re-examines tokens that only partially matched a phrase.
+		return nil, 1, nil
+	}
+	return rv, consumedAtLastMatch, nil
+}
+
+type asyncResult struct {
+	err               error
+	output            []string
+	numTokensConsumed int
+}
+
+func (s *SynonymFilter) parallelFilter(input analysis.TokenStream, metadata *index.SynonymMetadata,
+	inputIdx int, fst *vellum.FST) *asyncResult {
+	rv, numTokensConsumed, err := s.checkForMatch(input, inputIdx, fst, metadata)
+	output := make([]string, len(rv))
+	for i, hash := range rv {
+		output[i] = metadata.HashToPhrase[hash].Phrase
+	}
+	return &asyncResult{
+		output:            output,
+		numTokensConsumed: numTokensConsumed,
+		err:               err,
+	}
+}
+
+func (s *SynonymFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
+	// Load the FST for each metadata entry
+	allFst, err := loadFSTData(s.metadata)
+	if err != nil {
+		fmt.Printf("error in synonym filter: %v\n", err)
+		return input
+	}
+	var rv analysis.TokenStream
+	idx := 0
+	prevPos := 0
+	for idx < len(input) {
+		asyncResults, maxTokensConsumed := s.runParallelFilters(input, allFst, idx)
+		synonyms := s.getSynonymsWithMaxTokensConsumed(asyncResults, maxTokensConsumed)
+		newPos := calculateNewPosition(input, idx, prevPos)
+
+		rv = appendSynonymsToResult(rv, input, idx, maxTokensConsumed, newPos, synonyms)
+
+		idx += maxTokensConsumed
+		prevPos = newPos
+	}
+	return rv
+}
+
+func loadFSTData(metadataList []*index.SynonymMetadata) ([]*vellum.FST, error) {
+	allFst := make([]*vellum.FST, len(metadataList))
+	for i, metadata := range metadataList {
+		fst, err := vellum.Load(metadata.SynonymFST)
+		if err != nil {
+			return nil, err
+		}
+		allFst[i] = fst
+	}
+	return allFst, nil
+}
+
+func (s *SynonymFilter) runParallelFilters(input analysis.TokenStream, allFst []*vellum.FST, idx int) ([]*asyncResult, int) {
+	asyncResults := make([]*asyncResult, len(s.metadata))
+	var maxTokensConsumed int
+	var waitGroup sync.WaitGroup
+	for i, metadata := range s.metadata {
+		waitGroup.Add(1)
+		go func(i int, metadata *index.SynonymMetadata) {
+			asyncResults[i] = s.parallelFilter(input, metadata, idx, allFst[i])
+			waitGroup.Done()
+		}(i, metadata)
+	}
+	waitGroup.Wait()
+	for _, asr := range asyncResults {
+		if asr.err != nil {
+			fmt.Printf("error in synonym filter: %v\n", asr.err)
+			// report a single consumed token so that the caller still makes
+			// progress instead of looping on a zero advance
+			return asyncResults, 1
+		}
+		if asr.numTokensConsumed > maxTokensConsumed {
+			maxTokensConsumed = asr.numTokensConsumed
+		}
+	}
+	return asyncResults, maxTokensConsumed
+}
+
+func (s *SynonymFilter) getSynonymsWithMaxTokensConsumed(asyncResults []*asyncResult, maxTokensConsumed int) []string {
+	var synonyms []string
+	for _, asr := range asyncResults {
+		if asr.numTokensConsumed == maxTokensConsumed {
+			synonyms = append(synonyms, asr.output...)
+		}
+	}
+	return synonyms
+}
+
+// calculateNewPosition computes the position of the next output token,
+// preserving the positional gap between the current input token and the
+// previous one.
+func calculateNewPosition(input analysis.TokenStream, idx, prevPos int) int {
+	var prevTokenPos int
+	if idx == 0 {
+		prevTokenPos = 0
+	} else {
+		prevTokenPos = input[idx-1].Position
+	}
+	return input[idx].Position - prevTokenPos + prevPos
+}
+
+func appendSynonymsToResult(rv analysis.TokenStream, input analysis.TokenStream, idx, maxTokensConsumed, newPos int, synonyms []string) analysis.TokenStream {
+	if maxTokensConsumed == 1 {
+		rv = append(rv, input[idx])
+	} else {
+		phrase := make([]string, maxTokensConsumed)
+		for i := idx; i < idx+maxTokensConsumed; i++ {
+			phrase[i-idx] = string(input[i].Term)
+		}
+		rv = append(rv, &analysis.Token{
+			Term:     []byte(strings.Join(phrase, " ")),
+			Position: newPos,
+			Start:    input[idx].Start,
+			End:      input[idx+maxTokensConsumed-1].End,
+			Type:     analysis.Synonym,
+		})
+	}
+	for _, i := range synonyms {
+		rv = append(rv, &analysis.Token{
+			Term:     []byte(i),
+			Position: newPos,
+			Start:    input[idx].Start,
+			End:      input[idx+maxTokensConsumed-1].End,
+			Type:     analysis.Synonym,
+		})
+	}
+	return rv
+}
+
+func SynonymFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	return &SynonymFilter{}, nil
+}
+
+func NewSynonymTokenFilter(metadata []*index.SynonymMetadata, fuzziness int, prefix int) *SynonymFilter {
+	return &SynonymFilter{
+		metadata:  metadata,
+		fuzziness: fuzziness,
+		prefix:    prefix,
+	}
+}
+
+func init() {
+	registry.RegisterTokenFilter(Name, SynonymFilterConstructor)
+}
diff --git a/analysis/token/synonym/synonym_util.go b/analysis/token/synonym/synonym_util.go
new file mode 100644
index 000000000..aee8bebe9
--- /dev/null
+++ b/analysis/token/synonym/synonym_util.go
@@ -0,0 +1,118 @@
+// Copyright (c) 2023 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package synonym
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"strings"
+
+	"github.com/blevesearch/bleve/v2/analysis"
+	index "github.com/blevesearch/bleve_index_api"
+)
+
+// tokenStreamToPhrase converts a token stream to a phrase using the position
+// attribute of the tokens. For example, if the token stream is "hello world",
+// with "hello" at position 2 and "world" at position 4, the phrase will be
+// ["hello", "", "world"].
+// This preserves the number of stop words between two normal words and the
+// order of the words, while stripping stop words at the start and end of the
+// phrase.
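+//
+// for instance, analyzing "state of the art" with a stop-word filter could
+// emit "state" at position 1 and "art" at position 4, which this function
+// turns into ["state", "", "", "art"], preserving the two-word gap.
+// (the example analyzer behavior is assumed, not defined in this change)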
+func tokenStreamToPhrase(tokens analysis.TokenStream) []string {
+	if len(tokens) == 0 {
+		// guard against a negative phrase length below when the analyzer
+		// produced no tokens
+		return nil
+	}
+	firstPosition := int(^uint(0) >> 1)
+	lastPosition := 0
+	for _, token := range tokens {
+		if token.Position < firstPosition {
+			firstPosition = token.Position
+		}
+		if token.Position > lastPosition {
+			lastPosition = token.Position
+		}
+	}
+	phraseLen := lastPosition - firstPosition + 1
+	rv := make([]string, phraseLen)
+	if phraseLen > 0 {
+		for _, token := range tokens {
+			pos := token.Position - firstPosition
+			rv[pos] = string(token.Term)
+		}
+	}
+	return rv
+}
+
+// analyzeSlice applies an analyzer to each string in a slice and returns the
+// resulting slice. if the analyzer is nil, the original slice is returned.
+func analyzeSlice(analyzer analysis.Analyzer, slice []string) []string {
+	if analyzer == nil {
+		return slice
+	}
+	rv := make([]string, len(slice))
+	for i, val := range slice {
+		analyzedPhrase := tokenStreamToPhrase(analyzer.Analyze([]byte(val)))
+		rv[i] = strings.Join(analyzedPhrase, " ")
+	}
+	return rv
+}
+
+// Analyze applies the given analyzer to the input and synonyms of the synonym
+// definition. if the analyzer is nil, the original values are carried over.
+func Analyze(syn *index.SynonymDefinition, analyzer analysis.Analyzer) *index.SynonymDefinition {
+	return &index.SynonymDefinition{
+		MappingType: syn.MappingType,
+		Input:       analyzeSlice(analyzer, syn.Input),
+		Synonyms:    analyzeSlice(analyzer, syn.Synonyms),
+	}
+}
+
+func AddSynonymFilter(ctx context.Context, analyzer analysis.Analyzer, i index.IndexReader,
+	key, synonymSourceName string, fuzziness, prefix int) (analysis.Analyzer, error) {
+
+	// "_use_synonym_metadata_key" mirrors UseSynonymMetadataKey in the top-level
+	// bleve package, which cannot be imported here without an import cycle.
+	if searchSynMD := ctx.Value("_use_synonym_metadata_key"); searchSynMD != nil {
+		if synSources, ok := searchSynMD.([]byte); ok {
+			var myData map[string][]*index.SynonymMetadata
+			err := json.Unmarshal(synSources, &myData)
+			if err != nil {
+				return nil, fmt.Errorf("error unmarshalling synonym metadata, err: %v", err)
+			}
+			if synMD, ok := myData[synonymSourceName]; ok {
+				if len(synMD) == 0 {
+					return nil, fmt.Errorf("no synonym metadata found")
+				}
+				synonymTokenFilter := NewSynonymTokenFilter(synMD, fuzziness, prefix)
+				return &analysis.ExtendedAnalyzer{
+					BaseAnalyzer:      analyzer,
+					ExtraTokenFilters: []analysis.TokenFilter{synonymTokenFilter},
+				}, nil
+			}
+		}
+	}
+
+	metadata, err := i.SynonymMetadata(key)
+	if err != nil {
+		return nil, fmt.Errorf("error reading synonym metadata from index file, err: %v", err)
+	}
+	if len(metadata) == 0 {
+		return nil, fmt.Errorf("no synonym metadata found")
+	}
+	synonymTokenFilter := NewSynonymTokenFilter(metadata, fuzziness, prefix)
+	return &analysis.ExtendedAnalyzer{
+		BaseAnalyzer:      analyzer,
+		ExtraTokenFilters: []analysis.TokenFilter{synonymTokenFilter},
+	}, nil
+}
diff --git a/analysis/type.go b/analysis/type.go
index e3a7f201b..b5947cb1a 100644
--- a/analysis/type.go
+++ b/analysis/type.go
@@ -35,6 +35,7 @@ const (
 	Double
 	Boolean
 	IP
+	Synonym
 )
 
 // Token represents one occurrence of a term at a particular location in a
@@ -106,6 +107,30 @@ type DateTimeParser interface {
 	ParseDateTime(string) (time.Time, string, error)
 }
 
+type SynonymSource interface {
+	Collection() string
+	Analyzer() string
+	MetadataKey() string
+}
+
 type ByteArrayConverter interface {
 	Convert([]byte) (interface{}, error)
 }
+
+type ExtendedAnalyzer struct {
+	BaseAnalyzer      Analyzer
+	ExtraTokenFilters []TokenFilter
+}
+
+func (atf *ExtendedAnalyzer) Analyze(input []byte) TokenStream {
+	var output TokenStream
+	if atf.BaseAnalyzer != nil {
+		output = atf.BaseAnalyzer.Analyze(input)
+	} else {
+		output = TokenStream{&Token{Term: input, Position: 1}}
+	}
+	for _, tf := range atf.ExtraTokenFilters {
+		output = tf.Filter(output)
+	}
+	return output
+}
diff --git a/builder.go b/builder.go
index 30285a2e4..18874af7d 100644
--- a/builder.go
+++ b/builder.go
@@ -15,6 +15,7 @@
 package bleve
 
 import (
+	"encoding/json"
 	"fmt"
 
 	"github.com/blevesearch/bleve/v2/document"
@@ -43,6 +44,40 @@ func (b *builderImpl) Index(id string, data interface{}) error {
 	return err
 }
 
+func (b *builderImpl) IndexSynonym(collection string, id string, data []byte) error {
+	if id == "" {
+		return ErrorEmptyID
+	}
+	if collection == "" {
+		return fmt.Errorf("collection cannot be empty")
+	}
+	var syn index.SynonymDefinition
+	err := json.Unmarshal(data, &syn)
+	if err != nil {
+		return fmt.Errorf("error parsing synonym definition: %v", err)
+	}
+
+	err = syn.Validate()
+	if err != nil {
+		return fmt.Errorf("error validating synonym definition: %v", err)
+	}
+
+	// the list of synonym sources in the index mapping
+	analyzers := b.m.AnalyzersForSynonymCollection(collection)
+	if analyzers == nil {
+		return fmt.Errorf("no synonym sources defined for collection: %s", collection)
+	}
+
+	// Create a new synonym document
+	doc := document.NewDocument(id)
+	for analyzerName, analyzer := range analyzers {
+		fieldName := index.CreateSynonymMetadataKey(collection, analyzerName)
+		doc.AddField(document.NewSynonymField(fieldName, &syn, analyzer, collection))
+	}
+	err = b.b.Index(doc)
+	return err
+}
+
 func (b *builderImpl) Close() error {
 	return b.b.Close()
 }
diff --git a/cmd/bleve/cmd/registry.go b/cmd/bleve/cmd/registry.go
index 9d5fc3f4e..3a3269149 100644
--- a/cmd/bleve/cmd/registry.go
+++ b/cmd/bleve/cmd/registry.go
@@ -54,6 +54,9 @@ var registryCmd = &cobra.Command{
 		types, instances = registry.DateTimeParserTypesAndInstances()
 		printType("Date Time Parser", types, instances)
 
+		types, instances = registry.SynonymSourceTypesAndInstances()
+		printType("Synonym Source", types, instances)
+
 		types, instances = registry.KVStoreTypesAndInstances()
 		printType("KV Store", types, instances)
diff --git a/config/config.go b/config/config.go
index 492b86f74..f894ffac3 100644
--- a/config/config.go
+++ b/config/config.go
@@ -79,6 +79,9 @@ import (
 	_ "github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/nanoseconds"
 	_ "github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/seconds"
 
+	// synonym sources
+	_ "github.com/blevesearch/bleve/v2/analysis/synonym"
+
 	// languages
 	_ "github.com/blevesearch/bleve/v2/analysis/lang/ar"
 	_ "github.com/blevesearch/bleve/v2/analysis/lang/bg"
diff --git a/document/field_synonym.go b/document/field_synonym.go
new file mode 100644
index 000000000..ab16f6558
--- /dev/null
+++ b/document/field_synonym.go
@@ -0,0 +1,97 @@
+// Copyright (c) 2023 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +package document + +import ( + "reflect" + + "github.com/blevesearch/bleve/v2/analysis" + "github.com/blevesearch/bleve/v2/analysis/token/synonym" + + "github.com/blevesearch/bleve/v2/size" + index "github.com/blevesearch/bleve_index_api" +) + +var reflectStaticSizeSynonymField int + +func init() { + var f SynonymField + reflectStaticSizeSynonymField = int(reflect.TypeOf(f).Size()) +} + +type SynonymField struct { + name string + synDef *index.SynonymDefinition + analyzedSynDef *index.SynonymDefinition + analyzer analysis.Analyzer + options index.FieldIndexingOptions + numPlainTextBytes uint64 + length int + value []byte +} + +func (n *SynonymField) Size() int { + return reflectStaticSizeSynonymField + size.SizeOfPtr + + len(n.name) +} + +func (n *SynonymField) Name() string { + return n.name +} + +func (n *SynonymField) NumPlainTextBytes() uint64 { + return n.numPlainTextBytes +} + +func (n *SynonymField) ArrayPositions() []uint64 { + return nil +} + +func (n *SynonymField) Options() index.FieldIndexingOptions { + return n.options +} + +func (n *SynonymField) Value() []byte { + return n.value +} + +func (n *SynonymField) EncodedFieldType() byte { + return 'y' +} + +func (n *SynonymField) AnalyzedLength() int { + return n.length +} + +func (n *SynonymField) AnalyzedTokenFrequencies() index.TokenFrequencies { + return nil +} + +func (n *SynonymField) Analyze() { + n.analyzedSynDef = synonym.Analyze(n.synDef, n.analyzer) +} + +func (n *SynonymField) AnalyzedSynonymDefinition() *index.SynonymDefinition { + return n.analyzedSynDef +} + +func NewSynonymField(metadataKey string, synDef *index.SynonymDefinition, analyzer analysis.Analyzer, collection string) *SynonymField { + return &SynonymField{ + name: metadataKey, + synDef: synDef, + analyzer: analyzer, + options: index.IndexField, + } +} diff --git a/go.mod b/go.mod index 5a922d8d0..fcc7209af 100644 --- a/go.mod +++ b/go.mod @@ -41,3 +41,11 @@ require ( github.com/spf13/pflag v1.0.5 // indirect golang.org/x/sys v0.5.0 // indirect ) + +replace github.com/blevesearch/vellum => ../vellum + +replace github.com/blevesearch/zapx/v15 => ../zap + +replace github.com/blevesearch/bleve_index_api => ../bleve_index_api + +replace github.com/blevesearch/scorch_segment_api/v2 => ../scorch_segment_api diff --git a/go.sum b/go.sum index 90ebfb29b..d5cb69747 100644 --- a/go.sum +++ b/go.sum @@ -2,8 +2,6 @@ github.com/RoaringBitmap/roaring v1.2.3 h1:yqreLINqIrX22ErkKI0vY47/ivtJr6n+kMhVO github.com/RoaringBitmap/roaring v1.2.3/go.mod h1:plvDsJQpxOC5bw8LRteu/MLWHsHez/3y6cubLI4/1yE= github.com/bits-and-blooms/bitset v1.2.0 h1:Kn4yilvwNtMACtf1eYDlG8H77R07mZSPbMjLyS07ChA= github.com/bits-and-blooms/bitset v1.2.0/go.mod h1:gIdJ4wp64HaoK2YrL1Q5/N7Y16edYb8uY+O0FJTyyDA= -github.com/blevesearch/bleve_index_api v1.0.6 h1:gyUUxdsrvmW3jVhhYdCVL6h9dCjNT/geNU7PxGn37p8= -github.com/blevesearch/bleve_index_api v1.0.6/go.mod h1:YXMDwaXFFXwncRS8UobWs7nvo0DmusriM1nztTlj1ms= github.com/blevesearch/geo v0.1.18 h1:Np8jycHTZ5scFe7VEPLrDoHnnb9C4j636ue/CGrhtDw= github.com/blevesearch/geo v0.1.18/go.mod h1:uRMGWG0HJYfWfFJpK3zTdnnr1K+ksZTuWKhXeSokfnM= github.com/blevesearch/go-metrics v0.0.0-20201227073835-cf1acfcdf475 h1:kDy+zgJFJJoJYBvdfBSiZYBbdsUL0XcjHYWezpQBGPA= @@ -17,8 +15,6 @@ github.com/blevesearch/gtreap v0.1.1/go.mod h1:QaQyDRAT51sotthUWAH4Sj08awFSSWzgY github.com/blevesearch/mmap-go v1.0.2/go.mod h1:ol2qBqYaOUsGdm7aRMRrYGgPvnwLe6Y+7LMvAB5IbSA= github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCDPWmc= github.com/blevesearch/mmap-go v1.0.4/go.mod 
h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs= -github.com/blevesearch/scorch_segment_api/v2 v2.1.6 h1:CdekX/Ob6YCYmeHzD72cKpwzBjvkOGegHOqhAkXp6yA= -github.com/blevesearch/scorch_segment_api/v2 v2.1.6/go.mod h1:nQQYlp51XvoSVxcciBjtvuHPIVjlWrN1hX4qwK2cqdc= github.com/blevesearch/segment v0.9.1 h1:+dThDy+Lvgj5JMxhmOVlgFfkUtZV2kw49xax4+jTfSU= github.com/blevesearch/segment v0.9.1/go.mod h1:zN21iLm7+GnBHWTao9I+Au/7MBiL8pPFtJBJTsk6kQw= github.com/blevesearch/snowball v0.6.1 h1:cDYjn/NCH+wwt2UdehaLpr2e4BwLIjN4V/TdLsL+B5A= @@ -29,8 +25,6 @@ github.com/blevesearch/stempel v0.2.0 h1:CYzVPaScODMvgE9o+kf6D4RJ/VRomyi9uHF+PtB github.com/blevesearch/stempel v0.2.0/go.mod h1:wjeTHqQv+nQdbPuJ/YcvOjTInA2EIc6Ks1FoSUzSLvc= github.com/blevesearch/upsidedown_store_api v1.0.2 h1:U53Q6YoWEARVLd1OYNc9kvhBMGZzVrdmaozG2MfoB+A= github.com/blevesearch/upsidedown_store_api v1.0.2/go.mod h1:M01mh3Gpfy56Ps/UXHjEO/knbqyQ1Oamg8If49gRwrQ= -github.com/blevesearch/vellum v1.0.10 h1:HGPJDT2bTva12hrHepVT3rOyIKFFF4t7Gf6yMxyMIPI= -github.com/blevesearch/vellum v1.0.10/go.mod h1:ul1oT0FhSMDIExNjIxHqJoGpVrBpKCdgDQNxfqgJt7k= github.com/blevesearch/zapx/v11 v11.3.10 h1:hvjgj9tZ9DeIqBCxKhi70TtSZYMdcFn7gDb71Xo/fvk= github.com/blevesearch/zapx/v11 v11.3.10/go.mod h1:0+gW+FaE48fNxoVtMY5ugtNHHof/PxCqh7CnhYdnMzQ= github.com/blevesearch/zapx/v12 v12.3.10 h1:yHfj3vXLSYmmsBleJFROXuO08mS3L1qDCdDK81jDl8s= @@ -39,8 +33,6 @@ github.com/blevesearch/zapx/v13 v13.3.10 h1:0KY9tuxg06rXxOZHg3DwPJBjniSlqEgVpxIq github.com/blevesearch/zapx/v13 v13.3.10/go.mod h1:w2wjSDQ/WBVeEIvP0fvMJZAzDwqwIEzVPnCPrz93yAk= github.com/blevesearch/zapx/v14 v14.3.10 h1:SG6xlsL+W6YjhX5N3aEiL/2tcWh3DO75Bnz77pSwwKU= github.com/blevesearch/zapx/v14 v14.3.10/go.mod h1:qqyuR0u230jN1yMmE4FIAuCxmahRQEOehF78m6oTgns= -github.com/blevesearch/zapx/v15 v15.3.13 h1:6EkfaZiPlAxqXz0neniq35my6S48QI94W/wyhnpDHHQ= -github.com/blevesearch/zapx/v15 v15.3.13/go.mod h1:Turk/TNRKj9es7ZpKK95PS7f6D44Y7fAFy8F4LXQtGg= github.com/couchbase/ghistogram v0.1.0 h1:b95QcQTCzjTUocDXp/uMgSNQi8oj1tGwnJ4bODWZnps= github.com/couchbase/ghistogram v0.1.0/go.mod h1:s1Jhy76zqfEecpNWJfWUiKZookAFaiGOEoyzgHt9i7k= github.com/couchbase/moss v0.2.0 h1:VCYrMzFwEryyhRSeI+/b3tRBSeTpi/8gn5Kf6dxqn+o= diff --git a/index.go b/index.go index 7d4c9be9b..2a8e59452 100644 --- a/index.go +++ b/index.go @@ -16,6 +16,8 @@ package bleve import ( "context" + "encoding/json" + "fmt" "github.com/blevesearch/bleve/v2/index/upsidedown" @@ -60,6 +62,45 @@ func (b *Batch) Index(id string, data interface{}) error { return nil } +func (b *Batch) IndexSynonym(collection string, id string, data []byte) error { + if id == "" { + return ErrorEmptyID + } + if collection == "" { + return fmt.Errorf("collection cannot be empty") + } + var syn index.SynonymDefinition + err := json.Unmarshal(data, &syn) + if err != nil { + return fmt.Errorf("error parsing synonym definition: %v", err) + } + + err = syn.Validate() + if err != nil { + return fmt.Errorf("error validating synonym definition: %v", err) + } + + // the list of synonym sources in the index mapping + analyzers := b.index.Mapping().AnalyzersForSynonymCollection(collection) + if analyzers == nil { + return fmt.Errorf("no synonym sources defined for collection: %s", collection) + } + + // Create a new synonym document + doc := document.NewDocument(id) + for analyzerName, analyzer := range analyzers { + fieldName := index.CreateSynonymMetadataKey(collection, analyzerName) + doc.AddField(document.NewSynonymField(fieldName, &syn, analyzer, collection)) + } + b.internal.Update(doc) + + b.lastDocSize = 
uint64(doc.Size() + + len(id) + size.SizeOfString) // overhead from internal + b.totalSize += b.lastDocSize + + return nil +} + func (b *Batch) LastDocSize() uint64 { return b.lastDocSize } @@ -211,6 +252,7 @@ type Index interface { // requests. See Index interface documentation for details about mapping // rules. Index(id string, data interface{}) error + IndexSynonym(collection string, id string, data []byte) error Delete(id string) error NewBatch() *Batch @@ -300,6 +342,7 @@ func OpenUsing(path string, runtimeConfig map[string]interface{}) (Index, error) // indexed only once. type Builder interface { Index(id string, data interface{}) error + IndexSynonym(collection string, id string, data []byte) error Close() error } diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index f30d795e9..da881d26b 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -361,7 +361,7 @@ func (s *Scorch) Delete(id string) error { return s.Batch(b) } -// Batch applices a batch of changes to the index atomically +// Batch applies a batch of changes to the index atomically func (s *Scorch) Batch(batch *index.Batch) (err error) { start := time.Now() diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 59828e875..07b56002d 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -47,6 +47,8 @@ type asynchSegmentResult struct { postings segment.PostingsList + synonymMetadata *index.SynonymMetadata + err error } @@ -212,6 +214,73 @@ func (is *IndexSnapshot) FieldDict(field string) (index.FieldDict, error) { }, false) } +func isValid(deleted *roaring.Bitmap, docNums map[uint32]struct{}) bool { + if deleted == nil { + return true + } + for docNum := range docNums { + if !deleted.Contains(docNum) { + return true + } + } + return false +} + +func cleanSynonymMetadata(s *index.SynonymMetadata, b *roaring.Bitmap) { + if s == nil { + return + } + for hash, synonyms := range s.HashToSynonyms { + // fast path -> if the LHS' DocNums is not in the bitmap, then + // set lhsStruct Invalid as true. 
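+		// put differently: an LHS phrase (and, below, each RHS synonym) stays
+		// usable only while at least one document that contributed it is still
+		// live in this segment; isValid reports whether any of the entry's doc
+		// numbers survive the deleted bitmap.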
+		lhsStruct := s.HashToPhrase[hash]
+		if !isValid(b, lhsStruct.DocNums) {
+			lhsStruct.IsInvalid = true
+			continue
+		}
+		// lhs struct is valid
+		for _, syn := range synonyms {
+			// check all rhs structs now
+			if !isValid(b, syn.DocNums) {
+				syn.IsInvalid = true
+			}
+		}
+	}
+}
+
+func (is *IndexSnapshot) SynonymMetadata(metadataKey string) ([]*index.SynonymMetadata, error) {
+	results := make(chan *asynchSegmentResult)
+	for _, s := range is.segment {
+		go func(s *SegmentSnapshot) {
+			synSeg, ok := s.segment.(segment.SynonymSegment)
+			if !ok {
+				// segment is not a synonym segment, so just skip it
+				results <- &asynchSegmentResult{synonymMetadata: nil, err: nil}
+				return
+			}
+			metadata, err := synSeg.SynonymMetadata(metadataKey)
+			cleanSynonymMetadata(metadata, s.Deleted())
+			results <- &asynchSegmentResult{synonymMetadata: metadata, err: err}
+		}(s)
+	}
+	rv := make([]*index.SynonymMetadata, 0, len(is.segment))
+	var err error
+	for count := 0; count < len(is.segment); count++ {
+		asr := <-results
+		if asr.err != nil {
+			if err == nil {
+				err = asr.err
+			}
+		} else if asr.synonymMetadata != nil {
+			rv = append(rv, asr.synonymMetadata)
+		}
+	}
+	if err != nil {
+		return nil, err
+	}
+	return rv, nil
+}
+
 // calculateExclusiveEndFromInclusiveEnd produces the next key
 // when sorting using memcmp style comparisons, suitable to
 // use as the end key in a traditional (inclusive, exclusive]
diff --git a/index/upsidedown/index_reader.go b/index/upsidedown/index_reader.go
index 44ccf591a..c7f138db2 100644
--- a/index/upsidedown/index_reader.go
+++ b/index/upsidedown/index_reader.go
@@ -20,7 +20,7 @@ import (
 
 	"github.com/blevesearch/bleve/v2/document"
 	index "github.com/blevesearch/bleve_index_api"
-	"github.com/blevesearch/upsidedown_store_api"
+	store "github.com/blevesearch/upsidedown_store_api"
 )
 
 var reflectStaticSizeIndexReader int
@@ -64,6 +64,10 @@ func (i *IndexReader) DocIDReaderAll() (index.DocIDReader, error) {
 	return newUpsideDownCouchDocIDReader(i)
 }
 
+func (i *IndexReader) SynonymMetadata(field string) ([]*index.SynonymMetadata, error) {
+	return nil, nil
+}
+
 func (i *IndexReader) DocIDReaderOnly(ids []string) (index.DocIDReader, error) {
 	return newUpsideDownCouchDocIDReaderOnly(i, ids)
 }
diff --git a/index_alias_impl.go b/index_alias_impl.go
index a73dd6b8f..183cd35e2 100644
--- a/index_alias_impl.go
+++ b/index_alias_impl.go
@@ -76,6 +76,21 @@ func (i *indexAliasImpl) Index(id string, data interface{}) error {
 	return i.indexes[0].Index(id, data)
 }
 
+func (i *indexAliasImpl) IndexSynonym(collection string, id string, data []byte) error {
+	i.mutex.RLock()
+	defer i.mutex.RUnlock()
+
+	if !i.open {
+		return ErrorIndexClosed
+	}
+
+	err := i.isAliasToSingleIndex()
+	if err != nil {
+		return err
+	}
+	return i.indexes[0].IndexSynonym(collection, id, data)
+}
+
 func (i *indexAliasImpl) Delete(id string) error {
 	i.mutex.RLock()
 	defer i.mutex.RUnlock()
@@ -279,12 +294,12 @@ func (i *indexAliasImpl) Mapping() mapping.IndexMapping {
 		return nil
 	}
 
-	err := i.isAliasToSingleIndex()
-	if err != nil {
-		return nil
+	for _, index := range i.indexes {
+		if index.Mapping() != nil {
+			return index.Mapping()
+		}
 	}
-
-	return i.indexes[0].Mapping()
+	return nil
 }
 
 func (i *indexAliasImpl) Stats() *IndexStat {
@@ -471,7 +486,7 @@ func MultiSearch(ctx context.Context, req *SearchRequest, indexes ...Index) (*Se
 
 	// run search on each index in separate go routine
 	var waitGroup sync.WaitGroup
 
-	var searchChildIndex = func(in Index, childReq *SearchRequest) {
+	var searchChildIndex = func(idx int, in Index, childReq *SearchRequest) {
 		rv := asyncSearchResult{Name: in.Name()}
 		rv.Result, rv.Err = in.SearchInContext(ctx, childReq)
 		asyncResults <- &rv
@@ -479,8 +494,8 @@ func MultiSearch(ctx context.Context, req *SearchRequest, indexes ...Index) (*Se
 	}
 
 	waitGroup.Add(len(indexes))
-	for _, in := range indexes {
-		go searchChildIndex(in, createChildSearchRequest(req))
+	for idx, in := range indexes {
+		go searchChildIndex(idx, in, createChildSearchRequest(req))
 	}
 
 	// on another go routine, close after finished
diff --git a/index_alias_impl_test.go b/index_alias_impl_test.go
index 1b6ae55f4..f9d95fa12 100644
--- a/index_alias_impl_test.go
+++ b/index_alias_impl_test.go
@@ -1249,6 +1249,9 @@ type stubIndex struct {
 func (i *stubIndex) Index(id string, data interface{}) error {
 	return i.err
 }
+func (i *stubIndex) IndexSynonym(collection string, id string, data []byte) error {
+	return i.err
+}
 
 func (i *stubIndex) Delete(id string) error {
 	return i.err
diff --git a/index_impl.go b/index_impl.go
index d5f34a2a3..336c2c1dd 100644
--- a/index_impl.go
+++ b/index_impl.go
@@ -16,6 +16,7 @@ package bleve
 
 import (
 	"context"
+	"encoding/json"
 	"fmt"
 	"io"
 	"os"
@@ -57,6 +58,9 @@ var mappingInternalKey = []byte("_mapping")
 const SearchQueryStartCallbackKey = "_search_query_start_callback_key"
 const SearchQueryEndCallbackKey = "_search_query_end_callback_key"
 
+const SearchSynonymMetadataKey = "_search_synonym_metadata_key"
+const UseSynonymMetadataKey = "_use_synonym_metadata_key"
+
 type SearchQueryStartCallbackFn func(size uint64) error
 type SearchQueryEndCallbackFn func(size uint64) error
 
@@ -261,6 +265,48 @@ func (i *indexImpl) Index(id string, data interface{}) (err error) {
 	return
 }
 
+func (i *indexImpl) IndexSynonym(collection string, id string, data []byte) (err error) {
+	if id == "" {
+		return ErrorEmptyID
+	}
+	if collection == "" {
+		return fmt.Errorf("collection cannot be empty")
+	}
+
+	i.mutex.RLock()
+	defer i.mutex.RUnlock()
+
+	if !i.open {
+		return ErrorIndexClosed
+	}
+	var syn index.SynonymDefinition
+	err = json.Unmarshal(data, &syn)
+	if err != nil {
+		err = fmt.Errorf("error parsing synonym definition: %v", err)
+		return
+	}
+	err = syn.Validate()
+	if err != nil {
+		err = fmt.Errorf("error validating synonym definition: %v", err)
+		return
+	}
+
+	// the list of synonym sources in the index mapping
+	analyzers := i.m.AnalyzersForSynonymCollection(collection)
+	if analyzers == nil {
+		return fmt.Errorf("no synonym sources defined for collection: %s", collection)
+	}
+
+	// Create a new synonym document
+	doc := document.NewDocument(id)
+	for analyzerName, analyzer := range analyzers {
+		fieldName := index.CreateSynonymMetadataKey(collection, analyzerName)
+		doc.AddField(document.NewSynonymField(fieldName, &syn, analyzer, collection))
+	}
+	err = i.i.Update(doc)
+	return err
+}
+
 // IndexAdvanced takes a document.Document object
 // skips the mapping and indexes it.
func (i *indexImpl) IndexAdvanced(doc *document.Document) (err error) { @@ -444,7 +490,39 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr if !i.open { return nil, ErrorIndexClosed } - + if searchSynMD := ctx.Value(SearchSynonymMetadataKey); searchSynMD != nil { + if synSources, ok := searchSynMD.([]string); ok { + rv := make(map[string][]*index.SynonymMetadata) + indexReader, err := i.i.Reader() + if err != nil { + return nil, fmt.Errorf("error opening index reader %v", err) + } + defer func() { + if cerr := indexReader.Close(); err == nil && cerr != nil { + err = cerr + } + }() + for _, src := range synSources { + synOb := i.m.SynonymSourceNamed(src) + if synOb == nil { + return nil, fmt.Errorf("no synonym source named '%s' registered", src) + } + synonyms, err := indexReader.SynonymMetadata(synOb.MetadataKey()) + if err != nil { + return nil, err + } + rv[src] = synonyms + } + return &SearchResult{ + Status: &SearchStatus{ + Total: 1, + Successful: 1, + }, + Request: req, + SynonymMetadata: rv, + }, nil + } + } var reverseQueryExecution bool if req.SearchBefore != nil { reverseQueryExecution = true diff --git a/mapping/analysis.go b/mapping/analysis.go index 03e3cd01b..311e97232 100644 --- a/mapping/analysis.go +++ b/mapping/analysis.go @@ -21,6 +21,7 @@ type customAnalysis struct { TokenFilters map[string]map[string]interface{} `json:"token_filters,omitempty"` Analyzers map[string]map[string]interface{} `json:"analyzers,omitempty"` DateTimeParsers map[string]map[string]interface{} `json:"date_time_parsers,omitempty"` + SynonymSources map[string]map[string]interface{} `json:"synonym_sources,omitempty"` } func (c *customAnalysis) registerAll(i *IndexMappingImpl) error { @@ -83,6 +84,12 @@ func (c *customAnalysis) registerAll(i *IndexMappingImpl) error { return err } } + for name, config := range c.SynonymSources { + _, err := i.cache.DefineSynonymSource(name, config) + if err != nil { + return err + } + } return nil } @@ -94,6 +101,7 @@ func newCustomAnalysis() *customAnalysis { TokenFilters: make(map[string]map[string]interface{}), Analyzers: make(map[string]map[string]interface{}), DateTimeParsers: make(map[string]map[string]interface{}), + SynonymSources: make(map[string]map[string]interface{}), } return &rv } diff --git a/mapping/document.go b/mapping/document.go index aacaa0a55..92773d786 100644 --- a/mapping/document.go +++ b/mapping/document.go @@ -77,6 +77,16 @@ func (dm *DocumentMapping) Validate(cache *registry.Cache) error { return err } } + if field.SynonymSource != "" { + synSrc, err := cache.SynonymSourceNamed(field.SynonymSource) + if err != nil { + return err + } + _, err = cache.AnalyzerNamed(synSrc.Analyzer()) + if err != nil { + return err + } + } switch field.Type { case "text", "datetime", "number", "boolean", "geopoint", "geoshape", "IP": default: @@ -97,6 +107,14 @@ func (dm *DocumentMapping) analyzerNameForPath(path string) string { return "" } +func (dm *DocumentMapping) synonymSourceNameForPath(path string) string { + field := dm.fieldDescribedByPath(path) + if field != nil { + return field.SynonymSource + } + return "" +} + func (dm *DocumentMapping) fieldDescribedByPath(path string) *FieldMapping { pathElements := decodePath(path) if len(pathElements) > 1 { diff --git a/mapping/field.go b/mapping/field.go index 82d51f317..c057d3eb5 100644 --- a/mapping/field.go +++ b/mapping/field.go @@ -69,6 +69,8 @@ type FieldMapping struct { // the processing of freq/norm details when the default score based relevancy // isn't needed. 
SkipFreqNorm bool `json:"skip_freq_norm,omitempty"` + + SynonymSource string `json:"synonym_source,omitempty"` } // NewTextFieldMapping returns a default field mapping for text @@ -448,6 +450,11 @@ func (fm *FieldMapping) UnmarshalJSON(data []byte) error { if err != nil { return err } + case "synonym_source": + err := json.Unmarshal(v, &fm.SynonymSource) + if err != nil { + return err + } default: invalidKeys = append(invalidKeys, k) } diff --git a/mapping/index.go b/mapping/index.go index 0de4147a4..2002d5708 100644 --- a/mapping/index.go +++ b/mapping/index.go @@ -21,6 +21,7 @@ import ( "github.com/blevesearch/bleve/v2/analysis" "github.com/blevesearch/bleve/v2/analysis/analyzer/standard" "github.com/blevesearch/bleve/v2/analysis/datetime/optional" + "github.com/blevesearch/bleve/v2/analysis/synonym" "github.com/blevesearch/bleve/v2/document" "github.com/blevesearch/bleve/v2/registry" "github.com/blevesearch/bleve/v2/util" @@ -34,6 +35,7 @@ const defaultType = "_default" const defaultField = "_all" const defaultAnalyzer = standard.Name const defaultDateTimeParser = optional.Name +const defaultSynonymSource = synonym.Name // An IndexMappingImpl controls how objects are placed // into an index. @@ -145,6 +147,16 @@ func (im *IndexMappingImpl) AddCustomDateTimeParser(name string, config map[stri return nil } +// AddSynonymSource defines a custom synonym source for use in this mapping +func (im *IndexMappingImpl) AddSynonymSource(name string, config map[string]interface{}) error { + _, err := im.cache.DefineSynonymSource(name, config) + if err != nil { + return err + } + im.CustomAnalysis.SynonymSources[name] = config + return nil +} + // NewIndexMapping creates a new IndexMapping that will use all the default indexing rules func NewIndexMapping() *IndexMappingImpl { return &IndexMappingImpl{ @@ -337,6 +349,14 @@ func (im *IndexMappingImpl) MapDocument(doc *document.Document, data interface{} return nil } +func (im *IndexMappingImpl) AnalyzersForSynonymCollection(collection string) map[string]analysis.Analyzer { + return im.cache.AnalyzersForSynonymCollection(collection) +} + +func (im *IndexMappingImpl) SynonymCollections() []string { + return im.cache.SynonymCollections() +} + type walkContext struct { doc *document.Document im *IndexMappingImpl @@ -436,3 +456,33 @@ func (im *IndexMappingImpl) FieldAnalyzer(field string) string { func (im *IndexMappingImpl) DefaultSearchField() string { return im.DefaultField } + +func (im *IndexMappingImpl) SynonymSourceNameForPath(path string) string { + //look for explicit mapping on the field + for _, docMapping := range im.TypeMapping { + synonymSourceName := docMapping.synonymSourceNameForPath(path) + if synonymSourceName != "" { + return synonymSourceName + } + } + + // now try the default mapping + pathMapping, _ := im.DefaultMapping.documentMappingForPath(path) + if pathMapping != nil { + if len(pathMapping.Fields) > 0 { + if pathMapping.Fields[0].SynonymSource != "" { + return pathMapping.Fields[0].SynonymSource + } + } + } + return "" +} + +func (im *IndexMappingImpl) SynonymSourceNamed(name string) analysis.SynonymSource { + synonymSource, err := im.cache.SynonymSourceNamed(name) + if err != nil { + logger.Printf("error using synonym source named: %s", name) + return nil + } + return synonymSource +} diff --git a/mapping/mapping.go b/mapping/mapping.go index a3e5a54e0..e10c6f31e 100644 --- a/mapping/mapping.go +++ b/mapping/mapping.go @@ -55,4 +55,8 @@ type IndexMapping interface { AnalyzerNameForPath(path string) string AnalyzerNamed(name 
string) analysis.Analyzer + + SynonymSourceNameForPath(path string) string + SynonymSourceNamed(name string) analysis.SynonymSource + AnalyzersForSynonymCollection(name string) map[string]analysis.Analyzer } diff --git a/mapping/mapping_test.go b/mapping/mapping_test.go index e0151af7a..2dbc33a75 100644 --- a/mapping/mapping_test.go +++ b/mapping/mapping_test.go @@ -17,12 +17,13 @@ package mapping import ( "encoding/json" "fmt" - index "github.com/blevesearch/bleve_index_api" "reflect" "strconv" "testing" "time" + index "github.com/blevesearch/bleve_index_api" + "github.com/blevesearch/bleve/v2/analysis/tokenizer/exception" "github.com/blevesearch/bleve/v2/analysis/tokenizer/regexp" "github.com/blevesearch/bleve/v2/document" diff --git a/registry/index_type.go b/registry/index_type.go index 67938c4af..5d0bcd296 100644 --- a/registry/index_type.go +++ b/registry/index_type.go @@ -38,7 +38,7 @@ func IndexTypeConstructorByName(name string) IndexTypeConstructor { func IndexTypesAndInstances() ([]string, []string) { var types []string var instances []string - for name := range stores { + for name := range indexTypes { types = append(types, name) } return types, instances diff --git a/registry/registry.go b/registry/registry.go index 1954d0896..eaeb94162 100644 --- a/registry/registry.go +++ b/registry/registry.go @@ -36,6 +36,7 @@ var tokenMaps = make(TokenMapRegistry, 0) var tokenFilters = make(TokenFilterRegistry, 0) var analyzers = make(AnalyzerRegistry, 0) var dateTimeParsers = make(DateTimeParserRegistry, 0) +var synonymSources = make(SynonymSourceRegistry, 0) type Cache struct { CharFilters *CharFilterCache @@ -44,6 +45,7 @@ type Cache struct { TokenFilters *TokenFilterCache Analyzers *AnalyzerCache DateTimeParsers *DateTimeParserCache + SynonymSources *SynonymSourceCache FragmentFormatters *FragmentFormatterCache Fragmenters *FragmenterCache Highlighters *HighlighterCache @@ -57,6 +59,7 @@ func NewCache() *Cache { TokenFilters: NewTokenFilterCache(), Analyzers: NewAnalyzerCache(), DateTimeParsers: NewDateTimeParserCache(), + SynonymSources: NewSynonymSourceCache(), FragmentFormatters: NewFragmentFormatterCache(), Fragmenters: NewFragmenterCache(), Highlighters: NewHighlighterCache(), @@ -147,6 +150,44 @@ func (c *Cache) DefineDateTimeParser(name string, config map[string]interface{}) return c.DateTimeParsers.DefineDateTimeParser(name, typ, config, c) } +func (c *Cache) SynonymSourceNamed(name string) (analysis.SynonymSource, error) { + return c.SynonymSources.SynonymSourceNamed(name, c) +} + +func (c *Cache) DefineSynonymSource(name string, config map[string]interface{}) (analysis.SynonymSource, error) { + typ, err := typeFromConfig(config) + if err != nil { + return nil, err + } + return c.SynonymSources.DefineSynonymSource(name, typ, config, c) +} + +func (c *Cache) AnalyzersForSynonymCollection(collection string) map[string]analysis.Analyzer { + if c.SynonymSources == nil { + // no synonym sources defined + return nil + } + analyzerNames := c.SynonymSources.analyzersForCollection(collection) + if len(analyzerNames) == 0 { + // no synonym sources for this collection + return nil + } + rv := make(map[string]analysis.Analyzer) + for _, analyzerName := range analyzerNames { + analyzer, _ := c.AnalyzerNamed(analyzerName) + rv[analyzerName] = analyzer + } + return rv +} + +func (c *Cache) SynonymCollections() []string { + if c.SynonymSources == nil { + // no synonym sources defined + return nil + } + return c.SynonymSources.SynonymCollections() +} + func (c *Cache) 
FragmentFormatterNamed(name string) (highlight.FragmentFormatter, error) { return c.FragmentFormatters.FragmentFormatterNamed(name, c) } diff --git a/registry/synonym_source.go b/registry/synonym_source.go new file mode 100644 index 000000000..a5124dd20 --- /dev/null +++ b/registry/synonym_source.go @@ -0,0 +1,117 @@ +// Copyright (c) 2023 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package registry + +import ( + "fmt" + + "github.com/blevesearch/bleve/v2/analysis" +) + +func RegisterSynonymSource(name string, constructor SynonymSourceConstructor) { + _, exists := synonymSources[name] + if exists { + panic(fmt.Errorf("attempted to register duplicate synonym source named '%s'", name)) + } + synonymSources[name] = constructor +} + +type SynonymSourceConstructor func(config map[string]interface{}, cache *Cache) (analysis.SynonymSource, error) +type SynonymSourceRegistry map[string]SynonymSourceConstructor + +type SynonymSourceCache struct { + *ConcurrentCache +} + +func NewSynonymSourceCache() *SynonymSourceCache { + return &SynonymSourceCache{ + NewConcurrentCache(), + } +} + +func SynonymSourceBuild(name string, config map[string]interface{}, cache *Cache) (interface{}, error) { + cons, registered := synonymSources[name] + if !registered { + return nil, fmt.Errorf("no synonym source with name or type '%s' registered", name) + } + synonymSource, err := cons(config, cache) + if err != nil { + return nil, fmt.Errorf("error building synonym source: %v", err) + } + return synonymSource, nil +} + +func (c *SynonymSourceCache) SynonymSourceNamed(name string, cache *Cache) (analysis.SynonymSource, error) { + item, err := c.ItemNamed(name, cache, SynonymSourceBuild) + if err != nil { + return nil, err + } + return item.(analysis.SynonymSource), nil +} + +func (c *SynonymSourceCache) analyzersForCollection(collection string) []string { + c.mutex.RLock() + defer c.mutex.RUnlock() + rv := make([]string, 0, len(c.data)) + for _, synSource := range c.data { + if synSource.(analysis.SynonymSource).Collection() == collection { + rv = append(rv, synSource.(analysis.SynonymSource).Analyzer()) + } + } + return rv +} + +func (c *SynonymSourceCache) SynonymCollections() []string { + c.mutex.RLock() + defer c.mutex.RUnlock() + collectionSet := make(map[string]struct{}) + rv := make([]string, 0, len(c.data)) + for _, synSource := range c.data { + collection := synSource.(analysis.SynonymSource).Collection() + if _, ok := collectionSet[collection]; !ok { + collectionSet[collection] = struct{}{} + rv = append(rv, synSource.(analysis.SynonymSource).Collection()) + } + } + return rv +} + +func (c *SynonymSourceCache) DefineSynonymSource(name string, typ string, config map[string]interface{}, cache *Cache) (analysis.SynonymSource, error) { + item, err := c.DefineItem(name, typ, config, cache, SynonymSourceBuild) + if err != nil { + if err == ErrAlreadyDefined { + return nil, fmt.Errorf("synonym source named '%s' already defined", name) + } + return nil, err + } + + return 
item.(analysis.SynonymSource), nil +} + +func SynonymSourceTypesAndInstances() ([]string, []string) { + emptyConfig := map[string]interface{}{} + emptyCache := NewCache() + var types []string + var instances []string + for name, cons := range synonymSources { + _, err := cons(emptyConfig, emptyCache) + if err == nil { + instances = append(instances, name) + } else { + types = append(types, name) + } + } + return types, instances +} diff --git a/search.go b/search.go index 8ca0310fb..26aa5f761 100644 --- a/search.go +++ b/search.go @@ -30,6 +30,7 @@ import ( "github.com/blevesearch/bleve/v2/search/query" "github.com/blevesearch/bleve/v2/size" "github.com/blevesearch/bleve/v2/util" + index "github.com/blevesearch/bleve_index_api" ) const defaultDateTimeParser = optional.Name @@ -540,14 +541,15 @@ func (ss *SearchStatus) Merge(other *SearchStatus) { // Took - The time taken to execute the search. // Facets - The facet results for the search. type SearchResult struct { - Status *SearchStatus `json:"status"` - Request *SearchRequest `json:"request"` - Hits search.DocumentMatchCollection `json:"hits"` - Total uint64 `json:"total_hits"` - Cost uint64 `json:"cost"` - MaxScore float64 `json:"max_score"` - Took time.Duration `json:"took"` - Facets search.FacetResults `json:"facets"` + Status *SearchStatus `json:"status"` + Request *SearchRequest `json:"request"` + Hits search.DocumentMatchCollection `json:"hits"` + Total uint64 `json:"total_hits"` + Cost uint64 `json:"cost"` + MaxScore float64 `json:"max_score"` + Took time.Duration `json:"took"` + Facets search.FacetResults `json:"facets"` + SynonymMetadata map[string][]*index.SynonymMetadata `json:"synonym_metadata,omitempty"` } func (sr *SearchResult) Size() int { @@ -618,6 +620,14 @@ func (sr *SearchResult) String() string { // Merge will merge together multiple SearchResults during a MultiSearch func (sr *SearchResult) Merge(other *SearchResult) { sr.Status.Merge(other.Status) + if other.SynonymMetadata != nil { + if sr.SynonymMetadata == nil { + sr.SynonymMetadata = make(map[string][]*index.SynonymMetadata) + } + for k, v := range other.SynonymMetadata { + sr.SynonymMetadata[k] = append(sr.SynonymMetadata[k], v...) + } + } sr.Hits = append(sr.Hits, other.Hits...) sr.Total += other.Total sr.Cost += other.Cost diff --git a/search/collector/search_test.go b/search/collector/search_test.go index 1f6f88213..8973ac808 100644 --- a/search/collector/search_test.go +++ b/search/collector/search_test.go @@ -102,6 +102,10 @@ func (sr *stubReader) Size() int { return 0 } +func (sr *stubReader) SynonymMetadata(field string) ([]*index.SynonymMetadata, error) { + return nil, nil +} + func (sr *stubReader) TermFieldReader(ctx context.Context, term []byte, field string, includeFreq, includeNorm, includeTermVectors bool) (index.TermFieldReader, error) { return nil, nil } diff --git a/search/query/boolean.go b/search/query/boolean.go index 026a58688..8c19b79ea 100644 --- a/search/query/boolean.go +++ b/search/query/boolean.go @@ -198,6 +198,19 @@ func (q *BooleanQuery) Validate() error { } return nil } +func (q *BooleanQuery) SynonymSourceName(m mapping.IndexMapping) []string { + rv := []string{} + if qm, ok := q.Must.(SynonymSearchEnabledQuery); ok { + rv = append(rv, qm.SynonymSourceName(m)...) + } + if qs, ok := q.Should.(SynonymSearchEnabledQuery); ok { + rv = append(rv, qs.SynonymSourceName(m)...) + } + if qmn, ok := q.MustNot.(SynonymSearchEnabledQuery); ok { + rv = append(rv, qmn.SynonymSourceName(m)...) 
diff --git a/search/collector/search_test.go b/search/collector/search_test.go
index 1f6f88213..8973ac808 100644
--- a/search/collector/search_test.go
+++ b/search/collector/search_test.go
@@ -102,6 +102,10 @@ func (sr *stubReader) Size() int {
 	return 0
 }
 
+func (sr *stubReader) SynonymMetadata(field string) ([]*index.SynonymMetadata, error) {
+	return nil, nil
+}
+
 func (sr *stubReader) TermFieldReader(ctx context.Context, term []byte, field string, includeFreq, includeNorm, includeTermVectors bool) (index.TermFieldReader, error) {
 	return nil, nil
 }
diff --git a/search/query/boolean.go b/search/query/boolean.go
index 026a58688..8c19b79ea 100644
--- a/search/query/boolean.go
+++ b/search/query/boolean.go
@@ -198,6 +198,20 @@ func (q *BooleanQuery) Validate() error {
 	}
 	return nil
 }
+
+func (q *BooleanQuery) SynonymSourceName(m mapping.IndexMapping) []string {
+	rv := []string{}
+	if qm, ok := q.Must.(SynonymSearchEnabledQuery); ok {
+		rv = append(rv, qm.SynonymSourceName(m)...)
+	}
+	if qs, ok := q.Should.(SynonymSearchEnabledQuery); ok {
+		rv = append(rv, qs.SynonymSourceName(m)...)
+	}
+	if qmn, ok := q.MustNot.(SynonymSearchEnabledQuery); ok {
+		rv = append(rv, qmn.SynonymSourceName(m)...)
+	}
+	return rv
+}
 
 func (q *BooleanQuery) UnmarshalJSON(data []byte) error {
 	tmp := struct {
diff --git a/search/query/conjunction.go b/search/query/conjunction.go
index 0565e18f7..15bb2657f 100644
--- a/search/query/conjunction.go
+++ b/search/query/conjunction.go
@@ -54,6 +54,16 @@ func (q *ConjunctionQuery) AddQuery(aq ...Query) {
 	}
 }
 
+func (q *ConjunctionQuery) SynonymSourceName(m mapping.IndexMapping) []string {
+	rv := []string{}
+	for _, conjunct := range q.Conjuncts {
+		if conjunct, ok := conjunct.(SynonymSearchEnabledQuery); ok {
+			rv = append(rv, conjunct.SynonymSourceName(m)...)
+		}
+	}
+	return rv
+}
+
 func (q *ConjunctionQuery) Searcher(ctx context.Context, i index.IndexReader, m mapping.IndexMapping, options search.SearcherOptions) (search.Searcher, error) {
 	ss := make([]search.Searcher, 0, len(q.Conjuncts))
 	for _, conjunct := range q.Conjuncts {
diff --git a/search/query/disjunction.go b/search/query/disjunction.go
index f8573d081..fa4ae8d11 100644
--- a/search/query/disjunction.go
+++ b/search/query/disjunction.go
@@ -60,6 +60,16 @@ func (q *DisjunctionQuery) SetMin(m float64) {
 	q.Min = m
 }
 
+func (q *DisjunctionQuery) SynonymSourceName(m mapping.IndexMapping) []string {
+	rv := []string{}
+	for _, disjunct := range q.Disjuncts {
+		if disjunct, ok := disjunct.(SynonymSearchEnabledQuery); ok {
+			rv = append(rv, disjunct.SynonymSourceName(m)...)
+		}
+	}
+	return rv
+}
+
 func (q *DisjunctionQuery) Searcher(ctx context.Context, i index.IndexReader, m mapping.IndexMapping, options search.SearcherOptions) (search.Searcher, error) {
 	ss := make([]search.Searcher, 0, len(q.Disjuncts))
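Composite queries simply union whatever their children report, so one synonym-enabled leaf anywhere in the tree is enough to switch that leaf to the synonym path. An illustrative sketch, using the explicit per-query override so no field-level synonym mapping needs to be assumed:

```go
package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/mapping"
	"github.com/blevesearch/bleve/v2/search/query"
)

func main() {
	imap := mapping.NewIndexMapping()

	mq := query.NewMatchQuery("apple")
	mq.SetField("fruit")
	mq.SetSynonymSource("fruitSyn") // explicit override; otherwise the field mapping is consulted

	// The boolean query aggregates the synonym sources of its children.
	bq := query.NewBooleanQuery([]query.Query{mq}, nil, nil)
	fmt.Println(bq.SynonymSourceName(imap)) // [fruitSyn]
}
```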
return nil, fmt.Errorf("no analyzer named '%s' registered", q.Analyzer) } + // short circuit fuzziness value validation + if q.Fuzziness > searcher.MaxFuzziness { + return nil, fmt.Errorf("fuzziness exceeds max (%d)", searcher.MaxFuzziness) + } + if q.Fuzziness < 0 { + return nil, fmt.Errorf("invalid fuzziness, negative") + } + + synonymSourceName := q.SynonymSourceName(m) + if len(synonymSourceName) > 0 { + // Switch to Synonym Search + synonymSource := m.SynonymSourceNamed(synonymSourceName[0]) + if synonymSource == nil { + return nil, fmt.Errorf("no synonym source named '%s' registered", synonymSourceName) + } + + analyzer, err := synonym.AddSynonymFilter(ctx, analyzer, i, synonymSource.MetadataKey(), synonymSourceName[0], q.Fuzziness, q.Prefix) + if err != nil { + return nil, err + } + + tokens := analyzer.Analyze([]byte(q.Match)) + if len(tokens) > 0 { + return searcher.NewSynonymSearcher(ctx, i, tokens, field, q.BoostVal.Value(), q.Fuzziness, q.Prefix, int(q.Operator), options) + } + + noneQuery := NewMatchNoneQuery() + return noneQuery.Searcher(ctx, i, m, options) + } + + // Default to non-synonym search tokens := analyzer.Analyze([]byte(q.Match)) if len(tokens) > 0 { diff --git a/search/query/phrase.go b/search/query/phrase.go index 9092e72d0..7130fdc92 100644 --- a/search/query/phrase.go +++ b/search/query/phrase.go @@ -26,10 +26,11 @@ import ( ) type PhraseQuery struct { - Terms []string `json:"terms"` - Field string `json:"field,omitempty"` - BoostVal *Boost `json:"boost,omitempty"` - Fuzziness int `json:"fuzziness"` + Terms []string `json:"terms"` + FieldVal string `json:"field,omitempty"` + BoostVal *Boost `json:"boost,omitempty"` + Fuzziness int `json:"fuzziness"` + SynonymSource string `json:"synonym_source,omitempty"` } // NewPhraseQuery creates a new Query for finding @@ -40,8 +41,8 @@ type PhraseQuery struct { // IncludeTermVectors set to true. func NewPhraseQuery(terms []string, field string) *PhraseQuery { return &PhraseQuery{ - Terms: terms, - Field: field, + Terms: terms, + FieldVal: field, } } @@ -50,6 +51,14 @@ func (q *PhraseQuery) SetBoost(b float64) { q.BoostVal = &boost } +func (q *PhraseQuery) SetField(f string) { + q.FieldVal = f +} + +func (q *PhraseQuery) Field() string { + return q.FieldVal +} + func (q *PhraseQuery) SetFuzziness(f int) { q.Fuzziness = f } @@ -59,7 +68,12 @@ func (q *PhraseQuery) Boost() float64 { } func (q *PhraseQuery) Searcher(ctx context.Context, i index.IndexReader, m mapping.IndexMapping, options search.SearcherOptions) (search.Searcher, error) { - return searcher.NewPhraseSearcher(ctx, i, q.Terms, q.Fuzziness, q.Field, q.BoostVal.Value(), options) + field := q.FieldVal + if q.FieldVal == "" { + field = m.DefaultSearchField() + } + + return searcher.NewPhraseSearcher(ctx, i, q.Terms, q.Fuzziness, field, q.BoostVal.Value(), options) } func (q *PhraseQuery) Validate() error { @@ -77,7 +91,7 @@ func (q *PhraseQuery) UnmarshalJSON(data []byte) error { return err } q.Terms = tmp.Terms - q.Field = tmp.Field + q.FieldVal = tmp.FieldVal q.BoostVal = tmp.BoostVal q.Fuzziness = tmp.Fuzziness return nil diff --git a/search/query/query.go b/search/query/query.go index eb7b34adb..9add56b1f 100644 --- a/search/query/query.go +++ b/search/query/query.go @@ -58,6 +58,11 @@ type FieldableQuery interface { Field() string } +type SynonymSearchEnabledQuery interface { + Query + SynonymSourceName(mapping.IndexMapping) []string +} + // A ValidatableQuery represents a Query which can be validated // prior to execution. 
diff --git a/search/query/phrase.go b/search/query/phrase.go
index 9092e72d0..7130fdc92 100644
--- a/search/query/phrase.go
+++ b/search/query/phrase.go
@@ -26,10 +26,11 @@ import (
 )
 
 type PhraseQuery struct {
-	Terms     []string `json:"terms"`
-	Field     string   `json:"field,omitempty"`
-	BoostVal  *Boost   `json:"boost,omitempty"`
-	Fuzziness int      `json:"fuzziness"`
+	Terms         []string `json:"terms"`
+	FieldVal      string   `json:"field,omitempty"`
+	BoostVal      *Boost   `json:"boost,omitempty"`
+	Fuzziness     int      `json:"fuzziness"`
+	SynonymSource string   `json:"synonym_source,omitempty"`
 }
 
 // NewPhraseQuery creates a new Query for finding
@@ -40,8 +41,8 @@ type PhraseQuery struct {
 // IncludeTermVectors set to true.
 func NewPhraseQuery(terms []string, field string) *PhraseQuery {
 	return &PhraseQuery{
-		Terms: terms,
-		Field: field,
+		Terms:    terms,
+		FieldVal: field,
 	}
 }
 
@@ -50,6 +51,14 @@ func (q *PhraseQuery) SetBoost(b float64) {
 	q.BoostVal = &boost
 }
 
+func (q *PhraseQuery) SetField(f string) {
+	q.FieldVal = f
+}
+
+func (q *PhraseQuery) Field() string {
+	return q.FieldVal
+}
+
 func (q *PhraseQuery) SetFuzziness(f int) {
 	q.Fuzziness = f
 }
@@ -59,7 +68,12 @@ func (q *PhraseQuery) Boost() float64 {
 }
 
 func (q *PhraseQuery) Searcher(ctx context.Context, i index.IndexReader, m mapping.IndexMapping, options search.SearcherOptions) (search.Searcher, error) {
-	return searcher.NewPhraseSearcher(ctx, i, q.Terms, q.Fuzziness, q.Field, q.BoostVal.Value(), options)
+	field := q.FieldVal
+	if q.FieldVal == "" {
+		field = m.DefaultSearchField()
+	}
+
+	return searcher.NewPhraseSearcher(ctx, i, q.Terms, q.Fuzziness, field, q.BoostVal.Value(), options)
 }
 
 func (q *PhraseQuery) Validate() error {
@@ -77,7 +91,7 @@ func (q *PhraseQuery) UnmarshalJSON(data []byte) error {
 		return err
 	}
 	q.Terms = tmp.Terms
-	q.Field = tmp.Field
+	q.FieldVal = tmp.FieldVal
 	q.BoostVal = tmp.BoostVal
 	q.Fuzziness = tmp.Fuzziness
 	return nil
diff --git a/search/query/query.go b/search/query/query.go
index eb7b34adb..9add56b1f 100644
--- a/search/query/query.go
+++ b/search/query/query.go
@@ -58,6 +58,11 @@ type FieldableQuery interface {
 	Field() string
 }
 
+type SynonymSearchEnabledQuery interface {
+	Query
+	SynonymSourceName(mapping.IndexMapping) []string
+}
+
 // A ValidatableQuery represents a Query which can be validated
 // prior to execution.
 type ValidatableQuery interface {
diff --git a/search/query/query_string.go b/search/query/query_string.go
index 42bb598bb..d58a01635 100644
--- a/search/query/query_string.go
+++ b/search/query/query_string.go
@@ -67,3 +67,14 @@ func (q *QueryStringQuery) Validate() error {
 	}
 	return nil
 }
+
+func (q *QueryStringQuery) SynonymSourceName(m mapping.IndexMapping) []string {
+	newQuery, err := parseQuerySyntax(q.Query)
+	if err != nil {
+		return nil
+	}
+	if newQuery, ok := newQuery.(SynonymSearchEnabledQuery); ok {
+		return newQuery.SynonymSourceName(m)
+	}
+	return nil
+}
diff --git a/search/query/term.go b/search/query/term.go
index 5c6af3962..2941a1987 100644
--- a/search/query/term.go
+++ b/search/query/term.go
@@ -16,7 +16,9 @@ package query
 
 import (
 	"context"
+	"fmt"
 
+	"github.com/blevesearch/bleve/v2/analysis/token/synonym"
 	"github.com/blevesearch/bleve/v2/mapping"
 	"github.com/blevesearch/bleve/v2/search"
 	"github.com/blevesearch/bleve/v2/search/searcher"
@@ -24,9 +26,10 @@ import (
 )
 
 type TermQuery struct {
-	Term     string `json:"term"`
-	FieldVal string `json:"field,omitempty"`
-	BoostVal *Boost `json:"boost,omitempty"`
+	Term          string `json:"term"`
+	FieldVal      string `json:"field,omitempty"`
+	BoostVal      *Boost `json:"boost,omitempty"`
+	SynonymSource string `json:"synonym_source,omitempty"`
 }
 
 // NewTermQuery creates a new Query for finding an
@@ -54,10 +57,49 @@ func (q *TermQuery) Field() string {
 	return q.FieldVal
 }
 
+func (q *TermQuery) SynonymSourceName(m mapping.IndexMapping) []string {
+	synonymSourceName := ""
+	if q.SynonymSource != "" {
+		synonymSourceName = q.SynonymSource
+	} else {
+		field := q.FieldVal
+		if q.FieldVal == "" {
+			field = m.DefaultSearchField()
+		}
+		synonymSourceName = m.SynonymSourceNameForPath(field)
+	}
+	if synonymSourceName != "" {
+		return []string{synonymSourceName}
+	}
+	return []string{}
+}
+
 func (q *TermQuery) Searcher(ctx context.Context, i index.IndexReader, m mapping.IndexMapping, options search.SearcherOptions) (search.Searcher, error) {
 	field := q.FieldVal
 	if q.FieldVal == "" {
 		field = m.DefaultSearchField()
 	}
+
+	synonymSourceNames := q.SynonymSourceName(m)
+	if len(synonymSourceNames) > 0 {
+		// Switch to Synonym Search
+		synonymSource := m.SynonymSourceNamed(synonymSourceNames[0])
+		if synonymSource == nil {
+			return nil, fmt.Errorf("no synonym source named '%s' registered", synonymSourceNames[0])
+		}
+		// there is no base analyzer in term queries, so the created analyzer will only be a
+		// synonym filter
+		analyzer, err := synonym.AddSynonymFilter(ctx, nil, i, synonymSource.MetadataKey(), synonymSourceNames[0], 0, 0)
+		if err != nil {
+			return nil, err
+		}
+
+		tokens := analyzer.Analyze([]byte(q.Term))
+		if len(tokens) > 0 {
+			return searcher.NewSynonymSearcher(ctx, i, tokens, field, q.BoostVal.Value(), 0, 0, 0, options)
+		}
+		noneQuery := NewMatchNoneQuery()
+		return noneQuery.Searcher(ctx, i, m, options)
+	}
 	return searcher.NewTermSearcher(ctx, i, q.Term, field, q.BoostVal.Value(), options)
 }
diff --git a/search/scorer/scorer_disjunction.go b/search/scorer/scorer_disjunction.go
index 054e76fd4..f908e9e58 100644
--- a/search/scorer/scorer_disjunction.go
+++ b/search/scorer/scorer_disjunction.go
@@ -15,6 +15,7 @@
 package scorer
 
 import (
+	"context"
 	"fmt"
 	"reflect"
 
@@ -30,20 +31,33 @@ func init() {
 }
 
 type DisjunctionQueryScorer struct {
-	options search.SearcherOptions
+	overrideScorer search.SynonymScorerCallbackFn
+	options        search.SearcherOptions
 }
 
 func (s *DisjunctionQueryScorer) Size() int {
 	return reflectStaticSizeDisjunctionQueryScorer + size.SizeOfPtr
 }
 
-func NewDisjunctionQueryScorer(options search.SearcherOptions) *DisjunctionQueryScorer {
+func NewDisjunctionQueryScorer(ctx context.Context, options search.SearcherOptions) *DisjunctionQueryScorer {
+	if overridingScorer := ctx.Value(search.SynonymScorerKey); overridingScorer != nil {
+		if scorerF, ok := overridingScorer.(search.SynonymScorerCallbackFn); ok {
+			return &DisjunctionQueryScorer{
+				overrideScorer: scorerF,
+				options:        options,
+			}
+		}
+	}
 	return &DisjunctionQueryScorer{
 		options: options,
 	}
 }
 
 func (s *DisjunctionQueryScorer) Score(ctx *search.SearchContext, constituents []*search.DocumentMatch, countMatch, countTotal int) *search.DocumentMatch {
+	if s.overrideScorer != nil {
+		return s.overrideScorer(ctx, constituents, s.options)
+	}
+
 	var sum float64
 	var childrenExplanations []*search.Explanation
 	if s.options.Explain {
diff --git a/search/searcher/search_disjunction_heap.go b/search/searcher/search_disjunction_heap.go
index d36e30131..09463246c 100644
--- a/search/searcher/search_disjunction_heap.go
+++ b/search/searcher/search_disjunction_heap.go
@@ -73,7 +73,7 @@ func newDisjunctionHeapSearcher(ctx context.Context, indexReader index.IndexReader,
 		indexReader:   indexReader,
 		searchers:     searchers,
 		numSearchers:  len(searchers),
-		scorer:        scorer.NewDisjunctionQueryScorer(options),
+		scorer:        scorer.NewDisjunctionQueryScorer(ctx, options),
 		min:           int(min),
 		matching:      make([]*search.DocumentMatch, len(searchers)),
 		matchingCurrs: make([]*SearcherCurr, len(searchers)),
diff --git a/search/searcher/search_disjunction_slice.go b/search/searcher/search_disjunction_slice.go
index 0969c8cf3..03da6ddb7 100644
--- a/search/searcher/search_disjunction_slice.go
+++ b/search/searcher/search_disjunction_slice.go
@@ -67,7 +67,7 @@ func newDisjunctionSliceSearcher(ctx context.Context, indexReader index.IndexReader,
 		searchers:    searchers,
 		numSearchers: len(searchers),
 		currs:        make([]*search.DocumentMatch, len(searchers)),
-		scorer:       scorer.NewDisjunctionQueryScorer(options),
+		scorer:       scorer.NewDisjunctionQueryScorer(ctx, options),
 		min:          int(min),
 		matching:     make([]*search.DocumentMatch, len(searchers)),
 		matchingIdxs: make([]int, len(searchers)),
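The scorer override travels on the context rather than the searcher API, so only disjunctions built under a context carrying SynonymScorerKey are affected. A minimal sketch of wiring up a callback (a toy scorer that just sums constituent scores, standing in for the SynonymScorer defined below):

```go
package main

import (
	"context"
	"fmt"

	"github.com/blevesearch/bleve/v2/search"
)

func main() {
	// Any disjunction scorer constructed from this context will use the
	// callback instead of the default scoring.
	cb := search.SynonymScorerCallbackFn(func(sc *search.SearchContext,
		constituents []*search.DocumentMatch,
		options search.SearcherOptions) *search.DocumentMatch {
		rv := constituents[0]
		for _, dm := range constituents[1:] {
			rv.Score += dm.Score // plain sum across synonym constituents
		}
		return rv
	})
	ctx := context.WithValue(context.Background(), search.SynonymScorerKey, cb)
	fmt.Println(ctx.Value(search.SynonymScorerKey) != nil) // true
}
```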
diff --git a/search/searcher/search_synonym.go b/search/searcher/search_synonym.go
new file mode 100644
index 000000000..30183cad0
--- /dev/null
+++ b/search/searcher/search_synonym.go
@@ -0,0 +1,171 @@
+// Copyright (c) 2023 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//	http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package searcher
+
+import (
+	"context"
+
+	"github.com/blevesearch/bleve/v2/analysis"
+	"github.com/blevesearch/bleve/v2/analysis/token/synonym"
+	"github.com/blevesearch/bleve/v2/search"
+	index "github.com/blevesearch/bleve_index_api"
+)
+
+// closeSearchers closes all the searchers passed to it, and is intended
+// to be deferred: it only acts when the pointed-to error is non-nil,
+// i.e. when an error occurred while building the searchers.
+func closeSearchers(err *error, searchers map[int][]search.Searcher) {
+	if *err != nil {
+		for _, sa := range searchers {
+			for _, s := range sa {
+				s.Close()
+			}
+		}
+	}
+}
+
+// tokenizeSynonym splits a synonym phrase on the separating character
+// (space) and returns the individual words; for example, "a b c" yields
+// ["a", "b", "c"]. Two consecutive separators produce an empty string
+// between them, like strings.Split.
+func tokenizeSynonym(phrase string) []string {
+	var rv []string
+	var tmp []rune
+	for _, character := range phrase {
+		if character == synonym.SeparatingCharacter {
+			rv = append(rv, string(tmp))
+			tmp = tmp[:0]
+		} else {
+			tmp = append(tmp, character)
+		}
+	}
+	rv = append(rv, string(tmp))
+	return rv
+}
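+
+// For illustration:
+//
+//	tokenizeSynonym("united states") -> ["united", "states"]
+//	tokenizeSynonym("us  of a")      -> ["us", "", "of", "a"]
+//
+// The empty entry keeps subsequent words at their original positions,
+// which matters when the result is handed to a phrase searcher below.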
+
+func tokenKey(token *analysis.Token) struct {
+	Term     string
+	Position int
+} {
+	return struct {
+		Term     string
+		Position int
+	}{string(token.Term), token.Position}
+}
+
+func SynonymScorer(ctx *search.SearchContext, constituents []*search.DocumentMatch, options search.SearcherOptions) *search.DocumentMatch {
+	var sum float64
+	var childrenExplanations []*search.Explanation
+	if options.Explain {
+		childrenExplanations = make([]*search.Explanation, len(constituents))
+	}
+
+	for i, docMatch := range constituents {
+		sum += docMatch.Score
+		if options.Explain {
+			childrenExplanations[i] = docMatch.Expl
+		}
+	}
+
+	var expl *search.Explanation
+	if options.Explain {
+		expl = &search.Explanation{Value: sum, Message: "sum of synonyms:", Children: childrenExplanations}
+	}
+
+	// reuse constituents[0] as the return value
+	rv := constituents[0]
+	rv.Score = sum
+	rv.Expl = expl
+	rv.FieldTermLocations = search.MergeFieldTermLocations(
+		rv.FieldTermLocations, constituents[1:])
+	return rv
+}
+
+func NewSynonymSearcher(ctx context.Context, indexReader index.IndexReader,
+	tokens analysis.TokenStream, field string, boost float64, fuzziness int,
+	prefix int, operator int, options search.SearcherOptions) (search.Searcher, error) {
+
+	uniqueTokens := make(map[struct {
+		Term     string
+		Position int
+	}]analysis.TokenType)
+	for _, token := range tokens {
+		key := tokenKey(token)
+		if _, exists := uniqueTokens[key]; !exists {
+			uniqueTokens[key] = token.Type
+		}
+	}
+	var err error
+	searchersAtPosition := make(map[int][]search.Searcher)
+	defer closeSearchers(&err, searchersAtPosition)
+
+	for tok, typ := range uniqueTokens {
+		var searcher search.Searcher
+		if typ == analysis.Synonym {
+			words := tokenizeSynonym(tok.Term)
+			if len(words) > 1 {
+				searcher, err = NewPhraseSearcher(ctx, indexReader, words, fuzziness, field, boost, options)
+				if err != nil {
+					return nil, err
+				}
+				searchersAtPosition[tok.Position] = append(searchersAtPosition[tok.Position], searcher)
+				continue
+			}
+		}
+		if fuzziness > 0 {
+			searcher, err = NewFuzzySearcher(ctx, indexReader, tok.Term, prefix, fuzziness, field, boost, options)
+			if err != nil {
+				return nil, err
+			}
+		} else {
+			searcher, err = NewTermSearcher(ctx, indexReader, tok.Term, field, boost, options)
+			if err != nil {
+				return nil, err
+			}
+		}
+		searchersAtPosition[tok.Position] = append(searchersAtPosition[tok.Position], searcher)
+	}
+
+	searchers := make([]search.Searcher, len(searchersAtPosition))
+	idx := 0
+	overriddenScorerCtx := context.WithValue(ctx, search.SynonymScorerKey, search.SynonymScorerCallbackFn(SynonymScorer))
+	for _, group := range searchersAtPosition {
+		var s search.Searcher
+		if len(group) == 1 {
+			s = group[0]
+		} else if len(group) > 1 {
+			s, err = NewDisjunctionSearcher(overriddenScorerCtx, indexReader, group, 1, options)
+			if err != nil {
+				return nil, err
+			}
+		}
+		searchers[idx] = s
+		idx++
+	}
+	var rv search.Searcher
+	switch operator {
+	case 0:
+		rv, err = NewDisjunctionSearcher(ctx, indexReader, searchers, 1, options)
+	case 1:
+		rv, err = NewConjunctionSearcher(ctx, indexReader, searchers, options)
+	}
+	if err != nil {
+		return nil, err
+	}
+	return rv, nil
+}
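Grouping by position is what keeps a term and its synonyms interchangeable at a single slot, while distinct slots combine under the requested operator (0 = disjunction, 1 = conjunction). A standalone sketch of the grouping step NewSynonymSearcher performs, over a hypothetical token stream:

```go
package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis"
)

func main() {
	// Hypothetical output of the synonym filter: "usa" and its synonym
	// phrase share position 1; "uk" sits alone at position 2.
	tokens := analysis.TokenStream{
		&analysis.Token{Term: []byte("usa"), Position: 1, Type: analysis.AlphaNumeric},
		&analysis.Token{Term: []byte("united states"), Position: 1, Type: analysis.Synonym},
		&analysis.Token{Term: []byte("uk"), Position: 2, Type: analysis.AlphaNumeric},
	}

	// One disjunction per position; the operator then combines positions.
	byPosition := make(map[int][]string)
	for _, tok := range tokens {
		byPosition[tok.Position] = append(byPosition[tok.Position], string(tok.Term))
	}
	fmt.Println(byPosition) // map[1:[usa united states] 2:[uk]] (ordering may vary)
}
```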
diff --git a/search/util.go b/search/util.go
index b2cb62a2d..e02feb878 100644
--- a/search/util.go
+++ b/search/util.go
@@ -106,6 +106,9 @@ const (
 const SearchIncrementalCostKey = "_search_incremental_cost_key"
 const QueryTypeKey = "_query_type_key"
 const FuzzyMatchPhraseKey = "_fuzzy_match_phrase_key"
+const SynonymScorerKey = "_synonym_scorer_key"
+
+type SynonymScorerCallbackFn func(*SearchContext, []*DocumentMatch, SearcherOptions) *DocumentMatch
 
 func RecordSearchCost(ctx context.Context, msg SearchIncrementalCostCallbackMsg, bytes uint64) {
diff --git a/search_test.go b/search_test.go
index 37da8da0a..351de4772 100644
--- a/search_test.go
+++ b/search_test.go
@@ -37,6 +37,7 @@ import (
 	"github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/milliseconds"
 	"github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/nanoseconds"
 	"github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/seconds"
+	"github.com/blevesearch/bleve/v2/analysis/lang/en"
 	"github.com/blevesearch/bleve/v2/analysis/token/length"
 	"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
 	"github.com/blevesearch/bleve/v2/analysis/token/shingle"
@@ -3376,3 +3377,438 @@ func TestPercentAndIsoStyleDates(t *testing.T) {
 		}
 	}
 }
+
+func TestSynonymSearchTermQuery(t *testing.T) {
+	imap := mapping.NewIndexMapping()
+	fruitSynonymConfig := map[string]interface{}{
+		"type":       "textsynonym",
+		"collection": "fruits",
+		"analyzer":   standard.Name,
+	}
+
+	err := imap.AddSynonymSource("fruitSyn", fruitSynonymConfig)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	fruitSynonymEnConfig := map[string]interface{}{
+		"type":       "textsynonym",
+		"collection": "fruits",
+		"analyzer":   en.AnalyzerName,
+	}
+
+	err = imap.AddSynonymSource("fruitSynEn", fruitSynonymEnConfig)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	fruitField := mapping.NewTextFieldMapping()
+	fruitField.Analyzer = standard.Name
+	fruitField.SynonymSource = "fruitSyn"
+
+	imap.DefaultMapping.AddFieldMappingsAt("fruit", fruitField)
+
+	tmpIndexPath := createTmpIndexPath(t)
+	defer cleanupTmpIndexPath(t, tmpIndexPath)
+
+	idx, err := New(tmpIndexPath, imap)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer func() {
+		err = idx.Close()
+		if err != nil {
+			t.Fatal(err)
+		}
+	}()
+
+	documents := map[string]map[string]interface{}{
+		"doc0": {
+			"fruit": "i had an apple for breakfast, a banana for lunch and a guava for dinner",
+		},
+		"doc1": {
+			"fruit": "the banana is a fruit that is yellow; a guava is a fruit that is green",
+		},
+		"doc2": {
+			"fruit": "guava is harvested in the winter, while mango is harvested in the summer",
+		},
+		"doc3": {
+			"fruit": "the orange is watermelon's best friend",
+		},
+		"doc4": {
+			"fruit": "muskmelon and watermelon are both melons whereas mango and orange are not",
+		},
+		"doc9": {
+			"fruit": "apple orange banana mango guava",
+		},
+	}
+
+	synonymDocs := map[string]map[string]interface{}{
+		"doc5": {
+			"mappingType": "equivalent",
+			"synonyms":    []string{"apple", "banana", "guava", "mango", "orange"},
+		},
+		"doc6": {
+			"mappingType": "explicit",
+			"input":       []string{"fruit"},
+			"synonyms":    []string{"apple", "banana", "guava", "mango", "orange", "muskmelon", "watermelon"},
+		},
+		"doc7": {
+			"mappingType": "equivalent",
+			"synonyms":    []string{"apple", "guava", "mango", "orange"},
+		},
+	}
+	batch := idx.NewBatch()
+	for docID, doc := range documents {
+		err := batch.Index(docID, doc)
+		if err != nil {
+			t.Fatal(err)
+		}
+	}
+	for docID, doc := range synonymDocs {
+		marshalledJson, err := json.Marshal(doc)
+		if err != nil {
+			t.Fatal(err)
+		}
+		err = batch.IndexSynonym("fruits", docID, marshalledJson)
+		if err != nil {
+			t.Fatal(err)
+		}
+	}
+	err = idx.Batch(batch)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	qw := NewMatchQuery("apple")
+	qw.SetField("fruit")
+	sr := NewSearchRequest(qw)
+	sr.Highlight = NewHighlightWithStyle(ansi.Name)
+	sr.Explain = true
+	res, err := idx.Search(sr)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	synonymDocs = map[string]map[string]interface{}{
+		"doc6": {
+			"mappingType": "equivalent",
+			"synonyms":    []string{"watermelon", "muskmelon", "mango", "orange"},
+		},
+	}
+	batch = idx.NewBatch()
+	batch.Delete("doc5")
+	for docID, doc := range synonymDocs {
+		marshalledJson, err := json.Marshal(doc)
+		if err != nil {
+			t.Fatal(err)
+		}
+		err = batch.IndexSynonym("fruits", docID, marshalledJson)
+		if err != nil {
+			t.Fatal(err)
+		}
+	}
+	err = idx.Batch(batch)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	q := NewMatchQuery("fruit")
+	q.SetField("fruit")
+	sr = NewSearchRequest(q)
+	sr.Highlight = NewHighlightWithStyle(ansi.Name)
+	res, err = idx.Search(sr)
+	if err != nil {
+		t.Fatal(err)
+	}
+	t.Logf("%v", res)
+
+	q = NewMatchQuery("mango")
+	q.SetField("fruit")
+	sr = NewSearchRequest(q)
+	sr.Highlight = NewHighlightWithStyle(ansi.Name)
+	res, err = idx.Search(sr)
+	if err != nil {
+		t.Fatal(err)
+	}
+	t.Logf("%v", res)
+
+	batch = idx.NewBatch()
+	batch.Delete("doc5")
+	batch.Delete("doc6")
+	batch.Delete("doc7")
+
+	err = idx.Batch(batch)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	q = NewMatchQuery("mango")
+	q.SetField("fruit")
+	sr = NewSearchRequest(q)
+	sr.Highlight = NewHighlightWithStyle(ansi.Name)
+	res, err = idx.Search(sr)
+	if err != nil {
+		t.Fatal(err)
+	}
+	t.Logf("%v", res)
+}
+
+func TestSynonymSearchAcronymQuery(t *testing.T) {
+	imap := mapping.NewIndexMapping()
+	acronymSynonymConfig := map[string]interface{}{
+		"type":       "textsynonym",
+		"collection": "acronyms",
+		"analyzer":   en.AnalyzerName,
+	}
+
+	err := imap.AddSynonymSource("acronymSource", acronymSynonymConfig)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	descField := mapping.NewTextFieldMapping()
+	descField.Analyzer = en.AnalyzerName
+	descField.SynonymSource = "acronymSource"
+
+	imap.DefaultMapping.AddFieldMappingsAt("desc", descField)
+
+	tmpIndexPath := createTmpIndexPath(t)
+	defer cleanupTmpIndexPath(t, tmpIndexPath)
+
+	idx, err := New(tmpIndexPath, imap)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer func() {
+		err = idx.Close()
+		if err != nil {
+			t.Fatal(err)
+		}
+	}()
+
+	documents := map[string]map[string]interface{}{
+		"doc0": {
+			"desc": "the usa is also called United states of america",
+		},
+		"doc1": {
+			"desc": "united india",
+		},
+		"doc2": {
+			"desc": "united kingdom is also called UK",
+		},
+		"doc3": {
+			"desc": "the united nations is also called UN",
+		},
+		"doc4": {
+			"desc": "united states is present in north america",
+		},
+		"doc9": {
+			"desc": "usa uk un un un",
+		},
+	}
+
+	synonymDocs := map[string]map[string]interface{}{
+		"doc5": {
+			"mappingType": "equivalent",
+			"synonyms":    []string{"united states of america", "usa", "united states", "us of a", "america"},
+		},
+		"doc7": {
+			"mappingType": "equivalent",
+			"synonyms":    []string{"uk", "united kingdom", "great britain", "britain"},
+		},
+		"doc6": {
+			"mappingType": "equivalent",
+			"synonyms":    []string{"un", "united nations", "united nations organization"},
+		},
+		"doc10": {
+			"mappingType": "equivalent",
+			"synonyms":    []string{"united states of brazil", "brazil"},
+		},
+	}
+	batch := idx.NewBatch()
+	for docID, doc := range documents {
+		err := batch.Index(docID, doc)
+		if err != nil {
+			t.Fatal(err)
+		}
+	}
+	for docID, doc := range synonymDocs {
+		marshalledJson, err := json.Marshal(doc)
+		if err != nil {
+			t.Fatal(err)
+		}
+		err = batch.IndexSynonym("acronyms", docID, marshalledJson)
+		if err != nil {
+			t.Fatal(err)
+		}
+	}
+	err = idx.Batch(batch)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	q := NewMatchQuery("i live in the unitd sttes of aerca, my brother lives in united states of brazil and my friend lives in united kingdom")
+	q.SetField("desc")
+	q.SetFuzziness(2)
+	sr := NewSearchRequest(q)
+	sr.Explain = true
+	sr.Highlight = NewHighlightWithStyle(ansi.Name)
+	res, err := idx.Search(sr)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(res.Hits) == 0 {
+		t.Fatal("expected at least one hit")
+	}
+	t.Logf("%v", res.Hits[0].Expl)
+}
+
+func TestSynonymSearchAliasQuery(t *testing.T) {
+	imap := mapping.NewIndexMapping()
+	acronymSynonymConfig := map[string]interface{}{
+		"type":       "textsynonym",
+		"collection": "acronyms",
+		"analyzer":   en.AnalyzerName,
+	}
+
+	err := imap.AddSynonymSource("acronymSource", acronymSynonymConfig)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	descField := mapping.NewTextFieldMapping()
+	descField.Analyzer = en.AnalyzerName
+	descField.SynonymSource = "acronymSource"
+
+	imap.DefaultMapping.AddFieldMappingsAt("desc", descField)
+
+	tmpIndexPath := createTmpIndexPath(t)
+	defer cleanupTmpIndexPath(t, tmpIndexPath)
+
+	idx1, err := New(tmpIndexPath, imap)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer func() {
+		err = idx1.Close()
+		if err != nil {
+			t.Fatal(err)
+		}
+	}()
+
+	documents := map[string]map[string]interface{}{
+		"doc0": {
+			"desc": "the usa is also called United states of america",
+		},
+		"doc1": {
+			"desc": "united japan",
+		},
+	}
+
+	synonymDocs := map[string]map[string]interface{}{
+		"doc5": {
+			"mappingType": "equivalent",
+			"synonyms":    []string{"usa", "united states", "united states of america", "us of a", "america"},
+		},
+		"doc7": {
+			"mappingType": "equivalent",
+			"synonyms":    []string{"uk", "united kingdom", "great britain", "britain"},
+		},
+	}
+	batch := idx1.NewBatch()
+	for docID, doc := range documents {
+		err := batch.Index(docID, doc)
+		if err != nil {
+			t.Fatal(err)
+		}
+	}
+	for docID, doc := range synonymDocs {
+		marshalledJson, err := json.Marshal(doc)
+		if err != nil {
+			t.Fatal(err)
+		}
+		err = batch.IndexSynonym("acronyms", docID, marshalledJson)
+		if err != nil {
+			t.Fatal(err)
+		}
+	}
+	err = idx1.Batch(batch)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	tmpIndexPath = createTmpIndexPath(t)
+	defer cleanupTmpIndexPath(t, tmpIndexPath)
+
+	idx2, err := New(tmpIndexPath, imap)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer func() {
+		err = idx2.Close()
+		if err != nil {
+			t.Fatal(err)
+		}
+	}()
+
+	documents = map[string]map[string]interface{}{
+		"doc2": {
+			"desc": "united kingdom is also called UK",
+		},
+		"doc3": {
+			"desc": "the united nations is also called UN",
+		},
+		"doc4": {
+			"desc": "united states is present in north america",
+		},
+		"doc9": {
+			"desc": "usa uk un un un",
+		},
+	}
+
+	synonymDocs = map[string]map[string]interface{}{
+		"doc6": {
+			"mappingType": "equivalent",
+			"synonyms":    []string{"un", "united nations", "united nations organization"},
+		},
+		"doc10": {
+			"mappingType": "equivalent",
+			"synonyms":    []string{"united states of brazil", "brazil"},
+		},
+	}
+	batch = idx2.NewBatch()
+	for docID, doc := range documents {
+		err := batch.Index(docID, doc)
+		if err != nil {
+			t.Fatal(err)
+		}
+	}
+	for docID, doc := range synonymDocs {
+		marshalledJson, err := json.Marshal(doc)
+		if err != nil {
+			t.Fatal(err)
+		}
+		err = batch.IndexSynonym("acronyms", docID, marshalledJson)
+		if err != nil {
+			t.Fatal(err)
+		}
+	}
+	err = idx2.Batch(batch)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	idx := NewIndexAlias(idx1, idx2)
+
+	q := NewMatchQuery("i live in the unitd sttes of aerca, my brother lives in united states of brazil and my friend lives in united kingdom")
+	q.SetField("desc")
+	q.SetFuzziness(2)
+	sr := NewSearchRequest(q)
+	sr.Highlight = NewHighlightWithStyle(ansi.Name)
+	res, err := idx.Search(sr)
+	if err != nil {
+		t.Fatal(err)
+	}
+	t.Logf("%v", res)
+}