diff --git a/document/document.go b/document/document.go
index 54fd6d442..0f9591c85 100644
--- a/document/document.go
+++ b/document/document.go
@@ -48,6 +48,15 @@ func NewDocument(id string) *Document {
 	}
 }
 
+// NewSynonymDocument returns a Document with the given id and an empty,
+// non-nil Fields slice, to which synonym fields can then be added.
+func NewSynonymDocument(id string) *Document {
+	return &Document{
+		id:     id,
+		Fields: make([]Field, 0),
+	}
+}
+
 func (d *Document) Size() int {
 	sizeInBytes := reflectStaticSizeDocument + size.SizeOfPtr +
 		len(d.id)
@@ -133,3 +142,13 @@ func (d *Document) VisitComposite(visitor index.CompositeFieldVisitor) {
 func (d *Document) HasComposite() bool {
 	return len(d.CompositeFields) > 0
 }
+
+// VisitSynonymFields calls visitor for every field of the document that
+// implements index.SynonymField; all other fields are skipped.
+func (d *Document) VisitSynonymFields(visitor index.SynonymFieldVisitor) {
+	for _, f := range d.Fields {
+		if sf, ok := f.(index.SynonymField); ok {
+			visitor(sf)
+		}
+	}
+}
diff --git a/document/field_synonym.go b/document/field_synonym.go
new file mode 100644
index 000000000..0e4812690
--- /dev/null
+++ b/document/field_synonym.go
@@ -0,0 +1,194 @@
+// Copyright (c) 2024 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package document
+
+import (
+	"reflect"
+
+	"github.com/blevesearch/bleve/v2/analysis"
+	"github.com/blevesearch/bleve/v2/size"
+	index "github.com/blevesearch/bleve_index_api"
+)
+
+// reflectStaticSizeSynonymField is the static size in bytes of a
+// SynonymField value, computed once in init.
+var reflectStaticSizeSynonymField int
+
+func init() {
+	var f SynonymField
+	reflectStaticSizeSynonymField = int(reflect.TypeOf(f).Size())
+}
+
+// DefaultSynonymIndexingOptions is the set of indexing options given to
+// every field created by NewSynonymField.
+const DefaultSynonymIndexingOptions = index.IndexField
+
+// SynonymField is a document field that carries a synonym definition: an
+// optional list of input terms and a list of synonyms, together with the
+// analyzer applied to them by Analyze.
+type SynonymField struct {
+	name              string
+	analyzer          analysis.Analyzer
+	options           index.FieldIndexingOptions
+	input             []string
+	synonyms          []string
+	numPlainTextBytes uint64
+
+	// populated during analysis
+	synonymMap map[string][]string
+}
+
+// Size returns the static size of the struct plus the length of the field
+// name; the input, synonym and map contents are not included.
+func (s *SynonymField) Size() int {
+	return reflectStaticSizeSynonymField + size.SizeOfPtr +
+		len(s.name)
+}
+
+// Name returns the name of the field.
+func (s *SynonymField) Name() string {
+	return s.name
+}
+
+// ArrayPositions always returns nil for a synonym field.
+func (s *SynonymField) ArrayPositions() []uint64 {
+	return nil
+}
+
+// Options returns the indexing options of the field.
+func (s *SynonymField) Options() index.FieldIndexingOptions {
+	return s.options
+}
+
+// NumPlainTextBytes returns the stored plain-text byte count, which
+// NewSynonymField leaves at zero.
+func (s *SynonymField) NumPlainTextBytes() uint64 {
+	return s.numPlainTextBytes
+}
+
+// AnalyzedLength always returns 0 for a synonym field.
+func (s *SynonymField) AnalyzedLength() int {
+	return 0
+}
+
+// EncodedFieldType returns 'y', the type byte used for synonym fields.
+func (s *SynonymField) EncodedFieldType() byte {
+	return 'y'
+}
+
+// AnalyzedTokenFrequencies always returns nil for a synonym field.
+func (s *SynonymField) AnalyzedTokenFrequencies() index.TokenFrequencies {
+	return nil
+}
+
+// Analyze runs every input term and every synonym through the field's
+// analyzer and builds the term-to-synonyms map read by IterateSynonyms.
+func (s *SynonymField) Analyze() {
+	var analyzedInput []string
+	if len(s.input) > 0 {
+		analyzedInput = make([]string, 0, len(s.input))
+		for _, term := range s.input {
+			analyzedInput = append(analyzedInput, analyzeSynonymTerm(term, s.analyzer))
+		}
+	}
+	analyzedSynonyms := make([]string, 0, len(s.synonyms))
+	for _, syn := range s.synonyms {
+		analyzedSynonyms = append(analyzedSynonyms, analyzeSynonymTerm(syn, s.analyzer))
+	}
+	s.synonymMap = processSynonymData(analyzedInput, analyzedSynonyms)
+}
+
+// Value always returns nil for a synonym field.
+func (s *SynonymField) Value() []byte {
+	return nil
+}
+
+// IterateSynonyms calls visitor once per term of the map built by Analyze,
+// passing the term and its synonyms. The iteration order is unspecified and
+// nothing is visited before Analyze has run.
+func (s *SynonymField) IterateSynonyms(visitor func(term string, synonyms []string)) {
+	for term, synonyms := range s.synonymMap {
+		visitor(term, synonyms)
+	}
+}
+
+// NewSynonymField returns a synonym field with the given name, analyzer,
+// input terms and synonyms, using DefaultSynonymIndexingOptions. The input
+// and synonyms slices are stored as-is, not copied.
+func NewSynonymField(name string, analyzer analysis.Analyzer, input []string, synonyms []string) *SynonymField {
+	return &SynonymField{
+		name:     name,
+		analyzer: analyzer,
+		options:  DefaultSynonymIndexingOptions,
+		input:    input,
+		synonyms: synonyms,
+	}
+}
+
+// processSynonymData builds the term-to-synonyms map from already analyzed
+// input terms and synonyms.
+//
+// With a non-empty input the mapping is unidirectional: every input term
+// maps to the synonyms. Otherwise it is bidirectional: every synonym maps to
+// all the other synonyms.
+//
+// Analysis can collapse distinct raw terms into the same token (for example
+// "Foo" and "foo" under a lower-casing analyzer), so the synonyms are
+// de-duplicated first and a term is never listed as its own synonym.
+func processSynonymData(input []string, synonyms []string) map[string][]string {
+	// De-duplicate the synonyms, preserving first-seen order.
+	seen := make(map[string]struct{}, len(synonyms))
+	unique := make([]string, 0, len(synonyms))
+	for _, syn := range synonyms {
+		if _, dup := seen[syn]; dup {
+			continue
+		}
+		seen[syn] = struct{}{}
+		unique = append(unique, syn)
+	}
+
+	// The map keys are the input terms when given, else the synonyms.
+	terms := input
+	if len(terms) == 0 {
+		terms = unique
+	}
+
+	synonymMap := make(map[string][]string, len(terms))
+	for _, term := range terms {
+		if _, done := synonymMap[term]; done {
+			continue // repeated input term, already mapped
+		}
+		// Every term gets its own slice so entries never share storage.
+		others := make([]string, 0, len(unique))
+		for _, syn := range unique {
+			if syn != term {
+				others = append(others, syn)
+			}
+		}
+		synonymMap[term] = others
+	}
+	return synonymMap
+}
+
+// analyzeSynonymTerm returns the first token the analyzer produces for term,
+// or term unchanged when the analyzer produces no tokens. Any further tokens
+// are dropped.
+func analyzeSynonymTerm(term string, analyzer analysis.Analyzer) string {
+	tokenStream := analyzer.Analyze([]byte(term))
+	if len(tokenStream) == 0 {
+		return term
+	}
+	return string(tokenStream[0].Term)
+}
diff --git a/error.go b/error.go
index 2d2751cd4..b57a61543 100644
--- a/error.go
+++ b/error.go
@@ -27,6 +27,7 @@ const (
 	ErrorEmptyID
 	ErrorIndexReadInconsistency
 	ErrorTwoPhaseSearchInconsistency
+	ErrorSynonymSearchNotSupported
 )
 
 // Error represents a more strongly typed bleve error for detecting
@@ -49,4 +50,5 @@ var errorMessages = map[Error]string{
 	ErrorEmptyID:                     "document ID cannot be empty",
 	ErrorIndexReadInconsistency:      "index read inconsistency detected",
 	ErrorTwoPhaseSearchInconsistency: "2-phase search failed, likely due to an overlapping topology change",
+	ErrorSynonymSearchNotSupported:   "synonym search not supported",
 }
diff --git a/index.go b/index.go
index acbefc695..d98f28558 100644
--- a/index.go
+++ b/index.go
@@ -16,6 +16,7 @@ package bleve
 
 import (
 	"context"
+	"fmt"
 
 	"github.com/blevesearch/bleve/v2/index/upsidedown"
 
@@ -63,6 +64,43 @@ func (b *Batch) Index(id string, data interface{}) error {
 	return nil
 }
 
+// IndexSynonym validates the synonym definition, maps it to a synonym
+// document for the given collection and adds that document to the batch.
+//
+// It returns ErrorEmptyID for an empty id, ErrorSynonymSearchNotSupported
+// when the index mapping does not implement mapping.SynonymMapping, and
+// otherwise any error from validating or mapping the definition.
+func (b *Batch) IndexSynonym(id string, collection string, definition *SynonymDefinition) error {
+	if id == "" {
+		return ErrorEmptyID
+	}
+	if eventIndex, ok := b.index.(index.EventIndex); ok {
+		eventIndex.FireIndexEvent()
+	}
+	synMap, ok := b.index.Mapping().(mapping.SynonymMapping)
+	if !ok {
+		return ErrorSynonymSearchNotSupported
+	}
+
+	// Validate also rejects a nil definition, so the field reads below are safe.
+	if err := definition.Validate(); err != nil {
+		return err
+	}
+
+	doc := document.NewSynonymDocument(id)
+	err := synMap.MapSynonymDocument(doc, collection, definition.Input, definition.Synonyms)
+	if err != nil {
+		return err
+	}
+	b.internal.Update(doc)
+
+	b.lastDocSize = uint64(doc.Size() +
+		len(id) + size.SizeOfString) // overhead from internal
+	b.totalSize += b.lastDocSize
+
+	return nil
+}
+
 func (b *Batch) LastDocSize() uint64 {
 	return b.lastDocSize
 }
@@ -323,3 +361,32 @@ type IndexCopyable interface {
 // FileSystemDirectory is the default implementation for the
 // index.Directory interface.
 type FileSystemDirectory string
+
+// SynonymDefinition represents a synonym mapping in Bleve.
+// Each instance associates an optional list of input terms with a list of
+// synonyms, defining how terms are treated as equivalent in searches.
+type SynonymDefinition struct {
+	// Input is an optional list of terms for unidirectional synonym mapping.
+	// When terms are specified in Input, they will map to the terms in Synonyms,
+	// making the relationship unidirectional (each Input maps to all Synonyms).
+	// If Input is omitted, the relationship is bidirectional among all Synonyms.
+	Input []string `json:"input"`
+
+	// Synonyms is a list of terms that are considered equivalent.
+	// If Input is specified, each term in Input will map to each term in Synonyms.
+	// If Input is not specified, the Synonyms list will be treated bidirectionally,
+	// meaning each term in Synonyms is treated as synonymous with all others.
+	Synonyms []string `json:"synonyms"`
+}
+
+// Validate reports an error when the definition is nil or has no synonyms.
+// An empty Input is valid. It is safe to call on a nil receiver.
+func (sd *SynonymDefinition) Validate() error {
+	if sd == nil {
+		return fmt.Errorf("synonym definition must not be nil")
+	}
+	if len(sd.Synonyms) == 0 {
+		return fmt.Errorf("synonym definition must have at least one synonym")
+	}
+	return nil
+}
diff --git a/mapping/index.go b/mapping/index.go
index 94b2cdfa7..d6d355b11 100644
--- a/mapping/index.go
+++ b/mapping/index.go
@@ -354,6 +354,30 @@ func (im *IndexMappingImpl) MapDocument(doc *document.Document, data interface{}
 	return nil
 }
 
+// MapSynonymDocument adds one synonym field to doc for every synonym source
+// of the mapping whose collection equals the given collection. Each field is
+// named after its source and uses the analyzer that source names.
+//
+// It returns an error when a matching source names an unknown analyzer, in
+// which case doc may already hold fields for other sources. When no source
+// matches, doc is left unchanged and nil is returned.
+func (im *IndexMappingImpl) MapSynonymDocument(doc *document.Document, collection string, input []string, synonyms []string) error {
+	// determine all the synonym sources with the given collection
+	// and create a synonym field for each
+	for name, synSource := range im.SynonymSources {
+		if synSource.Collection() == collection {
+			// create a new field with the name of the synonym source
+			analyzer := im.AnalyzerNamed(synSource.Analyzer())
+			if analyzer == nil {
+				return fmt.Errorf("unknown analyzer named: %s", synSource.Analyzer())
+			}
+			field := document.NewSynonymField(name, analyzer, input, synonyms)
+			doc.AddField(field)
+		}
+	}
+	return nil
+}
+
 type walkContext struct {
 	doc             *document.Document
 	im              *IndexMappingImpl
diff --git a/mapping/mapping.go b/mapping/mapping.go
index cbfc98faa..6100d6d09 100644
--- a/mapping/mapping.go
+++ b/mapping/mapping.go
@@ -58,3 +58,13 @@ type IndexMapping interface {
 
 	FieldMappingForPath(path string) FieldMapping
 }
+
+// SynonymMapping is an IndexMapping that can also map synonym definitions
+// into documents.
+type SynonymMapping interface {
+	IndexMapping
+
+	// MapSynonymDocument populates doc from the given input terms and
+	// synonyms for the named collection.
+	MapSynonymDocument(doc *document.Document, collection string, input []string, synonyms []string) error
+}