Skip to content

Commit

Permalink
third draft
Browse files Browse the repository at this point in the history
  • Loading branch information
CascadingRadium committed Nov 5, 2024
1 parent af50359 commit a9bc9ae
Show file tree
Hide file tree
Showing 7 changed files with 455 additions and 78 deletions.
146 changes: 146 additions & 0 deletions doc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -176,3 +176,149 @@ func (s *stubField) NumPlainTextBytes() uint64 {
// Compose is a no-op: stubField is a test double and does not build
// composite-field content from the supplied per-field term frequencies.
func (s *stubField) Compose(field string, length int, freq index.TokenFrequencies) {

}

// -----------------------------------------------------------------------------
// stubSynonymDefinition is a minimal index.SynonymDefinition test double:
// it simply stores a term together with its list of synonyms.
type stubSynonymDefinition struct {
	term     string
	synonyms []string
}

// Term returns the left-hand-side term of the definition.
func (d *stubSynonymDefinition) Term() string {
	return d.term
}

// Synonyms returns the synonym list associated with the term.
func (d *stubSynonymDefinition) Synonyms() []string {
	return d.synonyms
}

// newStubSynonymDefinition builds a stub definition from a term and its synonyms.
func newStubSynonymDefinition(term string, synonyms []string) index.SynonymDefinition {
	return &stubSynonymDefinition{term: term, synonyms: synonyms}
}

// -----------------------------------------------------------------------------
// stubSynonymField is a test double for index.SynonymField. Only Name and
// VisitSynonymDefinitions carry real data; every other index.Field method
// returns a zero value.
type stubSynonymField struct {
	name        string
	synonymDefs []index.SynonymDefinition
}

// Name returns the field name.
func (s *stubSynonymField) Name() string {
	return s.name
}

// Value returns nil; the synonym stub carries no stored value.
func (s *stubSynonymField) Value() []byte {
	return nil
}

// ArrayPositions returns nil; the stub is not part of an array.
func (s *stubSynonymField) ArrayPositions() []uint64 {
	return nil
}

// EncodedFieldType returns the zero byte; the stub does not encode a type.
func (s *stubSynonymField) EncodedFieldType() byte {
	return 0
}

// Analyze is a no-op; the stub's definitions are pre-analyzed at construction.
func (s *stubSynonymField) Analyze() {

}

// Options returns zero-valued indexing options.
func (s *stubSynonymField) Options() index.FieldIndexingOptions {
	return 0
}

// AnalyzedLength always reports zero.
func (s *stubSynonymField) AnalyzedLength() int {
	return 0
}

// AnalyzedTokenFrequencies returns nil; the stub tracks no token frequencies.
func (s *stubSynonymField) AnalyzedTokenFrequencies() index.TokenFrequencies {
	return nil
}

// NumPlainTextBytes always reports zero.
func (s *stubSynonymField) NumPlainTextBytes() uint64 {
	return 0
}

// VisitSynonymDefinitions calls visitor once for every synonym definition
// held by the field, in insertion order.
// Receiver renamed from sf to s for consistency with the type's other methods.
func (s *stubSynonymField) VisitSynonymDefinitions(visitor func(index.SynonymDefinition)) {
	for _, def := range s.synonymDefs {
		visitor(def)
	}
}

// analyzeStubTerm mimics a lowercase analyzer: it maps a term to its
// lower-cased form.
func analyzeStubTerm(term string) string {
	return strings.ToLower(term)
}

// newStubSynonymField constructs a stub synonym field whose definitions have
// all been passed through the stub analyzer (lower-casing every term and
// synonym).
// NOTE(review): the analyzer argument is currently unused — the stub always
// lower-cases; confirm whether callers expect it to select a real analyzer.
func newStubSynonymField(name, analyzer string, defs []index.SynonymDefinition) index.SynonymField {
	analyzed := make([]index.SynonymDefinition, 0, len(defs))
	for _, def := range defs {
		syns := def.Synonyms()
		lowered := make([]string, len(syns))
		for i, syn := range syns {
			lowered[i] = analyzeStubTerm(syn)
		}
		analyzed = append(analyzed, newStubSynonymDefinition(analyzeStubTerm(def.Term()), lowered))
	}
	return &stubSynonymField{
		name:        name,
		synonymDefs: analyzed,
	}
}

// -----------------------------------------------------------------------------
// stubSynonymDocument is a minimal index.SynonymDocument test double: an ID
// plus a fixed set of fields, reporting zero for all size-accounting methods.
type stubSynonymDocument struct {
	id     string
	fields []index.Field
}

// ID returns the document identifier.
func (d *stubSynonymDocument) ID() string {
	return d.id
}

// Size always reports zero; the stub does not track memory usage.
func (d *stubSynonymDocument) Size() int {
	return 0
}

// VisitFields invokes visitor on each field of the document, in order.
func (d *stubSynonymDocument) VisitFields(visitor index.FieldVisitor) {
	for _, field := range d.fields {
		visitor(field)
	}
}

// HasComposite reports false: the stub never carries composite fields.
func (d *stubSynonymDocument) HasComposite() bool {
	return false
}

// VisitComposite is a no-op, consistent with HasComposite returning false.
func (d *stubSynonymDocument) VisitComposite(visitor index.CompositeFieldVisitor) {
}

// NumPlainTextBytes always reports zero.
func (d *stubSynonymDocument) NumPlainTextBytes() uint64 {
	return 0
}

// StoredFieldsBytes always reports zero.
func (d *stubSynonymDocument) StoredFieldsBytes() uint64 {
	return 0
}

// AddIDField appends the synthetic "_id" field holding the document ID.
func (d *stubSynonymDocument) AddIDField() {
	d.fields = append(d.fields, newStubFieldSplitString("_id", nil, d.id, true, false, false))
}

// VisitSynonymField invokes visitor on every field that implements
// index.SynonymField, skipping all other field kinds.
func (d *stubSynonymDocument) VisitSynonymField(visitor index.SynonymFieldVisitor) {
	for _, field := range d.fields {
		if synField, ok := field.(index.SynonymField); ok {
			visitor(synField)
		}
	}
}

// newStubSynonymDocument wraps a single synonym field into a stub document.
func newStubSynonymDocument(id string, synonymField index.SynonymField) index.SynonymDocument {
	return &stubSynonymDocument{
		id:     id,
		fields: []index.Field{synonymField},
	}
}

// -----------------------------------------------------------------------------
4 changes: 2 additions & 2 deletions section_faiss_vector_index.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,10 @@ const defaultFaissOMPThreads = 1
func init() {
rand.Seed(time.Now().UTC().UnixNano())
registerSegmentSection(SectionFaissVectorIndex, &faissVectorIndexSection{})
isFieldNotApplicableToInvertedTextSection = func(field index.Field) bool {
invertedIndexExclusionChecks = append(invertedIndexExclusionChecks, func(field index.Field) bool {
_, ok := field.(index.VectorField)
return ok
}
})
faiss.SetOMPThreads(defaultFaissOMPThreads)
}

Expand Down
36 changes: 26 additions & 10 deletions section_inverted_text_index.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,16 +34,27 @@ func init() {
type invertedTextIndexSection struct {
}

// this function is something that tells the inverted index section whether to
// process a particular field or not - since it might be processed by another
// section this function helps in avoiding unnecessary work.
// (only used by faiss vector section currently, will need a separate API for every
// section we introduce in the future or a better way forward - TODO)
var isFieldNotApplicableToInvertedTextSection func(field index.Field) bool
// This function checks whether the inverted index section should process
// a particular field, avoiding unnecessary work if another section will handle it.
// isFieldExcludedFromInvertedIndex reports whether another index section has
// claimed this field, in which case the inverted index section should skip it
// to avoid duplicate processing.
var isFieldExcludedFromInvertedIndex = func(field index.Field) bool {
	for _, isExcluded := range invertedIndexExclusionChecks {
		if isExcluded(field) {
			// At least one section has opted to process this field
			// independently, so it is excluded from inverted index
			// processing.
			return true
		}
	}
	// No section has claimed the field, so the inverted index
	// section processes it.
	return false
}

// invertedIndexExclusionChecks holds the predicates registered by other
// sections (e.g. the vector index section in its init) to exclude fields from
// the inverted index. A nil slice is the idiomatic empty value: append, len,
// and range all work on it.
var invertedIndexExclusionChecks []func(field index.Field) bool

func (i *invertedTextIndexSection) Process(opaque map[int]resetable, docNum uint32, field index.Field, fieldID uint16) {
if isFieldNotApplicableToInvertedTextSection == nil ||
!isFieldNotApplicableToInvertedTextSection(field) {
if !isFieldExcludedFromInvertedIndex(field) {
invIndexOpaque := i.getInvertedIndexOpaque(opaque)
invIndexOpaque.process(field, fieldID, docNum)
}
Expand Down Expand Up @@ -439,6 +450,13 @@ func (io *invertedIndexOpaque) writeDicts(w *CountHashWriter) (dictOffsets []uin
}

for fieldID, terms := range io.DictKeys {
dict := io.Dicts[fieldID]
// dict is nil if the field is excluded from inverted index
// processing, so skip it
if len(dict) == 0 {
continue
}

if cap(docTermMap) < len(io.results) {
docTermMap = make([][]byte, len(io.results))
} else {
Expand All @@ -448,8 +466,6 @@ func (io *invertedIndexOpaque) writeDicts(w *CountHashWriter) (dictOffsets []uin
}
}

dict := io.Dicts[fieldID]

for _, term := range terms { // terms are already sorted
pid := dict[term] - 1

Expand Down
Loading

0 comments on commit a9bc9ae

Please sign in to comment.