Skip to content

Commit

Permalink
third draft
Browse files Browse the repository at this point in the history
  • Loading branch information
CascadingRadium committed Nov 5, 2024
1 parent af50359 commit a9bc9ae
Show file tree
Hide file tree
Showing 7 changed files with 455 additions and 78 deletions.
146 changes: 146 additions & 0 deletions doc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -176,3 +176,149 @@ func (s *stubField) NumPlainTextBytes() uint64 {
// Compose is a no-op: stubField is a test double and does not build
// composite-field content from the supplied per-field term frequencies.
func (s *stubField) Compose(field string, length int, freq index.TokenFrequencies) {

}

// -----------------------------------------------------------------------------
// stubSynonymDefinition is a minimal index.SynonymDefinition test double:
// it simply stores a term together with its list of synonyms.
type stubSynonymDefinition struct {
	term     string
	synonyms []string
}

// Term returns the left-hand-side term of the definition.
func (d *stubSynonymDefinition) Term() string {
	return d.term
}

// Synonyms returns the synonym list associated with the term.
func (d *stubSynonymDefinition) Synonyms() []string {
	return d.synonyms
}

// newStubSynonymDefinition builds a stub definition from a term and its synonyms.
func newStubSynonymDefinition(term string, synonyms []string) index.SynonymDefinition {
	return &stubSynonymDefinition{term: term, synonyms: synonyms}
}

// -----------------------------------------------------------------------------
// stubSynonymField is a test double for index.SynonymField. Only Name and
// VisitSynonymDefinitions carry real data; every other index.Field method
// returns a zero value.
type stubSynonymField struct {
	name        string
	synonymDefs []index.SynonymDefinition
}

// Name returns the field name.
func (s *stubSynonymField) Name() string {
	return s.name
}

// Value returns nil; the synonym stub carries no stored value.
func (s *stubSynonymField) Value() []byte {
	return nil
}

// ArrayPositions returns nil; the stub is not part of an array.
func (s *stubSynonymField) ArrayPositions() []uint64 {
	return nil
}

// EncodedFieldType returns the zero byte; the stub does not encode a type.
func (s *stubSynonymField) EncodedFieldType() byte {
	return 0
}

// Analyze is a no-op; the stub's definitions are pre-analyzed at construction.
func (s *stubSynonymField) Analyze() {

}

// Options returns zero-valued indexing options.
func (s *stubSynonymField) Options() index.FieldIndexingOptions {
	return 0
}

// AnalyzedLength always reports zero.
func (s *stubSynonymField) AnalyzedLength() int {
	return 0
}

// AnalyzedTokenFrequencies returns nil; the stub tracks no token frequencies.
func (s *stubSynonymField) AnalyzedTokenFrequencies() index.TokenFrequencies {
	return nil
}

// NumPlainTextBytes always reports zero.
func (s *stubSynonymField) NumPlainTextBytes() uint64 {
	return 0
}

// VisitSynonymDefinitions calls visitor once for every synonym definition
// held by the field, in insertion order.
// Receiver renamed from sf to s for consistency with the type's other methods.
func (s *stubSynonymField) VisitSynonymDefinitions(visitor func(index.SynonymDefinition)) {
	for _, def := range s.synonymDefs {
		visitor(def)
	}
}

// analyzeStubTerm mimics a lowercase analyzer: it maps a term to its
// lower-cased form.
func analyzeStubTerm(term string) string {
	return strings.ToLower(term)
}

// newStubSynonymField constructs a stub synonym field whose definitions have
// all been passed through the stub analyzer (lower-casing every term and
// synonym).
// NOTE(review): the analyzer argument is currently unused — the stub always
// lower-cases; confirm whether callers expect it to select a real analyzer.
func newStubSynonymField(name, analyzer string, defs []index.SynonymDefinition) index.SynonymField {
	analyzed := make([]index.SynonymDefinition, 0, len(defs))
	for _, def := range defs {
		syns := def.Synonyms()
		lowered := make([]string, len(syns))
		for i, syn := range syns {
			lowered[i] = analyzeStubTerm(syn)
		}
		analyzed = append(analyzed, newStubSynonymDefinition(analyzeStubTerm(def.Term()), lowered))
	}
	return &stubSynonymField{
		name:        name,
		synonymDefs: analyzed,
	}
}

// -----------------------------------------------------------------------------
// stubSynonymDocument is a minimal index.SynonymDocument test double: an ID
// plus a fixed set of fields, reporting zero for all size-accounting methods.
type stubSynonymDocument struct {
	id     string
	fields []index.Field
}

// ID returns the document identifier.
func (d *stubSynonymDocument) ID() string {
	return d.id
}

// Size always reports zero; the stub does not track memory usage.
func (d *stubSynonymDocument) Size() int {
	return 0
}

// VisitFields invokes visitor on each field of the document, in order.
func (d *stubSynonymDocument) VisitFields(visitor index.FieldVisitor) {
	for _, field := range d.fields {
		visitor(field)
	}
}

// HasComposite reports false: the stub never carries composite fields.
func (d *stubSynonymDocument) HasComposite() bool {
	return false
}

// VisitComposite is a no-op, consistent with HasComposite returning false.
func (d *stubSynonymDocument) VisitComposite(visitor index.CompositeFieldVisitor) {
}

// NumPlainTextBytes always reports zero.
func (d *stubSynonymDocument) NumPlainTextBytes() uint64 {
	return 0
}

// StoredFieldsBytes always reports zero.
func (d *stubSynonymDocument) StoredFieldsBytes() uint64 {
	return 0
}

// AddIDField appends the synthetic "_id" field holding the document ID.
func (d *stubSynonymDocument) AddIDField() {
	d.fields = append(d.fields, newStubFieldSplitString("_id", nil, d.id, true, false, false))
}

// VisitSynonymField invokes visitor on every field that implements
// index.SynonymField, skipping all other field kinds.
func (d *stubSynonymDocument) VisitSynonymField(visitor index.SynonymFieldVisitor) {
	for _, field := range d.fields {
		if synField, ok := field.(index.SynonymField); ok {
			visitor(synField)
		}
	}
}

// newStubSynonymDocument wraps a single synonym field into a stub document.
func newStubSynonymDocument(id string, synonymField index.SynonymField) index.SynonymDocument {
	return &stubSynonymDocument{
		id:     id,
		fields: []index.Field{synonymField},
	}
}

// -----------------------------------------------------------------------------
4 changes: 2 additions & 2 deletions section_faiss_vector_index.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,10 @@ const defaultFaissOMPThreads = 1
func init() {
rand.Seed(time.Now().UTC().UnixNano())
registerSegmentSection(SectionFaissVectorIndex, &faissVectorIndexSection{})
isFieldNotApplicableToInvertedTextSection = func(field index.Field) bool {
invertedIndexExclusionChecks = append(invertedIndexExclusionChecks, func(field index.Field) bool {
_, ok := field.(index.VectorField)
return ok
}
})
faiss.SetOMPThreads(defaultFaissOMPThreads)
}

Expand Down
36 changes: 26 additions & 10 deletions section_inverted_text_index.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,16 +34,27 @@ func init() {
type invertedTextIndexSection struct {
}

// this function is something that tells the inverted index section whether to
// process a particular field or not - since it might be processed by another
// section this function helps in avoiding unnecessary work.
// (only used by faiss vector section currently, will need a separate API for every
// section we introduce in the future or a better way forward - TODO)
var isFieldNotApplicableToInvertedTextSection func(field index.Field) bool
// This function checks whether the inverted index section should process
// a particular field, avoiding unnecessary work if another section will handle it.
// isFieldExcludedFromInvertedIndex reports whether another index section has
// claimed this field, in which case the inverted index section should skip it
// to avoid duplicate processing.
var isFieldExcludedFromInvertedIndex = func(field index.Field) bool {
	for _, isExcluded := range invertedIndexExclusionChecks {
		if isExcluded(field) {
			// At least one section has opted to process this field
			// independently, so it is excluded from inverted index
			// processing.
			return true
		}
	}
	// No section has claimed the field, so the inverted index
	// section processes it.
	return false
}

// invertedIndexExclusionChecks holds the predicates registered by other
// sections (e.g. the vector index section in its init) to exclude fields from
// the inverted index. A nil slice is the idiomatic empty value: append, len,
// and range all work on it.
var invertedIndexExclusionChecks []func(field index.Field) bool

func (i *invertedTextIndexSection) Process(opaque map[int]resetable, docNum uint32, field index.Field, fieldID uint16) {
if isFieldNotApplicableToInvertedTextSection == nil ||
!isFieldNotApplicableToInvertedTextSection(field) {
if !isFieldExcludedFromInvertedIndex(field) {
invIndexOpaque := i.getInvertedIndexOpaque(opaque)
invIndexOpaque.process(field, fieldID, docNum)
}
Expand Down Expand Up @@ -439,6 +450,13 @@ func (io *invertedIndexOpaque) writeDicts(w *CountHashWriter) (dictOffsets []uin
}

for fieldID, terms := range io.DictKeys {
dict := io.Dicts[fieldID]
// dict is nil if the field is excluded from inverted index
// processing, so skip it
if len(dict) == 0 {
continue
}

if cap(docTermMap) < len(io.results) {
docTermMap = make([][]byte, len(io.results))
} else {
Expand All @@ -448,8 +466,6 @@ func (io *invertedIndexOpaque) writeDicts(w *CountHashWriter) (dictOffsets []uin
}
}

dict := io.Dicts[fieldID]

for _, term := range terms { // terms are already sorted
pid := dict[term] - 1

Expand Down
Loading

0 comments on commit a9bc9ae

Please sign in to comment.