From 1bc2c974e180b1ace37036a6c6d3af0575cb3975 Mon Sep 17 00:00:00 2001 From: CascadingRadium Date: Tue, 15 Oct 2024 14:39:31 +0530 Subject: [PATCH] fourth draft --- new.go | 36 +++++---- section_synonym.go | 26 ++++--- thesaurus_test.go | 188 ++++++++++++++++++++++++++++++--------------- 3 files changed, 160 insertions(+), 90 deletions(-) diff --git a/new.go b/new.go index f0d37c4..c99b933 100644 --- a/new.go +++ b/new.go @@ -174,23 +174,6 @@ func (s *interim) convert() (uint64, uint64, error) { s.FieldsMap = map[string]uint16{} } - args := map[string]interface{}{ - "results": s.results, - "chunkMode": s.chunkMode, - } - if s.opaque == nil { - s.opaque = map[int]resetable{} - for i, x := range segmentSections { - s.opaque[int(i)] = x.InitOpaque(args) - } - } else { - for k, v := range args { - for _, op := range s.opaque { - op.Set(k, v) - } - } - } - s.getOrDefineField("_id") // _id field is fieldID 0 for _, result := range s.results { @@ -208,6 +191,25 @@ func (s *interim) convert() (uint64, uint64, error) { s.FieldsMap[fieldName] = uint16(fieldID + 1) } + args := map[string]interface{}{ + "results": s.results, + "chunkMode": s.chunkMode, + "fieldsMap": s.FieldsMap, + "fieldsInv": s.FieldsInv, + } + if s.opaque == nil { + s.opaque = map[int]resetable{} + for i, x := range segmentSections { + s.opaque[int(i)] = x.InitOpaque(args) + } + } else { + for k, v := range args { + for _, op := range s.opaque { + op.Set(k, v) + } + } + } + s.processDocuments() storedIndexOffset, err := s.writeStoredFields() diff --git a/section_synonym.go b/section_synonym.go index b460419..35c14d3 100644 --- a/section_synonym.go +++ b/section_synonym.go @@ -45,6 +45,13 @@ type synonymIndexOpaque struct { // indicates whether the following structs are initialized init bool + // FieldsMap maps field name to field id plus 1 and must be set in + // the index opaque using the key "fieldsMap" + // used for ensuring accurate mapping between fieldID and + // thesaurusID + // name -> field id plus 1 + FieldsMap 
map[string]uint16 + // ThesaurusMap adds 1 to thesaurus id to avoid zero value issues // name -> thesaurus id + 1 ThesaurusMap map[string]uint16 @@ -89,6 +96,8 @@ func (so *synonymIndexOpaque) Set(key string, value interface{}) { switch key { case "results": so.results = value.([]index.Document) + case "fieldsMap": + so.FieldsMap = value.(map[string]uint16) } } @@ -124,7 +133,7 @@ func (so *synonymIndexOpaque) Reset() (err error) { func (so *synonymIndexOpaque) process(field index.SynonymField, fieldID uint16, docNum uint32) { if !so.init && so.results != nil { - so.realloc(fieldID) + so.realloc() so.init = true } @@ -146,7 +155,7 @@ func (so *synonymIndexOpaque) process(field index.SynonymField, fieldID uint16, }) } -func (so *synonymIndexOpaque) realloc(fieldID uint16) { +func (so *synonymIndexOpaque) realloc() { var pidNext int var sidNext uint32 so.ThesaurusMap = map[string]uint16{} @@ -155,7 +164,8 @@ func (so *synonymIndexOpaque) realloc(fieldID uint16) { for _, result := range so.results { if synDoc, ok := result.(index.SynonymDocument); ok { synDoc.VisitSynonymField(func(synField index.SynonymField) { - so.getOrDefineThesaurus(fieldID, synField.Name()) + fieldIDPlus1 := so.FieldsMap[synField.Name()] + so.getOrDefineThesaurus(fieldIDPlus1-1, synField.Name()) }) } } @@ -163,8 +173,8 @@ func (so *synonymIndexOpaque) realloc(fieldID uint16) { for _, result := range so.results { if synDoc, ok := result.(index.SynonymDocument); ok { synDoc.VisitSynonymField(func(synField index.SynonymField) { - - thesaurusID := uint16(so.getOrDefineThesaurus(fieldID, synField.Name())) + fieldIDPlus1 := so.FieldsMap[synField.Name()] + thesaurusID := uint16(so.getOrDefineThesaurus(fieldIDPlus1-1, synField.Name())) thesaurus := so.Thesauri[thesaurusID] thesaurusKeys := so.ThesaurusKeys[thesaurusID] @@ -378,12 +388,6 @@ func (s *synonymIndexSection) Process(opaque map[int]resetable, docNum uint32, f return } if sf, ok := field.(index.SynonymField); ok { - // at this point we have a 
synonym document being processed - // and this document is expected to have a single field - // which is a synonym field. - // we consider the - // fieldName as the thesaurusName and - // fieldID as the thesaurusID. so := s.getSynonymIndexOpaque(opaque) so.process(sf, fieldID, docNum) } diff --git a/thesaurus_test.go b/thesaurus_test.go index 67849ec..79611d2 100644 --- a/thesaurus_test.go +++ b/thesaurus_test.go @@ -22,6 +22,7 @@ import ( "errors" + "github.com/RoaringBitmap/roaring" index "github.com/blevesearch/bleve_index_api" segment "github.com/blevesearch/scorch_segment_api/v2" ) @@ -63,8 +64,8 @@ func buildTestSegmentForThesaurus(results []index.Document) (*SegmentBase, error return seg.(*SegmentBase), err } -func extractSynonymsForTermFromThesaurus(thes segment.Thesaurus, term string) ([]string, error) { - list, err := thes.SynonymsList([]byte(term), nil, nil) +func extractSynonymsForTermFromThesaurus(thes segment.Thesaurus, term string, except *roaring.Bitmap) ([]string, error) { + list, err := thes.SynonymsList([]byte(term), except, nil) if err != nil { return nil, err } @@ -89,7 +90,7 @@ func extractSynonymsForTermFromThesaurus(thes segment.Thesaurus, term string) ([ return synonyms, nil } -func testSegmentSynonymAccuracy(collectionName string, testSynonymMap map[string][]string, seg segment.Segment) error { +func checkWithDeletes(except *roaring.Bitmap, collectionName string, testSynonymMap map[string][]string, seg segment.Segment) error { dict, err := seg.Dictionary(collectionName) if err != nil { return err @@ -115,7 +116,7 @@ func testSegmentSynonymAccuracy(collectionName string, testSynonymMap map[string return errors.New("expected a thesaurus") } for term, expectedSynonyms := range testSynonymMap { - synonyms, err := extractSynonymsForTermFromThesaurus(thes, term) + synonyms, err := extractSynonymsForTermFromThesaurus(thes, term, except) if err != nil { return err } @@ -133,6 +134,28 @@ func testSegmentSynonymAccuracy(collectionName string, 
testSynonymMap map[string return nil } +func testSegmentSynonymAccuracy(collSynMap map[string][]testSynonymDefinition, seg segment.Segment) error { + for collectionName, testSynonymMap := range collSynMap { + expectedSynonymMap := createExpectedSynonymMap(testSynonymMap) + err := checkWithDeletes(nil, collectionName, expectedSynonymMap, seg) + if err != nil { + return err + } + for i := 0; i < len(testSynonymMap); i++ { + except := roaring.New() + except.Add(uint32(i)) + modifiedSynonymMap := append([]testSynonymDefinition{}, testSynonymMap[:i]...) + modifiedSynonymMap = append(modifiedSynonymMap, testSynonymMap[i+1:]...) + expectedSynonymMap = createExpectedSynonymMap(modifiedSynonymMap) + err = checkWithDeletes(except, collectionName, expectedSynonymMap, seg) + if err != nil { + return err + } + } + } + return nil +} + type testSynonymDefinition struct { terms []string synonyms []string @@ -152,78 +175,119 @@ func createExpectedSynonymMap(input []testSynonymDefinition) map[string][]string return rv } -func TestThesaurusSingleSegment(t *testing.T) { - err := os.RemoveAll("/tmp/scorch.zap") +func buildSegment(testSynonymDefinitions map[string][]testSynonymDefinition) (segment.Segment, error) { + tmpDir, err := os.MkdirTemp("", "zap-") if err != nil { - t.Fatalf("error removing directory: %v", err) - } - collectionName := "coll1" - testSynonymDefinitions := []testSynonymDefinition{ - { - terms: nil, - synonyms: []string{ - "adeptness", - "aptitude", - "facility", - "faculty", - "capacity", - "power", - "knack", - "proficiency", - "ability", - }, - }, - { - terms: []string{"afflict"}, - synonyms: []string{ - "affect", - "bother", - "distress", - "oppress", - "trouble", - "torment", - }, - }, - { - terms: []string{"capacity"}, - synonyms: []string{ - "volume", - "content", - "size", - "dimensions", - "measure", - }, - }, + return nil, err + } + + err = os.RemoveAll(tmpDir) + if err != nil { + return nil, err } var testSynonymDocuments []index.Document - for i, 
testSynonymDefinition := range testSynonymDefinitions { - testSynonymDocuments = append(testSynonymDocuments, buildTestSynonymDocument( - strconv.Itoa(i), - collectionName, - testSynonymDefinition.terms, - testSynonymDefinition.synonyms, - )) + for collName, synDefs := range testSynonymDefinitions { + for i, testSynonymDefinition := range synDefs { + testSynonymDocuments = append(testSynonymDocuments, buildTestSynonymDocument( + strconv.Itoa(i), + collName, + testSynonymDefinition.terms, + testSynonymDefinition.synonyms, + )) + } } sb, err := buildTestSegmentForThesaurus(testSynonymDocuments) if err != nil { - t.Fatalf("error building test seg: %v", err) + return nil, err + } + err = PersistSegmentBase(sb, tmpDir) + if err != nil { + return nil, err } - err = PersistSegmentBase(sb, "/tmp/scorch.zap") + seg, err := zapPlugin.Open(tmpDir) if err != nil { - t.Fatalf("error persisting seg: %v", err) + return nil, err } - seg, err := zapPlugin.Open("/tmp/scorch.zap") + err = testSegmentSynonymAccuracy(testSynonymDefinitions, seg) if err != nil { - t.Fatalf("error opening seg: %v", err) + return nil, err + } + return seg, nil +} + +func TestSingleSegmentThesaurus(t *testing.T) { + firstCollectionName := "coll0" + secondCollectionName := "coll1" + testSynonymDefinitions := map[string][]testSynonymDefinition{ + firstCollectionName: { + { + terms: nil, + synonyms: []string{ + "adeptness", + "aptitude", + "facility", + "faculty", + "capacity", + "power", + "knack", + "proficiency", + "ability", + }, + }, + { + terms: []string{"afflict"}, + synonyms: []string{ + "affect", + "bother", + "distress", + "oppress", + "trouble", + "torment", + }, + }, + { + terms: []string{"capacity"}, + synonyms: []string{ + "volume", + "content", + "size", + "dimensions", + "measure", + }, + }, + }, + secondCollectionName: { + { + synonyms: []string{ + "absolutely", + "unqualifiedly", + "unconditionally", + "unreservedly", + "unexceptionally", + "unequivocally", + }, + }, + { + terms: 
[]string{"abrupt"}, + synonyms: []string{ + "sudden", + "hasty", + "quick", + "precipitate", + "snappy", + }, + }, + }, + } + + seg1, err := buildSegment(testSynonymDefinitions) + if err != nil { + t.Fatalf("error building segment: %v", err) } defer func() { - cerr := seg.Close() + cerr := seg1.Close() if cerr != nil { t.Fatalf("error closing seg: %v", err) } }() - err = testSegmentSynonymAccuracy(collectionName, createExpectedSynonymMap(testSynonymDefinitions), seg) - if err != nil { - t.Fatalf("error testing segment: %v", err) - } }