Skip to content

Commit

Permalink
fourth draft
Browse files Browse the repository at this point in the history
  • Loading branch information
CascadingRadium committed Nov 5, 2024
1 parent a9bc9ae commit 1bc2c97
Show file tree
Hide file tree
Showing 3 changed files with 160 additions and 90 deletions.
36 changes: 19 additions & 17 deletions new.go
Original file line number Diff line number Diff line change
Expand Up @@ -174,23 +174,6 @@ func (s *interim) convert() (uint64, uint64, error) {
s.FieldsMap = map[string]uint16{}
}

args := map[string]interface{}{
"results": s.results,
"chunkMode": s.chunkMode,
}
if s.opaque == nil {
s.opaque = map[int]resetable{}
for i, x := range segmentSections {
s.opaque[int(i)] = x.InitOpaque(args)
}
} else {
for k, v := range args {
for _, op := range s.opaque {
op.Set(k, v)
}
}
}

s.getOrDefineField("_id") // _id field is fieldID 0

for _, result := range s.results {
Expand All @@ -208,6 +191,25 @@ func (s *interim) convert() (uint64, uint64, error) {
s.FieldsMap[fieldName] = uint16(fieldID + 1)
}

args := map[string]interface{}{
"results": s.results,
"chunkMode": s.chunkMode,
"fieldsMap": s.FieldsMap,
"fieldsInv": s.FieldsInv,
}
if s.opaque == nil {
s.opaque = map[int]resetable{}
for i, x := range segmentSections {
s.opaque[int(i)] = x.InitOpaque(args)
}
} else {
for k, v := range args {
for _, op := range s.opaque {
op.Set(k, v)
}
}
}

s.processDocuments()

storedIndexOffset, err := s.writeStoredFields()
Expand Down
26 changes: 15 additions & 11 deletions section_synonym.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,13 @@ type synonymIndexOpaque struct {
// indicates whether the following structs are initialized
init bool

// FieldsMap maps field name to field id and must be set in
// the index opaque using the key "fieldsMap"
// used for ensuring accurate mapping between fieldID and
// thesaurusID
// name -> field id
FieldsMap map[string]uint16

// ThesaurusMap adds 1 to thesaurus id to avoid zero value issues
// name -> thesaurus id + 1
ThesaurusMap map[string]uint16
Expand Down Expand Up @@ -89,6 +96,8 @@ func (so *synonymIndexOpaque) Set(key string, value interface{}) {
switch key {
case "results":
so.results = value.([]index.Document)
case "fieldsMap":
so.FieldsMap = value.(map[string]uint16)
}
}

Expand Down Expand Up @@ -124,7 +133,7 @@ func (so *synonymIndexOpaque) Reset() (err error) {

func (so *synonymIndexOpaque) process(field index.SynonymField, fieldID uint16, docNum uint32) {
if !so.init && so.results != nil {
so.realloc(fieldID)
so.realloc()
so.init = true
}

Expand All @@ -146,7 +155,7 @@ func (so *synonymIndexOpaque) process(field index.SynonymField, fieldID uint16,
})
}

func (so *synonymIndexOpaque) realloc(fieldID uint16) {
func (so *synonymIndexOpaque) realloc() {
var pidNext int
var sidNext uint32
so.ThesaurusMap = map[string]uint16{}
Expand All @@ -155,16 +164,17 @@ func (so *synonymIndexOpaque) realloc(fieldID uint16) {
for _, result := range so.results {
if synDoc, ok := result.(index.SynonymDocument); ok {
synDoc.VisitSynonymField(func(synField index.SynonymField) {
so.getOrDefineThesaurus(fieldID, synField.Name())
fieldIDPlus1 := so.FieldsMap[synField.Name()]
so.getOrDefineThesaurus(fieldIDPlus1-1, synField.Name())
})
}
}

for _, result := range so.results {
if synDoc, ok := result.(index.SynonymDocument); ok {
synDoc.VisitSynonymField(func(synField index.SynonymField) {

thesaurusID := uint16(so.getOrDefineThesaurus(fieldID, synField.Name()))
fieldIDPlus1 := so.FieldsMap[synField.Name()]
thesaurusID := uint16(so.getOrDefineThesaurus(fieldIDPlus1-1, synField.Name()))

thesaurus := so.Thesauri[thesaurusID]
thesaurusKeys := so.ThesaurusKeys[thesaurusID]
Expand Down Expand Up @@ -378,12 +388,6 @@ func (s *synonymIndexSection) Process(opaque map[int]resetable, docNum uint32, f
return
}
if sf, ok := field.(index.SynonymField); ok {
// at this point we have a synonym document being processed
// and this document is expected to have a single field
// which is a synonym field.
// we consider the
// fieldName as the thesaurusName and
// fieldID as the thesaurusID.
so := s.getSynonymIndexOpaque(opaque)
so.process(sf, fieldID, docNum)
}
Expand Down
188 changes: 126 additions & 62 deletions thesaurus_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (

"errors"

"github.com/RoaringBitmap/roaring"
index "github.com/blevesearch/bleve_index_api"
segment "github.com/blevesearch/scorch_segment_api/v2"
)
Expand Down Expand Up @@ -63,8 +64,8 @@ func buildTestSegmentForThesaurus(results []index.Document) (*SegmentBase, error
return seg.(*SegmentBase), err
}

func extractSynonymsForTermFromThesaurus(thes segment.Thesaurus, term string) ([]string, error) {
list, err := thes.SynonymsList([]byte(term), nil, nil)
func extractSynonymsForTermFromThesaurus(thes segment.Thesaurus, term string, except *roaring.Bitmap) ([]string, error) {
list, err := thes.SynonymsList([]byte(term), except, nil)
if err != nil {
return nil, err
}
Expand All @@ -89,7 +90,7 @@ func extractSynonymsForTermFromThesaurus(thes segment.Thesaurus, term string) ([
return synonyms, nil
}

func testSegmentSynonymAccuracy(collectionName string, testSynonymMap map[string][]string, seg segment.Segment) error {
func checkWithDeletes(except *roaring.Bitmap, collectionName string, testSynonymMap map[string][]string, seg segment.Segment) error {
dict, err := seg.Dictionary(collectionName)
if err != nil {
return err
Expand All @@ -115,7 +116,7 @@ func testSegmentSynonymAccuracy(collectionName string, testSynonymMap map[string
return errors.New("expected a thesaurus")
}
for term, expectedSynonyms := range testSynonymMap {
synonyms, err := extractSynonymsForTermFromThesaurus(thes, term)
synonyms, err := extractSynonymsForTermFromThesaurus(thes, term, except)
if err != nil {
return err
}
Expand All @@ -133,6 +134,28 @@ func testSegmentSynonymAccuracy(collectionName string, testSynonymMap map[string
return nil
}

func testSegmentSynonymAccuracy(collSynMap map[string][]testSynonymDefinition, seg segment.Segment) error {
for collectionName, testSynonymMap := range collSynMap {
expectedSynonymMap := createExpectedSynonymMap(testSynonymMap)
err := checkWithDeletes(nil, collectionName, expectedSynonymMap, seg)
if err != nil {
return err
}
for i := 0; i < len(testSynonymMap); i++ {
except := roaring.New()
except.Add(uint32(i))
modifiedSynonymMap := append([]testSynonymDefinition{}, testSynonymMap[:i]...)
modifiedSynonymMap = append(modifiedSynonymMap, testSynonymMap[i+1:]...)
expectedSynonymMap = createExpectedSynonymMap(modifiedSynonymMap)
err = checkWithDeletes(except, collectionName, expectedSynonymMap, seg)
if err != nil {
return err
}
}
}
return nil
}

type testSynonymDefinition struct {
terms []string
synonyms []string
Expand All @@ -152,78 +175,119 @@ func createExpectedSynonymMap(input []testSynonymDefinition) map[string][]string
return rv
}

func TestThesaurusSingleSegment(t *testing.T) {
err := os.RemoveAll("/tmp/scorch.zap")
func buildSegment(testSynonymDefinitions map[string][]testSynonymDefinition) (segment.Segment, error) {
tmpDir, err := os.MkdirTemp("", "zap-")
if err != nil {
t.Fatalf("error removing directory: %v", err)
}
collectionName := "coll1"
testSynonymDefinitions := []testSynonymDefinition{
{
terms: nil,
synonyms: []string{
"adeptness",
"aptitude",
"facility",
"faculty",
"capacity",
"power",
"knack",
"proficiency",
"ability",
},
},
{
terms: []string{"afflict"},
synonyms: []string{
"affect",
"bother",
"distress",
"oppress",
"trouble",
"torment",
},
},
{
terms: []string{"capacity"},
synonyms: []string{
"volume",
"content",
"size",
"dimensions",
"measure",
},
},
return nil, err
}

err = os.RemoveAll(tmpDir)
if err != nil {
return nil, err
}
var testSynonymDocuments []index.Document
for i, testSynonymDefinition := range testSynonymDefinitions {
testSynonymDocuments = append(testSynonymDocuments, buildTestSynonymDocument(
strconv.Itoa(i),
collectionName,
testSynonymDefinition.terms,
testSynonymDefinition.synonyms,
))
for collName, synDefs := range testSynonymDefinitions {
for i, testSynonymDefinition := range synDefs {
testSynonymDocuments = append(testSynonymDocuments, buildTestSynonymDocument(
strconv.Itoa(i),
collName,
testSynonymDefinition.terms,
testSynonymDefinition.synonyms,
))
}
}
sb, err := buildTestSegmentForThesaurus(testSynonymDocuments)
if err != nil {
t.Fatalf("error building test seg: %v", err)
return nil, err
}
err = PersistSegmentBase(sb, tmpDir)
if err != nil {
return nil, err
}
err = PersistSegmentBase(sb, "/tmp/scorch.zap")
seg, err := zapPlugin.Open(tmpDir)
if err != nil {
t.Fatalf("error persisting seg: %v", err)
return nil, err
}
seg, err := zapPlugin.Open("/tmp/scorch.zap")
err = testSegmentSynonymAccuracy(testSynonymDefinitions, seg)
if err != nil {
t.Fatalf("error opening seg: %v", err)
return nil, err
}
return seg, nil
}

func TestSingleSegmentThesaurus(t *testing.T) {
firstCollectionName := "coll0"
secondCollectionName := "coll1"
testSynonymDefinitions := map[string][]testSynonymDefinition{
firstCollectionName: {
{
terms: nil,
synonyms: []string{
"adeptness",
"aptitude",
"facility",
"faculty",
"capacity",
"power",
"knack",
"proficiency",
"ability",
},
},
{
terms: []string{"afflict"},
synonyms: []string{
"affect",
"bother",
"distress",
"oppress",
"trouble",
"torment",
},
},
{
terms: []string{"capacity"},
synonyms: []string{
"volume",
"content",
"size",
"dimensions",
"measure",
},
},
},
secondCollectionName: {
{
synonyms: []string{
"absolutely",
"unqualifiedly",
"unconditionally",
"unreservedly",
"unexceptionally",
"unequivocally",
},
},
{
terms: []string{"abrupt"},
synonyms: []string{
"sudden",
"hasty",
"quick",
"precipitate",
"snappy",
},
},
},
}

seg1, err := buildSegment(testSynonymDefinitions)
if err != nil {
t.Fatalf("error building segment: %v", err)
}
defer func() {
cerr := seg.Close()
cerr := seg1.Close()
if cerr != nil {
t.Fatalf("error closing seg: %v", err)
}
}()
err = testSegmentSynonymAccuracy(collectionName, createExpectedSynonymMap(testSynonymDefinitions), seg)
if err != nil {
t.Fatalf("error testing segment: %v", err)
}
}

0 comments on commit 1bc2c97

Please sign in to comment.