Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Thesaurus API and Synonym Index Handling in Search #268

Merged
merged 30 commits into from
Dec 19, 2024
Merged
Show file tree
Hide file tree
Changes from 28 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
99b59a0
first draft
CascadingRadium Oct 10, 2024
2bde50a
second draft
CascadingRadium Oct 10, 2024
9f7957f
third draft
CascadingRadium Oct 14, 2024
093bee1
fourth draft
CascadingRadium Oct 15, 2024
ba3f3d5
merge
CascadingRadium Oct 15, 2024
5234df2
fix size API impl
CascadingRadium Nov 6, 2024
83f1bcb
test fixes
CascadingRadium Nov 14, 2024
ce33fef
fix tests
CascadingRadium Nov 15, 2024
877152d
refactor tests
CascadingRadium Nov 15, 2024
8bbd323
bug fixes
CascadingRadium Dec 9, 2024
2f8298c
fix interface change bug
CascadingRadium Dec 10, 2024
5e40167
minor cleanup
CascadingRadium Dec 11, 2024
ab86d46
clean
CascadingRadium Dec 11, 2024
508221a
Upgrade vellum, bleve_index_api, scorch_segment_api
abhinavdangeti Dec 11, 2024
7f8890a
Merge remote-tracking branch 'origin/master' into synonyms
abhinavdangeti Dec 11, 2024
042af03
command line tooling
CascadingRadium Dec 12, 2024
801ea51
update zap.md
CascadingRadium Dec 12, 2024
c5634c0
small formatting fixes in zap.md
CascadingRadium Dec 12, 2024
f9896a9
fix cmd
CascadingRadium Dec 12, 2024
1b5ea94
thes -> thesaurus
CascadingRadium Dec 12, 2024
946a369
minor zapx.md changes
CascadingRadium Dec 12, 2024
cdad5b4
address code review
CascadingRadium Dec 13, 2024
3609ffc
fix
CascadingRadium Dec 13, 2024
bde3bf8
review comments
CascadingRadium Dec 13, 2024
5794708
rename file
CascadingRadium Dec 13, 2024
079fe42
add code commentary and refactor
CascadingRadium Dec 13, 2024
ddd8f76
add comment
CascadingRadium Dec 16, 2024
2227d1e
Merge branch 'master' into synonyms
CascadingRadium Dec 18, 2024
0d80cde
review comments
CascadingRadium Dec 19, 2024
c94e916
address comments
CascadingRadium Dec 19, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions build.go
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@ func InitSegmentBase(mem []byte, memCRC uint32, chunkMode uint32, numDocs uint64
docValueOffset: 0, // docValueOffsets identified automatically by the section
fieldFSTs: make(map[uint16]*vellum.FST),
vecIndexCache: newVectorIndexCache(),
synIndexCache: newSynonymIndexCache(),
// the following fields get populated by loadFieldsNew
fieldsMap: make(map[string]uint16),
dictLocs: make([]uint64, 0),
Expand Down
140 changes: 140 additions & 0 deletions cmd/zap/cmd/synonym.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
// Copyright (c) 2024 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package cmd

import (
"bytes"
"encoding/binary"
"fmt"

"github.com/RoaringBitmap/roaring/roaring64"
"github.com/blevesearch/vellum"
"github.com/spf13/cobra"
)

// thesaurusCmd dumps the on-disk representation of a single named thesaurus:
// the raw vellum FST bytes, the termID -> term table, and for every FST entry
// the list of {termID|docNum} pairs decoded from its synonyms bitmap.
//
// args[1] is the thesaurus name. Every read from the segment data is bounds
// checked so that a truncated or corrupt file produces an error instead of a
// slice-out-of-range panic.
var thesaurusCmd = &cobra.Command{
	Use:   "thesaurus [path] [name]",
	Short: "thesaurus prints the thesaurus with the specified name",
	Long:  `The thesaurus command lets you print the thesaurus with the specified name.`,
	RunE: func(cmd *cobra.Command, args []string) error {
		if segment.FieldsIndexOffset() == 0 {
			// this is the case only for older file formats
			return fmt.Errorf("file format not supported")
		}
		if len(args) < 2 {
			return fmt.Errorf("must specify thesaurus name")
		}

		pos, err := segment.ThesaurusAddr(args[1])
		if err != nil {
			return fmt.Errorf("error determining address: %w", err)
		}
		fmt.Printf("thesaurus with name %s starts at %d (%x)\n", args[1], pos, pos)

		data := segment.Data()

		// readUvarint decodes one uvarint at the cursor and advances the
		// cursor past it. binary.Uvarint never reads more than
		// binary.MaxVarintLen64 bytes, so the slice needs no upper bound.
		readUvarint := func(what string) (uint64, error) {
			if pos > uint64(len(data)) {
				return 0, fmt.Errorf("%s: offset %d is beyond the end of the data (%d bytes)", what, pos, len(data))
			}
			v, n := binary.Uvarint(data[pos:])
			if n <= 0 {
				return 0, fmt.Errorf("%s: invalid uvarint at offset %d", what, pos)
			}
			pos += uint64(n)
			return v, nil
		}
		// readBytes returns the next length bytes at the cursor and advances
		// the cursor past them.
		readBytes := func(what string, length uint64) ([]byte, error) {
			if pos > uint64(len(data)) || length > uint64(len(data))-pos {
				return nil, fmt.Errorf("%s: %d bytes at offset %d exceed the data size (%d bytes)", what, length, pos, len(data))
			}
			b := data[pos : pos+length]
			pos += length
			return b, nil
		}

		vellumLen, err := readUvarint("vellum length")
		if err != nil {
			return err
		}
		fmt.Printf("vellum length: %d\n", vellumLen)

		fstBytes, err := readBytes("vellum data", vellumLen)
		if err != nil {
			return err
		}
		fst, err := vellum.Load(fstBytes)
		if err != nil {
			return fmt.Errorf("thesaurus name %s vellum err: %w", args[1], err)
		}
		fmt.Printf("raw vellum data:\n % x\n", fstBytes)

		numSyns, err := readUvarint("number of synonyms")
		if err != nil {
			return err
		}
		if numSyns == 0 {
			return fmt.Errorf("no synonyms found")
		}
		// numSyns comes straight from the file, so it is not used as an
		// allocation hint.
		synTermMap := make(map[uint32][]byte)
		for i := uint64(0); i < numSyns; i++ {
			var synID, termLen uint64
			if synID, err = readUvarint("term ID"); err != nil {
				return err
			}
			if termLen, err = readUvarint("term length"); err != nil {
				return err
			}
			if termLen == 0 {
				return fmt.Errorf("term length is 0")
			}
			var term []byte
			if term, err = readBytes("term", termLen); err != nil {
				return err
			}
			synTermMap[uint32(synID)] = term
		}

		fmt.Printf("termID to term mapping:\n")
		fmt.Printf(" termID\tterm\n")
		for k, v := range synTermMap {
			fmt.Printf(" %d\t%s\n", k, string(v))
		}
		fmt.Printf("thesaurus (term -> [{termID|docNum},...]):\n")
		var totalTerms int
		itr, err := fst.Iterator(nil, nil)
		for err == nil {
			var sl *roaring64.Bitmap
			currTerm, currVal := itr.Current()
			sl, err = readSynonymsList(currVal, data)
			if err != nil {
				return err
			}
			// Entries are written with a separator between them rather than
			// trimming a trailing comma, so an empty list prints as "[]".
			fmt.Printf(" %s -> [", currTerm)
			sitr := sl.Iterator()
			for first := true; sitr.HasNext(); first = false {
				if !first {
					fmt.Print(",")
				}
				tID, docNum := decodeSynonym(sitr.Next())
				fmt.Printf("{%d|%d}", tID, docNum)
			}
			fmt.Print("]\n")
			totalTerms++
			err = itr.Next()
		}
		fmt.Printf("Total terms in thesaurus : %d\n", totalTerms)
		// vellum.ErrIteratorDone is the normal end-of-iteration signal.
		if err != nil && err != vellum.ErrIteratorDone {
			return fmt.Errorf("error iterating thesaurus: %w", err)
		}
		return nil
	},
}

// readSynonymsList decodes the synonyms bitmap stored at postingsOffset in
// data. The on-disk layout read here is a uvarint byte length followed by that
// many bytes of a serialized roaring64 bitmap.
//
// It returns an error, rather than panicking, when the offset, the length
// prefix or the bitmap payload does not fit inside data.
func readSynonymsList(postingsOffset uint64, data []byte) (*roaring64.Bitmap, error) {
	if postingsOffset > uint64(len(data)) {
		return nil, fmt.Errorf("synonyms list offset %d is beyond the end of the data (%d bytes)",
			postingsOffset, len(data))
	}

	// binary.Uvarint reads at most binary.MaxVarintLen64 bytes on its own, so
	// the slice does not need a fixed-width upper bound (which could run past
	// the end of data).
	postingsLen, read := binary.Uvarint(data[postingsOffset:])
	if read <= 0 {
		return nil, fmt.Errorf("invalid synonyms list length at offset %d", postingsOffset)
	}

	start := postingsOffset + uint64(read)
	if postingsLen > uint64(len(data))-start {
		return nil, fmt.Errorf("synonyms list of %d bytes at offset %d exceeds the data size (%d bytes)",
			postingsLen, start, len(data))
	}

	r := roaring64.NewBitmap()
	if _, err := r.ReadFrom(bytes.NewReader(data[start : start+postingsLen])); err != nil {
		return nil, fmt.Errorf("error loading roaring bitmap: %v", err)
	}

	return r, nil
}

// decodeSynonym splits a 64-bit synonym code into its two 32-bit halves:
// the upper half is returned as synonymID and the lower half as docID.
func decodeSynonym(synonymCode uint64) (synonymID uint32, docID uint32) {
	const halfWidth = 32
	synonymID = uint32(synonymCode >> halfWidth)
	docID = uint32(synonymCode & 0xFFFFFFFF)
	return synonymID, docID
}

// init registers thesaurusCmd as a subcommand of RootCmd.
func init() {
RootCmd.AddCommand(thesaurusCmd)
}
154 changes: 154 additions & 0 deletions doc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -176,3 +176,157 @@ func (s *stubField) NumPlainTextBytes() uint64 {
// Compose is a no-op on the stub field; all three arguments are ignored.
func (s *stubField) Compose(field string, length int, freq index.TokenFrequencies) {

}

// -----------------------------------------------------------------------------
// stubSynonymField is a test stub for a synonym field. Analyze turns the raw
// input/synonyms terms into synonymMap, which IterateSynonyms then exposes.
type stubSynonymField struct {
// name is the value returned by Name.
name string
// analyzer is passed to analyzeStubTerm for every term.
analyzer string
// input holds the raw terms that should map to synonyms; it may be empty.
input []string
// synonyms holds the raw synonym terms.
synonyms []string

// synonymMap is the term -> synonyms mapping; it is set by Analyze.
synonymMap map[string][]string
}

// Name returns the field name the stub was created with.
func (s *stubSynonymField) Name() string {
return s.name
}

// Value always returns nil for the stub.
func (s *stubSynonymField) Value() []byte {
return nil
}

// ArrayPositions always returns nil for the stub.
func (s *stubSynonymField) ArrayPositions() []uint64 {
return nil
}

// EncodedFieldType always returns 0 for the stub.
func (s *stubSynonymField) EncodedFieldType() byte {
return 0
}

// Analyze runs every input and synonym term through analyzeStubTerm and
// stores the mapping built by processSynonymData in s.synonymMap.
//
// The analyzed input stays nil when s.input is empty, whereas the analyzed
// synonyms slice is always allocated.
func (s *stubSynonymField) Analyze() {
	analyzeAll := func(terms []string) []string {
		analyzed := make([]string, 0, len(terms))
		for i := range terms {
			analyzed = append(analyzed, analyzeStubTerm(terms[i], s.analyzer))
		}
		return analyzed
	}

	var analyzedInput []string
	if len(s.input) != 0 {
		analyzedInput = analyzeAll(s.input)
	}
	s.synonymMap = processSynonymData(analyzedInput, analyzeAll(s.synonyms))
}

// Options always returns the zero value of index.FieldIndexingOptions.
func (s *stubSynonymField) Options() index.FieldIndexingOptions {
return 0
}

// AnalyzedLength always returns 0 for the stub.
func (s *stubSynonymField) AnalyzedLength() int {
return 0
}

// AnalyzedTokenFrequencies always returns nil for the stub.
func (s *stubSynonymField) AnalyzedTokenFrequencies() index.TokenFrequencies {
return nil
}

// NumPlainTextBytes always returns 0 for the stub.
func (s *stubSynonymField) NumPlainTextBytes() uint64 {
return 0
}

// IterateSynonyms calls visitor once for every term in s.synonymMap, passing
// the term and its synonyms. The map is ranged over directly, so the visiting
// order is not deterministic. Nothing is visited if the map is nil or empty.
func (s *stubSynonymField) IterateSynonyms(visitor func(term string, synonyms []string)) {
	for term, synonyms := range s.synonymMap {
		visitor(term, synonyms)
	}
}

// processSynonymData builds a term -> synonyms mapping.
//
// When input is non-empty, every input term maps to its own private copy of
// synonyms. When input is empty, every synonym maps to all the other entries
// of synonyms (selected by position, not by value). The returned map is never
// nil.
func processSynonymData(input []string, synonyms []string) map[string][]string {
	if len(input) == 0 {
		// Equivalence mode: each synonym points at every other position.
		result := make(map[string][]string, len(synonyms))
		for self := range synonyms {
			others := make([]string, 0, len(synonyms)-1)
			for pos := range synonyms {
				if pos == self {
					continue
				}
				others = append(others, synonyms[pos])
			}
			result[synonyms[self]] = others
		}
		return result
	}

	// Directed mode: each input term gets an independent copy of the list so
	// callers cannot mutate one entry through another.
	result := make(map[string][]string, len(input))
	for idx := range input {
		result[input[idx]] = append([]string(nil), synonyms...)
	}
	return result
}

// analyzeStubTerm returns term converted to lower case. The analyzer argument
// is accepted but not consulted.
func analyzeStubTerm(term string, analyzer string) string {
	return strings.ToLower(term)
}

// newStubSynonymField returns a stubSynonymField populated with the given
// name, analyzer, input terms and synonyms. The synonym map is left unset.
func newStubSynonymField(name string, analyzer string, input []string, synonyms []string) index.SynonymField {
	field := new(stubSynonymField)
	field.name = name
	field.analyzer = analyzer
	field.input = input
	field.synonyms = synonyms
	return field
}

// -----------------------------------------------------------------------------
// stubSynonymDocument is a test stub for a document carrying synonym fields.
type stubSynonymDocument struct {
// id is the value returned by ID.
id string
// fields holds the document's fields, visited by VisitFields and
// VisitSynonymFields.
fields []index.Field
}

// ID returns the document ID the stub was created with.
func (s *stubSynonymDocument) ID() string {
return s.id
}

// Size always returns 0 for the stub.
func (s *stubSynonymDocument) Size() int {
return 0
}

// VisitFields invokes visitor on each of the document's fields, in the order
// they are stored.
func (s *stubSynonymDocument) VisitFields(visitor index.FieldVisitor) {
	// Snapshot the slice header so the walk covers exactly the fields that
	// were present when the call started.
	fields := s.fields
	for i := 0; i < len(fields); i++ {
		visitor(fields[i])
	}
}

// HasComposite always returns false for the stub.
func (s *stubSynonymDocument) HasComposite() bool {
return false
}

// VisitComposite is a no-op; visitor is never called.
func (s *stubSynonymDocument) VisitComposite(visitor index.CompositeFieldVisitor) {
}

// NumPlainTextBytes always returns 0 for the stub.
func (s *stubSynonymDocument) NumPlainTextBytes() uint64 {
return 0
}
// StoredFieldsBytes always returns 0 for the stub.
func (s *stubSynonymDocument) StoredFieldsBytes() uint64 {
return 0
}

// AddIDField appends an "_id" field, built by newStubFieldSplitString from the
// document's id, to the document's fields.
func (s *stubSynonymDocument) AddIDField() {
	idField := newStubFieldSplitString("_id", nil, s.id, true, false, false)
	s.fields = append(s.fields, idField)
}

// VisitSynonymFields invokes visitor on every field of the document that
// implements index.SynonymField; all other fields are skipped.
func (s *stubSynonymDocument) VisitSynonymFields(visitor index.SynonymFieldVisitor) {
	for _, field := range s.fields {
		synField, isSynonym := field.(index.SynonymField)
		if !isSynonym {
			continue
		}
		visitor(synField)
	}
}

// newStubSynonymDocument returns a stubSynonymDocument with the given id whose
// only field is synonymField.
func newStubSynonymDocument(id string, synonymField index.SynonymField) index.SynonymDocument {
	return &stubSynonymDocument{
		id:     id,
		fields: []index.Field{synonymField},
	}
}
4 changes: 3 additions & 1 deletion faiss_vector_posting.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,9 @@ var emptyVecPostingsIterator = &VecPostingsIterator{}
var emptyVecPostingsList = &VecPostingsList{}

func (vpl *VecPostingsList) Iterator(prealloc segment.VecPostingsIterator) segment.VecPostingsIterator {

if vpl.postings == nil {
return emptyVecPostingsIterator
}
// tbd: do we check the cardinality of postings and scores?
var preallocPI *VecPostingsIterator
pi, ok := prealloc.(*VecPostingsIterator)
Expand Down
6 changes: 3 additions & 3 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@ go 1.21

require (
github.com/RoaringBitmap/roaring v1.9.3
github.com/blevesearch/bleve_index_api v1.1.13
github.com/blevesearch/bleve_index_api v1.2.0
github.com/blevesearch/go-faiss v1.0.24
github.com/blevesearch/mmap-go v1.0.4
github.com/blevesearch/scorch_segment_api/v2 v2.2.16
github.com/blevesearch/vellum v1.0.11
github.com/blevesearch/scorch_segment_api/v2 v2.3.0
github.com/blevesearch/vellum v1.1.0
github.com/golang/snappy v0.0.4
github.com/spf13/cobra v1.7.0
)
Expand Down
12 changes: 6 additions & 6 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,16 @@ github.com/RoaringBitmap/roaring v1.9.3 h1:t4EbC5qQwnisr5PrP9nt0IRhRTb9gMUgQF4t4
github.com/RoaringBitmap/roaring v1.9.3/go.mod h1:6AXUsoIEzDTFFQCe1RbGA6uFONMhvejWj5rqITANK90=
github.com/bits-and-blooms/bitset v1.12.0 h1:U/q1fAF7xXRhFCrhROzIfffYnu+dlS38vCZtmFVPHmA=
github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
github.com/blevesearch/bleve_index_api v1.1.13 h1:+nrA6oRJr85aCPyqaeZtsruObwKojutfonHJin/BP48=
github.com/blevesearch/bleve_index_api v1.1.13/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8=
github.com/blevesearch/bleve_index_api v1.2.0 h1:/DXMMWBwx/UmGKM1xDhTwDoJI5yQrG6rqRWPFcOgUVo=
github.com/blevesearch/bleve_index_api v1.2.0/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8=
github.com/blevesearch/go-faiss v1.0.24 h1:K79IvKjoKHdi7FdiXEsAhxpMuns0x4fM0BO93bW5jLI=
github.com/blevesearch/go-faiss v1.0.24/go.mod h1:OMGQwOaRRYxrmeNdMrXJPvVx8gBnvE5RYrr0BahNnkk=
github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCDPWmc=
github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs=
github.com/blevesearch/scorch_segment_api/v2 v2.2.16 h1:uGvKVvG7zvSxCwcm4/ehBa9cCEuZVE+/zvrSl57QUVY=
github.com/blevesearch/scorch_segment_api/v2 v2.2.16/go.mod h1:VF5oHVbIFTu+znY1v30GjSpT5+9YFs9dV2hjvuh34F0=
github.com/blevesearch/vellum v1.0.11 h1:SJI97toEFTtA9WsDZxkyGTaBWFdWl1n2LEDCXLCq/AU=
github.com/blevesearch/vellum v1.0.11/go.mod h1:QgwWryE8ThtNPxtgWJof5ndPfx0/YMBh+W2weHKPw8Y=
github.com/blevesearch/scorch_segment_api/v2 v2.3.0 h1:vxCjbXAkkEBSb4AB3Iqgr/EJcPyYRsiGxpcvsS8E1Dw=
github.com/blevesearch/scorch_segment_api/v2 v2.3.0/go.mod h1:5y+TgXYSx+xJGaCwSlvy9G/UJBIY5wzvIkhvhBm2ATc=
github.com/blevesearch/vellum v1.1.0 h1:CinkGyIsgVlYf8Y2LUQHvdelgXr6PYuvoDIajq6yR9w=
github.com/blevesearch/vellum v1.1.0/go.mod h1:QgwWryE8ThtNPxtgWJof5ndPfx0/YMBh+W2weHKPw8Y=
github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
Expand Down
Loading
Loading