Skip to content

Commit

Permalink
Interfaces for thesaurus datatype (#57)
Browse files Browse the repository at this point in the history
- Add interfaces to abstract the thesaurus and its helper iterator methods.
- Extend the Fuzzy and Regex FieldDict interfaces to also return abstracted
   automatons for calculating Damerau-Levenshtein distance and for regex term
   matching, respectively. Each automaton is bound to the original term/pattern
   from which it was built.
- Add interfaces for special synonym documents and synonym fields. These
  interfaces allow differentiation between synonym documents and regular
  documents during processing in the index.
  • Loading branch information
CascadingRadium authored Dec 11, 2024
1 parent 01e7988 commit bc5aa25
Show file tree
Hide file tree
Showing 2 changed files with 104 additions and 0 deletions.
23 changes: 23 additions & 0 deletions document.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,3 +91,26 @@ type TokenizableSpatialField interface {
// to override the spatial token generations during the analysis phase.
SetSpatialAnalyzerPlugin(SpatialAnalyzerPlugin)
}

// SynonymField represents a field that contains a list of synonyms for a set of terms.
// Each SynonymField is generated from a single synonym definition, and its name corresponds
// to the synonym source to which the synonym definition belongs.
//
// SynonymField embeds Field, so every implementation must also satisfy the
// regular Field interface.
type SynonymField interface {
Field
// IterateSynonyms iterates over the terms held by the field and calls the
// provided visitor function with each term and its corresponding synonyms.
//
// The visitor has no return value, so it cannot ask for the iteration to
// stop early or report an error through this method.
// NOTE(review): whether the synonyms slice may be retained or modified by the
// visitor after it returns is not specified here — confirm with implementations.
IterateSynonyms(visitor func(term string, synonyms []string))
}

// SynonymFieldVisitor is a function type used to visit a SynonymField within a document.
// It is the callback type accepted by SynonymDocument.VisitSynonymFields and
// receives one SynonymField per invocation.
type SynonymFieldVisitor func(SynonymField)

// SynonymDocument represents a special type of document that contains synonym fields.
// Each SynonymField is a field with a list of synonyms for a set of terms.
// These fields are derived from synonym definitions, and their names correspond to the synonym sources.
//
// SynonymDocument embeds Document, so it can be passed wherever a Document is
// accepted; a type assertion to SynonymDocument is what distinguishes it from
// a regular document.
type SynonymDocument interface {
Document
// VisitSynonymFields allows iteration over all synonym fields in the document.
// The provided visitor function is called for each synonym field.
// The visitor returns nothing, so it has no way to stop the iteration early.
VisitSynonymFields(visitor SynonymFieldVisitor)
}
81 changes: 81 additions & 0 deletions index.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,12 +105,41 @@ type CopyReader interface {
CloseCopyReader() error
}

// RegexAutomaton abstracts an automaton built using a regex pattern.
// The pattern is fixed when the automaton is built; this interface only
// exposes matching against it.
type RegexAutomaton interface {
// MatchesRegex returns true if the given string matches the regex pattern
// used to build the automaton, and false otherwise.
// NOTE(review): whether the match is anchored to the whole string or may
// match a substring is not stated here — confirm with implementations.
MatchesRegex(string) bool
}

// IndexReaderRegexp provides functionality to work with regex-based field dictionaries.
type IndexReaderRegexp interface {
// FieldDictRegexp returns a FieldDict for terms matching the specified regex pattern
// in the dictionary of the given field.
// A non-nil error is returned if the dictionary cannot be produced.
FieldDictRegexp(field string, regex string) (FieldDict, error)

// FieldDictRegexpAutomaton returns a FieldDict and a RegexAutomaton that can be used
// to match strings against the regex pattern.
// It takes the same arguments as FieldDictRegexp.
// NOTE(review): presumably the returned FieldDict is equivalent to the one
// FieldDictRegexp returns for the same arguments — confirm with implementations.
FieldDictRegexpAutomaton(field string, regex string) (FieldDict, RegexAutomaton, error)
}

// FuzzyAutomaton abstracts a Levenshtein automaton built using a term and a fuzziness value.
// NOTE(review): confirm whether implementations count transpositions as a
// single edit (Damerau-Levenshtein) or use plain Levenshtein distance.
type FuzzyAutomaton interface {
// MatchAndDistance checks if the given string is within the fuzziness distance
// of the term used to build the automaton. It also returns the edit (Levenshtein)
// distance between the string and the term.
//
// The first result reports the match; the second is the distance as a uint8.
// NOTE(review): the value of the distance when the first result is false is
// not specified here — callers should presumably ignore it in that case.
MatchAndDistance(term string) (bool, uint8)
}

// IndexReaderFuzzy provides functionality to work with fuzzy matching in field dictionaries.
type IndexReaderFuzzy interface {
// FieldDictFuzzy returns a FieldDict for terms that are within the specified fuzziness
// distance of the given term and match the specified prefix in the given field.
// A non-nil error is returned if the dictionary cannot be produced.
FieldDictFuzzy(field string, term string, fuzziness int, prefix string) (FieldDict, error)

// FieldDictFuzzyAutomaton returns a FieldDict and a FuzzyAutomaton that can be used
// to calculate the edit distance between the term and other strings.
// It takes the same arguments as FieldDictFuzzy.
// NOTE(review): presumably the returned FieldDict is equivalent to the one
// FieldDictFuzzy returns for the same arguments — confirm with implementations.
FieldDictFuzzyAutomaton(field string, term string, fuzziness int, prefix string) (FieldDict, FuzzyAutomaton, error)
}

type IndexReaderContains interface {
Expand Down Expand Up @@ -252,3 +281,55 @@ type IndexBuilder interface {
Index(doc Document) error
Close() error
}

// ThesaurusTermReader is an interface for enumerating synonyms of a term in a thesaurus.
type ThesaurusTermReader interface {
// Next returns the next synonym of the term, or an error if something goes wrong.
// The synonym is returned as a string, so the end of the enumeration cannot
// be signalled with nil.
// NOTE(review): presumably an empty string with a nil error marks the end of
// the enumeration — confirm with implementations.
Next() (string, error)

// Close releases any resources associated with the reader.
Close() error

// Size returns an integer size for the reader.
// NOTE(review): the unit is not specified here; presumably this is the
// approximate memory footprint in bytes — confirm with implementations.
Size() int
}

// ThesaurusEntry represents a term in the thesaurus for which synonyms are stored.
// It is the element type yielded by ThesaurusKeys.Next.
type ThesaurusEntry struct {
// Term is the thesaurus key itself.
Term string
}

// ThesaurusKeys is an interface for enumerating terms (keys) in a thesaurus.
type ThesaurusKeys interface {
// Next returns the next key in the thesaurus, or an error if something goes wrong.
// Returns a nil *ThesaurusEntry when the enumeration is complete.
Next() (*ThesaurusEntry, error)

// Close releases any resources associated with the reader.
Close() error
}

// ThesaurusReader is an interface for accessing a thesaurus in the index.
// It embeds IndexReader, so every ThesaurusReader is also usable as a regular
// IndexReader. In every method, name identifies the thesaurus to read from.
type ThesaurusReader interface {
IndexReader

// ThesaurusTermReader returns a reader for the synonyms of a given term in the
// specified thesaurus.
// Unlike the key-enumeration methods below, it accepts a context and takes
// the term as a byte slice.
ThesaurusTermReader(ctx context.Context, name string, term []byte) (ThesaurusTermReader, error)

// ThesaurusKeys returns a reader for all terms in the specified thesaurus.
ThesaurusKeys(name string) (ThesaurusKeys, error)

// ThesaurusKeysFuzzy returns a reader for terms in the specified thesaurus that
// match the given prefix and are within the specified fuzziness distance from
// the provided term.
ThesaurusKeysFuzzy(name string, term string, fuzziness int, prefix string) (ThesaurusKeys, error)

// ThesaurusKeysRegexp returns a reader for terms in the specified thesaurus that
// match the given regular expression pattern.
ThesaurusKeysRegexp(name string, regex string) (ThesaurusKeys, error)

// ThesaurusKeysPrefix returns a reader for terms in the specified thesaurus that
// start with the given prefix.
// The prefix is supplied as a byte slice rather than a string.
ThesaurusKeysPrefix(name string, termPrefix []byte) (ThesaurusKeys, error)
}

0 comments on commit bc5aa25

Please sign in to comment.