From bc5aa25effbff5931ac9197d5a8b87e645c4a00d Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Wed, 11 Dec 2024 21:59:09 +0530 Subject: [PATCH] Interfaces for thesaurus datatype (#57) - Add interfaces to abstract the thesaurus and its helper iterator methods. - Extend Fuzzy and Regex FieldDict interfaces to return abstracted automatons for calculating Damerau-Levenshtein distance and regex term matching, respectively, based on the original term/pattern using which these automatons were built. - Add interfaces for special synonym documents and synonym fields. These interfaces allow differentiation between synonym documents and regular documents during processing in the index. --- document.go | 23 +++++++++++++++ index.go | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+) diff --git a/document.go b/document.go index 0f9012f..10d48fe 100644 --- a/document.go +++ b/document.go @@ -91,3 +91,26 @@ type TokenizableSpatialField interface { // to override the spatial token generations during the analysis phase. SetSpatialAnalyzerPlugin(SpatialAnalyzerPlugin) } + +// SynonymField represents a field that contains a list of synonyms for a set of terms. +// Each SynonymField is generated from a single synonym definition, and its name corresponds +// to the synonym source to which the synonym definition belongs. +type SynonymField interface { + Field + // IterateSynonyms iterates over the terms in the field and their synonyms. + // The provided visitor function is called with each term and its corresponding synonyms. + IterateSynonyms(visitor func(term string, synonyms []string)) +} + +// SynonymFieldVisitor is a function type used to visit a SynonymField within a document. +type SynonymFieldVisitor func(SynonymField) + +// SynonymDocument represents a special type of document that contains synonym fields. +// Each SynonymField is a field with a list of synonyms for a set of terms. 
+// These fields are derived from synonym definitions, and their names correspond to the synonym sources. +type SynonymDocument interface { + Document + // VisitSynonymFields allows iteration over all synonym fields in the document. + // The provided visitor function is called for each synonym field. + VisitSynonymFields(visitor SynonymFieldVisitor) +} diff --git a/index.go b/index.go index 85a50e7..4d8ecd3 100644 --- a/index.go +++ b/index.go @@ -105,12 +105,41 @@ type CopyReader interface { CloseCopyReader() error } +// RegexAutomaton abstracts an automaton built using a regex pattern. +type RegexAutomaton interface { + // MatchesRegex returns true if the given string matches the regex pattern + // used to build the automaton. + MatchesRegex(string) bool +} + +// IndexReaderRegexp provides functionality to work with regex-based field dictionaries. type IndexReaderRegexp interface { + // FieldDictRegexp returns a FieldDict for terms matching the specified regex pattern + // in the dictionary of the given field. FieldDictRegexp(field string, regex string) (FieldDict, error) + + // FieldDictRegexpAutomaton returns a FieldDict and a RegexAutomaton that can be used + // to match strings against the regex pattern. + FieldDictRegexpAutomaton(field string, regex string) (FieldDict, RegexAutomaton, error) } +// FuzzyAutomaton abstracts a Levenshtein automaton built using a term and a fuzziness value. +type FuzzyAutomaton interface { + // MatchAndDistance checks if the given string is within the fuzziness distance + // of the term used to build the automaton. It also returns the edit (Levenshtein) + // distance between the string and the term. + MatchAndDistance(term string) (bool, uint8) +} + +// IndexReaderFuzzy provides functionality to work with fuzzy matching in field dictionaries. 
type IndexReaderFuzzy interface { + // FieldDictFuzzy returns a FieldDict for terms that are within the specified fuzziness + // distance of the given term and match the specified prefix in the given field. FieldDictFuzzy(field string, term string, fuzziness int, prefix string) (FieldDict, error) + + // FieldDictFuzzyAutomaton returns a FieldDict and a FuzzyAutomaton that can be used + // to calculate the edit distance between the term and other strings. + FieldDictFuzzyAutomaton(field string, term string, fuzziness int, prefix string) (FieldDict, FuzzyAutomaton, error) } type IndexReaderContains interface { @@ -252,3 +281,55 @@ type IndexBuilder interface { Index(doc Document) error Close() error } + +// ThesaurusTermReader is an interface for enumerating synonyms of a term in a thesaurus. +type ThesaurusTermReader interface { + // Next returns the next synonym of the term, or an error if something goes wrong. + // NOTE(review): the result is a string, which cannot be nil; presumably an empty string with a nil error signals that the enumeration is complete - confirm against implementations. + Next() (string, error) + + // Close releases any resources associated with the reader. + Close() error + + Size() int // NOTE(review): undocumented here; clarify what Size measures (e.g. bytes of memory used vs. number of synonyms). +} + +// ThesaurusEntry represents a term in the thesaurus for which synonyms are stored. +type ThesaurusEntry struct { + Term string +} + +// ThesaurusKeys is an interface for enumerating terms (keys) in a thesaurus. +type ThesaurusKeys interface { + // Next returns the next key in the thesaurus, or an error if something goes wrong. + // Returns a nil entry when the enumeration is complete. + Next() (*ThesaurusEntry, error) + + // Close releases any resources associated with the reader. + Close() error +} + +// ThesaurusReader is an interface for accessing a thesaurus in the index. 
+type ThesaurusReader interface { + IndexReader + + // ThesaurusTermReader returns a reader for the synonyms of a given term in the + // specified thesaurus. 
+ ThesaurusTermReader(ctx context.Context, name string, term []byte) (ThesaurusTermReader, error) + + // ThesaurusKeys returns a reader for all terms in the specified thesaurus. + ThesaurusKeys(name string) (ThesaurusKeys, error) + + // ThesaurusKeysFuzzy returns a reader for terms in the specified thesaurus that + // match the given prefix and are within the specified fuzziness distance from + // the provided term. + ThesaurusKeysFuzzy(name string, term string, fuzziness int, prefix string) (ThesaurusKeys, error) + + // ThesaurusKeysRegexp returns a reader for terms in the specified thesaurus that + // match the given regular expression pattern. + ThesaurusKeysRegexp(name string, regex string) (ThesaurusKeys, error) + + // ThesaurusKeysPrefix returns a reader for terms in the specified thesaurus that + // start with the given prefix. + ThesaurusKeysPrefix(name string, termPrefix []byte) (ThesaurusKeys, error) +}