From f0474a03fe129c16d749d5ec792b91910db3d3a2 Mon Sep 17 00:00:00 2001 From: CascadingRadium Date: Mon, 14 Oct 2024 19:22:04 +0530 Subject: [PATCH 1/6] interfaces for thesaurus datatype --- segment.go | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/segment.go b/segment.go index 8e4a3d9..4432389 100644 --- a/segment.go +++ b/segment.go @@ -61,6 +61,11 @@ type PersistedSegment interface { Path() string } +type SynonymSegment interface { + Segment + Thesaurus(field string) (Thesaurus, error) +} + type TermDictionary interface { PostingsList(term []byte, except *roaring.Bitmap, prealloc PostingsList) (PostingsList, error) @@ -70,6 +75,12 @@ type TermDictionary interface { Contains(key []byte) (bool, error) } +type Thesaurus interface { + SynonymsList(term []byte, except *roaring.Bitmap, prealloc SynonymsList) (SynonymsList, error) + + Contains(key []byte) (bool, error) +} + type DictionaryIterator interface { Next() (*index.DictEntry, error) } @@ -89,6 +100,10 @@ type PostingsList interface { // Or(other PostingsList) PostingsList } +type SynonymsList interface { + Iterator(prealloc SynonymsIterator) SynonymsIterator +} + type PostingsIterator interface { DiskStatsReporter @@ -107,6 +122,14 @@ type PostingsIterator interface { Size() int } +type SynonymsIterator interface { + // The caller is responsible for copying whatever it needs from + // the returned Posting instance before calling Next(), as some + // implementations may return a shared instance to reduce memory + // allocations. + Next() (Synonym, error) +} + type DiskStatsReporter interface { // BytesRead returns the bytes read from the disk as // part of the current running query. @@ -139,6 +162,12 @@ type Posting interface { Size() int } +type Synonym interface { + Term() string + DocNum() uint32 + SynonymID() uint32 +} + type Location interface { Field() string Start() uint64 From ba6b474cd75c70fe3287e6fc508ef8bc6705064f Mon Sep 17 00:00:00 2001 From: CascadingRadium Date: Tue, 5 Nov 2024 18:00:45 +0530 Subject: [PATCH 2/6] add size --- segment.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/segment.go b/segment.go index 4432389..e5fea1f 100644 --- a/segment.go +++ b/segment.go @@ -78,6 +78,8 @@ type TermDictionary interface { type Thesaurus interface { SynonymsList(term []byte, except *roaring.Bitmap, prealloc SynonymsList) (SynonymsList, error) + Size() int + Contains(key []byte) (bool, error) } @@ -102,6 +104,8 @@ type PostingsList interface { type SynonymsList interface { Iterator(prealloc SynonymsIterator) SynonymsIterator + + Size() int } type PostingsIterator interface { @@ -128,6 +132,8 @@ type SynonymsIterator interface { // implementations may return a shared instance to reduce memory // allocations. Next() (Synonym, error) + + Size() int } type DiskStatsReporter interface { From d6ac8cd28c7b8e02be731fc50356cd242fbb05bc Mon Sep 17 00:00:00 2001 From: CascadingRadium Date: Wed, 6 Nov 2024 16:12:34 +0530 Subject: [PATCH 3/6] fix size API --- segment.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/segment.go b/segment.go index e5fea1f..904f370 100644 --- a/segment.go +++ b/segment.go @@ -78,8 +78,6 @@ type TermDictionary interface { type Thesaurus interface { SynonymsList(term []byte, except *roaring.Bitmap, prealloc SynonymsList) (SynonymsList, error) - Size() int - Contains(key []byte) (bool, error) } @@ -172,6 +170,7 @@ type Synonym interface { Term() string DocNum() uint32 SynonymID() uint32 + Size() int } type Location interface { From e2cede86db0d355177b7dce8a1b8011f873c8620 Mon Sep 17 00:00:00 2001 From: CascadingRadium Date: Mon, 9 Dec 2024 12:40:41 +0530 Subject: [PATCH 4/6] reorder interfaces --- segment.go | 68 +++++++++++++++++++++++++++--------------------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/segment.go b/segment.go index 904f370..a9a396d 100644 --- a/segment.go +++ b/segment.go @@ -61,11 +61,6 @@ type PersistedSegment interface { Path() string } -type SynonymSegment interface { - Segment - Thesaurus(field string) (Thesaurus, error) -} - type TermDictionary interface { PostingsList(term []byte, except *roaring.Bitmap, prealloc PostingsList) (PostingsList, error) @@ -75,12 +70,6 @@ type TermDictionary interface { Contains(key []byte) (bool, error) } -type Thesaurus interface { - SynonymsList(term []byte, except *roaring.Bitmap, prealloc SynonymsList) (SynonymsList, error) - - Contains(key []byte) (bool, error) -} - type DictionaryIterator interface { Next() (*index.DictEntry, error) } @@ -100,12 +89,6 @@ type PostingsList interface { // Or(other PostingsList) PostingsList } -type SynonymsList interface { - Iterator(prealloc SynonymsIterator) SynonymsIterator - - Size() int -} - type PostingsIterator interface { DiskStatsReporter @@ -124,16 +107,6 @@ type PostingsIterator interface { Size() int } -type SynonymsIterator interface { - // The caller is responsible for copying whatever it needs from - // the returned Posting instance before calling Next(), as some - // implementations may return a shared instance to reduce memory - // allocations. - Next() (Synonym, error) - - Size() int -} - type DiskStatsReporter interface { // BytesRead returns the bytes read from the disk as // part of the current running query. @@ -166,13 +139,6 @@ type Posting interface { Size() int } -type Synonym interface { - Term() string - DocNum() uint32 - SynonymID() uint32 - Size() int -} - type Location interface { Field() string Start() uint64 @@ -212,3 +178,37 @@ type FieldStats interface { Aggregate(stats FieldStats) Fetch() map[string]map[string]uint64 } + +type ThesaurusSegment interface { + Segment + Thesaurus(field string) (Thesaurus, error) +} + +type Thesaurus interface { + SynonymsList(term []byte, except *roaring.Bitmap, prealloc SynonymsList) (SynonymsList, error) + + AutomatonIterator(a Automaton, + startKeyInclusive, endKeyExclusive []byte) ThesaurusIterator + + Contains(key []byte) (bool, error) +} + +type ThesaurusIterator interface { + Next() (*index.ThesaurusEntry, error) +} + +type SynonymsList interface { + Iterator(prealloc SynonymsIterator) SynonymsIterator + Size() int +} + +type SynonymsIterator interface { + Next() (Synonym, error) + Size() int +} + +type Synonym interface { + Number() uint32 + Term() string + Size() int +} From c2fe0537415aa2b3d16830539f59dd86aad33f6d Mon Sep 17 00:00:00 2001 From: CascadingRadium Date: Wed, 11 Dec 2024 17:27:35 +0530 Subject: [PATCH 5/6] add comments --- segment.go | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/segment.go b/segment.go index a9a396d..a6d1183 100644 --- a/segment.go +++ b/segment.go @@ -179,36 +179,58 @@ type FieldStats interface { Fetch() map[string]map[string]uint64 } +// ThesaurusSegment provides access to a thesaurus within a specific segment of the index. type ThesaurusSegment interface { Segment - Thesaurus(field string) (Thesaurus, error) + // Thesaurus returns the Thesaurus with the specified name. + Thesaurus(name string) (Thesaurus, error) } +// Thesaurus encapsulates a structured collection of terms and their associated synonyms. type Thesaurus interface { + // SynonymsList retrieves a list of synonyms for the specified term. The `except` parameter + // excludes specific synonyms, such as those originating from deleted documents. The `prealloc` + // parameter allows the use of preallocated memory to optimize performance. SynonymsList(term []byte, except *roaring.Bitmap, prealloc SynonymsList) (SynonymsList, error) - AutomatonIterator(a Automaton, - startKeyInclusive, endKeyExclusive []byte) ThesaurusIterator + // AutomatonIterator creates an iterator over the thesaurus keys/terms using the provided automaton. + // The iteration is constrained by the specified key range [startKeyInclusive, endKeyExclusive). + // These terms or keys are the ones that have a SynonymsList associated with them, in the thesaurus. + AutomatonIterator(a Automaton, startKeyInclusive, endKeyExclusive []byte) ThesaurusIterator + // Contains checks if the given key exists in the thesaurus. Contains(key []byte) (bool, error) } +// ThesaurusIterator iterates over terms in a thesaurus. type ThesaurusIterator interface { + // Next returns the next entry in the thesaurus or an error if iteration fails. Next() (*index.ThesaurusEntry, error) } +// SynonymsList represents a list of synonyms for a term. type SynonymsList interface { + // Iterator returns an iterator to traverse the list of synonyms. + // The `prealloc` parameter can be used to reuse existing memory for the iterator. Iterator(prealloc SynonymsIterator) SynonymsIterator + Size() int } +// SynonymsIterator provides a mechanism to iterate over a list of synonyms. type SynonymsIterator interface { + // Next returns the next synonym in the list or an error if iteration fails. Next() (Synonym, error) + Size() int } +// Synonym represents a single synonym for a term in the thesaurus. type Synonym interface { + // Number returns the document number from which the synonym originates. Number() uint32 + // Term returns the textual representation of the synonym. Term() string + Size() int } From c4155949f81da52f3f988c76c38918d74eb19f16 Mon Sep 17 00:00:00 2001 From: Abhinav Dangeti Date: Wed, 11 Dec 2024 09:33:29 -0700 Subject: [PATCH 6/6] Upgrade bleve_index_api --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index e82187e..155e222 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,7 @@ go 1.21 require ( github.com/RoaringBitmap/roaring v1.9.3 - github.com/blevesearch/bleve_index_api v1.1.12 + github.com/blevesearch/bleve_index_api v1.2.0 ) require ( diff --git a/go.sum b/go.sum index 433da92..9b45c05 100644 --- a/go.sum +++ b/go.sum @@ -2,8 +2,8 @@ github.com/RoaringBitmap/roaring v1.9.3 h1:t4EbC5qQwnisr5PrP9nt0IRhRTb9gMUgQF4t4 github.com/RoaringBitmap/roaring v1.9.3/go.mod h1:6AXUsoIEzDTFFQCe1RbGA6uFONMhvejWj5rqITANK90= github.com/bits-and-blooms/bitset v1.12.0 h1:U/q1fAF7xXRhFCrhROzIfffYnu+dlS38vCZtmFVPHmA= github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= -github.com/blevesearch/bleve_index_api v1.1.12 h1:P4bw9/G/5rulOF7SJ9l4FsDoo7UFJ+5kexNy1RXfegY= -github.com/blevesearch/bleve_index_api v1.1.12/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8= +github.com/blevesearch/bleve_index_api v1.2.0 h1:/DXMMWBwx/UmGKM1xDhTwDoJI5yQrG6rqRWPFcOgUVo= +github.com/blevesearch/bleve_index_api v1.2.0/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=