From 42c2a7078bd48cb52737eea15b1202681510291e Mon Sep 17 00:00:00 2001 From: Likith B Date: Tue, 25 Feb 2025 14:31:00 +0530 Subject: [PATCH 1/8] MB-59633: Toy: Remove encoded polygons from term dictionaries --- section_inverted_text_index.go | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/section_inverted_text_index.go b/section_inverted_text_index.go index 09b7163..4f83449 100644 --- a/section_inverted_text_index.go +++ b/section_inverted_text_index.go @@ -19,6 +19,7 @@ import ( "encoding/binary" "math" "sort" + "strings" "sync/atomic" "github.com/RoaringBitmap/roaring/v2" @@ -558,9 +559,11 @@ func (io *invertedIndexOpaque) writeDicts(w *CountHashWriter) (dictOffsets []uin } if postingsOffset > uint64(0) { - err = io.builder.Insert([]byte(term), postingsOffset) - if err != nil { - return nil, err + if !strings.HasPrefix(term, "##") { + err = io.builder.Insert([]byte(term), postingsOffset) + if err != nil { + return nil, err + } } } From dfe84ed96d23add10332dcb2247432be16cb2a19 Mon Sep 17 00:00:00 2001 From: Likith B Date: Thu, 6 Mar 2025 10:50:48 +0530 Subject: [PATCH 2/8] MB-59633: Added commentary --- section_inverted_text_index.go | 1 + 1 file changed, 1 insertion(+) diff --git a/section_inverted_text_index.go b/section_inverted_text_index.go index 4f83449..d40b3d8 100644 --- a/section_inverted_text_index.go +++ b/section_inverted_text_index.go @@ -559,6 +559,7 @@ func (io *invertedIndexOpaque) writeDicts(w *CountHashWriter) (dictOffsets []uin } if postingsOffset > uint64(0) { + // Ignore terms with ##. They are glue bytes for encoded polygons if !strings.HasPrefix(term, "##") { err = io.builder.Insert([]byte(term), postingsOffset) if err != nil { From 8a8bc96caafd3c17eec7d73e04d4ec91b71857bb Mon Sep 17 00:00:00 2001 From: Likith B Date: Thu, 6 Mar 2025 13:35:15 +0530 Subject: [PATCH 3/8] MB-59633: Minor Code Refactor --- section_inverted_text_index.go | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/section_inverted_text_index.go b/section_inverted_text_index.go index d40b3d8..5611d89 100644 --- a/section_inverted_text_index.go +++ b/section_inverted_text_index.go @@ -560,7 +560,7 @@ func (io *invertedIndexOpaque) writeDicts(w *CountHashWriter) (dictOffsets []uin if postingsOffset > uint64(0) { // Ignore terms with ##. They are glue bytes for encoded polygons - if !strings.HasPrefix(term, "##") { + if _, exists := io.geoShapeFields[uint16(fieldID)]; !exists || !strings.HasPrefix(term, "##") { err = io.builder.Insert([]byte(term), postingsOffset) if err != nil { return nil, err @@ -793,6 +793,10 @@ func (i *invertedIndexOpaque) realloc() { if field.Options().IncludeDocValues() { i.IncludeDocValues[fieldID] = true } + + if field.EncodedFieldType() == 's' { + i.geoShapeFields[fieldID] = struct{}{} + } } if cap(i.IncludeDocValues) >= len(i.FieldsInv) { @@ -801,6 +805,10 @@ func (i *invertedIndexOpaque) realloc() { i.IncludeDocValues = make([]bool, len(i.FieldsInv)) } + if i.geoShapeFields == nil { + i.geoShapeFields = make(map[uint16]struct{}) + } + for _, result := range i.results { // walk each composite field result.VisitComposite(func(field index.CompositeField) { @@ -971,6 +979,8 @@ type invertedIndexOpaque struct { tmp0 []byte + geoShapeFields map[uint16]struct{} + fieldAddrs map[int]int fieldsSame bool @@ -1021,6 +1031,7 @@ func (io *invertedIndexOpaque) Reset() (err error) { io.reusableFieldTFs = io.reusableFieldTFs[:0] io.tmp0 = io.tmp0[:0] + io.geoShapeFields = nil atomic.StoreUint64(&io.bytesWritten, 0) io.fieldsSame = false io.numDocs = 0 From 5cb76a6c05823d4b498007f61d793f2539969d80 Mon Sep 17 00:00:00 2001 From: Likith B Date: Fri, 7 Mar 2025 16:08:29 +0530 Subject: [PATCH 4/8] MB-59633: Add term to docvalues instead of removing from dicts --- section_inverted_text_index.go | 43 ++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/section_inverted_text_index.go b/section_inverted_text_index.go index 5611d89..1d85dd7 100644 --- a/section_inverted_text_index.go +++ b/section_inverted_text_index.go @@ -19,7 +19,6 @@ import ( "encoding/binary" "math" "sort" - "strings" "sync/atomic" "github.com/RoaringBitmap/roaring/v2" @@ -559,12 +558,9 @@ func (io *invertedIndexOpaque) writeDicts(w *CountHashWriter) (dictOffsets []uin } if postingsOffset > uint64(0) { - // Ignore terms with ##. They are glue bytes for encoded polygons - if _, exists := io.geoShapeFields[uint16(fieldID)]; !exists || !strings.HasPrefix(term, "##") { - err = io.builder.Insert([]byte(term), postingsOffset) - if err != nil { - return nil, err - } + err = io.builder.Insert([]byte(term), postingsOffset) + if err != nil { + return nil, err } } @@ -615,6 +611,11 @@ func (io *invertedIndexOpaque) writeDicts(w *CountHashWriter) (dictOffsets []uin fdvEncoder := newChunkedContentCoder(chunkSize, uint64(len(io.results)-1), w, false) if io.IncludeDocValues[fieldID] { for docNum, docTerms := range docTermMap { + if fieldTermMap, ok := io.specialTerms[int(docNum)]; ok { + if sTerm, ok := fieldTermMap[fieldID]; ok { + docTerms = append(append(docTerms, sTerm...), termSeparator) + } + } if len(docTerms) > 0 { err = fdvEncoder.Add(uint64(docNum), docTerms) if err != nil { @@ -759,7 +760,7 @@ func (i *invertedIndexOpaque) realloc() { i.FieldsMap[fieldName] = uint16(fieldID + 1) } - visitField := func(field index.Field) { + visitField := func(field index.Field, docNum int) { fieldID := uint16(i.getOrDefineField(field.Name())) dict := i.Dicts[fieldID] @@ -795,7 +796,12 @@ func (i *invertedIndexOpaque) realloc() { } if field.EncodedFieldType() == 's' { - i.geoShapeFields[fieldID] = struct{}{} + if f, ok := field.(index.GeoShapeField); ok { + if _, exists := i.specialTerms[docNum]; !exists { + i.specialTerms[docNum] = make(map[int][]byte) + } + i.specialTerms[docNum][int(fieldID)] = f.EncodedShape() + } } } @@ -805,18 +811,20 @@ func (i *invertedIndexOpaque) realloc() { i.IncludeDocValues = make([]bool, len(i.FieldsInv)) } - if i.geoShapeFields == nil { - i.geoShapeFields = make(map[uint16]struct{}) + if i.specialTerms == nil { + i.specialTerms = map[int]map[int][]byte{} } - for _, result := range i.results { + for docNum, result := range i.results { // walk each composite field result.VisitComposite(func(field index.CompositeField) { - visitField(field) + visitField(field, docNum) }) // walk each field - result.VisitFields(visitField) + result.VisitFields(func(field index.Field) { + visitField(field, docNum) + }) } numPostingsLists := pidNext @@ -970,6 +978,9 @@ type invertedIndexOpaque struct { numTermsPerPostingsList []int // key is postings list id numLocsPerPostingsList []int // key is postings list id + // docNum -> fieldID -> term + specialTerms map[int]map[int][]byte + builder *vellum.Builder builderBuf bytes.Buffer @@ -979,8 +990,6 @@ type invertedIndexOpaque struct { tmp0 []byte - geoShapeFields map[uint16]struct{} - fieldAddrs map[int]int fieldsSame bool @@ -1031,7 +1040,7 @@ func (io *invertedIndexOpaque) Reset() (err error) { io.reusableFieldTFs = io.reusableFieldTFs[:0] io.tmp0 = io.tmp0[:0] - io.geoShapeFields = nil + io.specialTerms = nil atomic.StoreUint64(&io.bytesWritten, 0) io.fieldsSame = false io.numDocs = 0 From ec86bc077dd229275499e15c026fb91c1c7c6e7e Mon Sep 17 00:00:00 2001 From: Likith B Date: Mon, 10 Mar 2025 10:54:53 +0530 Subject: [PATCH 5/8] MB-59633: Minor Refactoring --- section_inverted_text_index.go | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/section_inverted_text_index.go b/section_inverted_text_index.go index 1d85dd7..6859147 100644 --- a/section_inverted_text_index.go +++ b/section_inverted_text_index.go @@ -611,7 +611,7 @@ func (io *invertedIndexOpaque) writeDicts(w *CountHashWriter) (dictOffsets []uin fdvEncoder := newChunkedContentCoder(chunkSize, uint64(len(io.results)-1), w, false) if io.IncludeDocValues[fieldID] { for docNum, docTerms := range docTermMap { - if fieldTermMap, ok := io.specialTerms[int(docNum)]; ok { + if fieldTermMap, ok := io.extraDocValues[int(docNum)]; ok { if sTerm, ok := fieldTermMap[fieldID]; ok { docTerms = append(append(docTerms, sTerm...), termSeparator) } @@ -795,13 +795,11 @@ func (i *invertedIndexOpaque) realloc() { i.IncludeDocValues[fieldID] = true } - if field.EncodedFieldType() == 's' { - if f, ok := field.(index.GeoShapeField); ok { - if _, exists := i.specialTerms[docNum]; !exists { - i.specialTerms[docNum] = make(map[int][]byte) - } - i.specialTerms[docNum][int(fieldID)] = f.EncodedShape() + if f, ok := field.(index.GeoShapeField); ok { + if _, exists := i.extraDocValues[docNum]; !exists { + i.extraDocValues[docNum] = make(map[int][]byte) } + i.extraDocValues[docNum][int(fieldID)] = f.EncodedShape() } } @@ -811,8 +809,8 @@ func (i *invertedIndexOpaque) realloc() { i.IncludeDocValues = make([]bool, len(i.FieldsInv)) } - if i.specialTerms == nil { - i.specialTerms = map[int]map[int][]byte{} + if i.extraDocValues == nil { + i.extraDocValues = map[int]map[int][]byte{} } for docNum, result := range i.results { @@ -979,7 +977,7 @@ type invertedIndexOpaque struct { numLocsPerPostingsList []int // key is postings list id // docNum -> fieldID -> term - specialTerms map[int]map[int][]byte + extraDocValues map[int]map[int][]byte builder *vellum.Builder builderBuf bytes.Buffer @@ -1040,7 +1038,7 @@ func (io *invertedIndexOpaque) Reset() (err error) { io.reusableFieldTFs = io.reusableFieldTFs[:0] io.tmp0 = io.tmp0[:0] - io.specialTerms = nil + io.extraDocValues = nil atomic.StoreUint64(&io.bytesWritten, 0) io.fieldsSame = false io.numDocs = 0 From 29b60ece4d30e0b5faddeae7774df5a2ef274e9b Mon Sep 17 00:00:00 2001 From: Abhinav Dangeti Date: Tue, 11 Mar 2025 10:26:22 -0700 Subject: [PATCH 6/8] Upgrade bleve_index_api, scorch_segment_api for interface change --- go.mod | 4 ++-- go.sum | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/go.mod b/go.mod index 40c5c89..1b4b5a4 100644 --- a/go.mod +++ b/go.mod @@ -4,10 +4,10 @@ go 1.21 require ( github.com/RoaringBitmap/roaring/v2 v2.4.5 - github.com/blevesearch/bleve_index_api v1.2.1 + github.com/blevesearch/bleve_index_api v1.2.4 github.com/blevesearch/go-faiss v1.0.24 github.com/blevesearch/mmap-go v1.0.4 - github.com/blevesearch/scorch_segment_api/v2 v2.3.3 + github.com/blevesearch/scorch_segment_api/v2 v2.3.6 github.com/blevesearch/vellum v1.1.0 github.com/golang/snappy v0.0.4 github.com/spf13/cobra v1.7.0 diff --git a/go.sum b/go.sum index bca8c45..17ff4ed 100644 --- a/go.sum +++ b/go.sum @@ -2,14 +2,14 @@ github.com/RoaringBitmap/roaring/v2 v2.4.5 h1:uGrrMreGjvAtTBobc0g5IrW1D5ldxDQYe2 github.com/RoaringBitmap/roaring/v2 v2.4.5/go.mod h1:FiJcsfkGje/nZBZgCu0ZxCPOKD/hVXDS2dXi7/eUFE0= github.com/bits-and-blooms/bitset v1.12.0 h1:U/q1fAF7xXRhFCrhROzIfffYnu+dlS38vCZtmFVPHmA= github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= -github.com/blevesearch/bleve_index_api v1.2.1 h1:IuXwLvmyp7I7+e0FOA68gcHHLfzSQ4AqQ8wVab5uxk0= -github.com/blevesearch/bleve_index_api v1.2.1/go.mod h1:rKQDl4u51uwafZxFrPD1R7xFOwKnzZW7s/LSeK4lgo0= +github.com/blevesearch/bleve_index_api v1.2.4 h1:XRk+wWtbNZSxKvW2bNg4fUpoc3XQDD63krBzB45fJDY= +github.com/blevesearch/bleve_index_api v1.2.4/go.mod h1:rKQDl4u51uwafZxFrPD1R7xFOwKnzZW7s/LSeK4lgo0= github.com/blevesearch/go-faiss v1.0.24 h1:K79IvKjoKHdi7FdiXEsAhxpMuns0x4fM0BO93bW5jLI= github.com/blevesearch/go-faiss v1.0.24/go.mod h1:OMGQwOaRRYxrmeNdMrXJPvVx8gBnvE5RYrr0BahNnkk= github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCDPWmc= github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs= -github.com/blevesearch/scorch_segment_api/v2 v2.3.3 h1:LtyQ1Wltja54bchqwgY20SvVe6HltUL4PsAPH3UNrQI= -github.com/blevesearch/scorch_segment_api/v2 v2.3.3/go.mod h1:LXidEjeenMdbcLKP/UdZi1HJOny61FbhslAh5SgN5Ik= +github.com/blevesearch/scorch_segment_api/v2 v2.3.6 h1:W9tnVyNqaowWrg8BgndTI4BXmwM+Pr6R/5+Xof7nJMk= +github.com/blevesearch/scorch_segment_api/v2 v2.3.6/go.mod h1:JNx97hWfRl223SpaO9aq0C6WZI9SNdBovMp7X9K7XqU= github.com/blevesearch/vellum v1.1.0 h1:CinkGyIsgVlYf8Y2LUQHvdelgXr6PYuvoDIajq6yR9w= github.com/blevesearch/vellum v1.1.0/go.mod h1:QgwWryE8ThtNPxtgWJof5ndPfx0/YMBh+W2weHKPw8Y= github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= From 6102d1e97a99fbcc2c253efce250ad2dcf30ec59 Mon Sep 17 00:00:00 2001 From: Likith B Date: Fri, 14 Mar 2025 15:52:10 +0530 Subject: [PATCH 7/8] MB-59633: Added some commentary --- section_inverted_text_index.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/section_inverted_text_index.go b/section_inverted_text_index.go index 6859147..60ed562 100644 --- a/section_inverted_text_index.go +++ b/section_inverted_text_index.go @@ -976,6 +976,8 @@ type invertedIndexOpaque struct { numTermsPerPostingsList []int // key is postings list id numLocsPerPostingsList []int // key is postings list id + // store terms that are unnecessary for the term dictionaries but needed in doc values + // eg - encoded geoshapes // docNum -> fieldID -> term extraDocValues map[int]map[int][]byte From a4754a5e6f7faaab719dd31b58863e30282c8a34 Mon Sep 17 00:00:00 2001 From: Likith B Date: Wed, 19 Mar 2025 19:50:33 +0530 Subject: [PATCH 8/8] MB-59633: Changed array type to uint16 --- section_inverted_text_index.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/section_inverted_text_index.go b/section_inverted_text_index.go index 60ed562..400a029 100644 --- a/section_inverted_text_index.go +++ b/section_inverted_text_index.go @@ -611,8 +611,8 @@ func (io *invertedIndexOpaque) writeDicts(w *CountHashWriter) (dictOffsets []uin fdvEncoder := newChunkedContentCoder(chunkSize, uint64(len(io.results)-1), w, false) if io.IncludeDocValues[fieldID] { for docNum, docTerms := range docTermMap { - if fieldTermMap, ok := io.extraDocValues[int(docNum)]; ok { - if sTerm, ok := fieldTermMap[fieldID]; ok { + if fieldTermMap, ok := io.extraDocValues[docNum]; ok { + if sTerm, ok := fieldTermMap[uint16(fieldID)]; ok { docTerms = append(append(docTerms, sTerm...), termSeparator) } } @@ -797,9 +797,9 @@ func (i *invertedIndexOpaque) realloc() { if f, ok := field.(index.GeoShapeField); ok { if _, exists := i.extraDocValues[docNum]; !exists { - i.extraDocValues[docNum] = make(map[int][]byte) + i.extraDocValues[docNum] = make(map[uint16][]byte) } - i.extraDocValues[docNum][int(fieldID)] = f.EncodedShape() + i.extraDocValues[docNum][fieldID] = f.EncodedShape() } } @@ -810,7 +810,7 @@ func (i *invertedIndexOpaque) realloc() { } if i.extraDocValues == nil { - i.extraDocValues = map[int]map[int][]byte{} + i.extraDocValues = map[int]map[uint16][]byte{} } for docNum, result := range i.results { @@ -979,7 +979,7 @@ type invertedIndexOpaque struct { // store terms that are unnecessary for the term dictionaries but needed in doc values // eg - encoded geoshapes // docNum -> fieldID -> term - extraDocValues map[int]map[int][]byte + extraDocValues map[int]map[uint16][]byte builder *vellum.Builder builderBuf bytes.Buffer