From 16093d1b650381b169c79a82ec1731509f157224 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Mon, 17 Nov 2025 15:08:30 +0530 Subject: [PATCH 01/70] rebase --- document/document.go | 38 +- index/scorch/introducer.go | 5 + index/scorch/scorch.go | 6 + index/scorch/snapshot_index.go | 61 +- index/scorch/snapshot_index_doc.go | 5 +- index/scorch/snapshot_index_tfr.go | 9 +- index/scorch/snapshot_index_vr.go | 9 +- index/scorch/snapshot_segment.go | 21 + index_impl.go | 109 ++- mapping/document.go | 18 +- mapping/index.go | 75 +- mapping/mapping.go | 16 + registry/nested.go | 108 +++ registry/registry.go | 2 + search/collector/eligible.go | 46 +- search/collector/knn.go | 42 +- search/collector/nested.go | 90 +++ search/collector/topn.go | 103 ++- search/explanation.go | 44 ++ .../highlighter/simple/highlighter_simple.go | 6 +- search/query/conjunction.go | 39 +- search/query/query.go | 9 +- search/scorer/scorer_conjunction_nested.go | 83 +++ search/search.go | 113 ++- search/searcher/search_conjunction_nested.go | 364 +++++++++ search/searcher/search_disjunction_heap.go | 7 +- search/searcher/search_match_all.go | 31 + search/searcher/search_numeric_range.go | 2 +- search/util.go | 26 + search_knn.go | 59 +- search_test.go | 705 ++++++++++++++++++ 31 files changed, 2146 insertions(+), 105 deletions(-) create mode 100644 registry/nested.go create mode 100644 search/collector/nested.go create mode 100644 search/scorer/scorer_conjunction_nested.go create mode 100644 search/searcher/search_conjunction_nested.go diff --git a/document/document.go b/document/document.go index 569d57bd6..280832be7 100644 --- a/document/document.go +++ b/document/document.go @@ -18,6 +18,7 @@ import ( "fmt" "reflect" + "github.com/blevesearch/bleve/v2/search" "github.com/blevesearch/bleve/v2/size" index "github.com/blevesearch/bleve_index_api" ) @@ -30,8 +31,9 @@ func init() { } type Document struct { - id string `json:"id"` - Fields []Field `json:"fields"` + id string + Fields []Field `json:"fields"` + NestedDocuments []*Document `json:"nested_documents"` CompositeFields []*CompositeField StoredFieldsSize uint64 indexed bool @@ -157,3 +159,35 @@ func (d *Document) SetIndexed() { func (d *Document) Indexed() bool { return d.indexed } + +func (d *Document) AddNestedDocument(doc *Document) { + d.NestedDocuments = append(d.NestedDocuments, doc) +} + +func (d *Document) NestedFields() search.FieldSet { + if len(d.NestedDocuments) == 0 { + return nil + } + + fieldSet := search.NewFieldSet() + var collectFields func(index.Document) + collectFields = func(doc index.Document) { + // Add all field names from this nested document + doc.VisitFields(func(field index.Field) { + fieldSet.AddField(field.Name()) + }) + // Recursively collect from this document's nested documents + if nd, ok := doc.(index.NestedDocument); ok { + nd.VisitNestedDocuments(collectFields) + } + } + // Start collection from nested documents only (not root document) + d.VisitNestedDocuments(collectFields) + return fieldSet +} + +func (d *Document) VisitNestedDocuments(visitor func(doc index.Document)) { + for _, doc := range d.NestedDocuments { + visitor(doc) + } +} diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index 8191584d2..597fb4bea 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -167,6 +167,11 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { newss.deleted = nil } + // update the deleted bitmap to include any nested/sub-documents as well + // if the segment supports that + if 
ns, ok := newss.segment.(segment.NestedSegment); ok { + newss.deleted = ns.AddNestedDocuments(newss.deleted) + } // check for live size before copying if newss.LiveSize() > 0 { newSnapshot.segment = append(newSnapshot.segment, newss) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 83924978e..85cf4ed02 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -768,6 +768,12 @@ func analyze(d index.Document, fn customAnalyzerPluginInitFunc) { } } }) + if nd, ok := d.(index.NestedDocument); ok { + nd.VisitNestedDocuments(func(doc index.Document) { + doc.AddIDField() + analyze(doc, fn) + }) + } } func (s *Scorch) AddEligibleForRemoval(epoch uint64) { diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 3f2a330c5..e283c2340 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -17,7 +17,6 @@ package scorch import ( "container/heap" "context" - "encoding/binary" "fmt" "os" "path/filepath" @@ -42,9 +41,8 @@ type asynchSegmentResult struct { dict segment.TermDictionary dictItr segment.DictionaryIterator - cardinality int - index int - docs *roaring.Bitmap + index int + docs *roaring.Bitmap thesItr segment.ThesaurusIterator @@ -59,11 +57,11 @@ func init() { var err error lb1, err = lev.NewLevenshteinAutomatonBuilder(1, true) if err != nil { - panic(fmt.Errorf("Levenshtein automaton ed1 builder err: %v", err)) + panic(fmt.Errorf("levenshtein automaton ed1 builder err: %v", err)) } lb2, err = lev.NewLevenshteinAutomatonBuilder(2, true) if err != nil { - panic(fmt.Errorf("Levenshtein automaton ed2 builder err: %v", err)) + panic(fmt.Errorf("levenshtein automaton ed2 builder err: %v", err)) } } @@ -474,7 +472,7 @@ func (is *IndexSnapshot) GetInternal(key []byte) ([]byte, error) { func (is *IndexSnapshot) DocCount() (uint64, error) { var rv uint64 for _, segment := range is.segment { - rv += segment.Count() + rv += segment.CountRoot() } return rv, nil } @@ -501,7 +499,7 @@ func (is *IndexSnapshot) Document(id string) (rv index.Document, err error) { return nil, nil } - docNum, err := docInternalToNumber(next.ID) + docNum, err := next.ID.Value() if err != nil { return nil, err } @@ -571,7 +569,7 @@ func (is *IndexSnapshot) segmentIndexAndLocalDocNumFromGlobal(docNum uint64) (in } func (is *IndexSnapshot) ExternalID(id index.IndexInternalID) (string, error) { - docNum, err := docInternalToNumber(id) + docNum, err := id.Value() if err != nil { return "", err } @@ -589,7 +587,7 @@ func (is *IndexSnapshot) ExternalID(id index.IndexInternalID) (string, error) { } func (is *IndexSnapshot) segmentIndexAndLocalDocNum(id index.IndexInternalID) (int, uint64, error) { - docNum, err := docInternalToNumber(id) + docNum, err := id.Value() if err != nil { return 0, 0, err } @@ -776,25 +774,6 @@ func (is *IndexSnapshot) recycleTermFieldReader(tfr *IndexSnapshotTermFieldReade is.m2.Unlock() } -func docNumberToBytes(buf []byte, in uint64) []byte { - if len(buf) != 8 { - if cap(buf) >= 8 { - buf = buf[0:8] - } else { - buf = make([]byte, 8) - } - } - binary.BigEndian.PutUint64(buf, in) - return buf -} - -func docInternalToNumber(in index.IndexInternalID) (uint64, error) { - if len(in) != 8 { - return 0, fmt.Errorf("wrong len for IndexInternalID: %q", in) - } - return binary.BigEndian.Uint64(in), nil -} - func (is *IndexSnapshot) documentVisitFieldTermsOnSegment( segmentIndex int, localDocNum uint64, fields []string, cFields []string, visitor index.DocValueVisitor, dvs segment.DocVisitState) ( @@ -897,7 +876,7 @@ func (dvr 
*DocValueReader) BytesRead() uint64 { func (dvr *DocValueReader) VisitDocValues(id index.IndexInternalID, visitor index.DocValueVisitor, ) (err error) { - docNum, err := docInternalToNumber(id) + docNum, err := id.Value() if err != nil { return err } @@ -1297,3 +1276,25 @@ func (is *IndexSnapshot) TermFrequencies(field string, limit int, descending boo return termFreqs[:limit], nil } + +func (i *IndexSnapshot) Ancestors(ID index.IndexInternalID) ([]index.IndexInternalID, error) { + seg, ldoc, err := i.segmentIndexAndLocalDocNum(ID) + if err != nil { + return nil, err + } + + ancestors := i.segment[seg].Ancestors(ldoc) + + // allocate space: +1 for the doc itself + rv := make([]index.IndexInternalID, len(ancestors)+1) + globalOffset := i.offsets[seg] + + // first element is the doc itself + rv[0] = index.NewIndexInternalID(nil, ldoc+globalOffset) + + // then all ancestors shifted by +1 + for j := 0; j < len(ancestors); j++ { + rv[j+1] = index.NewIndexInternalID(nil, ancestors[j]+globalOffset) + } + return rv, nil +} diff --git a/index/scorch/snapshot_index_doc.go b/index/scorch/snapshot_index_doc.go index 0a979bfb5..4048a199b 100644 --- a/index/scorch/snapshot_index_doc.go +++ b/index/scorch/snapshot_index_doc.go @@ -15,7 +15,6 @@ package scorch import ( - "bytes" "reflect" "github.com/RoaringBitmap/roaring/v2" @@ -49,7 +48,7 @@ func (i *IndexSnapshotDocIDReader) Next() (index.IndexInternalID, error) { next := i.iterators[i.segmentOffset].Next() // make segment number into global number by adding offset globalOffset := i.snapshot.offsets[i.segmentOffset] - return docNumberToBytes(nil, uint64(next)+globalOffset), nil + return index.NewIndexInternalID(nil, uint64(next)+globalOffset), nil } return nil, nil } @@ -63,7 +62,7 @@ func (i *IndexSnapshotDocIDReader) Advance(ID index.IndexInternalID) (index.Inde if next == nil { return nil, nil } - for bytes.Compare(next, ID) < 0 { + for next.Compare(ID) < 0 { next, err = i.Next() if err != nil { return nil, err diff --git a/index/scorch/snapshot_index_tfr.go b/index/scorch/snapshot_index_tfr.go index cd4d82dce..08d423925 100644 --- a/index/scorch/snapshot_index_tfr.go +++ b/index/scorch/snapshot_index_tfr.go @@ -15,7 +15,6 @@ package scorch import ( - "bytes" "context" "fmt" "reflect" @@ -94,7 +93,7 @@ func (i *IndexSnapshotTermFieldReader) Next(preAlloced *index.TermFieldDoc) (*in // make segment number into global number by adding offset globalOffset := i.snapshot.offsets[i.segmentOffset] nnum := next.Number() - rv.ID = docNumberToBytes(rv.ID, nnum+globalOffset) + rv.ID = index.NewIndexInternalID(rv.ID, nnum+globalOffset) i.postingToTermFieldDoc(next, rv) i.currID = rv.ID @@ -146,7 +145,7 @@ func (i *IndexSnapshotTermFieldReader) postingToTermFieldDoc(next segment.Postin func (i *IndexSnapshotTermFieldReader) Advance(ID index.IndexInternalID, preAlloced *index.TermFieldDoc) (*index.TermFieldDoc, error) { // FIXME do something better // for now, if we need to seek backwards, then restart from the beginning - if i.currPosting != nil && bytes.Compare(i.currID, ID) >= 0 { + if i.currPosting != nil && i.currID.Compare(ID) >= 0 { // Check if the TFR is a special unadorned composite optimization. // Such a TFR will NOT have a valid `term` or `field` set, making it // impossible for the TFR to replace itself with a new one. 
@@ -171,7 +170,7 @@ func (i *IndexSnapshotTermFieldReader) Advance(ID index.IndexInternalID, preAllo } } } - num, err := docInternalToNumber(ID) + num, err := ID.Value() if err != nil { return nil, fmt.Errorf("error converting to doc number % x - %v", ID, err) } @@ -196,7 +195,7 @@ func (i *IndexSnapshotTermFieldReader) Advance(ID index.IndexInternalID, preAllo if preAlloced == nil { preAlloced = &index.TermFieldDoc{} } - preAlloced.ID = docNumberToBytes(preAlloced.ID, next.Number()+ + preAlloced.ID = index.NewIndexInternalID(preAlloced.ID, next.Number()+ i.snapshot.offsets[segIndex]) i.postingToTermFieldDoc(next, preAlloced) i.currID = preAlloced.ID diff --git a/index/scorch/snapshot_index_vr.go b/index/scorch/snapshot_index_vr.go index bd57ad3e0..5e510c4d6 100644 --- a/index/scorch/snapshot_index_vr.go +++ b/index/scorch/snapshot_index_vr.go @@ -18,7 +18,6 @@ package scorch import ( - "bytes" "context" "encoding/json" "fmt" @@ -96,7 +95,7 @@ func (i *IndexSnapshotVectorReader) Next(preAlloced *index.VectorDoc) ( // make segment number into global number by adding offset globalOffset := i.snapshot.offsets[i.segmentOffset] nnum := next.Number() - rv.ID = docNumberToBytes(rv.ID, nnum+globalOffset) + rv.ID = index.NewIndexInternalID(rv.ID, nnum+globalOffset) rv.Score = float64(next.Score()) i.currID = rv.ID @@ -113,7 +112,7 @@ func (i *IndexSnapshotVectorReader) Next(preAlloced *index.VectorDoc) ( func (i *IndexSnapshotVectorReader) Advance(ID index.IndexInternalID, preAlloced *index.VectorDoc) (*index.VectorDoc, error) { - if i.currPosting != nil && bytes.Compare(i.currID, ID) >= 0 { + if i.currPosting != nil && i.currID.Compare(ID) >= 0 { i2, err := i.snapshot.VectorReader(i.ctx, i.vector, i.field, i.k, i.searchParams, i.eligibleSelector) if err != nil { @@ -124,7 +123,7 @@ func (i *IndexSnapshotVectorReader) Advance(ID index.IndexInternalID, *i = *(i2.(*IndexSnapshotVectorReader)) } - num, err := docInternalToNumber(ID) + num, err := ID.Value() if err != nil { return nil, fmt.Errorf("error converting to doc number % x - %v", ID, err) } @@ -149,7 +148,7 @@ func (i *IndexSnapshotVectorReader) Advance(ID index.IndexInternalID, if preAlloced == nil { preAlloced = &index.VectorDoc{} } - preAlloced.ID = docNumberToBytes(preAlloced.ID, next.Number()+ + preAlloced.ID = index.NewIndexInternalID(preAlloced.ID, next.Number()+ i.snapshot.offsets[segIndex]) i.currID = preAlloced.ID i.currPosting = next diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index c6f3584cc..ed3684557 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -113,6 +113,19 @@ func (s *SegmentSnapshot) Count() uint64 { return rv } +// this counts the root documents in the segment this differs from Count() in that +// Count() counts all live documents including nested children, whereas this method +// counts only root live documents +func (s *SegmentSnapshot) CountRoot() uint64 { + var rv uint64 + if nsb, ok := s.segment.(segment.NestedSegment); ok { + rv = nsb.CountRoot(s.deleted) + } else { + rv = s.Count() + } + return rv +} + func (s *SegmentSnapshot) DocNumbers(docIDs []string) (*roaring.Bitmap, error) { rv, err := s.segment.DocNumbers(docIDs) if err != nil { @@ -361,3 +374,11 @@ func (c *cachedMeta) fetchMeta(field string) (rv interface{}) { c.m.RUnlock() return rv } + +func (s *SegmentSnapshot) Ancestors(docID uint64) []uint64 { + nsb, ok := s.segment.(segment.NestedSegment) + if !ok { + return nil + } + return nsb.Ancestors(docID) +} diff --git 
a/index_impl.go b/index_impl.go index 8065d9c1e..ee1a00edf 100644 --- a/index_impl.go +++ b/index_impl.go @@ -572,8 +572,7 @@ func (i *indexImpl) preSearch(ctx context.Context, req *SearchRequest, reader in return nil, err } - fs := make(query.FieldSet) - fs, err := query.ExtractFields(req.Query, i.m, fs) + fs, err := query.ExtractFields(req.Query, i.m, search.NewFieldSet()) if err != nil { return nil, err } @@ -642,7 +641,7 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr // ------------------------------------------------------------------------------------------ // set up additional contexts for any search operation that will proceed from - // here, such as presearch, collectors etc. + // here, such as presearch, knn collector, topn collector etc. // Scoring model callback to be used to get scoring model scoringModelCallback := func() string { @@ -687,6 +686,13 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr } ctx = context.WithValue(ctx, search.GeoBufferPoolCallbackKey, search.GeoBufferPoolCallbackFunc(getBufferPool)) + // check if the index mapping has any nested fields, which should force + // all collectors and searchers to be run in nested mode + if nm, ok := i.m.(mapping.NestedMapping); ok { + if nm.CountNested() > 0 { + ctx = context.WithValue(ctx, search.NestedSearchKey, true) + } + } // ------------------------------------------------------------------------------------------ if _, ok := ctx.Value(search.PreSearchKey).(bool); ok { @@ -716,11 +722,9 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr req.SearchBefore = nil } - var coll *collector.TopNCollector - if req.SearchAfter != nil { - coll = collector.NewTopNCollectorAfter(req.Size, req.Sort, req.SearchAfter) - } else { - coll = collector.NewTopNCollector(req.Size, req.From, req.Sort) + coll, err := i.buildTopNCollector(ctx, req, indexReader) + if err != nil { + return nil, err } var knnHits []*search.DocumentMatch @@ -937,7 +941,7 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr if i.name != "" && hit.Index == "" { hit.Index = i.name } - err, storedFieldsBytes := LoadAndHighlightFields(hit, req, i.name, indexReader, highlighter) + err, storedFieldsBytes := LoadAndHighlightAllFields(hit, req, i.name, indexReader, highlighter) if err != nil { return nil, err } @@ -1105,6 +1109,57 @@ func LoadAndHighlightFields(hit *search.DocumentMatch, req *SearchRequest, return nil, totalStoredFieldsBytes } +// LoadAndHighlightAllFields loads stored fields + highlights for root and its descendants. +// All descendant documents are collected into a _$nested array in the root DocumentMatch. 
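+// For illustration only (result shape assumed from the code below): after a successful call,
+// a root hit carries its descendants as hit.Fields["_$nested"] = []*search.NestedDocumentMatch,
+// where each entry holds just that child's stored field values and highlight fragments.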
+func LoadAndHighlightAllFields( + root *search.DocumentMatch, + req *SearchRequest, + indexName string, + r index.IndexReader, + highlighter highlight.Highlighter, +) (error, uint64) { + var totalStoredFieldsBytes uint64 + // load root fields/highlights + err, bytes := LoadAndHighlightFields(root, req, indexName, r, highlighter) + totalStoredFieldsBytes += bytes + if err != nil { + return err, totalStoredFieldsBytes + } + // collect all descendant documents + nestedDocs := make([]*search.NestedDocumentMatch, 0) + // create a dummy desc DocumentMatch to reuse LoadAndHighlightFields + desc := &search.DocumentMatch{} + err = root.Children.IterateDescendants(func(descID index.IndexInternalID) error { + extID, err := r.ExternalID(descID) + if err != nil { + return err + } + // reset desc for reuse + desc.ID = extID + desc.IndexInternalID = descID + desc.Locations = root.Locations + err, bytes := LoadAndHighlightFields(desc, req, indexName, r, highlighter) + totalStoredFieldsBytes += bytes + if err != nil { + return err + } + // copy fields to nested doc and append + if len(desc.Fields) != 0 || len(desc.Fragments) != 0 { + nestedDocs = append(nestedDocs, search.NewNestedDocumentMatch(desc.Fields, desc.Fragments)) + } + desc.Reset() + return nil + }) + if err != nil { + return err, totalStoredFieldsBytes + } + // add nested documents to root under _$nested key + if len(nestedDocs) > 0 { + root.AddFieldValue("_$nested", nestedDocs) + } + return nil, totalStoredFieldsBytes +} + // Fields returns the name of all the fields this // Index has operated on. func (i *indexImpl) Fields() (fields []string, err error) { @@ -1487,3 +1542,39 @@ func (i *indexImpl) CentroidCardinalities(field string, limit int, descending bo return centroidCardinalities, nil } + +func (i *indexImpl) buildTopNCollector(ctx context.Context, req *SearchRequest, reader index.IndexReader) (*collector.TopNCollector, error) { + newCollector := func() *collector.TopNCollector { + if req.SearchAfter != nil { + return collector.NewTopNCollectorAfter(req.Size, req.Sort, req.SearchAfter) + } + return collector.NewTopNCollector(req.Size, req.From, req.Sort) + } + + newNestedCollector := func(nr index.NestedReader) *collector.TopNCollector { + if req.SearchAfter != nil { + return collector.NewNestedTopNCollectorAfter(req.Size, req.Sort, req.SearchAfter, nr) + } + return collector.NewNestedTopNCollector(req.Size, req.From, req.Sort, nr) + } + + // check if we are in nested mode + if nestedMode, ok := ctx.Value(search.NestedSearchKey).(bool); ok && nestedMode { + // get the nested reader from the index reader + if nr, ok := reader.(index.NestedReader); ok { + // check if the mapping has any nested fields that intersect + if nm, ok := i.m.(mapping.NestedMapping); ok { + var fs search.FieldSet + var err error + fs, err = query.ExtractFields(req.Query, i.m, fs) + if err != nil { + return nil, err + } + if nm.IntersectsPrefix(fs) { + return newNestedCollector(nr), nil + } + } + } + } + return newCollector(), nil +} diff --git a/mapping/document.go b/mapping/document.go index bf93896c9..b595f3dda 100644 --- a/mapping/document.go +++ b/mapping/document.go @@ -22,6 +22,7 @@ import ( "reflect" "time" + "github.com/blevesearch/bleve/v2/document" "github.com/blevesearch/bleve/v2/registry" "github.com/blevesearch/bleve/v2/util" ) @@ -44,6 +45,7 @@ type DocumentMapping struct { Dynamic bool `json:"dynamic"` Properties map[string]*DocumentMapping `json:"properties,omitempty"` Fields []*FieldMapping `json:"fields,omitempty"` + Nested bool 
`json:"nested,omitempty"` DefaultAnalyzer string `json:"default_analyzer,omitempty"` DefaultSynonymSource string `json:"default_synonym_source,omitempty"` @@ -316,6 +318,11 @@ func (dm *DocumentMapping) UnmarshalJSON(data []byte) error { if err != nil { return err } + case "nested": + err := util.UnmarshalJSON(v, &dm.Nested) + if err != nil { + return err + } case "default_analyzer": err := util.UnmarshalJSON(v, &dm.DefaultAnalyzer) if err != nil { @@ -438,10 +445,19 @@ func (dm *DocumentMapping) walkDocument(data interface{}, path []string, indexes } } case reflect.Slice, reflect.Array: + subDocMapping, _ := dm.documentMappingForPathElements(path) + nestedSubObjects := subDocMapping != nil && subDocMapping.Nested for i := 0; i < val.Len(); i++ { if val.Index(i).CanInterface() { fieldVal := val.Index(i).Interface() - dm.processProperty(fieldVal, path, append(indexes, uint64(i)), context) + if nestedSubObjects { + nestedDocument := document.NewDocument(fmt.Sprintf("%s_$%s_$%d", context.doc.ID(), encodePath(path), i)) + nestedContext := context.im.newWalkContext(nestedDocument, dm) + dm.processProperty(fieldVal, path, append(indexes, uint64(i)), nestedContext) + context.doc.AddNestedDocument(nestedDocument) + } else { + dm.processProperty(fieldVal, path, append(indexes, uint64(i)), context) + } } } case reflect.Ptr: diff --git a/mapping/index.go b/mapping/index.go index a40feb470..9aabab6ca 100644 --- a/mapping/index.go +++ b/mapping/index.go @@ -17,12 +17,14 @@ package mapping import ( "encoding/json" "fmt" + "strings" "github.com/blevesearch/bleve/v2/analysis" "github.com/blevesearch/bleve/v2/analysis/analyzer/standard" "github.com/blevesearch/bleve/v2/analysis/datetime/optional" "github.com/blevesearch/bleve/v2/document" "github.com/blevesearch/bleve/v2/registry" + "github.com/blevesearch/bleve/v2/search" "github.com/blevesearch/bleve/v2/util" index "github.com/blevesearch/bleve_index_api" ) @@ -363,7 +365,13 @@ func (im *IndexMappingImpl) MapDocument(doc *document.Document, data interface{} // see if the _all field was disabled allMapping, _ := docMapping.documentMappingForPath("_all") if allMapping == nil || allMapping.Enabled { - field := document.NewCompositeFieldWithIndexingOptions("_all", true, []string{}, walkContext.excludedFromAll, index.IndexField|index.IncludeTermVectors) + excludedFromAll := walkContext.excludedFromAll + nf := doc.NestedFields() + if nf != nil { + // if the document has any nested fields, exclude them from _all + excludedFromAll = append(excludedFromAll, nf.Slice()...) 
+ } + field := document.NewCompositeFieldWithIndexingOptions("_all", true, []string{}, excludedFromAll, index.IndexField|index.IncludeTermVectors) doc.AddField(field) } doc.SetIndexed() @@ -571,3 +579,68 @@ func (im *IndexMappingImpl) SynonymSourceVisitor(visitor analysis.SynonymSourceV } return nil } + +func (im *IndexMappingImpl) buildNestedPrefixes() { + var collectNestedFields func(dm *DocumentMapping, pathComponents []string, currentDepth int) + collectNestedFields = func(dm *DocumentMapping, pathComponents []string, currentDepth int) { + for name, docMapping := range dm.Properties { + newPathComponents := append(pathComponents, name) + if docMapping.Nested { + // This is a nested field boundary + path := strings.Join(newPathComponents, ".") + im.cache.NestedPrefixes.AddPrefix(path, currentDepth+1) + // Continue deeper with incremented depth + collectNestedFields(docMapping, newPathComponents, currentDepth+1) + } else { + // Not nested, continue with same depth + collectNestedFields(docMapping, newPathComponents, currentDepth) + } + } + } + // Start from depth 0 (root) + if im.DefaultMapping != nil && im.DefaultMapping.Enabled { + collectNestedFields(im.DefaultMapping, []string{}, 0) + } + // Now do this for each type mapping + for _, docMapping := range im.TypeMapping { + if docMapping.Enabled { + collectNestedFields(docMapping, []string{}, 0) + } + } +} + +func (im *IndexMappingImpl) CoveringDepth(fs search.FieldSet) int { + if im.cache == nil || im.cache.NestedPrefixes == nil { + return 0 + } + + im.cache.NestedPrefixes.InitOnce(func() { + im.buildNestedPrefixes() + }) + + return im.cache.NestedPrefixes.CoveringDepth(fs) +} + +func (im *IndexMappingImpl) CountNested() int { + if im.cache == nil || im.cache.NestedPrefixes == nil { + return 0 + } + + im.cache.NestedPrefixes.InitOnce(func() { + im.buildNestedPrefixes() + }) + + return im.cache.NestedPrefixes.CountNested() +} + +func (im *IndexMappingImpl) IntersectsPrefix(fs search.FieldSet) bool { + if im.cache == nil || im.cache.NestedPrefixes == nil { + return false + } + + im.cache.NestedPrefixes.InitOnce(func() { + im.buildNestedPrefixes() + }) + + return im.cache.NestedPrefixes.IntersectsPrefix(fs) +} diff --git a/mapping/mapping.go b/mapping/mapping.go index a6c1591b8..0653f7531 100644 --- a/mapping/mapping.go +++ b/mapping/mapping.go @@ -20,6 +20,7 @@ import ( "github.com/blevesearch/bleve/v2/analysis" "github.com/blevesearch/bleve/v2/document" + "github.com/blevesearch/bleve/v2/search" ) // A Classifier is an interface describing any object which knows how to @@ -74,3 +75,18 @@ type SynonymMapping interface { SynonymSourceVisitor(visitor analysis.SynonymSourceVisitor) error } + +// A NestedMapping extends the IndexMapping interface to provide +// additional methods for working with nested object mappings. +type NestedMapping interface { + // CoveringDepth returns the deepest nested + // level common to all field paths + CoveringDepth(fieldPaths search.FieldSet) int + + // IntersectsPrefix returns true if any of the given + // field paths intersect with a known nested prefix + IntersectsPrefix(fieldPaths search.FieldSet) bool + + // CountNested returns the number of nested object mappings + CountNested() int +} diff --git a/registry/nested.go b/registry/nested.go new file mode 100644 index 000000000..54745fe3e --- /dev/null +++ b/registry/nested.go @@ -0,0 +1,108 @@ +// Copyright (c) 2025 Couchbase, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package registry + +import ( + "strings" + "sync" + + "github.com/blevesearch/bleve/v2/search" +) + +type NestedFieldCache struct { + // nested prefix -> nested level + c *ConcurrentCache + + once sync.Once +} + +func NewNestedFieldCache() *NestedFieldCache { + return &NestedFieldCache{ + NewConcurrentCache(), + sync.Once{}, + } +} + +func (nfc *NestedFieldCache) InitOnce(initFunc func()) { + nfc.once.Do(initFunc) +} + +func (nfc *NestedFieldCache) AddPrefix(prefix string, level int) error { + buildFunc := func(name string, config map[string]interface{}, cache *Cache) (interface{}, error) { + return level, nil + } + _, err := nfc.c.DefineItem(prefix, "", nil, nil, buildFunc) + if err == ErrAlreadyDefined { + // Already exists, that's ok + return nil + } + return err +} + +// Returns the deepest nested level that covers all the given field paths +func (nfc *NestedFieldCache) CoveringDepth(fieldPaths search.FieldSet) int { + if len(fieldPaths) == 0 { + return 0 + } + + nfc.c.mutex.RLock() + defer nfc.c.mutex.RUnlock() + + deepestLevel := 0 + + // Check each cached nested prefix + for prefix, item := range nfc.c.data { + level, ok := item.(int) + if !ok { + continue + } + + // Check if this nested prefix belongs to all the given paths + isCommonPrefix := true + for path := range fieldPaths { + if !strings.HasPrefix(path, prefix) { + isCommonPrefix = false + break + } + } + + // If it's a common prefix and deeper than what we've found so far + if isCommonPrefix && level > deepestLevel { + deepestLevel = level + } + } + + return deepestLevel +} + +func (nfc *NestedFieldCache) CountNested() int { + nfc.c.mutex.RLock() + defer nfc.c.mutex.RUnlock() + + return len(nfc.c.data) +} + +func (nfc *NestedFieldCache) IntersectsPrefix(fieldPaths search.FieldSet) bool { + nfc.c.mutex.RLock() + defer nfc.c.mutex.RUnlock() + for prefix := range nfc.c.data { + for path := range fieldPaths { + if strings.HasPrefix(path, prefix) { + return true + } + } + } + return false +} diff --git a/registry/registry.go b/registry/registry.go index 69ee8dd86..36f209d4f 100644 --- a/registry/registry.go +++ b/registry/registry.go @@ -49,6 +49,7 @@ type Cache struct { Fragmenters *FragmenterCache Highlighters *HighlighterCache SynonymSources *SynonymSourceCache + NestedPrefixes *NestedFieldCache } func NewCache() *Cache { @@ -63,6 +64,7 @@ func NewCache() *Cache { Fragmenters: NewFragmenterCache(), Highlighters: NewHighlighterCache(), SynonymSources: NewSynonymSourceCache(), + NestedPrefixes: NewNestedFieldCache(), } } diff --git a/search/collector/eligible.go b/search/collector/eligible.go index 49e044812..7b0ada345 100644 --- a/search/collector/eligible.go +++ b/search/collector/eligible.go @@ -31,12 +31,18 @@ type EligibleCollector struct { total uint64 took time.Duration eligibleSelector index.EligibleDocumentSelector + + nestedStore *collectStoreNested } func NewEligibleCollector(size int) *EligibleCollector { return newEligibleCollector(size) } +func 
NewNestedEligibleCollector(nr index.NestedReader, size int) *EligibleCollector { + return newNestedEligibleCollector(nr, size) +} + func newEligibleCollector(size int) *EligibleCollector { // No sort order & skip always 0 since this is only to filter eligible docs. ec := &EligibleCollector{ @@ -45,6 +51,15 @@ func newEligibleCollector(size int) *EligibleCollector { return ec } +func newNestedEligibleCollector(nr index.NestedReader, size int) *EligibleCollector { + // No sort order & skip always 0 since this is only to filter eligible docs. + ec := &EligibleCollector{ + size: size, + nestedStore: newStoreNested(nr), + } + return ec +} + func makeEligibleDocumentMatchHandler(ctx *search.SearchContext, reader index.IndexReader) (search.DocumentMatchHandler, error) { if ec, ok := ctx.Collector.(*EligibleCollector); ok { if vr, ok := reader.(index.VectorIndexReader); ok { @@ -108,9 +123,18 @@ func (ec *EligibleCollector) Collect(ctx context.Context, searcher search.Search } ec.total++ - err = dmHandler(next) - if err != nil { - break + if ec.nestedStore != nil { + doc, err := ec.nestedStore.AddDocument(next) + if err != nil { + return err + } + // recycle + searchContext.DocumentMatchPool.Put(doc) + } else { + err = dmHandler(next) + if err != nil { + break + } } next, err = searcher.Next(searchContext) @@ -119,6 +143,22 @@ func (ec *EligibleCollector) Collect(ctx context.Context, searcher search.Search return err } + if ec.nestedStore != nil { + var count uint64 + err := ec.nestedStore.VisitRoots(func(doc *search.DocumentMatch) error { + // process the root document + if err := dmHandler(doc); err != nil { + return err + } + count++ + return nil + }) + if err != nil { + return err + } + ec.total = count + } + // help finalize/flush the results in case // of custom document match handlers. 
err = dmHandler(nil) diff --git a/search/collector/knn.go b/search/collector/knn.go index 465bf6927..5ae79509b 100644 --- a/search/collector/knn.go +++ b/search/collector/knn.go @@ -136,6 +136,9 @@ type KNNCollector struct { took time.Duration results search.DocumentMatchCollection maxScore float64 + + nestedStore *collectStoreNested + descendants [][]index.IndexInternalID } func NewKNNCollector(kArray []int64, size int64) *KNNCollector { @@ -145,6 +148,15 @@ func NewKNNCollector(kArray []int64, size int64) *KNNCollector { } } +func NewNestedKNNCollector(nr index.NestedReader, kArray []int64, size int64) *KNNCollector { + return &KNNCollector{ + knnStore: GetNewKNNCollectorStore(kArray), + size: int(size), + + nestedStore: newStoreNested(nr), + } +} + func (hc *KNNCollector) Collect(ctx context.Context, searcher search.Searcher, reader index.IndexReader) error { startTime := time.Now() var err error @@ -191,9 +203,17 @@ func (hc *KNNCollector) Collect(ctx context.Context, searcher search.Searcher, r } hc.total++ - err = dmHandler(next) - if err != nil { - break + if hc.nestedStore != nil { + doc, err := hc.nestedStore.AddDocument(next) + if err != nil { + return err + } + searchContext.DocumentMatchPool.Put(doc) + } else { + err = dmHandler(next) + if err != nil { + break + } } next, err = searcher.Next(searchContext) @@ -202,6 +222,22 @@ func (hc *KNNCollector) Collect(ctx context.Context, searcher search.Searcher, r return err } + if hc.nestedStore != nil { + var count uint64 + err := hc.nestedStore.VisitRoots(func(doc *search.DocumentMatch) error { + // process the root document + if err := dmHandler(doc); err != nil { + return err + } + count++ + return nil + }) + if err != nil { + return err + } + hc.total = count + } + // help finalize/flush the results in case // of custom document match handlers. err = dmHandler(nil) diff --git a/search/collector/nested.go b/search/collector/nested.go new file mode 100644 index 000000000..d680ad06d --- /dev/null +++ b/search/collector/nested.go @@ -0,0 +1,90 @@ +// Copyright (c) 2025 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package collector + +import ( + "fmt" + + "github.com/blevesearch/bleve/v2/search" + index "github.com/blevesearch/bleve_index_api" +) + +type collectStoreNested struct { + nr index.NestedReader + + interim map[uint64]*search.DocumentMatch +} + +func newStoreNested(nr index.NestedReader) *collectStoreNested { + rv := &collectStoreNested{ + interim: make(map[uint64]*search.DocumentMatch), + nr: nr, + } + return rv +} +func (c *collectStoreNested) AddDocument(doc *search.DocumentMatch) (*search.DocumentMatch, error) { + // find ancestors for the doc + ancestors, err := c.nr.Ancestors(doc.IndexInternalID) + if err != nil || len(ancestors) == 0 { + return nil, fmt.Errorf("error getting ancestors for doc %v: %v", doc.IndexInternalID, err) + } + // root docID is the last ancestor + rootID := ancestors[len(ancestors)-1] + rootIDVal, err := rootID.Value() + if err != nil { + return nil, err + } + // lookup existing root + rootDocument, ok := c.interim[rootIDVal] + if !ok { + // no interim root yet + if len(ancestors) == 1 { + // incoming doc is the root itself + c.interim[rootIDVal] = doc + return nil, nil + } + + // create new interim root and merge child into it + rootDocument = &search.DocumentMatch{IndexInternalID: rootID} + if err := rootDocument.MergeWith(doc); err != nil { + return nil, err + } + c.interim[rootIDVal] = rootDocument + + // return the child for recycling + return doc, nil + } + + // merge child into existing root + if err := rootDocument.MergeWith(doc); err != nil { + return nil, err + } + return doc, nil +} + +// NestedDocumentVisitor is the callback invoked for each root document. +// root is the merged root DocumentMatch. +type NestedDocumentVisitor func(root *search.DocumentMatch) error + +// VisitRoots walks over all collected interim values and calls the visitor. +func (c *collectStoreNested) VisitRoots(visitor NestedDocumentVisitor) error { + for _, root := range c.interim { + // invoke the visitor + if err := visitor(root); err != nil { + return err + } + } + return nil +} diff --git a/search/collector/topn.go b/search/collector/topn.go index 739dd8348..ab76ff33b 100644 --- a/search/collector/topn.go +++ b/search/collector/topn.go @@ -79,6 +79,8 @@ type TopNCollector struct { knnHits map[string]*search.DocumentMatch computeNewScoreExpl search.ScoreExplCorrectionCallbackFunc + + nestedStore *collectStoreNested } // CheckDoneEvery controls how frequently we check the context deadline @@ -88,25 +90,51 @@ const CheckDoneEvery = uint64(1024) // skipping over the first 'skip' hits // ordering hits by the provided sort order func NewTopNCollector(size int, skip int, sort search.SortOrder) *TopNCollector { - return newTopNCollector(size, skip, sort) + return newTopNCollector(size, skip, sort, nil) } // NewTopNCollectorAfter builds a collector to find the top 'size' hits // skipping over the first 'skip' hits // ordering hits by the provided sort order +// starting after the provided 'after' sort values func NewTopNCollectorAfter(size int, sort search.SortOrder, after []string) *TopNCollector { - rv := newTopNCollector(size, 0, sort) + rv := newTopNCollector(size, 0, sort, nil) rv.searchAfter = createSearchAfterDocument(sort, after) return rv } -func newTopNCollector(size int, skip int, sort search.SortOrder) *TopNCollector { +// NewNestedTopNCollector builds a collector to find the top 'size' hits +// skipping over the first 'skip' hits +// ordering hits by the provided sort order +// while ensuring the nested documents are handled correctly +// (i.e. 
parent document is returned instead of nested document) +func NewNestedTopNCollector(size int, skip int, sort search.SortOrder, nr index.NestedReader) *TopNCollector { + return newTopNCollector(size, skip, sort, nr) +} + +// NewNestedTopNCollectorAfter builds a collector to find the top 'size' hits +// skipping over the first 'skip' hits +// ordering hits by the provided sort order +// starting after the provided 'after' sort values +// while ensuring the nested documents are handled correctly +// (i.e. parent document is returned instead of nested document) +func NewNestedTopNCollectorAfter(size int, sort search.SortOrder, after []string, nr index.NestedReader) *TopNCollector { + rv := newTopNCollector(size, 0, sort, nr) + rv.searchAfter = createSearchAfterDocument(sort, after) + return rv +} + +func newTopNCollector(size int, skip int, sort search.SortOrder, nr index.NestedReader) *TopNCollector { hc := &TopNCollector{size: size, skip: skip, sort: sort} hc.store = getOptimalCollectorStore(size, skip, func(i, j *search.DocumentMatch) int { return hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, i, j) }) + if nr != nil { + hc.nestedStore = newStoreNested(nr) + } + // these lookups traverse an interface, so do once up-front if sort.RequiresDocID() { hc.needDocIds = true @@ -293,30 +321,63 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, } } - err = hc.adjustDocumentMatch(searchContext, reader, next) - if err != nil { - break - } + if hc.nestedStore != nil { + hc.total++ + doc, err := hc.nestedStore.AddDocument(next) + if err != nil { + return err + } + // recycle + searchContext.DocumentMatchPool.Put(doc) + } else { + err = hc.adjustDocumentMatch(searchContext, reader, next) + if err != nil { + break + } + // no descendants at this point + err = hc.prepareDocumentMatch(searchContext, reader, next, false) + if err != nil { + break + } - err = hc.prepareDocumentMatch(searchContext, reader, next, false) - if err != nil { - break - } + err = dmHandler(next) + if err != nil { + break + } - err = dmHandler(next) - if err != nil { - break } - next, err = searcher.Next(searchContext) } if err != nil { return err } + + if hc.nestedStore != nil { + var count uint64 + err := hc.nestedStore.VisitRoots(func(doc *search.DocumentMatch) error { + if err := hc.adjustDocumentMatch(searchContext, reader, doc); err != nil { + return err + } + if err := hc.prepareDocumentMatch(searchContext, reader, doc, false); err != nil { + return err + } + if err := dmHandler(doc); err != nil { + return err + } + count++ + return nil + }) + if err != nil { + return err + } + hc.total = count + } + if hc.knnHits != nil { // we may have some knn hits left that did not match any of the top N tf-idf hits // we need to add them to the collector store to consider them as well. 
for _, knnDoc := range hc.knnHits { + // no descendants for knn docs err = hc.prepareDocumentMatch(searchContext, reader, knnDoc, true) if err != nil { return err @@ -501,7 +562,17 @@ func (hc *TopNCollector) visitFieldTerms(reader index.IndexReader, d *search.Doc } } - err := hc.dvReader.VisitDocValues(d.IndexInternalID, v) + // if this is a nested document, we need to visit the doc values + // for all its ancestors as well + // so that facets/sorts can be computed correctly + err := d.Children.IterateDescendants(func(descendant index.IndexInternalID) error { + return hc.dvReader.VisitDocValues(descendant, v) + }) + if err != nil { + return err + } + // now visit the doc values for this document + err = hc.dvReader.VisitDocValues(d.IndexInternalID, v) if hc.facetsBuilder != nil { hc.facetsBuilder.EndDoc() } diff --git a/search/explanation.go b/search/explanation.go index 924050016..38273fecb 100644 --- a/search/explanation.go +++ b/search/explanation.go @@ -54,3 +54,47 @@ func (expl *Explanation) Size() int { return sizeInBytes } + +const MergedExplMessage = "sum of merged explanations:" + +// MergeExpl merges two explanations into one. +// If either explanation is nil, the other is returned. +// If the first explanation is already a merged explanation, +// the second explanation is appended to its children. +// Otherwise, a new merged explanation is created +// with the two explanations as its children. +func MergeExpl(first, second *Explanation) *Explanation { + if first == nil { + return second + } + if second == nil { + return first + } + if first.Message == MergedExplMessage { + // reuse first explanation as the merged one + first.Value += second.Value + first.Children = append(first.Children, second) + return first + } + // create a new explanation to hold the merged one + rv := &Explanation{ + Value: first.Value + second.Value, + Message: MergedExplMessage, + Children: []*Explanation{first, second}, + } + return rv +} + +func MergeScoreBreakdown(first, second map[int]float64) map[int]float64 { + if first == nil { + return second + } + if second == nil { + return first + } + // reuse first to store the union of both + for k, v := range second { + first[k] += v + } + return first +} diff --git a/search/highlight/highlighter/simple/highlighter_simple.go b/search/highlight/highlighter/simple/highlighter_simple.go index e898a1e61..d0adfa81f 100644 --- a/search/highlight/highlighter/simple/highlighter_simple.go +++ b/search/highlight/highlighter/simple/highlighter_simple.go @@ -146,12 +146,8 @@ func (s *Highlighter) BestFragmentsInField(dm *search.DocumentMatch, doc index.D formattedFragments[i] += s.sep } } - - if dm.Fragments == nil { - dm.Fragments = make(search.FieldFragmentMap, 0) - } if len(formattedFragments) > 0 { - dm.Fragments[field] = formattedFragments + dm.AddFragments(field, formattedFragments) } return formattedFragments diff --git a/search/query/conjunction.go b/search/query/conjunction.go index a2043720a..25fda0400 100644 --- a/search/query/conjunction.go +++ b/search/query/conjunction.go @@ -54,14 +54,39 @@ func (q *ConjunctionQuery) AddQuery(aq ...Query) { func (q *ConjunctionQuery) Searcher(ctx context.Context, i index.IndexReader, m mapping.IndexMapping, options search.SearcherOptions) (search.Searcher, error) { ss := make([]search.Searcher, 0, len(q.Conjuncts)) + cleanup := func() { + for _, searcher := range ss { + if searcher != nil { + _ = searcher.Close() + } + } + } + nestedMode, _ := ctx.Value(search.NestedSearchKey).(bool) + var nm mapping.NestedMapping + if 
nestedMode { + var ok bool + // get the nested mapping + if nm, ok = m.(mapping.NestedMapping); !ok { + // shouldn't be in nested mode if no nested mapping + nestedMode = false + } + } + // set of fields used in this query + var qfs search.FieldSet + var err error + for _, conjunct := range q.Conjuncts { + // Gather fields when nested mode is enabled + if nestedMode { + qfs, err = ExtractFields(conjunct, m, qfs) + if err != nil { + cleanup() + return nil, err + } + } sr, err := conjunct.Searcher(ctx, i, m, options) if err != nil { - for _, searcher := range ss { - if searcher != nil { - _ = searcher.Close() - } - } + cleanup() return nil, err } if _, ok := sr.(*searcher.MatchNoneSearcher); ok && q.queryStringMode { @@ -75,6 +100,10 @@ func (q *ConjunctionQuery) Searcher(ctx context.Context, i index.IndexReader, m return searcher.NewMatchNoneSearcher(i) } + if nestedMode { + return searcher.NewNestedConjunctionSearcher(ctx, i, ss, nm.CoveringDepth(qfs), options) + } + return searcher.NewConjunctionSearcher(ctx, i, ss, options) } diff --git a/search/query/query.go b/search/query/query.go index 27c3978b1..06e924882 100644 --- a/search/query/query.go +++ b/search/query/query.go @@ -455,13 +455,10 @@ func DumpQuery(m mapping.IndexMapping, query Query) (string, error) { return string(data), err } -// FieldSet represents a set of queried fields. -type FieldSet map[string]struct{} - // ExtractFields returns a set of fields referenced by the query. // The returned set may be nil if the query does not explicitly reference any field // and the DefaultSearchField is unset in the index mapping. -func ExtractFields(q Query, m mapping.IndexMapping, fs FieldSet) (FieldSet, error) { +func ExtractFields(q Query, m mapping.IndexMapping, fs search.FieldSet) (search.FieldSet, error) { if q == nil || m == nil { return fs, nil } @@ -474,9 +471,9 @@ func ExtractFields(q Query, m mapping.IndexMapping, fs FieldSet) (FieldSet, erro } if f != "" { if fs == nil { - fs = make(FieldSet) + fs = search.NewFieldSet() } - fs[f] = struct{}{} + fs.AddField(f) } case *QueryStringQuery: var expandedQuery Query diff --git a/search/scorer/scorer_conjunction_nested.go b/search/scorer/scorer_conjunction_nested.go new file mode 100644 index 000000000..a2b366fba --- /dev/null +++ b/search/scorer/scorer_conjunction_nested.go @@ -0,0 +1,83 @@ +// Copyright (c) 2025 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package scorer + +import ( + "reflect" + "slices" + + "github.com/blevesearch/bleve/v2/search" + "github.com/blevesearch/bleve/v2/size" + index "github.com/blevesearch/bleve_index_api" +) + +var reflectStaticSizeNestedConjunctionQueryScorer int + +func init() { + var ncqs NestedConjunctionQueryScorer + reflectStaticSizeNestedConjunctionQueryScorer = int(reflect.TypeOf(ncqs).Size()) +} + +type NestedConjunctionQueryScorer struct { + options search.SearcherOptions +} + +func (s *NestedConjunctionQueryScorer) Size() int { + return reflectStaticSizeNestedConjunctionQueryScorer + size.SizeOfPtr +} + +func NewNestedConjunctionQueryScorer(options search.SearcherOptions) *NestedConjunctionQueryScorer { + return &NestedConjunctionQueryScorer{ + options: options, + } +} + +func (s *NestedConjunctionQueryScorer) Score(ctx *search.SearchContext, constituents []*search.DocumentMatch, + ancestry [][]index.IndexInternalID, joinIdx int) (*search.DocumentMatch, error) { + // Find the constituent with the shortest effective depth. + lcaIdx := 0 + lcaDepth := computeDepth(ancestry[0], joinIdx) + + for i := 1; i < len(ancestry); i++ { + d := computeDepth(ancestry[i], joinIdx) + if d < lcaDepth { + lcaDepth = d + lcaIdx = i + } + } + + // Clone the LCA document ID and start a fresh DocumentMatch. + lcaDocID := constituents[lcaIdx].IndexInternalID + result := &search.DocumentMatch{ + IndexInternalID: slices.Clone(lcaDocID), + } + + // Merge all constituents into the new match. + for _, dm := range constituents { + if err := result.MergeWith(dm); err != nil { + return nil, err + } + } + + return result, nil +} + +// computeDepth returns the depth considered for LCA selection. +func computeDepth(anc []index.IndexInternalID, joinIdx int) int { + if len(anc) <= joinIdx { + return len(anc) + } + return joinIdx + 1 +} diff --git a/search/search.go b/search/search.go index 46e32fed9..8df5b43a5 100644 --- a/search/search.go +++ b/search/search.go @@ -165,9 +165,9 @@ type DocumentMatch struct { // used to indicate the sub-scores that combined to form the // final score for this document match. This is only populated - // when the search request's query is a DisjunctionQuery - // or a ConjunctionQuery. The map key is the index of the sub-query - // in the DisjunctionQuery or ConjunctionQuery. The map value is the + // when the search request's query is a DisjunctionQuery. + // The map key is the index of the sub-query + // in the DisjunctionQuery. The map value is the // sub-score for that sub-query. ScoreBreakdown map[int]float64 `json:"score_breakdown,omitempty"` @@ -178,6 +178,10 @@ type DocumentMatch struct { // of the index that this match came from // of the current alias view, used in alias of aliases scenario IndexNames []string `json:"index_names,omitempty"` + + // Children holds any descendant/child matches that contributed + // to this root (or intermediate LCA) DocumentMatch. 
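+	// Keys in the store are the descendants' internal doc numbers and values their
+	// IndexInternalIDs (see DescendantStore); the field is omitted from JSON output.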
+ Children DescendantStore `json:"-"` } func (dm *DocumentMatch) AddFieldValue(name string, value interface{}) { @@ -201,6 +205,21 @@ func (dm *DocumentMatch) AddFieldValue(name string, value interface{}) { dm.Fields[name] = valSlice } +func (dm *DocumentMatch) AddFragments(field string, fragments []string) { + if dm.Fragments == nil { + dm.Fragments = make(FieldFragmentMap) + } +OUTER: + for _, newFrag := range fragments { + for _, existingFrag := range dm.Fragments[field] { + if existingFrag == newFrag { + continue OUTER // no duplicates allowed + } + } + dm.Fragments[field] = append(dm.Fragments[field], newFrag) + } +} + // Reset allows an already allocated DocumentMatch to be reused func (dm *DocumentMatch) Reset() *DocumentMatch { // remember the []byte used for the IndexInternalID @@ -267,6 +286,10 @@ func (dm *DocumentMatch) Size() int { size.SizeOfPtr } + if dm.Children != nil { + sizeInBytes += dm.Children.Size() + } + return sizeInBytes } @@ -347,6 +370,31 @@ func (dm *DocumentMatch) String() string { return fmt.Sprintf("[%s-%f]", dm.ID, dm.Score) } +func (dm *DocumentMatch) MergeWith(other *DocumentMatch) error { + // merge score + dm.Score += other.Score + // merge explanations + dm.Expl = MergeExpl(dm.Expl, other.Expl) + // merge field term locations + dm.FieldTermLocations = MergeFieldTermLocations(dm.FieldTermLocations, []*DocumentMatch{other}) + // merge score breakdown + dm.ScoreBreakdown = MergeScoreBreakdown(dm.ScoreBreakdown, other.ScoreBreakdown) + // merge Descendants/Children + // if the base and other have the same ID, then we are merging the same + // document match (from different clauses), so we need to merge their children/descendants + if !dm.IndexInternalID.Equals(other.IndexInternalID) { + if dm.Children == nil { + dm.Children = make(DescendantStore) + } + err := dm.Children.AddDescendant(other.IndexInternalID) + if err != nil { + return err + } + } + dm.Children = MergeDescendants(dm.Children, other.Children) + return nil +} + type DocumentMatchCollection []*DocumentMatch func (c DocumentMatchCollection) Len() int { return len(c) } @@ -393,3 +441,62 @@ func (sc *SearchContext) Size() int { return sizeInBytes } + +type DescendantStore map[uint64]index.IndexInternalID + +func MergeDescendants(first, second DescendantStore) DescendantStore { + if first == nil { + return second + } + if second == nil { + return first + } + for k, v := range second { + first[k] = v + } + return first +} + +func (ds DescendantStore) AddDescendant(descendant index.IndexInternalID) error { + key, err := descendant.Value() + if err != nil { + return err + } + // use clone to keep the store stateless + ds[key] = slices.Clone(descendant) + return nil +} + +func (ds DescendantStore) IterateDescendants(fn func(descendant index.IndexInternalID) error) error { + for _, descendant := range ds { + if err := fn(descendant); err != nil { + return err + } + } + return nil +} + +func (ds DescendantStore) Size() int { + sizeInBytes := size.SizeOfMap + for _, entry := range ds { + sizeInBytes += size.SizeOfPtr + len(entry) + } + return sizeInBytes +} + +// A NestedDocumentMatch is like a DocumentMatch but used for nested documents +// and does not have score or locations, or a score and is mainly used to +// hold field values and fragments, to be embedded in the parent DocumentMatch +type NestedDocumentMatch struct { + Fields map[string]interface{} `json:"fields,omitempty"` + Fragments FieldFragmentMap `json:"fragments,omitempty"` +} + +// NewNestedDocumentMatch creates a new NestedDocumentMatch 
instance +// with the given fields and fragments +func NewNestedDocumentMatch(fields map[string]interface{}, fragments FieldFragmentMap) *NestedDocumentMatch { + return &NestedDocumentMatch{ + Fields: fields, + Fragments: fragments, + } +} diff --git a/search/searcher/search_conjunction_nested.go b/search/searcher/search_conjunction_nested.go new file mode 100644 index 000000000..d520ef844 --- /dev/null +++ b/search/searcher/search_conjunction_nested.go @@ -0,0 +1,364 @@ +// Copyright (c) 2025 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package searcher + +import ( + "context" + "fmt" + "math" + "reflect" + + "github.com/blevesearch/bleve/v2/search" + "github.com/blevesearch/bleve/v2/search/scorer" + "github.com/blevesearch/bleve/v2/size" + index "github.com/blevesearch/bleve_index_api" +) + +var reflectStaticSizeNestedConjunctionSearcher int + +func init() { + var ncs NestedConjunctionSearcher + reflectStaticSizeNestedConjunctionSearcher = int(reflect.TypeOf(ncs).Size()) +} + +type NestedConjunctionSearcher struct { + nestedReader index.NestedReader + searchers []search.Searcher + queryNorm float64 + currs []*search.DocumentMatch + currAncestors [][]index.IndexInternalID + pivotIDx int + scorer *scorer.NestedConjunctionQueryScorer + initialized bool + joinIdx int + options search.SearcherOptions +} + +func NewNestedConjunctionSearcher(ctx context.Context, indexReader index.IndexReader, + searchers []search.Searcher, joinIdx int, options search.SearcherOptions) (search.Searcher, error) { + + var nr index.NestedReader + var ok bool + if nr, ok = indexReader.(index.NestedReader); !ok { + return nil, fmt.Errorf("indexReader does not support nested documents") + } + + // build our searcher + rv := NestedConjunctionSearcher{ + nestedReader: nr, + options: options, + searchers: searchers, + currs: make([]*search.DocumentMatch, len(searchers)), + currAncestors: make([][]index.IndexInternalID, len(searchers)), + scorer: scorer.NewNestedConjunctionQueryScorer(options), + joinIdx: joinIdx, + } + rv.computeQueryNorm() + + return &rv, nil +} + +// getTargetAncestor returns the appropriate ancestor ID for the given joinIdx +// if the ancestry chain is shallower than joinIdx, it returns the deepest ancestor +// otherwise it returns the ancestor at joinIdx level from the top-most ancestor +func getTargetAncestor(ancestors []index.IndexInternalID, joinIdx int) index.IndexInternalID { + if len(ancestors) > joinIdx { + return ancestors[len(ancestors)-joinIdx-1] + } + return ancestors[len(ancestors)-1] +} + +func (s *NestedConjunctionSearcher) initSearchers(ctx *search.SearchContext) (bool, error) { + var err error + // get all searchers pointing at their first match + for i, searcher := range s.searchers { + if s.currs[i] != nil { + ctx.DocumentMatchPool.Put(s.currs[i]) + } + s.currs[i], err = searcher.Next(ctx) + if err != nil { + return false, err + } + if s.currs[i] == nil { + // one of the searchers is exhausted, so we are done + return true, nil + } + // get the ancestry 
chain for this match + s.currAncestors[i], err = s.nestedReader.Ancestors(s.currs[i].IndexInternalID) + if err != nil { + return false, err + } + } + // scan the ancestry chains for all searchers to get the pivotIDx + // the pivot will be the searcher with the longest ancestry chain + // if there are multiple with the same length, pick the one with + // the highest docID + s.pivotIDx = 0 + pivotLength := len(s.currAncestors[0]) + for i := 1; i < len(s.searchers); i++ { + if len(s.currAncestors[i]) > pivotLength { + s.pivotIDx = i + pivotLength = len(s.currAncestors[i]) + } else if len(s.currAncestors[i]) == pivotLength { + // if same length, pick the one with the highest docID + if s.currs[i].IndexInternalID.Compare(s.currs[s.pivotIDx].IndexInternalID) > 0 { + s.pivotIDx = i + } + } + } + s.initialized = true + return false, nil +} + +func (s *NestedConjunctionSearcher) computeQueryNorm() { + // first calculate sum of squared weights + sumOfSquaredWeights := 0.0 + for _, searcher := range s.searchers { + sumOfSquaredWeights += searcher.Weight() + } + // now compute query norm from this + s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights) + // finally tell all the downstream searchers the norm + for _, searcher := range s.searchers { + searcher.SetQueryNorm(s.queryNorm) + } +} + +func (s *NestedConjunctionSearcher) Size() int { + sizeInBytes := reflectStaticSizeNestedConjunctionSearcher + size.SizeOfPtr + + s.scorer.Size() + + for _, entry := range s.searchers { + sizeInBytes += entry.Size() + } + + for _, entry := range s.currs { + if entry != nil { + sizeInBytes += entry.Size() + } + } + + return sizeInBytes +} + +func (s *NestedConjunctionSearcher) Weight() float64 { + var rv float64 + for _, searcher := range s.searchers { + rv += searcher.Weight() + } + return rv +} + +func (s *NestedConjunctionSearcher) SetQueryNorm(qnorm float64) { + for _, searcher := range s.searchers { + searcher.SetQueryNorm(qnorm) + } +} + +func (s *NestedConjunctionSearcher) Count() uint64 { + // for now return a worst case + var sum uint64 + for _, searcher := range s.searchers { + sum += searcher.Count() + } + return sum +} + +func (s *NestedConjunctionSearcher) Close() (rv error) { + for _, searcher := range s.searchers { + err := searcher.Close() + if err != nil && rv == nil { + rv = err + } + } + return rv +} + +func (s *NestedConjunctionSearcher) Min() int { + return 0 +} + +func (s *NestedConjunctionSearcher) DocumentMatchPoolSize() int { + rv := len(s.currs) + for _, s := range s.searchers { + rv += s.DocumentMatchPoolSize() + } + return rv +} + +func (s *NestedConjunctionSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch, error) { + if !s.initialized { + exhausted, err := s.initSearchers(ctx) + if err != nil { + return nil, err + } + if exhausted { + return nil, nil + } + } + // we have the pivot searcher, now try to align all the others to it, using the racecar algorithm, + // basically - the idea is simple - we first check if the pivot searcher's indexInternalID + // is behind any of the other searchers, and if so, we are sure that the pivot searcher + // cannot be part of a match, so we advance it to the maximum of the other searchers. + // Now once the pivot searcher is ahead of all the other searchers, we advance all the other + // searchers to the corresponding ancestor of the pivot searcher, if all of them align on the correct + // ancestor, we have a match, otherwise we repeat the process. 
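	// A rough illustration (the document numbers below are hypothetical, not
	// taken from any real index): suppose there are two child searchers, one
	// over employee fields (currently the pivot) and one over project fields,
	// and joinIdx selects the department level. If both of their current
	// matches resolve to department 12 as the target ancestor, they are
	// aligned and a match rooted at that department can be emitted. If they
	// resolve to departments 12 and 17 instead, the pivot is advanced towards
	// 17 and the alignment pass repeats, until every searcher agrees on a
	// single ancestor or one of them is exhausted.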
+ for { + pivotSearcher := s.searchers[s.pivotIDx] + pivotDM := s.currs[s.pivotIDx] + if pivotDM == nil { + // one of the searchers is exhausted, so we are done + return nil, nil + } + pivotAncestors := s.currAncestors[s.pivotIDx] + pivotID := pivotDM.IndexInternalID + // first, make sure the pivot is ahead of all the other searchers + // we do this by getting the max of all the other searchers' IDs + // at their respective target ancestors + // and if the pivot is behind that, we advance it to that + maxID := getTargetAncestor(pivotAncestors, s.joinIdx) + for i := 0; i < len(s.searchers); i++ { + if i == s.pivotIDx { + // skip the pivot itself + continue + } + curr := s.currs[i] + if curr == nil { + // one of the searchers is exhausted, so we are done + return nil, nil + } + targetAncestor := getTargetAncestor(s.currAncestors[i], s.joinIdx) + // now compare curr's target ancestor with maxID + if targetAncestor.Compare(maxID) > 0 { + maxID = targetAncestor + } + } + if maxID.Compare(pivotID) > 0 { + var err error + // pivot is behind, so advance it + ctx.DocumentMatchPool.Put(pivotDM) + s.currs[s.pivotIDx], err = pivotSearcher.Advance(ctx, maxID) + if err != nil { + return nil, err + } + if s.currs[s.pivotIDx] == nil { + // one of the searchers is exhausted, so we are done + return nil, nil + } + // recalc ancestors + s.currAncestors[s.pivotIDx], err = s.nestedReader.Ancestors(s.currs[s.pivotIDx].IndexInternalID) + if err != nil { + return nil, err + } + // now restart the whole process + continue + } + // at this point, we know the pivot is ahead of all the other searchers + // now try to align all the other searchers to the pivot's ancestry + // we do this by advancing each searcher to the corresponding ancestor + // of the pivot, with searchers with insufficient depth being advanced + // to the corresponding document ID in the pivot's ancestry and + // and the searchers with sufficient depth being advanced to the + // ancestor at joinIdx level once that is done we check if all the + // searchers are aligned if they are, we have a match, otherwise we have a + // scenario where one or more searchers have advanced beyond the pivot, so + // we need to restart the whole process where we have to find the new maxID + // and advance the pivot as done above + allAligned := true + for i := 0; i < len(s.searchers); i++ { + if i == s.pivotIDx { + // skip the pivot itself + continue + } + curr := s.currs[i] + if curr == nil { + // one of the searchers is exhausted, so we are done + return nil, nil + } + // try to align curr to the pivot's ancestry by advancing the + // searcher to the corresponding ancestor of the pivot + var targetAncestor index.IndexInternalID + if len(s.currAncestors[i]) > s.joinIdx { + // this searcher has sufficient depth, so use the pivot's ancestor at joinIdx + targetAncestor = pivotAncestors[len(pivotAncestors)-s.joinIdx-1] + } else { + // this searcher does not have sufficient depth, so use the pivot's + // ancestor at the searcher's max depth + targetAncestor = pivotAncestors[len(s.currAncestors[i])-1] + } + if curr.IndexInternalID.Compare(targetAncestor) < 0 { + var err error + ctx.DocumentMatchPool.Put(curr) + s.currs[i], err = s.searchers[i].Advance(ctx, targetAncestor) + if err != nil { + return nil, err + } + if s.currs[i] == nil { + // one of the searchers is exhausted, so we are done + return nil, nil + } + // recalc ancestors + s.currAncestors[i], err = s.nestedReader.Ancestors(s.currs[i].IndexInternalID) + if err != nil { + return nil, err + } + } + // now check 
if we are aligned + currID := getTargetAncestor(s.currAncestors[i], s.joinIdx) + if currID.Compare(targetAncestor) != 0 { + allAligned = false + } + } + if allAligned { + // we have a match, so we can build the resulting DocumentMatch + // we do this by delegating to the scorer, which will pick the lowest + // common ancestor (LCA) and merge all the constituents into it + dm, err := s.scorer.Score(ctx, s.currs, s.currAncestors, s.joinIdx) + if err != nil { + return nil, err + } + // now advance the pivot searcher to get ready for the next call + ctx.DocumentMatchPool.Put(pivotDM) + s.currs[s.pivotIDx], err = pivotSearcher.Next(ctx) + if err != nil { + return nil, err + } + if s.currs[s.pivotIDx] != nil { + s.currAncestors[s.pivotIDx], err = s.nestedReader.Ancestors(s.currs[s.pivotIDx].IndexInternalID) + if err != nil { + return nil, err + } + } + // return the match we have + return dm, nil + } + } +} + +func (s *NestedConjunctionSearcher) Advance(ctx *search.SearchContext, ID index.IndexInternalID) (*search.DocumentMatch, error) { + for { + next, err := s.Next(ctx) + if err != nil { + return nil, err + } + if next == nil { + return nil, nil + } + if next.IndexInternalID.Compare(ID) >= 0 { + return next, nil + } + ctx.DocumentMatchPool.Put(next) + } +} diff --git a/search/searcher/search_disjunction_heap.go b/search/searcher/search_disjunction_heap.go index 3da876bd3..4c68e5691 100644 --- a/search/searcher/search_disjunction_heap.go +++ b/search/searcher/search_disjunction_heap.go @@ -15,7 +15,6 @@ package searcher import ( - "bytes" "container/heap" "context" "math" @@ -169,7 +168,7 @@ func (s *DisjunctionHeapSearcher) updateMatches() error { matchingIdxs = append(matchingIdxs, next.matchingIdx) // now as long as top of heap matches, keep popping - for len(s.heap) > 0 && bytes.Compare(next.curr.IndexInternalID, s.heap[0].curr.IndexInternalID) == 0 { + for len(s.heap) > 0 && next.curr.IndexInternalID.Equals(s.heap[0].curr.IndexInternalID) { next = heap.Pop(s).(*SearcherCurr) matching = append(matching, next.curr) matchingCurrs = append(matchingCurrs, next) @@ -264,7 +263,7 @@ func (s *DisjunctionHeapSearcher) Advance(ctx *search.SearchContext, // find all searchers that actually need to be advanced // advance them, using s.matchingCurrs as temp storage - for len(s.heap) > 0 && bytes.Compare(s.heap[0].curr.IndexInternalID, ID) < 0 { + for len(s.heap) > 0 && s.heap[0].curr.IndexInternalID.Compare(ID) < 0 { searcherCurr := heap.Pop(s).(*SearcherCurr) ctx.DocumentMatchPool.Put(searcherCurr.curr) curr, err := searcherCurr.searcher.Advance(ctx, ID) @@ -347,7 +346,7 @@ func (s *DisjunctionHeapSearcher) Less(i, j int) bool { } else if s.heap[j].curr == nil { return false } - return bytes.Compare(s.heap[i].curr.IndexInternalID, s.heap[j].curr.IndexInternalID) < 0 + return s.heap[i].curr.IndexInternalID.Compare(s.heap[j].curr.IndexInternalID) < 0 } func (s *DisjunctionHeapSearcher) Swap(i, j int) { diff --git a/search/searcher/search_match_all.go b/search/searcher/search_match_all.go index 57d8d0727..3ce7a69d9 100644 --- a/search/searcher/search_match_all.go +++ b/search/searcher/search_match_all.go @@ -36,6 +36,7 @@ type MatchAllSearcher struct { reader index.DocIDReader scorer *scorer.ConstantScorer count uint64 + nested bool } func NewMatchAllSearcher(ctx context.Context, indexReader index.IndexReader, boost float64, options search.SearcherOptions) (*MatchAllSearcher, error) { @@ -50,11 +51,15 @@ func NewMatchAllSearcher(ctx context.Context, indexReader index.IndexReader, boo } scorer := 
scorer.NewConstantScorer(1.0, boost, options) + // check if we are in nested mode + nested, _ := ctx.Value(search.NestedSearchKey).(bool) + return &MatchAllSearcher{ indexReader: indexReader, reader: reader, scorer: scorer, count: count, + nested: nested, }, nil } @@ -76,6 +81,22 @@ func (s *MatchAllSearcher) SetQueryNorm(qnorm float64) { s.scorer.SetQueryNorm(qnorm) } +func (s *MatchAllSearcher) isNested(id index.IndexInternalID) bool { + // if not running in nested mode, always return false + if !s.nested { + return false + } + // check if this doc has ancestors, if so it is nested + if nr, ok := s.reader.(index.NestedReader); ok { + anc, err := nr.Ancestors(id) + if err != nil { + return false + } + return len(anc) > 1 + } + return false +} + func (s *MatchAllSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch, error) { id, err := s.reader.Next() if err != nil { @@ -86,6 +107,11 @@ func (s *MatchAllSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatc return nil, nil } + if s.isNested(id) { + // if nested then skip and get next + return s.Next(ctx) + } + // score match docMatch := s.scorer.Score(ctx, id) // return doc match @@ -103,6 +129,11 @@ func (s *MatchAllSearcher) Advance(ctx *search.SearchContext, ID index.IndexInte return nil, nil } + if s.isNested(id) { + // if nested then return next + return s.Next(ctx) + } + // score match docMatch := s.scorer.Score(ctx, id) diff --git a/search/searcher/search_numeric_range.go b/search/searcher/search_numeric_range.go index f086051c1..cd8f00719 100644 --- a/search/searcher/search_numeric_range.go +++ b/search/searcher/search_numeric_range.go @@ -132,7 +132,7 @@ func filterCandidateTerms(indexReader index.IndexReader, for err == nil && tfd != nil { termBytes := []byte(tfd.Term) i := sort.Search(len(terms), func(i int) bool { return bytes.Compare(terms[i], termBytes) >= 0 }) - if i < len(terms) && bytes.Compare(terms[i], termBytes) == 0 { + if i < len(terms) && bytes.Equal(terms[i], termBytes) { rv = append(rv, terms[i]) } terms = terms[i:] diff --git a/search/util.go b/search/util.go index 005fda67d..1af7bce56 100644 --- a/search/util.go +++ b/search/util.go @@ -156,6 +156,10 @@ const ( // ScoreFusionKey is used to communicate whether KNN hits need to be preserved for // hybrid search algorithms (like RRF) ScoreFusionKey ContextKey = "_fusion_rescoring_key" + + // NestedSearchKey is used to communicate whether the search is performed + // in an index with nested documents + NestedSearchKey ContextKey = "_nested_search_key" ) func RecordSearchCost(ctx context.Context, @@ -237,3 +241,25 @@ type BM25Stats struct { DocCount float64 `json:"doc_count"` FieldCardinality map[string]int `json:"field_cardinality"` } + +// FieldSet represents a set of queried fields. +type FieldSet map[string]struct{} + +// NewFieldSet creates a new FieldSet. +func NewFieldSet() FieldSet { + return make(map[string]struct{}) +} + +// Add adds a field to the set. +func (fs FieldSet) AddField(field string) { + fs[field] = struct{}{} +} + +// Slice returns the fields in this set as a slice of strings. 
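// A minimal usage sketch (illustrative only, mirroring how the nested code
// paths collect queried field names):
//
//	fs := NewFieldSet()
//	fs.AddField("company.departments.employees.name")
//	fs.AddField("company.departments.employees.role")
//	fields := fs.Slice() // order is unspecified, since FieldSet is a map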
+func (fs FieldSet) Slice() []string { + rv := make([]string, 0, len(fs)) + for field := range fs { + rv = append(rv, field) + } + return rv +} diff --git a/search_knn.go b/search_knn.go index 73be6f5d5..48dd7f4e9 100644 --- a/search_knn.go +++ b/search_knn.go @@ -24,6 +24,7 @@ import ( "sort" "github.com/blevesearch/bleve/v2/document" + "github.com/blevesearch/bleve/v2/mapping" "github.com/blevesearch/bleve/v2/search" "github.com/blevesearch/bleve/v2/search/collector" "github.com/blevesearch/bleve/v2/search/query" @@ -372,7 +373,7 @@ func addSortAndFieldsToKNNHits(req *SearchRequest, knnHits []*search.DocumentMat } } req.Sort.Value(hit) - err, _ = LoadAndHighlightFields(hit, req, "", reader, nil) + err, _ = LoadAndHighlightAllFields(hit, req, "", reader, nil) if err != nil { return err } @@ -410,7 +411,10 @@ func (i *indexImpl) runKnnCollector(ctx context.Context, req *SearchRequest, rea if err != nil { return nil, err } - filterColl := collector.NewEligibleCollector(int(indexDocCount)) + filterColl, err := i.buildEligibleCollector(ctx, filterQ, reader, int(indexDocCount)) + if err != nil { + return nil, err + } err = filterColl.Collect(ctx, filterSearcher, reader) if err != nil { return nil, err @@ -429,7 +433,10 @@ func (i *indexImpl) runKnnCollector(ctx context.Context, req *SearchRequest, rea if err != nil { return nil, err } - knnCollector := collector.NewKNNCollector(kArray, sumOfK) + knnCollector, err := i.buildKNNCollector(ctx, KNNQuery, reader, kArray, sumOfK) + if err != nil { + return nil, err + } err = knnCollector.Collect(ctx, knnSearcher, reader) if err != nil { return nil, err @@ -661,3 +668,49 @@ func (r *rescorer) restoreKnnRequest() { r.req.KNN[i].Boost = &b } } + +func (i *indexImpl) buildKNNCollector(ctx context.Context, KNNQuery query.Query, reader index.IndexReader, kArray []int64, somOfK int64) (*collector.KNNCollector, error) { + // check if we are in nested mode + if nestedMode, ok := ctx.Value(search.NestedSearchKey).(bool); ok && nestedMode { + // get the nested reader from the index reader + if nr, ok := reader.(index.NestedReader); ok { + // check if the KNN query intersects with the nested mapping + if nm, ok := i.m.(mapping.NestedMapping); ok { + var fs search.FieldSet + var err error + fs, err = query.ExtractFields(KNNQuery, i.m, fs) + if err != nil { + return nil, err + } + if nm.IntersectsPrefix(fs) { + return collector.NewNestedKNNCollector(nr, kArray, somOfK), nil + } + } + } + } + + return collector.NewKNNCollector(kArray, somOfK), nil +} + +func (i *indexImpl) buildEligibleCollector(ctx context.Context, filterQuery query.Query, reader index.IndexReader, size int) (*collector.EligibleCollector, error) { + // check if we are in nested mode + if nestedMode, ok := ctx.Value(search.NestedSearchKey).(bool); ok && nestedMode { + // get the nested reader from the index reader + if nr, ok := reader.(index.NestedReader); ok { + // check if the filter query intersects with the nested mapping + if nm, ok := i.m.(mapping.NestedMapping); ok { + var fs search.FieldSet + var err error + fs, err = query.ExtractFields(filterQuery, i.m, fs) + if err != nil { + return nil, err + } + if nm.IntersectsPrefix(fs) { + return collector.NewNestedEligibleCollector(nr, size), nil + } + } + } + } + + return collector.NewEligibleCollector(size), nil +} diff --git a/search_test.go b/search_test.go index 829696ae6..0e150ea1a 100644 --- a/search_test.go +++ b/search_test.go @@ -5206,3 +5206,708 @@ func TestSearchRequestValidatePagination(t *testing.T) { }) } } + +func 
createNestedIndexMapping() mapping.IndexMapping { + + /* + company + ├── id + ├── name + ├── departments[] (nested) + │ ├── name + │ ├── budget + │ ├── employees[] (nested) + │ │ ├── name + │ │ ├── role + │ └── projects[] (nested) + │ ├── title + │ ├── status + └── locations[] (nested) + ├── city + ├── country + */ + + // Create the index mapping + imap := mapping.NewIndexMapping() + + // Create company mapping + companyMapping := mapping.NewDocumentMapping() + + // Company ID field + companyIDField := mapping.NewTextFieldMapping() + companyMapping.AddFieldMappingsAt("id", companyIDField) + + // Company name field + companyNameField := mapping.NewTextFieldMapping() + companyMapping.AddFieldMappingsAt("name", companyNameField) + + // Departments mapping + departmentsMapping := mapping.NewDocumentMapping() + departmentsMapping.Nested = true + + // Department name field + deptNameField := mapping.NewTextFieldMapping() + departmentsMapping.AddFieldMappingsAt("name", deptNameField) + + // Department budget field + deptBudgetField := mapping.NewNumericFieldMapping() + departmentsMapping.AddFieldMappingsAt("budget", deptBudgetField) + + // Employees mapping + employeesMapping := mapping.NewDocumentMapping() + employeesMapping.Nested = true + + // Employee name field + empNameField := mapping.NewTextFieldMapping() + employeesMapping.AddFieldMappingsAt("name", empNameField) + + // Employee role field + empRoleField := mapping.NewTextFieldMapping() + employeesMapping.AddFieldMappingsAt("role", empRoleField) + + departmentsMapping.AddSubDocumentMapping("employees", employeesMapping) + + // Projects mapping + projectsMapping := mapping.NewDocumentMapping() + projectsMapping.Nested = true + + // Project title field + projTitleField := mapping.NewTextFieldMapping() + projTitleField.Analyzer = keyword.Name + projectsMapping.AddFieldMappingsAt("title", projTitleField) + + // Project status field + projStatusField := mapping.NewTextFieldMapping() + projectsMapping.AddFieldMappingsAt("status", projStatusField) + + departmentsMapping.AddSubDocumentMapping("projects", projectsMapping) + + companyMapping.AddSubDocumentMapping("departments", departmentsMapping) + + // Locations mapping + locationsMapping := mapping.NewDocumentMapping() + locationsMapping.Nested = true + + // Location city field + cityField := mapping.NewTextFieldMapping() + locationsMapping.AddFieldMappingsAt("city", cityField) + + // Location country field + countryField := mapping.NewTextFieldMapping() + locationsMapping.AddFieldMappingsAt("country", countryField) + + companyMapping.AddSubDocumentMapping("locations", locationsMapping) + + // Add company to type mapping + imap.DefaultMapping.AddSubDocumentMapping("company", companyMapping) + + return imap +} +func TestNestedPrefixes(t *testing.T) { + imap := createNestedIndexMapping() + + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) + + idx, err := New(tmpIndexPath, imap) + if err != nil { + t.Fatal(err) + } + defer func() { + err = idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + nmap, ok := imap.(mapping.NestedMapping) + if !ok { + t.Fatal("index mapping is not a NestedMapping") + } + + // Test 1: Employee Role AND Employee Name + fs := search.NewFieldSet() + fs.AddField("company.departments.employees.role") + fs.AddField("company.departments.employees.name") + + // Expected depth is 2 (employees are nested within departments) + expectedDepth := 2 + + actualDepth := nmap.CoveringDepth(fs) + if actualDepth != expectedDepth { + t.Fatalf("expected 
depth %d, got %d", expectedDepth, actualDepth) + } + + // Test 2: Employee Role AND Employee Name AND Department Name + fs = search.NewFieldSet() + fs.AddField("company.departments.employees.role") + fs.AddField("company.departments.employees.name") + fs.AddField("company.departments.name") + // Expected depth is 1 (employees and department share the same department context) + expectedDepth = 1 + + actualDepth = nmap.CoveringDepth(fs) + if actualDepth != expectedDepth { + t.Fatalf("expected depth %d, got %d", expectedDepth, actualDepth) + } + + // Test 3: Employee Role AND Location City + fs = search.NewFieldSet() + fs.AddField("company.departments.employees.role") + fs.AddField("company.locations.city") + // Expected depth is 0 (employees and locations are in different nested contexts) + expectedDepth = 0 + + actualDepth = nmap.CoveringDepth(fs) + if actualDepth != expectedDepth { + t.Fatalf("expected depth %d, got %d", expectedDepth, actualDepth) + } + + // Test 4: Company Name AND Location Country + fs = search.NewFieldSet() + fs.AddField("company.name") + fs.AddField("company.locations.country") + fs.AddField("company.locations.city") + // Expected depth is 0 (company.name is at root, locations are nested) + expectedDepth = 0 + actualDepth = nmap.CoveringDepth(fs) + if actualDepth != expectedDepth { + t.Fatalf("expected depth %d, got %d", expectedDepth, actualDepth) + } + + // Test 5: Department Budget AND Project Status AND Employee Name + fs = search.NewFieldSet() + fs.AddField("company.departments.budget") + fs.AddField("company.departments.projects.status") + fs.AddField("company.departments.employees.name") + // Expected depth is 1 (all share the same department context) + expectedDepth = 1 + actualDepth = nmap.CoveringDepth(fs) + if actualDepth != expectedDepth { + t.Fatalf("expected depth %d, got %d", expectedDepth, actualDepth) + } + + // Test 6: Single Field - Company ID + fs = search.NewFieldSet() + fs.AddField("company.id") + // Expected depth is 0 (company.id is at root) + expectedDepth = 0 + actualDepth = nmap.CoveringDepth(fs) + if actualDepth != expectedDepth { + t.Fatalf("expected depth %d, got %d", expectedDepth, actualDepth) + } + + // Test 7: No Fields + fs = search.NewFieldSet() + // Expected depth is 0 (no fields) + expectedDepth = 0 + actualDepth = nmap.CoveringDepth(fs) + if actualDepth != expectedDepth { + t.Fatalf("expected depth %d, got %d", expectedDepth, actualDepth) + } + + // Test 8: All Fields + fs = search.NewFieldSet() + fs.AddField("company.id") + fs.AddField("company.name") + fs.AddField("company.departments.name") + fs.AddField("company.departments.budget") + fs.AddField("company.departments.employees.name") + fs.AddField("company.departments.employees.role") + fs.AddField("company.departments.projects.title") + fs.AddField("company.departments.projects.status") + fs.AddField("company.locations.city") + fs.AddField("company.locations.country") + // Expected depth is 0 (fields span multiple nested contexts) + expectedDepth = 0 + actualDepth = nmap.CoveringDepth(fs) + if actualDepth != expectedDepth { + t.Fatalf("expected depth %d, got %d", expectedDepth, actualDepth) + } + + // Test 9: Project Title AND Project Status + fs = search.NewFieldSet() + fs.AddField("company.departments.projects.title") + fs.AddField("company.departments.projects.status") + // Expected depth is 2 (projects are nested within departments) + expectedDepth = 2 + actualDepth = nmap.CoveringDepth(fs) + if actualDepth != expectedDepth { + t.Fatalf("expected depth %d, got %d", 
expectedDepth, actualDepth) + } + + // Test 10: Department Name AND Location Country + fs = search.NewFieldSet() + fs.AddField("company.departments.name") + fs.AddField("company.locations.country") + fs.AddField("company.locations.city") + + // Expected depth is 0 (departments and locations are in different nested contexts) + expectedDepth = 0 + + actualDepth = nmap.CoveringDepth(fs) + if actualDepth != expectedDepth { + t.Fatalf("expected depth %d, got %d", expectedDepth, actualDepth) + } +} + +func TestNestedConjunctionQuery(t *testing.T) { + imap := createNestedIndexMapping() + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) + idx, err := New(tmpIndexPath, imap) + if err != nil { + t.Fatal(err) + } + defer func() { + err = idx.Close() + if err != nil { + t.Fatal(err) + } + }() + // Index 3 sample documents + docs := []struct { + id string + data string + }{ + { + id: "1", + data: `{ + "company": { + "id": "c1", + "name": "TechCorp", + "departments": [ + { + "name": "Engineering", + "budget": 2000000, + "employees": [ + {"name": "Alice", "role": "Engineer"}, + {"name": "Bob", "role": "Manager"} + ], + "projects": [ + {"title": "Project X", "status": "ongoing"}, + {"title": "Project Y", "status": "completed"} + ] + }, + { + "name": "Sales", + "budget": 300000, + "employees": [ + {"name": "Eve", "role": "Salesperson"}, + {"name": "Mallory", "role": "Manager"} + ], + "projects": [ + {"title": "Project A", "status": "completed"}, + {"title": "Project B", "status": "ongoing"} + ] + } + ], + "locations": [ + {"city": "Athens", "country": "Greece"}, + {"city": "Berlin", "country": "USA"} + ] + } + }`, + }, + { + id: "2", + data: `{ + "company" : { + "id": "c2", + "name": "BizInc", + "departments": [ + { + "name": "Marketing", + "budget": 800000, + "employees": [ + {"name": "Eve", "role": "Marketer"}, + {"name": "David", "role": "Manager"} + ], + "projects": [ + {"title": "Project Z", "status": "ongoing"}, + {"title": "Project W", "status": "planned"} + ] + }, + { + "name": "Engineering", + "budget": 800000, + "employees": [ + {"name": "Frank", "role": "Manager"}, + {"name": "Grace", "role": "Engineer"} + ], + "projects": [ + {"title": "Project Alpha", "status": "completed"}, + {"title": "Project Beta", "status": "ongoing"} + ] + } + ], + "locations": [ + {"city": "Athens", "country": "USA"}, + {"city": "London", "country": "UK"} + ] + } + }`, + }, + { + id: "3", + data: `{ + "company": { + "id": "c3", + "name": "WebSolutions", + "departments": [ + { + "name": "HR", + "budget": 800000, + "employees": [ + {"name": "Eve", "role": "Manager"}, + {"name": "Frank", "role": "HR"} + ], + "projects": [ + {"title": "Project Beta", "status": "completed"}, + {"title": "Project B", "status": "ongoing"} + ] + }, + { + "name": "Engineering", + "budget": 200000, + "employees": [ + {"name": "Heidi", "role": "Support Engineer"}, + {"name": "Ivan", "role": "Manager"} + ], + "projects": [ + {"title": "Project Helpdesk", "status": "ongoing"}, + {"title": "Project FAQ", "status": "completed"} + ] + } + ], + "locations": [ + {"city": "Edinburgh", "country": "UK"}, + {"city": "London", "country": "Canada"} + ] + } + }`, + }, + } + + for _, doc := range docs { + var dataMap map[string]interface{} + err := json.Unmarshal([]byte(doc.data), &dataMap) + if err != nil { + t.Fatalf("failed to unmarshal document %s: %v", doc.id, err) + } + err = idx.Index(doc.id, dataMap) + if err != nil { + t.Fatalf("failed to index document %s: %v", doc.id, err) + } + } + + var buildReq = func(subQueries 
[]query.Query) *SearchRequest { + rv := NewSearchRequest(query.NewConjunctionQuery(subQueries)) + rv.SortBy([]string{"_id"}) + return rv + } + + var ( + req *SearchRequest + res *SearchResult + deptNameQuery *query.MatchQuery + deptBudgetQuery *query.NumericRangeQuery + empNameQuery *query.MatchQuery + empRoleQuery *query.MatchQuery + projTitleQuery *query.MatchQuery + projStatusQuery *query.MatchQuery + countryQuery *query.MatchQuery + cityQuery *query.MatchQuery + ) + + // Test 1: Find companies with a department named "Engineering" AND budget over 900000 + deptNameQuery = query.NewMatchQuery("Engineering") + deptNameQuery.SetField("company.departments.name") + + min := float64(800000) + deptBudgetQuery = query.NewNumericRangeQuery(&min, nil) + deptBudgetQuery.SetField("company.departments.budget") + + req = buildReq([]query.Query{deptNameQuery, deptBudgetQuery}) + res, err = idx.Search(req) + if err != nil { + t.Fatalf("search failed: %v", err) + } + if len(res.Hits) != 2 { + t.Fatalf("expected 2 hit, got %d", len(res.Hits)) + } + if res.Hits[0].ID != "1" || res.Hits[1].ID != "2" { + t.Fatalf("unexpected hit IDs: %v, %v", res.Hits[0].ID, res.Hits[1].ID) + } + + // Test 2: Find companies with an employee named "Eve" AND project status "completed" + empNameQuery = query.NewMatchQuery("Eve") + empNameQuery.SetField("company.departments.employees.name") + + projStatusQuery = query.NewMatchQuery("completed") + projStatusQuery.SetField("company.departments.projects.status") + + req = buildReq([]query.Query{empNameQuery, projStatusQuery}) + res, err = idx.Search(req) + if err != nil { + t.Fatalf("search failed: %v", err) + } + if len(res.Hits) != 2 { + t.Fatalf("expected 2 hits, got %d", len(res.Hits)) + } + if res.Hits[0].ID != "1" || res.Hits[1].ID != "3" { + t.Fatalf("unexpected hit IDs: %v, %v", res.Hits[0].ID, res.Hits[1].ID) + } + + // Test 3: Find companies located in "Athens, USA" AND with an Engineering department + countryQuery = query.NewMatchQuery("USA") + countryQuery.SetField("company.locations.country") + + cityQuery = query.NewMatchQuery("Athens") + cityQuery.SetField("company.locations.city") + + locQuery := query.NewConjunctionQuery([]query.Query{countryQuery, cityQuery}) + + deptNameQuery = query.NewMatchQuery("Engineering") + deptNameQuery.SetField("company.departments.name") + + req = buildReq([]query.Query{locQuery, deptNameQuery}) + res, err = idx.Search(req) + if err != nil { + t.Fatalf("search failed: %v", err) + } + if len(res.Hits) != 1 { + t.Fatalf("expected 1 hit, got %d", len(res.Hits)) + } + if res.Hits[0].ID != "2" { + t.Fatalf("unexpected hit ID: %v", res.Hits[0].ID) + } + + // Test 4a: Find companies located in "Athens, USA" AND with an Engineering department with a budget over 1M + countryQuery = query.NewMatchQuery("USA") + countryQuery.SetField("company.locations.country") + + cityQuery = query.NewMatchQuery("Athens") + cityQuery.SetField("company.locations.city") + + locQuery = query.NewConjunctionQuery([]query.Query{countryQuery, cityQuery}) + + deptNameQuery = query.NewMatchQuery("Engineering") + deptNameQuery.SetField("company.departments.name") + + min = float64(1000000) + deptBudgetQuery = query.NewNumericRangeQuery(&min, nil) + deptBudgetQuery.SetField("company.departments.budget") + + deptQuery := query.NewConjunctionQuery([]query.Query{deptNameQuery, deptBudgetQuery}) + + req = buildReq([]query.Query{locQuery, deptQuery}) + res, err = idx.Search(req) + if err != nil { + t.Fatalf("search failed: %v", err) + } + if len(res.Hits) != 0 { + 
t.Fatalf("expected 0 hits, got %d", len(res.Hits)) + } + + // Test 4b: Find companies located in "Athens, Greece" AND with an Engineering department with a budget over 1M + countryQuery = query.NewMatchQuery("Greece") + countryQuery.SetField("company.locations.country") + + cityQuery = query.NewMatchQuery("Athens") + cityQuery.SetField("company.locations.city") + + locQuery = query.NewConjunctionQuery([]query.Query{countryQuery, cityQuery}) + + deptNameQuery = query.NewMatchQuery("Engineering") + deptNameQuery.SetField("company.departments.name") + + min = float64(1000000) + deptBudgetQuery = query.NewNumericRangeQuery(&min, nil) + deptBudgetQuery.SetField("company.departments.budget") + + deptQuery = query.NewConjunctionQuery([]query.Query{deptNameQuery, deptBudgetQuery}) + + req = buildReq([]query.Query{locQuery, deptQuery}) + res, err = idx.Search(req) + if err != nil { + t.Fatalf("search failed: %v", err) + } + if len(res.Hits) != 1 { + t.Fatalf("expected 1hits, got %d", len(res.Hits)) + } + if res.Hits[0].ID != "1" { + t.Fatalf("unexpected hit ID: %v", res.Hits[0].ID) + } + + // Test 5a: Find companies with an employee named "Frank" AND role "Manager" whose department is + // handling a project titled "Project Beta" which is marked as "ongoing" + empNameQuery = query.NewMatchQuery("Frank") + empNameQuery.SetField("company.departments.employees.name") + + empRoleQuery = query.NewMatchQuery("Manager") + empRoleQuery.SetField("company.departments.employees.role") + + empQuery := query.NewConjunctionQuery([]query.Query{empNameQuery, empRoleQuery}) + + projTitleQuery = query.NewMatchQuery("Project Beta") + projTitleQuery.SetField("company.departments.projects.title") + + projStatusQuery = query.NewMatchQuery("completed") + projStatusQuery.SetField("company.departments.projects.status") + + projQuery := query.NewConjunctionQuery([]query.Query{projTitleQuery, projStatusQuery}) + + req = buildReq([]query.Query{empQuery, projQuery}) + res, err = idx.Search(req) + if err != nil { + t.Fatalf("search failed: %v", err) + } + if len(res.Hits) != 0 { + t.Fatalf("expected 0 hit, got %d", len(res.Hits)) + } + + // Test 5b: Find companies with an employee named "Frank" AND role "Manager" whose department is + // handling a project titled "Project Beta" which is marked as "completed" + empNameQuery = query.NewMatchQuery("Frank") + empNameQuery.SetField("company.departments.employees.name") + + empRoleQuery = query.NewMatchQuery("Manager") + empRoleQuery.SetField("company.departments.employees.role") + + empQuery = query.NewConjunctionQuery([]query.Query{empNameQuery, empRoleQuery}) + + projTitleQuery = query.NewMatchQuery("Project Beta") + projTitleQuery.SetField("company.departments.projects.title") + + projStatusQuery = query.NewMatchQuery("ongoing") + projStatusQuery.SetField("company.departments.projects.status") + + projQuery = query.NewConjunctionQuery([]query.Query{projTitleQuery, projStatusQuery}) + + req = buildReq([]query.Query{empQuery, projQuery}) + res, err = idx.Search(req) + if err != nil { + t.Fatalf("search failed: %v", err) + } + if len(res.Hits) != 1 { + t.Fatalf("expected 1 hit, got %d", len(res.Hits)) + } + if res.Hits[0].ID != "2" { + t.Fatalf("unexpected hit ID: %v", res.Hits[0].ID) + } + + // Test 6a: Find companies with an employee named "Eve" AND role "Manager" + // who is working in a department located in "London, UK" + empNameQuery = query.NewMatchQuery("Eve") + empNameQuery.SetField("company.departments.employees.name") + + empRoleQuery = query.NewMatchQuery("Manager") + 
empRoleQuery.SetField("company.departments.employees.role") + + empQuery = query.NewConjunctionQuery([]query.Query{empNameQuery, empRoleQuery}) + + countryQuery = query.NewMatchQuery("UK") + countryQuery.SetField("company.locations.country") + + cityQuery = query.NewMatchQuery("London") + cityQuery.SetField("company.locations.city") + + locQuery = query.NewConjunctionQuery([]query.Query{countryQuery, cityQuery}) + + req = buildReq([]query.Query{empQuery, locQuery}) + res, err = idx.Search(req) + if err != nil { + t.Fatalf("search failed: %v", err) + } + if len(res.Hits) != 0 { + t.Fatalf("expected 0 hit, got %d", len(res.Hits)) + } + + // Test 6b: Find companies with an employee named "Eve" AND role "Manager" + // who is working in a department located in "London, Canada" + empNameQuery = query.NewMatchQuery("Eve") + empNameQuery.SetField("company.departments.employees.name") + + empRoleQuery = query.NewMatchQuery("Manager") + empRoleQuery.SetField("company.departments.employees.role") + + empQuery = query.NewConjunctionQuery([]query.Query{empNameQuery, empRoleQuery}) + + countryQuery = query.NewMatchQuery("Canada") + countryQuery.SetField("company.locations.country") + + cityQuery = query.NewMatchQuery("London") + cityQuery.SetField("company.locations.city") + + locQuery = query.NewConjunctionQuery([]query.Query{countryQuery, cityQuery}) + + req = buildReq([]query.Query{empQuery, locQuery}) + res, err = idx.Search(req) + if err != nil { + t.Fatalf("search failed: %v", err) + } + if len(res.Hits) != 1 { + t.Fatalf("expected 1 hit, got %d", len(res.Hits)) + } + if res.Hits[0].ID != "3" { + t.Fatalf("unexpected hit ID: %v", res.Hits[0].ID) + } + + // Test 7a: Find companies where Ivan the Manager works London, UK + + empNameQuery = query.NewMatchQuery("Ivan") + empNameQuery.SetField("company.departments.employees.name") + + empRoleQuery = query.NewMatchQuery("Manager") + empRoleQuery.SetField("company.departments.employees.role") + + empQuery = query.NewConjunctionQuery([]query.Query{empNameQuery, empRoleQuery}) + + countryQuery = query.NewMatchQuery("UK") + countryQuery.SetField("company.locations.country") + + cityQuery = query.NewMatchQuery("London") + cityQuery.SetField("company.locations.city") + + locQuery = query.NewConjunctionQuery([]query.Query{countryQuery, cityQuery}) + + req = buildReq([]query.Query{empQuery, locQuery}) + res, err = idx.Search(req) + if err != nil { + t.Fatalf("search failed: %v", err) + } + if len(res.Hits) != 0 { + t.Fatalf("expected 1 hit, got %d", len(res.Hits)) + } + + // Test 7b: Find companies where Ivan the Manager works London, Canada + + empNameQuery = query.NewMatchQuery("Ivan") + empNameQuery.SetField("company.departments.employees.name") + + empRoleQuery = query.NewMatchQuery("Manager") + empRoleQuery.SetField("company.departments.employees.role") + + empQuery = query.NewConjunctionQuery([]query.Query{empNameQuery, empRoleQuery}) + + countryQuery = query.NewMatchQuery("Canada") + countryQuery.SetField("company.locations.country") + + cityQuery = query.NewMatchQuery("London") + cityQuery.SetField("company.locations.city") + + locQuery = query.NewConjunctionQuery([]query.Query{countryQuery, cityQuery}) + + req = buildReq([]query.Query{empQuery, locQuery}) + res, err = idx.Search(req) + if err != nil { + t.Fatalf("search failed: %v", err) + } + if len(res.Hits) != 1 { + t.Fatalf("expected 1 hit, got %d", len(res.Hits)) + } + if res.Hits[0].ID != "3" { + t.Fatalf("unexpected hit ID: %v", res.Hits[0].ID) + } +} From 
abaddc7171133c255dd21b971ee14078678bee8c Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Mon, 17 Nov 2025 17:58:06 +0530 Subject: [PATCH 02/70] minor UT change --- go.mod | 6 ++++++ go.sum | 6 ------ search_test.go | 7 +++---- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/go.mod b/go.mod index c4bc98254..bf3c2cf45 100644 --- a/go.mod +++ b/go.mod @@ -44,3 +44,9 @@ require ( github.com/spf13/pflag v1.0.6 // indirect golang.org/x/sys v0.29.0 // indirect ) + +replace github.com/blevesearch/bleve_index_api => ../bleve_index_api + +replace github.com/blevesearch/zapx/v16 => ../zapx + +replace github.com/blevesearch/scorch_segment_api/v2 => ../scorch_segment_api diff --git a/go.sum b/go.sum index b46bebcef..d07d1ac8a 100644 --- a/go.sum +++ b/go.sum @@ -3,8 +3,6 @@ github.com/RoaringBitmap/roaring/v2 v2.4.5/go.mod h1:FiJcsfkGje/nZBZgCu0ZxCPOKD/ github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= github.com/bits-and-blooms/bitset v1.22.0 h1:Tquv9S8+SGaS3EhyA+up3FXzmkhxPGjQQCkcs2uw7w4= github.com/bits-and-blooms/bitset v1.22.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= -github.com/blevesearch/bleve_index_api v1.2.11 h1:bXQ54kVuwP8hdrXUSOnvTQfgK0KI1+f9A0ITJT8tX1s= -github.com/blevesearch/bleve_index_api v1.2.11/go.mod h1:rKQDl4u51uwafZxFrPD1R7xFOwKnzZW7s/LSeK4lgo0= github.com/blevesearch/geo v0.2.4 h1:ECIGQhw+QALCZaDcogRTNSJYQXRtC8/m8IKiA706cqk= github.com/blevesearch/geo v0.2.4/go.mod h1:K56Q33AzXt2YExVHGObtmRSFYZKYGv0JEN5mdacJJR8= github.com/blevesearch/go-faiss v1.0.26 h1:4dRLolFgjPyjkaXwff4NfbZFdE/dfywbzDqporeQvXI= @@ -20,8 +18,6 @@ github.com/blevesearch/gtreap v0.1.1/go.mod h1:QaQyDRAT51sotthUWAH4Sj08awFSSWzgY github.com/blevesearch/mmap-go v1.0.2/go.mod h1:ol2qBqYaOUsGdm7aRMRrYGgPvnwLe6Y+7LMvAB5IbSA= github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCDPWmc= github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs= -github.com/blevesearch/scorch_segment_api/v2 v2.3.13 h1:ZPjv/4VwWvHJZKeMSgScCapOy8+DdmsmRyLmSB88UoY= -github.com/blevesearch/scorch_segment_api/v2 v2.3.13/go.mod h1:ENk2LClTehOuMS8XzN3UxBEErYmtwkE7MAArFTXs9Vc= github.com/blevesearch/segment v0.9.1 h1:+dThDy+Lvgj5JMxhmOVlgFfkUtZV2kw49xax4+jTfSU= github.com/blevesearch/segment v0.9.1/go.mod h1:zN21iLm7+GnBHWTao9I+Au/7MBiL8pPFtJBJTsk6kQw= github.com/blevesearch/snowball v0.6.1 h1:cDYjn/NCH+wwt2UdehaLpr2e4BwLIjN4V/TdLsL+B5A= @@ -44,8 +40,6 @@ github.com/blevesearch/zapx/v14 v14.4.2 h1:2SGHakVKd+TrtEqpfeq8X+So5PShQ5nW6GNxT github.com/blevesearch/zapx/v14 v14.4.2/go.mod h1:rz0XNb/OZSMjNorufDGSpFpjoFKhXmppH9Hi7a877D8= github.com/blevesearch/zapx/v15 v15.4.2 h1:sWxpDE0QQOTjyxYbAVjt3+0ieu8NCE0fDRaFxEsp31k= github.com/blevesearch/zapx/v15 v15.4.2/go.mod h1:1pssev/59FsuWcgSnTa0OeEpOzmhtmr/0/11H0Z8+Nw= -github.com/blevesearch/zapx/v16 v16.2.7 h1:xcgFRa7f/tQXOwApVq7JWgPYSlzyUMmkuYa54tMDuR0= -github.com/blevesearch/zapx/v16 v16.2.7/go.mod h1:murSoCJPCk25MqURrcJaBQ1RekuqSCSfMjXH4rHyA14= github.com/couchbase/ghistogram v0.1.0 h1:b95QcQTCzjTUocDXp/uMgSNQi8oj1tGwnJ4bODWZnps= github.com/couchbase/ghistogram v0.1.0/go.mod h1:s1Jhy76zqfEecpNWJfWUiKZookAFaiGOEoyzgHt9i7k= github.com/couchbase/moss v0.2.0 h1:VCYrMzFwEryyhRSeI+/b3tRBSeTpi/8gn5Kf6dxqn+o= diff --git a/search_test.go b/search_test.go index 0e150ea1a..47657ea18 100644 --- a/search_test.go +++ b/search_test.go @@ -5273,7 +5273,6 @@ func createNestedIndexMapping() mapping.IndexMapping { // Project title field projTitleField := mapping.NewTextFieldMapping() 
- projTitleField.Analyzer = keyword.Name projectsMapping.AddFieldMappingsAt("title", projTitleField) // Project status field @@ -5614,7 +5613,7 @@ func TestNestedConjunctionQuery(t *testing.T) { deptBudgetQuery *query.NumericRangeQuery empNameQuery *query.MatchQuery empRoleQuery *query.MatchQuery - projTitleQuery *query.MatchQuery + projTitleQuery *query.MatchPhraseQuery projStatusQuery *query.MatchQuery countryQuery *query.MatchQuery cityQuery *query.MatchQuery @@ -5750,7 +5749,7 @@ func TestNestedConjunctionQuery(t *testing.T) { empQuery := query.NewConjunctionQuery([]query.Query{empNameQuery, empRoleQuery}) - projTitleQuery = query.NewMatchQuery("Project Beta") + projTitleQuery = query.NewMatchPhraseQuery("Project Beta") projTitleQuery.SetField("company.departments.projects.title") projStatusQuery = query.NewMatchQuery("completed") @@ -5777,7 +5776,7 @@ func TestNestedConjunctionQuery(t *testing.T) { empQuery = query.NewConjunctionQuery([]query.Query{empNameQuery, empRoleQuery}) - projTitleQuery = query.NewMatchQuery("Project Beta") + projTitleQuery = query.NewMatchPhraseQuery("Project Beta") projTitleQuery.SetField("company.departments.projects.title") projStatusQuery = query.NewMatchQuery("ongoing") From f8f4061ae150b20f24749a91fa36a1a12e299e3e Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Mon, 17 Nov 2025 17:58:27 +0530 Subject: [PATCH 03/70] revert gomod change --- go.mod | 6 ------ go.sum | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/go.mod b/go.mod index bf3c2cf45..c4bc98254 100644 --- a/go.mod +++ b/go.mod @@ -44,9 +44,3 @@ require ( github.com/spf13/pflag v1.0.6 // indirect golang.org/x/sys v0.29.0 // indirect ) - -replace github.com/blevesearch/bleve_index_api => ../bleve_index_api - -replace github.com/blevesearch/zapx/v16 => ../zapx - -replace github.com/blevesearch/scorch_segment_api/v2 => ../scorch_segment_api diff --git a/go.sum b/go.sum index d07d1ac8a..b46bebcef 100644 --- a/go.sum +++ b/go.sum @@ -3,6 +3,8 @@ github.com/RoaringBitmap/roaring/v2 v2.4.5/go.mod h1:FiJcsfkGje/nZBZgCu0ZxCPOKD/ github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= github.com/bits-and-blooms/bitset v1.22.0 h1:Tquv9S8+SGaS3EhyA+up3FXzmkhxPGjQQCkcs2uw7w4= github.com/bits-and-blooms/bitset v1.22.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= +github.com/blevesearch/bleve_index_api v1.2.11 h1:bXQ54kVuwP8hdrXUSOnvTQfgK0KI1+f9A0ITJT8tX1s= +github.com/blevesearch/bleve_index_api v1.2.11/go.mod h1:rKQDl4u51uwafZxFrPD1R7xFOwKnzZW7s/LSeK4lgo0= github.com/blevesearch/geo v0.2.4 h1:ECIGQhw+QALCZaDcogRTNSJYQXRtC8/m8IKiA706cqk= github.com/blevesearch/geo v0.2.4/go.mod h1:K56Q33AzXt2YExVHGObtmRSFYZKYGv0JEN5mdacJJR8= github.com/blevesearch/go-faiss v1.0.26 h1:4dRLolFgjPyjkaXwff4NfbZFdE/dfywbzDqporeQvXI= @@ -18,6 +20,8 @@ github.com/blevesearch/gtreap v0.1.1/go.mod h1:QaQyDRAT51sotthUWAH4Sj08awFSSWzgY github.com/blevesearch/mmap-go v1.0.2/go.mod h1:ol2qBqYaOUsGdm7aRMRrYGgPvnwLe6Y+7LMvAB5IbSA= github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCDPWmc= github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs= +github.com/blevesearch/scorch_segment_api/v2 v2.3.13 h1:ZPjv/4VwWvHJZKeMSgScCapOy8+DdmsmRyLmSB88UoY= +github.com/blevesearch/scorch_segment_api/v2 v2.3.13/go.mod h1:ENk2LClTehOuMS8XzN3UxBEErYmtwkE7MAArFTXs9Vc= github.com/blevesearch/segment v0.9.1 h1:+dThDy+Lvgj5JMxhmOVlgFfkUtZV2kw49xax4+jTfSU= github.com/blevesearch/segment v0.9.1/go.mod 
h1:zN21iLm7+GnBHWTao9I+Au/7MBiL8pPFtJBJTsk6kQw= github.com/blevesearch/snowball v0.6.1 h1:cDYjn/NCH+wwt2UdehaLpr2e4BwLIjN4V/TdLsL+B5A= @@ -40,6 +44,8 @@ github.com/blevesearch/zapx/v14 v14.4.2 h1:2SGHakVKd+TrtEqpfeq8X+So5PShQ5nW6GNxT github.com/blevesearch/zapx/v14 v14.4.2/go.mod h1:rz0XNb/OZSMjNorufDGSpFpjoFKhXmppH9Hi7a877D8= github.com/blevesearch/zapx/v15 v15.4.2 h1:sWxpDE0QQOTjyxYbAVjt3+0ieu8NCE0fDRaFxEsp31k= github.com/blevesearch/zapx/v15 v15.4.2/go.mod h1:1pssev/59FsuWcgSnTa0OeEpOzmhtmr/0/11H0Z8+Nw= +github.com/blevesearch/zapx/v16 v16.2.7 h1:xcgFRa7f/tQXOwApVq7JWgPYSlzyUMmkuYa54tMDuR0= +github.com/blevesearch/zapx/v16 v16.2.7/go.mod h1:murSoCJPCk25MqURrcJaBQ1RekuqSCSfMjXH4rHyA14= github.com/couchbase/ghistogram v0.1.0 h1:b95QcQTCzjTUocDXp/uMgSNQi8oj1tGwnJ4bODWZnps= github.com/couchbase/ghistogram v0.1.0/go.mod h1:s1Jhy76zqfEecpNWJfWUiKZookAFaiGOEoyzgHt9i7k= github.com/couchbase/moss v0.2.0 h1:VCYrMzFwEryyhRSeI+/b3tRBSeTpi/8gn5Kf6dxqn+o= From b6ac3e11a4f521d787a2d2924e6f3c847fda6359 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Mon, 17 Nov 2025 18:48:23 +0530 Subject: [PATCH 04/70] typos --- search_knn.go | 6 +++--- search_test.go | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/search_knn.go b/search_knn.go index 48dd7f4e9..f4aa18879 100644 --- a/search_knn.go +++ b/search_knn.go @@ -669,7 +669,7 @@ func (r *rescorer) restoreKnnRequest() { } } -func (i *indexImpl) buildKNNCollector(ctx context.Context, KNNQuery query.Query, reader index.IndexReader, kArray []int64, somOfK int64) (*collector.KNNCollector, error) { +func (i *indexImpl) buildKNNCollector(ctx context.Context, KNNQuery query.Query, reader index.IndexReader, kArray []int64, sumOfK int64) (*collector.KNNCollector, error) { // check if we are in nested mode if nestedMode, ok := ctx.Value(search.NestedSearchKey).(bool); ok && nestedMode { // get the nested reader from the index reader @@ -683,13 +683,13 @@ func (i *indexImpl) buildKNNCollector(ctx context.Context, KNNQuery query.Query, return nil, err } if nm.IntersectsPrefix(fs) { - return collector.NewNestedKNNCollector(nr, kArray, somOfK), nil + return collector.NewNestedKNNCollector(nr, kArray, sumOfK), nil } } } } - return collector.NewKNNCollector(kArray, somOfK), nil + return collector.NewKNNCollector(kArray, sumOfK), nil } func (i *indexImpl) buildEligibleCollector(ctx context.Context, filterQuery query.Query, reader index.IndexReader, size int) (*collector.EligibleCollector, error) { diff --git a/search_test.go b/search_test.go index 47657ea18..ce4c31537 100644 --- a/search_test.go +++ b/search_test.go @@ -5733,14 +5733,14 @@ func TestNestedConjunctionQuery(t *testing.T) { t.Fatalf("search failed: %v", err) } if len(res.Hits) != 1 { - t.Fatalf("expected 1hits, got %d", len(res.Hits)) + t.Fatalf("expected 1 hits, got %d", len(res.Hits)) } if res.Hits[0].ID != "1" { t.Fatalf("unexpected hit ID: %v", res.Hits[0].ID) } // Test 5a: Find companies with an employee named "Frank" AND role "Manager" whose department is - // handling a project titled "Project Beta" which is marked as "ongoing" + // handling a project titled "Project Beta" which is marked as "completed" empNameQuery = query.NewMatchQuery("Frank") empNameQuery.SetField("company.departments.employees.name") @@ -5767,7 +5767,7 @@ func TestNestedConjunctionQuery(t *testing.T) { } // Test 5b: Find companies with an employee named "Frank" AND role "Manager" whose department is - // handling a project titled "Project Beta" which is marked as "completed" + // handling a project titled 
"Project Beta" which is marked as "ongoing" empNameQuery = query.NewMatchQuery("Frank") empNameQuery.SetField("company.departments.employees.name") @@ -5877,7 +5877,7 @@ func TestNestedConjunctionQuery(t *testing.T) { t.Fatalf("search failed: %v", err) } if len(res.Hits) != 0 { - t.Fatalf("expected 1 hit, got %d", len(res.Hits)) + t.Fatalf("expected 0 hit, got %d", len(res.Hits)) } // Test 7b: Find companies where Ivan the Manager works London, Canada From a5fab1784afb62aae37e2007a6595b58af9e346c Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Mon, 17 Nov 2025 20:33:29 +0530 Subject: [PATCH 05/70] frankUT --- search_test.go | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/search_test.go b/search_test.go index ce4c31537..c0c50f722 100644 --- a/search_test.go +++ b/search_test.go @@ -5603,6 +5603,11 @@ func TestNestedConjunctionQuery(t *testing.T) { var buildReq = func(subQueries []query.Query) *SearchRequest { rv := NewSearchRequest(query.NewConjunctionQuery(subQueries)) rv.SortBy([]string{"_id"}) + reqString, err := json.MarshalIndent(rv, "", " ") + if err != nil { + t.Fatalf("failed to marshal search request: %v", err) + } + t.Logf("Search Request: %s", reqString) return rv } @@ -5909,4 +5914,38 @@ func TestNestedConjunctionQuery(t *testing.T) { if res.Hits[0].ID != "3" { t.Fatalf("unexpected hit ID: %v", res.Hits[0].ID) } + + // Test 8: Find companies where Frank the Manager works in Engineering department located in London, UK + empNameQuery = query.NewMatchQuery("Frank") + empNameQuery.SetField("company.departments.employees.name") + + empRoleQuery = query.NewMatchQuery("Manager") + empRoleQuery.SetField("company.departments.employees.role") + + empQuery = query.NewConjunctionQuery([]query.Query{empNameQuery, empRoleQuery}) + + deptNameQuery = query.NewMatchQuery("Engineering") + deptNameQuery.SetField("company.departments.name") + + deptQuery = query.NewConjunctionQuery([]query.Query{empQuery, deptNameQuery}) + + countryQuery = query.NewMatchQuery("UK") + countryQuery.SetField("company.locations.country") + + cityQuery = query.NewMatchQuery("London") + cityQuery.SetField("company.locations.city") + + locQuery = query.NewConjunctionQuery([]query.Query{countryQuery, cityQuery}) + + req = buildReq([]query.Query{deptQuery, locQuery}) + res, err = idx.Search(req) + if err != nil { + t.Fatalf("search failed: %v", err) + } + if len(res.Hits) != 1 { + t.Fatalf("expected 1 hit, got %d", len(res.Hits)) + } + if res.Hits[0].ID != "2" { + t.Fatalf("unexpected hit ID: %v", res.Hits[0].ID) + } } From 2be276f198c1c6d9e0ed95c9e9ebe42e43d99e12 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Wed, 19 Nov 2025 19:27:58 +0530 Subject: [PATCH 06/70] fix array of arrays --- mapping/document.go | 58 +++++++++++-- search_test.go | 200 +++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 237 insertions(+), 21 deletions(-) diff --git a/mapping/document.go b/mapping/document.go index b595f3dda..15ab668f4 100644 --- a/mapping/document.go +++ b/mapping/document.go @@ -236,6 +236,17 @@ func NewDocumentMapping() *DocumentMapping { } } +// NewNestedDocumentMapping returns a new document +// mapping that treats sub-documents as nested +// objects. +func NewNestedDocumentMapping() *DocumentMapping { + return &DocumentMapping{ + Nested: true, + Enabled: true, + Dynamic: true, + } +} + // NewDocumentStaticMapping returns a new document // mapping that will not automatically index parts // of a document without an explicit mapping. 
@@ -392,6 +403,18 @@ func (dm *DocumentMapping) defaultSynonymSource(path []string) string { return rv } +// baseType returns the base type of v by dereferencing pointers +func baseType(v interface{}) reflect.Type { + if v == nil { + return nil + } + t := reflect.TypeOf(v) + for t.Kind() == reflect.Pointer { + t = t.Elem() + } + return t +} + func (dm *DocumentMapping) walkDocument(data interface{}, path []string, indexes []uint64, context *walkContext) { // allow default "json" tag to be overridden structTagKey := dm.StructTagKey @@ -446,19 +469,38 @@ func (dm *DocumentMapping) walkDocument(data interface{}, path []string, indexes } case reflect.Slice, reflect.Array: subDocMapping, _ := dm.documentMappingForPathElements(path) - nestedSubObjects := subDocMapping != nil && subDocMapping.Nested + allowNested := subDocMapping != nil && subDocMapping.Nested for i := 0; i < val.Len(); i++ { - if val.Index(i).CanInterface() { - fieldVal := val.Index(i).Interface() - if nestedSubObjects { - nestedDocument := document.NewDocument(fmt.Sprintf("%s_$%s_$%d", context.doc.ID(), encodePath(path), i)) + // for each array element, check if it can be represented as an interface + idxVal := val.Index(i) + // skip invalid values + if !idxVal.CanInterface() { + continue + } + // get the actual value in interface form + actual := idxVal.Interface() + // if nested mapping, only create nested document for object elements + if allowNested && actual != nil { + // check the kind of the actual value, is it an object (struct or map)? + typ := baseType(actual) + if typ == nil { + continue + } + kind := typ.Kind() + // only create nested docs for real JSON objects + if kind == reflect.Struct || kind == reflect.Map { + // Create nested document only for only object elements + nestedDocument := document.NewDocument( + fmt.Sprintf("%s_$%s_$%d", context.doc.ID(), encodePath(path), i)) nestedContext := context.im.newWalkContext(nestedDocument, dm) - dm.processProperty(fieldVal, path, append(indexes, uint64(i)), nestedContext) + dm.processProperty(actual, path, append(indexes, uint64(i)), nestedContext) context.doc.AddNestedDocument(nestedDocument) - } else { - dm.processProperty(fieldVal, path, append(indexes, uint64(i)), context) + continue } } + // non-nested mapping, or non-object element in nested mapping + // process the element normally + dm.processProperty(actual, path, append(indexes, uint64(i)), context) } case reflect.Ptr: ptrElem := val.Elem() diff --git a/search_test.go b/search_test.go index c0c50f722..9372eef68 100644 --- a/search_test.go +++ b/search_test.go @@ -5242,8 +5242,7 @@ func createNestedIndexMapping() mapping.IndexMapping { companyMapping.AddFieldMappingsAt("name", companyNameField) // Departments mapping - departmentsMapping := mapping.NewDocumentMapping() - departmentsMapping.Nested = true + departmentsMapping := mapping.NewNestedDocumentMapping() // Department name field deptNameField := mapping.NewTextFieldMapping() @@ -5254,8 +5253,7 @@ func createNestedIndexMapping() mapping.IndexMapping { departmentsMapping.AddFieldMappingsAt("budget", deptBudgetField) // Employees mapping - employeesMapping := mapping.NewDocumentMapping() - employeesMapping.Nested = true + employeesMapping := mapping.NewNestedDocumentMapping() // Employee name field empNameField := mapping.NewTextFieldMapping() @@ -5268,8 +5266,7 @@ func createNestedIndexMapping() mapping.IndexMapping { departmentsMapping.AddSubDocumentMapping("employees", employeesMapping) // Projects mapping - projectsMapping := 
mapping.NewDocumentMapping() - projectsMapping.Nested = true + projectsMapping := mapping.NewNestedDocumentMapping() // Project title field projTitleField := mapping.NewTextFieldMapping() @@ -5284,8 +5281,7 @@ func createNestedIndexMapping() mapping.IndexMapping { companyMapping.AddSubDocumentMapping("departments", departmentsMapping) // Locations mapping - locationsMapping := mapping.NewDocumentMapping() - locationsMapping.Nested = true + locationsMapping := mapping.NewNestedDocumentMapping() // Location city field cityField := mapping.NewTextFieldMapping() @@ -5603,11 +5599,6 @@ func TestNestedConjunctionQuery(t *testing.T) { var buildReq = func(subQueries []query.Query) *SearchRequest { rv := NewSearchRequest(query.NewConjunctionQuery(subQueries)) rv.SortBy([]string{"_id"}) - reqString, err := json.MarshalIndent(rv, "", " ") - if err != nil { - t.Fatalf("failed to marshal search request: %v", err) - } - t.Logf("Search Request: %s", reqString) return rv } @@ -5949,3 +5940,186 @@ func TestNestedConjunctionQuery(t *testing.T) { t.Fatalf("unexpected hit ID: %v", res.Hits[0].ID) } } + +func TestNestedArrayConjunctionQuery(t *testing.T) { + imap := NewIndexMapping() + groupsMapping := mapping.NewNestedDocumentMapping() + + nameField := mapping.NewTextFieldMapping() + groupsMapping.AddFieldMappingsAt("first_name", nameField) + groupsMapping.AddFieldMappingsAt("last_name", nameField) + + imap.DefaultMapping.AddSubDocumentMapping("groups", groupsMapping) + + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) + idx, err := New(tmpIndexPath, imap) + if err != nil { + t.Fatal(err) + } + defer func() { + err = idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + docs := []string{ + `{ + "groups": [ + [ + { + "first_name": "Alice", + "last_name": "Smith" + }, + { + "first_name": "Bob", + "last_name": "Johnson" + } + ], + [ + { + "first_name": "Charlie", + "last_name": "Williams" + }, + { + "first_name": "Diana", + "last_name": "Brown" + } + ] + ] + }`, + `{ + "groups": [ + { + "first_name": "Alice", + "last_name": "Smith" + }, + { + "first_name": "Bob", + "last_name": "Johnson" + }, + { + "first_name": "Charlie", + "last_name": "Williams" + }, + { + "first_name": "Diana", + "last_name": "Brown" + } + ] + }`, + } + + for i, doc := range docs { + var dataMap map[string]interface{} + err := json.Unmarshal([]byte(doc), &dataMap) + if err != nil { + t.Fatalf("failed to unmarshal document %d: %v", i, err) + } + err = idx.Index(fmt.Sprintf("%d", i+1), dataMap) + if err != nil { + t.Fatalf("failed to index document %d: %v", i, err) + } + } + + var ( + firstNameQuery *query.MatchQuery + lastNameQuery *query.MatchQuery + conjQuery *query.ConjunctionQuery + searchReq *SearchRequest + res *SearchResult + ) + + // Search for documents where first_name is "Alice" AND last_name is "Johnson" + firstNameQuery = query.NewMatchQuery("Alice") + firstNameQuery.SetField("groups.first_name") + + lastNameQuery = query.NewMatchQuery("Johnson") + lastNameQuery.SetField("groups.last_name") + + conjQuery = query.NewConjunctionQuery([]query.Query{firstNameQuery, lastNameQuery}) + + searchReq = NewSearchRequest(conjQuery) + searchReq.SortBy([]string{"_id"}) + + res, err = idx.Search(searchReq) + if err != nil { + t.Fatalf("search failed: %v", err) + } + + if len(res.Hits) != 0 { + t.Fatalf("expected 0 hits, got %d", len(res.Hits)) + } + + // Search for documents where first_name is "Bob" AND last_name is "Johnson" + firstNameQuery = query.NewMatchQuery("Bob") + 
firstNameQuery.SetField("groups.first_name") + + lastNameQuery = query.NewMatchQuery("Johnson") + lastNameQuery.SetField("groups.last_name") + + conjQuery = query.NewConjunctionQuery([]query.Query{firstNameQuery, lastNameQuery}) + + searchReq = NewSearchRequest(conjQuery) + searchReq.SortBy([]string{"_id"}) + + res, err = idx.Search(searchReq) + if err != nil { + t.Fatalf("search failed: %v", err) + } + + if len(res.Hits) != 2 { + t.Fatalf("expected 2 hits, got %d", len(res.Hits)) + } + + if res.Hits[0].ID != "1" || res.Hits[1].ID != "2" { + t.Fatalf("unexpected hit IDs: %v, %v", res.Hits[0].ID, res.Hits[1].ID) + } + + // Search for documents where first_name is "Alice" AND last_name is "Williams" + firstNameQuery = query.NewMatchQuery("Alice") + firstNameQuery.SetField("groups.first_name") + + lastNameQuery = query.NewMatchQuery("Williams") + lastNameQuery.SetField("groups.last_name") + + conjQuery = query.NewConjunctionQuery([]query.Query{firstNameQuery, lastNameQuery}) + + searchReq = NewSearchRequest(conjQuery) + searchReq.SortBy([]string{"_id"}) + + res, err = idx.Search(searchReq) + if err != nil { + t.Fatalf("search failed: %v", err) + } + + if len(res.Hits) != 0 { + t.Fatalf("expected 0 hits, got %d", len(res.Hits)) + } + + // Search for documents where first_name is "Diana" AND last_name is "Brown" + firstNameQuery = query.NewMatchQuery("Diana") + firstNameQuery.SetField("groups.first_name") + + lastNameQuery = query.NewMatchQuery("Brown") + lastNameQuery.SetField("groups.last_name") + + conjQuery = query.NewConjunctionQuery([]query.Query{firstNameQuery, lastNameQuery}) + + searchReq = NewSearchRequest(conjQuery) + searchReq.SortBy([]string{"_id"}) + + res, err = idx.Search(searchReq) + if err != nil { + t.Fatalf("search failed: %v", err) + } + + if len(res.Hits) != 2 { + t.Fatalf("expected 2 hits, got %d", len(res.Hits)) + } + + if res.Hits[0].ID != "1" || res.Hits[1].ID != "2" { + t.Fatalf("unexpected hit IDs: %v, %v", res.Hits[0].ID, res.Hits[1].ID) + } +} From 9b6871952e410dd24234b614bd0607263de517f0 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Thu, 20 Nov 2025 18:05:58 +0530 Subject: [PATCH 07/70] bug fixes and UTs --- index_update.go | 4 ++ index_update_test.go | 130 +++++++++++++++++++++++++++++++++++++++++++ mapping/index.go | 10 +++- search_test.go | 59 ++++++++++++++++++++ 4 files changed, 202 insertions(+), 1 deletion(-) diff --git a/index_update.go b/index_update.go index 5666d035b..cdd69e458 100644 --- a/index_update.go +++ b/index_update.go @@ -180,6 +180,10 @@ func checkUpdatedMapping(ori, upd *mapping.DocumentMapping) error { return nil } + if ori.Nested != upd.Nested { + return fmt.Errorf("nested property cannot be changed") + } + var err error // Recursively go through the child mappings for name, updDMapping := range upd.Properties { diff --git a/index_update_test.go b/index_update_test.go index 5d6326576..9ae9df83a 100644 --- a/index_update_test.go +++ b/index_update_test.go @@ -3082,3 +3082,133 @@ func BenchmarkIndexUpdateText(b *testing.B) { } } } + +func TestIndexUpdateNestedMapping(t *testing.T) { + // Helper: create a mapping with optional nested structure + createCompanyMapping := func(nestedEmployees, nestedDepartments, nestedProjects, nestedLocations bool) *mapping.IndexMappingImpl { + rv := mapping.NewIndexMapping() + companyMapping := mapping.NewDocumentMapping() + + // Basic fields + companyMapping.AddFieldMappingsAt("id", mapping.NewTextFieldMapping()) + companyMapping.AddFieldMappingsAt("name", mapping.NewTextFieldMapping()) + + var 
deptMapping *mapping.DocumentMapping + // Departments nested conditionally + if !nestedDepartments { + deptMapping = mapping.NewDocumentMapping() + } else { + deptMapping = mapping.NewNestedDocumentMapping() + } + deptMapping.AddFieldMappingsAt("name", mapping.NewTextFieldMapping()) + deptMapping.AddFieldMappingsAt("budget", mapping.NewNumericFieldMapping()) + + // Employees nested conditionally + var empMapping *mapping.DocumentMapping + if !nestedEmployees { + empMapping = mapping.NewNestedDocumentMapping() + } else { + empMapping = mapping.NewDocumentMapping() + } + empMapping.AddFieldMappingsAt("name", mapping.NewTextFieldMapping()) + empMapping.AddFieldMappingsAt("role", mapping.NewTextFieldMapping()) + deptMapping.AddSubDocumentMapping("employees", empMapping) + + // Projects nested conditionally + var projMapping *mapping.DocumentMapping + if !nestedProjects { + projMapping = mapping.NewNestedDocumentMapping() + } else { + projMapping = mapping.NewDocumentMapping() + } + projMapping.AddFieldMappingsAt("title", mapping.NewTextFieldMapping()) + projMapping.AddFieldMappingsAt("status", mapping.NewTextFieldMapping()) + deptMapping.AddSubDocumentMapping("projects", projMapping) + + companyMapping.AddSubDocumentMapping("departments", deptMapping) + + // Locations nested conditionally + var locMapping *mapping.DocumentMapping + if nestedLocations { + locMapping = mapping.NewNestedDocumentMapping() + } else { + locMapping = mapping.NewDocumentMapping() + } + locMapping.AddFieldMappingsAt("address", mapping.NewTextFieldMapping()) + locMapping.AddFieldMappingsAt("city", mapping.NewTextFieldMapping()) + + companyMapping.AddSubDocumentMapping("locations", locMapping) + + rv.DefaultMapping.AddSubDocumentMapping("company", companyMapping) + return rv + } + + tests := []struct { + name string + original *mapping.IndexMappingImpl + updated *mapping.IndexMappingImpl + expectErr bool + }{ + { + name: "No nested to all nested", + original: createCompanyMapping(false, false, false, false), + updated: createCompanyMapping(true, true, true, true), + expectErr: true, + }, + { + name: "No nested to mixed nested", + original: createCompanyMapping(false, false, false, false), + updated: createCompanyMapping(true, false, true, false), + expectErr: true, + }, + { + name: "No nested to mixed nested", + original: createCompanyMapping(false, false, false, false), + updated: createCompanyMapping(true, true, true, false), + expectErr: true, + }, + { + name: "Mixed nested to no nested", + original: createCompanyMapping(false, true, false, true), + updated: createCompanyMapping(false, false, true, true), + expectErr: true, + }, + { + name: "All nested to no nested", + original: createCompanyMapping(true, true, true, true), + updated: createCompanyMapping(false, false, false, false), + expectErr: true, + }, + { + name: "Mixed nested to all nested", + original: createCompanyMapping(true, false, true, false), + updated: createCompanyMapping(true, true, true, true), + expectErr: true, + }, + { + name: "All nested to mixed nested", + original: createCompanyMapping(true, true, true, true), + updated: createCompanyMapping(true, false, true, false), + expectErr: true, + }, + { + name: "No nested to no nested", + original: createCompanyMapping(false, false, false, false), + updated: createCompanyMapping(false, false, false, false), + expectErr: false, + }, + { + name: "All nested to all nested", + original: createCompanyMapping(true, true, true, true), + updated: createCompanyMapping(true, true, true, true), + expectErr: 
false, + }, + } + + for _, test := range tests { + _, err := DeletedFields(test.original, test.updated) + if (err != nil) != test.expectErr { + t.Errorf("Test '%s' unexpected error state: got %v, expectErr %t", test.name, err, test.expectErr) + } + } +} diff --git a/mapping/index.go b/mapping/index.go index 9aabab6ca..90d993fc0 100644 --- a/mapping/index.go +++ b/mapping/index.go @@ -194,11 +194,19 @@ func (im *IndexMappingImpl) Validate() error { } } fieldAliasCtx := make(map[string]*FieldMapping) + // ensure that the nested property is not set for top-level default mapping + if im.DefaultMapping.Nested { + return fmt.Errorf("default mapping cannot be nested") + } err = im.DefaultMapping.Validate(im.cache, "", fieldAliasCtx) if err != nil { return err } - for _, docMapping := range im.TypeMapping { + for name, docMapping := range im.TypeMapping { + // ensure that the nested property is not set for top-level mappings + if docMapping.Nested { + return fmt.Errorf("type mapping named: %s cannot be nested", name) + } err = docMapping.Validate(im.cache, "", fieldAliasCtx) if err != nil { return err diff --git a/search_test.go b/search_test.go index 9372eef68..d667d5ec2 100644 --- a/search_test.go +++ b/search_test.go @@ -5298,6 +5298,7 @@ func createNestedIndexMapping() mapping.IndexMapping { return imap } + func TestNestedPrefixes(t *testing.T) { imap := createNestedIndexMapping() @@ -5448,6 +5449,10 @@ func TestNestedPrefixes(t *testing.T) { func TestNestedConjunctionQuery(t *testing.T) { imap := createNestedIndexMapping() + err := imap.Validate() + if err != nil { + t.Fatalf("expected valid nested index mapping, got error: %v", err) + } tmpIndexPath := createTmpIndexPath(t) defer cleanupTmpIndexPath(t, tmpIndexPath) idx, err := New(tmpIndexPath, imap) @@ -6123,3 +6128,57 @@ func TestNestedArrayConjunctionQuery(t *testing.T) { t.Fatalf("unexpected hit IDs: %v, %v", res.Hits[0].ID, res.Hits[1].ID) } } + +func TestValidNestedMapping(t *testing.T) { + // ensure that top-level mappings - DefaultMapping and any type mappings - cannot be nested mappings + imap := mapping.NewIndexMapping() + nestedMapping := mapping.NewNestedDocumentMapping() + imap.DefaultMapping = nestedMapping + err := imap.Validate() + if err == nil { + t.Fatalf("expected error for nested DefaultMapping, got nil") + } + // invalid nested type mapping + imap = mapping.NewIndexMapping() + imap.AddDocumentMapping("type1", nestedMapping) + err = imap.Validate() + if err == nil { + t.Fatalf("expected error for nested type mapping, got nil") + } + // valid nested mappings within DefaultMapping + imap = mapping.NewIndexMapping() + docMapping := mapping.NewDocumentMapping() + nestedMapping = mapping.NewNestedDocumentMapping() + fieldMapping := mapping.NewTextFieldMapping() + nestedMapping.AddFieldMappingsAt("field1", fieldMapping) + docMapping.AddSubDocumentMapping("nestedField", nestedMapping) + imap.DefaultMapping = docMapping + err = imap.Validate() + if err != nil { + t.Fatalf("expected valid nested mapping, got error: %v", err) + } + // valid nested mappings within type mapping + imap = mapping.NewIndexMapping() + docMapping = mapping.NewDocumentMapping() + nestedMapping = mapping.NewNestedDocumentMapping() + fieldMapping = mapping.NewTextFieldMapping() + nestedMapping.AddFieldMappingsAt("field1", fieldMapping) + docMapping.AddSubDocumentMapping("nestedField", nestedMapping) + imap.AddDocumentMapping("type1", docMapping) + err = imap.Validate() + if err != nil { + t.Fatalf("expected valid nested mapping, got error: %v", err) + } + 
// some nested type mappings + imap = mapping.NewIndexMapping() + nestedMapping = mapping.NewNestedDocumentMapping() + regularMapping := mapping.NewDocumentMapping() + imap.AddDocumentMapping("non_nested1", regularMapping) + imap.AddDocumentMapping("non_nested2", regularMapping) + imap.AddDocumentMapping("nested1", nestedMapping) + imap.AddDocumentMapping("nested2", nestedMapping) + err = imap.Validate() + if err == nil { + t.Fatalf("expected error for nested type mappings, got nil") + } +} From dddf0ab6c374b63fa41669abfe48d7ee10267c1e Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Sun, 23 Nov 2025 00:27:16 +0530 Subject: [PATCH 08/70] update readme, add docs --- README.md | 1 + docs/hierarchy.md | 370 ++++++++++++++++++++++++++++++++++++++++++++ mapping.go | 14 ++ mapping/document.go | 11 ++ 4 files changed, 396 insertions(+) create mode 100644 docs/hierarchy.md diff --git a/README.md b/README.md index 1ed7d9abe..eafd2d0e2 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,7 @@ A modern indexing + search library in GO * [geo spatial search](https://github.com/blevesearch/bleve/blob/master/geo/README.md) * approximate k-nearest neighbors via [vector search](https://github.com/blevesearch/bleve/blob/master/docs/vectors.md) * [synonym search](https://github.com/blevesearch/bleve/blob/master/docs/synonyms.md) + * [hierarchy search](https://github.com/blevesearch/bleve/blob/master/docs/hierarchy.md) * [tf-idf](https://github.com/blevesearch/bleve/blob/master/docs/scoring.md#tf-idf) / [bm25](https://github.com/blevesearch/bleve/blob/master/docs/scoring.md#bm25) scoring models * Hybrid search: exact + semantic * Supports [RRF (Reciprocal Rank Fusion) and RSF (Relative Score Fusion)](docs/score_fusion.md) diff --git a/docs/hierarchy.md b/docs/hierarchy.md new file mode 100644 index 000000000..fc28820db --- /dev/null +++ b/docs/hierarchy.md @@ -0,0 +1,370 @@ +# Hierarchy search + +* *v2.6.0* (and after) will come with support for **Array indexing and hierarchy search**. +* We've achieved this by embedding nested documents within our bleve (scorch) indexes. +* Usage of zap file format: [v17](https://github.com/blevesearch/zapx/blob/master/zap.md). Here we preserve hierarchical document relationships within segments, continuing to conform to the segmented architecture of *scorch*. + +## Supported + +* Indexing `Arrays` allows specifying fields that contain arrays of objects. Each object in the array can have its own set of fields, enabling the representation of hierarchical data structures within a single document. + + ```json + { + "id": "1", + "name": "John Doe", + "addresses": [ + { + "type": "home", + "street": "123 Main St", + "city": "Hometown", + "zip": "12345" + }, + { + "type": "work", + "street": "456 Corporate Blvd", + "city": "Metropolis", + "zip": "67890" + } + ] + } + ``` + +* Multi-level arrays: Arrays can contain objects that themselves have array fields, allowing for deeply nested structures, such as a list of projects, each with its own list of tasks. 
+ + ```json + { + "id": "2", + "name": "Jane Smith", + "projects": [ + { + "name": "Project Alpha", + "tasks": [ + {"title": "Task 1", "status": "completed"}, + {"title": "Task 2", "status": "in-progress"} + ] + }, + { + "name": "Project Beta", + "tasks": [ + {"title": "Task A", "status": "not-started"}, + {"title": "Task B", "status": "completed"} + ] + } + ] + } + ``` + +* Multiple arrays: A document can have multiple fields that are arrays, each representing different hierarchical data, such as a list of phone numbers and a list of email addresses. + + ```json + { + "id": "3", + "name": "Alice Johnson", + "phones": [ + {"type": "mobile", "number": "555-1234"}, + {"type": "home", "number": "555-5678"} + ], + "emails": [ + {"type": "personal", "address": "alice@example.com"}, + {"type": "work", "address": "alice@work.com"} + ] + } + ``` + +* Hybrid arrays: Multi-level and multiple arrays can be combined within the same document to represent complex hierarchical data structures, such as a company with multiple departments, each having its own list of employees and projects. + + ```json + { + "id": "doc1", + "company": { + "id": "c1", + "name": "TechCorp", + "departments": [ + { + "name": "Engineering", + "budget": 2000000, + "employees": [ + {"name": "Alice", "role": "Engineer"}, + {"name": "Bob", "role": "Manager"} + ], + "projects": [ + {"title": "Project X", "status": "ongoing"}, + {"title": "Project Y", "status": "completed"} + ] + }, + { + "name": "Sales", + "budget": 300000, + "employees": [ + {"name": "Eve", "role": "Salesperson"}, + {"name": "Mallory", "role": "Manager"} + ], + "projects": [ + {"title": "Project A", "status": "completed"}, + {"title": "Project B", "status": "ongoing"} + ] + } + ], + "locations": [ + {"city": "Athens","country": "Greece"}, + {"city": "Berlin","country": "USA"} + ] + } + } + ``` + +* Earlier versions of Bleve only supported flat arrays of primitive types (e.g., strings, numbers), and would flatten nested structures, losing the hierarchical relationships, so the above complex documents could not be accurately represented or queried. For example, the "employees" and "projects" fields within each department would be flattened, making it impossible to associate employees with their respective departments. + +* From v2.6.0 onwards, Bleve allows for accurate representation and querying of complex nested structures, preserving the relationships between different levels of the hierarchy, across multi-level, multiple and hybrid arrays. + +* The addition of `nested` document mappings enable defining fields that contain arrays of objects, giving the option to preserve the hierarchical relationships within the array during indexing. Having `nested` as false (default) will flatten the objects within the array, losing the hierarchy, which was the earlier behavior. + + ```json + { + "departments": { + "dynamic": false, + "enabled": true, + "nested": true, + "properties": { + "employees": { + "dynamic": false, + "enabled": true, + "nested": true + }, + "projects": { + "dynamic": false, + "enabled": true, + "nested": true + } + } + }, + "locations": { + "dynamic": false, + "enabled": true, + "nested": true + } + } + ``` + +* Any Bleve query (e.g., match, phrase, term, fuzzy, numeric/date range etc.) can be executed against fields within nested documents, with no special handling required. The query processor will automatically traverse the nested structures to find matches. 
Additional search constructs
+like vector search, synonym search, hybrid and pre-filtered vector search integrate seamlessly with hierarchy search.
+
+* Conjunction Queries (AND queries) and other queries that depend on term co-occurrence within the same hierarchical context will respect the boundaries of nested documents. This means that terms must appear within the same nested object to be considered a match. For example, a conjunction query searching for an employee named "Alice" with the role "Engineer" within the "Engineering" department will only return results where both name and role terms are found within the same employee object, which is itself within an "Engineering" department object.
+
+* Some other search constructs will have enhanced precision with hierarchy search.
+  * Fields/Highlighting: Only fields belonging to matching nested objects are eligible for field-level retrieval and highlighting. This ensures highlights appear only in the relevant hierarchical context, not anywhere else in the document. For example, if a match occurs in an employee object within a department object named "Engineering", only fields within that employee object will be highlighted, not names from other employees or unrelated fields.
+  * Aggregations/Faceting: Facets can be computed over terms that exist inside nested objects, providing more accurate, context-aware aggregation results. A facet on `departments.projects.status` will produce buckets such as `active`, `paused`, `completed` only for the matched departments, instead of aggregating project status across the entire company document.
+  * Sorting: Sorting can be applied using fields from nested objects. This allows sorting results based on values inside the appropriate nested structure. For example, sorting companies by `departments.budget` (descending) will order documents based on the budget of the specific department involved in the match, rather than the overall document or unrelated departments.
+
+## Indexing
+
+Below is an example of using the Bleve API to index documents with hierarchical structures, using hybrid arrays and nested mappings.
+
+```go
+// Define a document to be indexed. 
+docJSON := + `{ + "company": { + "id": "c3", + "name": "WebSolutions", + "departments": [ + { + "name": "HR", + "budget": 800000, + "employees": [ + {"name": "Eve", "role": "Manager"}, + {"name": "Frank", "role": "HR"} + ], + "projects": [ + {"title": "Project Beta", "status": "completed"}, + {"title": "Project B", "status": "ongoing"} + ] + }, + { + "name": "Engineering", + "budget": 200000, + "employees": [ + {"name": "Heidi", "role": "Support Engineer"}, + {"name": "Ivan", "role": "Manager"} + ], + "projects": [ + {"title": "Project Helpdesk", "status": "ongoing"}, + {"title": "Project FAQ", "status": "completed"} + ] + } + ], + "locations": [ + {"city": "Edinburgh", "country": "UK"}, + {"city": "London", "country": "Canada"} + ] + } + }` + +// Define departments as a nested document mapping (since it contains arrays of objects) +// and index name and budget fields +departmentsMapping := bleve.NewNestedDocumentMapping() +departmentsMapping.AddFieldMappingsAt("name", bleve.NewTextFieldMapping()) +departmentsMapping.AddFieldMappingsAt("budget", bleve.NewNumericFieldMapping()) + +// Define employees as a nested document mapping within departments (since it contains arrays of objects) +// and index name and role fields +employeesMapping := bleve.NewNestedDocumentMapping() +employeesMapping.AddFieldMappingsAt("name", bleve.NewTextFieldMapping()) +employeesMapping.AddFieldMappingsAt("role", bleve.NewTextFieldMapping()) +departmentsMapping.AddSubDocumentMapping("employees", employeesMapping) + +// Define projects as a nested document mapping within departments (since it contains arrays of objects) +// and index title and status fields +projectsMapping := bleve.NewNestedDocumentMapping() +projectsMapping.AddFieldMappingsAt("title", bleve.NewTextFieldMapping()) +projectsMapping.AddFieldMappingsAt("status", bleve.NewTextFieldMapping()) +departmentsMapping.AddSubDocumentMapping("projects", projectsMapping) + +// Define locations as a nested document mapping (since it contains arrays of objects) +// and index city and country fields +locationsMapping := bleve.NewNestedDocumentMapping() +locationsMapping.AddFieldMappingsAt("city", bleve.NewTextFieldMapping()) +locationsMapping.AddFieldMappingsAt("country", bleve.NewTextFieldMapping()) + +// Define company as a document mapping and index its name field and +// add departments and locations as sub-document mappings +companyMapping := bleve.NewDocumentMapping() +companyMapping.AddFieldMappingsAt("name", bleve.NewTextFieldMapping()) +companyMapping.AddSubDocumentMapping("departments", departmentsMapping) +companyMapping.AddSubDocumentMapping("locations", locationsMapping) + +// Define the final index mapping and add company as a sub-document mapping in the default mapping +indexMapping := bleve.NewIndexMapping() +indexMapping.DefaultMapping.AddSubDocumentMapping("company", companyMapping) + +// Create the index with the defined mapping +index, err := bleve.New("hierarchy_example.bleve", indexMapping) +if err != nil { + panic(err) +} + +// Unmarshal the document JSON into a map, for indexing +var doc map[string]interface{} +err = json.Unmarshal([]byte(docJSON), &doc) +if err != nil { + panic(err) +} + +// Index the document +err = index.Index("doc1", doc) +if err != nil { + panic(err) +} +``` + +## Querying + +```go +// Open the index +index, err := bleve.Open("hierarchy_example.bleve") +if err != nil { + panic(err) +} + +var ( + req *bleve.SearchRequest + res *bleve.SearchResult +) + +// Example 1: Simple Match Query on a field within a nested 
document, should work as if it were a flat field +q1 := bleve.NewMatchQuery("Engineer") +q1.SetField("company.departments.employees.role") +req = bleve.NewSearchRequest(q1) +res, err = index.Search(req) +if err != nil { + panic(err) +} +fmt.Println("Match Query Results:", res) + +// Example 2: Conjunction Query (AND) on fields within the same nested document +// like finding employees with name "Eve" and role "Manager". This will only match +// if both terms are in the same employee object. +q1 = bleve.NewMatchQuery("Eve") +q1.SetField("company.departments.employees.name") +q2 := bleve.NewMatchQuery("Manager") +q2.SetField("company.departments.employees.role") +conjQuery := bleve.NewConjunctionQuery( + q1, + q2, +) +req = bleve.NewSearchRequest(conjQuery) +res, err = index.Search(req) +if err != nil { + panic(err) +} +fmt.Println("Conjunction Query Results:", res) + +// Example 3: Multi-level Nested Query, finding projects with status "ongoing" +// within the "Engineering" department. This ensures both conditions are met +// within the correct hierarchy, i.e., the ongoing project must belong to the +// Engineering department. +q1 = bleve.NewMatchQuery("Engineering") +q1.SetField("company.departments.name") +q2 = bleve.NewMatchQuery("ongoing") +q2.SetField("company.departments.projects.status") +multiLevelQuery := bleve.NewConjunctionQuery( + q1, + q2, +) +req = bleve.NewSearchRequest(multiLevelQuery) +res, err = index.Search(req) +if err != nil { + panic(err) +} +fmt.Println("Multi-level Nested Query Results:", res) + +// Example 4: Multiple Arrays Query, finding documents with a location in "London" +// and an employee with the role "Manager". This checks conditions across different arrays. +q1 = bleve.NewMatchQuery("London") +q1.SetField("company.locations.city") +q2 = bleve.NewMatchQuery("Manager") +q2.SetField("company.departments.employees.role") +multiArrayQuery := bleve.NewConjunctionQuery( + q1, + q2, +) +req = bleve.NewSearchRequest(multiArrayQuery) +res, err = index.Search(req) +if err != nil { + panic(err) +} +fmt.Println("Multiple Arrays Query Results:", res) + +// Hybrid Arrays Query, combining multi-level and multiple arrays, +// finding documents with a Manager named Ivan working in Edinburgh, UK +q1 = bleve.NewMatchQuery("Ivan") +q1.SetField("company.departments.employees.name") +q2 = bleve.NewMatchQuery("Manager") +q2.SetField("company.departments.employees.role") +q3 := bleve.NewMatchQuery("Edinburgh") +q3.SetField("company.locations.city") +q4 := bleve.NewMatchQuery("UK") +q4.SetField("company.locations.country") +hybridArrayQuery := bleve.NewConjunctionQuery( + bleve.NewConjunctionQuery( + q1, + q2, + ), + bleve.NewConjunctionQuery( + q3, + q4, + ), +) +req = bleve.NewSearchRequest(hybridArrayQuery) +res, err = index.Search(req) +if err != nil { + panic(err) +} +fmt.Println("Hybrid Arrays Query Results:", res) + +// Close the index when done +err = index.Close() +if err != nil { + panic(err) +} +``` diff --git a/mapping.go b/mapping.go index 723105a29..af02db386 100644 --- a/mapping.go +++ b/mapping.go @@ -34,6 +34,20 @@ func NewDocumentStaticMapping() *mapping.DocumentMapping { return mapping.NewDocumentStaticMapping() } +// NewNestedDocumentMapping returns a new document mapping +// that will treat all objects as nested documents. 
+func NewNestedDocumentMapping() *mapping.DocumentMapping { + return mapping.NewNestedDocumentMapping() +} + +// NewNestedDocumentStaticMapping returns a new document mapping +// that will treat all objects as nested documents and +// will not automatically index parts of a nested document +// without an explicit mapping. +func NewNestedDocumentStaticMapping() *mapping.DocumentMapping { + return mapping.NewNestedDocumentStaticMapping() +} + // NewDocumentDisabledMapping returns a new document // mapping that will not perform any indexing. func NewDocumentDisabledMapping() *mapping.DocumentMapping { diff --git a/mapping/document.go b/mapping/document.go index 15ab668f4..67353afc4 100644 --- a/mapping/document.go +++ b/mapping/document.go @@ -256,6 +256,17 @@ func NewDocumentStaticMapping() *DocumentMapping { } } +// NewNestedDocumentStaticMapping returns a new document +// mapping that treats sub-documents as nested +// objects and will not automatically index parts +// of the nested document without an explicit mapping. +func NewNestedDocumentStaticMapping() *DocumentMapping { + return &DocumentMapping{ + Enabled: true, + Nested: true, + } +} + // NewDocumentDisabledMapping returns a new document // mapping that will not perform any indexing. func NewDocumentDisabledMapping() *DocumentMapping { From 78832331a8e7739d0f7c8007bf0d9510a1a5218e Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Sun, 23 Nov 2025 00:35:54 +0530 Subject: [PATCH 09/70] fix render --- docs/hierarchy.md | 230 +++++++++++++++++++++++----------------------- 1 file changed, 115 insertions(+), 115 deletions(-) diff --git a/docs/hierarchy.md b/docs/hierarchy.md index fc28820db..9518b899e 100644 --- a/docs/hierarchy.md +++ b/docs/hierarchy.md @@ -8,110 +8,110 @@ * Indexing `Arrays` allows specifying fields that contain arrays of objects. Each object in the array can have its own set of fields, enabling the representation of hierarchical data structures within a single document. - ```json - { - "id": "1", - "name": "John Doe", - "addresses": [ - { - "type": "home", - "street": "123 Main St", - "city": "Hometown", - "zip": "12345" - }, - { - "type": "work", - "street": "456 Corporate Blvd", - "city": "Metropolis", - "zip": "67890" - } - ] - } - ``` +```json +{ + "id": "1", + "name": "John Doe", + "addresses": [ + { + "type": "home", + "street": "123 Main St", + "city": "Hometown", + "zip": "12345" + }, + { + "type": "work", + "street": "456 Corporate Blvd", + "city": "Metropolis", + "zip": "67890" + } + ] +} +``` * Multi-level arrays: Arrays can contain objects that themselves have array fields, allowing for deeply nested structures, such as a list of projects, each with its own list of tasks. - ```json - { - "id": "2", - "name": "Jane Smith", - "projects": [ +```json +{ + "id": "2", + "name": "Jane Smith", + "projects": [ + { + "name": "Project Alpha", + "tasks": [ + {"title": "Task 1", "status": "completed"}, + {"title": "Task 2", "status": "in-progress"} + ] + }, + { + "name": "Project Beta", + "tasks": [ + {"title": "Task A", "status": "not-started"}, + {"title": "Task B", "status": "completed"} + ] + } + ] +} +``` + +* Multiple arrays: A document can have multiple fields that are arrays, each representing different hierarchical data, such as a list of phone numbers and a list of email addresses. 
+ +```json +{ + "id": "3", + "name": "Alice Johnson", + "phones": [ + {"type": "mobile", "number": "555-1234"}, + {"type": "home", "number": "555-5678"} + ], + "emails": [ + {"type": "personal", "address": "alice@example.com"}, + {"type": "work", "address": "alice@work.com"} + ] +} +``` + +* Hybrid arrays: Multi-level and multiple arrays can be combined within the same document to represent complex hierarchical data structures, such as a company with multiple departments, each having its own list of employees and projects. + +```json +{ + "id": "doc1", + "company": { + "id": "c1", + "name": "TechCorp", + "departments": [ { - "name": "Project Alpha", - "tasks": [ - {"title": "Task 1", "status": "completed"}, - {"title": "Task 2", "status": "in-progress"} + "name": "Engineering", + "budget": 2000000, + "employees": [ + {"name": "Alice", "role": "Engineer"}, + {"name": "Bob", "role": "Manager"} + ], + "projects": [ + {"title": "Project X", "status": "ongoing"}, + {"title": "Project Y", "status": "completed"} ] }, { - "name": "Project Beta", - "tasks": [ - {"title": "Task A", "status": "not-started"}, - {"title": "Task B", "status": "completed"} + "name": "Sales", + "budget": 300000, + "employees": [ + {"name": "Eve", "role": "Salesperson"}, + {"name": "Mallory", "role": "Manager"} + ], + "projects": [ + {"title": "Project A", "status": "completed"}, + {"title": "Project B", "status": "ongoing"} ] } - ] - } - ``` - -* Multiple arrays: A document can have multiple fields that are arrays, each representing different hierarchical data, such as a list of phone numbers and a list of email addresses. - - ```json - { - "id": "3", - "name": "Alice Johnson", - "phones": [ - {"type": "mobile", "number": "555-1234"}, - {"type": "home", "number": "555-5678"} ], - "emails": [ - {"type": "personal", "address": "alice@example.com"}, - {"type": "work", "address": "alice@work.com"} + "locations": [ + {"city": "Athens","country": "Greece"}, + {"city": "Berlin","country": "USA"} ] } - ``` - -* Hybrid arrays: Multi-level and multiple arrays can be combined within the same document to represent complex hierarchical data structures, such as a company with multiple departments, each having its own list of employees and projects. - - ```json - { - "id": "doc1", - "company": { - "id": "c1", - "name": "TechCorp", - "departments": [ - { - "name": "Engineering", - "budget": 2000000, - "employees": [ - {"name": "Alice", "role": "Engineer"}, - {"name": "Bob", "role": "Manager"} - ], - "projects": [ - {"title": "Project X", "status": "ongoing"}, - {"title": "Project Y", "status": "completed"} - ] - }, - { - "name": "Sales", - "budget": 300000, - "employees": [ - {"name": "Eve", "role": "Salesperson"}, - {"name": "Mallory", "role": "Manager"} - ], - "projects": [ - {"title": "Project A", "status": "completed"}, - {"title": "Project B", "status": "ongoing"} - ] - } - ], - "locations": [ - {"city": "Athens","country": "Greece"}, - {"city": "Berlin","country": "USA"} - ] - } - } - ``` +} +``` * Earlier versions of Bleve only supported flat arrays of primitive types (e.g., strings, numbers), and would flatten nested structures, losing the hierarchical relationships, so the above complex documents could not be accurately represented or queried. For example, the "employees" and "projects" fields within each department would be flattened, making it impossible to associate employees with their respective departments. 
@@ -119,32 +119,32 @@ * The addition of `nested` document mappings enable defining fields that contain arrays of objects, giving the option to preserve the hierarchical relationships within the array during indexing. Having `nested` as false (default) will flatten the objects within the array, losing the hierarchy, which was the earlier behavior. - ```json - { - "departments": { - "dynamic": false, - "enabled": true, - "nested": true, - "properties": { - "employees": { - "dynamic": false, - "enabled": true, - "nested": true - }, - "projects": { - "dynamic": false, - "enabled": true, - "nested": true - } +```json +{ + "departments": { + "dynamic": false, + "enabled": true, + "nested": true, + "properties": { + "employees": { + "dynamic": false, + "enabled": true, + "nested": true + }, + "projects": { + "dynamic": false, + "enabled": true, + "nested": true } - }, - "locations": { - "dynamic": false, - "enabled": true, - "nested": true } + }, + "locations": { + "dynamic": false, + "enabled": true, + "nested": true } - ``` +} +``` * Any Bleve query (e.g., match, phrase, term, fuzzy, numeric/date range etc.) can be executed against fields within nested documents, with no special handling required. The query processor will automatically traverse the nested structures to find matches. Additional search constructs like vector search, synonym search, hybrid and pre-filtered vector search integrate seamlessly with hierarchy search. From 7b058a650ea7a613f49b6193fef43de1454873e6 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Sun, 23 Nov 2025 00:44:45 +0530 Subject: [PATCH 10/70] lint fix --- docs/index_update.md | 12 ++++++---- docs/pagination.md | 9 ++++---- docs/persister.md | 53 ++++++++++++++++++++++---------------------- 3 files changed, 40 insertions(+), 34 deletions(-) diff --git a/docs/index_update.md b/docs/index_update.md index f736dde73..34cb69038 100644 --- a/docs/index_update.md +++ b/docs/index_update.md @@ -10,11 +10,13 @@ While opening an index, if an updated mapping is provided as a string under the If the update fails, the index is unchanged and an error is returned explaining why the update was unsuccessful. ## What can be deleted and what can't be deleted? + Fields can be partially deleted by changing their Index, Store, and DocValues parameters from true to false, or completely removed by deleting the field itself. Additionally, document mappings can be deleted either by fully removing them from the index mapping or by setting the Enabled value to false, which deletes all fields defined within that mapping. However, if any of the following conditions are met, the index is considered non-updatable. + * Any additional fields or enabled document mappings in the new index mapping * Any changes to IncludeInAll, type, IncludeTermVectors and SkipFreqNorm * Any document mapping having it's enabled value changing from false to true @@ -26,16 +28,18 @@ However, if any of the following conditions are met, the index is considered non * If multiple fields sharing the same field name either from different type mappings or aliases are present, then any non compatible changes across all of these fields ## How to enforce immediate deletion? + Since the deletion is only done during merging, a [force merge](https://github.com/blevesearch/bleve/blob/b82baf10b205511cf12da5cb24330abd9f5b1b74/index/scorch/merge.go#L164) may be used to completely remove the stale data. 
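+
+A minimal, hypothetical sketch of triggering that force merge from application code is shown below. It assumes the internal scorch index is reachable via the bleve `Advanced()` accessor and that the routine linked above is exposed as `ForceMerge`; confirm the exact name and signature against `index/scorch/merge.go` before relying on this.
+
+```go
+// Illustrative only: purge stale data right after updating the mapping.
+internal, err := index.Advanced() // unwrap the bleve.Index into the internal index.Index
+if err != nil {
+	panic(err)
+}
+if s, ok := internal.(*scorch.Scorch); ok {
+	// Assumed call shape; a nil merge-plan may not be accepted (see index/scorch/merge.go).
+	if err := s.ForceMerge(context.Background(), nil); err != nil {
+		panic(err)
+	}
+}
+```
+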
## Sample code to update an existing index -``` + +```go newMapping := `` config := map[string]interface{}{ - "updated_mapping": newMapping + "updated_mapping": newMapping, } -index, err := OpenUsing("", config) +index, err := bleve.OpenUsing("", config) if err != nil { - return err + panic(err) } ``` diff --git a/docs/pagination.md b/docs/pagination.md index 80bae753b..fc3adb848 100644 --- a/docs/pagination.md +++ b/docs/pagination.md @@ -2,7 +2,7 @@ ## Why pagination matters -Search queries can match many documents. Pagination lets you fetch and display results in chunks, keeping responses small and fast. +Search queries can match many documents. Pagination lets you fetch and display results in chunks, keeping responses small and fast. By default, Bleve returns the first 10 hits sorted by relevance (score), highest first. @@ -48,7 +48,7 @@ Rules: Where do sort keys come from? -- Each hit includes `Sort` (and `DecodedSort` from Bleve v2.5.2). Take the last hit’s sort keys for `SearchAfter`, or the first hit’s sort keys for `SearchBefore`. +- Each hit includes `Sort` (and `DecodedSort` from Bleve v2.5.2). Take the last hit’s sort keys for `SearchAfter`, or the first hit's sort keys for `SearchBefore`. - If the field/fields to be searched over is numeric, datetime or geo, the values in the `Sort` field may have garbled values; this is because of how Bleve represents such data types internally. To use such fields as sort keys, use the `DecodedSort` field, which decodes the internal representations. This feature is available from Bleve v2.5.4. > When using `DecodedSort`, the `Sort` array in the search request needs to explicitly declare the type of the field for proper decoding. Hence, the `Sort` array must contain either `SortField` objects (for numeric and datetime) or `SortGeoDistance` objects (for geo) rather than just the field names. More info on `SortField` and `SortGeoDistance` can be found in [sort_facet.md](sort_facet.md). @@ -76,6 +76,7 @@ Backward pagination over `_id` and `_score`: ``` Pagination using numeric, datetime and geo fields. Notice how we specify the sort objects, with the "type" field explicitly declared in case of numeric and datetime: + ```json { "query": { @@ -89,8 +90,8 @@ Pagination using numeric, datetime and geo fields. Notice how we specify the sor ], "search_after": ["99.99", "2023-10-15T10:30:00Z", "5.2"] } - ``` + ## Total Sort Order Pagination is deterministic. Ensure your `Sort` defines a total order, so that documents with the same sort keys are not left out: @@ -105,4 +106,4 @@ Pagination is deterministic. Ensure your `Sort` defines a total order, so that d - Offset pagination cost grows with `From` (collects at least `Size + From` results before slicing). - `SearchAfter`/`SearchBefore` keeps memory and network proportional to `Size`. -- For large datasets and deep navigation, prefer using `SearchAfter` and `SearchBefore`. \ No newline at end of file +- For large datasets and deep navigation, prefer using `SearchAfter` and `SearchBefore`. diff --git a/docs/persister.md b/docs/persister.md index 0b5447faf..2f54919d8 100644 --- a/docs/persister.md +++ b/docs/persister.md @@ -2,7 +2,7 @@ ## Memory Management -When data is indexed in Scorch — using either the `index.Index()` or `index.Batch()` API — it is added as part of an in-memory "segment". Memory management in Scorch indexing mainly relates to handling these in-memory segments during workloads that involve inserts or updates. 
+When data is indexed in Scorch — using either the `index.Index()` or `index.Batch()` API — it is added as part of an in-memory "segment". Memory management in Scorch indexing mainly relates to handling these in-memory segments during workloads that involve inserts or updates. In scenarios with a continuous stream of incoming data, a large number of in-memory segments can accumulate over time. This is where the persister component comes into play—its job is to flush these in-memory segments to disk. @@ -11,27 +11,28 @@ Starting with v2.5.0, Scorch supports parallel flushing of in-memory segments to - `NumPersisterWorkers`: This factor decides how many maximum workers can be spawned to flush out the in-memory segments. Each worker will work on a disjoint subset of segments, merge them, and flush them out to the disk. By default the persister deploys only one worker. - `MaxSizeInMemoryMergePerWorker`: This config decides what's the maximum amount of input data in bytes a single worker can work upon. By default this value is equal to 0 which means that this config is disabled and the worker tries to merge all the data in one shot. Also note that it's imperative that the user set this config if `NumPersisterWorkers > 1`. -If the index is tuned to have a higher `NumPersisterWorkers` value, the memory can potentially drain out faster and ensure stronger consistency behaviour — but there would be a lot of on-disk files, and the background merger would experience the pressure of managing this large number of files, which can be resource-intensive. - - Tuning this config is very dependent on the available CPU resources, and something to keep in mind here is that the process's RSS can increase if the number of workers — and each of them working upon a large amount of data — is high. +If the index is tuned to have a higher `NumPersisterWorkers` value, the memory can potentially drain out faster and ensure stronger consistency behaviour — but there would be a lot of on-disk files, and the background merger would experience the pressure of managing this large number of files, which can be resource-intensive. -Increasing the `MaxSizeInMemoryMergePerWorker` value would mean that each worker acts upon a larger amount of data and spends more time merging and flushing it out to disk — which can be healthy behaviour in terms of I/O, although it comes at the cost of time. -- Changing this config is usecase dependent, for example in usecases where the payload or per doc size is generally large in size (for eg vector usecases), it would be beneficial to have a larger value for this. +- Tuning this config is very dependent on the available CPU resources, and something to keep in mind here is that the process's RSS can increase if the number of workers — and each of them working upon a large amount of data — is high. -So, having the ideal values for these two configs is definitely dependent on the use case and can involve a bunch of experiments, keeping the resource usage in mind. +Increasing the `MaxSizeInMemoryMergePerWorker` value would mean that each worker acts upon a larger amount of data and spends more time merging and flushing it out to disk — which can be healthy behaviour in terms of I/O, although it comes at the cost of time. +- Changing this config is usecase dependent, for example in usecases where the payload or per doc size is generally large in size (for eg vector usecases), it would be beneficial to have a larger value for this. 
+ +So, having the ideal values for these two configs is definitely dependent on the use case and can involve a bunch of experiments, keeping the resource usage in mind. ## File Management -The persister introducing some number of file segments into the system would change the state of the system, and the merger would wake up and try to manage these on-disk files. +The persister introducing some number of file segments into the system would change the state of the system, and the merger would wake up and try to manage these on-disk files. -Management of these files is crucial when it comes to query latency because a higher number of files would dictate searching through a larger number of files and also higher read amplification to some extent, because the backing data structures can potentially be compacted in size across files. +Management of these files is crucial when it comes to query latency because a higher number of files would dictate searching through a larger number of files and also higher read amplification to some extent, because the backing data structures can potentially be compacted in size across files. -The merger sees the files on disk and plans out which segments to merge so that the final layout of segment tiers (each tier having multiple files), which grow in a logarithmic way (the chances of larger tiers growing in number would decrease), is maintained. This also implies that deciding this first-tier size becomes important in deciding the number of segment files across all tiers. +The merger sees the files on disk and plans out which segments to merge so that the final layout of segment tiers (each tier having multiple files), which grow in a logarithmic way (the chances of larger tiers growing in number would decrease), is maintained. This also implies that deciding this first-tier size becomes important in deciding the number of segment files across all tiers. -Starting with v2.5.0, this first-tier size is dependent on the file size using the `FloorSegmentFileSize` config, because that's a better metric to consider (unlike the legacy live doc count metric) in order to ensure that the behaviour is in line with the use case and aware of the payload/doc size. -- This config can also be tuned to dictate how the I/O behaviour should be within an index. While tuning this config, it should be in proportion to the `MaxSizeInMemoryMergePerWorker` since that dictates the amount of data flushed out per flush. -- The observation here is that `FloorSegmentFileSize` is lesser than `MaxSizeInMemoryMergePerWorker` and for an optimal I/O during indexing, this value can be set close to `MaxSizeInMemoryMergePerWorker/6`. +Starting with v2.5.0, this first-tier size is dependent on the file size using the `FloorSegmentFileSize` config, because that's a better metric to consider (unlike the legacy live doc count metric) in order to ensure that the behaviour is in line with the use case and aware of the payload/doc size. +- This config can also be tuned to dictate how the I/O behaviour should be within an index. While tuning this config, it should be in proportion to the `MaxSizeInMemoryMergePerWorker` since that dictates the amount of data flushed out per flush. +- The observation here is that `FloorSegmentFileSize` is lesser than `MaxSizeInMemoryMergePerWorker` and for an optimal I/O during indexing, this value can be set close to `MaxSizeInMemoryMergePerWorker/6`. 
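+
+As a rough worked example (numbers are purely illustrative): a worker flushing up to `MaxSizeInMemoryMergePerWorker` = 120000000 (about 120 MB) would pair with a `FloorSegmentFileSize` of roughly 20000000 (about 20 MB), keeping the two close to that `MaxSizeInMemoryMergePerWorker/6` proportion.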
## Setting a Persister/Merger Config in Index @@ -39,18 +40,18 @@ The configs are set via the `kvConfig` parameter in the `NewUsing()` or `OpenUsi ```go // setting the persister and merger configs - kvConfig := map[string]interface{}{ - "scorchPersisterOptions": map[string]interface{}{ - "NumPersisterWorkers": 4, - "MaxSizeInMemoryMergePerWorker": 20000000, - }, - "scorchMergePlanOptions": map[string]interface{}{ - "FloorSegmentFileSize": 10000000, - }, - } - // passing the config to the index - index, err := bleve.NewUsing("example.bleve", bleve.NewIndexMapping(), bleve.Config.DefaultIndexType, bleve.Config.DefaultMemKVStore, kvConfig) - if err != nil { - panic(err) - } + kvConfig := map[string]interface{}{ + "scorchPersisterOptions": map[string]interface{}{ + "NumPersisterWorkers": 4, + "MaxSizeInMemoryMergePerWorker": 20000000, + }, + "scorchMergePlanOptions": map[string]interface{}{ + "FloorSegmentFileSize": 10000000, + }, + } + // passing the config to the index + index, err := bleve.NewUsing("example.bleve", bleve.NewIndexMapping(), bleve.Config.DefaultIndexType, bleve.Config.DefaultMemKVStore, kvConfig) + if err != nil { + panic(err) + } ``` From 311f57850f83426b3955fd777a3515dcd1a30210 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Tue, 25 Nov 2025 18:19:01 +0530 Subject: [PATCH 11/70] bug fixes --- search/explanation.go | 3 + search/scorer/scorer_conjunction_nested.go | 83 ----- search/search.go | 17 + search/searcher/search_conjunction_nested.go | 325 +++++++++---------- 4 files changed, 178 insertions(+), 250 deletions(-) delete mode 100644 search/scorer/scorer_conjunction_nested.go diff --git a/search/explanation.go b/search/explanation.go index 38273fecb..6568ba734 100644 --- a/search/explanation.go +++ b/search/explanation.go @@ -70,6 +70,9 @@ func MergeExpl(first, second *Explanation) *Explanation { if second == nil { return first } + if first == second { + return first + } if first.Message == MergedExplMessage { // reuse first explanation as the merged one first.Value += second.Value diff --git a/search/scorer/scorer_conjunction_nested.go b/search/scorer/scorer_conjunction_nested.go deleted file mode 100644 index a2b366fba..000000000 --- a/search/scorer/scorer_conjunction_nested.go +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (c) 2025 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package scorer - -import ( - "reflect" - "slices" - - "github.com/blevesearch/bleve/v2/search" - "github.com/blevesearch/bleve/v2/size" - index "github.com/blevesearch/bleve_index_api" -) - -var reflectStaticSizeNestedConjunctionQueryScorer int - -func init() { - var ncqs NestedConjunctionQueryScorer - reflectStaticSizeNestedConjunctionQueryScorer = int(reflect.TypeOf(ncqs).Size()) -} - -type NestedConjunctionQueryScorer struct { - options search.SearcherOptions -} - -func (s *NestedConjunctionQueryScorer) Size() int { - return reflectStaticSizeNestedConjunctionQueryScorer + size.SizeOfPtr -} - -func NewNestedConjunctionQueryScorer(options search.SearcherOptions) *NestedConjunctionQueryScorer { - return &NestedConjunctionQueryScorer{ - options: options, - } -} - -func (s *NestedConjunctionQueryScorer) Score(ctx *search.SearchContext, constituents []*search.DocumentMatch, - ancestry [][]index.IndexInternalID, joinIdx int) (*search.DocumentMatch, error) { - // Find the constituent with the shortest effective depth. - lcaIdx := 0 - lcaDepth := computeDepth(ancestry[0], joinIdx) - - for i := 1; i < len(ancestry); i++ { - d := computeDepth(ancestry[i], joinIdx) - if d < lcaDepth { - lcaDepth = d - lcaIdx = i - } - } - - // Clone the LCA document ID and start a fresh DocumentMatch. - lcaDocID := constituents[lcaIdx].IndexInternalID - result := &search.DocumentMatch{ - IndexInternalID: slices.Clone(lcaDocID), - } - - // Merge all constituents into the new match. - for _, dm := range constituents { - if err := result.MergeWith(dm); err != nil { - return nil, err - } - } - - return result, nil -} - -// computeDepth returns the depth considered for LCA selection. -func computeDepth(anc []index.IndexInternalID, joinIdx int) int { - if len(anc) <= joinIdx { - return len(anc) - } - return joinIdx + 1 -} diff --git a/search/search.go b/search/search.go index 8df5b43a5..ebc1af3b5 100644 --- a/search/search.go +++ b/search/search.go @@ -492,6 +492,23 @@ type NestedDocumentMatch struct { Fragments FieldFragmentMap `json:"fragments,omitempty"` } +func (ndm *NestedDocumentMatch) String() string { + rv := "\n" + for fragmentField, fragments := range ndm.Fragments { + rv += fmt.Sprintf("\t%s\n", fragmentField) + for _, fragment := range fragments { + rv += fmt.Sprintf("\t\t%s\n", fragment) + } + } + for otherFieldName, otherFieldValue := range ndm.Fields { + if _, ok := ndm.Fragments[otherFieldName]; !ok { + rv += fmt.Sprintf("\t%s\n", otherFieldName) + rv += fmt.Sprintf("\t\t%v\n", otherFieldValue) + } + } + return rv +} + // NewNestedDocumentMatch creates a new NestedDocumentMatch instance // with the given fields and fragments func NewNestedDocumentMatch(fields map[string]interface{}, fragments FieldFragmentMap) *NestedDocumentMatch { diff --git a/search/searcher/search_conjunction_nested.go b/search/searcher/search_conjunction_nested.go index d520ef844..d387e10f2 100644 --- a/search/searcher/search_conjunction_nested.go +++ b/search/searcher/search_conjunction_nested.go @@ -21,7 +21,6 @@ import ( "reflect" "github.com/blevesearch/bleve/v2/search" - "github.com/blevesearch/bleve/v2/search/scorer" "github.com/blevesearch/bleve/v2/size" index "github.com/blevesearch/bleve_index_api" ) @@ -39,11 +38,11 @@ type NestedConjunctionSearcher struct { queryNorm float64 currs []*search.DocumentMatch currAncestors [][]index.IndexInternalID - pivotIDx int - scorer *scorer.NestedConjunctionQueryScorer + currKeys []index.IndexInternalID initialized bool joinIdx int options search.SearcherOptions + docQueue 
*CoalesceQueue } func NewNestedConjunctionSearcher(ctx context.Context, indexReader index.IndexReader, @@ -62,66 +61,15 @@ func NewNestedConjunctionSearcher(ctx context.Context, indexReader index.IndexRe searchers: searchers, currs: make([]*search.DocumentMatch, len(searchers)), currAncestors: make([][]index.IndexInternalID, len(searchers)), - scorer: scorer.NewNestedConjunctionQueryScorer(options), + currKeys: make([]index.IndexInternalID, len(searchers)), joinIdx: joinIdx, + docQueue: NewCoalesceQueue(), } rv.computeQueryNorm() return &rv, nil } -// getTargetAncestor returns the appropriate ancestor ID for the given joinIdx -// if the ancestry chain is shallower than joinIdx, it returns the deepest ancestor -// otherwise it returns the ancestor at joinIdx level from the top-most ancestor -func getTargetAncestor(ancestors []index.IndexInternalID, joinIdx int) index.IndexInternalID { - if len(ancestors) > joinIdx { - return ancestors[len(ancestors)-joinIdx-1] - } - return ancestors[len(ancestors)-1] -} - -func (s *NestedConjunctionSearcher) initSearchers(ctx *search.SearchContext) (bool, error) { - var err error - // get all searchers pointing at their first match - for i, searcher := range s.searchers { - if s.currs[i] != nil { - ctx.DocumentMatchPool.Put(s.currs[i]) - } - s.currs[i], err = searcher.Next(ctx) - if err != nil { - return false, err - } - if s.currs[i] == nil { - // one of the searchers is exhausted, so we are done - return true, nil - } - // get the ancestry chain for this match - s.currAncestors[i], err = s.nestedReader.Ancestors(s.currs[i].IndexInternalID) - if err != nil { - return false, err - } - } - // scan the ancestry chains for all searchers to get the pivotIDx - // the pivot will be the searcher with the longest ancestry chain - // if there are multiple with the same length, pick the one with - // the highest docID - s.pivotIDx = 0 - pivotLength := len(s.currAncestors[0]) - for i := 1; i < len(s.searchers); i++ { - if len(s.currAncestors[i]) > pivotLength { - s.pivotIDx = i - pivotLength = len(s.currAncestors[i]) - } else if len(s.currAncestors[i]) == pivotLength { - // if same length, pick the one with the highest docID - if s.currs[i].IndexInternalID.Compare(s.currs[s.pivotIDx].IndexInternalID) > 0 { - s.pivotIDx = i - } - } - } - s.initialized = true - return false, nil -} - func (s *NestedConjunctionSearcher) computeQueryNorm() { // first calculate sum of squared weights sumOfSquaredWeights := 0.0 @@ -137,8 +85,7 @@ func (s *NestedConjunctionSearcher) computeQueryNorm() { } func (s *NestedConjunctionSearcher) Size() int { - sizeInBytes := reflectStaticSizeNestedConjunctionSearcher + size.SizeOfPtr + - s.scorer.Size() + sizeInBytes := reflectStaticSizeNestedConjunctionSearcher + size.SizeOfPtr for _, entry := range s.searchers { sizeInBytes += entry.Size() @@ -199,109 +146,72 @@ func (s *NestedConjunctionSearcher) DocumentMatchPoolSize() int { } func (s *NestedConjunctionSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch, error) { + var err error + // initialize on first call to Next, by getting first match + // from each searcher and their ancestry chains if !s.initialized { - exhausted, err := s.initSearchers(ctx) - if err != nil { - return nil, err - } - if exhausted { - return nil, nil - } - } - // we have the pivot searcher, now try to align all the others to it, using the racecar algorithm, - // basically - the idea is simple - we first check if the pivot searcher's indexInternalID - // is behind any of the other searchers, and if so, we are 
sure that the pivot searcher - // cannot be part of a match, so we advance it to the maximum of the other searchers. - // Now once the pivot searcher is ahead of all the other searchers, we advance all the other - // searchers to the corresponding ancestor of the pivot searcher, if all of them align on the correct - // ancestor, we have a match, otherwise we repeat the process. - for { - pivotSearcher := s.searchers[s.pivotIDx] - pivotDM := s.currs[s.pivotIDx] - if pivotDM == nil { - // one of the searchers is exhausted, so we are done - return nil, nil - } - pivotAncestors := s.currAncestors[s.pivotIDx] - pivotID := pivotDM.IndexInternalID - // first, make sure the pivot is ahead of all the other searchers - // we do this by getting the max of all the other searchers' IDs - // at their respective target ancestors - // and if the pivot is behind that, we advance it to that - maxID := getTargetAncestor(pivotAncestors, s.joinIdx) - for i := 0; i < len(s.searchers); i++ { - if i == s.pivotIDx { - // skip the pivot itself - continue + for i, searcher := range s.searchers { + if s.currs[i] != nil { + ctx.DocumentMatchPool.Put(s.currs[i]) } - curr := s.currs[i] - if curr == nil { - // one of the searchers is exhausted, so we are done - return nil, nil - } - targetAncestor := getTargetAncestor(s.currAncestors[i], s.joinIdx) - // now compare curr's target ancestor with maxID - if targetAncestor.Compare(maxID) > 0 { - maxID = targetAncestor - } - } - if maxID.Compare(pivotID) > 0 { - var err error - // pivot is behind, so advance it - ctx.DocumentMatchPool.Put(pivotDM) - s.currs[s.pivotIDx], err = pivotSearcher.Advance(ctx, maxID) + s.currs[i], err = searcher.Next(ctx) if err != nil { return nil, err } - if s.currs[s.pivotIDx] == nil { + if s.currs[i] == nil { // one of the searchers is exhausted, so we are done return nil, nil } - // recalc ancestors - s.currAncestors[s.pivotIDx], err = s.nestedReader.Ancestors(s.currs[s.pivotIDx].IndexInternalID) + // get the ancestry chain for this match + s.currAncestors[i], err = s.nestedReader.Ancestors(s.currs[i].IndexInternalID) if err != nil { return nil, err } - // now restart the whole process - continue - } - // at this point, we know the pivot is ahead of all the other searchers - // now try to align all the other searchers to the pivot's ancestry - // we do this by advancing each searcher to the corresponding ancestor - // of the pivot, with searchers with insufficient depth being advanced - // to the corresponding document ID in the pivot's ancestry and - // and the searchers with sufficient depth being advanced to the - // ancestor at joinIdx level once that is done we check if all the - // searchers are aligned if they are, we have a match, otherwise we have a - // scenario where one or more searchers have advanced beyond the pivot, so - // we need to restart the whole process where we have to find the new maxID - // and advance the pivot as done above - allAligned := true - for i := 0; i < len(s.searchers); i++ { - if i == s.pivotIDx { - // skip the pivot itself - continue + // check if the ancestry chain is > joinIdx, if not we reset the joinIdx + // to the minimum possible value across all searchers, ideally this will be + // done in query construction time itself, by using the covering depth across + // all sub-queries, but we do this here as a fallback + if s.joinIdx >= len(s.currAncestors[i]) { + s.joinIdx = len(s.currAncestors[i]) - 1 } - curr := s.currs[i] - if curr == nil { - // one of the searchers is exhausted, so we are done + } + // 
build currKeys for each searcher, do it here as we may have adjusted joinIdx + for i := range s.searchers { + s.currKeys[i] = s.getKeyForIdx(i) + } + s.initialized = true + } + // check if the docQueue has any buffered matches + if s.docQueue.Len() > 0 { + return s.docQueue.Dequeue() + } +OUTER: + for { + // pick the pivot searcher with the highest key (ancestor at joinIdx level) + if s.currs[0] == nil { + return nil, nil + } + maxKey := s.currKeys[0] + for i := 1; i < len(s.searchers); i++ { + // currs[i] is nil means one of the searchers is exhausted + if s.currs[i] == nil { return nil, nil } - // try to align curr to the pivot's ancestry by advancing the - // searcher to the corresponding ancestor of the pivot - var targetAncestor index.IndexInternalID - if len(s.currAncestors[i]) > s.joinIdx { - // this searcher has sufficient depth, so use the pivot's ancestor at joinIdx - targetAncestor = pivotAncestors[len(pivotAncestors)-s.joinIdx-1] - } else { - // this searcher does not have sufficient depth, so use the pivot's - // ancestor at the searcher's max depth - targetAncestor = pivotAncestors[len(s.currAncestors[i])-1] + currKey := s.currKeys[i] + if maxKey.Compare(currKey) < 0 { + maxKey = currKey } - if curr.IndexInternalID.Compare(targetAncestor) < 0 { + } + // now try to align all other searchers to the + // we check if the a searchers key matches maxKey + // if not, we advance the pivot searcher to maxKey + // else do nothing and move to the next searcher + for i := 0; i < len(s.searchers); i++ { + if s.currKeys[i].Compare(maxKey) < 0 { + // not aligned, so advance this searcher to maxKey var err error - ctx.DocumentMatchPool.Put(curr) - s.currs[i], err = s.searchers[i].Advance(ctx, targetAncestor) + ctx.DocumentMatchPool.Put(s.currs[i]) + s.currs[i], err = s.searchers[i].Advance(ctx, maxKey) if err != nil { return nil, err } @@ -314,39 +224,66 @@ func (s *NestedConjunctionSearcher) Next(ctx *search.SearchContext) (*search.Doc if err != nil { return nil, err } - } - // now check if we are aligned - currID := getTargetAncestor(s.currAncestors[i], s.joinIdx) - if currID.Compare(targetAncestor) != 0 { - allAligned = false + // recalc key + s.currKeys[i] = s.getKeyForIdx(i) } } - if allAligned { - // we have a match, so we can build the resulting DocumentMatch - // we do this by delegating to the scorer, which will pick the lowest - // common ancestor (LCA) and merge all the constituents into it - dm, err := s.scorer.Score(ctx, s.currs, s.currAncestors, s.joinIdx) - if err != nil { - return nil, err - } - // now advance the pivot searcher to get ready for the next call - ctx.DocumentMatchPool.Put(pivotDM) - s.currs[s.pivotIDx], err = pivotSearcher.Next(ctx) - if err != nil { - return nil, err + // now check if all the searchers are aligned at the same maxKey + // if they are not aligned, we need to restart the loop of picking + // the pivot searcher with the highest key + for i := 0; i < len(s.searchers); i++ { + if !s.currKeys[i].Equals(maxKey) { + // not aligned, so restart the outer loop + continue OUTER } - if s.currs[s.pivotIDx] != nil { - s.currAncestors[s.pivotIDx], err = s.nestedReader.Ancestors(s.currs[s.pivotIDx].IndexInternalID) + } + // if we are here, all the searchers are aligned at maxKey + // now we need to buffer all the intermediate matches for every + // searcher at this key, until either the searcher's key changes + // or the searcher is exhausted + for i := 0; i < len(s.searchers); i++ { + for { + // buffer the current match + recycle, err := 
s.docQueue.Enqueue(s.currs[i]) + if err != nil { + return nil, err + } + if recycle != nil { + // we got a match to recycle + ctx.DocumentMatchPool.Put(recycle) + } + // advance to next match + s.currs[i], err = s.searchers[i].Next(ctx) if err != nil { return nil, err } + if s.currs[i] == nil { + // searcher exhausted, break out + break + } + // recalc ancestors + s.currAncestors[i], err = s.nestedReader.Ancestors(s.currs[i].IndexInternalID) + if err != nil { + return nil, err + } + // recalc key + s.currKeys[i] = s.getKeyForIdx(i) + // check if key has changed + if !s.currKeys[i].Equals(maxKey) { + // key changed, break out + break + } } - // return the match we have - return dm, nil } + // finally return the first buffered match + return s.docQueue.Dequeue() } } +func (s *NestedConjunctionSearcher) getKeyForIdx(i int) index.IndexInternalID { + return s.currAncestors[i][len(s.currAncestors[i])-s.joinIdx-1] +} + func (s *NestedConjunctionSearcher) Advance(ctx *search.SearchContext, ID index.IndexInternalID) (*search.DocumentMatch, error) { for { next, err := s.Next(ctx) @@ -362,3 +299,57 @@ func (s *NestedConjunctionSearcher) Advance(ctx *search.SearchContext, ID index. ctx.DocumentMatchPool.Put(next) } } + +// ------------------------------------------------------------------------------------------ +type CoalesceQueue struct { + order []*search.DocumentMatch // queue of DocumentMatch + items map[uint64]*search.DocumentMatch // map of ID to DocumentMatch +} + +func NewCoalesceQueue() *CoalesceQueue { + return &CoalesceQueue{ + order: make([]*search.DocumentMatch, 0), + items: make(map[uint64]*search.DocumentMatch), + } +} + +func (cq *CoalesceQueue) Enqueue(it *search.DocumentMatch) (*search.DocumentMatch, error) { + val, err := it.IndexInternalID.Value() + if err != nil { + // cannot coalesce without a valid uint64 ID + return nil, err + } + + if existing, ok := cq.items[val]; ok { + // merge with current version + existing.MergeWith(it) + // return it to caller for recycling + return it, nil + } + + // first time we see this ID — enqueue + cq.items[val] = it + cq.order = append(cq.order, it) + // no recycling needed as we added a new item + return nil, nil +} + +func (cq *CoalesceQueue) Dequeue() (*search.DocumentMatch, error) { + if cq.Len() == 0 { + return nil, nil + } + rv := cq.order[0] + cq.order = cq.order[1:] + + val, err := rv.IndexInternalID.Value() + if err != nil { + return nil, err + } + + delete(cq.items, val) + return rv, nil +} + +func (cq *CoalesceQueue) Len() int { + return len(cq.order) +} From 106dcb3468818cb4cf72fed49aab81c628cea012 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Wed, 26 Nov 2025 14:07:17 +0530 Subject: [PATCH 12/70] fix bugs --- search/explanation.go | 8 ++++- search/searcher/search_conjunction_nested.go | 33 +++++++++++++++++--- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/search/explanation.go b/search/explanation.go index 6568ba734..1769a0919 100644 --- a/search/explanation.go +++ b/search/explanation.go @@ -76,7 +76,13 @@ func MergeExpl(first, second *Explanation) *Explanation { if first.Message == MergedExplMessage { // reuse first explanation as the merged one first.Value += second.Value - first.Children = append(first.Children, second) + if second.Message == MergedExplMessage { + // append all children from second to first + first.Children = append(first.Children, second.Children...) 
+ } else { + // append second as a child to first + first.Children = append(first.Children, second) + } return first } // create a new explanation to hold the merged one diff --git a/search/searcher/search_conjunction_nested.go b/search/searcher/search_conjunction_nested.go index d387e10f2..a3708cbbe 100644 --- a/search/searcher/search_conjunction_nested.go +++ b/search/searcher/search_conjunction_nested.go @@ -15,6 +15,7 @@ package searcher import ( + "container/heap" "context" "fmt" "math" @@ -307,10 +308,12 @@ type CoalesceQueue struct { } func NewCoalesceQueue() *CoalesceQueue { - return &CoalesceQueue{ + cq := &CoalesceQueue{ order: make([]*search.DocumentMatch, 0), items: make(map[uint64]*search.DocumentMatch), } + heap.Init(cq) + return cq } func (cq *CoalesceQueue) Enqueue(it *search.DocumentMatch) (*search.DocumentMatch, error) { @@ -329,7 +332,7 @@ func (cq *CoalesceQueue) Enqueue(it *search.DocumentMatch) (*search.DocumentMatc // first time we see this ID — enqueue cq.items[val] = it - cq.order = append(cq.order, it) + heap.Push(cq, it) // no recycling needed as we added a new item return nil, nil } @@ -338,8 +341,8 @@ func (cq *CoalesceQueue) Dequeue() (*search.DocumentMatch, error) { if cq.Len() == 0 { return nil, nil } - rv := cq.order[0] - cq.order = cq.order[1:] + + rv := heap.Pop(cq).(*search.DocumentMatch) val, err := rv.IndexInternalID.Value() if err != nil { @@ -350,6 +353,28 @@ func (cq *CoalesceQueue) Dequeue() (*search.DocumentMatch, error) { return rv, nil } +// heap implementation + func (cq *CoalesceQueue) Len() int { return len(cq.order) } + +func (cq *CoalesceQueue) Less(i, j int) bool { + return cq.order[i].IndexInternalID.Compare(cq.order[j].IndexInternalID) < 0 +} + +func (cq *CoalesceQueue) Swap(i, j int) { + cq.order[i], cq.order[j] = cq.order[j], cq.order[i] +} + +func (cq *CoalesceQueue) Push(x any) { + cq.order = append(cq.order, x.(*search.DocumentMatch)) +} + +func (cq *CoalesceQueue) Pop() any { + old := cq.order + n := len(old) + x := old[n-1] + cq.order = old[:n-1] + return x +} From 0d5bafc0184cddbce3673dedaebd59cc7755c161 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Wed, 26 Nov 2025 17:16:43 +0530 Subject: [PATCH 13/70] perf optimization --- search/collector/eligible.go | 31 ++++++------ search/collector/knn.go | 35 +++++++------- search/collector/nested.go | 94 ++++++++++++++++++------------------ search/collector/topn.go | 48 ++++++++++-------- 4 files changed, 108 insertions(+), 100 deletions(-) diff --git a/search/collector/eligible.go b/search/collector/eligible.go index 7b0ada345..5ffb4e3a4 100644 --- a/search/collector/eligible.go +++ b/search/collector/eligible.go @@ -112,8 +112,9 @@ func (ec *EligibleCollector) Collect(ctx context.Context, searcher search.Search default: next, err = searcher.Next(searchContext) } + var totalDocs uint64 for err == nil && next != nil { - if ec.total%CheckDoneEvery == 0 { + if totalDocs%CheckDoneEvery == 0 { select { case <-ctx.Done(): search.RecordSearchCost(ctx, search.AbortM, 0) @@ -121,20 +122,20 @@ func (ec *EligibleCollector) Collect(ctx context.Context, searcher search.Search default: } } - ec.total++ + totalDocs++ if ec.nestedStore != nil { - doc, err := ec.nestedStore.AddDocument(next) + next, err = ec.nestedStore.ProcessNestedDocument(searchContext, next) if err != nil { - return err + break } - // recycle - searchContext.DocumentMatchPool.Put(doc) - } else { + } + if next != nil { err = dmHandler(next) if err != nil { break } + ec.total++ } next, err = searcher.Next(searchContext) @@ -143,20 
+144,16 @@ func (ec *EligibleCollector) Collect(ctx context.Context, searcher search.Search return err } + // if we have a nested store, we may have an interim root if ec.nestedStore != nil { - var count uint64 - err := ec.nestedStore.VisitRoots(func(doc *search.DocumentMatch) error { - // process the root document - if err := dmHandler(doc); err != nil { + currRoot := ec.nestedStore.CurrentRoot() + if currRoot != nil { + err = dmHandler(currRoot) + if err != nil { return err } - count++ - return nil - }) - if err != nil { - return err + ec.total++ } - ec.total = count } // help finalize/flush the results in case diff --git a/search/collector/knn.go b/search/collector/knn.go index 5ae79509b..8706bf286 100644 --- a/search/collector/knn.go +++ b/search/collector/knn.go @@ -192,8 +192,10 @@ func (hc *KNNCollector) Collect(ctx context.Context, searcher search.Searcher, r default: next, err = searcher.Next(searchContext) } + + var totalDocs uint64 for err == nil && next != nil { - if hc.total%CheckDoneEvery == 0 { + if totalDocs%CheckDoneEvery == 0 { select { case <-ctx.Done(): search.RecordSearchCost(ctx, search.AbortM, 0) @@ -201,41 +203,40 @@ func (hc *KNNCollector) Collect(ctx context.Context, searcher search.Searcher, r default: } } - hc.total++ + totalDocs++ if hc.nestedStore != nil { - doc, err := hc.nestedStore.AddDocument(next) + next, err = hc.nestedStore.ProcessNestedDocument(searchContext, next) if err != nil { - return err + break } - searchContext.DocumentMatchPool.Put(doc) - } else { + } + if next != nil { err = dmHandler(next) if err != nil { break } + // increment total only for actual(root) collected documents + hc.total++ } - next, err = searcher.Next(searchContext) } if err != nil { return err } + // if we have a nested store, we may have an interim root + // that needs to be finalized now if hc.nestedStore != nil { - var count uint64 - err := hc.nestedStore.VisitRoots(func(doc *search.DocumentMatch) error { - // process the root document - if err := dmHandler(doc); err != nil { + currRoot := hc.nestedStore.CurrentRoot() + if currRoot != nil { + // process the interim root now + err = dmHandler(currRoot) + if err != nil { return err } - count++ - return nil - }) - if err != nil { - return err + hc.total++ } - hc.total = count } // help finalize/flush the results in case diff --git a/search/collector/nested.go b/search/collector/nested.go index d680ad06d..ab14c0e76 100644 --- a/search/collector/nested.go +++ b/search/collector/nested.go @@ -15,8 +15,6 @@ package collector import ( - "fmt" - "github.com/blevesearch/bleve/v2/search" index "github.com/blevesearch/bleve_index_api" ) @@ -24,67 +22,71 @@ import ( type collectStoreNested struct { nr index.NestedReader - interim map[uint64]*search.DocumentMatch + // the current root document match being built + currRoot *search.DocumentMatch } func newStoreNested(nr index.NestedReader) *collectStoreNested { rv := &collectStoreNested{ - interim: make(map[uint64]*search.DocumentMatch), - nr: nr, + nr: nr, } return rv } -func (c *collectStoreNested) AddDocument(doc *search.DocumentMatch) (*search.DocumentMatch, error) { + +// ProcessNestedDocument adds a document to the nested store, merging it into its root document +// as needed. If the returned DocumentMatch is nil, the incoming doc has been merged +// into its parent and should not be processed further. If the returned DocumentMatch +// is non-nil, it represents a complete root document that should be processed further. 
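+// A rough sketch of the expected call sequence (variable names here are
+// illustrative only, not part of the API): given matches arriving in
+// internal-ID order as child c1 (root r1), child c2 (root r1), then root
+// document r2:
+//
+//	root, _ := store.ProcessNestedDocument(sctx, c1) // root == nil, interim r1 created
+//	root, _ = store.ProcessNestedDocument(sctx, c2)  // root == nil, c2 merged into r1
+//	root, _ = store.ProcessNestedDocument(sctx, r2)  // root == r1, r2 becomes the interim root
+//	last := store.CurrentRoot()                      // last == r2, finalized by the caller
+//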
+// NOTE: This implementation assumes that documents are added in increasing order of their internal IDs +// which is guaranteed by all searchers in bleve. +func (c *collectStoreNested) ProcessNestedDocument(ctx *search.SearchContext, doc *search.DocumentMatch) (*search.DocumentMatch, error) { // find ancestors for the doc ancestors, err := c.nr.Ancestors(doc.IndexInternalID) - if err != nil || len(ancestors) == 0 { - return nil, fmt.Errorf("error getting ancestors for doc %v: %v", doc.IndexInternalID, err) - } - // root docID is the last ancestor - rootID := ancestors[len(ancestors)-1] - rootIDVal, err := rootID.Value() if err != nil { return nil, err } - // lookup existing root - rootDocument, ok := c.interim[rootIDVal] - if !ok { - // no interim root yet - if len(ancestors) == 1 { - // incoming doc is the root itself - c.interim[rootIDVal] = doc - return nil, nil - } - - // create new interim root and merge child into it - rootDocument = &search.DocumentMatch{IndexInternalID: rootID} - if err := rootDocument.MergeWith(doc); err != nil { + if len(ancestors) == 0 { + // should not happen, every doc should have at least itself as ancestor + return nil, nil + } + // root docID is the last ancestor + rootID := ancestors[len(ancestors)-1] + // check if there is an interim root already and if the incoming doc belongs to it + if c.currRoot != nil && c.currRoot.IndexInternalID.Equals(rootID) { + // there is an interim root already, and the incoming doc belongs to it + if err := c.currRoot.MergeWith(doc); err != nil { return nil, err } - c.interim[rootIDVal] = rootDocument - - // return the child for recycling - return doc, nil + // recycle the child document now that it's merged into the interim root + ctx.DocumentMatchPool.Put(doc) + return nil, nil } - - // merge child into existing root - if err := rootDocument.MergeWith(doc); err != nil { + // completedRoot is the root document match to return, if any + var completedRoot *search.DocumentMatch + if c.currRoot != nil { + // we have an existing interim root, return it for processing + completedRoot = c.currRoot + // clear current root + c.currRoot = nil + } + // no interim root for now so either we have a root document incoming + // or we have a child doc and need to create an interim root + if len(ancestors) == 1 { + // incoming doc is the root itself + c.currRoot = doc + return completedRoot, nil + } + // this is a child doc, create interim root + c.currRoot = &search.DocumentMatch{IndexInternalID: rootID} + if err := c.currRoot.MergeWith(doc); err != nil { return nil, err } - return doc, nil + // recycle the child document now that it's merged into the interim root + ctx.DocumentMatchPool.Put(doc) + return completedRoot, nil } -// NestedDocumentVisitor is the callback invoked for each root document. -// root is the merged root DocumentMatch. -type NestedDocumentVisitor func(root *search.DocumentMatch) error - -// VisitRoots walks over all collected interim values and calls the visitor. 
-func (c *collectStoreNested) VisitRoots(visitor NestedDocumentVisitor) error { - for _, root := range c.interim { - // invoke the visitor - if err := visitor(root); err != nil { - return err - } - } - return nil +// CurrentRoot returns the current interim root document match being built, if any +func (c *collectStoreNested) CurrentRoot() *search.DocumentMatch { + return c.currRoot } diff --git a/search/collector/topn.go b/search/collector/topn.go index ab76ff33b..85d405608 100644 --- a/search/collector/topn.go +++ b/search/collector/topn.go @@ -311,8 +311,13 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, default: next, err = searcher.Next(searchContext) } + // use a local totalDocs for counting total docs seen + // for context deadline checking, as hc.total is only + // incremented for actual(root) collected documents, and + // we need to check deadline for every document seen (root or nested) + var totalDocs uint64 for err == nil && next != nil { - if hc.total%CheckDoneEvery == 0 { + if totalDocs%CheckDoneEvery == 0 { select { case <-ctx.Done(): search.RecordSearchCost(ctx, search.AbortM, 0) @@ -320,16 +325,19 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, default: } } - + totalDocs++ if hc.nestedStore != nil { - hc.total++ - doc, err := hc.nestedStore.AddDocument(next) + // This may be a nested document — add it to the nested store first. + // If the nested store returns nil, the document was merged into its parent + // and should not be processed further. + // If it returns a non-nil document, it represents a complete root document + // and should be processed further. + next, err = hc.nestedStore.ProcessNestedDocument(searchContext, next) if err != nil { - return err + break } - // recycle - searchContext.DocumentMatchPool.Put(doc) - } else { + } + if next != nil { err = hc.adjustDocumentMatch(searchContext, reader, next) if err != nil { break @@ -344,7 +352,6 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, if err != nil { break } - } next, err = searcher.Next(searchContext) } @@ -352,25 +359,26 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, return err } + // if we have a nested store, we may have an interim root + // that needs to be returned for processing if hc.nestedStore != nil { - var count uint64 - err := hc.nestedStore.VisitRoots(func(doc *search.DocumentMatch) error { - if err := hc.adjustDocumentMatch(searchContext, reader, doc); err != nil { + currRoot := hc.nestedStore.CurrentRoot() + if currRoot != nil { + err = hc.adjustDocumentMatch(searchContext, reader, currRoot) + if err != nil { return err } - if err := hc.prepareDocumentMatch(searchContext, reader, doc, false); err != nil { + // no descendants at this point + err = hc.prepareDocumentMatch(searchContext, reader, currRoot, false) + if err != nil { return err } - if err := dmHandler(doc); err != nil { + + err = dmHandler(currRoot) + if err != nil { return err } - count++ - return nil - }) - if err != nil { - return err } - hc.total = count } if hc.knnHits != nil { From 9da53cf42618d9236cc27acf34255d4081329989 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Wed, 26 Nov 2025 20:20:09 +0530 Subject: [PATCH 14/70] perf optimization --- index/scorch/snapshot_index.go | 21 ++++++++------------ index/scorch/snapshot_segment.go | 6 +++--- search/collector/nested.go | 11 ++++++---- search/searcher/search_conjunction_nested.go | 12 +++++------ 4 files changed, 24 insertions(+), 26 
deletions(-) diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index e283c2340..b3e4b772f 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -1277,24 +1277,19 @@ func (is *IndexSnapshot) TermFrequencies(field string, limit int, descending boo return termFreqs[:limit], nil } -func (i *IndexSnapshot) Ancestors(ID index.IndexInternalID) ([]index.IndexInternalID, error) { +func (i *IndexSnapshot) Ancestors(ID index.IndexInternalID) ([]index.AncestorID, error) { seg, ldoc, err := i.segmentIndexAndLocalDocNum(ID) if err != nil { return nil, err } - + // get ancestors from the segment ancestors := i.segment[seg].Ancestors(ldoc) - - // allocate space: +1 for the doc itself - rv := make([]index.IndexInternalID, len(ancestors)+1) + // get global offset for the segment (correcting factor for multi-segment indexes) globalOffset := i.offsets[seg] - - // first element is the doc itself - rv[0] = index.NewIndexInternalID(nil, ldoc+globalOffset) - - // then all ancestors shifted by +1 - for j := 0; j < len(ancestors); j++ { - rv[j+1] = index.NewIndexInternalID(nil, ancestors[j]+globalOffset) + // adjust ancestors to global doc numbers, not local to segment + for idx := range ancestors { + ancestors[idx] = ancestors[idx].Add(globalOffset) } - return rv, nil + // return adjusted ancestors + return ancestors, nil } diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index ed3684557..10cafec8f 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -375,10 +375,10 @@ func (c *cachedMeta) fetchMeta(field string) (rv interface{}) { return rv } -func (s *SegmentSnapshot) Ancestors(docID uint64) []uint64 { +func (s *SegmentSnapshot) Ancestors(docNum uint64) []index.AncestorID { nsb, ok := s.segment.(segment.NestedSegment) if !ok { - return nil + return []index.AncestorID{index.NewAncestorID(docNum)} } - return nsb.Ancestors(docID) + return nsb.Ancestors(docNum) } diff --git a/search/collector/nested.go b/search/collector/nested.go index ab14c0e76..a60dbc276 100644 --- a/search/collector/nested.go +++ b/search/collector/nested.go @@ -24,6 +24,9 @@ type collectStoreNested struct { // the current root document match being built currRoot *search.DocumentMatch + + // the ancestor ID of the current root document being built + currRootAncestorID index.AncestorID } func newStoreNested(nr index.NestedReader) *collectStoreNested { @@ -52,7 +55,7 @@ func (c *collectStoreNested) ProcessNestedDocument(ctx *search.SearchContext, do // root docID is the last ancestor rootID := ancestors[len(ancestors)-1] // check if there is an interim root already and if the incoming doc belongs to it - if c.currRoot != nil && c.currRoot.IndexInternalID.Equals(rootID) { + if c.currRoot != nil && c.currRootAncestorID.Equals(rootID) { // there is an interim root already, and the incoming doc belongs to it if err := c.currRoot.MergeWith(doc); err != nil { return nil, err @@ -66,18 +69,18 @@ func (c *collectStoreNested) ProcessNestedDocument(ctx *search.SearchContext, do if c.currRoot != nil { // we have an existing interim root, return it for processing completedRoot = c.currRoot - // clear current root - c.currRoot = nil } // no interim root for now so either we have a root document incoming // or we have a child doc and need to create an interim root if len(ancestors) == 1 { // incoming doc is the root itself c.currRoot = doc + c.currRootAncestorID = rootID return completedRoot, nil } // this is a child doc, create interim 
root - c.currRoot = &search.DocumentMatch{IndexInternalID: rootID} + c.currRoot = &search.DocumentMatch{IndexInternalID: rootID.ToIndexInternalID()} + c.currRootAncestorID = rootID if err := c.currRoot.MergeWith(doc); err != nil { return nil, err } diff --git a/search/searcher/search_conjunction_nested.go b/search/searcher/search_conjunction_nested.go index a3708cbbe..0b3cf0023 100644 --- a/search/searcher/search_conjunction_nested.go +++ b/search/searcher/search_conjunction_nested.go @@ -38,8 +38,8 @@ type NestedConjunctionSearcher struct { searchers []search.Searcher queryNorm float64 currs []*search.DocumentMatch - currAncestors [][]index.IndexInternalID - currKeys []index.IndexInternalID + currAncestors [][]index.AncestorID + currKeys []index.AncestorID initialized bool joinIdx int options search.SearcherOptions @@ -61,8 +61,8 @@ func NewNestedConjunctionSearcher(ctx context.Context, indexReader index.IndexRe options: options, searchers: searchers, currs: make([]*search.DocumentMatch, len(searchers)), - currAncestors: make([][]index.IndexInternalID, len(searchers)), - currKeys: make([]index.IndexInternalID, len(searchers)), + currAncestors: make([][]index.AncestorID, len(searchers)), + currKeys: make([]index.AncestorID, len(searchers)), joinIdx: joinIdx, docQueue: NewCoalesceQueue(), } @@ -212,7 +212,7 @@ OUTER: // not aligned, so advance this searcher to maxKey var err error ctx.DocumentMatchPool.Put(s.currs[i]) - s.currs[i], err = s.searchers[i].Advance(ctx, maxKey) + s.currs[i], err = s.searchers[i].Advance(ctx, maxKey.ToIndexInternalID()) if err != nil { return nil, err } @@ -281,7 +281,7 @@ OUTER: } } -func (s *NestedConjunctionSearcher) getKeyForIdx(i int) index.IndexInternalID { +func (s *NestedConjunctionSearcher) getKeyForIdx(i int) index.AncestorID { return s.currAncestors[i][len(s.currAncestors[i])-s.joinIdx-1] } From 53a56992828d797048a02580dc5d4c3c79eb2462 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Wed, 26 Nov 2025 21:42:09 +0530 Subject: [PATCH 15/70] perf optimiztion 2 --- search/collector/nested.go | 9 ++- search/explanation.go | 68 +++++++++----------- search/search.go | 37 +++++------ search/searcher/search_conjunction_nested.go | 5 +- search/util.go | 15 +++++ 5 files changed, 74 insertions(+), 60 deletions(-) diff --git a/search/collector/nested.go b/search/collector/nested.go index a60dbc276..855df74af 100644 --- a/search/collector/nested.go +++ b/search/collector/nested.go @@ -57,7 +57,7 @@ func (c *collectStoreNested) ProcessNestedDocument(ctx *search.SearchContext, do // check if there is an interim root already and if the incoming doc belongs to it if c.currRoot != nil && c.currRootAncestorID.Equals(rootID) { // there is an interim root already, and the incoming doc belongs to it - if err := c.currRoot.MergeWith(doc); err != nil { + if err := c.currRoot.AddDescendant(doc); err != nil { return nil, err } // recycle the child document now that it's merged into the interim root @@ -79,9 +79,12 @@ func (c *collectStoreNested) ProcessNestedDocument(ctx *search.SearchContext, do return completedRoot, nil } // this is a child doc, create interim root - c.currRoot = &search.DocumentMatch{IndexInternalID: rootID.ToIndexInternalID()} + newDM := ctx.DocumentMatchPool.Get() + newDM.IndexInternalID = rootID.ToIndexInternalID() + // merge the incoming doc into the new interim root + c.currRoot = newDM c.currRootAncestorID = rootID - if err := c.currRoot.MergeWith(doc); err != nil { + if err := c.currRoot.AddDescendant(doc); err != nil { return nil, err } // recycle 
the child document now that it's merged into the interim root diff --git a/search/explanation.go b/search/explanation.go index 1769a0919..98c5e099d 100644 --- a/search/explanation.go +++ b/search/explanation.go @@ -29,6 +29,8 @@ func init() { reflectStaticSizeExplanation = int(reflect.TypeOf(e).Size()) } +const MergedExplMessage = "sum of merged explanations:" + type Explanation struct { Value float64 `json:"value"` Message string `json:"message"` @@ -55,55 +57,49 @@ func (expl *Explanation) Size() int { return sizeInBytes } -const MergedExplMessage = "sum of merged explanations:" - // MergeExpl merges two explanations into one. // If either explanation is nil, the other is returned. // If the first explanation is already a merged explanation, // the second explanation is appended to its children. // Otherwise, a new merged explanation is created // with the two explanations as its children. -func MergeExpl(first, second *Explanation) *Explanation { - if first == nil { - return second +func (expl *Explanation) MergeWith(other *Explanation) *Explanation { + if expl == nil { + return other } - if second == nil { - return first + if other == nil || expl == other { + return expl + } + + newScore := expl.Value + other.Value + + // if both are merged explanations, combine children + if expl.Message == MergedExplMessage && other.Message == MergedExplMessage { + expl.Value = newScore + expl.Children = append(expl.Children, other.Children...) + return expl } - if first == second { - return first + + // atleast one is not a merged explanation see which one it is + // if expl is merged, append other + if expl.Message == MergedExplMessage { + // append other as a child to first + expl.Value = newScore + expl.Children = append(expl.Children, other) + return expl } - if first.Message == MergedExplMessage { - // reuse first explanation as the merged one - first.Value += second.Value - if second.Message == MergedExplMessage { - // append all children from second to first - first.Children = append(first.Children, second.Children...) - } else { - // append second as a child to first - first.Children = append(first.Children, second) - } - return first + + // if other is merged, append expl + if other.Message == MergedExplMessage { + other.Value = newScore + other.Children = append(other.Children, expl) + return other } // create a new explanation to hold the merged one rv := &Explanation{ - Value: first.Value + second.Value, + Value: expl.Value + other.Value, Message: MergedExplMessage, - Children: []*Explanation{first, second}, + Children: []*Explanation{expl, other}, } return rv } - -func MergeScoreBreakdown(first, second map[int]float64) map[int]float64 { - if first == nil { - return second - } - if second == nil { - return first - } - // reuse first to store the union of both - for k, v := range second { - first[k] += v - } - return first -} diff --git a/search/search.go b/search/search.go index ebc1af3b5..2490ecae8 100644 --- a/search/search.go +++ b/search/search.go @@ -180,7 +180,7 @@ type DocumentMatch struct { IndexNames []string `json:"index_names,omitempty"` // Children holds any descendant/child matches that contributed - // to this root (or intermediate LCA) DocumentMatch. + // to this root DocumentMatch. 
Children DescendantStore `json:"-"` } @@ -231,6 +231,12 @@ func (dm *DocumentMatch) Reset() *DocumentMatch { for i := range ftls { // recycle the ArrayPositions of each location ftls[i].Location.ArrayPositions = ftls[i].Location.ArrayPositions[:0] } + // remember the Children backing map + children := dm.Children + // reset to empty map for Children + for k := range children { + delete(children, k) + } // idiom to copy over from empty DocumentMatch (0 allocations) *dm = DocumentMatch{} // reuse the []byte already allocated (and reset len to 0) @@ -240,6 +246,8 @@ func (dm *DocumentMatch) Reset() *DocumentMatch { dm.DecodedSort = dm.DecodedSort[:0] // reuse the FieldTermLocations already allocated (and reset len to 0) dm.FieldTermLocations = ftls[:0] + // reuse the descendant store if it exists + dm.Children = children return dm } @@ -370,28 +378,26 @@ func (dm *DocumentMatch) String() string { return fmt.Sprintf("[%s-%f]", dm.ID, dm.Score) } -func (dm *DocumentMatch) MergeWith(other *DocumentMatch) error { - // merge score +// AddDescendant merges another DocumentMatch into this one as a descendant. +func (dm *DocumentMatch) AddDescendant(other *DocumentMatch) error { + // add descendant score to parent score dm.Score += other.Score // merge explanations - dm.Expl = MergeExpl(dm.Expl, other.Expl) + dm.Expl = dm.Expl.MergeWith(other.Expl) // merge field term locations dm.FieldTermLocations = MergeFieldTermLocations(dm.FieldTermLocations, []*DocumentMatch{other}) // merge score breakdown dm.ScoreBreakdown = MergeScoreBreakdown(dm.ScoreBreakdown, other.ScoreBreakdown) - // merge Descendants/Children - // if the base and other have the same ID, then we are merging the same - // document match (from different clauses), so we need to merge their children/descendants + // add other as descendant only if it is not the same document if !dm.IndexInternalID.Equals(other.IndexInternalID) { if dm.Children == nil { - dm.Children = make(DescendantStore) + dm.Children = NewDescendantStore() } err := dm.Children.AddDescendant(other.IndexInternalID) if err != nil { return err } } - dm.Children = MergeDescendants(dm.Children, other.Children) return nil } @@ -444,17 +450,8 @@ func (sc *SearchContext) Size() int { type DescendantStore map[uint64]index.IndexInternalID -func MergeDescendants(first, second DescendantStore) DescendantStore { - if first == nil { - return second - } - if second == nil { - return first - } - for k, v := range second { - first[k] = v - } - return first +func NewDescendantStore() DescendantStore { + return make(DescendantStore) } func (ds DescendantStore) AddDescendant(descendant index.IndexInternalID) error { diff --git a/search/searcher/search_conjunction_nested.go b/search/searcher/search_conjunction_nested.go index 0b3cf0023..964e94799 100644 --- a/search/searcher/search_conjunction_nested.go +++ b/search/searcher/search_conjunction_nested.go @@ -325,7 +325,10 @@ func (cq *CoalesceQueue) Enqueue(it *search.DocumentMatch) (*search.DocumentMatc if existing, ok := cq.items[val]; ok { // merge with current version - existing.MergeWith(it) + existing.Score += it.Score + existing.Expl = existing.Expl.MergeWith(it.Expl) + existing.FieldTermLocations = search.MergeFieldTermLocations( + existing.FieldTermLocations, []*search.DocumentMatch{it}) // return it to caller for recycling return it, nil } diff --git a/search/util.go b/search/util.go index 1af7bce56..2f1d764b1 100644 --- a/search/util.go +++ b/search/util.go @@ -74,6 +74,21 @@ func MergeFieldTermLocations(dest []FieldTermLocation, 
matches []*DocumentMatch) return dest } +// MergeScoreBreakdown merges two score breakdown maps together +func MergeScoreBreakdown(first, second map[int]float64) map[int]float64 { + if first == nil { + return second + } + if second == nil { + return first + } + // reuse first to store the union of both + for k, v := range second { + first[k] += v + } + return first +} + type SearchIOStatsCallbackFunc func(uint64) // Implementation of SearchIncrementalCostCallbackFn should handle the following messages From 32bfd15d6d97ef0c0324107b248eff35b293189e Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Thu, 27 Nov 2025 08:15:30 +0530 Subject: [PATCH 16/70] fix prefilter case --- search/collector/eligible.go | 47 ++++-------------------------------- search_knn.go | 41 +++++++++++-------------------- 2 files changed, 19 insertions(+), 69 deletions(-) diff --git a/search/collector/eligible.go b/search/collector/eligible.go index 5ffb4e3a4..b1dad813e 100644 --- a/search/collector/eligible.go +++ b/search/collector/eligible.go @@ -31,18 +31,12 @@ type EligibleCollector struct { total uint64 took time.Duration eligibleSelector index.EligibleDocumentSelector - - nestedStore *collectStoreNested } func NewEligibleCollector(size int) *EligibleCollector { return newEligibleCollector(size) } -func NewNestedEligibleCollector(nr index.NestedReader, size int) *EligibleCollector { - return newNestedEligibleCollector(nr, size) -} - func newEligibleCollector(size int) *EligibleCollector { // No sort order & skip always 0 since this is only to filter eligible docs. ec := &EligibleCollector{ @@ -51,15 +45,6 @@ func newEligibleCollector(size int) *EligibleCollector { return ec } -func newNestedEligibleCollector(nr index.NestedReader, size int) *EligibleCollector { - // No sort order & skip always 0 since this is only to filter eligible docs. - ec := &EligibleCollector{ - size: size, - nestedStore: newStoreNested(nr), - } - return ec -} - func makeEligibleDocumentMatchHandler(ctx *search.SearchContext, reader index.IndexReader) (search.DocumentMatchHandler, error) { if ec, ok := ctx.Collector.(*EligibleCollector); ok { if vr, ok := reader.(index.VectorIndexReader); ok { @@ -112,9 +97,8 @@ func (ec *EligibleCollector) Collect(ctx context.Context, searcher search.Search default: next, err = searcher.Next(searchContext) } - var totalDocs uint64 for err == nil && next != nil { - if totalDocs%CheckDoneEvery == 0 { + if ec.total%CheckDoneEvery == 0 { select { case <-ctx.Done(): search.RecordSearchCost(ctx, search.AbortM, 0) @@ -122,21 +106,12 @@ func (ec *EligibleCollector) Collect(ctx context.Context, searcher search.Search default: } } - totalDocs++ - if ec.nestedStore != nil { - next, err = ec.nestedStore.ProcessNestedDocument(searchContext, next) - if err != nil { - break - } - } - if next != nil { - err = dmHandler(next) - if err != nil { - break - } - ec.total++ + err = dmHandler(next) + if err != nil { + break } + ec.total++ next, err = searcher.Next(searchContext) } @@ -144,18 +119,6 @@ func (ec *EligibleCollector) Collect(ctx context.Context, searcher search.Search return err } - // if we have a nested store, we may have an interim root - if ec.nestedStore != nil { - currRoot := ec.nestedStore.CurrentRoot() - if currRoot != nil { - err = dmHandler(currRoot) - if err != nil { - return err - } - ec.total++ - } - } - // help finalize/flush the results in case // of custom document match handlers. 
err = dmHandler(nil) diff --git a/search_knn.go b/search_knn.go index f4aa18879..f3c1ce082 100644 --- a/search_knn.go +++ b/search_knn.go @@ -402,24 +402,29 @@ func (i *indexImpl) runKnnCollector(ctx context.Context, req *SearchRequest, rea continue } // Applies to all supported types of queries. - filterSearcher, _ := filterQ.Searcher(ctx, reader, i.m, search.SearcherOptions{ + filterSearcher, err := filterQ.Searcher(ctx, reader, i.m, search.SearcherOptions{ Score: "none", // just want eligible hits --> don't compute scores if not needed }) + if err != nil { + return nil, err + } // Using the index doc count to determine collector size since we do not // have an estimate of the number of eligible docs in the index yet. indexDocCount, err := i.DocCount() if err != nil { return nil, err } - filterColl, err := i.buildEligibleCollector(ctx, filterQ, reader, int(indexDocCount)) + filterColl := collector.NewEligibleCollector(int(indexDocCount)) + err = filterColl.Collect(ctx, filterSearcher, reader) if err != nil { return nil, err } - err = filterColl.Collect(ctx, filterSearcher, reader) + knnFilterResults[idx] = filterColl.EligibleSelector() + // Close the filter searcher once done + err = filterSearcher.Close() if err != nil { return nil, err } - knnFilterResults[idx] = filterColl.EligibleSelector() } // Add the filter hits when creating the kNN query @@ -445,6 +450,11 @@ func (i *indexImpl) runKnnCollector(ctx context.Context, req *SearchRequest, rea if !preSearch { knnHits = finalizeKNNResults(req, knnHits) } + // close the knn searcher once done + err = knnSearcher.Close() + if err != nil { + return nil, err + } // at this point, irrespective of whether it is a preSearch or not, // the knn hits are populated with Sort and Fields. // it must be ensured downstream that the Sort and Fields are not @@ -691,26 +701,3 @@ func (i *indexImpl) buildKNNCollector(ctx context.Context, KNNQuery query.Query, return collector.NewKNNCollector(kArray, sumOfK), nil } - -func (i *indexImpl) buildEligibleCollector(ctx context.Context, filterQuery query.Query, reader index.IndexReader, size int) (*collector.EligibleCollector, error) { - // check if we are in nested mode - if nestedMode, ok := ctx.Value(search.NestedSearchKey).(bool); ok && nestedMode { - // get the nested reader from the index reader - if nr, ok := reader.(index.NestedReader); ok { - // check if the filter query intersects with the nested mapping - if nm, ok := i.m.(mapping.NestedMapping); ok { - var fs search.FieldSet - var err error - fs, err = query.ExtractFields(filterQuery, i.m, fs) - if err != nil { - return nil, err - } - if nm.IntersectsPrefix(fs) { - return collector.NewNestedEligibleCollector(nr, size), nil - } - } - } - } - - return collector.NewEligibleCollector(size), nil -} From e341e38802b2520cf423821313f011629489cd28 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Thu, 27 Nov 2025 08:25:52 +0530 Subject: [PATCH 17/70] cleanup --- search/collector/eligible.go | 2 +- search/collector/knn.go | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/search/collector/eligible.go b/search/collector/eligible.go index b1dad813e..49e044812 100644 --- a/search/collector/eligible.go +++ b/search/collector/eligible.go @@ -106,12 +106,12 @@ func (ec *EligibleCollector) Collect(ctx context.Context, searcher search.Search default: } } + ec.total++ err = dmHandler(next) if err != nil { break } - ec.total++ next, err = searcher.Next(searchContext) } diff --git a/search/collector/knn.go b/search/collector/knn.go index 
8706bf286..1c6a6765a 100644 --- a/search/collector/knn.go +++ b/search/collector/knn.go @@ -138,7 +138,6 @@ type KNNCollector struct { maxScore float64 nestedStore *collectStoreNested - descendants [][]index.IndexInternalID } func NewKNNCollector(kArray []int64, size int64) *KNNCollector { From b94610da5bddf92ef89cbc2edebb15765c4930a8 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Thu, 27 Nov 2025 11:08:37 +0530 Subject: [PATCH 18/70] cleanup --- index_impl.go | 7 ++-- search/collector/topn.go | 6 ++-- search/search.go | 75 +++++++++++++++------------------------- 3 files changed, 34 insertions(+), 54 deletions(-) diff --git a/index_impl.go b/index_impl.go index ee1a00edf..719533f4c 100644 --- a/index_impl.go +++ b/index_impl.go @@ -1126,10 +1126,10 @@ func LoadAndHighlightAllFields( return err, totalStoredFieldsBytes } // collect all descendant documents - nestedDocs := make([]*search.NestedDocumentMatch, 0) + nestedDocs := make([]*search.NestedDocumentMatch, 0, root.NumDescendants()) // create a dummy desc DocumentMatch to reuse LoadAndHighlightFields desc := &search.DocumentMatch{} - err = root.Children.IterateDescendants(func(descID index.IndexInternalID) error { + err = root.IterateDescendants(func(descID index.IndexInternalID) error { extID, err := r.ExternalID(descID) if err != nil { return err @@ -1147,7 +1147,8 @@ func LoadAndHighlightAllFields( if len(desc.Fields) != 0 || len(desc.Fragments) != 0 { nestedDocs = append(nestedDocs, search.NewNestedDocumentMatch(desc.Fields, desc.Fragments)) } - desc.Reset() + desc.Fields = nil + desc.Fragments = nil return nil }) if err != nil { diff --git a/search/collector/topn.go b/search/collector/topn.go index 85d405608..640eafff1 100644 --- a/search/collector/topn.go +++ b/search/collector/topn.go @@ -570,10 +570,8 @@ func (hc *TopNCollector) visitFieldTerms(reader index.IndexReader, d *search.Doc } } - // if this is a nested document, we need to visit the doc values - // for all its ancestors as well - // so that facets/sorts can be computed correctly - err := d.Children.IterateDescendants(func(descendant index.IndexInternalID) error { + // first visit descendants if any + err := d.IterateDescendants(func(descendant index.IndexInternalID) error { return hc.dvReader.VisitDocValues(descendant, v) }) if err != nil { diff --git a/search/search.go b/search/search.go index 2490ecae8..ec91daf7b 100644 --- a/search/search.go +++ b/search/search.go @@ -179,9 +179,9 @@ type DocumentMatch struct { // of the current alias view, used in alias of aliases scenario IndexNames []string `json:"index_names,omitempty"` - // Children holds any descendant/child matches that contributed + // Descendants holds the IDs of any child/descendant document that contributed // to this root DocumentMatch. 
- Children DescendantStore `json:"-"` + Descendants map[uint64]index.IndexInternalID `json:"-"` } func (dm *DocumentMatch) AddFieldValue(name string, value interface{}) { @@ -231,11 +231,11 @@ func (dm *DocumentMatch) Reset() *DocumentMatch { for i := range ftls { // recycle the ArrayPositions of each location ftls[i].Location.ArrayPositions = ftls[i].Location.ArrayPositions[:0] } - // remember the Children backing map - children := dm.Children + // remember the Descendants backing map + descendants := dm.Descendants // reset to empty map for Children - for k := range children { - delete(children, k) + for k := range descendants { + delete(descendants, k) } // idiom to copy over from empty DocumentMatch (0 allocations) *dm = DocumentMatch{} @@ -247,7 +247,7 @@ func (dm *DocumentMatch) Reset() *DocumentMatch { // reuse the FieldTermLocations already allocated (and reset len to 0) dm.FieldTermLocations = ftls[:0] // reuse the descendant store if it exists - dm.Children = children + dm.Descendants = descendants return dm } @@ -294,10 +294,6 @@ func (dm *DocumentMatch) Size() int { size.SizeOfPtr } - if dm.Children != nil { - sizeInBytes += dm.Children.Size() - } - return sizeInBytes } @@ -390,17 +386,35 @@ func (dm *DocumentMatch) AddDescendant(other *DocumentMatch) error { dm.ScoreBreakdown = MergeScoreBreakdown(dm.ScoreBreakdown, other.ScoreBreakdown) // add other as descendant only if it is not the same document if !dm.IndexInternalID.Equals(other.IndexInternalID) { - if dm.Children == nil { - dm.Children = NewDescendantStore() + if dm.Descendants == nil { + dm.Descendants = make(map[uint64]index.IndexInternalID) } - err := dm.Children.AddDescendant(other.IndexInternalID) + key, err := other.IndexInternalID.Value() if err != nil { return err } + if _, exists := dm.Descendants[key]; exists { + return nil // already exists + } + // use clone to avoid potential issues with reusing IndexInternalID slices + dm.Descendants[key] = slices.Clone(other.IndexInternalID) } return nil } +func (dm *DocumentMatch) IterateDescendants(fn func(id index.IndexInternalID) error) error { + for _, descendant := range dm.Descendants { + if err := fn(descendant); err != nil { + return err + } + } + return nil +} + +func (dm *DocumentMatch) NumDescendants() int { + return len(dm.Descendants) +} + type DocumentMatchCollection []*DocumentMatch func (c DocumentMatchCollection) Len() int { return len(c) } @@ -448,39 +462,6 @@ func (sc *SearchContext) Size() int { return sizeInBytes } -type DescendantStore map[uint64]index.IndexInternalID - -func NewDescendantStore() DescendantStore { - return make(DescendantStore) -} - -func (ds DescendantStore) AddDescendant(descendant index.IndexInternalID) error { - key, err := descendant.Value() - if err != nil { - return err - } - // use clone to keep the store stateless - ds[key] = slices.Clone(descendant) - return nil -} - -func (ds DescendantStore) IterateDescendants(fn func(descendant index.IndexInternalID) error) error { - for _, descendant := range ds { - if err := fn(descendant); err != nil { - return err - } - } - return nil -} - -func (ds DescendantStore) Size() int { - sizeInBytes := size.SizeOfMap - for _, entry := range ds { - sizeInBytes += size.SizeOfPtr + len(entry) - } - return sizeInBytes -} - // A NestedDocumentMatch is like a DocumentMatch but used for nested documents // and does not have score or locations, or a score and is mainly used to // hold field values and fragments, to be embedded in the parent DocumentMatch From 
204aa2380a14eb3321e6c13f151a713affce7d49 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Thu, 27 Nov 2025 12:18:35 +0530 Subject: [PATCH 19/70] performance optimization 3 --- index/scorch/snapshot_index.go | 13 ++++++++----- index/scorch/snapshot_segment.go | 6 +++--- search/collector/nested.go | 12 ++++++++---- search/searcher/search_conjunction_nested.go | 6 +++--- search/searcher/search_match_all.go | 6 ++++-- 5 files changed, 26 insertions(+), 17 deletions(-) diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index b3e4b772f..c6bc2a5e6 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -1277,19 +1277,22 @@ func (is *IndexSnapshot) TermFrequencies(field string, limit int, descending boo return termFreqs[:limit], nil } -func (i *IndexSnapshot) Ancestors(ID index.IndexInternalID) ([]index.AncestorID, error) { +// Ancestors returns the ancestor IDs for the given document ID. The prealloc +// slice can be provided to avoid allocations downstream, and MUST be empty. +func (i *IndexSnapshot) Ancestors(ID index.IndexInternalID, prealloc []index.AncestorID) ([]index.AncestorID, error) { + // get segment and local doc num for the ID seg, ldoc, err := i.segmentIndexAndLocalDocNum(ID) if err != nil { return nil, err } // get ancestors from the segment - ancestors := i.segment[seg].Ancestors(ldoc) + prealloc = i.segment[seg].Ancestors(ldoc, prealloc) // get global offset for the segment (correcting factor for multi-segment indexes) globalOffset := i.offsets[seg] // adjust ancestors to global doc numbers, not local to segment - for idx := range ancestors { - ancestors[idx] = ancestors[idx].Add(globalOffset) + for idx := range prealloc { + prealloc[idx] = prealloc[idx].Add(globalOffset) } // return adjusted ancestors - return ancestors, nil + return prealloc, nil } diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index 10cafec8f..34f7a4695 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -375,10 +375,10 @@ func (c *cachedMeta) fetchMeta(field string) (rv interface{}) { return rv } -func (s *SegmentSnapshot) Ancestors(docNum uint64) []index.AncestorID { +func (s *SegmentSnapshot) Ancestors(docNum uint64, prealloc []index.AncestorID) []index.AncestorID { nsb, ok := s.segment.(segment.NestedSegment) if !ok { - return []index.AncestorID{index.NewAncestorID(docNum)} + return append(prealloc, index.NewAncestorID(docNum)) } - return nsb.Ancestors(docNum) + return nsb.Ancestors(docNum, prealloc) } diff --git a/search/collector/nested.go b/search/collector/nested.go index 855df74af..a79bc9fac 100644 --- a/search/collector/nested.go +++ b/search/collector/nested.go @@ -27,6 +27,9 @@ type collectStoreNested struct { // the ancestor ID of the current root document being built currRootAncestorID index.AncestorID + + // prealloc slice for ancestor IDs + ancestors []index.AncestorID } func newStoreNested(nr index.NestedReader) *collectStoreNested { @@ -44,16 +47,17 @@ func newStoreNested(nr index.NestedReader) *collectStoreNested { // which is guaranteed by all searchers in bleve. 
func (c *collectStoreNested) ProcessNestedDocument(ctx *search.SearchContext, doc *search.DocumentMatch) (*search.DocumentMatch, error) { // find ancestors for the doc - ancestors, err := c.nr.Ancestors(doc.IndexInternalID) + var err error + c.ancestors, err = c.nr.Ancestors(doc.IndexInternalID, c.ancestors[:0]) if err != nil { return nil, err } - if len(ancestors) == 0 { + if len(c.ancestors) == 0 { // should not happen, every doc should have at least itself as ancestor return nil, nil } // root docID is the last ancestor - rootID := ancestors[len(ancestors)-1] + rootID := c.ancestors[len(c.ancestors)-1] // check if there is an interim root already and if the incoming doc belongs to it if c.currRoot != nil && c.currRootAncestorID.Equals(rootID) { // there is an interim root already, and the incoming doc belongs to it @@ -72,7 +76,7 @@ func (c *collectStoreNested) ProcessNestedDocument(ctx *search.SearchContext, do } // no interim root for now so either we have a root document incoming // or we have a child doc and need to create an interim root - if len(ancestors) == 1 { + if len(c.ancestors) == 1 { // incoming doc is the root itself c.currRoot = doc c.currRootAncestorID = rootID diff --git a/search/searcher/search_conjunction_nested.go b/search/searcher/search_conjunction_nested.go index 964e94799..f321f1d4f 100644 --- a/search/searcher/search_conjunction_nested.go +++ b/search/searcher/search_conjunction_nested.go @@ -164,7 +164,7 @@ func (s *NestedConjunctionSearcher) Next(ctx *search.SearchContext) (*search.Doc return nil, nil } // get the ancestry chain for this match - s.currAncestors[i], err = s.nestedReader.Ancestors(s.currs[i].IndexInternalID) + s.currAncestors[i], err = s.nestedReader.Ancestors(s.currs[i].IndexInternalID, s.currAncestors[i][:0]) if err != nil { return nil, err } @@ -221,7 +221,7 @@ OUTER: return nil, nil } // recalc ancestors - s.currAncestors[i], err = s.nestedReader.Ancestors(s.currs[i].IndexInternalID) + s.currAncestors[i], err = s.nestedReader.Ancestors(s.currs[i].IndexInternalID, s.currAncestors[i][:0]) if err != nil { return nil, err } @@ -263,7 +263,7 @@ OUTER: break } // recalc ancestors - s.currAncestors[i], err = s.nestedReader.Ancestors(s.currs[i].IndexInternalID) + s.currAncestors[i], err = s.nestedReader.Ancestors(s.currs[i].IndexInternalID, s.currAncestors[i][:0]) if err != nil { return nil, err } diff --git a/search/searcher/search_match_all.go b/search/searcher/search_match_all.go index 3ce7a69d9..57966a924 100644 --- a/search/searcher/search_match_all.go +++ b/search/searcher/search_match_all.go @@ -37,6 +37,7 @@ type MatchAllSearcher struct { scorer *scorer.ConstantScorer count uint64 nested bool + ancestors []index.AncestorID } func NewMatchAllSearcher(ctx context.Context, indexReader index.IndexReader, boost float64, options search.SearcherOptions) (*MatchAllSearcher, error) { @@ -86,13 +87,14 @@ func (s *MatchAllSearcher) isNested(id index.IndexInternalID) bool { if !s.nested { return false } + var err error // check if this doc has ancestors, if so it is nested if nr, ok := s.reader.(index.NestedReader); ok { - anc, err := nr.Ancestors(id) + s.ancestors, err = nr.Ancestors(id, s.ancestors[:0]) if err != nil { return false } - return len(anc) > 1 + return len(s.ancestors) > 1 } return false } From 056d96355435523f3fdb2c3e0dd3c39f80cb6446 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Thu, 27 Nov 2025 15:27:19 +0530 Subject: [PATCH 20/70] fix knn --- search/collector/knn.go | 94 ++++++++++++++++++++++++++--------------- 1 file changed, 59 
insertions(+), 35 deletions(-) diff --git a/search/collector/knn.go b/search/collector/knn.go index 1c6a6765a..401d92be4 100644 --- a/search/collector/knn.go +++ b/search/collector/knn.go @@ -19,6 +19,7 @@ package collector import ( "context" + "slices" "time" "github.com/blevesearch/bleve/v2/search" @@ -191,10 +192,8 @@ func (hc *KNNCollector) Collect(ctx context.Context, searcher search.Searcher, r default: next, err = searcher.Next(searchContext) } - - var totalDocs uint64 for err == nil && next != nil { - if totalDocs%CheckDoneEvery == 0 { + if hc.total%CheckDoneEvery == 0 { select { case <-ctx.Done(): search.RecordSearchCost(ctx, search.AbortM, 0) @@ -202,42 +201,19 @@ func (hc *KNNCollector) Collect(ctx context.Context, searcher search.Searcher, r default: } } - totalDocs++ + hc.total++ - if hc.nestedStore != nil { - next, err = hc.nestedStore.ProcessNestedDocument(searchContext, next) - if err != nil { - break - } - } - if next != nil { - err = dmHandler(next) - if err != nil { - break - } - // increment total only for actual(root) collected documents - hc.total++ + err = dmHandler(next) + if err != nil { + break } + next, err = searcher.Next(searchContext) } if err != nil { return err } - // if we have a nested store, we may have an interim root - // that needs to be finalized now - if hc.nestedStore != nil { - currRoot := hc.nestedStore.CurrentRoot() - if currRoot != nil { - // process the interim root now - err = dmHandler(currRoot) - if err != nil { - return err - } - hc.total++ - } - } - // help finalize/flush the results in case // of custom document match handlers. err = dmHandler(nil) @@ -249,26 +225,74 @@ func (hc *KNNCollector) Collect(ctx context.Context, searcher search.Searcher, r hc.took = time.Since(startTime) // finalize actual results - err = hc.finalizeResults(reader) + err = hc.finalizeResults(searchContext, reader) if err != nil { return err } return nil } -func (hc *KNNCollector) finalizeResults(r index.IndexReader) error { +func (hc *KNNCollector) finalizeResults(ctx *search.SearchContext, r index.IndexReader) error { var err error - hc.results, err = hc.knnStore.Final(func(doc *search.DocumentMatch) error { + // finalize the KNN store results + // if collector is used in non-nested mode, then directly finalize from knnStore + docFixup := func(doc *search.DocumentMatch) error { if doc.ID == "" { // look up the id since we need it for lookup - var err error doc.ID, err = r.ExternalID(doc.IndexInternalID) if err != nil { return err } } return nil + } + if hc.nestedStore == nil { + hc.results, err = hc.knnStore.Final(docFixup) + return err + } + // knn collector is used in nested mode, this means that the documents + // in the knnStore need to be further processed to build the root documents + // first get the raw results without any fixup + rawResults, err := hc.knnStore.Final(nil) + if err != nil { + return err + } + // now sort all the document matches by indexInternalID to ensure that + // the nested processing works correctly, as it expects documents to be + // added in increasing order of internal IDs + slices.SortFunc(rawResults, func(i, j *search.DocumentMatch) int { + return index.IndexInternalID.Compare(i.IndexInternalID, j.IndexInternalID) }) + finalResults := make(search.DocumentMatchCollection, 0, len(rawResults)) + // now process each document through the nested store + for _, doc := range rawResults { + // override doc with the returned root document match, if any + doc, err = hc.nestedStore.ProcessNestedDocument(ctx, doc) + if err != nil { + return 
err + } + // if doc is nil, it means the incoming doc was merged into its parent + // and no root document is ready yet + if doc != nil { + // completed root document match, do fixup and add to results + err = docFixup(doc) + if err != nil { + return err + } + finalResults = append(finalResults, doc) + } + } + // finally, check if there is an interim root document left to be returned + doc := hc.nestedStore.CurrentRoot() + if doc != nil { + // completed root document match, do fixup and add to results + err = docFixup(doc) + if err != nil { + return err + } + finalResults = append(finalResults, doc) + } + hc.results = finalResults return err } From e13bcc510fc20fbbdb846ad789db8ac5fc266e9f Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Thu, 27 Nov 2025 15:57:07 +0530 Subject: [PATCH 21/70] add vector use case in md --- docs/hierarchy.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/hierarchy.md b/docs/hierarchy.md index 9518b899e..d2024e461 100644 --- a/docs/hierarchy.md +++ b/docs/hierarchy.md @@ -156,6 +156,10 @@ like vector search, synonym search, hybrid and pre-filtered vector search integr * Aggregations/Faceting: Facets can be computed over terms that exist inside nested objects, providing more accurate, context-aware aggregation results. A facet on `departments.projects.status` will produce buckets such as `active`, `paused`, `completed` only for the matched departments, instead of aggregating project status across the entire company document. * Sorting: Sorting can be applied using fields from nested objects. This allows sorting results based on values inside the appropriate nested structure. For example, sorting companies by `departments.budget` (descending) will order documents based on the budget of the specific department involved in the match, rather than the overall document or unrelated departments. +* Vector Search (KNN / Multi-KNN): When an array of objects is marked as nested and contains vector fields, each vector is treated as belonging to its own nested document. Vector similarity is computed only within the same nested object, not across siblings. For example, if `departments.employees` is a nested array where each employee has a `skills_vector`, a KNN search using the embedding of `machine learning engineer` will match only employees whose own `skills_vector` is similar—other employees vectors within the same department or document do not contribute to the score or match. This also means that a vector search query for `K = 3` will return the top 3 most similar employees across all departments and all companies, and may return multiple employees from the same department or company if they rank among the top 3 most similar overall. + +* Pre-Filtered Vector Search: When vector search is combined with filters on fields inside a nested array, the filters are applied first to pick which nested items are eligible. The vector search then runs only on those filtered items. For example, if `departments.employees` is a `nested` array, a pre-filtered KNN query for employees with the role `Manager` in the `Sales` department will first narrow the candidate set to only employees who meet those field conditions, and then compute vector similarity on the `skills_vector` of that filtered subset. This ensures that vector search results come only from the employees that satisfy the filter, while still treating each employee as an independent vector candidate. 
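A minimal sketch of the pre-filtered case using the Bleve API is shown below. The field names follow the example above; the function name, the query embedding (`queryVec`), the surrounding mapping setup, and the fact that the index was built with vector support (`vectors` build tag) are assumptions, so treat this as illustrative rather than a complete program.

```go
import "github.com/blevesearch/bleve/v2"

// searchManagersInSales (hypothetical helper) sketches a pre-filtered KNN query:
// the filter narrows the candidate set to eligible nested employees first, and
// vector similarity then runs only on that filtered subset.
func searchManagersInSales(idx bleve.Index, queryVec []float32) (*bleve.SearchResult, error) {
	roleQ := bleve.NewMatchQuery("Manager")
	roleQ.SetField("departments.employees.role")
	deptQ := bleve.NewMatchQuery("Sales")
	deptQ.SetField("departments.name")
	filter := bleve.NewConjunctionQuery(roleQ, deptQ)

	req := bleve.NewSearchRequest(bleve.NewMatchNoneQuery())
	// K = 3: the three most similar filtered employees across all documents
	req.AddKNNWithFilter("departments.employees.skills_vector", queryVec, 3, 1.0, filter)
	return idx.Search(req)
}
```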
+ ## Indexing Below is an example of using the Bleve API to index documents with hierarchical structures, using hybrid arrays and nested mappings. From 7c58c80c9eb63313dddeef0016a53fe074dc5ddb Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Thu, 27 Nov 2025 21:17:33 +0530 Subject: [PATCH 22/70] perf optimization 4 --- search/collector/knn.go | 2 +- search/searcher/search_conjunction_nested.go | 52 ++++++++++---------- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/search/collector/knn.go b/search/collector/knn.go index 401d92be4..8a9488d9a 100644 --- a/search/collector/knn.go +++ b/search/collector/knn.go @@ -261,7 +261,7 @@ func (hc *KNNCollector) finalizeResults(ctx *search.SearchContext, r index.Index // the nested processing works correctly, as it expects documents to be // added in increasing order of internal IDs slices.SortFunc(rawResults, func(i, j *search.DocumentMatch) int { - return index.IndexInternalID.Compare(i.IndexInternalID, j.IndexInternalID) + return i.IndexInternalID.Compare(j.IndexInternalID) }) finalResults := make(search.DocumentMatchCollection, 0, len(rawResults)) // now process each document through the nested store diff --git a/search/searcher/search_conjunction_nested.go b/search/searcher/search_conjunction_nested.go index f321f1d4f..a863ed9f0 100644 --- a/search/searcher/search_conjunction_nested.go +++ b/search/searcher/search_conjunction_nested.go @@ -15,11 +15,11 @@ package searcher import ( - "container/heap" "context" "fmt" "math" "reflect" + "slices" "github.com/blevesearch/bleve/v2/search" "github.com/blevesearch/bleve/v2/size" @@ -276,6 +276,8 @@ OUTER: } } } + // finalize the docQueue for dequeueing + s.docQueue.Finalize() // finally return the first buffered match return s.docQueue.Dequeue() } @@ -312,10 +314,13 @@ func NewCoalesceQueue() *CoalesceQueue { order: make([]*search.DocumentMatch, 0), items: make(map[uint64]*search.DocumentMatch), } - heap.Init(cq) return cq } +// Enqueue adds the given DocumentMatch to the queue. If a DocumentMatch with the same +// IndexInternalID already exists in the queue, it merges the scores and explanations, +// and returns the given DocumentMatch for recycling. If it's a new entry, it adds it +// to the queue and returns nil. func (cq *CoalesceQueue) Enqueue(it *search.DocumentMatch) (*search.DocumentMatch, error) { val, err := it.IndexInternalID.Value() if err != nil { @@ -335,17 +340,33 @@ func (cq *CoalesceQueue) Enqueue(it *search.DocumentMatch) (*search.DocumentMatc // first time we see this ID — enqueue cq.items[val] = it - heap.Push(cq, it) + // append to order slice (this is a stack) + cq.order = append(cq.order, it) // no recycling needed as we added a new item return nil, nil } +// Finalize prepares the queue for dequeue operations by sorting the items based on +// their IndexInternalID values. This MUST be called before any Dequeue operations, +// and after all Enqueue operations are complete. The sort is done in descending order +// so that dequeueing will basically be popping from the end of the slice, allowing for +// slice reuse. +func (cq *CoalesceQueue) Finalize() { + slices.SortFunc(cq.order, func(a, b *search.DocumentMatch) int { + return b.IndexInternalID.Compare(a.IndexInternalID) + }) +} + +// Dequeue removes and returns the next DocumentMatch from the queue in sorted order. +// If the queue is empty, it returns nil. 
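// An illustrative usage sketch of the queue contract (assumed caller, not lifted
// from the searcher itself): every Enqueue happens before the single Finalize,
// and Dequeue then drains matches in ascending internal-ID order:
//
//	for _, dm := range matches {
//	    if recycle, err := cq.Enqueue(dm); err == nil && recycle != nil {
//	        ctx.DocumentMatchPool.Put(recycle) // duplicate ID was merged away
//	    }
//	}
//	cq.Finalize() // sort once, descending, so Dequeue can pop from the end
//	for cq.Len() > 0 {
//	    dm, _ := cq.Dequeue() // yields ascending IndexInternalID order
//	    // ... use dm ...
//	}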
func (cq *CoalesceQueue) Dequeue() (*search.DocumentMatch, error) { if cq.Len() == 0 { return nil, nil } - rv := heap.Pop(cq).(*search.DocumentMatch) + // pop from end of slice + rv := cq.order[len(cq.order)-1] + cq.order = cq.order[:len(cq.order)-1] val, err := rv.IndexInternalID.Value() if err != nil { @@ -356,28 +377,7 @@ func (cq *CoalesceQueue) Dequeue() (*search.DocumentMatch, error) { return rv, nil } -// heap implementation - +// Len returns the number of DocumentMatch items currently in the queue. func (cq *CoalesceQueue) Len() int { return len(cq.order) } - -func (cq *CoalesceQueue) Less(i, j int) bool { - return cq.order[i].IndexInternalID.Compare(cq.order[j].IndexInternalID) < 0 -} - -func (cq *CoalesceQueue) Swap(i, j int) { - cq.order[i], cq.order[j] = cq.order[j], cq.order[i] -} - -func (cq *CoalesceQueue) Push(x any) { - cq.order = append(cq.order, x.(*search.DocumentMatch)) -} - -func (cq *CoalesceQueue) Pop() any { - old := cq.order - n := len(old) - x := old[n-1] - cq.order = old[:n-1] - return x -} From 9ee54b1efaf24d8665b0b7631397c45de0a3394c Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Thu, 27 Nov 2025 22:08:32 +0530 Subject: [PATCH 23/70] use clear --- search/search.go | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/search/search.go b/search/search.go index ec91daf7b..36043a293 100644 --- a/search/search.go +++ b/search/search.go @@ -233,10 +233,8 @@ func (dm *DocumentMatch) Reset() *DocumentMatch { } // remember the Descendants backing map descendants := dm.Descendants - // reset to empty map for Children - for k := range descendants { - delete(descendants, k) - } + // reset to empty map for Descendants + clear(descendants) // idiom to copy over from empty DocumentMatch (0 allocations) *dm = DocumentMatch{} // reuse the []byte already allocated (and reset len to 0) From 2f6769af5262cce30b150a7cb5e6acea9453cbc7 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Fri, 28 Nov 2025 01:04:22 +0530 Subject: [PATCH 24/70] perf opt 7 --- index_impl.go | 12 +++---- search/collector/nested.go | 2 +- search/collector/topn.go | 12 +++---- search/search.go | 38 +++++--------------- search/searcher/search_conjunction_nested.go | 16 ++++++++- search/searcher/search_knn.go | 2 +- 6 files changed, 35 insertions(+), 47 deletions(-) diff --git a/index_impl.go b/index_impl.go index 719533f4c..eac400ad1 100644 --- a/index_impl.go +++ b/index_impl.go @@ -1126,13 +1126,13 @@ func LoadAndHighlightAllFields( return err, totalStoredFieldsBytes } // collect all descendant documents - nestedDocs := make([]*search.NestedDocumentMatch, 0, root.NumDescendants()) + nestedDocs := make([]*search.NestedDocumentMatch, 0, len(root.Descendants)) // create a dummy desc DocumentMatch to reuse LoadAndHighlightFields desc := &search.DocumentMatch{} - err = root.IterateDescendants(func(descID index.IndexInternalID) error { + for _, descID := range root.Descendants { extID, err := r.ExternalID(descID) if err != nil { - return err + return err, totalStoredFieldsBytes } // reset desc for reuse desc.ID = extID @@ -1141,7 +1141,7 @@ func LoadAndHighlightAllFields( err, bytes := LoadAndHighlightFields(desc, req, indexName, r, highlighter) totalStoredFieldsBytes += bytes if err != nil { - return err + return err, totalStoredFieldsBytes } // copy fields to nested doc and append if len(desc.Fields) != 0 || len(desc.Fragments) != 0 { @@ -1149,10 +1149,6 @@ func LoadAndHighlightAllFields( } desc.Fields = nil desc.Fragments = nil - return nil - }) - if err != nil { - return err, 
totalStoredFieldsBytes } // add nested documents to root under _$nested key if len(nestedDocs) > 0 { diff --git a/search/collector/nested.go b/search/collector/nested.go index a79bc9fac..1442d822c 100644 --- a/search/collector/nested.go +++ b/search/collector/nested.go @@ -84,7 +84,7 @@ func (c *collectStoreNested) ProcessNestedDocument(ctx *search.SearchContext, do } // this is a child doc, create interim root newDM := ctx.DocumentMatchPool.Get() - newDM.IndexInternalID = rootID.ToIndexInternalID() + newDM.IndexInternalID = rootID.ToIndexInternalID(newDM.IndexInternalID) // merge the incoming doc into the new interim root c.currRoot = newDM c.currRootAncestorID = rootID diff --git a/search/collector/topn.go b/search/collector/topn.go index 640eafff1..61ac8f640 100644 --- a/search/collector/topn.go +++ b/search/collector/topn.go @@ -571,14 +571,14 @@ func (hc *TopNCollector) visitFieldTerms(reader index.IndexReader, d *search.Doc } // first visit descendants if any - err := d.IterateDescendants(func(descendant index.IndexInternalID) error { - return hc.dvReader.VisitDocValues(descendant, v) - }) - if err != nil { - return err + for _, descID := range d.Descendants { + err := hc.dvReader.VisitDocValues(descID, v) + if err != nil { + return err + } } // now visit the doc values for this document - err = hc.dvReader.VisitDocValues(d.IndexInternalID, v) + err := hc.dvReader.VisitDocValues(d.IndexInternalID, v) if hc.facetsBuilder != nil { hc.facetsBuilder.EndDoc() } diff --git a/search/search.go b/search/search.go index 36043a293..e24a26f08 100644 --- a/search/search.go +++ b/search/search.go @@ -181,7 +181,7 @@ type DocumentMatch struct { // Descendants holds the IDs of any child/descendant document that contributed // to this root DocumentMatch. - Descendants map[uint64]index.IndexInternalID `json:"-"` + Descendants []index.IndexInternalID `json:"-"` } func (dm *DocumentMatch) AddFieldValue(name string, value interface{}) { @@ -226,6 +226,8 @@ func (dm *DocumentMatch) Reset() *DocumentMatch { indexInternalID := dm.IndexInternalID // remember the []interface{} used for sort sort := dm.Sort + // remember the []interface{} used for DecodedSort + decodedSort := dm.DecodedSort // remember the FieldTermLocations backing array ftls := dm.FieldTermLocations for i := range ftls { // recycle the ArrayPositions of each location @@ -233,19 +235,18 @@ func (dm *DocumentMatch) Reset() *DocumentMatch { } // remember the Descendants backing map descendants := dm.Descendants - // reset to empty map for Descendants - clear(descendants) // idiom to copy over from empty DocumentMatch (0 allocations) *dm = DocumentMatch{} // reuse the []byte already allocated (and reset len to 0) dm.IndexInternalID = indexInternalID[:0] // reuse the []interface{} already allocated (and reset len to 0) dm.Sort = sort[:0] - dm.DecodedSort = dm.DecodedSort[:0] + // reuse the []interface{} already allocated (and reset len to 0) + dm.DecodedSort = decodedSort[:0] // reuse the FieldTermLocations already allocated (and reset len to 0) dm.FieldTermLocations = ftls[:0] - // reuse the descendant store if it exists - dm.Descendants = descendants + // reuse the Descendants already allocated (and reset len to 0) + dm.Descendants = descendants[:0] return dm } @@ -384,35 +385,12 @@ func (dm *DocumentMatch) AddDescendant(other *DocumentMatch) error { dm.ScoreBreakdown = MergeScoreBreakdown(dm.ScoreBreakdown, other.ScoreBreakdown) // add other as descendant only if it is not the same document if !dm.IndexInternalID.Equals(other.IndexInternalID) 
{ - if dm.Descendants == nil { - dm.Descendants = make(map[uint64]index.IndexInternalID) - } - key, err := other.IndexInternalID.Value() - if err != nil { - return err - } - if _, exists := dm.Descendants[key]; exists { - return nil // already exists - } // use clone to avoid potential issues with reusing IndexInternalID slices - dm.Descendants[key] = slices.Clone(other.IndexInternalID) + dm.Descendants = append(dm.Descendants, slices.Clone(other.IndexInternalID)) } return nil } -func (dm *DocumentMatch) IterateDescendants(fn func(id index.IndexInternalID) error) error { - for _, descendant := range dm.Descendants { - if err := fn(descendant); err != nil { - return err - } - } - return nil -} - -func (dm *DocumentMatch) NumDescendants() int { - return len(dm.Descendants) -} - type DocumentMatchCollection []*DocumentMatch func (c DocumentMatchCollection) Len() int { return len(c) } diff --git a/search/searcher/search_conjunction_nested.go b/search/searcher/search_conjunction_nested.go index a863ed9f0..f6ece9572 100644 --- a/search/searcher/search_conjunction_nested.go +++ b/search/searcher/search_conjunction_nested.go @@ -44,6 +44,8 @@ type NestedConjunctionSearcher struct { joinIdx int options search.SearcherOptions docQueue *CoalesceQueue + // reusable ID buffer for Advance() calls + advanceID index.IndexInternalID } func NewNestedConjunctionSearcher(ctx context.Context, indexReader index.IndexReader, @@ -203,6 +205,8 @@ OUTER: maxKey = currKey } } + // convert maxKey to advanceID for Advance calls + advanceID := s.toAdvanceID(maxKey) // now try to align all other searchers to the // we check if the a searchers key matches maxKey // if not, we advance the pivot searcher to maxKey @@ -212,7 +216,7 @@ OUTER: // not aligned, so advance this searcher to maxKey var err error ctx.DocumentMatchPool.Put(s.currs[i]) - s.currs[i], err = s.searchers[i].Advance(ctx, maxKey.ToIndexInternalID()) + s.currs[i], err = s.searchers[i].Advance(ctx, advanceID) if err != nil { return nil, err } @@ -287,6 +291,16 @@ func (s *NestedConjunctionSearcher) getKeyForIdx(i int) index.AncestorID { return s.currAncestors[i][len(s.currAncestors[i])-s.joinIdx-1] } +// toAdvanceID converts an AncestorID to IndexInternalID, reusing the advanceID buffer. +// The returned ID is safe to pass to Advance() since Advance() never retains references. 
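// An illustrative call pattern (assumed caller, not from the source): the
// returned slice aliases s.advanceID, so it is only valid until the next call.
//
//	id := s.toAdvanceID(maxKey)                // reuses s.advanceID's backing array
//	dm, err := s.searchers[i].Advance(ctx, id) // safe: Advance never retains id
//	// do not keep id around; the next toAdvanceID call overwrites the buffer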
+func (s *NestedConjunctionSearcher) toAdvanceID(key index.AncestorID) index.IndexInternalID { + // Reset length to 0 while preserving capacity for buffer reuse + s.advanceID = s.advanceID[:0] + // Convert key to IndexInternalID, reusing the underlying buffer + s.advanceID = key.ToIndexInternalID(s.advanceID) + return s.advanceID +} + func (s *NestedConjunctionSearcher) Advance(ctx *search.SearchContext, ID index.IndexInternalID) (*search.DocumentMatch, error) { for { next, err := s.Next(ctx) diff --git a/search/searcher/search_knn.go b/search/searcher/search_knn.go index a95a714b3..26d5a2af1 100644 --- a/search/searcher/search_knn.go +++ b/search/searcher/search_knn.go @@ -84,7 +84,7 @@ func (s *KNNSearcher) VectorOptimize(ctx context.Context, octx index.VectorOptim func (s *KNNSearcher) Advance(ctx *search.SearchContext, ID index.IndexInternalID) ( *search.DocumentMatch, error) { - knnMatch, err := s.vectorReader.Next(s.vd.Reset()) + knnMatch, err := s.vectorReader.Advance(ID, s.vd.Reset()) if err != nil { return nil, err } From 48411731c20c021d289423115d46ed8dd82723cf Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Fri, 28 Nov 2025 11:03:49 +0530 Subject: [PATCH 25/70] rewire api to avoid alloc --- search/search.go | 2 +- search/searcher/search_conjunction_nested.go | 4 +- search/util.go | 49 +++++++++++++++----- 3 files changed, 40 insertions(+), 15 deletions(-) diff --git a/search/search.go b/search/search.go index e24a26f08..b5d102c30 100644 --- a/search/search.go +++ b/search/search.go @@ -380,7 +380,7 @@ func (dm *DocumentMatch) AddDescendant(other *DocumentMatch) error { // merge explanations dm.Expl = dm.Expl.MergeWith(other.Expl) // merge field term locations - dm.FieldTermLocations = MergeFieldTermLocations(dm.FieldTermLocations, []*DocumentMatch{other}) + dm.FieldTermLocations = MergeFieldTermLocationsFromMatch(dm.FieldTermLocations, other) // merge score breakdown dm.ScoreBreakdown = MergeScoreBreakdown(dm.ScoreBreakdown, other.ScoreBreakdown) // add other as descendant only if it is not the same document diff --git a/search/searcher/search_conjunction_nested.go b/search/searcher/search_conjunction_nested.go index f6ece9572..bf1f807ec 100644 --- a/search/searcher/search_conjunction_nested.go +++ b/search/searcher/search_conjunction_nested.go @@ -346,8 +346,8 @@ func (cq *CoalesceQueue) Enqueue(it *search.DocumentMatch) (*search.DocumentMatc // merge with current version existing.Score += it.Score existing.Expl = existing.Expl.MergeWith(it.Expl) - existing.FieldTermLocations = search.MergeFieldTermLocations( - existing.FieldTermLocations, []*search.DocumentMatch{it}) + existing.FieldTermLocations = search.MergeFieldTermLocationsFromMatch( + existing.FieldTermLocations, it) // return it to caller for recycling return it, nil } diff --git a/search/util.go b/search/util.go index 2f1d764b1..7b06ec1d8 100644 --- a/search/util.go +++ b/search/util.go @@ -50,30 +50,55 @@ func MergeTermLocationMaps(rv, other TermLocationMap) TermLocationMap { func MergeFieldTermLocations(dest []FieldTermLocation, matches []*DocumentMatch) []FieldTermLocation { n := len(dest) for _, dm := range matches { - n += len(dm.FieldTermLocations) + if dm != nil { + n += len(dm.FieldTermLocations) + } } if cap(dest) < n { dest = append(make([]FieldTermLocation, 0, n), dest...) 
} for _, dm := range matches { - for _, ftl := range dm.FieldTermLocations { - dest = append(dest, FieldTermLocation{ - Field: ftl.Field, - Term: ftl.Term, - Location: Location{ - Pos: ftl.Location.Pos, - Start: ftl.Location.Start, - End: ftl.Location.End, - ArrayPositions: append(ArrayPositions(nil), ftl.Location.ArrayPositions...), - }, - }) + if dm != nil { + dest = mergeFieldTermLocationFromMatch(dest, dm) } } return dest } +// MergeFieldTermLocationsFromMatch merges field term locations from a single DocumentMatch +// into dest, returning the updated slice. +func MergeFieldTermLocationsFromMatch(dest []FieldTermLocation, match *DocumentMatch) []FieldTermLocation { + if match == nil { + return dest + } + n := len(dest) + len(match.FieldTermLocations) + if cap(dest) < n { + dest = append(make([]FieldTermLocation, 0, n), dest...) + } + return mergeFieldTermLocationFromMatch(dest, match) +} + +// mergeFieldTermLocationFromMatch appends field term locations from a DocumentMatch into dest. +// Assumes dest has sufficient capacity. +func mergeFieldTermLocationFromMatch(dest []FieldTermLocation, dm *DocumentMatch) []FieldTermLocation { + for _, ftl := range dm.FieldTermLocations { + dest = append(dest, FieldTermLocation{ + Field: ftl.Field, + Term: ftl.Term, + Location: Location{ + Pos: ftl.Location.Pos, + Start: ftl.Location.Start, + End: ftl.Location.End, + ArrayPositions: append(ArrayPositions(nil), ftl.Location.ArrayPositions...), + }, + }) + } + + return dest +} + // MergeScoreBreakdown merges two score breakdown maps together func MergeScoreBreakdown(first, second map[int]float64) map[int]float64 { if first == nil { From 2e318dc2b723c2bbcb35fe6cb463f45e0659230f Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Fri, 28 Nov 2025 15:09:16 +0530 Subject: [PATCH 26/70] use new API --- search/scorer/scorer_knn.go | 2 +- search/scorer/scorer_term.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/search/scorer/scorer_knn.go b/search/scorer/scorer_knn.go index 8d9043427..06f50cd4a 100644 --- a/search/scorer/scorer_knn.go +++ b/search/scorer/scorer_knn.go @@ -123,7 +123,7 @@ func (sqs *KNNQueryScorer) Score(ctx *search.SearchContext, if sqs.options.Explain { rv.Expl = scoreExplanation } - rv.IndexInternalID = append(rv.IndexInternalID, knnMatch.ID...) + rv.IndexInternalID = index.NewIndexInternalIDFrom(rv.IndexInternalID, knnMatch.ID) return rv } diff --git a/search/scorer/scorer_term.go b/search/scorer/scorer_term.go index f5f8ec935..d7e77f977 100644 --- a/search/scorer/scorer_term.go +++ b/search/scorer/scorer_term.go @@ -243,7 +243,7 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term } } - rv.IndexInternalID = append(rv.IndexInternalID, termMatch.ID...) 
+ rv.IndexInternalID = index.NewIndexInternalIDFrom(rv.IndexInternalID, termMatch.ID) if len(termMatch.Vectors) > 0 { if cap(rv.FieldTermLocations) < len(termMatch.Vectors) { From fa38d98509bf01735427f4f03f0b5edbdb8ca939 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Fri, 28 Nov 2025 16:23:40 +0530 Subject: [PATCH 27/70] reuse descendantIDs --- search/search.go | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/search/search.go b/search/search.go index b5d102c30..e7b6ac46c 100644 --- a/search/search.go +++ b/search/search.go @@ -385,8 +385,16 @@ func (dm *DocumentMatch) AddDescendant(other *DocumentMatch) error { dm.ScoreBreakdown = MergeScoreBreakdown(dm.ScoreBreakdown, other.ScoreBreakdown) // add other as descendant only if it is not the same document if !dm.IndexInternalID.Equals(other.IndexInternalID) { - // use clone to avoid potential issues with reusing IndexInternalID slices - dm.Descendants = append(dm.Descendants, slices.Clone(other.IndexInternalID)) + // Add a copy of other.IndexInternalID to descendants, because + // other.IndexInternalID will be reset when 'other' is recycled. + var descendantID index.IndexInternalID + // first check if dm's descendants slice has capacity to reuse + if len(dm.Descendants) < cap(dm.Descendants) { + // reuse the buffer element at len(dm.Descendants) by reslicing + descendantID = dm.Descendants[:len(dm.Descendants)+1][len(dm.Descendants)] + } + // copy the contents of other.IndexInternalID into descendantID, allocating if needed + dm.Descendants = append(dm.Descendants, index.NewIndexInternalIDFrom(descendantID, other.IndexInternalID)) } return nil } From 7857fefa3ed96d6b36509c4380f5cd2e3cad588b Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Sat, 29 Nov 2025 00:27:54 +0530 Subject: [PATCH 28/70] fix bug --- search/search.go | 7 +- search/searcher/search_conjunction_nested.go | 69 ++++++++++++-------- 2 files changed, 45 insertions(+), 31 deletions(-) diff --git a/search/search.go b/search/search.go index e7b6ac46c..fae97a83e 100644 --- a/search/search.go +++ b/search/search.go @@ -233,8 +233,11 @@ func (dm *DocumentMatch) Reset() *DocumentMatch { for i := range ftls { // recycle the ArrayPositions of each location ftls[i].Location.ArrayPositions = ftls[i].Location.ArrayPositions[:0] } - // remember the Descendants backing map + // remember the Descendants backing array descendants := dm.Descendants + for i := range descendants { // recycle each IndexInternalID + descendants[i] = descendants[i][:0] + } // idiom to copy over from empty DocumentMatch (0 allocations) *dm = DocumentMatch{} // reuse the []byte already allocated (and reset len to 0) @@ -390,7 +393,7 @@ func (dm *DocumentMatch) AddDescendant(other *DocumentMatch) error { var descendantID index.IndexInternalID // first check if dm's descendants slice has capacity to reuse if len(dm.Descendants) < cap(dm.Descendants) { - // reuse the buffer element at len(dm.Descendants) by reslicing + // reuse the buffer element at len(dm.Descendants) descendantID = dm.Descendants[:len(dm.Descendants)+1][len(dm.Descendants)] } // copy the contents of other.IndexInternalID into descendantID, allocating if needed diff --git a/search/searcher/search_conjunction_nested.go b/search/searcher/search_conjunction_nested.go index bf1f807ec..bca4022d8 100644 --- a/search/searcher/search_conjunction_nested.go +++ b/search/searcher/search_conjunction_nested.go @@ -148,41 +148,52 @@ func (s *NestedConjunctionSearcher) DocumentMatchPoolSize() int { return rv } -func (s 
*NestedConjunctionSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch, error) { +func (s *NestedConjunctionSearcher) initialize(ctx *search.SearchContext) (bool, error) { var err error + for i, searcher := range s.searchers { + if s.currs[i] != nil { + ctx.DocumentMatchPool.Put(s.currs[i]) + } + s.currs[i], err = searcher.Next(ctx) + if err != nil { + return false, err + } + if s.currs[i] == nil { + // one of the searchers is exhausted, so we are done + return true, nil + } + // get the ancestry chain for this match + s.currAncestors[i], err = s.nestedReader.Ancestors(s.currs[i].IndexInternalID, s.currAncestors[i][:0]) + if err != nil { + return false, err + } + // check if the ancestry chain is > joinIdx, if not we reset the joinIdx + // to the minimum possible value across all searchers, ideally this will be + // done in query construction time itself, by using the covering depth across + // all sub-queries, but we do this here as a fallback + if s.joinIdx >= len(s.currAncestors[i]) { + s.joinIdx = len(s.currAncestors[i]) - 1 + } + } + // build currKeys for each searcher, do it here as we may have adjusted joinIdx + for i := range s.searchers { + s.currKeys[i] = s.getKeyForIdx(i) + } + s.initialized = true + return false, nil +} + +func (s *NestedConjunctionSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch, error) { // initialize on first call to Next, by getting first match // from each searcher and their ancestry chains if !s.initialized { - for i, searcher := range s.searchers { - if s.currs[i] != nil { - ctx.DocumentMatchPool.Put(s.currs[i]) - } - s.currs[i], err = searcher.Next(ctx) - if err != nil { - return nil, err - } - if s.currs[i] == nil { - // one of the searchers is exhausted, so we are done - return nil, nil - } - // get the ancestry chain for this match - s.currAncestors[i], err = s.nestedReader.Ancestors(s.currs[i].IndexInternalID, s.currAncestors[i][:0]) - if err != nil { - return nil, err - } - // check if the ancestry chain is > joinIdx, if not we reset the joinIdx - // to the minimum possible value across all searchers, ideally this will be - // done in query construction time itself, by using the covering depth across - // all sub-queries, but we do this here as a fallback - if s.joinIdx >= len(s.currAncestors[i]) { - s.joinIdx = len(s.currAncestors[i]) - 1 - } + done, err := s.initialize(ctx) + if err != nil { + return nil, err } - // build currKeys for each searcher, do it here as we may have adjusted joinIdx - for i := range s.searchers { - s.currKeys[i] = s.getKeyForIdx(i) + if done { + return nil, nil } - s.initialized = true } // check if the docQueue has any buffered matches if s.docQueue.Len() > 0 { From 5eb7766b8b54e33dbc12596910e32cc49c34f0bb Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Sat, 29 Nov 2025 14:39:59 +0530 Subject: [PATCH 29/70] heuristic for Advance() --- search/searcher/search_conjunction_nested.go | 123 ++++++++++++++++--- 1 file changed, 107 insertions(+), 16 deletions(-) diff --git a/search/searcher/search_conjunction_nested.go b/search/searcher/search_conjunction_nested.go index bca4022d8..688142e13 100644 --- a/search/searcher/search_conjunction_nested.go +++ b/search/searcher/search_conjunction_nested.go @@ -46,6 +46,8 @@ type NestedConjunctionSearcher struct { docQueue *CoalesceQueue // reusable ID buffer for Advance() calls advanceID index.IndexInternalID + // reusable buffer for Advance() calls + ancestors []index.AncestorID } func NewNestedConjunctionSearcher(ctx context.Context, indexReader 
index.IndexReader, @@ -177,7 +179,7 @@ func (s *NestedConjunctionSearcher) initialize(ctx *search.SearchContext) (bool, } // build currKeys for each searcher, do it here as we may have adjusted joinIdx for i := range s.searchers { - s.currKeys[i] = s.getKeyForIdx(i) + s.currKeys[i] = ancestorFromRoot(s.currAncestors[i], s.joinIdx) } s.initialized = true return false, nil @@ -199,6 +201,8 @@ func (s *NestedConjunctionSearcher) Next(ctx *search.SearchContext) (*search.Doc if s.docQueue.Len() > 0 { return s.docQueue.Dequeue() } + // now enter the main alignment loop + n := len(s.searchers) OUTER: for { // pick the pivot searcher with the highest key (ancestor at joinIdx level) @@ -206,7 +210,7 @@ OUTER: return nil, nil } maxKey := s.currKeys[0] - for i := 1; i < len(s.searchers); i++ { + for i := 1; i < n; i++ { // currs[i] is nil means one of the searchers is exhausted if s.currs[i] == nil { return nil, nil @@ -216,15 +220,22 @@ OUTER: maxKey = currKey } } - // convert maxKey to advanceID for Advance calls - advanceID := s.toAdvanceID(maxKey) + // store maxkey as advanceID only once only if needed + var advanceID index.IndexInternalID + // flag to track if all searchers are aligned + var aligned bool = true // now try to align all other searchers to the // we check if the a searchers key matches maxKey // if not, we advance the pivot searcher to maxKey // else do nothing and move to the next searcher - for i := 0; i < len(s.searchers); i++ { - if s.currKeys[i].Compare(maxKey) < 0 { + for i := 0; i < n; i++ { + cmp := s.currKeys[i].Compare(maxKey) + if cmp < 0 { // not aligned, so advance this searcher to maxKey + // convert maxKey to advanceID only once + if advanceID == nil { + advanceID = s.toAdvanceID(maxKey) + } var err error ctx.DocumentMatchPool.Put(s.currs[i]) s.currs[i], err = s.searchers[i].Advance(ctx, advanceID) @@ -241,23 +252,26 @@ OUTER: return nil, err } // recalc key - s.currKeys[i] = s.getKeyForIdx(i) + s.currKeys[i] = ancestorFromRoot(s.currAncestors[i], s.joinIdx) + // recalc cmp + cmp = s.currKeys[i].Compare(maxKey) + } + if cmp != 0 { + // not aligned + aligned = false } } // now check if all the searchers are aligned at the same maxKey // if they are not aligned, we need to restart the loop of picking // the pivot searcher with the highest key - for i := 0; i < len(s.searchers); i++ { - if !s.currKeys[i].Equals(maxKey) { - // not aligned, so restart the outer loop - continue OUTER - } + if !aligned { + continue OUTER } // if we are here, all the searchers are aligned at maxKey // now we need to buffer all the intermediate matches for every // searcher at this key, until either the searcher's key changes // or the searcher is exhausted - for i := 0; i < len(s.searchers); i++ { + for i := 0; i < n; i++ { for { // buffer the current match recycle, err := s.docQueue.Enqueue(s.currs[i]) @@ -283,7 +297,7 @@ OUTER: return nil, err } // recalc key - s.currKeys[i] = s.getKeyForIdx(i) + s.currKeys[i] = ancestorFromRoot(s.currAncestors[i], s.joinIdx) // check if key has changed if !s.currKeys[i].Equals(maxKey) { // key changed, break out @@ -298,8 +312,10 @@ OUTER: } } -func (s *NestedConjunctionSearcher) getKeyForIdx(i int) index.AncestorID { - return s.currAncestors[i][len(s.currAncestors[i])-s.joinIdx-1] +// ancestorFromRoot gets the AncestorID at the given position from the root +// if pos is 0, it returns the root AncestorID, and so on +func ancestorFromRoot(ancestors []index.AncestorID, pos int) index.AncestorID { + return ancestors[len(ancestors)-pos-1] } // toAdvanceID 
converts an AncestorID to IndexInternalID, reusing the advanceID buffer. @@ -313,6 +329,81 @@ func (s *NestedConjunctionSearcher) toAdvanceID(key index.AncestorID) index.Inde } func (s *NestedConjunctionSearcher) Advance(ctx *search.SearchContext, ID index.IndexInternalID) (*search.DocumentMatch, error) { + if !s.initialized { + done, err := s.initialize(ctx) + if err != nil { + return nil, err + } + if done { + return nil, nil + } + } + // first check if the docQueue has any buffered matches + // if so we first check if any of them can satisfy the Advance(ID) + for s.docQueue.Len() > 0 { + dm, err := s.docQueue.Dequeue() + if err != nil { + return nil, err + } + if dm.IndexInternalID.Compare(ID) >= 0 { + return dm, nil + } + // otherwise recycle this match + ctx.DocumentMatchPool.Put(dm) + } + var err error + // now we first get the ancestry chain for the given ID + s.ancestors, err = s.nestedReader.Ancestors(ID, s.ancestors[:0]) + if err != nil { + return nil, err + } + // we now follow the the following logic for each searcher: + // let S be the length of the ancestry chain for the searcher + // let I be the length of the ancestry chain for the given ID + // 1. if S > I: + // then we just Advance() the searcher to the given ID if required + // 2. else if S <= I: + // then we get the AncestorID at position (S - 1) from the root of + // the given ID's ancestry chain, and Advance() the searcher to + // it if required + for i, searcher := range s.searchers { + if s.currs[i] == nil { + return nil, nil // already exhausted, nothing to do + } + var targetID index.IndexInternalID + S := len(s.currAncestors[i]) + I := len(s.ancestors) + if S > I { + // case 1: S > I + targetID = ID + } else { + // case 2: S <= I + targetID = s.toAdvanceID(ancestorFromRoot(s.ancestors, S-1)) + } + if s.currs[i].IndexInternalID.Compare(targetID) < 0 { + // need to advance this searcher + ctx.DocumentMatchPool.Put(s.currs[i]) + s.currs[i], err = searcher.Advance(ctx, targetID) + if err != nil { + return nil, err + } + if s.currs[i] == nil { + // one of the searchers is exhausted, so we are done + return nil, nil + } + // recalc ancestors + s.currAncestors[i], err = s.nestedReader.Ancestors(s.currs[i].IndexInternalID, s.currAncestors[i][:0]) + if err != nil { + return nil, err + } + // recalc key + s.currKeys[i] = ancestorFromRoot(s.currAncestors[i], s.joinIdx) + } + } + // we need to call Next() in a loop until we reach or exceed the given ID + // the Next() call basically gives us a match that is aligned correctly, but + // if joinIdx < I, we can have multiple matches for the same joinIdx ancestor + // and they may be < ID, so we need to loop for { next, err := s.Next(ctx) if err != nil { From 77f4b32dd96166089ff196e44f61bf75574e62de Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Sat, 29 Nov 2025 21:57:09 +0530 Subject: [PATCH 30/70] trivial fix --- search/collector/knn.go | 1 + 1 file changed, 1 insertion(+) diff --git a/search/collector/knn.go b/search/collector/knn.go index 8a9488d9a..de611c7b0 100644 --- a/search/collector/knn.go +++ b/search/collector/knn.go @@ -239,6 +239,7 @@ func (hc *KNNCollector) finalizeResults(ctx *search.SearchContext, r index.Index docFixup := func(doc *search.DocumentMatch) error { if doc.ID == "" { // look up the id since we need it for lookup + var err error doc.ID, err = r.ExternalID(doc.IndexInternalID) if err != nil { return err From f4b8c1f964d65823cbdb954904499285b5955b1b Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Mon, 1 Dec 2025 00:10:00 +0530 Subject: [PATCH 
31/70] fix String() method --- docs/hierarchy.md | 10 +++--- index_impl.go | 4 ++- search.go | 80 +++++++++++++++++++++++++++++++++++++---------- search/search.go | 17 ---------- search_test.go | 24 +++++++------- 5 files changed, 86 insertions(+), 49 deletions(-) diff --git a/docs/hierarchy.md b/docs/hierarchy.md index d2024e461..4aa62af57 100644 --- a/docs/hierarchy.md +++ b/docs/hierarchy.md @@ -152,11 +152,13 @@ like vector search, synonym search, hybrid and pre-filtered vector search integr * Conjunction Queries (AND queries) and other queries that depend on term co-occurrence within the same hierarchical context will respect the boundaries of nested documents. This means that terms must appear within the same nested object to be considered a match. For example, a conjunction query searching for an employee named "Alice" with the role "Engineer" within the "Engineering" department will only return results where both name and role terms are found within the same employee object, which is itself within a "Engineering" department object. * Some other search constructs will have enhanced precision with hierarchy search. - * Fields/Highlighting: Only fields belonging to matching nested objects are eligible for field‐level retrieval and highlighting. This ensures highlights appear only in the relevant hierarchical context, not anywhere else in the document. For example, if a match occurs in an employee object within the a department object names "Engineering", only fields within that employee object will be highlighted, not names from other employees or unrelated fields. - * Aggregations/Faceting: Facets can be computed over terms that exist inside nested objects, providing more accurate, context-aware aggregation results. A facet on `departments.projects.status` will produce buckets such as `active`, `paused`, `completed` only for the matched departments, instead of aggregating project status across the entire company document. - * Sorting: Sorting can be applied using fields from nested objects. This allows sorting results based on values inside the appropriate nested structure. For example, sorting companies by `departments.budget` (descending) will order documents based on the budget of the specific department involved in the match, rather than the overall document or unrelated departments. + * Field-Level Highlighting: Only fields within the matched nested object are retrieved and highlighted, ensuring highlights appear in the correct hierarchical context. For example, a match in `departments[name=Engineering].employees` highlights only employees in that department. -* Vector Search (KNN / Multi-KNN): When an array of objects is marked as nested and contains vector fields, each vector is treated as belonging to its own nested document. Vector similarity is computed only within the same nested object, not across siblings. For example, if `departments.employees` is a nested array where each employee has a `skills_vector`, a KNN search using the embedding of `machine learning engineer` will match only employees whose own `skills_vector` is similar—other employees vectors within the same department or document do not contribute to the score or match. This also means that a vector search query for `K = 3` will return the top 3 most similar employees across all departments and all companies, and may return multiple employees from the same department or company if they rank among the top 3 most similar overall. 
+ * Nested Faceting / Aggregations: Facets are computed within matched nested objects, producing context-aware buckets. E.g., a facet on `departments.projects.status` returns ongoing or completed only for projects in matched departments. + + * Sorting by Nested Fields: Sorting can use fields from the relevant nested object, e.g., ordering companies by `departments.budget sorts` based on the budget of the specific matched department, not unrelated departments. + +* Vector Search (KNN / Multi-KNN): When an array of objects is marked as nested and contains vector fields, each vector is treated as belonging to its own nested document. Vector similarity is computed only within the same nested object, not across siblings. For example, if `departments.employees` is a nested array where each employee has a `skills_vector`, a KNN search using the embedding of `machine learning engineer` will match only employees whose own `skills_vector` is similar; other employees vectors within the same department or document do not contribute to the score or match. This also means that a vector search query for `K = 3` will return the top 3 most similar employees across all departments and all companies, and may return multiple employees from the same department or company if they rank among the top 3 most similar overall. * Pre-Filtered Vector Search: When vector search is combined with filters on fields inside a nested array, the filters are applied first to pick which nested items are eligible. The vector search then runs only on those filtered items. For example, if `departments.employees` is a `nested` array, a pre-filtered KNN query for employees with the role `Manager` in the `Sales` department will first narrow the candidate set to only employees who meet those field conditions, and then compute vector similarity on the `skills_vector` of that filtered subset. This ensures that vector search results come only from the employees that satisfy the filter, while still treating each employee as an independent vector candidate. diff --git a/index_impl.go b/index_impl.go index eac400ad1..99eb11eb7 100644 --- a/index_impl.go +++ b/index_impl.go @@ -1109,6 +1109,8 @@ func LoadAndHighlightFields(hit *search.DocumentMatch, req *SearchRequest, return nil, totalStoredFieldsBytes } +const NestedDocumentKey = "_$nested" + // LoadAndHighlightAllFields loads stored fields + highlights for root and its descendants. // All descendant documents are collected into a _$nested array in the root DocumentMatch. func LoadAndHighlightAllFields( @@ -1152,7 +1154,7 @@ func LoadAndHighlightAllFields( } // add nested documents to root under _$nested key if len(nestedDocs) > 0 { - root.AddFieldValue("_$nested", nestedDocs) + root.AddFieldValue(NestedDocumentKey, nestedDocs) } return nil, totalStoredFieldsBytes } diff --git a/search.go b/search.go index 23d5d6386..41e1afb0e 100644 --- a/search.go +++ b/search.go @@ -571,26 +571,74 @@ func (sr *SearchResult) Size() int { } func (sr *SearchResult) String() string { - rv := "" - if sr.Total > 0 { - if sr.Request != nil && sr.Request.Size > 0 { - rv = fmt.Sprintf("%d matches, showing %d through %d, took %s\n", sr.Total, sr.Request.From+1, sr.Request.From+len(sr.Hits), sr.Took) - for i, hit := range sr.Hits { - rv += fmt.Sprintf("%5d. 
%s (%f)\n", i+sr.Request.From+1, hit.ID, hit.Score) - for fragmentField, fragments := range hit.Fragments { - rv += fmt.Sprintf("\t%s\n", fragmentField) - for _, fragment := range fragments { - rv += fmt.Sprintf("\t\t%s\n", fragment) + // Helper to format one hit + formatHit := func(i int, hit *search.DocumentMatch, start int) string { + rv := fmt.Sprintf("%5d. %s (%f)\n", start+i+1, hit.ID, hit.Score) + for fragmentField, fragments := range hit.Fragments { + rv += fmt.Sprintf("\t%s\n", fragmentField) + for _, fragment := range fragments { + rv += fmt.Sprintf("\t\t%s\n", fragment) + } + } + for otherFieldName, otherFieldValue := range hit.Fields { + if otherFieldName == NestedDocumentKey { + continue + } + if _, ok := hit.Fragments[otherFieldName]; !ok { + rv += fmt.Sprintf("\t%s\n", otherFieldName) + rv += fmt.Sprintf("\t\t%v\n", otherFieldValue) + } + } + // nested documents + if nested, ok := hit.Fields[NestedDocumentKey]; ok { + if list, ok := nested.([]*search.NestedDocumentMatch); ok { + rv += fmt.Sprintf("\t%s (%d nested documents)\n", NestedDocumentKey, len(list)) + for ni, nd := range list { + rv += fmt.Sprintf("\t\tNested #%d:\n", ni+1) + for f, frags := range nd.Fragments { + rv += fmt.Sprintf("\t\t\t%s\n", f) + for _, frag := range frags { + rv += fmt.Sprintf("\t\t\t\t%s\n", frag) + } } - } - for otherFieldName, otherFieldValue := range hit.Fields { - if _, ok := hit.Fragments[otherFieldName]; !ok { - rv += fmt.Sprintf("\t%s\n", otherFieldName) - rv += fmt.Sprintf("\t\t%v\n", otherFieldValue) + for f, v := range nd.Fields { + if _, ok := nd.Fragments[f]; !ok { + rv += fmt.Sprintf("\t\t\t%s\n", f) + rv += fmt.Sprintf("\t\t\t\t%v\n", v) + } } } } - } else { + } + if len(hit.DecodedSort) > 0 { + rv += "\t_sort: [" + for i, v := range hit.DecodedSort { + if i > 0 { + rv += ", " + } + rv += fmt.Sprintf("%v", v) + } + rv += "]\n" + } + return rv + } + var rv string + // main header + if sr.Total > 0 { + switch { + case sr.Request != nil && sr.Request.Size > 0: + start := sr.Request.From + end := sr.Request.From + len(sr.Hits) + rv = fmt.Sprintf("%d matches, showing %d through %d, took %s\n", sr.Total, start+1, end, sr.Took) + for i, hit := range sr.Hits { + rv += formatHit(i, hit, start) + } + case sr.Request == nil: + rv = fmt.Sprintf("%d matches, took %s\n", sr.Total, sr.Took) + for i, hit := range sr.Hits { + rv += formatHit(i, hit, 0) + } + default: rv = fmt.Sprintf("%d matches, took %s\n", sr.Total, sr.Took) } } else { diff --git a/search/search.go b/search/search.go index fae97a83e..6b476ac8c 100644 --- a/search/search.go +++ b/search/search.go @@ -457,23 +457,6 @@ type NestedDocumentMatch struct { Fragments FieldFragmentMap `json:"fragments,omitempty"` } -func (ndm *NestedDocumentMatch) String() string { - rv := "\n" - for fragmentField, fragments := range ndm.Fragments { - rv += fmt.Sprintf("\t%s\n", fragmentField) - for _, fragment := range fragments { - rv += fmt.Sprintf("\t\t%s\n", fragment) - } - } - for otherFieldName, otherFieldValue := range ndm.Fields { - if _, ok := ndm.Fragments[otherFieldName]; !ok { - rv += fmt.Sprintf("\t%s\n", otherFieldName) - rv += fmt.Sprintf("\t\t%v\n", otherFieldValue) - } - } - return rv -} - // NewNestedDocumentMatch creates a new NestedDocumentMatch instance // with the given fields and fragments func NewNestedDocumentMatch(fields map[string]interface{}, fragments FieldFragmentMap) *NestedDocumentMatch { diff --git a/search_test.go b/search_test.go index d667d5ec2..ce6505551 100644 --- a/search_test.go +++ b/search_test.go @@ -5471,7 
+5471,7 @@ func TestNestedConjunctionQuery(t *testing.T) { data string }{ { - id: "1", + id: "doc1", data: `{ "company": { "id": "c1", @@ -5510,7 +5510,7 @@ func TestNestedConjunctionQuery(t *testing.T) { }`, }, { - id: "2", + id: "doc2", data: `{ "company" : { "id": "c2", @@ -5549,7 +5549,7 @@ func TestNestedConjunctionQuery(t *testing.T) { }`, }, { - id: "3", + id: "doc3", data: `{ "company": { "id": "c3", @@ -5604,6 +5604,8 @@ func TestNestedConjunctionQuery(t *testing.T) { var buildReq = func(subQueries []query.Query) *SearchRequest { rv := NewSearchRequest(query.NewConjunctionQuery(subQueries)) rv.SortBy([]string{"_id"}) + rv.Fields = []string{"*"} + rv.Highlight = NewHighlightWithStyle(ansi.Name) return rv } @@ -5636,7 +5638,7 @@ func TestNestedConjunctionQuery(t *testing.T) { if len(res.Hits) != 2 { t.Fatalf("expected 2 hit, got %d", len(res.Hits)) } - if res.Hits[0].ID != "1" || res.Hits[1].ID != "2" { + if res.Hits[0].ID != "doc1" || res.Hits[1].ID != "doc2" { t.Fatalf("unexpected hit IDs: %v, %v", res.Hits[0].ID, res.Hits[1].ID) } @@ -5655,7 +5657,7 @@ func TestNestedConjunctionQuery(t *testing.T) { if len(res.Hits) != 2 { t.Fatalf("expected 2 hits, got %d", len(res.Hits)) } - if res.Hits[0].ID != "1" || res.Hits[1].ID != "3" { + if res.Hits[0].ID != "doc1" || res.Hits[1].ID != "doc3" { t.Fatalf("unexpected hit IDs: %v, %v", res.Hits[0].ID, res.Hits[1].ID) } @@ -5679,7 +5681,7 @@ func TestNestedConjunctionQuery(t *testing.T) { if len(res.Hits) != 1 { t.Fatalf("expected 1 hit, got %d", len(res.Hits)) } - if res.Hits[0].ID != "2" { + if res.Hits[0].ID != "doc2" { t.Fatalf("unexpected hit ID: %v", res.Hits[0].ID) } @@ -5736,7 +5738,7 @@ func TestNestedConjunctionQuery(t *testing.T) { if len(res.Hits) != 1 { t.Fatalf("expected 1 hits, got %d", len(res.Hits)) } - if res.Hits[0].ID != "1" { + if res.Hits[0].ID != "doc1" { t.Fatalf("unexpected hit ID: %v", res.Hits[0].ID) } @@ -5793,7 +5795,7 @@ func TestNestedConjunctionQuery(t *testing.T) { if len(res.Hits) != 1 { t.Fatalf("expected 1 hit, got %d", len(res.Hits)) } - if res.Hits[0].ID != "2" { + if res.Hits[0].ID != "doc2" { t.Fatalf("unexpected hit ID: %v", res.Hits[0].ID) } @@ -5850,7 +5852,7 @@ func TestNestedConjunctionQuery(t *testing.T) { if len(res.Hits) != 1 { t.Fatalf("expected 1 hit, got %d", len(res.Hits)) } - if res.Hits[0].ID != "3" { + if res.Hits[0].ID != "doc3" { t.Fatalf("unexpected hit ID: %v", res.Hits[0].ID) } @@ -5907,7 +5909,7 @@ func TestNestedConjunctionQuery(t *testing.T) { if len(res.Hits) != 1 { t.Fatalf("expected 1 hit, got %d", len(res.Hits)) } - if res.Hits[0].ID != "3" { + if res.Hits[0].ID != "doc3" { t.Fatalf("unexpected hit ID: %v", res.Hits[0].ID) } @@ -5941,7 +5943,7 @@ func TestNestedConjunctionQuery(t *testing.T) { if len(res.Hits) != 1 { t.Fatalf("expected 1 hit, got %d", len(res.Hits)) } - if res.Hits[0].ID != "2" { + if res.Hits[0].ID != "doc2" { t.Fatalf("unexpected hit ID: %v", res.Hits[0].ID) } } From 7215a0a4f12b8a54c59dd49cdb9cff7c6b174dc8 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Tue, 2 Dec 2025 06:43:51 +0530 Subject: [PATCH 32/70] performance optimization --- mapping/index.go | 28 +++---- mapping/mapping.go | 9 ++- registry/nested.go | 144 ++++++++++++++++++++-------------- search/query/conjunction.go | 9 ++- search_test.go | 149 +++++++++++++++++++++++------------- 5 files changed, 211 insertions(+), 128 deletions(-) diff --git a/mapping/index.go b/mapping/index.go index 90d993fc0..5c456da04 100644 --- a/mapping/index.go +++ b/mapping/index.go @@ -588,17 +588,18 @@ func (im 
*IndexMappingImpl) SynonymSourceVisitor(visitor analysis.SynonymSourceV return nil } -func (im *IndexMappingImpl) buildNestedPrefixes() { +func (im *IndexMappingImpl) buildNestedPrefixes() map[string]int { + prefixDepth := make(map[string]int) var collectNestedFields func(dm *DocumentMapping, pathComponents []string, currentDepth int) collectNestedFields = func(dm *DocumentMapping, pathComponents []string, currentDepth int) { for name, docMapping := range dm.Properties { newPathComponents := append(pathComponents, name) if docMapping.Nested { // This is a nested field boundary - path := strings.Join(newPathComponents, ".") - im.cache.NestedPrefixes.AddPrefix(path, currentDepth+1) + newDepth := currentDepth + 1 + prefixDepth[strings.Join(newPathComponents, pathSeparator)] = newDepth // Continue deeper with incremented depth - collectNestedFields(docMapping, newPathComponents, currentDepth+1) + collectNestedFields(docMapping, newPathComponents, newDepth) } else { // Not nested, continue with same depth collectNestedFields(docMapping, newPathComponents, currentDepth) @@ -615,18 +616,19 @@ func (im *IndexMappingImpl) buildNestedPrefixes() { collectNestedFields(docMapping, []string{}, 0) } } + return prefixDepth } -func (im *IndexMappingImpl) CoveringDepth(fs search.FieldSet) int { +func (im *IndexMappingImpl) NestedDepth(fs search.FieldSet) (int, int) { if im.cache == nil || im.cache.NestedPrefixes == nil { - return 0 + return 0, 0 } - im.cache.NestedPrefixes.InitOnce(func() { - im.buildNestedPrefixes() + im.cache.NestedPrefixes.InitOnce(func() map[string]int { + return im.buildNestedPrefixes() }) - return im.cache.NestedPrefixes.CoveringDepth(fs) + return im.cache.NestedPrefixes.NestedDepth(fs) } func (im *IndexMappingImpl) CountNested() int { @@ -634,8 +636,8 @@ func (im *IndexMappingImpl) CountNested() int { return 0 } - im.cache.NestedPrefixes.InitOnce(func() { - im.buildNestedPrefixes() + im.cache.NestedPrefixes.InitOnce(func() map[string]int { + return im.buildNestedPrefixes() }) return im.cache.NestedPrefixes.CountNested() @@ -646,8 +648,8 @@ func (im *IndexMappingImpl) IntersectsPrefix(fs search.FieldSet) bool { return false } - im.cache.NestedPrefixes.InitOnce(func() { - im.buildNestedPrefixes() + im.cache.NestedPrefixes.InitOnce(func() map[string]int { + return im.buildNestedPrefixes() }) return im.cache.NestedPrefixes.IntersectsPrefix(fs) diff --git a/mapping/mapping.go b/mapping/mapping.go index 0653f7531..7ff2f9927 100644 --- a/mapping/mapping.go +++ b/mapping/mapping.go @@ -79,9 +79,12 @@ type SynonymMapping interface { // A NestedMapping extends the IndexMapping interface to provide // additional methods for working with nested object mappings. 
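// An illustrative caller-side sketch (assumed usage, not part of this file):
// query construction can type-assert the index mapping and use the reported
// depths to decide whether sub-queries span different nested contexts:
//
//	if nm, ok := m.(NestedMapping); ok {
//	    common, max := nm.NestedDepth(queryFields)
//	    if common < max {
//	        // fields live in different nested contexts; join them at `common`
//	    }
//	}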
type NestedMapping interface { - // CoveringDepth returns the deepest nested - // level common to all field paths - CoveringDepth(fieldPaths search.FieldSet) int + // NestedDepth returns two values: + // - common: the highest nested level that is common to all given field paths, + // if 0 then there is no common nested level among the given field paths + // - max: the highest nested level that applies to at least one of the given field paths + // if 0 then none of the given field paths are nested + NestedDepth(fieldPaths search.FieldSet) (int, int) // IntersectsPrefix returns true if any of the given // field paths intersect with a known nested prefix diff --git a/registry/nested.go b/registry/nested.go index 54745fe3e..a6b5336dd 100644 --- a/registry/nested.go +++ b/registry/nested.go @@ -21,88 +21,116 @@ import ( "github.com/blevesearch/bleve/v2/search" ) +// NestedFieldCache caches nested field prefixes and their corresponding nesting levels. +// A nested field prefix is a field path prefix that indicates the start of a nested document. +// The nesting level indicates how deep the nested document is in the overall document structure. type NestedFieldCache struct { // nested prefix -> nested level - c *ConcurrentCache - - once sync.Once + prefixDepth map[string]int + once sync.Once + m sync.RWMutex } func NewNestedFieldCache() *NestedFieldCache { - return &NestedFieldCache{ - NewConcurrentCache(), - sync.Once{}, - } -} - -func (nfc *NestedFieldCache) InitOnce(initFunc func()) { - nfc.once.Do(initFunc) + return &NestedFieldCache{} } -func (nfc *NestedFieldCache) AddPrefix(prefix string, level int) error { - buildFunc := func(name string, config map[string]interface{}, cache *Cache) (interface{}, error) { - return level, nil - } - _, err := nfc.c.DefineItem(prefix, "", nil, nil, buildFunc) - if err == ErrAlreadyDefined { - // Already exists, that's ok - return nil - } - return err +func (nfc *NestedFieldCache) InitOnce(buildFunc func() map[string]int) { + nfc.once.Do(func() { + nfc.m.Lock() + defer nfc.m.Unlock() + nfc.prefixDepth = buildFunc() + }) } -// Returns the deepest nested level that covers all the given field paths -func (nfc *NestedFieldCache) CoveringDepth(fieldPaths search.FieldSet) int { +// NestedDepth returns two values: +// - common: The nesting level of the longest prefix that applies to every field path +// in the provided FieldSet. A value of 0 means no nested prefix is shared +// across all field paths. +// - max: The nesting level of the longest prefix that applies to at least one +// field path in the provided FieldSet. A value of 0 means none of the +// field paths match any nested prefix. 
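// For example (illustrative values matching the company mapping used in the
// tests, where "company.departments" is level 1, "company.departments.employees"
// is level 2, and "company.locations" is level 1):
//
//	{...employees.role, ...employees.name}   -> common=2, max=2
//	{...employees.role, ...departments.name} -> common=1, max=2
//	{...employees.role, ...locations.city}   -> common=0, max=2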
+func (nfc *NestedFieldCache) NestedDepth(fieldPaths search.FieldSet) (common int, max int) { + // if no field paths, no nested depth if len(fieldPaths) == 0 { - return 0 + return } - - nfc.c.mutex.RLock() - defer nfc.c.mutex.RUnlock() - - deepestLevel := 0 - - // Check each cached nested prefix - for prefix, item := range nfc.c.data { - level, ok := item.(int) - if !ok { + nfc.m.RLock() + defer nfc.m.RUnlock() + // if no cached prefixes, no nested depth + if len(nfc.prefixDepth) == 0 { + return + } + // for each prefix, check if its a common prefix or matches any path + // update common and max accordingly with the highest nesting level + // possible for each respective case + for prefix, level := range nfc.prefixDepth { + // only check prefixes that could increase one of the results + if level <= common && level <= max { continue } - - // Check if this nested prefix belongs to all the given paths - isCommonPrefix := true - for path := range fieldPaths { - if !strings.HasPrefix(path, prefix) { - isCommonPrefix = false - break - } + // check prefix against field paths, getting whether it matches all paths (common) + // and whether it matches at least one path (any) + matchAll, matchAny := nfc.prefixMatch(prefix, fieldPaths) + // if it matches all paths, update common + if matchAll && level > common { + common = level } - - // If it's a common prefix and deeper than what we've found so far - if isCommonPrefix && level > deepestLevel { - deepestLevel = level + // if it matches any path, update max + if matchAny && level > max { + max = level } } - - return deepestLevel + return common, max } +// CountNested returns the number of nested prefixes func (nfc *NestedFieldCache) CountNested() int { - nfc.c.mutex.RLock() - defer nfc.c.mutex.RUnlock() + nfc.m.RLock() + defer nfc.m.RUnlock() - return len(nfc.c.data) + return len(nfc.prefixDepth) } +// IntersectsPrefix returns true if any of the given +// field paths have a nested prefix func (nfc *NestedFieldCache) IntersectsPrefix(fieldPaths search.FieldSet) bool { - nfc.c.mutex.RLock() - defer nfc.c.mutex.RUnlock() - for prefix := range nfc.c.data { - for path := range fieldPaths { - if strings.HasPrefix(path, prefix) { - return true - } + // if no field paths, no intersection + if len(fieldPaths) == 0 { + return false + } + nfc.m.RLock() + defer nfc.m.RUnlock() + // if no cached prefixes, no intersection + if len(nfc.prefixDepth) == 0 { + return false + } + // Check each cached nested prefix to see if it intersects with any path + for prefix := range nfc.prefixDepth { + _, matchAny := nfc.prefixMatch(prefix, fieldPaths) + if matchAny { + return true } } return false } + +// prefixMatch checks whether the prefix matches all paths (common) and whether it matches at least one path (any) +// Caller must hold the read lock. 
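// For example (illustrative): matching the prefix "company.departments" against
// the paths {"company.departments.name", "company.locations.city"} yields
// common=false (the second path lacks the prefix) and any=true (the first has it).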
+func (nfc *NestedFieldCache) prefixMatch(prefix string, fieldPaths search.FieldSet) (common bool, any bool) { + common = true + any = false + for path := range fieldPaths { + has := strings.HasPrefix(path, prefix) + if has { + any = true + } else { + common = false + } + // early exit if we have determined both values + if any && !common { + break + } + } + return common, any +} diff --git a/search/query/conjunction.go b/search/query/conjunction.go index 25fda0400..6870b1ae2 100644 --- a/search/query/conjunction.go +++ b/search/query/conjunction.go @@ -101,7 +101,14 @@ func (q *ConjunctionQuery) Searcher(ctx context.Context, i index.IndexReader, m } if nestedMode { - return searcher.NewNestedConjunctionSearcher(ctx, i, ss, nm.CoveringDepth(qfs), options) + // first determine the nested depth info for the query fields + commonDepth, maxDepth := nm.NestedDepth(qfs) + // if we have common depth == max depth then we can just use + // the normal conjunction searcher, as all fields share the same + // nested context, otherwise we need to use the nested conjunction searcher + if commonDepth < maxDepth { + return searcher.NewNestedConjunctionSearcher(ctx, i, ss, commonDepth, options) + } } return searcher.NewConjunctionSearcher(ctx, i, ss, options) diff --git a/search_test.go b/search_test.go index ce6505551..ccb25556c 100644 --- a/search_test.go +++ b/search_test.go @@ -5310,8 +5310,7 @@ func TestNestedPrefixes(t *testing.T) { t.Fatal(err) } defer func() { - err = idx.Close() - if err != nil { + if err := idx.Close(); err != nil { t.Fatal(err) } }() @@ -5321,88 +5320,121 @@ func TestNestedPrefixes(t *testing.T) { t.Fatal("index mapping is not a NestedMapping") } + // ---------------------------------------------------------------------- // Test 1: Employee Role AND Employee Name + // ---------------------------------------------------------------------- fs := search.NewFieldSet() fs.AddField("company.departments.employees.role") fs.AddField("company.departments.employees.name") - // Expected depth is 2 (employees are nested within departments) - expectedDepth := 2 + expectedCommon := 2 + expectedMax := 2 - actualDepth := nmap.CoveringDepth(fs) - if actualDepth != expectedDepth { - t.Fatalf("expected depth %d, got %d", expectedDepth, actualDepth) + common, max := nmap.NestedDepth(fs) + if common != expectedCommon || max != expectedMax { + t.Fatalf("Test1: expected (common=%d, max=%d), got (common=%d, max=%d)", + expectedCommon, expectedMax, common, max) } + // ---------------------------------------------------------------------- // Test 2: Employee Role AND Employee Name AND Department Name + // ---------------------------------------------------------------------- fs = search.NewFieldSet() fs.AddField("company.departments.employees.role") fs.AddField("company.departments.employees.name") fs.AddField("company.departments.name") - // Expected depth is 1 (employees and department share the same department context) - expectedDepth = 1 - actualDepth = nmap.CoveringDepth(fs) - if actualDepth != expectedDepth { - t.Fatalf("expected depth %d, got %d", expectedDepth, actualDepth) + expectedCommon = 1 + expectedMax = 2 // employees nested deeper + + common, max = nmap.NestedDepth(fs) + if common != expectedCommon || max != expectedMax { + t.Fatalf("Test2: expected (common=%d, max=%d), got (common=%d, max=%d)", + expectedCommon, expectedMax, common, max) } + // ---------------------------------------------------------------------- // Test 3: Employee Role AND Location City + // 
---------------------------------------------------------------------- fs = search.NewFieldSet() fs.AddField("company.departments.employees.role") fs.AddField("company.locations.city") - // Expected depth is 0 (employees and locations are in different nested contexts) - expectedDepth = 0 - actualDepth = nmap.CoveringDepth(fs) - if actualDepth != expectedDepth { - t.Fatalf("expected depth %d, got %d", expectedDepth, actualDepth) + expectedCommon = 0 + expectedMax = 2 // employees deeper than locations (1) + + common, max = nmap.NestedDepth(fs) + if common != expectedCommon || max != expectedMax { + t.Fatalf("Test3: expected (common=%d, max=%d), got (common=%d, max=%d)", + expectedCommon, expectedMax, common, max) } + // ---------------------------------------------------------------------- // Test 4: Company Name AND Location Country + // ---------------------------------------------------------------------- fs = search.NewFieldSet() fs.AddField("company.name") fs.AddField("company.locations.country") fs.AddField("company.locations.city") - // Expected depth is 0 (company.name is at root, locations are nested) - expectedDepth = 0 - actualDepth = nmap.CoveringDepth(fs) - if actualDepth != expectedDepth { - t.Fatalf("expected depth %d, got %d", expectedDepth, actualDepth) + + expectedCommon = 0 + expectedMax = 1 // locations.country and locations.city share depth 1 + + common, max = nmap.NestedDepth(fs) + if common != expectedCommon || max != expectedMax { + t.Fatalf("Test4: expected (common=%d, max=%d), got (common=%d, max=%d)", + expectedCommon, expectedMax, common, max) } + // ---------------------------------------------------------------------- // Test 5: Department Budget AND Project Status AND Employee Name + // ---------------------------------------------------------------------- fs = search.NewFieldSet() fs.AddField("company.departments.budget") fs.AddField("company.departments.projects.status") fs.AddField("company.departments.employees.name") - // Expected depth is 1 (all share the same department context) - expectedDepth = 1 - actualDepth = nmap.CoveringDepth(fs) - if actualDepth != expectedDepth { - t.Fatalf("expected depth %d, got %d", expectedDepth, actualDepth) + + expectedCommon = 1 + expectedMax = 2 // employees + projects go deeper + + common, max = nmap.NestedDepth(fs) + if common != expectedCommon || max != expectedMax { + t.Fatalf("Test5: expected (common=%d, max=%d), got (common=%d, max=%d)", + expectedCommon, expectedMax, common, max) } - // Test 6: Single Field - Company ID + // ---------------------------------------------------------------------- + // Test 6: Single Field + // ---------------------------------------------------------------------- fs = search.NewFieldSet() fs.AddField("company.id") - // Expected depth is 0 (company.id is at root) - expectedDepth = 0 - actualDepth = nmap.CoveringDepth(fs) - if actualDepth != expectedDepth { - t.Fatalf("expected depth %d, got %d", expectedDepth, actualDepth) + + expectedCommon = 0 + expectedMax = 0 + + common, max = nmap.NestedDepth(fs) + if common != expectedCommon || max != expectedMax { + t.Fatalf("Test6: expected (common=%d, max=%d), got (common=%d, max=%d)", + expectedCommon, expectedMax, common, max) } + // ---------------------------------------------------------------------- // Test 7: No Fields + // ---------------------------------------------------------------------- fs = search.NewFieldSet() - // Expected depth is 0 (no fields) - expectedDepth = 0 - actualDepth = nmap.CoveringDepth(fs) - if actualDepth != 
expectedDepth { - t.Fatalf("expected depth %d, got %d", expectedDepth, actualDepth) + + expectedCommon = 0 + expectedMax = 0 + + common, max = nmap.NestedDepth(fs) + if common != expectedCommon || max != expectedMax { + t.Fatalf("Test7: expected (common=%d, max=%d), got (common=%d, max=%d)", + expectedCommon, expectedMax, common, max) } + // ---------------------------------------------------------------------- // Test 8: All Fields + // ---------------------------------------------------------------------- fs = search.NewFieldSet() fs.AddField("company.id") fs.AddField("company.name") @@ -5414,36 +5446,47 @@ func TestNestedPrefixes(t *testing.T) { fs.AddField("company.departments.projects.status") fs.AddField("company.locations.city") fs.AddField("company.locations.country") - // Expected depth is 0 (fields span multiple nested contexts) - expectedDepth = 0 - actualDepth = nmap.CoveringDepth(fs) - if actualDepth != expectedDepth { - t.Fatalf("expected depth %d, got %d", expectedDepth, actualDepth) + + expectedCommon = 0 // spans different contexts + expectedMax = 2 + + common, max = nmap.NestedDepth(fs) + if common != expectedCommon || max != expectedMax { + t.Fatalf("Test8: expected (common=%d, max=%d), got (common=%d, max=%d)", + expectedCommon, expectedMax, common, max) } + // ---------------------------------------------------------------------- // Test 9: Project Title AND Project Status + // ---------------------------------------------------------------------- fs = search.NewFieldSet() fs.AddField("company.departments.projects.title") fs.AddField("company.departments.projects.status") - // Expected depth is 2 (projects are nested within departments) - expectedDepth = 2 - actualDepth = nmap.CoveringDepth(fs) - if actualDepth != expectedDepth { - t.Fatalf("expected depth %d, got %d", expectedDepth, actualDepth) + + expectedCommon = 2 + expectedMax = 2 + + common, max = nmap.NestedDepth(fs) + if common != expectedCommon || max != expectedMax { + t.Fatalf("Test9: expected (common=%d, max=%d), got (common=%d, max=%d)", + expectedCommon, expectedMax, common, max) } + // ---------------------------------------------------------------------- // Test 10: Department Name AND Location Country + // ---------------------------------------------------------------------- fs = search.NewFieldSet() fs.AddField("company.departments.name") fs.AddField("company.locations.country") fs.AddField("company.locations.city") - // Expected depth is 0 (departments and locations are in different nested contexts) - expectedDepth = 0 + expectedCommon = 0 + expectedMax = 1 // locations share depth 1 - actualDepth = nmap.CoveringDepth(fs) - if actualDepth != expectedDepth { - t.Fatalf("expected depth %d, got %d", expectedDepth, actualDepth) + common, max = nmap.NestedDepth(fs) + if common != expectedCommon || max != expectedMax { + t.Fatalf("Test10: expected (common=%d, max=%d), got (common=%d, max=%d)", + expectedCommon, expectedMax, common, max) } } From f3471986d9e79949ae466e19b62b4a58921de9dd Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Tue, 2 Dec 2025 16:39:39 +0530 Subject: [PATCH 33/70] fix merge --- search_knn.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/search_knn.go b/search_knn.go index 899dbd28d..50a89ddcf 100644 --- a/search_knn.go +++ b/search_knn.go @@ -447,7 +447,10 @@ func (i *indexImpl) runKnnCollector(ctx context.Context, req *SearchRequest, rea err = serr } }() - knnCollector := collector.NewKNNCollector(kArray, sumOfK) + knnCollector, err := 
i.buildKNNCollector(ctx, KNNQuery, reader, kArray, sumOfK) + if err != nil { + return nil, err + } err = knnCollector.Collect(ctx, knnSearcher, reader) if err != nil { return nil, err From 933a97cd77b082055dfc1519159efa37c2006546 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Tue, 2 Dec 2025 16:42:12 +0530 Subject: [PATCH 34/70] fix merge 2 --- search_knn.go | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/search_knn.go b/search_knn.go index 50a89ddcf..c6aae7700 100644 --- a/search_knn.go +++ b/search_knn.go @@ -424,7 +424,7 @@ func (i *indexImpl) runKnnCollector(ctx context.Context, req *SearchRequest, rea return nil, err } knnFilterResults[idx] = filterColl.EligibleSelector() - // Close the filter searcher once done + // Close the filter searcher, as we are done with it. err = filterSearcher.Close() if err != nil { return nil, err @@ -459,11 +459,6 @@ func (i *indexImpl) runKnnCollector(ctx context.Context, req *SearchRequest, rea if !preSearch { knnHits = finalizeKNNResults(req, knnHits) } - // close the knn searcher once done - err = knnSearcher.Close() - if err != nil { - return nil, err - } // at this point, irrespective of whether it is a preSearch or not, // the knn hits are populated with Sort and Fields. // it must be ensured downstream that the Sort and Fields are not From 0edb0adf85afafbfa4394f4eb18aeb9528e9e154 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Tue, 2 Dec 2025 19:32:44 +0530 Subject: [PATCH 35/70] hybrid search fix part 1 --- index_impl.go | 2 +- search/collector/topn.go | 12 ++++--- search/util.go | 78 +++++++++++++++++++++++++++++----------- search_knn.go | 19 +++++----- 4 files changed, 76 insertions(+), 35 deletions(-) diff --git a/index_impl.go b/index_impl.go index 99eb11eb7..bbc9a01a4 100644 --- a/index_impl.go +++ b/index_impl.go @@ -799,7 +799,7 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr // if score fusion, no faceting for knn hits is done // hence we can skip setting the knn hits in the collector if !contextScoreFusionKeyExists { - setKnnHitsInCollector(knnHits, req, coll) + setKnnHitsInCollector(knnHits, coll) } if fts != nil { diff --git a/search/collector/topn.go b/search/collector/topn.go index 61ac8f640..e954027f5 100644 --- a/search/collector/topn.go +++ b/search/collector/topn.go @@ -78,7 +78,7 @@ type TopNCollector struct { searchAfter *search.DocumentMatch knnHits map[string]*search.DocumentMatch - computeNewScoreExpl search.ScoreExplCorrectionCallbackFunc + hybridMergeCallback search.HybridMergeCallbackFn nestedStore *collectStoreNested } @@ -385,7 +385,6 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, // we may have some knn hits left that did not match any of the top N tf-idf hits // we need to add them to the collector store to consider them as well. 
for _, knnDoc := range hc.knnHits { - // no descendants for knn docs err = hc.prepareDocumentMatch(searchContext, reader, knnDoc, true) if err != nil { return err @@ -435,7 +434,10 @@ func (hc *TopNCollector) adjustDocumentMatch(ctx *search.SearchContext, return err } if knnHit, ok := hc.knnHits[d.ID]; ok { - d.Score, d.Expl = hc.computeNewScoreExpl(d, knnHit) + // we have a knn hit corresponding to this document + hc.hybridMergeCallback(d, knnHit) + // remove this knn hit from the map as it's already + // been merged delete(hc.knnHits, d.ID) } } @@ -656,10 +658,10 @@ func (hc *TopNCollector) FacetResults() search.FacetResults { return nil } -func (hc *TopNCollector) SetKNNHits(knnHits search.DocumentMatchCollection, newScoreExplComputer search.ScoreExplCorrectionCallbackFunc) { +func (hc *TopNCollector) SetKNNHits(knnHits search.DocumentMatchCollection, hybridMergeCallback search.HybridMergeCallbackFn) { hc.knnHits = make(map[string]*search.DocumentMatch, len(knnHits)) for _, hit := range knnHits { hc.knnHits[hit.ID] = hit } - hc.computeNewScoreExpl = newScoreExplComputer + hc.hybridMergeCallback = hybridMergeCallback } diff --git a/search/util.go b/search/util.go index 7b06ec1d8..1a01713b1 100644 --- a/search/util.go +++ b/search/util.go @@ -16,7 +16,9 @@ package search import ( "context" + "slices" + index "github.com/blevesearch/bleve_index_api" "github.com/blevesearch/geo/s2" ) @@ -114,18 +116,6 @@ func MergeScoreBreakdown(first, second map[int]float64) map[int]float64 { return first } -type SearchIOStatsCallbackFunc func(uint64) - -// Implementation of SearchIncrementalCostCallbackFn should handle the following messages -// - add: increment the cost of a search operation -// (which can be specific to a query type as well) -// - abort: query was aborted due to a cancel of search's context (for eg), -// which can be handled differently as well -// - done: indicates that a search was complete and the tracked cost can be -// handled safely by the implementation. -type SearchIncrementalCostCallbackFn func(SearchIncrementalCostCallbackMsg, - SearchQueryType, uint64) - type ( SearchIncrementalCostCallbackMsg uint SearchQueryType uint @@ -228,9 +218,7 @@ const ( MinGeoBufPoolSize = 24 ) -type GeoBufferPoolCallbackFunc func() *s2.GeoBufferPool - -// *PreSearchDataKey are used to store the data gathered during the presearch phase +// PreSearchDataKey are used to store the data gathered during the presearch phase // which would be use in the actual search phase. const ( KnnPreSearchDataKey = "_knn_pre_search_data_key" @@ -241,14 +229,35 @@ const ( const GlobalScoring = "_global_scoring" type ( + // SearcherStartCallbackFn is a callback function type used to signal the start of + // searcher creation phase. SearcherStartCallbackFn func(size uint64) error - SearcherEndCallbackFn func(size uint64) error + // SearcherEndCallbackFn is a callback function type used to signal the end of + // a searcher creation phase. + SearcherEndCallbackFn func(size uint64) error + // GetScoringModelCallbackFn is a callback function type used to get the scoring model + // to be used for scoring documents during search. + GetScoringModelCallbackFn func() string + // HybridMergeCallbackFn is a callback function type used to merge a KNN document match + // into a full text search document match, of the same docID as part of hybrid search. 
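+	// Implementations are expected to merge knnMatch into ftsMatch in place
+	// (score, explanation and descendants), rather than returning a new match.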
+	HybridMergeCallbackFn func(ftsMatch *DocumentMatch, knnMatch *DocumentMatch)
+	// GeoBufferPoolCallbackFunc is a callback function type used to get the geo buffer pool
+	// to be used during geo searches.
+	GeoBufferPoolCallbackFunc func() *s2.GeoBufferPool
+	// SearchIOStatsCallbackFunc is a callback function type used to report search IO stats
+	// during search.
+	SearchIOStatsCallbackFunc func(uint64)
+	// Implementation of SearchIncrementalCostCallbackFn should handle the following messages
+	// - add: increment the cost of a search operation
+	// (which can be specific to a query type as well)
+	// - abort: query was aborted due to a cancel of search's context (for eg),
+	// which can be handled differently as well
+	// - done: indicates that a search was complete and the tracked cost can be
+	// handled safely by the implementation.
+	SearchIncrementalCostCallbackFn func(SearchIncrementalCostCallbackMsg,
+		SearchQueryType, uint64)
 )

-type GetScoringModelCallbackFn func() string
-
-type ScoreExplCorrectionCallbackFunc func(queryMatch *DocumentMatch, knnMatch *DocumentMatch) (float64, *Explanation)
-
 // field -> term -> synonyms
 type FieldTermSynonymMap map[string]map[string][]string

@@ -303,3 +312,32 @@ func (fs FieldSet) Slice() []string {
 	}
 	return rv
 }
+
+// SortedUnion returns the union of two sorted slices of IndexInternalID,
+// preserving order and removing duplicates, reusing the underlying array of dest
+// where possible.
+func SortedUnion(dest, src []index.IndexInternalID) []index.IndexInternalID {
+	// If dest is empty, return src
+	if len(dest) == 0 {
+		return src
+	}
+	// If src is empty, return dest
+	if len(src) == 0 {
+		return dest
+	}
+	// Append src to dest - reuses the underlying array if it has capacity
+	dest = append(dest, src...)
+	// Sort the combined slice; dest now has at least 2 elements
+	slices.SortFunc(dest, func(a, b index.IndexInternalID) int {
+		return a.Compare(b)
+	})
+	// Now remove duplicates, reusing the underlying array by adding the first element
+	// as the initial unique element
+	rv := dest[:1]
+	for i := 1; i < len(dest); i++ {
+		if !rv[len(rv)-1].Equals(dest[i]) {
+			rv = append(rv, dest[i])
+		}
+	}
+	return rv
+}
diff --git a/search_knn.go b/search_knn.go
index c6aae7700..f9db21235 100644
--- a/search_knn.go
+++ b/search_knn.go
@@ -473,17 +473,18 @@ func (i *indexImpl) runKnnCollector(ctx context.Context, req *SearchRequest, rea
 	return knnHits, nil
 }

-func setKnnHitsInCollector(knnHits []*search.DocumentMatch, req *SearchRequest, coll *collector.TopNCollector) {
+func setKnnHitsInCollector(knnHits []*search.DocumentMatch, coll *collector.TopNCollector) {
 	if len(knnHits) > 0 {
-		newScoreExplComputer := func(queryMatch *search.DocumentMatch, knnMatch *search.DocumentMatch) (float64, *search.Explanation) {
-			totalScore := queryMatch.Score + knnMatch.Score
-			if !req.Explain {
-				// exit early as we don't need to compute the explanation
-				return totalScore, nil
-			}
-			return totalScore, &search.Explanation{Value: totalScore, Message: "sum of:", Children: []*search.Explanation{queryMatch.Expl, knnMatch.Expl}}
+		mergeFn := func(ftsMatch *search.DocumentMatch, knnMatch *search.DocumentMatch) {
+			// Boost the FTS score using the KNN score
+			ftsMatch.Score += knnMatch.Score
+			// Combine the FTS explanation with the KNN explanation, if present
+			ftsMatch.Expl.MergeWith(knnMatch.Expl)
+			// Add the Descendants from the KNN match to the FTS match, deduplicating them on the way
+			// The Descendants of a DocumentMatch are always sorted, and we must maintain that invariant
+			ftsMatch.Descendants = search.SortedUnion(ftsMatch.Descendants, knnMatch.Descendants)
 		}
-		coll.SetKNNHits(knnHits, search.ScoreExplCorrectionCallbackFunc(newScoreExplComputer))
+		coll.SetKNNHits(knnHits, search.HybridMergeCallbackFn(mergeFn))
 	}
 }

From 35ba4ff436f891912a7d72f27e8ddb5a31daa367 Mon Sep 17 00:00:00 2001
From: Rahul Rampure
Date: Wed, 3 Dec 2025 17:15:37 +0530
Subject: [PATCH 36/70] Fix duplicate results when performing KNN search

---
 search_knn.go      |  34 +++++++++++
 search_knn_test.go | 142 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 176 insertions(+)

diff --git a/search_knn.go b/search_knn.go
index fae4f52e9..d9f2f8599 100644
--- a/search_knn.go
+++ b/search_knn.go
@@ -496,6 +496,40 @@ func finalizeKNNResults(req *SearchRequest, knnHits []*search.DocumentMatch) []*
 		}
 		knnHits = knnHits[:idx]
 	}
+	// early exit if there are no hits
+	if len(knnHits) == 0 {
+		return knnHits
+	}
+	// at this point, we have the final/global set of vectors that satisfy the KNN request.
+	// We may have multiple vectors per document, so we need to deduplicate the hits
+	// by document ID, and retain only the best scoring vector per knn query per document.
+	// This means that if hits have docA twice, we union the score breakdowns for docA, and
+	// retain only the best score per knn query for docA.
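+	// For example, two hits for "docA" with score breakdowns {0: 0.8} and {0: 0.9}
+	// collapse into a single "docA" hit with breakdown {0: 0.9}.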
+ // sort by document ID + sort.Slice(knnHits, func(i, j int) bool { + return knnHits[i].ID < knnHits[j].ID + }) + // now deduplicate the hits by document ID, by using the sorted order + uniqueHits := knnHits[:1] + for i := 1; i < len(knnHits); i++ { + lastUniqueHit := uniqueHits[len(uniqueHits)-1] + currHit := knnHits[i] + if currHit.ID != lastUniqueHit.ID { + // we have found a new unique document + uniqueHits = append(uniqueHits, currHit) + } else { + // we have encountered a duplicate document, so we need to + // union the score breakdowns, retaining the best score + // per knn query + for k, score := range currHit.ScoreBreakdown { + if score > lastUniqueHit.ScoreBreakdown[k] { + lastUniqueHit.ScoreBreakdown[k] = score + } + } + } + } + // now uniqueHits contains only unique documents, so we can set knnHits to uniqueHits + knnHits = uniqueHits // if score fusion required, return early because // score breakdown is retained diff --git a/search_knn_test.go b/search_knn_test.go index f518d337e..5847208a8 100644 --- a/search_knn_test.go +++ b/search_knn_test.go @@ -2071,3 +2071,145 @@ func TestIndexInsightsCentroidCardinalities(t *testing.T) { } } } + +func TestVectorObjectArray(t *testing.T) { + // Setup 6 documents each with one vector field + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) + + indexMapping := NewIndexMapping() + vecFieldMapping := mapping.NewVectorFieldMapping() + vecFieldMapping.Dims = 3 + vecFieldMapping.Similarity = index.CosineSimilarity + indexMapping.DefaultMapping.AddFieldMappingsAt("vec", vecFieldMapping) + + arrayMapping := mapping.NewDocumentMapping() + indexMapping.DefaultMapping.AddSubDocumentMapping("vectors", arrayMapping) + arrayMapping.AddFieldMappingsAt("vec", vecFieldMapping) + + index, err := New(tmpIndexPath, indexMapping) + if err != nil { + t.Fatal(err) + } + defer func() { + err := index.Close() + if err != nil { + t.Fatal(err) + } + }() + + docsString := []string{ + `{"vec": [1, 2, 3]}`, + `{"vec": [4, 5, 6]}`, + `{"vec": [7, 8, 9]}`, + `{"vec": [10, 11, 12]}`, + `{"vec": [13, 14, 15]}`, + `{"vec": [16, 17, 18]}`, + } + docs := make([]map[string]interface{}, 0, len(docsString)) + for _, docStr := range docsString { + var doc map[string]interface{} + err = json.Unmarshal([]byte(docStr), &doc) + if err != nil { + t.Fatal(err) + } + docs = append(docs, doc) + } + + // Index documents + batch := index.NewBatch() + for i, doc := range docs { + err = batch.Index(fmt.Sprintf("doc-%d", i+1), doc) + if err != nil { + t.Fatal(err) + } + } + err = index.Batch(batch) + if err != nil { + t.Fatal(err) + } + + // Search with a vector that is an array of objects + searchRequest := NewSearchRequest(NewMatchNoneQuery()) + searchRequest.AddKNN("vec", []float32{1, 2, 3}, 3, 1.0) + searchRequest.Explain = true + + result, err := index.Search(searchRequest) + if err != nil { + t.Fatal(err) + } + + if len(result.Hits) != 3 { + t.Fatalf("expected 3 hits, got %d", len(result.Hits)) + } + + expectedResult := map[string]float64{ + "doc-1": 1.0, + "doc-2": 0.975, + "doc-3": 0.959, + } + + for _, hit := range result.Hits { + expectedScore, exists := expectedResult[hit.ID] + if !exists { + t.Fatalf("unexpected doc ID %s", hit.ID) + } + if math.Abs(hit.Score-expectedScore) > 0.001 { + t.Fatalf("for doc ID %s, expected score %.3f, got %.3f", hit.ID, expectedScore, hit.Score) + } + } + + // Now create 2 docs with 3 vectors each + docsString = []string{ + `{"vectors": [ {"vec": [1, 2, 3]}, {"vec": [4, 5, 6]}, {"vec": [7, 8, 9]}]}`, + 
`{"vectors": [ {"vec": [10, 11, 12]}, {"vec": [13, 14, 15]}, {"vec": [16, 17, 18]}]}`, + } + docs = make([]map[string]interface{}, 0, len(docsString)) + for _, docStr := range docsString { + var doc map[string]interface{} + err = json.Unmarshal([]byte(docStr), &doc) + if err != nil { + t.Fatal(err) + } + docs = append(docs, doc) + } + + batch = index.NewBatch() + for i, doc := range docs { + err = batch.Index(fmt.Sprintf("doc-multi-%d", i+1), doc) + if err != nil { + t.Fatal(err) + } + } + err = index.Batch(batch) + if err != nil { + t.Fatal(err) + } + + // Search again with the same vector + searchRequest = NewSearchRequest(NewMatchNoneQuery()) + searchRequest.AddKNN("vectors.vec", []float32{1, 2, 3}, 3, 1.0) + + result, err = index.Search(searchRequest) + if err != nil { + t.Fatal(err) + } + + if len(result.Hits) != 1 { + t.Fatalf("expected 1 hit, got %d", len(result.Hits)) + } + + expectedResult = map[string]float64{ + "doc-multi-1": 1.0, // best score from the 3 vectors + } + + for _, hit := range result.Hits { + expectedScore, exists := expectedResult[hit.ID] + if !exists { + t.Fatalf("unexpected doc ID %s", hit.ID) + } + if math.Abs(hit.Score-expectedScore) > 0.001 { + t.Fatalf("for doc ID %s, expected score %.3f, got %.3f", hit.ID, expectedScore, hit.Score) + } + } +} From 234b4aaf41f9b80f89ecdd37180c7d5fe5c3e82e Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Wed, 3 Dec 2025 19:29:39 +0530 Subject: [PATCH 37/70] fix duplicate issue --- search_knn.go | 39 +++++++++++++ search_knn_test.go | 142 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 181 insertions(+) diff --git a/search_knn.go b/search_knn.go index f9db21235..9c0703eff 100644 --- a/search_knn.go +++ b/search_knn.go @@ -501,6 +501,45 @@ func finalizeKNNResults(req *SearchRequest, knnHits []*search.DocumentMatch) []* } knnHits = knnHits[:idx] } + // early exit if there are no hits + if len(knnHits) == 0 { + return knnHits + } + // at this point, we have the final/global set of vectors that satisfy the KNN request. + // We may have multiple vectors per document, so we need to deduplicate the hits + // by document ID, and retain only the best scoring vector per knn query per document. + // This means that if hits have docA twice, we union the score breakdowns for docA, and + // retain only the best score per knn query for docA. + // sort by document ID + sort.Slice(knnHits, func(i, j int) bool { + return knnHits[i].ID < knnHits[j].ID + }) + // now deduplicate the hits by document ID, by using the sorted order + uniqueHits := knnHits[:1] + for i := 1; i < len(knnHits); i++ { + lastUniqueHit := uniqueHits[len(uniqueHits)-1] + currHit := knnHits[i] + if currHit.ID != lastUniqueHit.ID { + // we have found a new unique document + uniqueHits = append(uniqueHits, currHit) + } else { + // we have encountered a duplicate document, so we need to + // union the score breakdowns, retaining the best score + // per knn query, while also merging the explanations if req.Explain is true. + for k, score := range currHit.ScoreBreakdown { + if existing, ok := lastUniqueHit.ScoreBreakdown[k]; !ok || score > existing { + lastUniqueHit.ScoreBreakdown[k] = score + // Also update the explanation for this query index if Explain is enabled. + // Both Expl.Children slices are of size len(req.KNN), so indexing by k is safe. 
+ if req.Explain { + lastUniqueHit.Expl.Children[k] = currHit.Expl.Children[k] + } + } + } + } + } + // now uniqueHits contains only unique documents, so we can set knnHits to uniqueHits + knnHits = uniqueHits // if score fusion required, return early because // score breakdown is retained diff --git a/search_knn_test.go b/search_knn_test.go index f518d337e..4f30d83cc 100644 --- a/search_knn_test.go +++ b/search_knn_test.go @@ -2071,3 +2071,145 @@ func TestIndexInsightsCentroidCardinalities(t *testing.T) { } } } + +func TestVectorObjectArray(t *testing.T) { + // Setup 6 documents each with one vector field + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) + + indexMapping := NewIndexMapping() + vecFieldMapping := mapping.NewVectorFieldMapping() + vecFieldMapping.Dims = 3 + vecFieldMapping.Similarity = index.CosineSimilarity + indexMapping.DefaultMapping.AddFieldMappingsAt("vec", vecFieldMapping) + + arrayMapping := mapping.NewDocumentMapping() + indexMapping.DefaultMapping.AddSubDocumentMapping("vectors", arrayMapping) + arrayMapping.AddFieldMappingsAt("vec", vecFieldMapping) + + index, err := New(tmpIndexPath, indexMapping) + if err != nil { + t.Fatal(err) + } + defer func() { + err := index.Close() + if err != nil { + t.Fatal(err) + } + }() + + docsString := []string{ + `{"vec": [1, 2, 3]}`, + `{"vec": [4, 5, 6]}`, + `{"vec": [7, 8, 9]}`, + `{"vec": [10, 11, 12]}`, + `{"vec": [13, 14, 15]}`, + `{"vec": [16, 17, 18]}`, + } + docs := make([]map[string]interface{}, 0, len(docsString)) + for _, docStr := range docsString { + var doc map[string]interface{} + err = json.Unmarshal([]byte(docStr), &doc) + if err != nil { + t.Fatal(err) + } + docs = append(docs, doc) + } + + // Index documents + batch := index.NewBatch() + for i, doc := range docs { + err = batch.Index(fmt.Sprintf("doc-%d", i+1), doc) + if err != nil { + t.Fatal(err) + } + } + err = index.Batch(batch) + if err != nil { + t.Fatal(err) + } + + // Search with simple single-vector documents + searchRequest := NewSearchRequest(NewMatchNoneQuery()) + searchRequest.AddKNN("vec", []float32{1, 2, 3}, 3, 1.0) + searchRequest.Explain = true + + result, err := index.Search(searchRequest) + if err != nil { + t.Fatal(err) + } + + if len(result.Hits) != 3 { + t.Fatalf("expected 3 hits, got %d", len(result.Hits)) + } + + expectedResult := map[string]float64{ + "doc-1": 1.0, + "doc-2": 0.975, + "doc-3": 0.959, + } + + for _, hit := range result.Hits { + expectedScore, exists := expectedResult[hit.ID] + if !exists { + t.Fatalf("unexpected doc ID %s", hit.ID) + } + if math.Abs(hit.Score-expectedScore) > 0.001 { + t.Fatalf("for doc ID %s, expected score %.3f, got %.3f", hit.ID, expectedScore, hit.Score) + } + } + + // Now create 2 docs with 3 vectors each + docsString = []string{ + `{"vectors": [ {"vec": [1, 2, 3]}, {"vec": [4, 5, 6]}, {"vec": [7, 8, 9]}]}`, + `{"vectors": [ {"vec": [10, 11, 12]}, {"vec": [13, 14, 15]}, {"vec": [16, 17, 18]}]}`, + } + docs = make([]map[string]interface{}, 0, len(docsString)) + for _, docStr := range docsString { + var doc map[string]interface{} + err = json.Unmarshal([]byte(docStr), &doc) + if err != nil { + t.Fatal(err) + } + docs = append(docs, doc) + } + + batch = index.NewBatch() + for i, doc := range docs { + err = batch.Index(fmt.Sprintf("doc-multi-%d", i+1), doc) + if err != nil { + t.Fatal(err) + } + } + err = index.Batch(batch) + if err != nil { + t.Fatal(err) + } + + // Search again with the same vector + searchRequest = NewSearchRequest(NewMatchNoneQuery()) + 
searchRequest.AddKNN("vectors.vec", []float32{1, 2, 3}, 3, 1.0) + + result, err = index.Search(searchRequest) + if err != nil { + t.Fatal(err) + } + + if len(result.Hits) != 1 { + t.Fatalf("expected 1 hit, got %d", len(result.Hits)) + } + + expectedResult = map[string]float64{ + "doc-multi-1": 1.0, // best score from the 3 vectors + } + + for _, hit := range result.Hits { + expectedScore, exists := expectedResult[hit.ID] + if !exists { + t.Fatalf("unexpected doc ID %s", hit.ID) + } + if math.Abs(hit.Score-expectedScore) > 0.001 { + t.Fatalf("for doc ID %s, expected score %.3f, got %.3f", hit.ID, expectedScore, hit.Score) + } + } +} From 3b5d30c78f360ccd10c8135bce521093d7a2ceeb Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Wed, 3 Dec 2025 19:35:25 +0530 Subject: [PATCH 38/70] code review --- search_knn.go | 13 +++++++++++-- search_knn_test.go | 2 +- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/search_knn.go b/search_knn.go index d9f2f8599..ed5eef229 100644 --- a/search_knn.go +++ b/search_knn.go @@ -520,10 +520,19 @@ func finalizeKNNResults(req *SearchRequest, knnHits []*search.DocumentMatch) []* } else { // we have encountered a duplicate document, so we need to // union the score breakdowns, retaining the best score - // per knn query + // per knn query, while also merging the explanations if req.Explain is true. for k, score := range currHit.ScoreBreakdown { - if score > lastUniqueHit.ScoreBreakdown[k] { + if existing, ok := lastUniqueHit.ScoreBreakdown[k]; !ok || score > existing { lastUniqueHit.ScoreBreakdown[k] = score + // Also update the explanation for this query index if Explain is enabled. + // Both Expl.Children slices are of size len(req.KNN), so indexing by k is safe. + if req.Explain { + // just defensive check to ensure that the Children slice is valid + if len(lastUniqueHit.Expl.Children) <= k { + lastUniqueHit.Expl.Children = append(lastUniqueHit.Expl.Children, make([]*search.Explanation, k-len(lastUniqueHit.Expl.Children)+1)...) 
+ } + lastUniqueHit.Expl.Children[k] = currHit.Expl.Children[k] + } } } } diff --git a/search_knn_test.go b/search_knn_test.go index 5847208a8..4f30d83cc 100644 --- a/search_knn_test.go +++ b/search_knn_test.go @@ -2129,7 +2129,7 @@ func TestVectorObjectArray(t *testing.T) { t.Fatal(err) } - // Search with a vector that is an array of objects + // Search with simple single-vector documents searchRequest := NewSearchRequest(NewMatchNoneQuery()) searchRequest.AddKNN("vec", []float32{1, 2, 3}, 3, 1.0) searchRequest.Explain = true From c2cc7503c6bfcd2e75018507b64561da614f9d2d Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Thu, 4 Dec 2025 01:05:05 +0530 Subject: [PATCH 39/70] fix stat --- index_test.go | 6 +++--- search_knn_test.go | 52 +++++++++++++--------------------------------- 2 files changed, 18 insertions(+), 40 deletions(-) diff --git a/index_test.go b/index_test.go index 7ed27ff86..0cc6ce8e1 100644 --- a/index_test.go +++ b/index_test.go @@ -614,7 +614,7 @@ func TestBytesRead(t *testing.T) { expectedBytesRead := uint64(22049) if supportForVectorSearch { - expectedBytesRead = 22459 + expectedBytesRead = 21574 } if prevBytesRead != expectedBytesRead && res.Cost == prevBytesRead { @@ -772,7 +772,7 @@ func TestBytesReadStored(t *testing.T) { expectedBytesRead := uint64(11911) if supportForVectorSearch { - expectedBytesRead = 12321 + expectedBytesRead = 11435 } if bytesRead != expectedBytesRead && bytesRead == res.Cost { @@ -849,7 +849,7 @@ func TestBytesReadStored(t *testing.T) { expectedBytesRead = uint64(4097) if supportForVectorSearch { - expectedBytesRead = 4507 + expectedBytesRead = 3622 } if bytesRead != expectedBytesRead && bytesRead == res.Cost { diff --git a/search_knn_test.go b/search_knn_test.go index 4f30d83cc..2f7d00f71 100644 --- a/search_knn_test.go +++ b/search_knn_test.go @@ -2071,7 +2071,6 @@ func TestIndexInsightsCentroidCardinalities(t *testing.T) { } } } - func TestVectorObjectArray(t *testing.T) { // Setup 6 documents each with one vector field tmpIndexPath := createTmpIndexPath(t) @@ -2083,9 +2082,13 @@ func TestVectorObjectArray(t *testing.T) { vecFieldMapping.Similarity = index.CosineSimilarity indexMapping.DefaultMapping.AddFieldMappingsAt("vec", vecFieldMapping) - arrayMapping := mapping.NewDocumentMapping() - indexMapping.DefaultMapping.AddSubDocumentMapping("vectors", arrayMapping) - arrayMapping.AddFieldMappingsAt("vec", vecFieldMapping) + arrayFlatMapping := mapping.NewDocumentMapping() + arrayFlatMapping.AddFieldMappingsAt("vec", vecFieldMapping) + indexMapping.DefaultMapping.AddSubDocumentMapping("vectors", arrayFlatMapping) + + // arrayNestedMapping := mapping.NewNestedDocumentMapping() + // indexMapping.DefaultMapping.AddSubDocumentMapping("vectors_nested", arrayNestedMapping) + // arrayNestedMapping.AddFieldMappingsAt("vec", vecFieldMapping) index, err := New(tmpIndexPath, indexMapping) if err != nil { @@ -2105,6 +2108,10 @@ func TestVectorObjectArray(t *testing.T) { `{"vec": [10, 11, 12]}`, `{"vec": [13, 14, 15]}`, `{"vec": [16, 17, 18]}`, + `{"vectors": [ {"vec": [1, 2, 3]}, {"vec": [4, 5, 6]}, {"vec": [7, 8, 9]}]}`, + `{"vectors": [ {"vec": [10, 11, 12]}, {"vec": [13, 14, 15]}, {"vec": [16, 17, 18]}]}`, + // `{"vectors_nested": [ {"vec": [1, 2, 3]}, {"vec": [4, 5, 6]}, {"vec": [7, 8, 9]}]}`, + // `{"vectors_nested": [ {"vec": [10, 11, 12]}, {"vec": [13, 14, 15]}, {"vec": [16, 17, 18]}]}`, } docs := make([]map[string]interface{}, 0, len(docsString)) for _, docStr := range docsString { @@ -2132,7 +2139,6 @@ func TestVectorObjectArray(t *testing.T) 
{ // Search with simple single-vector documents searchRequest := NewSearchRequest(NewMatchNoneQuery()) searchRequest.AddKNN("vec", []float32{1, 2, 3}, 3, 1.0) - searchRequest.Explain = true result, err := index.Search(searchRequest) if err != nil { @@ -2158,39 +2164,11 @@ func TestVectorObjectArray(t *testing.T) { t.Fatalf("for doc ID %s, expected score %.3f, got %.3f", hit.ID, expectedScore, hit.Score) } } - - // Now create 2 docs with 3 vectors each - docsString = []string{ - `{"vectors": [ {"vec": [1, 2, 3]}, {"vec": [4, 5, 6]}, {"vec": [7, 8, 9]}]}`, - `{"vectors": [ {"vec": [10, 11, 12]}, {"vec": [13, 14, 15]}, {"vec": [16, 17, 18]}]}`, - } - docs = make([]map[string]interface{}, 0, len(docsString)) - for _, docStr := range docsString { - var doc map[string]interface{} - err = json.Unmarshal([]byte(docStr), &doc) - if err != nil { - t.Fatal(err) - } - docs = append(docs, doc) - } - - batch = index.NewBatch() - for i, doc := range docs { - err = batch.Index(fmt.Sprintf("doc-multi-%d", i+1), doc) - if err != nil { - t.Fatal(err) - } - } - err = index.Batch(batch) - if err != nil { - t.Fatal(err) - } - // Search again with the same vector - searchRequest = NewSearchRequest(NewMatchNoneQuery()) - searchRequest.AddKNN("vectors.vec", []float32{1, 2, 3}, 3, 1.0) + searchRequestFlat := NewSearchRequest(NewMatchNoneQuery()) + searchRequestFlat.AddKNN("vectors.vec", []float32{1, 2, 3}, 3, 1.0) - result, err = index.Search(searchRequest) + result, err = index.Search(searchRequestFlat) if err != nil { t.Fatal(err) } @@ -2200,7 +2178,7 @@ func TestVectorObjectArray(t *testing.T) { } expectedResult = map[string]float64{ - "doc-multi-1": 1.0, // best score from the 3 vectors + "doc-7": 1.0, // best score from the 3 vectors } for _, hit := range result.Hits { From e6cd8ea844e7861546217d2861f97ebf72fb1a77 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Thu, 4 Dec 2025 03:03:41 +0530 Subject: [PATCH 40/70] Fix vector field aliase validation --- mapping/document.go | 10 +-- mapping/index.go | 7 +- mapping/mapping_no_vectors.go | 2 +- mapping/mapping_vectors.go | 120 ++++++++++++++++++++++------------ 4 files changed, 88 insertions(+), 51 deletions(-) diff --git a/mapping/document.go b/mapping/document.go index 67353afc4..3da925038 100644 --- a/mapping/document.go +++ b/mapping/document.go @@ -54,7 +54,7 @@ type DocumentMapping struct { } func (dm *DocumentMapping) Validate(cache *registry.Cache, - parentName string, fieldAliasCtx map[string]*FieldMapping, + path []string, fieldAliasCtx map[string]*FieldMapping, ) error { var err error if dm.DefaultAnalyzer != "" { @@ -70,11 +70,7 @@ func (dm *DocumentMapping) Validate(cache *registry.Cache, } } for propertyName, property := range dm.Properties { - newParent := propertyName - if parentName != "" { - newParent = fmt.Sprintf("%s.%s", parentName, propertyName) - } - err = property.Validate(cache, newParent, fieldAliasCtx) + err = property.Validate(cache, append(path, propertyName), fieldAliasCtx) if err != nil { return err } @@ -98,7 +94,7 @@ func (dm *DocumentMapping) Validate(cache *registry.Cache, return err } } - err := validateFieldMapping(field, parentName, fieldAliasCtx) + err := validateFieldMapping(field, path, fieldAliasCtx) if err != nil { return err } diff --git a/mapping/index.go b/mapping/index.go index 5c456da04..bafb6ee89 100644 --- a/mapping/index.go +++ b/mapping/index.go @@ -193,12 +193,15 @@ func (im *IndexMappingImpl) Validate() error { return err } } + // fieldAliasCtx is used to detect any field alias conflicts across the entire 
mapping + // the map will hold the fully qualified field name to FieldMapping, so we can + // check for conflicts as we validate each DocumentMapping. fieldAliasCtx := make(map[string]*FieldMapping) // ensure that the nested property is not set for top-level default mapping if im.DefaultMapping.Nested { return fmt.Errorf("default mapping cannot be nested") } - err = im.DefaultMapping.Validate(im.cache, "", fieldAliasCtx) + err = im.DefaultMapping.Validate(im.cache, []string{}, fieldAliasCtx) if err != nil { return err } @@ -207,7 +210,7 @@ func (im *IndexMappingImpl) Validate() error { if docMapping.Nested { return fmt.Errorf("type mapping named: %s cannot be nested", name) } - err = docMapping.Validate(im.cache, "", fieldAliasCtx) + err = docMapping.Validate(im.cache, []string{}, fieldAliasCtx) if err != nil { return err } diff --git a/mapping/mapping_no_vectors.go b/mapping/mapping_no_vectors.go index 90cb1e225..cbe9d81bc 100644 --- a/mapping/mapping_no_vectors.go +++ b/mapping/mapping_no_vectors.go @@ -38,7 +38,7 @@ func (fm *FieldMapping) processVectorBase64(propertyMightBeVector interface{}, // ----------------------------------------------------------------------------- // document validation functions -func validateFieldMapping(field *FieldMapping, parentName string, +func validateFieldMapping(field *FieldMapping, path []string, fieldAliasCtx map[string]*FieldMapping) error { return validateFieldType(field) } diff --git a/mapping/mapping_vectors.go b/mapping/mapping_vectors.go index 20cbac6a8..e02cc1fb7 100644 --- a/mapping/mapping_vectors.go +++ b/mapping/mapping_vectors.go @@ -141,15 +141,28 @@ func (fm *FieldMapping) processVector(propertyMightBeVector interface{}, if !ok { return false } + // Apply defaults for similarity and optimization if not set + similarity := fm.Similarity + if similarity == "" { + similarity = index.DefaultVectorSimilarityMetric + } + vectorIndexOptimizedFor := fm.VectorIndexOptimizedFor + if vectorIndexOptimizedFor == "" { + vectorIndexOptimizedFor = index.DefaultIndexOptimization + } // normalize raw vector if similarity is cosine - if fm.Similarity == index.CosineSimilarity { + if similarity == index.CosineSimilarity { vector = NormalizeVector(vector) } fieldName := getFieldName(pathString, path, fm) options := fm.Options() + // ensure the options are set to not store/index term vectors/doc values + options &^= index.StoreField | index.IncludeTermVectors | index.DocValues + // skip freq/norms for vector field + options |= index.SkipFreqNorm field := document.NewVectorFieldWithIndexingOptions(fieldName, indexes, vector, - fm.Dims, fm.Similarity, fm.VectorIndexOptimizedFor, options) + fm.Dims, similarity, vectorIndexOptimizedFor, options) context.doc.AddField(field) // "_all" composite field is not applicable for vector field @@ -168,15 +181,28 @@ func (fm *FieldMapping) processVectorBase64(propertyMightBeVectorBase64 interfac if err != nil || len(decodedVector) != fm.Dims { return } + // Apply defaults for similarity and optimization if not set + similarity := fm.Similarity + if similarity == "" { + similarity = index.DefaultVectorSimilarityMetric + } + vectorIndexOptimizedFor := fm.VectorIndexOptimizedFor + if vectorIndexOptimizedFor == "" { + vectorIndexOptimizedFor = index.DefaultIndexOptimization + } // normalize raw vector if similarity is cosine - if fm.Similarity == index.CosineSimilarity { + if similarity == index.CosineSimilarity { decodedVector = NormalizeVector(decodedVector) } fieldName := getFieldName(pathString, path, fm) options := 
fm.Options() + // ensure the options are set to not store/index term vectors/doc values + options &^= index.StoreField | index.IncludeTermVectors | index.DocValues + // skip freq/norms for vector field + options |= index.SkipFreqNorm field := document.NewVectorFieldWithIndexingOptions(fieldName, indexes, decodedVector, - fm.Dims, fm.Similarity, fm.VectorIndexOptimizedFor, options) + fm.Dims, similarity, vectorIndexOptimizedFor, options) context.doc.AddField(field) // "_all" composite field is not applicable for vector_base64 field @@ -186,77 +212,89 @@ func (fm *FieldMapping) processVectorBase64(propertyMightBeVectorBase64 interfac // ----------------------------------------------------------------------------- // document validation functions -func validateFieldMapping(field *FieldMapping, parentName string, +func validateFieldMapping(field *FieldMapping, path []string, fieldAliasCtx map[string]*FieldMapping) error { switch field.Type { case "vector", "vector_base64": - return validateVectorFieldAlias(field, parentName, fieldAliasCtx) + return validateVectorFieldAlias(field, path, fieldAliasCtx) default: // non-vector field return validateFieldType(field) } } -func validateVectorFieldAlias(field *FieldMapping, parentName string, +func validateVectorFieldAlias(field *FieldMapping, path []string, fieldAliasCtx map[string]*FieldMapping) error { - - if field.Name == "" { - field.Name = parentName - } - - if field.Similarity == "" { - field.Similarity = index.DefaultVectorSimilarityMetric - } - - if field.VectorIndexOptimizedFor == "" { - field.VectorIndexOptimizedFor = index.DefaultIndexOptimization + // fully qualified field name + pathString := encodePath(path) + // check if field has a name set, else use path to compute effective name + effectiveFieldName := getFieldName(pathString, path, field) + // Compute effective values for validation + effectiveSimilarity := field.Similarity + if effectiveSimilarity == "" { + effectiveSimilarity = index.DefaultVectorSimilarityMetric } - if _, exists := index.SupportedVectorIndexOptimizations[field.VectorIndexOptimizedFor]; !exists { - // if an unsupported config is provided, override to default - field.VectorIndexOptimizedFor = index.DefaultIndexOptimization + effectiveOptimizedFor := field.VectorIndexOptimizedFor + if effectiveOptimizedFor == "" { + effectiveOptimizedFor = index.DefaultIndexOptimization } - // following fields are not applicable for vector - // thus, we set them to default values - field.IncludeInAll = false - field.IncludeTermVectors = false - field.Store = false - field.DocValues = false - field.SkipFreqNorm = true - - // # If alias is present, validate the field options as per the alias + // # If alias is present, validate the field options as per the alias. 
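+	// Here an alias means two field mappings that resolve to the same fully
+	// qualified field name; aliased mappings must agree on dims, similarity and
+	// vector index optimization.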
// note: reading from a nil map is safe - if fieldAlias, ok := fieldAliasCtx[field.Name]; ok { + if fieldAlias, ok := fieldAliasCtx[effectiveFieldName]; ok { if field.Dims != fieldAlias.Dims { return fmt.Errorf("field: '%s', invalid alias "+ - "(different dimensions %d and %d)", fieldAlias.Name, field.Dims, + "(different dimensions %d and %d)", effectiveFieldName, field.Dims, fieldAlias.Dims) } - if field.Similarity != fieldAlias.Similarity { + // Compare effective similarity values + aliasSimilarity := fieldAlias.Similarity + if aliasSimilarity == "" { + aliasSimilarity = index.DefaultVectorSimilarityMetric + } + if effectiveSimilarity != aliasSimilarity { return fmt.Errorf("field: '%s', invalid alias "+ - "(different similarity values %s and %s)", fieldAlias.Name, - field.Similarity, fieldAlias.Similarity) + "(different similarity values %s and %s)", effectiveFieldName, + effectiveSimilarity, aliasSimilarity) + } + + // Compare effective vector index optimization values + aliasOptimizedFor := fieldAlias.VectorIndexOptimizedFor + if aliasOptimizedFor == "" { + aliasOptimizedFor = index.DefaultIndexOptimization + } + if effectiveOptimizedFor != aliasOptimizedFor { + return fmt.Errorf("field: '%s', invalid alias "+ + "(different vector index optimization values %s and %s)", effectiveFieldName, + effectiveOptimizedFor, aliasOptimizedFor) } return nil } // # Validate field options - + // Vector dimensions must be within allowed range if field.Dims < MinVectorDims || field.Dims > MaxVectorDims { return fmt.Errorf("field: '%s', invalid vector dimension: %d,"+ - " value should be in range (%d, %d)", field.Name, field.Dims, + " value should be in range (%d, %d)", effectiveFieldName, field.Dims, MinVectorDims, MaxVectorDims) } - - if _, ok := index.SupportedVectorSimilarityMetrics[field.Similarity]; !ok { + // Similarity metric must be supported + if _, ok := index.SupportedVectorSimilarityMetrics[effectiveSimilarity]; !ok { return fmt.Errorf("field: '%s', invalid similarity "+ - "metric: '%s', valid metrics are: %+v", field.Name, field.Similarity, + "metric: '%s', valid metrics are: %+v", effectiveFieldName, effectiveSimilarity, reflect.ValueOf(index.SupportedVectorSimilarityMetrics).MapKeys()) } + // Vector index optimization must be supported + if _, ok := index.SupportedVectorIndexOptimizations[effectiveOptimizedFor]; !ok { + return fmt.Errorf("field: '%s', invalid vector index "+ + "optimization: '%s', valid optimizations are: %+v", effectiveFieldName, + effectiveOptimizedFor, + reflect.ValueOf(index.SupportedVectorIndexOptimizations).MapKeys()) + } if fieldAliasCtx != nil { // writing to a nil map is unsafe - fieldAliasCtx[field.Name] = field + fieldAliasCtx[effectiveFieldName] = field } return nil From fc13aca432f38e190fcbfff4e869f32ccf0a0d01 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Thu, 4 Dec 2025 03:48:22 +0530 Subject: [PATCH 41/70] unit tests --- document/field_geoshape.go | 4 +- mapping/mapping_vectors.go | 6 +- mapping/mapping_vectors_test.go | 739 +++++++++++++++++++++++++++++++- 3 files changed, 745 insertions(+), 4 deletions(-) diff --git a/document/field_geoshape.go b/document/field_geoshape.go index aa73c2917..6282ff12b 100644 --- a/document/field_geoshape.go +++ b/document/field_geoshape.go @@ -180,7 +180,7 @@ func NewGeoShapeFieldFromShapeWithIndexingOptions(name string, arrayPositions [] // docvalues are always enabled for geoshape fields, even if the // indexing options are set to not include docvalues. 
- options = options | index.DocValues + options |= index.DocValues return &GeoShapeField{ shape: shape, @@ -232,7 +232,7 @@ func NewGeometryCollectionFieldFromShapesWithIndexingOptions(name string, // docvalues are always enabled for geoshape fields, even if the // indexing options are set to not include docvalues. - options = options | index.DocValues + options |= index.DocValues return &GeoShapeField{ shape: shape, diff --git a/mapping/mapping_vectors.go b/mapping/mapping_vectors.go index e02cc1fb7..246954022 100644 --- a/mapping/mapping_vectors.go +++ b/mapping/mapping_vectors.go @@ -156,11 +156,13 @@ func (fm *FieldMapping) processVector(propertyMightBeVector interface{}, } fieldName := getFieldName(pathString, path, fm) + options := fm.Options() // ensure the options are set to not store/index term vectors/doc values options &^= index.StoreField | index.IncludeTermVectors | index.DocValues // skip freq/norms for vector field options |= index.SkipFreqNorm + field := document.NewVectorFieldWithIndexingOptions(fieldName, indexes, vector, fm.Dims, similarity, vectorIndexOptimizedFor, options) context.doc.AddField(field) @@ -197,10 +199,12 @@ func (fm *FieldMapping) processVectorBase64(propertyMightBeVectorBase64 interfac fieldName := getFieldName(pathString, path, fm) options := fm.Options() + // ensure the options are set to not store/index term vectors/doc values options &^= index.StoreField | index.IncludeTermVectors | index.DocValues // skip freq/norms for vector field options |= index.SkipFreqNorm + field := document.NewVectorFieldWithIndexingOptions(fieldName, indexes, decodedVector, fm.Dims, similarity, vectorIndexOptimizedFor, options) context.doc.AddField(field) @@ -276,7 +280,7 @@ func validateVectorFieldAlias(field *FieldMapping, path []string, // Vector dimensions must be within allowed range if field.Dims < MinVectorDims || field.Dims > MaxVectorDims { return fmt.Errorf("field: '%s', invalid vector dimension: %d,"+ - " value should be in range (%d, %d)", effectiveFieldName, field.Dims, + " value should be in range [%d, %d]", effectiveFieldName, field.Dims, MinVectorDims, MaxVectorDims) } // Similarity metric must be supported diff --git a/mapping/mapping_vectors_test.go b/mapping/mapping_vectors_test.go index b9742376f..b00e5c094 100644 --- a/mapping/mapping_vectors_test.go +++ b/mapping/mapping_vectors_test.go @@ -19,6 +19,7 @@ package mapping import ( "reflect" + "strings" "testing" ) @@ -175,6 +176,740 @@ func TestVectorFieldAliasValidation(t *testing.T) { expValidity: false, errMsgs: []string{`field: 'vecData', invalid alias (different dimensions 4 and 3)`}, }, + // Test 6: Different vector index optimization values (alias case) + { + name: "different_optimization_alias", + mappingStr: ` + { + "default_mapping": { + "properties": { + "cityVec": { + "fields": [ + { + "type": "vector", + "dims": 3, + "vector_index_optimized_for": "recall" + }, + { + "name": "cityVec", + "type": "vector", + "dims": 3, + "vector_index_optimized_for": "latency" + } + ] + } + } + } + }`, + expValidity: false, + errMsgs: []string{`field: 'cityVec', invalid alias (different vector index optimization values latency and recall)`}, + }, + // Test 7: Invalid dimensions - below minimum + { + name: "dims_below_minimum", + mappingStr: ` + { + "default_mapping": { + "properties": { + "cityVec": { + "fields": [ + { + "type": "vector", + "dims": 0 + } + ] + } + } + } + }`, + expValidity: false, + errMsgs: []string{`field: 'cityVec', invalid vector dimension: 0, value should be in range [1, 4096]`}, + }, + 
// Test 8: Invalid dimensions - above maximum + { + name: "dims_above_maximum", + mappingStr: ` + { + "default_mapping": { + "properties": { + "cityVec": { + "fields": [ + { + "type": "vector", + "dims": 5000 + } + ] + } + } + } + }`, + expValidity: false, + errMsgs: []string{`field: 'cityVec', invalid vector dimension: 5000, value should be in range [1, 4096]`}, + }, + // Test 9: Invalid similarity metric + { + name: "invalid_similarity_metric", + mappingStr: ` + { + "default_mapping": { + "properties": { + "cityVec": { + "fields": [ + { + "type": "vector", + "dims": 3, + "similarity": "invalid_metric" + } + ] + } + } + } + }`, + expValidity: false, + // Note: error message contains map keys which have non-deterministic order + errMsgs: []string{`invalid similarity metric: 'invalid_metric'`}, + }, + // Test 10: Invalid vector index optimization + { + name: "invalid_optimization", + mappingStr: ` + { + "default_mapping": { + "properties": { + "cityVec": { + "fields": [ + { + "type": "vector", + "dims": 3, + "vector_index_optimized_for": "invalid_opt" + } + ] + } + } + } + }`, + expValidity: false, + // Note: error message contains map keys which have non-deterministic order + errMsgs: []string{`invalid vector index optimization: 'invalid_opt'`}, + }, + // Test 11: vector_base64 type with valid dimensions + { + name: "vector_base64_valid", + mappingStr: ` + { + "default_mapping": { + "properties": { + "cityVec": { + "fields": [ + { + "type": "vector_base64", + "dims": 128 + } + ] + } + } + } + }`, + expValidity: true, + errMsgs: []string{}, + }, + // Test 12: vector_base64 alias with different dimensions + { + name: "vector_base64_different_dims_alias", + mappingStr: ` + { + "default_mapping": { + "properties": { + "cityVec": { + "fields": [ + { + "type": "vector_base64", + "dims": 128 + }, + { + "name": "cityVec", + "type": "vector_base64", + "dims": 256 + } + ] + } + } + } + }`, + expValidity: false, + errMsgs: []string{`field: 'cityVec', invalid alias (different dimensions 256 and 128)`}, + }, + // Test 13: Default similarity matching explicit similarity in alias + { + name: "default_similarity_matches_explicit", + mappingStr: ` + { + "default_mapping": { + "properties": { + "cityVec": { + "fields": [ + { + "type": "vector", + "dims": 3 + }, + { + "name": "cityVec", + "type": "vector", + "dims": 3, + "similarity": "l2_norm" + } + ] + } + } + } + }`, + expValidity: true, + errMsgs: []string{}, + }, + // Test 14: Default optimization matching explicit optimization in alias + { + name: "default_optimization_matches_explicit", + mappingStr: ` + { + "default_mapping": { + "properties": { + "cityVec": { + "fields": [ + { + "type": "vector", + "dims": 3 + }, + { + "name": "cityVec", + "type": "vector", + "dims": 3, + "vector_index_optimized_for": "recall" + } + ] + } + } + } + }`, + expValidity: true, + errMsgs: []string{}, + }, + // Test 15: Valid alias with all explicit matching values + { + name: "valid_alias_all_explicit_matching", + mappingStr: ` + { + "default_mapping": { + "properties": { + "cityVec": { + "fields": [ + { + "type": "vector", + "dims": 64, + "similarity": "dot_product", + "vector_index_optimized_for": "latency" + }, + { + "name": "cityVec", + "type": "vector", + "dims": 64, + "similarity": "dot_product", + "vector_index_optimized_for": "latency" + } + ] + } + } + } + }`, + expValidity: true, + errMsgs: []string{}, + }, + // Test 16: Cross-property alias with different similarity + { + name: "cross_property_different_similarity", + mappingStr: ` + { + "default_mapping": { + 
"properties": { + "cityVec": { + "fields": [ + { + "name": "vecData", + "type": "vector", + "dims": 3, + "similarity": "cosine" + } + ] + }, + "countryVec": { + "fields": [ + { + "name": "vecData", + "type": "vector", + "dims": 3, + "similarity": "l2_norm" + } + ] + } + } + } + }`, + expValidity: false, + errMsgs: []string{ + `field: 'vecData', invalid alias (different similarity values l2_norm and cosine)`, + `field: 'vecData', invalid alias (different similarity values cosine and l2_norm)`, + }, + }, + // Test 17: Cross-property alias with different optimization + { + name: "cross_property_different_optimization", + mappingStr: ` + { + "default_mapping": { + "properties": { + "cityVec": { + "fields": [ + { + "name": "vecData", + "type": "vector", + "dims": 3, + "vector_index_optimized_for": "recall" + } + ] + }, + "countryVec": { + "fields": [ + { + "name": "vecData", + "type": "vector", + "dims": 3, + "vector_index_optimized_for": "memory-efficient" + } + ] + } + } + } + }`, + expValidity: false, + errMsgs: []string{ + `field: 'vecData', invalid alias (different vector index optimization values memory-efficient and recall)`, + `field: 'vecData', invalid alias (different vector index optimization values recall and memory-efficient)`, + }, + }, + // Test 18: Valid cross-property alias with matching values + { + name: "valid_cross_property_alias", + mappingStr: ` + { + "default_mapping": { + "properties": { + "cityVec": { + "fields": [ + { + "name": "vecData", + "type": "vector", + "dims": 64, + "similarity": "dot_product", + "vector_index_optimized_for": "latency" + } + ] + }, + "countryVec": { + "fields": [ + { + "name": "vecData", + "type": "vector", + "dims": 64, + "similarity": "dot_product", + "vector_index_optimized_for": "latency" + } + ] + } + } + } + }`, + expValidity: true, + errMsgs: []string{}, + }, + // Test 20: Different fully qualified paths - a.b.c.f vs f (different effective names, no conflict) + { + name: "different_fq_paths_no_conflict", + mappingStr: ` + { + "default_mapping": { + "properties": { + "a": { + "properties": { + "b": { + "properties": { + "c": { + "fields": [ + { + "name": "f", + "type": "vector", + "dims": 64 + } + ] + } + } + } + } + }, + "x": { + "fields": [ + { + "name": "f", + "type": "vector", + "dims": 128 + } + ] + } + } + } + }`, + expValidity: true, + errMsgs: []string{}, + }, + // Test 21: Same leaf property name at different paths (a.b.vec vs x.y.vec) - no conflict + { + name: "same_leaf_different_paths_no_conflict", + mappingStr: ` + { + "default_mapping": { + "properties": { + "a": { + "properties": { + "b": { + "properties": { + "vec": { + "fields": [ + { + "type": "vector", + "dims": 64 + } + ] + } + } + } + } + }, + "x": { + "properties": { + "y": { + "properties": { + "vec": { + "fields": [ + { + "type": "vector", + "dims": 128 + } + ] + } + } + } + } + } + } + } + }`, + expValidity: true, + errMsgs: []string{}, + }, + // Test 22: Field name override creates same effective name - alias conflict + // a.b with name "data" → effective "a.data" + // a with name "data" → effective "data" + // These are different, so no conflict + { + name: "field_name_override_different_parents_no_conflict", + mappingStr: ` + { + "default_mapping": { + "properties": { + "a": { + "properties": { + "b": { + "fields": [ + { + "name": "data", + "type": "vector", + "dims": 64 + } + ] + } + } + }, + "a2": { + "fields": [ + { + "name": "data", + "type": "vector", + "dims": 128 + } + ] + } + } + } + }`, + expValidity: true, + errMsgs: []string{}, + }, + // Test 23: 
Same effective field name via name override - should conflict + // a.b with name "sharedVec" → effective "a.sharedVec" + // a.c with name "sharedVec" → effective "a.sharedVec" + // Both resolve to same effective name with different dims → conflict + { + name: "same_effective_name_via_override_conflict", + mappingStr: ` + { + "default_mapping": { + "properties": { + "a": { + "properties": { + "b": { + "fields": [ + { + "name": "sharedVec", + "type": "vector", + "dims": 64 + } + ] + }, + "c": { + "fields": [ + { + "name": "sharedVec", + "type": "vector", + "dims": 128 + } + ] + } + } + } + } + } + }`, + expValidity: false, + errMsgs: []string{ + `field: 'a.sharedVec', invalid alias (different dimensions 128 and 64)`, + `field: 'a.sharedVec', invalid alias (different dimensions 64 and 128)`, + }, + }, + // Test 24: Deep nesting with same effective name via name override - should conflict + // level1.level2.propA with name "vec" → effective "level1.level2.vec" + // level1.level2.propB with name "vec" → effective "level1.level2.vec" + { + name: "deep_nesting_same_effective_name_conflict", + mappingStr: ` + { + "default_mapping": { + "properties": { + "level1": { + "properties": { + "level2": { + "properties": { + "propA": { + "fields": [ + { + "name": "vec", + "type": "vector", + "dims": 64 + } + ] + }, + "propB": { + "fields": [ + { + "name": "vec", + "type": "vector", + "dims": 128 + } + ] + } + } + } + } + } + } + } + }`, + expValidity: false, + errMsgs: []string{ + `field: 'level1.level2.vec', invalid alias (different dimensions 128 and 64)`, + `field: 'level1.level2.vec', invalid alias (different dimensions 64 and 128)`, + }, + }, + // Test 25: Root level field vs nested field with same name - no conflict + // Root: "embedding" → effective "embedding" + // Nested: a.b.embedding → effective "a.b.embedding" + { + name: "root_vs_nested_same_name_no_conflict", + mappingStr: ` + { + "default_mapping": { + "properties": { + "embedding": { + "fields": [ + { + "type": "vector", + "dims": 64 + } + ] + }, + "nested": { + "properties": { + "deep": { + "properties": { + "embedding": { + "fields": [ + { + "type": "vector", + "dims": 256 + } + ] + } + } + } + } + } + } + } + }`, + expValidity: true, + errMsgs: []string{}, + }, + // Test 26: Multiple levels with name override targeting same effective path + // a.b.x with name "target" → effective "a.b.target" + // a.b.target (no override) → effective "a.b.target" + // Same effective name, different dims → conflict + { + name: "name_override_matches_sibling_path_conflict", + mappingStr: ` + { + "default_mapping": { + "properties": { + "a": { + "properties": { + "b": { + "properties": { + "x": { + "fields": [ + { + "name": "target", + "type": "vector", + "dims": 64 + } + ] + }, + "target": { + "fields": [ + { + "type": "vector", + "dims": 128 + } + ] + } + } + } + } + } + } + } + }`, + expValidity: false, + errMsgs: []string{ + `field: 'a.b.target', invalid alias (different dimensions 128 and 64)`, + `field: 'a.b.target', invalid alias (different dimensions 64 and 128)`, + }, + }, + // Test 27: Valid alias at deep nesting level + { + name: "valid_alias_deep_nesting", + mappingStr: ` + { + "default_mapping": { + "properties": { + "a": { + "properties": { + "b": { + "properties": { + "c": { + "properties": { + "vec": { + "fields": [ + { + "type": "vector", + "dims": 128, + "similarity": "dot_product" + }, + { + "name": "vec", + "type": "vector", + "dims": 128, + "similarity": "dot_product" + } + ] + } + } + } + } + } + } + } + } + } + }`, + expValidity: 
true, + errMsgs: []string{}, + }, + // Test 28: Valid alias with different paths but same effective field name + // vectors.vec with name "vec" → effective "vectors.vec" + // vec with name "vec" → effective "vec" + // Different effective names, so no conflict + { + name: "valid_alias_different_paths_same_field_name", + mappingStr: ` + { + "default_mapping": { + "dynamic": false, + "enabled": true, + "properties": { + "vectors": { + "dynamic": true, + "enabled": true, + "properties": { + "vec": { + "enabled": true, + "dynamic": false, + "fields": [ + { + "dims": 3, + "index": true, + "name": "vec", + "type": "vector" + } + ] + } + } + }, + "vec": { + "enabled": true, + "dynamic": false, + "fields": [ + { + "dims": 3, + "index": true, + "name": "vec", + "similarity": "l2_norm", + "type": "vector", + "vector_index_optimized_for": "recall" + } + ] + } + } + } + }`, + expValidity: true, + errMsgs: []string{}, + }, } for _, test := range tests { @@ -195,7 +930,9 @@ func TestVectorFieldAliasValidation(t *testing.T) { if !isValid { errStringMatched := false for _, possibleErrMsg := range test.errMsgs { - if err.Error() == possibleErrMsg { + // Use Contains for matching since some error messages include + // map keys which have non-deterministic ordering + if err.Error() == possibleErrMsg || strings.Contains(err.Error(), possibleErrMsg) { errStringMatched = true break } From 56e46faecf17f979b3b83fba7ed817794969a565 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Thu, 4 Dec 2025 05:39:23 +0530 Subject: [PATCH 42/70] Fix vector normalization to handle multi-vectors correctly --- mapping/mapping_vectors.go | 33 +++++++-- mapping/mapping_vectors_test.go | 118 ++++++++++++++++++++++++++++++++ 2 files changed, 147 insertions(+), 4 deletions(-) diff --git a/mapping/mapping_vectors.go b/mapping/mapping_vectors.go index 246954022..24bd9b78c 100644 --- a/mapping/mapping_vectors.go +++ b/mapping/mapping_vectors.go @@ -20,6 +20,7 @@ package mapping import ( "fmt" "reflect" + "slices" "github.com/blevesearch/bleve/v2/document" "github.com/blevesearch/bleve/v2/util" @@ -151,8 +152,10 @@ func (fm *FieldMapping) processVector(propertyMightBeVector interface{}, vectorIndexOptimizedFor = index.DefaultIndexOptimization } // normalize raw vector if similarity is cosine + // Since the vector can be multi-vector (flattened array of multiple vectors), + // we use NormalizeMultiVector to normalize each sub-vector independently. if similarity == index.CosineSimilarity { - vector = NormalizeVector(vector) + vector = NormalizeMultiVector(vector, fm.Dims) } fieldName := getFieldName(pathString, path, fm) @@ -194,7 +197,7 @@ func (fm *FieldMapping) processVectorBase64(propertyMightBeVectorBase64 interfac } // normalize raw vector if similarity is cosine if similarity == index.CosineSimilarity { - decodedVector = NormalizeVector(decodedVector) + decodedVector = NormalizeMultiVector(decodedVector, fm.Dims) } fieldName := getFieldName(pathString, path, fm) @@ -304,11 +307,33 @@ func validateVectorFieldAlias(field *FieldMapping, path []string, return nil } +// NormalizeVector normalizes a single vector to unit length. +// It makes a copy of the input vector to avoid modifying it in-place. 
func NormalizeVector(vec []float32) []float32 { // make a copy of the vector to avoid modifying the original // vector in-place - vecCopy := make([]float32, len(vec)) - copy(vecCopy, vec) + vecCopy := slices.Clone(vec) // normalize the vector copy using in-place normalization provided by faiss return faiss.NormalizeVector(vecCopy) } + +// NormalizeMultiVector normalizes each sub-vector of size `dims` independently. +// For a flattened array containing multiple vectors, each sub-vector is +// normalized separately to unit length. +// It makes a copy of the input vector to avoid modifying it in-place. +func NormalizeMultiVector(vec []float32, dims int) []float32 { + if len(vec) == 0 || dims <= 0 || len(vec)%dims != 0 { + return vec + } + // Single vector - delegate to NormalizeVector + if len(vec) == dims { + return NormalizeVector(vec) + } + // Multi-vector - make a copy to avoid modifying the original + result := slices.Clone(vec) + // Normalize each sub-vector in-place + for i := 0; i < len(result); i += dims { + faiss.NormalizeVector(result[i : i+dims]) + } + return result +} diff --git a/mapping/mapping_vectors_test.go b/mapping/mapping_vectors_test.go index b00e5c094..0620510a0 100644 --- a/mapping/mapping_vectors_test.go +++ b/mapping/mapping_vectors_test.go @@ -18,6 +18,7 @@ package mapping import ( + "math" "reflect" "strings" "testing" @@ -1069,3 +1070,120 @@ func TestNormalizeVector(t *testing.T) { } } } + +func TestNormalizeMultiVectors(t *testing.T) { + tests := []struct { + name string + input []float32 + dims int + expected []float32 + }{ + { + name: "single vector - already normalized", + input: []float32{1, 0, 0}, + dims: 3, + expected: []float32{1, 0, 0}, + }, + { + name: "single vector - needs normalization", + input: []float32{3, 0, 0}, + dims: 3, + expected: []float32{1, 0, 0}, + }, + { + name: "two vectors - X and Y directions", + input: []float32{3, 0, 0, 0, 4, 0}, + dims: 3, + expected: []float32{1, 0, 0, 0, 1, 0}, + }, + { + name: "three vectors", + input: []float32{3, 0, 0, 0, 4, 0, 0, 0, 5}, + dims: 3, + expected: []float32{1, 0, 0, 0, 1, 0, 0, 0, 1}, + }, + { + name: "two 2D vectors", + input: []float32{3, 4, 5, 12}, + dims: 2, + expected: []float32{0.6, 0.8, 0.38461538, 0.92307693}, + }, + { + name: "empty vector", + input: []float32{}, + dims: 3, + expected: []float32{}, + }, + { + name: "zero dims", + input: []float32{1, 2, 3}, + dims: 0, + expected: []float32{1, 2, 3}, + }, + { + name: "negative dims", + input: []float32{1, 2, 3}, + dims: -1, + expected: []float32{1, 2, 3}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Make a copy of input to verify original is not modified + inputCopy := make([]float32, len(tt.input)) + copy(inputCopy, tt.input) + + result := NormalizeMultiVector(tt.input, tt.dims) + + // Check result matches expected + if len(result) != len(tt.expected) { + t.Errorf("length mismatch: expected %d, got %d", len(tt.expected), len(result)) + return + } + + for i := range result { + if !floatApproxEqual(result[i], tt.expected[i], 1e-5) { + t.Errorf("value mismatch at index %d: expected %v, got %v", + i, tt.expected[i], result[i]) + } + } + + // Verify original input was not modified + if !reflect.DeepEqual(tt.input, inputCopy) { + t.Errorf("original input was modified: was %v, now %v", inputCopy, tt.input) + } + + // For valid multi-vectors, verify each sub-vector has unit magnitude + if tt.dims > 0 && len(tt.input) > 0 && len(tt.input)%tt.dims == 0 { + numVecs := len(result) / tt.dims + for i := 0; i < 
numVecs; i++ { + subVec := result[i*tt.dims : (i+1)*tt.dims] + mag := magnitude(subVec) + // Allow for zero vectors (magnitude 0) or unit vectors (magnitude 1) + if mag > 1e-6 && !floatApproxEqual(mag, 1.0, 1e-5) { + t.Errorf("sub-vector %d has magnitude %v, expected 1.0", i, mag) + } + } + } + }) + } +} + +// Helper to compute magnitude of a vector +func magnitude(v []float32) float32 { + var sum float32 + for _, x := range v { + sum += x * x + } + return float32(math.Sqrt(float64(sum))) +} + +// Helper for approximate float comparison +func floatApproxEqual(a, b, epsilon float32) bool { + diff := a - b + if diff < 0 { + diff = -diff + } + return diff < epsilon +} From 781335b7b6ea2254c804132d8f24a2d8304b4655 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Thu, 4 Dec 2025 05:39:23 +0530 Subject: [PATCH 43/70] Fix vector normalization to handle multi-vectors correctly --- mapping/mapping_vectors.go | 33 +++++++-- mapping/mapping_vectors_test.go | 118 ++++++++++++++++++++++++++++++++ 2 files changed, 147 insertions(+), 4 deletions(-) diff --git a/mapping/mapping_vectors.go b/mapping/mapping_vectors.go index 20cbac6a8..dbba711eb 100644 --- a/mapping/mapping_vectors.go +++ b/mapping/mapping_vectors.go @@ -20,6 +20,7 @@ package mapping import ( "fmt" "reflect" + "slices" "github.com/blevesearch/bleve/v2/document" "github.com/blevesearch/bleve/v2/util" @@ -142,8 +143,10 @@ func (fm *FieldMapping) processVector(propertyMightBeVector interface{}, return false } // normalize raw vector if similarity is cosine + // Since the vector can be multi-vector (flattened array of multiple vectors), + // we use NormalizeMultiVector to normalize each sub-vector independently. if fm.Similarity == index.CosineSimilarity { - vector = NormalizeVector(vector) + vector = NormalizeMultiVector(vector, fm.Dims) } fieldName := getFieldName(pathString, path, fm) @@ -170,7 +173,7 @@ func (fm *FieldMapping) processVectorBase64(propertyMightBeVectorBase64 interfac } // normalize raw vector if similarity is cosine if fm.Similarity == index.CosineSimilarity { - decodedVector = NormalizeVector(decodedVector) + decodedVector = NormalizeMultiVector(decodedVector, fm.Dims) } fieldName := getFieldName(pathString, path, fm) @@ -262,11 +265,33 @@ func validateVectorFieldAlias(field *FieldMapping, parentName string, return nil } +// NormalizeVector normalizes a single vector to unit length. +// It makes a copy of the input vector to avoid modifying it in-place. func NormalizeVector(vec []float32) []float32 { // make a copy of the vector to avoid modifying the original // vector in-place - vecCopy := make([]float32, len(vec)) - copy(vecCopy, vec) + vecCopy := slices.Clone(vec) // normalize the vector copy using in-place normalization provided by faiss return faiss.NormalizeVector(vecCopy) } + +// NormalizeMultiVector normalizes each sub-vector of size `dims` independently. +// For a flattened array containing multiple vectors, each sub-vector is +// normalized separately to unit length. +// It makes a copy of the input vector to avoid modifying it in-place. 
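+// Illustrative example (values mirror the accompanying unit test): with
+// dims = 3, the flattened input [3, 0, 0, 0, 4, 0] is treated as the two
+// sub-vectors [3, 0, 0] and [0, 4, 0] and normalizes to [1, 0, 0, 0, 1, 0].
+// Inputs whose length is not a positive multiple of dims are returned unchanged.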
+func NormalizeMultiVector(vec []float32, dims int) []float32 { + if len(vec) == 0 || dims <= 0 || len(vec)%dims != 0 { + return vec + } + // Single vector - delegate to NormalizeVector + if len(vec) == dims { + return NormalizeVector(vec) + } + // Multi-vector - make a copy to avoid modifying the original + result := slices.Clone(vec) + // Normalize each sub-vector in-place + for i := 0; i < len(result); i += dims { + faiss.NormalizeVector(result[i : i+dims]) + } + return result +} diff --git a/mapping/mapping_vectors_test.go b/mapping/mapping_vectors_test.go index b9742376f..7fda5c3b5 100644 --- a/mapping/mapping_vectors_test.go +++ b/mapping/mapping_vectors_test.go @@ -18,6 +18,7 @@ package mapping import ( + "math" "reflect" "testing" ) @@ -332,3 +333,120 @@ func TestNormalizeVector(t *testing.T) { } } } + +func TestNormalizeMultiVectors(t *testing.T) { + tests := []struct { + name string + input []float32 + dims int + expected []float32 + }{ + { + name: "single vector - already normalized", + input: []float32{1, 0, 0}, + dims: 3, + expected: []float32{1, 0, 0}, + }, + { + name: "single vector - needs normalization", + input: []float32{3, 0, 0}, + dims: 3, + expected: []float32{1, 0, 0}, + }, + { + name: "two vectors - X and Y directions", + input: []float32{3, 0, 0, 0, 4, 0}, + dims: 3, + expected: []float32{1, 0, 0, 0, 1, 0}, + }, + { + name: "three vectors", + input: []float32{3, 0, 0, 0, 4, 0, 0, 0, 5}, + dims: 3, + expected: []float32{1, 0, 0, 0, 1, 0, 0, 0, 1}, + }, + { + name: "two 2D vectors", + input: []float32{3, 4, 5, 12}, + dims: 2, + expected: []float32{0.6, 0.8, 0.38461538, 0.92307693}, + }, + { + name: "empty vector", + input: []float32{}, + dims: 3, + expected: []float32{}, + }, + { + name: "zero dims", + input: []float32{1, 2, 3}, + dims: 0, + expected: []float32{1, 2, 3}, + }, + { + name: "negative dims", + input: []float32{1, 2, 3}, + dims: -1, + expected: []float32{1, 2, 3}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Make a copy of input to verify original is not modified + inputCopy := make([]float32, len(tt.input)) + copy(inputCopy, tt.input) + + result := NormalizeMultiVector(tt.input, tt.dims) + + // Check result matches expected + if len(result) != len(tt.expected) { + t.Errorf("length mismatch: expected %d, got %d", len(tt.expected), len(result)) + return + } + + for i := range result { + if !floatApproxEqual(result[i], tt.expected[i], 1e-5) { + t.Errorf("value mismatch at index %d: expected %v, got %v", + i, tt.expected[i], result[i]) + } + } + + // Verify original input was not modified + if !reflect.DeepEqual(tt.input, inputCopy) { + t.Errorf("original input was modified: was %v, now %v", inputCopy, tt.input) + } + + // For valid multi-vectors, verify each sub-vector has unit magnitude + if tt.dims > 0 && len(tt.input) > 0 && len(tt.input)%tt.dims == 0 { + numVecs := len(result) / tt.dims + for i := 0; i < numVecs; i++ { + subVec := result[i*tt.dims : (i+1)*tt.dims] + mag := magnitude(subVec) + // Allow for zero vectors (magnitude 0) or unit vectors (magnitude 1) + if mag > 1e-6 && !floatApproxEqual(mag, 1.0, 1e-5) { + t.Errorf("sub-vector %d has magnitude %v, expected 1.0", i, mag) + } + } + } + }) + } +} + +// Helper to compute magnitude of a vector +func magnitude(v []float32) float32 { + var sum float32 + for _, x := range v { + sum += x * x + } + return float32(math.Sqrt(float64(sum))) +} + +// Helper for approximate float comparison +func floatApproxEqual(a, b, epsilon float32) bool { + diff := a - b + if diff < 
0 { + diff = -diff + } + return diff < epsilon +} From 4e3891fd0880b3acca5f97e564eb1b6f02e4f860 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Thu, 4 Dec 2025 06:41:25 +0530 Subject: [PATCH 44/70] UT --- search_knn_test.go | 279 ++++++++++++++++++++++----------------------- 1 file changed, 137 insertions(+), 142 deletions(-) diff --git a/search_knn_test.go b/search_knn_test.go index 4f30d83cc..7d110b5ec 100644 --- a/search_knn_test.go +++ b/search_knn_test.go @@ -1645,6 +1645,143 @@ func TestNestedVectors(t *testing.T) { } } +// TestMultiVectorCosineNormalization verifies that multi-vector fields are +// normalized correctly with cosine similarity. Each sub-vector in a multi-vector +// should be independently normalized, producing correct similarity scores. +func TestMultiVectorCosineNormalization(t *testing.T) { + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) + + const dims = 3 + + // Create index with cosine similarity + indexMapping := NewIndexMapping() + vecFieldMapping := mapping.NewVectorFieldMapping() + vecFieldMapping.Dims = dims + vecFieldMapping.Similarity = index.CosineSimilarity + indexMapping.DefaultMapping.AddFieldMappingsAt("vec", vecFieldMapping) + + // Multi-vector field + vecFieldMappingNested := mapping.NewVectorFieldMapping() + vecFieldMappingNested.Dims = dims + vecFieldMappingNested.Similarity = index.CosineSimilarity + indexMapping.DefaultMapping.AddFieldMappingsAt("vec_nested", vecFieldMappingNested) + + idx, err := New(tmpIndexPath, indexMapping) + if err != nil { + t.Fatal(err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + docsString := []string{ + `{"vec": [3, 0, 0]}`, + `{"vec": [0, 4, 0]}`, + `{"vec_nested": [[3, 0, 0], [0, 4, 0]]}`, + } + + for i, docStr := range docsString { + var doc map[string]interface{} + err = json.Unmarshal([]byte(docStr), &doc) + if err != nil { + t.Fatal(err) + } + err = idx.Index(fmt.Sprintf("doc%d", i+1), doc) + if err != nil { + t.Fatal(err) + } + } + + // Query for X direction [1,0,0] + searchReq := NewSearchRequest(query.NewMatchNoneQuery()) + searchReq.AddKNN("vec", []float32{1, 0, 0}, 3, 1.0) + res, err := idx.Search(searchReq) + if err != nil { + t.Fatal(err) + } + if len(res.Hits) != 2 { + t.Fatalf("expected 2 hits, got %d", len(res.Hits)) + } + // Hit 1 should be doc1 with score 1.0 (perfect match) + if res.Hits[0].ID != "doc1" { + t.Fatalf("expected doc1 as first hit, got %s", res.Hits[0].ID) + } + if math.Abs(float64(res.Hits[0].Score-1.0)) > 1e-6 { + t.Fatalf("expected score 1.0, got %f", res.Hits[0].Score) + } + // Hit 2 should be doc2 with a score of 0.0 (orthogonal) + if res.Hits[1].ID != "doc2" { + t.Fatalf("expected doc2 as second hit, got %s", res.Hits[1].ID) + } + if math.Abs(float64(res.Hits[1].Score-0.0)) > 1e-6 { + t.Fatalf("expected score 0.0, got %f", res.Hits[1].Score) + } + + // Query for Y direction [0,1,0] + searchReq = NewSearchRequest(query.NewMatchNoneQuery()) + searchReq.AddKNN("vec", []float32{0, 1, 0}, 3, 1.0) + res, err = idx.Search(searchReq) + if err != nil { + t.Fatal(err) + } + if len(res.Hits) != 2 { + t.Fatalf("expected 2 hits, got %d", len(res.Hits)) + } + // Hit 1 should be doc2 with score 1.0 (perfect match) + if res.Hits[0].ID != "doc2" { + t.Fatalf("expected doc2 as first hit, got %s", res.Hits[0].ID) + } + if math.Abs(float64(res.Hits[0].Score-1.0)) > 1e-6 { + t.Fatalf("expected score 1.0, got %f", res.Hits[0].Score) + } + // Hit 2 should be doc1 with a score of 0.0 (orthogonal) + if res.Hits[1].ID != 
"doc1" { + t.Fatalf("expected doc1 as second hit, got %s", res.Hits[1].ID) + } + if math.Abs(float64(res.Hits[1].Score-0.0)) > 1e-6 { + t.Fatalf("expected score 0.0, got %f", res.Hits[1].Score) + } + + // Now test querying the nested multi-vector field + searchReq = NewSearchRequest(query.NewMatchNoneQuery()) + searchReq.AddKNN("vec_nested", []float32{1, 0, 0}, 3, 1.0) + res, err = idx.Search(searchReq) + if err != nil { + t.Fatal(err) + } + if len(res.Hits) != 1 { + t.Fatalf("expected 1 hit, got %d", len(res.Hits)) + } + // Hit should be doc3 with score 1.0 (perfect match on first sub-vector) + if res.Hits[0].ID != "doc3" { + t.Fatalf("expected doc3 as first hit, got %s", res.Hits[0].ID) + } + if math.Abs(float64(res.Hits[0].Score-1.0)) > 1e-6 { + t.Fatalf("expected score 1.0, got %f", res.Hits[0].Score) + } + // Query for Y direction [0,1,0] on nested field + searchReq = NewSearchRequest(query.NewMatchNoneQuery()) + searchReq.AddKNN("vec_nested", []float32{0, 1, 0}, 3, 1.0) + res, err = idx.Search(searchReq) + if err != nil { + t.Fatal(err) + } + if len(res.Hits) != 1 { + t.Fatalf("expected 1 hit, got %d", len(res.Hits)) + } + // Hit should be doc3 with score 1.0 (perfect match on second sub-vector) + if res.Hits[0].ID != "doc3" { + t.Fatalf("expected doc3 as first hit, got %s", res.Hits[0].ID) + } + if math.Abs(float64(res.Hits[0].Score-1.0)) > 1e-6 { + t.Fatalf("expected score 1.0, got %f", res.Hits[0].Score) + } +} + func TestNumVecsStat(t *testing.T) { dataset, _, err := readDatasetAndQueries(testInputCompressedFile) @@ -2071,145 +2208,3 @@ func TestIndexInsightsCentroidCardinalities(t *testing.T) { } } } - -func TestVectorObjectArray(t *testing.T) { - // Setup 6 documents each with one vector field - tmpIndexPath := createTmpIndexPath(t) - defer cleanupTmpIndexPath(t, tmpIndexPath) - - indexMapping := NewIndexMapping() - vecFieldMapping := mapping.NewVectorFieldMapping() - vecFieldMapping.Dims = 3 - vecFieldMapping.Similarity = index.CosineSimilarity - indexMapping.DefaultMapping.AddFieldMappingsAt("vec", vecFieldMapping) - - arrayMapping := mapping.NewDocumentMapping() - indexMapping.DefaultMapping.AddSubDocumentMapping("vectors", arrayMapping) - arrayMapping.AddFieldMappingsAt("vec", vecFieldMapping) - - index, err := New(tmpIndexPath, indexMapping) - if err != nil { - t.Fatal(err) - } - defer func() { - err := index.Close() - if err != nil { - t.Fatal(err) - } - }() - - docsString := []string{ - `{"vec": [1, 2, 3]}`, - `{"vec": [4, 5, 6]}`, - `{"vec": [7, 8, 9]}`, - `{"vec": [10, 11, 12]}`, - `{"vec": [13, 14, 15]}`, - `{"vec": [16, 17, 18]}`, - } - docs := make([]map[string]interface{}, 0, len(docsString)) - for _, docStr := range docsString { - var doc map[string]interface{} - err = json.Unmarshal([]byte(docStr), &doc) - if err != nil { - t.Fatal(err) - } - docs = append(docs, doc) - } - - // Index documents - batch := index.NewBatch() - for i, doc := range docs { - err = batch.Index(fmt.Sprintf("doc-%d", i+1), doc) - if err != nil { - t.Fatal(err) - } - } - err = index.Batch(batch) - if err != nil { - t.Fatal(err) - } - - // Search with simple single-vector documents - searchRequest := NewSearchRequest(NewMatchNoneQuery()) - searchRequest.AddKNN("vec", []float32{1, 2, 3}, 3, 1.0) - searchRequest.Explain = true - - result, err := index.Search(searchRequest) - if err != nil { - t.Fatal(err) - } - - if len(result.Hits) != 3 { - t.Fatalf("expected 3 hits, got %d", len(result.Hits)) - } - - expectedResult := map[string]float64{ - "doc-1": 1.0, - "doc-2": 0.975, - "doc-3": 0.959, - 
} - - for _, hit := range result.Hits { - expectedScore, exists := expectedResult[hit.ID] - if !exists { - t.Fatalf("unexpected doc ID %s", hit.ID) - } - if math.Abs(hit.Score-expectedScore) > 0.001 { - t.Fatalf("for doc ID %s, expected score %.3f, got %.3f", hit.ID, expectedScore, hit.Score) - } - } - - // Now create 2 docs with 3 vectors each - docsString = []string{ - `{"vectors": [ {"vec": [1, 2, 3]}, {"vec": [4, 5, 6]}, {"vec": [7, 8, 9]}]}`, - `{"vectors": [ {"vec": [10, 11, 12]}, {"vec": [13, 14, 15]}, {"vec": [16, 17, 18]}]}`, - } - docs = make([]map[string]interface{}, 0, len(docsString)) - for _, docStr := range docsString { - var doc map[string]interface{} - err = json.Unmarshal([]byte(docStr), &doc) - if err != nil { - t.Fatal(err) - } - docs = append(docs, doc) - } - - batch = index.NewBatch() - for i, doc := range docs { - err = batch.Index(fmt.Sprintf("doc-multi-%d", i+1), doc) - if err != nil { - t.Fatal(err) - } - } - err = index.Batch(batch) - if err != nil { - t.Fatal(err) - } - - // Search again with the same vector - searchRequest = NewSearchRequest(NewMatchNoneQuery()) - searchRequest.AddKNN("vectors.vec", []float32{1, 2, 3}, 3, 1.0) - - result, err = index.Search(searchRequest) - if err != nil { - t.Fatal(err) - } - - if len(result.Hits) != 1 { - t.Fatalf("expected 1 hit, got %d", len(result.Hits)) - } - - expectedResult = map[string]float64{ - "doc-multi-1": 1.0, // best score from the 3 vectors - } - - for _, hit := range result.Hits { - expectedScore, exists := expectedResult[hit.ID] - if !exists { - t.Fatalf("unexpected doc ID %s", hit.ID) - } - if math.Abs(hit.Score-expectedScore) > 0.001 { - t.Fatalf("for doc ID %s, expected score %.3f, got %.3f", hit.ID, expectedScore, hit.Score) - } - } -} From 9dae8324de7950fc352147625aaf17835a29cd16 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Thu, 4 Dec 2025 06:43:11 +0530 Subject: [PATCH 45/70] merge conflict --- search_knn_test.go | 142 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 142 insertions(+) diff --git a/search_knn_test.go b/search_knn_test.go index 7d110b5ec..29b3ac088 100644 --- a/search_knn_test.go +++ b/search_knn_test.go @@ -2208,3 +2208,145 @@ func TestIndexInsightsCentroidCardinalities(t *testing.T) { } } } + +func TestVectorObjectArray(t *testing.T) { + // Setup 6 documents each with one vector field + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) + + indexMapping := NewIndexMapping() + vecFieldMapping := mapping.NewVectorFieldMapping() + vecFieldMapping.Dims = 3 + vecFieldMapping.Similarity = index.CosineSimilarity + indexMapping.DefaultMapping.AddFieldMappingsAt("vec", vecFieldMapping) + + arrayMapping := mapping.NewDocumentMapping() + indexMapping.DefaultMapping.AddSubDocumentMapping("vectors", arrayMapping) + arrayMapping.AddFieldMappingsAt("vec", vecFieldMapping) + + index, err := New(tmpIndexPath, indexMapping) + if err != nil { + t.Fatal(err) + } + defer func() { + err := index.Close() + if err != nil { + t.Fatal(err) + } + }() + + docsString := []string{ + `{"vec": [1, 2, 3]}`, + `{"vec": [4, 5, 6]}`, + `{"vec": [7, 8, 9]}`, + `{"vec": [10, 11, 12]}`, + `{"vec": [13, 14, 15]}`, + `{"vec": [16, 17, 18]}`, + } + docs := make([]map[string]interface{}, 0, len(docsString)) + for _, docStr := range docsString { + var doc map[string]interface{} + err = json.Unmarshal([]byte(docStr), &doc) + if err != nil { + t.Fatal(err) + } + docs = append(docs, doc) + } + + // Index documents + batch := index.NewBatch() + for i, doc := range docs { + err = 
batch.Index(fmt.Sprintf("doc-%d", i+1), doc) + if err != nil { + t.Fatal(err) + } + } + err = index.Batch(batch) + if err != nil { + t.Fatal(err) + } + + // Search with simple single-vector documents + searchRequest := NewSearchRequest(NewMatchNoneQuery()) + searchRequest.AddKNN("vec", []float32{1, 2, 3}, 3, 1.0) + searchRequest.Explain = true + + result, err := index.Search(searchRequest) + if err != nil { + t.Fatal(err) + } + + if len(result.Hits) != 3 { + t.Fatalf("expected 3 hits, got %d", len(result.Hits)) + } + + expectedResult := map[string]float64{ + "doc-1": 1.0, + "doc-2": 0.975, + "doc-3": 0.959, + } + + for _, hit := range result.Hits { + expectedScore, exists := expectedResult[hit.ID] + if !exists { + t.Fatalf("unexpected doc ID %s", hit.ID) + } + if math.Abs(hit.Score-expectedScore) > 0.001 { + t.Fatalf("for doc ID %s, expected score %.3f, got %.3f", hit.ID, expectedScore, hit.Score) + } + } + + // Now create 2 docs with 3 vectors each + docsString = []string{ + `{"vectors": [ {"vec": [1, 2, 3]}, {"vec": [4, 5, 6]}, {"vec": [7, 8, 9]}]}`, + `{"vectors": [ {"vec": [10, 11, 12]}, {"vec": [13, 14, 15]}, {"vec": [16, 17, 18]}]}`, + } + docs = make([]map[string]interface{}, 0, len(docsString)) + for _, docStr := range docsString { + var doc map[string]interface{} + err = json.Unmarshal([]byte(docStr), &doc) + if err != nil { + t.Fatal(err) + } + docs = append(docs, doc) + } + + batch = index.NewBatch() + for i, doc := range docs { + err = batch.Index(fmt.Sprintf("doc-multi-%d", i+1), doc) + if err != nil { + t.Fatal(err) + } + } + err = index.Batch(batch) + if err != nil { + t.Fatal(err) + } + + // Search again with the same vector + searchRequest = NewSearchRequest(NewMatchNoneQuery()) + searchRequest.AddKNN("vectors.vec", []float32{1, 2, 3}, 3, 1.0) + + result, err = index.Search(searchRequest) + if err != nil { + t.Fatal(err) + } + + if len(result.Hits) != 1 { + t.Fatalf("expected 1 hit, got %d", len(result.Hits)) + } + + expectedResult = map[string]float64{ + "doc-multi-1": 1.0, // best score from the 3 vectors + } + + for _, hit := range result.Hits { + expectedScore, exists := expectedResult[hit.ID] + if !exists { + t.Fatalf("unexpected doc ID %s", hit.ID) + } + if math.Abs(hit.Score-expectedScore) > 0.001 { + t.Fatalf("for doc ID %s, expected score %.3f, got %.3f", hit.ID, expectedScore, hit.Score) + } + } +} From d654a70238cb1cfc503d0ffc218989ccb0eb4b10 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Wed, 3 Dec 2025 17:15:37 +0530 Subject: [PATCH 46/70] Fix duplicate results when performing KNN search --- search_knn.go | 34 +++++++++++ search_knn_test.go | 142 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 176 insertions(+) diff --git a/search_knn.go b/search_knn.go index fae4f52e9..d9f2f8599 100644 --- a/search_knn.go +++ b/search_knn.go @@ -496,6 +496,40 @@ func finalizeKNNResults(req *SearchRequest, knnHits []*search.DocumentMatch) []* } knnHits = knnHits[:idx] } + // early exit if there are no hits + if len(knnHits) == 0 { + return knnHits + } + // at this point, we have the final/global set of vectors that satisfy the KNN request. + // We may have multiple vectors per document, so we need to deduplicate the hits + // by document ID, and retain only the best scoring vector per knn query per document. + // This means that if hits have docA twice, we union the score breakdowns for docA, and + // retain only the best score per knn query for docA. 
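+	// Illustrative example (hypothetical scores): if docA is returned once with
+	// breakdown {0: 1.2} and again with {0: 0.9, 1: 0.8}, the merged breakdown
+	// retained for docA is {0: 1.2, 1: 0.8}.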
+ // sort by document ID + sort.Slice(knnHits, func(i, j int) bool { + return knnHits[i].ID < knnHits[j].ID + }) + // now deduplicate the hits by document ID, by using the sorted order + uniqueHits := knnHits[:1] + for i := 1; i < len(knnHits); i++ { + lastUniqueHit := uniqueHits[len(uniqueHits)-1] + currHit := knnHits[i] + if currHit.ID != lastUniqueHit.ID { + // we have found a new unique document + uniqueHits = append(uniqueHits, currHit) + } else { + // we have encountered a duplicate document, so we need to + // union the score breakdowns, retaining the best score + // per knn query + for k, score := range currHit.ScoreBreakdown { + if score > lastUniqueHit.ScoreBreakdown[k] { + lastUniqueHit.ScoreBreakdown[k] = score + } + } + } + } + // now uniqueHits contains only unique documents, so we can set knnHits to uniqueHits + knnHits = uniqueHits // if score fusion required, return early because // score breakdown is retained diff --git a/search_knn_test.go b/search_knn_test.go index f518d337e..5847208a8 100644 --- a/search_knn_test.go +++ b/search_knn_test.go @@ -2071,3 +2071,145 @@ func TestIndexInsightsCentroidCardinalities(t *testing.T) { } } } + +func TestVectorObjectArray(t *testing.T) { + // Setup 6 documents each with one vector field + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) + + indexMapping := NewIndexMapping() + vecFieldMapping := mapping.NewVectorFieldMapping() + vecFieldMapping.Dims = 3 + vecFieldMapping.Similarity = index.CosineSimilarity + indexMapping.DefaultMapping.AddFieldMappingsAt("vec", vecFieldMapping) + + arrayMapping := mapping.NewDocumentMapping() + indexMapping.DefaultMapping.AddSubDocumentMapping("vectors", arrayMapping) + arrayMapping.AddFieldMappingsAt("vec", vecFieldMapping) + + index, err := New(tmpIndexPath, indexMapping) + if err != nil { + t.Fatal(err) + } + defer func() { + err := index.Close() + if err != nil { + t.Fatal(err) + } + }() + + docsString := []string{ + `{"vec": [1, 2, 3]}`, + `{"vec": [4, 5, 6]}`, + `{"vec": [7, 8, 9]}`, + `{"vec": [10, 11, 12]}`, + `{"vec": [13, 14, 15]}`, + `{"vec": [16, 17, 18]}`, + } + docs := make([]map[string]interface{}, 0, len(docsString)) + for _, docStr := range docsString { + var doc map[string]interface{} + err = json.Unmarshal([]byte(docStr), &doc) + if err != nil { + t.Fatal(err) + } + docs = append(docs, doc) + } + + // Index documents + batch := index.NewBatch() + for i, doc := range docs { + err = batch.Index(fmt.Sprintf("doc-%d", i+1), doc) + if err != nil { + t.Fatal(err) + } + } + err = index.Batch(batch) + if err != nil { + t.Fatal(err) + } + + // Search with a vector that is an array of objects + searchRequest := NewSearchRequest(NewMatchNoneQuery()) + searchRequest.AddKNN("vec", []float32{1, 2, 3}, 3, 1.0) + searchRequest.Explain = true + + result, err := index.Search(searchRequest) + if err != nil { + t.Fatal(err) + } + + if len(result.Hits) != 3 { + t.Fatalf("expected 3 hits, got %d", len(result.Hits)) + } + + expectedResult := map[string]float64{ + "doc-1": 1.0, + "doc-2": 0.975, + "doc-3": 0.959, + } + + for _, hit := range result.Hits { + expectedScore, exists := expectedResult[hit.ID] + if !exists { + t.Fatalf("unexpected doc ID %s", hit.ID) + } + if math.Abs(hit.Score-expectedScore) > 0.001 { + t.Fatalf("for doc ID %s, expected score %.3f, got %.3f", hit.ID, expectedScore, hit.Score) + } + } + + // Now create 2 docs with 3 vectors each + docsString = []string{ + `{"vectors": [ {"vec": [1, 2, 3]}, {"vec": [4, 5, 6]}, {"vec": [7, 8, 9]}]}`, + 
`{"vectors": [ {"vec": [10, 11, 12]}, {"vec": [13, 14, 15]}, {"vec": [16, 17, 18]}]}`, + } + docs = make([]map[string]interface{}, 0, len(docsString)) + for _, docStr := range docsString { + var doc map[string]interface{} + err = json.Unmarshal([]byte(docStr), &doc) + if err != nil { + t.Fatal(err) + } + docs = append(docs, doc) + } + + batch = index.NewBatch() + for i, doc := range docs { + err = batch.Index(fmt.Sprintf("doc-multi-%d", i+1), doc) + if err != nil { + t.Fatal(err) + } + } + err = index.Batch(batch) + if err != nil { + t.Fatal(err) + } + + // Search again with the same vector + searchRequest = NewSearchRequest(NewMatchNoneQuery()) + searchRequest.AddKNN("vectors.vec", []float32{1, 2, 3}, 3, 1.0) + + result, err = index.Search(searchRequest) + if err != nil { + t.Fatal(err) + } + + if len(result.Hits) != 1 { + t.Fatalf("expected 1 hit, got %d", len(result.Hits)) + } + + expectedResult = map[string]float64{ + "doc-multi-1": 1.0, // best score from the 3 vectors + } + + for _, hit := range result.Hits { + expectedScore, exists := expectedResult[hit.ID] + if !exists { + t.Fatalf("unexpected doc ID %s", hit.ID) + } + if math.Abs(hit.Score-expectedScore) > 0.001 { + t.Fatalf("for doc ID %s, expected score %.3f, got %.3f", hit.ID, expectedScore, hit.Score) + } + } +} From 7e65ecd498b68f74e5cab4db98733a44ec58259f Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Wed, 3 Dec 2025 19:35:25 +0530 Subject: [PATCH 47/70] code review --- search_knn.go | 13 +++++++++++-- search_knn_test.go | 2 +- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/search_knn.go b/search_knn.go index d9f2f8599..ed5eef229 100644 --- a/search_knn.go +++ b/search_knn.go @@ -520,10 +520,19 @@ func finalizeKNNResults(req *SearchRequest, knnHits []*search.DocumentMatch) []* } else { // we have encountered a duplicate document, so we need to // union the score breakdowns, retaining the best score - // per knn query + // per knn query, while also merging the explanations if req.Explain is true. for k, score := range currHit.ScoreBreakdown { - if score > lastUniqueHit.ScoreBreakdown[k] { + if existing, ok := lastUniqueHit.ScoreBreakdown[k]; !ok || score > existing { lastUniqueHit.ScoreBreakdown[k] = score + // Also update the explanation for this query index if Explain is enabled. + // Both Expl.Children slices are of size len(req.KNN), so indexing by k is safe. + if req.Explain { + // just defensive check to ensure that the Children slice is valid + if len(lastUniqueHit.Expl.Children) <= k { + lastUniqueHit.Expl.Children = append(lastUniqueHit.Expl.Children, make([]*search.Explanation, k-len(lastUniqueHit.Expl.Children)+1)...) 
+ } + lastUniqueHit.Expl.Children[k] = currHit.Expl.Children[k] + } } } } diff --git a/search_knn_test.go b/search_knn_test.go index 5847208a8..4f30d83cc 100644 --- a/search_knn_test.go +++ b/search_knn_test.go @@ -2129,7 +2129,7 @@ func TestVectorObjectArray(t *testing.T) { t.Fatal(err) } - // Search with a vector that is an array of objects + // Search with simple single-vector documents searchRequest := NewSearchRequest(NewMatchNoneQuery()) searchRequest.AddKNN("vec", []float32{1, 2, 3}, 3, 1.0) searchRequest.Explain = true From 8838f8926a2a58fbf9d9565c5aae40e5474ee5c0 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Thu, 4 Dec 2025 19:23:26 +0530 Subject: [PATCH 48/70] fix dedup logic --- search/collector/knn.go | 75 +++++++++++++++++++++++++++++++++++++++++ search/util.go | 22 ++++++++++++ search_knn.go | 43 ----------------------- 3 files changed, 97 insertions(+), 43 deletions(-) diff --git a/search/collector/knn.go b/search/collector/knn.go index 465bf6927..65dc0845f 100644 --- a/search/collector/knn.go +++ b/search/collector/knn.go @@ -95,6 +95,61 @@ func (c *collectStoreKNN) Final(fixup collectorFixup) (search.DocumentMatchColle return rv, nil } +// ----------------------------------------------------------------------------------------- +// knnMerger is a preprocessing layer on top of collectStoreKNN that merges +// duplicate document matches before adding them to the underlying KNN store. +// Since KNN searchers may return the same document multiple times (with different +// scoreBreakdown values for each KNN query), we must merge these to retain the +// best score per KNN query before adding them to the KNN store. +// NOTE: This implementation assumes documents arrive in increasing order of their +// internal IDs, which is guaranteed by all searchers in bleve. +type knnMerger struct { + // curr is the current document match being merged + curr *search.DocumentMatch +} + +func newKNNMerger() *knnMerger { + return &knnMerger{} +} + +// Merge merges duplicate document matches by combining their score breakdowns. +// Returns nil if the incoming doc was merged into the current document. +// Returns a non-nil DocumentMatch when a new document arrives, representing +// the completed merge of the previous document ready for further processing. +func (c *knnMerger) Merge(ctx *search.SearchContext, doc *search.DocumentMatch) (*search.DocumentMatch, error) { + // see if the document has been seen before + if c.curr != nil && c.curr.IndexInternalID.Equals(doc.IndexInternalID) { + // merge the score breakdowns + c.curr.ScoreBreakdown, c.curr.Expl = search.MergeScoreExplBreakdown( + c.curr.ScoreBreakdown, doc.ScoreBreakdown, + c.curr.Expl, doc.Expl) + // recycle the incoming document now that it's merged + ctx.DocumentMatchPool.Put(doc) + // return nil since no document to process further + return nil, nil + } + // now we are sure that this is a new document, check if we have an existing + // document to return for processing + if c.curr != nil { + // we have an existing document, return it for processing + toReturn := c.curr + // set the current to the incoming document + c.curr = doc + return toReturn, nil + } + // first document seen, set it as current and return nil + c.curr = doc + return nil, nil +} + +// Current returns the current document match being merged, if any. +// Call this after processing all documents to flush the last document. 
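+// Rough usage sketch (mirrors the collector's Collect loop below; error
+// handling elided):
+//
+//	doc, _ := merger.Merge(searchContext, next) // per hit from the searcher
+//	if doc != nil { _ = dmHandler(doc) }
+//	// after the searcher is exhausted:
+//	if last := merger.Current(); last != nil { _ = dmHandler(last) }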
+func (c *knnMerger) Current() *search.DocumentMatch { + return c.curr +} + +// ----------------------------------------------------------------------------------------- + func MakeKNNDocMatchHandler(ctx *search.SearchContext) (search.DocumentMatchHandler, error) { var hc *KNNCollector var ok bool @@ -130,6 +185,9 @@ func GetNewKNNCollectorStore(kArray []int64) *collectStoreKNN { // implements Collector interface type KNNCollector struct { + // merger merges duplicate document matches before adding to knnStore + merger *knnMerger + // knnStore is the underlying store for KNN document matches knnStore *collectStoreKNN size int total uint64 @@ -140,6 +198,7 @@ type KNNCollector struct { func NewKNNCollector(kArray []int64, size int64) *KNNCollector { return &KNNCollector{ + merger: newKNNMerger(), knnStore: GetNewKNNCollectorStore(kArray), size: int(size), } @@ -191,6 +250,14 @@ func (hc *KNNCollector) Collect(ctx context.Context, searcher search.Searcher, r } hc.total++ + // since we may get duplicate document matches from the KNN searcher, + // we must merge them before adding to the KNN store, keeping the + // best scoreBreakdown for each KNN query per document. + next, err = hc.merger.Merge(searchContext, next) + if err != nil { + break + } + err = dmHandler(next) if err != nil { break @@ -202,6 +269,14 @@ func (hc *KNNCollector) Collect(ctx context.Context, searcher search.Searcher, r return err } + // flush the last merged document if any + if lastDoc := hc.merger.Current(); lastDoc != nil { + err = dmHandler(lastDoc) + if err != nil { + return err + } + } + // help finalize/flush the results in case // of custom document match handlers. err = dmHandler(nil) diff --git a/search/util.go b/search/util.go index 005fda67d..e90b3c0f2 100644 --- a/search/util.go +++ b/search/util.go @@ -237,3 +237,25 @@ type BM25Stats struct { DocCount float64 `json:"doc_count"` FieldCardinality map[string]int `json:"field_cardinality"` } + +// MergeScoreBreakdown merges two score breakdown maps together +// by picking the best score per query component, and merging them +// into the first map. +func MergeScoreExplBreakdown(first, second map[int]float64, firstExpl, secondExpl *Explanation) (map[int]float64, *Explanation) { + if first == nil { + return second, secondExpl + } + if second == nil { + return first, firstExpl + } + // pick the best score per query component between the two maps + for k, score := range second { + if existing, ok := first[k]; !ok || existing < score { + first[k] = score + if firstExpl != nil && secondExpl != nil { + firstExpl.Children[k] = secondExpl.Children[k] + } + } + } + return first, firstExpl +} diff --git a/search_knn.go b/search_knn.go index ed5eef229..fae4f52e9 100644 --- a/search_knn.go +++ b/search_knn.go @@ -496,49 +496,6 @@ func finalizeKNNResults(req *SearchRequest, knnHits []*search.DocumentMatch) []* } knnHits = knnHits[:idx] } - // early exit if there are no hits - if len(knnHits) == 0 { - return knnHits - } - // at this point, we have the final/global set of vectors that satisfy the KNN request. - // We may have multiple vectors per document, so we need to deduplicate the hits - // by document ID, and retain only the best scoring vector per knn query per document. - // This means that if hits have docA twice, we union the score breakdowns for docA, and - // retain only the best score per knn query for docA. 
- // sort by document ID - sort.Slice(knnHits, func(i, j int) bool { - return knnHits[i].ID < knnHits[j].ID - }) - // now deduplicate the hits by document ID, by using the sorted order - uniqueHits := knnHits[:1] - for i := 1; i < len(knnHits); i++ { - lastUniqueHit := uniqueHits[len(uniqueHits)-1] - currHit := knnHits[i] - if currHit.ID != lastUniqueHit.ID { - // we have found a new unique document - uniqueHits = append(uniqueHits, currHit) - } else { - // we have encountered a duplicate document, so we need to - // union the score breakdowns, retaining the best score - // per knn query, while also merging the explanations if req.Explain is true. - for k, score := range currHit.ScoreBreakdown { - if existing, ok := lastUniqueHit.ScoreBreakdown[k]; !ok || score > existing { - lastUniqueHit.ScoreBreakdown[k] = score - // Also update the explanation for this query index if Explain is enabled. - // Both Expl.Children slices are of size len(req.KNN), so indexing by k is safe. - if req.Explain { - // just defensive check to ensure that the Children slice is valid - if len(lastUniqueHit.Expl.Children) <= k { - lastUniqueHit.Expl.Children = append(lastUniqueHit.Expl.Children, make([]*search.Explanation, k-len(lastUniqueHit.Expl.Children)+1)...) - } - lastUniqueHit.Expl.Children[k] = currHit.Expl.Children[k] - } - } - } - } - } - // now uniqueHits contains only unique documents, so we can set knnHits to uniqueHits - knnHits = uniqueHits // if score fusion required, return early because // score breakdown is retained From 42c98f1c772aee549e4835386b96fe6b3c845b48 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Thu, 4 Dec 2025 21:40:08 +0530 Subject: [PATCH 49/70] unit test --- search_knn_test.go | 347 ++++++++++++++++++++++++++------------------- 1 file changed, 205 insertions(+), 142 deletions(-) diff --git a/search_knn_test.go b/search_knn_test.go index 4f30d83cc..cb211169e 100644 --- a/search_knn_test.go +++ b/search_knn_test.go @@ -1645,6 +1645,211 @@ func TestNestedVectors(t *testing.T) { } } +// ----------------------------------------------------------------------------- +// TestKNNMerger tests the KNN merger functionality which handles duplicate +// document matches from the KNN searcher. When a document has multiple vectors +// (via [[]] array of vectors or [{}] array of objects with vectors), the KNN +// searcher may return the same document multiple times with different scores. +// The merger must: +// 1. Detect duplicates by IndexInternalID +// 2. Merge score breakdowns, keeping the best score per KNN query +// 3. 
Properly flush the last document after iteration completes +func TestKNNMerger(t *testing.T) { + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) + + // JSON documents covering merger scenarios: + // - Single vector (baseline) + // - [[]] style: array of vectors (same doc appears multiple times) + // - [{}] style: array of objects with vector field (chunks pattern) + docs := map[string]string{ + // Single vector - baseline + "doc1": `{ + "vec": [10, 10, 10], + "vecB": [100, 100, 100] + }`, + // [[]] style - array of 2 vectors + "doc2": `{ + "vec": [[0, 0, 0], [500, 500, 500]], + "vecB": [[900, 900, 900], [950, 950, 950], [975, 975, 975], [990, 990, 990]] + }`, + // [[]] style - array of 3 vectors + "doc3": `{ + "vec": [[50, 50, 50], [200, 200, 200], [400, 400, 400]], + "vecB": [[800, 800, 800], [850, 850, 850]] + }`, + // Single vector - baseline + "doc4": `{ + "vec": [1000, 1000, 1000], + "vecB": [1, 1, 1] + }`, + // [{}] style - array of objects with vector field (chunks pattern) + "doc5": `{ + "chunks": [ + {"vec": [10, 10, 10], "text": "chunk1"}, + {"vec": [20, 20, 20], "text": "chunk2"}, + {"vec": [30, 30, 30], "text": "chunk3"}, + {"vec": [40, 40, 40], "text": "chunk3"} + ] + }`, + "doc6": `{ + "chunks": [ + {"vec": [[10, 10, 10],[20, 20, 20]], "text": "chunk1"}, + {"vec": [[30, 30, 30],[40, 40, 40]], "text": "chunk2"} + ] + }`, + } + + // Parse JSON documents + dataset := make(map[string]map[string]interface{}) + for docID, jsonStr := range docs { + var doc map[string]interface{} + if err := json.Unmarshal([]byte(jsonStr), &doc); err != nil { + t.Fatalf("failed to unmarshal %s: %v", docID, err) + } + dataset[docID] = doc + } + + // Index mapping + indexMapping := NewIndexMapping() + + vecMapping := mapping.NewVectorFieldMapping() + vecMapping.Dims = 3 + vecMapping.Similarity = index.InnerProduct + indexMapping.DefaultMapping.AddFieldMappingsAt("vec", vecMapping) + indexMapping.DefaultMapping.AddFieldMappingsAt("vecB", vecMapping) + + // Nested chunks mapping for [{}] style + chunksMapping := mapping.NewDocumentMapping() + chunksMapping.AddFieldMappingsAt("vec", vecMapping) + indexMapping.DefaultMapping.AddSubDocumentMapping("chunks", chunksMapping) + + // Create and populate index + idx, err := New(tmpIndexPath, indexMapping) + if err != nil { + t.Fatal(err) + } + defer func() { + if err := idx.Close(); err != nil { + t.Fatal(err) + } + }() + + batch := idx.NewBatch() + for docID, doc := range dataset { + if err := batch.Index(docID, doc); err != nil { + t.Fatal(err) + } + } + if err := idx.Batch(batch); err != nil { + t.Fatal(err) + } + + // Test: Single KNN query - basic functionality + t.Run("VecFieldSingle", func(t *testing.T) { + searchReq := NewSearchRequest(query.NewMatchNoneQuery()) + searchReq.AddKNN("vec", []float32{1, 1, 1}, 20, 1.0) + res, err := idx.Search(searchReq) + if err != nil { + t.Fatal(err) + } + // Inner product: score = sum(query_i * doc_i) + // doc1 vec=[10,10,10]: 1*10*3 = 30 + // doc2 vec best is [500,500,500]: 1*500*3 = 1500 + // doc3 vec best is [400,400,400]: 1*400*3 = 1200 + // doc4 vec=[1000,1000,1000]: 1*1000*3 = 3000 + expectedResult := []struct { + docID string + expectedScore float64 + }{ + {docID: "doc4", expectedScore: 3000}, + {docID: "doc2", expectedScore: 1500}, + {docID: "doc3", expectedScore: 1200}, + {docID: "doc1", expectedScore: 30}, + } + + if len(res.Hits) != len(expectedResult) { + t.Fatalf("expected %d hits, got %d", len(expectedResult), len(res.Hits)) + } + + for i, expected := range expectedResult 
{ + if res.Hits[i].ID != expected.docID { + t.Fatalf("at rank %d, expected docID %s, got %s", i+1, expected.docID, res.Hits[i].ID) + } + if res.Hits[i].Score != expected.expectedScore { + t.Fatalf("at rank %d, expected score %v, got %v", i+1, expected.expectedScore, res.Hits[i].Score) + } + } + }) + + // Test: Single KNN query on vecB field + t.Run("VecBFieldSingle", func(t *testing.T) { + searchReq := NewSearchRequest(query.NewMatchNoneQuery()) + searchReq.AddKNN("vecB", []float32{1000, 1000, 1000}, 20, 1.0) + res, err := idx.Search(searchReq) + if err != nil { + t.Fatal(err) + } + // Inner product: score = sum(query_i * doc_i) for each dimension + // doc1: vecB=[100,100,100] -> 1000*100*3 = 300,000 + // doc2: vecB best is [990,990,990] -> 1000*990*3 = 2,970,000 + // doc3: vecB best is [850,850,850] -> 1000*850*3 = 2,550,000 + // doc4: vecB=[1,1,1] -> 1000*1*3 = 3,000 + expectedResult := []struct { + docID string + expectedScore float64 + }{ + {docID: "doc2", expectedScore: 2970000}, + {docID: "doc3", expectedScore: 2550000}, + {docID: "doc1", expectedScore: 300000}, + {docID: "doc4", expectedScore: 3000}, + } + + if len(res.Hits) != len(expectedResult) { + t.Fatalf("expected %d hits, got %d", len(expectedResult), len(res.Hits)) + } + + for i, expected := range expectedResult { + if res.Hits[i].ID != expected.docID { + t.Fatalf("at rank %d, expected docID %s, got %s", i+1, expected.docID, res.Hits[i].ID) + } + if res.Hits[i].Score != expected.expectedScore { + t.Fatalf("at rank %d, expected score %v, got %v", i+1, expected.expectedScore, res.Hits[i].Score) + } + } + }) + + // Test: Single KNN query on nested chunks.vec field + t.Run("ChunksVecFieldSingle", func(t *testing.T) { + searchReq := NewSearchRequest(query.NewMatchNoneQuery()) + searchReq.AddKNN("chunks.vec", []float32{1, 1, 1}, 20, 1.0) + searchReq.SortBy([]string{"_score", "docID"}) + res, err := idx.Search(searchReq) + if err != nil { + t.Fatal(err) + } + + // Only doc5 and doc6 have chunks.vec + // doc5 chunks: [10,10,10], [20,20,20], [30,30,30], [40,40,40] + // Best score: 1*40*3 = 120 + // doc6 chunks: [[10,10,10],[20,20,20]], [[30,30,30],[40,40,40]] + // Best score: 1*40*3 = 120 + if len(res.Hits) != 2 { + t.Fatalf("expected 2 hits, got %d", len(res.Hits)) + } + + // Both should have score 120 + for _, hit := range res.Hits { + if hit.ID != "doc5" && hit.ID != "doc6" { + t.Fatalf("unexpected docID %s, expected doc5 or doc6", hit.ID) + } + if hit.Score != 120 { + t.Fatalf("for %s, expected score 120, got %v", hit.ID, hit.Score) + } + } + }) +} + func TestNumVecsStat(t *testing.T) { dataset, _, err := readDatasetAndQueries(testInputCompressedFile) @@ -2071,145 +2276,3 @@ func TestIndexInsightsCentroidCardinalities(t *testing.T) { } } } - -func TestVectorObjectArray(t *testing.T) { - // Setup 6 documents each with one vector field - tmpIndexPath := createTmpIndexPath(t) - defer cleanupTmpIndexPath(t, tmpIndexPath) - - indexMapping := NewIndexMapping() - vecFieldMapping := mapping.NewVectorFieldMapping() - vecFieldMapping.Dims = 3 - vecFieldMapping.Similarity = index.CosineSimilarity - indexMapping.DefaultMapping.AddFieldMappingsAt("vec", vecFieldMapping) - - arrayMapping := mapping.NewDocumentMapping() - indexMapping.DefaultMapping.AddSubDocumentMapping("vectors", arrayMapping) - arrayMapping.AddFieldMappingsAt("vec", vecFieldMapping) - - index, err := New(tmpIndexPath, indexMapping) - if err != nil { - t.Fatal(err) - } - defer func() { - err := index.Close() - if err != nil { - t.Fatal(err) - } - }() - - docsString := 
[]string{ - `{"vec": [1, 2, 3]}`, - `{"vec": [4, 5, 6]}`, - `{"vec": [7, 8, 9]}`, - `{"vec": [10, 11, 12]}`, - `{"vec": [13, 14, 15]}`, - `{"vec": [16, 17, 18]}`, - } - docs := make([]map[string]interface{}, 0, len(docsString)) - for _, docStr := range docsString { - var doc map[string]interface{} - err = json.Unmarshal([]byte(docStr), &doc) - if err != nil { - t.Fatal(err) - } - docs = append(docs, doc) - } - - // Index documents - batch := index.NewBatch() - for i, doc := range docs { - err = batch.Index(fmt.Sprintf("doc-%d", i+1), doc) - if err != nil { - t.Fatal(err) - } - } - err = index.Batch(batch) - if err != nil { - t.Fatal(err) - } - - // Search with simple single-vector documents - searchRequest := NewSearchRequest(NewMatchNoneQuery()) - searchRequest.AddKNN("vec", []float32{1, 2, 3}, 3, 1.0) - searchRequest.Explain = true - - result, err := index.Search(searchRequest) - if err != nil { - t.Fatal(err) - } - - if len(result.Hits) != 3 { - t.Fatalf("expected 3 hits, got %d", len(result.Hits)) - } - - expectedResult := map[string]float64{ - "doc-1": 1.0, - "doc-2": 0.975, - "doc-3": 0.959, - } - - for _, hit := range result.Hits { - expectedScore, exists := expectedResult[hit.ID] - if !exists { - t.Fatalf("unexpected doc ID %s", hit.ID) - } - if math.Abs(hit.Score-expectedScore) > 0.001 { - t.Fatalf("for doc ID %s, expected score %.3f, got %.3f", hit.ID, expectedScore, hit.Score) - } - } - - // Now create 2 docs with 3 vectors each - docsString = []string{ - `{"vectors": [ {"vec": [1, 2, 3]}, {"vec": [4, 5, 6]}, {"vec": [7, 8, 9]}]}`, - `{"vectors": [ {"vec": [10, 11, 12]}, {"vec": [13, 14, 15]}, {"vec": [16, 17, 18]}]}`, - } - docs = make([]map[string]interface{}, 0, len(docsString)) - for _, docStr := range docsString { - var doc map[string]interface{} - err = json.Unmarshal([]byte(docStr), &doc) - if err != nil { - t.Fatal(err) - } - docs = append(docs, doc) - } - - batch = index.NewBatch() - for i, doc := range docs { - err = batch.Index(fmt.Sprintf("doc-multi-%d", i+1), doc) - if err != nil { - t.Fatal(err) - } - } - err = index.Batch(batch) - if err != nil { - t.Fatal(err) - } - - // Search again with the same vector - searchRequest = NewSearchRequest(NewMatchNoneQuery()) - searchRequest.AddKNN("vectors.vec", []float32{1, 2, 3}, 3, 1.0) - - result, err = index.Search(searchRequest) - if err != nil { - t.Fatal(err) - } - - if len(result.Hits) != 1 { - t.Fatalf("expected 1 hit, got %d", len(result.Hits)) - } - - expectedResult = map[string]float64{ - "doc-multi-1": 1.0, // best score from the 3 vectors - } - - for _, hit := range result.Hits { - expectedScore, exists := expectedResult[hit.ID] - if !exists { - t.Fatalf("unexpected doc ID %s", hit.ID) - } - if math.Abs(hit.Score-expectedScore) > 0.001 { - t.Fatalf("for doc ID %s, expected score %.3f, got %.3f", hit.ID, expectedScore, hit.Score) - } - } -} From c4dd9d450f77d792bdd14923debf0ae8e981b005 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Thu, 4 Dec 2025 21:58:08 +0530 Subject: [PATCH 50/70] fix --- search/collector/knn.go | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/search/collector/knn.go b/search/collector/knn.go index 65dc0845f..f5bcd0dc2 100644 --- a/search/collector/knn.go +++ b/search/collector/knn.go @@ -258,9 +258,13 @@ func (hc *KNNCollector) Collect(ctx context.Context, searcher search.Searcher, r break } - err = dmHandler(next) - if err != nil { - break + // we may have stored next for merging, or we may have completed a merge + // and have a document ready for further 
processing, so next can be nil + if next != nil { + err = dmHandler(next) + if err != nil { + break + } } next, err = searcher.Next(searchContext) From 351d8bec298b64abffb7f91c125b605b5d0d93a0 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Thu, 4 Dec 2025 22:00:17 +0530 Subject: [PATCH 51/70] Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- search/collector/knn.go | 6 +++--- search/util.go | 18 ++++++++++++++++-- search_knn_test.go | 2 +- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/search/collector/knn.go b/search/collector/knn.go index f5bcd0dc2..85e382339 100644 --- a/search/collector/knn.go +++ b/search/collector/knn.go @@ -104,7 +104,7 @@ func (c *collectStoreKNN) Final(fixup collectorFixup) (search.DocumentMatchColle // NOTE: This implementation assumes documents arrive in increasing order of their // internal IDs, which is guaranteed by all searchers in bleve. type knnMerger struct { - // curr is the current document match being merged + // curr holds the current document match being accumulated during the merge process. curr *search.DocumentMatch } @@ -114,8 +114,8 @@ func newKNNMerger() *knnMerger { // Merge merges duplicate document matches by combining their score breakdowns. // Returns nil if the incoming doc was merged into the current document. -// Returns a non-nil DocumentMatch when a new document arrives, representing -// the completed merge of the previous document ready for further processing. +// Returns the completed previous document when a new document with a different ID arrives. +// The returned DocumentMatch is ready for further processing. func (c *knnMerger) Merge(ctx *search.SearchContext, doc *search.DocumentMatch) (*search.DocumentMatch, error) { // see if the document has been seen before if c.curr != nil && c.curr.IndexInternalID.Equals(doc.IndexInternalID) { diff --git a/search/util.go b/search/util.go index e90b3c0f2..2406aa749 100644 --- a/search/util.go +++ b/search/util.go @@ -238,9 +238,9 @@ type BM25Stats struct { FieldCardinality map[string]int `json:"field_cardinality"` } -// MergeScoreBreakdown merges two score breakdown maps together +// MergeScoreExplBreakdown merges two score breakdown maps and their explanations together // by picking the best score per query component, and merging them -// into the first map. +// (and their corresponding explanations) into the first map. 
func MergeScoreExplBreakdown(first, second map[int]float64, firstExpl, secondExpl *Explanation) (map[int]float64, *Explanation) { if first == nil { return second, secondExpl @@ -253,6 +253,20 @@ func MergeScoreExplBreakdown(first, second map[int]float64, firstExpl, secondExp if existing, ok := first[k]; !ok || existing < score { first[k] = score if firstExpl != nil && secondExpl != nil { + // Ensure Children slices are non-nil and long enough + if firstExpl.Children == nil || len(firstExpl.Children) <= k { + newLen := k + 1 + newChildren := make([]*Explanation, newLen) + if firstExpl.Children != nil { + copy(newChildren, firstExpl.Children) + } + firstExpl.Children = newChildren + } + if secondExpl.Children == nil || len(secondExpl.Children) <= k { + // If secondExpl.Children is nil or too short, skip assignment + // (or could set to nil, but here we skip) + continue + } firstExpl.Children[k] = secondExpl.Children[k] } } diff --git a/search_knn_test.go b/search_knn_test.go index cb211169e..79dc3dade 100644 --- a/search_knn_test.go +++ b/search_knn_test.go @@ -1689,7 +1689,7 @@ func TestKNNMerger(t *testing.T) { {"vec": [10, 10, 10], "text": "chunk1"}, {"vec": [20, 20, 20], "text": "chunk2"}, {"vec": [30, 30, 30], "text": "chunk3"}, - {"vec": [40, 40, 40], "text": "chunk3"} + {"vec": [40, 40, 40], "text": "chunk4"} ] }`, "doc6": `{ From 2db81995dac766644f64548dd143fcabe66188f8 Mon Sep 17 00:00:00 2001 From: Abhinav Dangeti Date: Thu, 4 Dec 2025 10:11:50 -0700 Subject: [PATCH 52/70] go fmt ./... --- search/facet/facet_builder_terms_test.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/search/facet/facet_builder_terms_test.go b/search/facet/facet_builder_terms_test.go index fad4be301..3ed2fcccb 100644 --- a/search/facet/facet_builder_terms_test.go +++ b/search/facet/facet_builder_terms_test.go @@ -201,11 +201,11 @@ func TestTermsFacetPrefixAndRegex(t *testing.T) { terms := []string{ "env:prod", "env:staging", - "env:dev", // has prefix but doesn't match regex - "env:test", // has prefix but doesn't match regex - "type:server", // no prefix - "env:prod", // duplicate - "env:staging", // duplicate + "env:dev", // has prefix but doesn't match regex + "env:test", // has prefix but doesn't match regex + "type:server", // no prefix + "env:prod", // duplicate + "env:staging", // duplicate } for _, term := range terms { From a5fd25579c2db24a88936799c892ccc29d16d637 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Thu, 4 Dec 2025 22:52:19 +0530 Subject: [PATCH 53/70] fix total calc --- search/collector/knn.go | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/search/collector/knn.go b/search/collector/knn.go index 85e382339..669d99940 100644 --- a/search/collector/knn.go +++ b/search/collector/knn.go @@ -158,6 +158,9 @@ func MakeKNNDocMatchHandler(ctx *search.SearchContext) (search.DocumentMatchHand if d == nil { return nil } + // increment total count as we are sure that this is a + // valid document match to be added to the KNN store + hc.total++ toRelease := hc.knnStore.AddDocument(d) for _, doc := range toRelease { ctx.DocumentMatchPool.Put(doc) @@ -239,8 +242,10 @@ func (hc *KNNCollector) Collect(ctx context.Context, searcher search.Searcher, r default: next, err = searcher.Next(searchContext) } + // maintain a total count of documents processed, for context cancellation checks + var total uint64 for err == nil && next != nil { - if hc.total%CheckDoneEvery == 0 { + if total%CheckDoneEvery == 0 { select { case <-ctx.Done(): 
search.RecordSearchCost(ctx, search.AbortM, 0) @@ -248,7 +253,7 @@ func (hc *KNNCollector) Collect(ctx context.Context, searcher search.Searcher, r default: } } - hc.total++ + total++ // since we may get duplicate document matches from the KNN searcher, // we must merge them before adding to the KNN store, keeping the From b9142049fa927eec01a5c7feca550093bc6ac065 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Thu, 4 Dec 2025 23:16:11 +0530 Subject: [PATCH 54/70] fix edge case --- search_knn.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/search_knn.go b/search_knn.go index fae4f52e9..1651e790b 100644 --- a/search_knn.go +++ b/search_knn.go @@ -288,6 +288,11 @@ func createKNNQuery(req *SearchRequest, knnFilterResults map[int]index.EligibleD // If it's a filtered kNN but has no eligible filter hits, then // do not run the kNN query. if selector, exists := knnFilterResults[i]; exists && selector == nil { + // if the kNN query is filtered and has no eligible filter hits, then + // do not run the kNN query, so we add a match_none query to the subQueries. + // this will ensure that the score breakdown is set to 0 for this kNN query. + subQueries = append(subQueries, NewMatchNoneQuery()) + kArray = append(kArray, 0) continue } knnQuery := query.NewKNNQuery(knn.Vector) From 6b153a08f2583d9e503948afed89b5fd2bd8ce4d Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Thu, 4 Dec 2025 23:48:01 +0530 Subject: [PATCH 55/70] fix test --- search_knn_test.go | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/search_knn_test.go b/search_knn_test.go index 79dc3dade..7914aeac2 100644 --- a/search_knn_test.go +++ b/search_knn_test.go @@ -1281,23 +1281,29 @@ func TestKNNScoreBoosting(t *testing.T) { searchRequest.AddKNN("vector", queryVec, 3, 1.0) searchRequest.Fields = []string{"content", "vector"} - hits, _ := index.Search(searchRequest) + hits, err := index.Search(searchRequest) + if err != nil { + t.Fatal(err) + } hitsMap := make(map[string]float64, 0) for _, hit := range hits.Hits { hitsMap[hit.ID] = (hit.Score) } - searchRequest2 := NewSearchRequest(NewMatchNoneQuery()) + searchRequest = NewSearchRequest(NewMatchNoneQuery()) searchRequest.AddKNN("vector", queryVec, 3, 10.0) searchRequest.Fields = []string{"content", "vector"} - hits2, _ := index.Search(searchRequest2) + hits, err = index.Search(searchRequest) + if err != nil { + t.Fatal(err) + } hitsMap2 := make(map[string]float64, 0) - for _, hit := range hits2.Hits { + for _, hit := range hits.Hits { hitsMap2[hit.ID] = (hit.Score) } - for _, hit := range hits2.Hits { + for _, hit := range hits.Hits { if hitsMap[hit.ID] != hitsMap2[hit.ID]/10 { t.Errorf("boosting not working: %v %v \n", hitsMap[hit.ID], hitsMap2[hit.ID]) } From 68760c2799b0605ea2ffc3a6b1b212d150667a83 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Fri, 5 Dec 2025 14:53:09 +0530 Subject: [PATCH 56/70] use normalizeVector for base64 --- mapping/mapping_vectors.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mapping/mapping_vectors.go b/mapping/mapping_vectors.go index 3270006d3..393262b35 100644 --- a/mapping/mapping_vectors.go +++ b/mapping/mapping_vectors.go @@ -189,9 +189,10 @@ func (fm *FieldMapping) processVectorBase64(propertyMightBeVectorBase64 interfac if err != nil || len(decodedVector) != fm.Dims { return } - // normalize raw vector if similarity is cosine + // normalize raw vector if similarity is cosine, multi-vector is not supported + // for base64 encoded vectors, so we use NormalizeVector directly. 
if similarity == index.CosineSimilarity { - decodedVector = NormalizeMultiVector(decodedVector, fm.Dims) + decodedVector = NormalizeVector(decodedVector) } fieldName := getFieldName(pathString, path, fm) From 99e21207283f134d6af656a9005feb21214328b6 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Fri, 5 Dec 2025 14:55:09 +0530 Subject: [PATCH 57/70] fix merge conflict --- search_knn.go | 43 -------------- search_knn_test.go | 142 --------------------------------------------- 2 files changed, 185 deletions(-) diff --git a/search_knn.go b/search_knn.go index 0a8e1021e..1651e790b 100644 --- a/search_knn.go +++ b/search_knn.go @@ -501,49 +501,6 @@ func finalizeKNNResults(req *SearchRequest, knnHits []*search.DocumentMatch) []* } knnHits = knnHits[:idx] } - // early exit if there are no hits - if len(knnHits) == 0 { - return knnHits - } - // at this point, we have the final/global set of vectors that satisfy the KNN request. - // We may have multiple vectors per document, so we need to deduplicate the hits - // by document ID, and retain only the best scoring vector per knn query per document. - // This means that if hits have docA twice, we union the score breakdowns for docA, and - // retain only the best score per knn query for docA. - // sort by document ID - sort.Slice(knnHits, func(i, j int) bool { - return knnHits[i].ID < knnHits[j].ID - }) - // now deduplicate the hits by document ID, by using the sorted order - uniqueHits := knnHits[:1] - for i := 1; i < len(knnHits); i++ { - lastUniqueHit := uniqueHits[len(uniqueHits)-1] - currHit := knnHits[i] - if currHit.ID != lastUniqueHit.ID { - // we have found a new unique document - uniqueHits = append(uniqueHits, currHit) - } else { - // we have encountered a duplicate document, so we need to - // union the score breakdowns, retaining the best score - // per knn query, while also merging the explanations if req.Explain is true. - for k, score := range currHit.ScoreBreakdown { - if existing, ok := lastUniqueHit.ScoreBreakdown[k]; !ok || score > existing { - lastUniqueHit.ScoreBreakdown[k] = score - // Also update the explanation for this query index if Explain is enabled. - // Both Expl.Children slices are of size len(req.KNN), so indexing by k is safe. - if req.Explain { - // just defensive check to ensure that the Children slice is valid - if len(lastUniqueHit.Expl.Children) <= k { - lastUniqueHit.Expl.Children = append(lastUniqueHit.Expl.Children, make([]*search.Explanation, k-len(lastUniqueHit.Expl.Children)+1)...) 
- } - lastUniqueHit.Expl.Children[k] = currHit.Expl.Children[k] - } - } - } - } - } - // now uniqueHits contains only unique documents, so we can set knnHits to uniqueHits - knnHits = uniqueHits // if score fusion required, return early because // score breakdown is retained diff --git a/search_knn_test.go b/search_knn_test.go index 8ed5aa0df..e3e4e34ba 100644 --- a/search_knn_test.go +++ b/search_knn_test.go @@ -2419,145 +2419,3 @@ func TestIndexInsightsCentroidCardinalities(t *testing.T) { } } } - -func TestVectorObjectArray(t *testing.T) { - // Setup 6 documents each with one vector field - tmpIndexPath := createTmpIndexPath(t) - defer cleanupTmpIndexPath(t, tmpIndexPath) - - indexMapping := NewIndexMapping() - vecFieldMapping := mapping.NewVectorFieldMapping() - vecFieldMapping.Dims = 3 - vecFieldMapping.Similarity = index.CosineSimilarity - indexMapping.DefaultMapping.AddFieldMappingsAt("vec", vecFieldMapping) - - arrayMapping := mapping.NewDocumentMapping() - indexMapping.DefaultMapping.AddSubDocumentMapping("vectors", arrayMapping) - arrayMapping.AddFieldMappingsAt("vec", vecFieldMapping) - - index, err := New(tmpIndexPath, indexMapping) - if err != nil { - t.Fatal(err) - } - defer func() { - err := index.Close() - if err != nil { - t.Fatal(err) - } - }() - - docsString := []string{ - `{"vec": [1, 2, 3]}`, - `{"vec": [4, 5, 6]}`, - `{"vec": [7, 8, 9]}`, - `{"vec": [10, 11, 12]}`, - `{"vec": [13, 14, 15]}`, - `{"vec": [16, 17, 18]}`, - } - docs := make([]map[string]interface{}, 0, len(docsString)) - for _, docStr := range docsString { - var doc map[string]interface{} - err = json.Unmarshal([]byte(docStr), &doc) - if err != nil { - t.Fatal(err) - } - docs = append(docs, doc) - } - - // Index documents - batch := index.NewBatch() - for i, doc := range docs { - err = batch.Index(fmt.Sprintf("doc-%d", i+1), doc) - if err != nil { - t.Fatal(err) - } - } - err = index.Batch(batch) - if err != nil { - t.Fatal(err) - } - - // Search with simple single-vector documents - searchRequest := NewSearchRequest(NewMatchNoneQuery()) - searchRequest.AddKNN("vec", []float32{1, 2, 3}, 3, 1.0) - searchRequest.Explain = true - - result, err := index.Search(searchRequest) - if err != nil { - t.Fatal(err) - } - - if len(result.Hits) != 3 { - t.Fatalf("expected 3 hits, got %d", len(result.Hits)) - } - - expectedResult := map[string]float64{ - "doc-1": 1.0, - "doc-2": 0.975, - "doc-3": 0.959, - } - - for _, hit := range result.Hits { - expectedScore, exists := expectedResult[hit.ID] - if !exists { - t.Fatalf("unexpected doc ID %s", hit.ID) - } - if math.Abs(hit.Score-expectedScore) > 0.001 { - t.Fatalf("for doc ID %s, expected score %.3f, got %.3f", hit.ID, expectedScore, hit.Score) - } - } - - // Now create 2 docs with 3 vectors each - docsString = []string{ - `{"vectors": [ {"vec": [1, 2, 3]}, {"vec": [4, 5, 6]}, {"vec": [7, 8, 9]}]}`, - `{"vectors": [ {"vec": [10, 11, 12]}, {"vec": [13, 14, 15]}, {"vec": [16, 17, 18]}]}`, - } - docs = make([]map[string]interface{}, 0, len(docsString)) - for _, docStr := range docsString { - var doc map[string]interface{} - err = json.Unmarshal([]byte(docStr), &doc) - if err != nil { - t.Fatal(err) - } - docs = append(docs, doc) - } - - batch = index.NewBatch() - for i, doc := range docs { - err = batch.Index(fmt.Sprintf("doc-multi-%d", i+1), doc) - if err != nil { - t.Fatal(err) - } - } - err = index.Batch(batch) - if err != nil { - t.Fatal(err) - } - - // Search again with the same vector - searchRequest = NewSearchRequest(NewMatchNoneQuery()) - 
searchRequest.AddKNN("vectors.vec", []float32{1, 2, 3}, 3, 1.0) - - result, err = index.Search(searchRequest) - if err != nil { - t.Fatal(err) - } - - if len(result.Hits) != 1 { - t.Fatalf("expected 1 hit, got %d", len(result.Hits)) - } - - expectedResult = map[string]float64{ - "doc-multi-1": 1.0, // best score from the 3 vectors - } - - for _, hit := range result.Hits { - expectedScore, exists := expectedResult[hit.ID] - if !exists { - t.Fatalf("unexpected doc ID %s", hit.ID) - } - if math.Abs(hit.Score-expectedScore) > 0.001 { - t.Fatalf("for doc ID %s, expected score %.3f, got %.3f", hit.ID, expectedScore, hit.Score) - } - } -} From 8721d16416ff78aa6db50c1b38c592c05bbea924 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Fri, 5 Dec 2025 17:19:23 +0530 Subject: [PATCH 58/70] Fix interface --- search/query/knn.go | 2 +- search_knn.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/search/query/knn.go b/search/query/knn.go index ea3d38ce4..ea8780a41 100644 --- a/search/query/knn.go +++ b/search/query/knn.go @@ -53,7 +53,7 @@ func (q *KNNQuery) SetK(k int64) { q.K = k } -func (q *KNNQuery) SetFieldVal(field string) { +func (q *KNNQuery) SetField(field string) { q.VectorField = field } diff --git a/search_knn.go b/search_knn.go index 1651e790b..54771ede0 100644 --- a/search_knn.go +++ b/search_knn.go @@ -296,7 +296,7 @@ func createKNNQuery(req *SearchRequest, knnFilterResults map[int]index.EligibleD continue } knnQuery := query.NewKNNQuery(knn.Vector) - knnQuery.SetFieldVal(knn.Field) + knnQuery.SetField(knn.Field) knnQuery.SetK(knn.K) knnQuery.SetBoost(knn.Boost.Value()) knnQuery.SetParams(knn.Params) From f3ed293eb8b7bf1b5e40d17c475ae00ff54f76de Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Fri, 5 Dec 2025 21:29:41 +0530 Subject: [PATCH 59/70] fix KNN case --- mapping/mapping_vectors.go | 19 ------ search/collector/knn.go | 134 ++++++++++++++++++------------------- search/collector/nested.go | 19 +++--- search/collector/topn.go | 17 +++-- search/search.go | 21 ++---- search/util.go | 21 ++---- search_knn.go | 2 +- 7 files changed, 101 insertions(+), 132 deletions(-) diff --git a/mapping/mapping_vectors.go b/mapping/mapping_vectors.go index 3f5f708f6..7c7ff1b98 100644 --- a/mapping/mapping_vectors.go +++ b/mapping/mapping_vectors.go @@ -151,15 +151,6 @@ func (fm *FieldMapping) processVector(propertyMightBeVector interface{}, if vectorIndexOptimizedFor == "" { vectorIndexOptimizedFor = index.DefaultIndexOptimization } - // Apply defaults for similarity and optimization if not set - similarity := fm.Similarity - if similarity == "" { - similarity = index.DefaultVectorSimilarityMetric - } - vectorIndexOptimizedFor := fm.VectorIndexOptimizedFor - if vectorIndexOptimizedFor == "" { - vectorIndexOptimizedFor = index.DefaultIndexOptimization - } // normalize raw vector if similarity is cosine // Since the vector can be multi-vector (flattened array of multiple vectors), // we use NormalizeMultiVector to normalize each sub-vector independently. 
@@ -170,11 +161,6 @@ func (fm *FieldMapping) processVector(propertyMightBeVector interface{}, fieldName := getFieldName(pathString, path, fm) options := fm.Options() - // ensure the options are set to not store/index term vectors/doc values - options &^= index.StoreField | index.IncludeTermVectors | index.DocValues - // skip freq/norms for vector field - options |= index.SkipFreqNorm - field := document.NewVectorFieldWithIndexingOptions(fieldName, indexes, vector, fm.Dims, similarity, vectorIndexOptimizedFor, options) context.doc.AddField(field) @@ -212,11 +198,6 @@ func (fm *FieldMapping) processVectorBase64(propertyMightBeVectorBase64 interfac fieldName := getFieldName(pathString, path, fm) options := fm.Options() - // ensure the options are set to not store/index term vectors/doc values - options &^= index.StoreField | index.IncludeTermVectors | index.DocValues - // skip freq/norms for vector field - options |= index.SkipFreqNorm - field := document.NewVectorFieldWithIndexingOptions(fieldName, indexes, decodedVector, fm.Dims, similarity, vectorIndexOptimizedFor, options) context.doc.AddField(field) diff --git a/search/collector/knn.go b/search/collector/knn.go index c529463aa..77baf650f 100644 --- a/search/collector/knn.go +++ b/search/collector/knn.go @@ -19,7 +19,6 @@ package collector import ( "context" - "slices" "time" "github.com/blevesearch/bleve/v2/search" @@ -202,21 +201,37 @@ type KNNCollector struct { nestedStore *collectStoreNested } +// NewKNNCollector creates a new KNNCollector for the given K values and size. func NewKNNCollector(kArray []int64, size int64) *KNNCollector { - return &KNNCollector{ - merger: newKNNMerger(), - knnStore: GetNewKNNCollectorStore(kArray), - size: int(size), - } + return newKNNCollector(kArray, size, nil) +} + +// NewNestedKNNCollector creates a new KNNCollector for the given K values and size, +// with support for nested documents using the provided NestedReader. 
+func NewNestedKNNCollector(kArray []int64, size int64, nr index.NestedReader) *KNNCollector { + return newKNNCollector(kArray, size, nr) } -func NewNestedKNNCollector(nr index.NestedReader, kArray []int64, size int64) *KNNCollector { - return &KNNCollector{ +// internal constructor for KNNCollector, with optional NestedReader +func newKNNCollector(kArray []int64, size int64, nr index.NestedReader) *KNNCollector { + knnCollector := &KNNCollector{ + merger: newKNNMerger(), knnStore: GetNewKNNCollectorStore(kArray), size: int(size), - - nestedStore: newStoreNested(nr), } + if nr != nil { + descAdder := func(parent *search.DocumentMatch, child *search.DocumentMatch) error { + // here we merge the child's score and explanation breakdowns into the parent + parent.ScoreBreakdown, parent.Expl = search.MergeScoreExplBreakdown( + parent.ScoreBreakdown, child.ScoreBreakdown, + parent.Expl, child.Expl) + // add the child's internal ID as a descendant ID to the parent + parent.AddDescendantID(child.IndexInternalID) + return nil + } + knnCollector.nestedStore = newStoreNested(nr, search.DescendantAdderCallbackFn(descAdder)) + } + return knnCollector } func (hc *KNNCollector) Collect(ctx context.Context, searcher search.Searcher, reader index.IndexReader) error { @@ -275,8 +290,21 @@ func (hc *KNNCollector) Collect(ctx context.Context, searcher search.Searcher, r break } - // we may have stored next for merging, or we may have completed a merge - // and have a document ready for further processing, so next can be nil + // if the collector is used in nested mode and if next is non-nil, + // we must further process next to merge it into its root document + // via the nested store, if needed. + if hc.nestedStore != nil && next != nil { + // override next with the returned root document match, if any + next, err = hc.nestedStore.ProcessNestedDocument(searchContext, next) + if err != nil { + break + } + // if next is nil, it means the incoming doc was merged into its parent + // and no root document is ready yet + } + + // if next is non-nil at this point we finally have a document match ready + // to be added to the KNN store via the document match handler if next != nil { err = dmHandler(next) if err != nil { @@ -292,9 +320,30 @@ func (hc *KNNCollector) Collect(ctx context.Context, searcher search.Searcher, r // flush the last merged document if any if lastDoc := hc.merger.Current(); lastDoc != nil { - err = dmHandler(lastDoc) - if err != nil { - return err + // if the collector is used in nested mode, we must further process lastDoc + // to merge it into its root document via the nested store, if needed. 
+ if hc.nestedStore != nil { + var err error + lastDoc, err = hc.nestedStore.ProcessNestedDocument(searchContext, lastDoc) + if err != nil { + return err + } + // if lastDoc is nil, it means the incoming doc was merged into its parent + // and no root document is ready yet + } + if lastDoc != nil { + if err = dmHandler(lastDoc); err != nil { + return err + } + } + } + // double check if there is an interim root document left to be returned + if hc.nestedStore != nil { + doc := hc.nestedStore.Current() + if doc != nil { + if err = dmHandler(doc); err != nil { + return err + } } } @@ -309,18 +358,16 @@ func (hc *KNNCollector) Collect(ctx context.Context, searcher search.Searcher, r hc.took = time.Since(startTime) // finalize actual results - err = hc.finalizeResults(searchContext, reader) + err = hc.finalizeResults(reader) if err != nil { return err } return nil } -func (hc *KNNCollector) finalizeResults(ctx *search.SearchContext, r index.IndexReader) error { +func (hc *KNNCollector) finalizeResults(r index.IndexReader) error { var err error - // finalize the KNN store results - // if collector is used in non-nested mode, then directly finalize from knnStore - docFixup := func(doc *search.DocumentMatch) error { + hc.results, err = hc.knnStore.Final(func(doc *search.DocumentMatch) error { if doc.ID == "" { // look up the id since we need it for lookup var err error @@ -330,54 +377,7 @@ func (hc *KNNCollector) finalizeResults(ctx *search.SearchContext, r index.Index } } return nil - } - if hc.nestedStore == nil { - hc.results, err = hc.knnStore.Final(docFixup) - return err - } - // knn collector is used in nested mode, this means that the documents - // in the knnStore need to be further processed to build the root documents - // first get the raw results without any fixup - rawResults, err := hc.knnStore.Final(nil) - if err != nil { - return err - } - // now sort all the document matches by indexInternalID to ensure that - // the nested processing works correctly, as it expects documents to be - // added in increasing order of internal IDs - slices.SortFunc(rawResults, func(i, j *search.DocumentMatch) int { - return i.IndexInternalID.Compare(j.IndexInternalID) }) - finalResults := make(search.DocumentMatchCollection, 0, len(rawResults)) - // now process each document through the nested store - for _, doc := range rawResults { - // override doc with the returned root document match, if any - doc, err = hc.nestedStore.ProcessNestedDocument(ctx, doc) - if err != nil { - return err - } - // if doc is nil, it means the incoming doc was merged into its parent - // and no root document is ready yet - if doc != nil { - // completed root document match, do fixup and add to results - err = docFixup(doc) - if err != nil { - return err - } - finalResults = append(finalResults, doc) - } - } - // finally, check if there is an interim root document left to be returned - doc := hc.nestedStore.CurrentRoot() - if doc != nil { - // completed root document match, do fixup and add to results - err = docFixup(doc) - if err != nil { - return err - } - finalResults = append(finalResults, doc) - } - hc.results = finalResults return err } diff --git a/search/collector/nested.go b/search/collector/nested.go index 1442d822c..9b137c6fe 100644 --- a/search/collector/nested.go +++ b/search/collector/nested.go @@ -20,21 +20,22 @@ import ( ) type collectStoreNested struct { + // descAdder is used to customize how descendants are merged into their parent + descAdder search.DescendantAdderCallbackFn + // nested reader to 
retrieve ancestor information nr index.NestedReader - // the current root document match being built currRoot *search.DocumentMatch - // the ancestor ID of the current root document being built currRootAncestorID index.AncestorID - // prealloc slice for ancestor IDs ancestors []index.AncestorID } -func newStoreNested(nr index.NestedReader) *collectStoreNested { +func newStoreNested(nr index.NestedReader, descAdder search.DescendantAdderCallbackFn) *collectStoreNested { rv := &collectStoreNested{ - nr: nr, + descAdder: descAdder, + nr: nr, } return rv } @@ -61,7 +62,7 @@ func (c *collectStoreNested) ProcessNestedDocument(ctx *search.SearchContext, do // check if there is an interim root already and if the incoming doc belongs to it if c.currRoot != nil && c.currRootAncestorID.Equals(rootID) { // there is an interim root already, and the incoming doc belongs to it - if err := c.currRoot.AddDescendant(doc); err != nil { + if err := c.descAdder(c.currRoot, doc); err != nil { return nil, err } // recycle the child document now that it's merged into the interim root @@ -88,7 +89,7 @@ func (c *collectStoreNested) ProcessNestedDocument(ctx *search.SearchContext, do // merge the incoming doc into the new interim root c.currRoot = newDM c.currRootAncestorID = rootID - if err := c.currRoot.AddDescendant(doc); err != nil { + if err := c.descAdder(c.currRoot, doc); err != nil { return nil, err } // recycle the child document now that it's merged into the interim root @@ -96,7 +97,7 @@ func (c *collectStoreNested) ProcessNestedDocument(ctx *search.SearchContext, do return completedRoot, nil } -// CurrentRoot returns the current interim root document match being built, if any -func (c *collectStoreNested) CurrentRoot() *search.DocumentMatch { +// Current returns the current interim root document match being built, if any +func (c *collectStoreNested) Current() *search.DocumentMatch { return c.currRoot } diff --git a/search/collector/topn.go b/search/collector/topn.go index e954027f5..395005428 100644 --- a/search/collector/topn.go +++ b/search/collector/topn.go @@ -132,7 +132,18 @@ func newTopNCollector(size int, skip int, sort search.SortOrder, nr index.Nested }) if nr != nil { - hc.nestedStore = newStoreNested(nr) + descAdder := func(parent, child *search.DocumentMatch) error { + // add descendant score to parent score + parent.Score += child.Score + // merge explanations + parent.Expl = parent.Expl.MergeWith(child.Expl) + // merge field term locations + parent.FieldTermLocations = search.MergeFieldTermLocationsFromMatch(parent.FieldTermLocations, child) + // add child's ID to parent's Descendants + parent.AddDescendantID(child.IndexInternalID) + return nil + } + hc.nestedStore = newStoreNested(nr, search.DescendantAdderCallbackFn(descAdder)) } // these lookups traverse an interface, so do once up-front @@ -342,12 +353,10 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, if err != nil { break } - // no descendants at this point err = hc.prepareDocumentMatch(searchContext, reader, next, false) if err != nil { break } - err = dmHandler(next) if err != nil { break @@ -362,7 +371,7 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, // if we have a nested store, we may have an interim root // that needs to be returned for processing if hc.nestedStore != nil { - currRoot := hc.nestedStore.CurrentRoot() + currRoot := hc.nestedStore.Current() if currRoot != nil { err = hc.adjustDocumentMatch(searchContext, reader, currRoot) if err != nil { diff 
--git a/search/search.go b/search/search.go index 12343b3f8..2996fa6f7 100644 --- a/search/search.go +++ b/search/search.go @@ -382,30 +382,19 @@ func (dm *DocumentMatch) String() string { return fmt.Sprintf("[%s-%f]", dm.ID, dm.Score) } -// AddDescendant merges another DocumentMatch into this one as a descendant. -func (dm *DocumentMatch) AddDescendant(other *DocumentMatch) error { - // add descendant score to parent score - dm.Score += other.Score - // merge explanations - dm.Expl = dm.Expl.MergeWith(other.Expl) - // merge field term locations - dm.FieldTermLocations = MergeFieldTermLocationsFromMatch(dm.FieldTermLocations, other) - // merge score breakdown - dm.ScoreBreakdown = MergeScoreBreakdown(dm.ScoreBreakdown, other.ScoreBreakdown) +func (dm *DocumentMatch) AddDescendantID(id index.IndexInternalID) { // add other as descendant only if it is not the same document - if !dm.IndexInternalID.Equals(other.IndexInternalID) { - // Add a copy of other.IndexInternalID to descendants, because - // other.IndexInternalID will be reset when 'other' is recycled. + if !dm.IndexInternalID.Equals(id) { + // Add a copy of id to descendants var descendantID index.IndexInternalID // first check if dm's descendants slice has capacity to reuse if len(dm.Descendants) < cap(dm.Descendants) { // reuse the buffer element at len(dm.Descendants) descendantID = dm.Descendants[:len(dm.Descendants)+1][len(dm.Descendants)] } - // copy the contents of other.IndexInternalID into descendantID, allocating if needed - dm.Descendants = append(dm.Descendants, index.NewIndexInternalIDFrom(descendantID, other.IndexInternalID)) + // copy the contents of id into descendantID, allocating if needed + dm.Descendants = append(dm.Descendants, index.NewIndexInternalIDFrom(descendantID, id)) } - return nil } type DocumentMatchCollection []*DocumentMatch diff --git a/search/util.go b/search/util.go index 8e603760e..61fb78e97 100644 --- a/search/util.go +++ b/search/util.go @@ -101,21 +101,6 @@ func mergeFieldTermLocationFromMatch(dest []FieldTermLocation, dm *DocumentMatch return dest } -// MergeScoreBreakdown merges two score breakdown maps together -func MergeScoreBreakdown(first, second map[int]float64) map[int]float64 { - if first == nil { - return second - } - if second == nil { - return first - } - // reuse first to store the union of both - for k, v := range second { - first[k] += v - } - return first -} - type ( SearchIncrementalCostCallbackMsg uint SearchQueryType uint @@ -241,6 +226,10 @@ type ( // HybridMergeCallbackFn is a callback function type used to merge a KNN document match // into a full text search document match, of the same docID as part of hybrid search. HybridMergeCallbackFn func(ftsMatch *DocumentMatch, knnMatch *DocumentMatch) + // DescendantAdderCallback is a callback function type used to customize how a descendant + // DocumentMatch is merged into its parent. This allows different descendant addition strategies for + // different use cases (e.g., TopN vs KNN collection). + DescendantAdderCallbackFn func(parent *DocumentMatch, descendant *DocumentMatch) error // GeoBufferPoolCallbackFunc is a callback function type used to get the geo buffer pool // to be used during geo searches. 
GeoBufferPoolCallbackFunc func() *s2.GeoBufferPool @@ -342,7 +331,7 @@ func SortedUnion(dest, src []index.IndexInternalID) []index.IndexInternalID { return rv } -// MergeScoreBreakdown merges two score breakdown maps together +// MergeScoreExplBreakdown merges two score breakdown maps together // by picking the best score per query component, and merging them // (and their corresponding explanations) into the first map. func MergeScoreExplBreakdown(first, second map[int]float64, firstExpl, secondExpl *Explanation) (map[int]float64, *Explanation) { diff --git a/search_knn.go b/search_knn.go index 1a1fad926..dabd130a2 100644 --- a/search_knn.go +++ b/search_knn.go @@ -703,7 +703,7 @@ func (i *indexImpl) buildKNNCollector(ctx context.Context, KNNQuery query.Query, return nil, err } if nm.IntersectsPrefix(fs) { - return collector.NewNestedKNNCollector(nr, kArray, sumOfK), nil + return collector.NewNestedKNNCollector(kArray, sumOfK, nr), nil } } } From a233b67e33fbe19d3f56e2dc194ef35284cc41f0 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Mon, 8 Dec 2025 11:48:31 +0530 Subject: [PATCH 60/70] MB-69655: Fix vector normalization to handle multi-vectors correctly (#2260) - When indexing multi-vector fields (e.g., `[[3,0,0], [0,4,0]]`) with `cosine` similarity, normalization was incorrectly applied to the entire flattened array instead of each sub-vector independently, resulting in degraded similarity scores. - Added `NormalizeMultiVector(vec, dims)` that normalizes each sub-vector separately, fixing scores for multi-vector documents (e.g., score now correctly returns 1.0 instead of 0.6 for exact matches). --- mapping/mapping_vectors.go | 34 +++++++- mapping/mapping_vectors_test.go | 118 +++++++++++++++++++++++++++ search_knn_test.go | 137 ++++++++++++++++++++++++++++++++ 3 files changed, 285 insertions(+), 4 deletions(-) diff --git a/mapping/mapping_vectors.go b/mapping/mapping_vectors.go index c3dee9310..393262b35 100644 --- a/mapping/mapping_vectors.go +++ b/mapping/mapping_vectors.go @@ -20,6 +20,7 @@ package mapping import ( "fmt" "reflect" + "slices" "github.com/blevesearch/bleve/v2/document" "github.com/blevesearch/bleve/v2/util" @@ -151,8 +152,10 @@ func (fm *FieldMapping) processVector(propertyMightBeVector interface{}, vectorIndexOptimizedFor = index.DefaultIndexOptimization } // normalize raw vector if similarity is cosine + // Since the vector can be multi-vector (flattened array of multiple vectors), + // we use NormalizeMultiVector to normalize each sub-vector independently. if similarity == index.CosineSimilarity { - vector = NormalizeVector(vector) + vector = NormalizeMultiVector(vector, fm.Dims) } fieldName := getFieldName(pathString, path, fm) @@ -186,7 +189,8 @@ func (fm *FieldMapping) processVectorBase64(propertyMightBeVectorBase64 interfac if err != nil || len(decodedVector) != fm.Dims { return } - // normalize raw vector if similarity is cosine + // normalize raw vector if similarity is cosine, multi-vector is not supported + // for base64 encoded vectors, so we use NormalizeVector directly. if similarity == index.CosineSimilarity { decodedVector = NormalizeVector(decodedVector) } @@ -292,11 +296,33 @@ func validateVectorFieldAlias(field *FieldMapping, path []string, return nil } +// NormalizeVector normalizes a single vector to unit length. +// It makes a copy of the input vector to avoid modifying it in-place. 
func NormalizeVector(vec []float32) []float32 { // make a copy of the vector to avoid modifying the original // vector in-place - vecCopy := make([]float32, len(vec)) - copy(vecCopy, vec) + vecCopy := slices.Clone(vec) // normalize the vector copy using in-place normalization provided by faiss return faiss.NormalizeVector(vecCopy) } + +// NormalizeMultiVector normalizes each sub-vector of size `dims` independently. +// For a flattened array containing multiple vectors, each sub-vector is +// normalized separately to unit length. +// It makes a copy of the input vector to avoid modifying it in-place. +func NormalizeMultiVector(vec []float32, dims int) []float32 { + if len(vec) == 0 || dims <= 0 || len(vec)%dims != 0 { + return vec + } + // Single vector - delegate to NormalizeVector + if len(vec) == dims { + return NormalizeVector(vec) + } + // Multi-vector - make a copy to avoid modifying the original + result := slices.Clone(vec) + // Normalize each sub-vector in-place + for i := 0; i < len(result); i += dims { + faiss.NormalizeVector(result[i : i+dims]) + } + return result +} diff --git a/mapping/mapping_vectors_test.go b/mapping/mapping_vectors_test.go index b00e5c094..0620510a0 100644 --- a/mapping/mapping_vectors_test.go +++ b/mapping/mapping_vectors_test.go @@ -18,6 +18,7 @@ package mapping import ( + "math" "reflect" "strings" "testing" @@ -1069,3 +1070,120 @@ func TestNormalizeVector(t *testing.T) { } } } + +func TestNormalizeMultiVectors(t *testing.T) { + tests := []struct { + name string + input []float32 + dims int + expected []float32 + }{ + { + name: "single vector - already normalized", + input: []float32{1, 0, 0}, + dims: 3, + expected: []float32{1, 0, 0}, + }, + { + name: "single vector - needs normalization", + input: []float32{3, 0, 0}, + dims: 3, + expected: []float32{1, 0, 0}, + }, + { + name: "two vectors - X and Y directions", + input: []float32{3, 0, 0, 0, 4, 0}, + dims: 3, + expected: []float32{1, 0, 0, 0, 1, 0}, + }, + { + name: "three vectors", + input: []float32{3, 0, 0, 0, 4, 0, 0, 0, 5}, + dims: 3, + expected: []float32{1, 0, 0, 0, 1, 0, 0, 0, 1}, + }, + { + name: "two 2D vectors", + input: []float32{3, 4, 5, 12}, + dims: 2, + expected: []float32{0.6, 0.8, 0.38461538, 0.92307693}, + }, + { + name: "empty vector", + input: []float32{}, + dims: 3, + expected: []float32{}, + }, + { + name: "zero dims", + input: []float32{1, 2, 3}, + dims: 0, + expected: []float32{1, 2, 3}, + }, + { + name: "negative dims", + input: []float32{1, 2, 3}, + dims: -1, + expected: []float32{1, 2, 3}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Make a copy of input to verify original is not modified + inputCopy := make([]float32, len(tt.input)) + copy(inputCopy, tt.input) + + result := NormalizeMultiVector(tt.input, tt.dims) + + // Check result matches expected + if len(result) != len(tt.expected) { + t.Errorf("length mismatch: expected %d, got %d", len(tt.expected), len(result)) + return + } + + for i := range result { + if !floatApproxEqual(result[i], tt.expected[i], 1e-5) { + t.Errorf("value mismatch at index %d: expected %v, got %v", + i, tt.expected[i], result[i]) + } + } + + // Verify original input was not modified + if !reflect.DeepEqual(tt.input, inputCopy) { + t.Errorf("original input was modified: was %v, now %v", inputCopy, tt.input) + } + + // For valid multi-vectors, verify each sub-vector has unit magnitude + if tt.dims > 0 && len(tt.input) > 0 && len(tt.input)%tt.dims == 0 { + numVecs := len(result) / tt.dims + for i := 0; i < 
numVecs; i++ { + subVec := result[i*tt.dims : (i+1)*tt.dims] + mag := magnitude(subVec) + // Allow for zero vectors (magnitude 0) or unit vectors (magnitude 1) + if mag > 1e-6 && !floatApproxEqual(mag, 1.0, 1e-5) { + t.Errorf("sub-vector %d has magnitude %v, expected 1.0", i, mag) + } + } + } + }) + } +} + +// Helper to compute magnitude of a vector +func magnitude(v []float32) float32 { + var sum float32 + for _, x := range v { + sum += x * x + } + return float32(math.Sqrt(float64(sum))) +} + +// Helper for approximate float comparison +func floatApproxEqual(a, b, epsilon float32) bool { + diff := a - b + if diff < 0 { + diff = -diff + } + return diff < epsilon +} diff --git a/search_knn_test.go b/search_knn_test.go index 7914aeac2..e3e4e34ba 100644 --- a/search_knn_test.go +++ b/search_knn_test.go @@ -1856,6 +1856,143 @@ func TestKNNMerger(t *testing.T) { }) } +// TestMultiVectorCosineNormalization verifies that multi-vector fields are +// normalized correctly with cosine similarity. Each sub-vector in a multi-vector +// should be independently normalized, producing correct similarity scores. +func TestMultiVectorCosineNormalization(t *testing.T) { + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) + + const dims = 3 + + // Create index with cosine similarity + indexMapping := NewIndexMapping() + vecFieldMapping := mapping.NewVectorFieldMapping() + vecFieldMapping.Dims = dims + vecFieldMapping.Similarity = index.CosineSimilarity + indexMapping.DefaultMapping.AddFieldMappingsAt("vec", vecFieldMapping) + + // Multi-vector field + vecFieldMappingNested := mapping.NewVectorFieldMapping() + vecFieldMappingNested.Dims = dims + vecFieldMappingNested.Similarity = index.CosineSimilarity + indexMapping.DefaultMapping.AddFieldMappingsAt("vec_nested", vecFieldMappingNested) + + idx, err := New(tmpIndexPath, indexMapping) + if err != nil { + t.Fatal(err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + docsString := []string{ + `{"vec": [3, 0, 0]}`, + `{"vec": [0, 4, 0]}`, + `{"vec_nested": [[3, 0, 0], [0, 4, 0]]}`, + } + + for i, docStr := range docsString { + var doc map[string]interface{} + err = json.Unmarshal([]byte(docStr), &doc) + if err != nil { + t.Fatal(err) + } + err = idx.Index(fmt.Sprintf("doc%d", i+1), doc) + if err != nil { + t.Fatal(err) + } + } + + // Query for X direction [1,0,0] + searchReq := NewSearchRequest(query.NewMatchNoneQuery()) + searchReq.AddKNN("vec", []float32{1, 0, 0}, 3, 1.0) + res, err := idx.Search(searchReq) + if err != nil { + t.Fatal(err) + } + if len(res.Hits) != 2 { + t.Fatalf("expected 2 hits, got %d", len(res.Hits)) + } + // Hit 1 should be doc1 with score 1.0 (perfect match) + if res.Hits[0].ID != "doc1" { + t.Fatalf("expected doc1 as first hit, got %s", res.Hits[0].ID) + } + if math.Abs(float64(res.Hits[0].Score-1.0)) > 1e-6 { + t.Fatalf("expected score 1.0, got %f", res.Hits[0].Score) + } + // Hit 2 should be doc2 with a score of 0.0 (orthogonal) + if res.Hits[1].ID != "doc2" { + t.Fatalf("expected doc2 as second hit, got %s", res.Hits[1].ID) + } + if math.Abs(float64(res.Hits[1].Score-0.0)) > 1e-6 { + t.Fatalf("expected score 0.0, got %f", res.Hits[1].Score) + } + + // Query for Y direction [0,1,0] + searchReq = NewSearchRequest(query.NewMatchNoneQuery()) + searchReq.AddKNN("vec", []float32{0, 1, 0}, 3, 1.0) + res, err = idx.Search(searchReq) + if err != nil { + t.Fatal(err) + } + if len(res.Hits) != 2 { + t.Fatalf("expected 2 hits, got %d", len(res.Hits)) + } + // Hit 1 should be 
doc2 with score 1.0 (perfect match) + if res.Hits[0].ID != "doc2" { + t.Fatalf("expected doc2 as first hit, got %s", res.Hits[0].ID) + } + if math.Abs(float64(res.Hits[0].Score-1.0)) > 1e-6 { + t.Fatalf("expected score 1.0, got %f", res.Hits[0].Score) + } + // Hit 2 should be doc1 with a score of 0.0 (orthogonal) + if res.Hits[1].ID != "doc1" { + t.Fatalf("expected doc1 as second hit, got %s", res.Hits[1].ID) + } + if math.Abs(float64(res.Hits[1].Score-0.0)) > 1e-6 { + t.Fatalf("expected score 0.0, got %f", res.Hits[1].Score) + } + + // Now test querying the nested multi-vector field + searchReq = NewSearchRequest(query.NewMatchNoneQuery()) + searchReq.AddKNN("vec_nested", []float32{1, 0, 0}, 3, 1.0) + res, err = idx.Search(searchReq) + if err != nil { + t.Fatal(err) + } + if len(res.Hits) != 1 { + t.Fatalf("expected 1 hit, got %d", len(res.Hits)) + } + // Hit should be doc3 with score 1.0 (perfect match on first sub-vector) + if res.Hits[0].ID != "doc3" { + t.Fatalf("expected doc3 as first hit, got %s", res.Hits[0].ID) + } + if math.Abs(float64(res.Hits[0].Score-1.0)) > 1e-6 { + t.Fatalf("expected score 1.0, got %f", res.Hits[0].Score) + } + // Query for Y direction [0,1,0] on nested field + searchReq = NewSearchRequest(query.NewMatchNoneQuery()) + searchReq.AddKNN("vec_nested", []float32{0, 1, 0}, 3, 1.0) + res, err = idx.Search(searchReq) + if err != nil { + t.Fatal(err) + } + if len(res.Hits) != 1 { + t.Fatalf("expected 1 hit, got %d", len(res.Hits)) + } + // Hit should be doc3 with score 1.0 (perfect match on second sub-vector) + if res.Hits[0].ID != "doc3" { + t.Fatalf("expected doc3 as first hit, got %s", res.Hits[0].ID) + } + if math.Abs(float64(res.Hits[0].Score-1.0)) > 1e-6 { + t.Fatalf("expected score 1.0, got %f", res.Hits[0].Score) + } +} + func TestNumVecsStat(t *testing.T) { dataset, _, err := readDatasetAndQueries(testInputCompressedFile) From dd2422db01f6853363897ccf2f887402894a72b0 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Mon, 8 Dec 2025 13:37:26 +0530 Subject: [PATCH 61/70] revert --- index_alias_impl.go | 11 +++++- search/collector/knn.go | 74 ----------------------------------------- search_knn.go | 42 +++++++++++++++++++---- search_no_knn.go | 4 +++ 4 files changed, 50 insertions(+), 81 deletions(-) diff --git a/index_alias_impl.go b/index_alias_impl.go index 8212c74b9..41e78f1f2 100644 --- a/index_alias_impl.go +++ b/index_alias_impl.go @@ -946,7 +946,16 @@ func finalizePreSearchResult(req *SearchRequest, flags *preSearchFlags, preSearc return } if flags.knn { - preSearchResult.Hits = finalizeKNNResults(req, preSearchResult.Hits) + knnHits := preSearchResult.Hits + // we are done calculating the final top K vectors, so we need to prepare + // the payload for returning the final results. + knnHits = prepareKNNResults(req, knnHits) + // if score fusion is not requested, then finalize the KNN results now. + // else, defer the finalization to the score fusion phase. 
+ if !IsScoreFusionRequested(req) { + knnHits = finalizeKNNResults(req, knnHits) + } + preSearchResult.Hits = knnHits } } diff --git a/search/collector/knn.go b/search/collector/knn.go index 669d99940..6ed8059ec 100644 --- a/search/collector/knn.go +++ b/search/collector/knn.go @@ -95,61 +95,6 @@ func (c *collectStoreKNN) Final(fixup collectorFixup) (search.DocumentMatchColle return rv, nil } -// ----------------------------------------------------------------------------------------- -// knnMerger is a preprocessing layer on top of collectStoreKNN that merges -// duplicate document matches before adding them to the underlying KNN store. -// Since KNN searchers may return the same document multiple times (with different -// scoreBreakdown values for each KNN query), we must merge these to retain the -// best score per KNN query before adding them to the KNN store. -// NOTE: This implementation assumes documents arrive in increasing order of their -// internal IDs, which is guaranteed by all searchers in bleve. -type knnMerger struct { - // curr holds the current document match being accumulated during the merge process. - curr *search.DocumentMatch -} - -func newKNNMerger() *knnMerger { - return &knnMerger{} -} - -// Merge merges duplicate document matches by combining their score breakdowns. -// Returns nil if the incoming doc was merged into the current document. -// Returns the completed previous document when a new document with a different ID arrives. -// The returned DocumentMatch is ready for further processing. -func (c *knnMerger) Merge(ctx *search.SearchContext, doc *search.DocumentMatch) (*search.DocumentMatch, error) { - // see if the document has been seen before - if c.curr != nil && c.curr.IndexInternalID.Equals(doc.IndexInternalID) { - // merge the score breakdowns - c.curr.ScoreBreakdown, c.curr.Expl = search.MergeScoreExplBreakdown( - c.curr.ScoreBreakdown, doc.ScoreBreakdown, - c.curr.Expl, doc.Expl) - // recycle the incoming document now that it's merged - ctx.DocumentMatchPool.Put(doc) - // return nil since no document to process further - return nil, nil - } - // now we are sure that this is a new document, check if we have an existing - // document to return for processing - if c.curr != nil { - // we have an existing document, return it for processing - toReturn := c.curr - // set the current to the incoming document - c.curr = doc - return toReturn, nil - } - // first document seen, set it as current and return nil - c.curr = doc - return nil, nil -} - -// Current returns the current document match being merged, if any. -// Call this after processing all documents to flush the last document. 
-func (c *knnMerger) Current() *search.DocumentMatch { - return c.curr -} - -// ----------------------------------------------------------------------------------------- - func MakeKNNDocMatchHandler(ctx *search.SearchContext) (search.DocumentMatchHandler, error) { var hc *KNNCollector var ok bool @@ -188,8 +133,6 @@ func GetNewKNNCollectorStore(kArray []int64) *collectStoreKNN { // implements Collector interface type KNNCollector struct { - // merger merges duplicate document matches before adding to knnStore - merger *knnMerger // knnStore is the underlying store for KNN document matches knnStore *collectStoreKNN size int @@ -201,7 +144,6 @@ type KNNCollector struct { func NewKNNCollector(kArray []int64, size int64) *KNNCollector { return &KNNCollector{ - merger: newKNNMerger(), knnStore: GetNewKNNCollectorStore(kArray), size: int(size), } @@ -255,14 +197,6 @@ func (hc *KNNCollector) Collect(ctx context.Context, searcher search.Searcher, r } total++ - // since we may get duplicate document matches from the KNN searcher, - // we must merge them before adding to the KNN store, keeping the - // best scoreBreakdown for each KNN query per document. - next, err = hc.merger.Merge(searchContext, next) - if err != nil { - break - } - // we may have stored next for merging, or we may have completed a merge // and have a document ready for further processing, so next can be nil if next != nil { @@ -278,14 +212,6 @@ func (hc *KNNCollector) Collect(ctx context.Context, searcher search.Searcher, r return err } - // flush the last merged document if any - if lastDoc := hc.merger.Current(); lastDoc != nil { - err = dmHandler(lastDoc) - if err != nil { - return err - } - } - // help finalize/flush the results in case // of custom document match handlers. err = dmHandler(nil) diff --git a/search_knn.go b/search_knn.go index 54771ede0..e395d1085 100644 --- a/search_knn.go +++ b/search_knn.go @@ -458,7 +458,14 @@ func (i *indexImpl) runKnnCollector(ctx context.Context, req *SearchRequest, rea } knnHits = knnCollector.Results() if !preSearch { - knnHits = finalizeKNNResults(req, knnHits) + // we are done calculating the final top K vectors, so we need to prepare + // the payload for returning the final results. + knnHits = prepareKNNResults(req, knnHits) + // if score fusion is not requested, then finalize the KNN results now. + // else, defer the finalization to the score fusion phase. + if !IsScoreFusionRequested(req) { + knnHits = finalizeKNNResults(req, knnHits) + } } // at this point, irrespective of whether it is a preSearch or not, // the knn hits are populated with Sort and Fields. @@ -488,7 +495,7 @@ func setKnnHitsInCollector(knnHits []*search.DocumentMatch, req *SearchRequest, } } -func finalizeKNNResults(req *SearchRequest, knnHits []*search.DocumentMatch) []*search.DocumentMatch { +func prepareKNNResults(req *SearchRequest, knnHits []*search.DocumentMatch) []*search.DocumentMatch { // if the KNN operator is AND, then we need to filter out the hits that // do not have match the KNN queries. if req.KNNOperator == knnOperatorAnd { @@ -501,12 +508,35 @@ func finalizeKNNResults(req *SearchRequest, knnHits []*search.DocumentMatch) []* } knnHits = knnHits[:idx] } - - // if score fusion required, return early because - // score breakdown is retained - if IsScoreFusionRequested(req) { + if len(knnHits) == 0 { return knnHits } + // we may be getting multiple vectors for the same document, so + // we need to deduplicate the hits based on the Document ID. + // sort the hits based on the Document ID. 
+ sort.Slice(knnHits, func(i, j int) bool { + return knnHits[i].ID < knnHits[j].ID + }) + rv := knnHits[:1] + lastUnique := rv[0] + for i := 1; i < len(knnHits); i++ { + current := knnHits[i] + if current.ID != lastUnique.ID { + rv = append(rv, current) + lastUnique = current + } else { + // we have a duplicate document, so we take the best score breakdown + // for each KNN query. + lastUnique.ScoreBreakdown, lastUnique.Expl = search.MergeScoreExplBreakdown( + lastUnique.ScoreBreakdown, current.ScoreBreakdown, + lastUnique.Expl, current.Expl) + } + } + knnHits = rv + return knnHits +} + +func finalizeKNNResults(req *SearchRequest, knnHits []*search.DocumentMatch) []*search.DocumentMatch { // fix the score using score breakdown now // if the score is none, then we need to set the score to 0.0 // if req.Explain is true, then we need to use the expl breakdown to diff --git a/search_no_knn.go b/search_no_knn.go index 172f258ec..673d0479a 100644 --- a/search_no_knn.go +++ b/search_no_knn.go @@ -224,6 +224,10 @@ func constructKnnPreSearchData(mergedOut map[string]map[string]interface{}, preS return mergedOut, nil } +func prepareKNNResults(req *SearchRequest, knnHits []*search.DocumentMatch) []*search.DocumentMatch { { + return knnHits +} + func finalizeKNNResults(req *SearchRequest, knnHits []*search.DocumentMatch) []*search.DocumentMatch { return knnHits } From f3540a6bd7eed85c1751ae60f5ceba5a04ae56ee Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Mon, 8 Dec 2025 14:17:46 +0530 Subject: [PATCH 62/70] fix --- search/collector/knn.go | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/search/collector/knn.go b/search/collector/knn.go index 6ed8059ec..465bf6927 100644 --- a/search/collector/knn.go +++ b/search/collector/knn.go @@ -103,9 +103,6 @@ func MakeKNNDocMatchHandler(ctx *search.SearchContext) (search.DocumentMatchHand if d == nil { return nil } - // increment total count as we are sure that this is a - // valid document match to be added to the KNN store - hc.total++ toRelease := hc.knnStore.AddDocument(d) for _, doc := range toRelease { ctx.DocumentMatchPool.Put(doc) @@ -133,7 +130,6 @@ func GetNewKNNCollectorStore(kArray []int64) *collectStoreKNN { // implements Collector interface type KNNCollector struct { - // knnStore is the underlying store for KNN document matches knnStore *collectStoreKNN size int total uint64 @@ -184,10 +180,8 @@ func (hc *KNNCollector) Collect(ctx context.Context, searcher search.Searcher, r default: next, err = searcher.Next(searchContext) } - // maintain a total count of documents processed, for context cancellation checks - var total uint64 for err == nil && next != nil { - if total%CheckDoneEvery == 0 { + if hc.total%CheckDoneEvery == 0 { select { case <-ctx.Done(): search.RecordSearchCost(ctx, search.AbortM, 0) @@ -195,15 +189,11 @@ func (hc *KNNCollector) Collect(ctx context.Context, searcher search.Searcher, r default: } } - total++ + hc.total++ - // we may have stored next for merging, or we may have completed a merge - // and have a document ready for further processing, so next can be nil - if next != nil { - err = dmHandler(next) - if err != nil { - break - } + err = dmHandler(next) + if err != nil { + break } next, err = searcher.Next(searchContext) From fcb0d76dd0f9df0350bce9f9b8de13f72b2c2c0d Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Mon, 8 Dec 2025 14:44:58 +0530 Subject: [PATCH 63/70] fix --- search_no_knn.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/search_no_knn.go 
b/search_no_knn.go index 673d0479a..3e9bad9f6 100644 --- a/search_no_knn.go +++ b/search_no_knn.go @@ -224,7 +224,7 @@ func constructKnnPreSearchData(mergedOut map[string]map[string]interface{}, preS return mergedOut, nil } -func prepareKNNResults(req *SearchRequest, knnHits []*search.DocumentMatch) []*search.DocumentMatch { { +func prepareKNNResults(req *SearchRequest, knnHits []*search.DocumentMatch) []*search.DocumentMatch { return knnHits } From fc32bb8d64648003cdd117b5aa75ea1cc0339dd4 Mon Sep 17 00:00:00 2001 From: Abhinav Dangeti Date: Mon, 8 Dec 2025 08:14:28 -0700 Subject: [PATCH 64/70] Revert "MB-69655: Fix vector normalization to handle multi-vectors correctly (#2260)" This reverts commit a233b67e33fbe19d3f56e2dc194ef35284cc41f0. --- mapping/mapping_vectors.go | 34 +------- mapping/mapping_vectors_test.go | 118 --------------------------- search_knn_test.go | 137 -------------------------------- 3 files changed, 4 insertions(+), 285 deletions(-) diff --git a/mapping/mapping_vectors.go b/mapping/mapping_vectors.go index 393262b35..c3dee9310 100644 --- a/mapping/mapping_vectors.go +++ b/mapping/mapping_vectors.go @@ -20,7 +20,6 @@ package mapping import ( "fmt" "reflect" - "slices" "github.com/blevesearch/bleve/v2/document" "github.com/blevesearch/bleve/v2/util" @@ -152,10 +151,8 @@ func (fm *FieldMapping) processVector(propertyMightBeVector interface{}, vectorIndexOptimizedFor = index.DefaultIndexOptimization } // normalize raw vector if similarity is cosine - // Since the vector can be multi-vector (flattened array of multiple vectors), - // we use NormalizeMultiVector to normalize each sub-vector independently. if similarity == index.CosineSimilarity { - vector = NormalizeMultiVector(vector, fm.Dims) + vector = NormalizeVector(vector) } fieldName := getFieldName(pathString, path, fm) @@ -189,8 +186,7 @@ func (fm *FieldMapping) processVectorBase64(propertyMightBeVectorBase64 interfac if err != nil || len(decodedVector) != fm.Dims { return } - // normalize raw vector if similarity is cosine, multi-vector is not supported - // for base64 encoded vectors, so we use NormalizeVector directly. + // normalize raw vector if similarity is cosine if similarity == index.CosineSimilarity { decodedVector = NormalizeVector(decodedVector) } @@ -296,33 +292,11 @@ func validateVectorFieldAlias(field *FieldMapping, path []string, return nil } -// NormalizeVector normalizes a single vector to unit length. -// It makes a copy of the input vector to avoid modifying it in-place. func NormalizeVector(vec []float32) []float32 { // make a copy of the vector to avoid modifying the original // vector in-place - vecCopy := slices.Clone(vec) + vecCopy := make([]float32, len(vec)) + copy(vecCopy, vec) // normalize the vector copy using in-place normalization provided by faiss return faiss.NormalizeVector(vecCopy) } - -// NormalizeMultiVector normalizes each sub-vector of size `dims` independently. -// For a flattened array containing multiple vectors, each sub-vector is -// normalized separately to unit length. -// It makes a copy of the input vector to avoid modifying it in-place. 
-func NormalizeMultiVector(vec []float32, dims int) []float32 { - if len(vec) == 0 || dims <= 0 || len(vec)%dims != 0 { - return vec - } - // Single vector - delegate to NormalizeVector - if len(vec) == dims { - return NormalizeVector(vec) - } - // Multi-vector - make a copy to avoid modifying the original - result := slices.Clone(vec) - // Normalize each sub-vector in-place - for i := 0; i < len(result); i += dims { - faiss.NormalizeVector(result[i : i+dims]) - } - return result -} diff --git a/mapping/mapping_vectors_test.go b/mapping/mapping_vectors_test.go index 0620510a0..b00e5c094 100644 --- a/mapping/mapping_vectors_test.go +++ b/mapping/mapping_vectors_test.go @@ -18,7 +18,6 @@ package mapping import ( - "math" "reflect" "strings" "testing" @@ -1070,120 +1069,3 @@ func TestNormalizeVector(t *testing.T) { } } } - -func TestNormalizeMultiVectors(t *testing.T) { - tests := []struct { - name string - input []float32 - dims int - expected []float32 - }{ - { - name: "single vector - already normalized", - input: []float32{1, 0, 0}, - dims: 3, - expected: []float32{1, 0, 0}, - }, - { - name: "single vector - needs normalization", - input: []float32{3, 0, 0}, - dims: 3, - expected: []float32{1, 0, 0}, - }, - { - name: "two vectors - X and Y directions", - input: []float32{3, 0, 0, 0, 4, 0}, - dims: 3, - expected: []float32{1, 0, 0, 0, 1, 0}, - }, - { - name: "three vectors", - input: []float32{3, 0, 0, 0, 4, 0, 0, 0, 5}, - dims: 3, - expected: []float32{1, 0, 0, 0, 1, 0, 0, 0, 1}, - }, - { - name: "two 2D vectors", - input: []float32{3, 4, 5, 12}, - dims: 2, - expected: []float32{0.6, 0.8, 0.38461538, 0.92307693}, - }, - { - name: "empty vector", - input: []float32{}, - dims: 3, - expected: []float32{}, - }, - { - name: "zero dims", - input: []float32{1, 2, 3}, - dims: 0, - expected: []float32{1, 2, 3}, - }, - { - name: "negative dims", - input: []float32{1, 2, 3}, - dims: -1, - expected: []float32{1, 2, 3}, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - // Make a copy of input to verify original is not modified - inputCopy := make([]float32, len(tt.input)) - copy(inputCopy, tt.input) - - result := NormalizeMultiVector(tt.input, tt.dims) - - // Check result matches expected - if len(result) != len(tt.expected) { - t.Errorf("length mismatch: expected %d, got %d", len(tt.expected), len(result)) - return - } - - for i := range result { - if !floatApproxEqual(result[i], tt.expected[i], 1e-5) { - t.Errorf("value mismatch at index %d: expected %v, got %v", - i, tt.expected[i], result[i]) - } - } - - // Verify original input was not modified - if !reflect.DeepEqual(tt.input, inputCopy) { - t.Errorf("original input was modified: was %v, now %v", inputCopy, tt.input) - } - - // For valid multi-vectors, verify each sub-vector has unit magnitude - if tt.dims > 0 && len(tt.input) > 0 && len(tt.input)%tt.dims == 0 { - numVecs := len(result) / tt.dims - for i := 0; i < numVecs; i++ { - subVec := result[i*tt.dims : (i+1)*tt.dims] - mag := magnitude(subVec) - // Allow for zero vectors (magnitude 0) or unit vectors (magnitude 1) - if mag > 1e-6 && !floatApproxEqual(mag, 1.0, 1e-5) { - t.Errorf("sub-vector %d has magnitude %v, expected 1.0", i, mag) - } - } - } - }) - } -} - -// Helper to compute magnitude of a vector -func magnitude(v []float32) float32 { - var sum float32 - for _, x := range v { - sum += x * x - } - return float32(math.Sqrt(float64(sum))) -} - -// Helper for approximate float comparison -func floatApproxEqual(a, b, epsilon float32) bool { - diff := a - b - 
if diff < 0 { - diff = -diff - } - return diff < epsilon -} diff --git a/search_knn_test.go b/search_knn_test.go index e3e4e34ba..7914aeac2 100644 --- a/search_knn_test.go +++ b/search_knn_test.go @@ -1856,143 +1856,6 @@ func TestKNNMerger(t *testing.T) { }) } -// TestMultiVectorCosineNormalization verifies that multi-vector fields are -// normalized correctly with cosine similarity. Each sub-vector in a multi-vector -// should be independently normalized, producing correct similarity scores. -func TestMultiVectorCosineNormalization(t *testing.T) { - tmpIndexPath := createTmpIndexPath(t) - defer cleanupTmpIndexPath(t, tmpIndexPath) - - const dims = 3 - - // Create index with cosine similarity - indexMapping := NewIndexMapping() - vecFieldMapping := mapping.NewVectorFieldMapping() - vecFieldMapping.Dims = dims - vecFieldMapping.Similarity = index.CosineSimilarity - indexMapping.DefaultMapping.AddFieldMappingsAt("vec", vecFieldMapping) - - // Multi-vector field - vecFieldMappingNested := mapping.NewVectorFieldMapping() - vecFieldMappingNested.Dims = dims - vecFieldMappingNested.Similarity = index.CosineSimilarity - indexMapping.DefaultMapping.AddFieldMappingsAt("vec_nested", vecFieldMappingNested) - - idx, err := New(tmpIndexPath, indexMapping) - if err != nil { - t.Fatal(err) - } - defer func() { - err := idx.Close() - if err != nil { - t.Fatal(err) - } - }() - - docsString := []string{ - `{"vec": [3, 0, 0]}`, - `{"vec": [0, 4, 0]}`, - `{"vec_nested": [[3, 0, 0], [0, 4, 0]]}`, - } - - for i, docStr := range docsString { - var doc map[string]interface{} - err = json.Unmarshal([]byte(docStr), &doc) - if err != nil { - t.Fatal(err) - } - err = idx.Index(fmt.Sprintf("doc%d", i+1), doc) - if err != nil { - t.Fatal(err) - } - } - - // Query for X direction [1,0,0] - searchReq := NewSearchRequest(query.NewMatchNoneQuery()) - searchReq.AddKNN("vec", []float32{1, 0, 0}, 3, 1.0) - res, err := idx.Search(searchReq) - if err != nil { - t.Fatal(err) - } - if len(res.Hits) != 2 { - t.Fatalf("expected 2 hits, got %d", len(res.Hits)) - } - // Hit 1 should be doc1 with score 1.0 (perfect match) - if res.Hits[0].ID != "doc1" { - t.Fatalf("expected doc1 as first hit, got %s", res.Hits[0].ID) - } - if math.Abs(float64(res.Hits[0].Score-1.0)) > 1e-6 { - t.Fatalf("expected score 1.0, got %f", res.Hits[0].Score) - } - // Hit 2 should be doc2 with a score of 0.0 (orthogonal) - if res.Hits[1].ID != "doc2" { - t.Fatalf("expected doc2 as second hit, got %s", res.Hits[1].ID) - } - if math.Abs(float64(res.Hits[1].Score-0.0)) > 1e-6 { - t.Fatalf("expected score 0.0, got %f", res.Hits[1].Score) - } - - // Query for Y direction [0,1,0] - searchReq = NewSearchRequest(query.NewMatchNoneQuery()) - searchReq.AddKNN("vec", []float32{0, 1, 0}, 3, 1.0) - res, err = idx.Search(searchReq) - if err != nil { - t.Fatal(err) - } - if len(res.Hits) != 2 { - t.Fatalf("expected 2 hits, got %d", len(res.Hits)) - } - // Hit 1 should be doc2 with score 1.0 (perfect match) - if res.Hits[0].ID != "doc2" { - t.Fatalf("expected doc2 as first hit, got %s", res.Hits[0].ID) - } - if math.Abs(float64(res.Hits[0].Score-1.0)) > 1e-6 { - t.Fatalf("expected score 1.0, got %f", res.Hits[0].Score) - } - // Hit 2 should be doc1 with a score of 0.0 (orthogonal) - if res.Hits[1].ID != "doc1" { - t.Fatalf("expected doc1 as second hit, got %s", res.Hits[1].ID) - } - if math.Abs(float64(res.Hits[1].Score-0.0)) > 1e-6 { - t.Fatalf("expected score 0.0, got %f", res.Hits[1].Score) - } - - // Now test querying the nested multi-vector field - searchReq = 
NewSearchRequest(query.NewMatchNoneQuery()) - searchReq.AddKNN("vec_nested", []float32{1, 0, 0}, 3, 1.0) - res, err = idx.Search(searchReq) - if err != nil { - t.Fatal(err) - } - if len(res.Hits) != 1 { - t.Fatalf("expected 1 hit, got %d", len(res.Hits)) - } - // Hit should be doc3 with score 1.0 (perfect match on first sub-vector) - if res.Hits[0].ID != "doc3" { - t.Fatalf("expected doc3 as first hit, got %s", res.Hits[0].ID) - } - if math.Abs(float64(res.Hits[0].Score-1.0)) > 1e-6 { - t.Fatalf("expected score 1.0, got %f", res.Hits[0].Score) - } - // Query for Y direction [0,1,0] on nested field - searchReq = NewSearchRequest(query.NewMatchNoneQuery()) - searchReq.AddKNN("vec_nested", []float32{0, 1, 0}, 3, 1.0) - res, err = idx.Search(searchReq) - if err != nil { - t.Fatal(err) - } - if len(res.Hits) != 1 { - t.Fatalf("expected 1 hit, got %d", len(res.Hits)) - } - // Hit should be doc3 with score 1.0 (perfect match on second sub-vector) - if res.Hits[0].ID != "doc3" { - t.Fatalf("expected doc3 as first hit, got %s", res.Hits[0].ID) - } - if math.Abs(float64(res.Hits[0].Score-1.0)) > 1e-6 { - t.Fatalf("expected score 1.0, got %f", res.Hits[0].Score) - } -} - func TestNumVecsStat(t *testing.T) { dataset, _, err := readDatasetAndQueries(testInputCompressedFile) From 8a4e70e75c7816d1d4eac7a1114ed63974a39bdc Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Mon, 8 Dec 2025 11:48:31 +0530 Subject: [PATCH 65/70] MB-69655: Fix vector normalization to handle multi-vectors correctly (#2260) - When indexing multi-vector fields (e.g., `[[3,0,0], [0,4,0]]`) with `cosine` similarity, normalization was incorrectly applied to the entire flattened array instead of each sub-vector independently, resulting in degraded similarity scores. - Added `NormalizeMultiVector(vec, dims)` that normalizes each sub-vector separately, fixing scores for multi-vector documents (e.g., score now correctly returns 1.0 instead of 0.6 for exact matches). --- mapping/mapping_vectors.go | 34 +++++++- mapping/mapping_vectors_test.go | 118 +++++++++++++++++++++++++++ search_knn_test.go | 137 ++++++++++++++++++++++++++++++++ 3 files changed, 285 insertions(+), 4 deletions(-) diff --git a/mapping/mapping_vectors.go b/mapping/mapping_vectors.go index c3dee9310..393262b35 100644 --- a/mapping/mapping_vectors.go +++ b/mapping/mapping_vectors.go @@ -20,6 +20,7 @@ package mapping import ( "fmt" "reflect" + "slices" "github.com/blevesearch/bleve/v2/document" "github.com/blevesearch/bleve/v2/util" @@ -151,8 +152,10 @@ func (fm *FieldMapping) processVector(propertyMightBeVector interface{}, vectorIndexOptimizedFor = index.DefaultIndexOptimization } // normalize raw vector if similarity is cosine + // Since the vector can be multi-vector (flattened array of multiple vectors), + // we use NormalizeMultiVector to normalize each sub-vector independently. if similarity == index.CosineSimilarity { - vector = NormalizeVector(vector) + vector = NormalizeMultiVector(vector, fm.Dims) } fieldName := getFieldName(pathString, path, fm) @@ -186,7 +189,8 @@ func (fm *FieldMapping) processVectorBase64(propertyMightBeVectorBase64 interfac if err != nil || len(decodedVector) != fm.Dims { return } - // normalize raw vector if similarity is cosine + // normalize raw vector if similarity is cosine, multi-vector is not supported + // for base64 encoded vectors, so we use NormalizeVector directly. 
if similarity == index.CosineSimilarity { decodedVector = NormalizeVector(decodedVector) } @@ -292,11 +296,33 @@ func validateVectorFieldAlias(field *FieldMapping, path []string, return nil } +// NormalizeVector normalizes a single vector to unit length. +// It makes a copy of the input vector to avoid modifying it in-place. func NormalizeVector(vec []float32) []float32 { // make a copy of the vector to avoid modifying the original // vector in-place - vecCopy := make([]float32, len(vec)) - copy(vecCopy, vec) + vecCopy := slices.Clone(vec) // normalize the vector copy using in-place normalization provided by faiss return faiss.NormalizeVector(vecCopy) } + +// NormalizeMultiVector normalizes each sub-vector of size `dims` independently. +// For a flattened array containing multiple vectors, each sub-vector is +// normalized separately to unit length. +// It makes a copy of the input vector to avoid modifying it in-place. +func NormalizeMultiVector(vec []float32, dims int) []float32 { + if len(vec) == 0 || dims <= 0 || len(vec)%dims != 0 { + return vec + } + // Single vector - delegate to NormalizeVector + if len(vec) == dims { + return NormalizeVector(vec) + } + // Multi-vector - make a copy to avoid modifying the original + result := slices.Clone(vec) + // Normalize each sub-vector in-place + for i := 0; i < len(result); i += dims { + faiss.NormalizeVector(result[i : i+dims]) + } + return result +} diff --git a/mapping/mapping_vectors_test.go b/mapping/mapping_vectors_test.go index b00e5c094..0620510a0 100644 --- a/mapping/mapping_vectors_test.go +++ b/mapping/mapping_vectors_test.go @@ -18,6 +18,7 @@ package mapping import ( + "math" "reflect" "strings" "testing" @@ -1069,3 +1070,120 @@ func TestNormalizeVector(t *testing.T) { } } } + +func TestNormalizeMultiVectors(t *testing.T) { + tests := []struct { + name string + input []float32 + dims int + expected []float32 + }{ + { + name: "single vector - already normalized", + input: []float32{1, 0, 0}, + dims: 3, + expected: []float32{1, 0, 0}, + }, + { + name: "single vector - needs normalization", + input: []float32{3, 0, 0}, + dims: 3, + expected: []float32{1, 0, 0}, + }, + { + name: "two vectors - X and Y directions", + input: []float32{3, 0, 0, 0, 4, 0}, + dims: 3, + expected: []float32{1, 0, 0, 0, 1, 0}, + }, + { + name: "three vectors", + input: []float32{3, 0, 0, 0, 4, 0, 0, 0, 5}, + dims: 3, + expected: []float32{1, 0, 0, 0, 1, 0, 0, 0, 1}, + }, + { + name: "two 2D vectors", + input: []float32{3, 4, 5, 12}, + dims: 2, + expected: []float32{0.6, 0.8, 0.38461538, 0.92307693}, + }, + { + name: "empty vector", + input: []float32{}, + dims: 3, + expected: []float32{}, + }, + { + name: "zero dims", + input: []float32{1, 2, 3}, + dims: 0, + expected: []float32{1, 2, 3}, + }, + { + name: "negative dims", + input: []float32{1, 2, 3}, + dims: -1, + expected: []float32{1, 2, 3}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Make a copy of input to verify original is not modified + inputCopy := make([]float32, len(tt.input)) + copy(inputCopy, tt.input) + + result := NormalizeMultiVector(tt.input, tt.dims) + + // Check result matches expected + if len(result) != len(tt.expected) { + t.Errorf("length mismatch: expected %d, got %d", len(tt.expected), len(result)) + return + } + + for i := range result { + if !floatApproxEqual(result[i], tt.expected[i], 1e-5) { + t.Errorf("value mismatch at index %d: expected %v, got %v", + i, tt.expected[i], result[i]) + } + } + + // Verify original input was not modified + if 
!reflect.DeepEqual(tt.input, inputCopy) { + t.Errorf("original input was modified: was %v, now %v", inputCopy, tt.input) + } + + // For valid multi-vectors, verify each sub-vector has unit magnitude + if tt.dims > 0 && len(tt.input) > 0 && len(tt.input)%tt.dims == 0 { + numVecs := len(result) / tt.dims + for i := 0; i < numVecs; i++ { + subVec := result[i*tt.dims : (i+1)*tt.dims] + mag := magnitude(subVec) + // Allow for zero vectors (magnitude 0) or unit vectors (magnitude 1) + if mag > 1e-6 && !floatApproxEqual(mag, 1.0, 1e-5) { + t.Errorf("sub-vector %d has magnitude %v, expected 1.0", i, mag) + } + } + } + }) + } +} + +// Helper to compute magnitude of a vector +func magnitude(v []float32) float32 { + var sum float32 + for _, x := range v { + sum += x * x + } + return float32(math.Sqrt(float64(sum))) +} + +// Helper for approximate float comparison +func floatApproxEqual(a, b, epsilon float32) bool { + diff := a - b + if diff < 0 { + diff = -diff + } + return diff < epsilon +} diff --git a/search_knn_test.go b/search_knn_test.go index f518d337e..7d110b5ec 100644 --- a/search_knn_test.go +++ b/search_knn_test.go @@ -1645,6 +1645,143 @@ func TestNestedVectors(t *testing.T) { } } +// TestMultiVectorCosineNormalization verifies that multi-vector fields are +// normalized correctly with cosine similarity. Each sub-vector in a multi-vector +// should be independently normalized, producing correct similarity scores. +func TestMultiVectorCosineNormalization(t *testing.T) { + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) + + const dims = 3 + + // Create index with cosine similarity + indexMapping := NewIndexMapping() + vecFieldMapping := mapping.NewVectorFieldMapping() + vecFieldMapping.Dims = dims + vecFieldMapping.Similarity = index.CosineSimilarity + indexMapping.DefaultMapping.AddFieldMappingsAt("vec", vecFieldMapping) + + // Multi-vector field + vecFieldMappingNested := mapping.NewVectorFieldMapping() + vecFieldMappingNested.Dims = dims + vecFieldMappingNested.Similarity = index.CosineSimilarity + indexMapping.DefaultMapping.AddFieldMappingsAt("vec_nested", vecFieldMappingNested) + + idx, err := New(tmpIndexPath, indexMapping) + if err != nil { + t.Fatal(err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + docsString := []string{ + `{"vec": [3, 0, 0]}`, + `{"vec": [0, 4, 0]}`, + `{"vec_nested": [[3, 0, 0], [0, 4, 0]]}`, + } + + for i, docStr := range docsString { + var doc map[string]interface{} + err = json.Unmarshal([]byte(docStr), &doc) + if err != nil { + t.Fatal(err) + } + err = idx.Index(fmt.Sprintf("doc%d", i+1), doc) + if err != nil { + t.Fatal(err) + } + } + + // Query for X direction [1,0,0] + searchReq := NewSearchRequest(query.NewMatchNoneQuery()) + searchReq.AddKNN("vec", []float32{1, 0, 0}, 3, 1.0) + res, err := idx.Search(searchReq) + if err != nil { + t.Fatal(err) + } + if len(res.Hits) != 2 { + t.Fatalf("expected 2 hits, got %d", len(res.Hits)) + } + // Hit 1 should be doc1 with score 1.0 (perfect match) + if res.Hits[0].ID != "doc1" { + t.Fatalf("expected doc1 as first hit, got %s", res.Hits[0].ID) + } + if math.Abs(float64(res.Hits[0].Score-1.0)) > 1e-6 { + t.Fatalf("expected score 1.0, got %f", res.Hits[0].Score) + } + // Hit 2 should be doc2 with a score of 0.0 (orthogonal) + if res.Hits[1].ID != "doc2" { + t.Fatalf("expected doc2 as second hit, got %s", res.Hits[1].ID) + } + if math.Abs(float64(res.Hits[1].Score-0.0)) > 1e-6 { + t.Fatalf("expected score 0.0, got %f", res.Hits[1].Score) + } + 
+ // Query for Y direction [0,1,0] + searchReq = NewSearchRequest(query.NewMatchNoneQuery()) + searchReq.AddKNN("vec", []float32{0, 1, 0}, 3, 1.0) + res, err = idx.Search(searchReq) + if err != nil { + t.Fatal(err) + } + if len(res.Hits) != 2 { + t.Fatalf("expected 2 hits, got %d", len(res.Hits)) + } + // Hit 1 should be doc2 with score 1.0 (perfect match) + if res.Hits[0].ID != "doc2" { + t.Fatalf("expected doc2 as first hit, got %s", res.Hits[0].ID) + } + if math.Abs(float64(res.Hits[0].Score-1.0)) > 1e-6 { + t.Fatalf("expected score 1.0, got %f", res.Hits[0].Score) + } + // Hit 2 should be doc1 with a score of 0.0 (orthogonal) + if res.Hits[1].ID != "doc1" { + t.Fatalf("expected doc1 as second hit, got %s", res.Hits[1].ID) + } + if math.Abs(float64(res.Hits[1].Score-0.0)) > 1e-6 { + t.Fatalf("expected score 0.0, got %f", res.Hits[1].Score) + } + + // Now test querying the nested multi-vector field + searchReq = NewSearchRequest(query.NewMatchNoneQuery()) + searchReq.AddKNN("vec_nested", []float32{1, 0, 0}, 3, 1.0) + res, err = idx.Search(searchReq) + if err != nil { + t.Fatal(err) + } + if len(res.Hits) != 1 { + t.Fatalf("expected 1 hit, got %d", len(res.Hits)) + } + // Hit should be doc3 with score 1.0 (perfect match on first sub-vector) + if res.Hits[0].ID != "doc3" { + t.Fatalf("expected doc3 as first hit, got %s", res.Hits[0].ID) + } + if math.Abs(float64(res.Hits[0].Score-1.0)) > 1e-6 { + t.Fatalf("expected score 1.0, got %f", res.Hits[0].Score) + } + // Query for Y direction [0,1,0] on nested field + searchReq = NewSearchRequest(query.NewMatchNoneQuery()) + searchReq.AddKNN("vec_nested", []float32{0, 1, 0}, 3, 1.0) + res, err = idx.Search(searchReq) + if err != nil { + t.Fatal(err) + } + if len(res.Hits) != 1 { + t.Fatalf("expected 1 hit, got %d", len(res.Hits)) + } + // Hit should be doc3 with score 1.0 (perfect match on second sub-vector) + if res.Hits[0].ID != "doc3" { + t.Fatalf("expected doc3 as first hit, got %s", res.Hits[0].ID) + } + if math.Abs(float64(res.Hits[0].Score-1.0)) > 1e-6 { + t.Fatalf("expected score 1.0, got %f", res.Hits[0].Score) + } +} + func TestNumVecsStat(t *testing.T) { dataset, _, err := readDatasetAndQueries(testInputCompressedFile) From 0250c8ffe2eba2e34a6228262b392fae08cd41d8 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Wed, 10 Dec 2025 14:30:12 +0530 Subject: [PATCH 66/70] revert again --- index_alias_impl.go | 11 +---------- search/util.go | 36 ------------------------------------ search_knn.go | 41 +++++------------------------------------ 3 files changed, 6 insertions(+), 82 deletions(-) diff --git a/index_alias_impl.go b/index_alias_impl.go index 41e78f1f2..8212c74b9 100644 --- a/index_alias_impl.go +++ b/index_alias_impl.go @@ -946,16 +946,7 @@ func finalizePreSearchResult(req *SearchRequest, flags *preSearchFlags, preSearc return } if flags.knn { - knnHits := preSearchResult.Hits - // we are done calculating the final top K vectors, so we need to prepare - // the payload for returning the final results. - knnHits = prepareKNNResults(req, knnHits) - // if score fusion is not requested, then finalize the KNN results now. - // else, defer the finalization to the score fusion phase. 
- if !IsScoreFusionRequested(req) { - knnHits = finalizeKNNResults(req, knnHits) - } - preSearchResult.Hits = knnHits + preSearchResult.Hits = finalizeKNNResults(req, preSearchResult.Hits) } } diff --git a/search/util.go b/search/util.go index 2406aa749..005fda67d 100644 --- a/search/util.go +++ b/search/util.go @@ -237,39 +237,3 @@ type BM25Stats struct { DocCount float64 `json:"doc_count"` FieldCardinality map[string]int `json:"field_cardinality"` } - -// MergeScoreExplBreakdown merges two score breakdown maps and their explanations together -// by picking the best score per query component, and merging them -// (and their corresponding explanations) into the first map. -func MergeScoreExplBreakdown(first, second map[int]float64, firstExpl, secondExpl *Explanation) (map[int]float64, *Explanation) { - if first == nil { - return second, secondExpl - } - if second == nil { - return first, firstExpl - } - // pick the best score per query component between the two maps - for k, score := range second { - if existing, ok := first[k]; !ok || existing < score { - first[k] = score - if firstExpl != nil && secondExpl != nil { - // Ensure Children slices are non-nil and long enough - if firstExpl.Children == nil || len(firstExpl.Children) <= k { - newLen := k + 1 - newChildren := make([]*Explanation, newLen) - if firstExpl.Children != nil { - copy(newChildren, firstExpl.Children) - } - firstExpl.Children = newChildren - } - if secondExpl.Children == nil || len(secondExpl.Children) <= k { - // If secondExpl.Children is nil or too short, skip assignment - // (or could set to nil, but here we skip) - continue - } - firstExpl.Children[k] = secondExpl.Children[k] - } - } - } - return first, firstExpl -} diff --git a/search_knn.go b/search_knn.go index e395d1085..8c3731ac1 100644 --- a/search_knn.go +++ b/search_knn.go @@ -458,14 +458,7 @@ func (i *indexImpl) runKnnCollector(ctx context.Context, req *SearchRequest, rea } knnHits = knnCollector.Results() if !preSearch { - // we are done calculating the final top K vectors, so we need to prepare - // the payload for returning the final results. - knnHits = prepareKNNResults(req, knnHits) - // if score fusion is not requested, then finalize the KNN results now. - // else, defer the finalization to the score fusion phase. - if !IsScoreFusionRequested(req) { - knnHits = finalizeKNNResults(req, knnHits) - } + knnHits = finalizeKNNResults(req, knnHits) } // at this point, irrespective of whether it is a preSearch or not, // the knn hits are populated with Sort and Fields. @@ -495,7 +488,7 @@ func setKnnHitsInCollector(knnHits []*search.DocumentMatch, req *SearchRequest, } } -func prepareKNNResults(req *SearchRequest, knnHits []*search.DocumentMatch) []*search.DocumentMatch { +func finalizeKNNResults(req *SearchRequest, knnHits []*search.DocumentMatch) []*search.DocumentMatch { // if the KNN operator is AND, then we need to filter out the hits that // do not have match the KNN queries. if req.KNNOperator == knnOperatorAnd { @@ -508,35 +501,11 @@ func prepareKNNResults(req *SearchRequest, knnHits []*search.DocumentMatch) []*s } knnHits = knnHits[:idx] } - if len(knnHits) == 0 { + // if score fusion required, return early because + // score breakdown is retained + if IsScoreFusionRequested(req) { return knnHits } - // we may be getting multiple vectors for the same document, so - // we need to deduplicate the hits based on the Document ID. - // sort the hits based on the Document ID. 
- sort.Slice(knnHits, func(i, j int) bool { - return knnHits[i].ID < knnHits[j].ID - }) - rv := knnHits[:1] - lastUnique := rv[0] - for i := 1; i < len(knnHits); i++ { - current := knnHits[i] - if current.ID != lastUnique.ID { - rv = append(rv, current) - lastUnique = current - } else { - // we have a duplicate document, so we take the best score breakdown - // for each KNN query. - lastUnique.ScoreBreakdown, lastUnique.Expl = search.MergeScoreExplBreakdown( - lastUnique.ScoreBreakdown, current.ScoreBreakdown, - lastUnique.Expl, current.Expl) - } - } - knnHits = rv - return knnHits -} - -func finalizeKNNResults(req *SearchRequest, knnHits []*search.DocumentMatch) []*search.DocumentMatch { // fix the score using score breakdown now // if the score is none, then we need to set the score to 0.0 // if req.Explain is true, then we need to use the expl breakdown to From d2faeb6e45be93c1dcf42ecd63391b128f097f99 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Wed, 10 Dec 2025 14:44:49 +0530 Subject: [PATCH 67/70] fix test --- search_knn_test.go | 15 +++++++-------- search_no_knn.go | 4 ---- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/search_knn_test.go b/search_knn_test.go index 7914aeac2..df21cff4c 100644 --- a/search_knn_test.go +++ b/search_knn_test.go @@ -1652,15 +1652,14 @@ func TestNestedVectors(t *testing.T) { } // ----------------------------------------------------------------------------- -// TestKNNMerger tests the KNN merger functionality which handles duplicate -// document matches from the KNN searcher. When a document has multiple vectors +// TestMultiVector tests the KNN functionality which handles duplicate +// vectors being matched within the same document. When a document has multiple vectors // (via [[]] array of vectors or [{}] array of objects with vectors), the KNN -// searcher may return the same document multiple times with different scores. -// The merger must: -// 1. Detect duplicates by IndexInternalID -// 2. Merge score breakdowns, keeping the best score per KNN query -// 3. Properly flush the last document after iteration completes -func TestKNNMerger(t *testing.T) { +// searcher must pick the best scoring vector match for that document. 
This test covers these scenarios: +// - Single vector field (baseline) +// - [[]] style: array of vectors (same doc appears multiple times) +// - [{}] style: array of objects with vector field (chunks pattern) +func TestMultiVector(t *testing.T) { tmpIndexPath := createTmpIndexPath(t) defer cleanupTmpIndexPath(t, tmpIndexPath) diff --git a/search_no_knn.go b/search_no_knn.go index 3e9bad9f6..172f258ec 100644 --- a/search_no_knn.go +++ b/search_no_knn.go @@ -224,10 +224,6 @@ func constructKnnPreSearchData(mergedOut map[string]map[string]interface{}, preS return mergedOut, nil } -func prepareKNNResults(req *SearchRequest, knnHits []*search.DocumentMatch) []*search.DocumentMatch { - return knnHits -} - func finalizeKNNResults(req *SearchRequest, knnHits []*search.DocumentMatch) []*search.DocumentMatch { return knnHits } From dd7d1b228a8713037dd758f4ec061658c99faba8 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Wed, 10 Dec 2025 14:45:47 +0530 Subject: [PATCH 68/70] remove newline --- search_knn.go | 1 + 1 file changed, 1 insertion(+) diff --git a/search_knn.go b/search_knn.go index 8c3731ac1..54771ede0 100644 --- a/search_knn.go +++ b/search_knn.go @@ -501,6 +501,7 @@ func finalizeKNNResults(req *SearchRequest, knnHits []*search.DocumentMatch) []* } knnHits = knnHits[:idx] } + // if score fusion required, return early because // score breakdown is retained if IsScoreFusionRequested(req) { From d8aafeae40a0dce7e0768313b7faee0e07d60980 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Thu, 11 Dec 2025 18:50:15 +0530 Subject: [PATCH 69/70] finally --- search/collector/knn.go | 3 --- search/collector/topn.go | 14 +++++++++++++- search/search.go | 15 --------------- search_knn.go | 32 +------------------------------- 4 files changed, 14 insertions(+), 50 deletions(-) diff --git a/search/collector/knn.go b/search/collector/knn.go index dbda24252..465bf6927 100644 --- a/search/collector/knn.go +++ b/search/collector/knn.go @@ -136,11 +136,8 @@ type KNNCollector struct { took time.Duration results search.DocumentMatchCollection maxScore float64 - - nestedStore *collectStoreNested } -// NewKNNCollector creates a new KNNCollector for the given K values and size. func NewKNNCollector(kArray []int64, size int64) *KNNCollector { return &KNNCollector{ knnStore: GetNewKNNCollectorStore(kArray), diff --git a/search/collector/topn.go b/search/collector/topn.go index 395005428..bab318d5c 100644 --- a/search/collector/topn.go +++ b/search/collector/topn.go @@ -140,7 +140,19 @@ func newTopNCollector(size int, skip int, sort search.SortOrder, nr index.Nested // merge field term locations parent.FieldTermLocations = search.MergeFieldTermLocationsFromMatch(parent.FieldTermLocations, child) // add child's ID to parent's Descendants - parent.AddDescendantID(child.IndexInternalID) + // add other as descendant only if it is not the same document + if !parent.IndexInternalID.Equals(child.IndexInternalID) { + // Add a copy of child.IndexInternalID to descendants, because + // child.IndexInternalID will be reset when 'child' is recycled. 
+ var descendantID index.IndexInternalID + // first check if parent's descendants slice has capacity to reuse + if len(parent.Descendants) < cap(parent.Descendants) { + // reuse the buffer element at len(parent.Descendants) + descendantID = parent.Descendants[:len(parent.Descendants)+1][len(parent.Descendants)] + } + // copy the contents of id into descendantID, allocating if needed + parent.Descendants = append(parent.Descendants, index.NewIndexInternalIDFrom(descendantID, child.IndexInternalID)) + } return nil } hc.nestedStore = newStoreNested(nr, search.DescendantAdderCallbackFn(descAdder)) diff --git a/search/search.go b/search/search.go index 2996fa6f7..541bbe42a 100644 --- a/search/search.go +++ b/search/search.go @@ -382,21 +382,6 @@ func (dm *DocumentMatch) String() string { return fmt.Sprintf("[%s-%f]", dm.ID, dm.Score) } -func (dm *DocumentMatch) AddDescendantID(id index.IndexInternalID) { - // add other as descendant only if it is not the same document - if !dm.IndexInternalID.Equals(id) { - // Add a copy of id to descendants - var descendantID index.IndexInternalID - // first check if dm's descendants slice has capacity to reuse - if len(dm.Descendants) < cap(dm.Descendants) { - // reuse the buffer element at len(dm.Descendants) - descendantID = dm.Descendants[:len(dm.Descendants)+1][len(dm.Descendants)] - } - // copy the contents of id into descendantID, allocating if needed - dm.Descendants = append(dm.Descendants, index.NewIndexInternalIDFrom(descendantID, id)) - } -} - type DocumentMatchCollection []*DocumentMatch func (c DocumentMatchCollection) Len() int { return len(c) } diff --git a/search_knn.go b/search_knn.go index dabd130a2..203d02629 100644 --- a/search_knn.go +++ b/search_knn.go @@ -24,7 +24,6 @@ import ( "sort" "github.com/blevesearch/bleve/v2/document" - "github.com/blevesearch/bleve/v2/mapping" "github.com/blevesearch/bleve/v2/search" "github.com/blevesearch/bleve/v2/search/collector" "github.com/blevesearch/bleve/v2/search/query" @@ -452,10 +451,7 @@ func (i *indexImpl) runKnnCollector(ctx context.Context, req *SearchRequest, rea err = serr } }() - knnCollector, err := i.buildKNNCollector(ctx, KNNQuery, reader, kArray, sumOfK) - if err != nil { - return nil, err - } + knnCollector := collector.NewKNNCollector(kArray, sumOfK) err = knnCollector.Collect(ctx, knnSearcher, reader) if err != nil { return nil, err @@ -485,9 +481,6 @@ func setKnnHitsInCollector(knnHits []*search.DocumentMatch, coll *collector.TopN ftsMatch.Score += knnMatch.Score // Combine the FTS explanation with the KNN explanation, if present ftsMatch.Expl.MergeWith(knnMatch.Expl) - // Add the Descendants from the KNN match to the FTS match, deduplicating them on the way - // The Descendants of a DocumentMatch is always sorted, and we must maintain that invariant - ftsMatch.Descendants = search.SortedUnion(ftsMatch.Descendants, knnMatch.Descendants) } coll.SetKNNHits(knnHits, search.HybridMergeCallbackFn(mergeFn)) } @@ -688,26 +681,3 @@ func (r *rescorer) restoreKnnRequest() { r.req.KNN[i].Boost = &b } } - -func (i *indexImpl) buildKNNCollector(ctx context.Context, KNNQuery query.Query, reader index.IndexReader, kArray []int64, sumOfK int64) (*collector.KNNCollector, error) { - // check if we are in nested mode - if nestedMode, ok := ctx.Value(search.NestedSearchKey).(bool); ok && nestedMode { - // get the nested reader from the index reader - if nr, ok := reader.(index.NestedReader); ok { - // check if the KNN query intersects with the nested mapping - if nm, ok := i.m.(mapping.NestedMapping); 
ok { - var fs search.FieldSet - var err error - fs, err = query.ExtractFields(KNNQuery, i.m, fs) - if err != nil { - return nil, err - } - if nm.IntersectsPrefix(fs) { - return collector.NewNestedKNNCollector(kArray, sumOfK, nr), nil - } - } - } - } - - return collector.NewKNNCollector(kArray, sumOfK), nil -} From 835b14228352649066f49943950968d9f4d7c4bd Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Fri, 12 Dec 2025 19:19:07 +0530 Subject: [PATCH 70/70] fix test --- search_knn_test.go | 270 ++++++++++++++++++++++++++++----------------- 1 file changed, 166 insertions(+), 104 deletions(-) diff --git a/search_knn_test.go b/search_knn_test.go index fc97081d2..988f8ec73 100644 --- a/search_knn_test.go +++ b/search_knn_test.go @@ -1663,134 +1663,196 @@ func TestMultiVector(t *testing.T) { tmpIndexPath := createTmpIndexPath(t) defer cleanupTmpIndexPath(t, tmpIndexPath) - const dims = 3 + // JSON documents covering merger scenarios: + // - Single vector (baseline) + // - [[]] style: array of vectors (same doc appears multiple times) + // - [{}] style: array of objects with vector field (chunks pattern) + docs := map[string]string{ + // Single vector - baseline + "doc1": `{ + "vec": [10, 10, 10], + "vecB": [100, 100, 100] + }`, + // [[]] style - array of 2 vectors + "doc2": `{ + "vec": [[0, 0, 0], [500, 500, 500]], + "vecB": [[900, 900, 900], [950, 950, 950], [975, 975, 975], [990, 990, 990]] + }`, + // [[]] style - array of 3 vectors + "doc3": `{ + "vec": [[50, 50, 50], [200, 200, 200], [400, 400, 400]], + "vecB": [[800, 800, 800], [850, 850, 850]] + }`, + // Single vector - baseline + "doc4": `{ + "vec": [1000, 1000, 1000], + "vecB": [1, 1, 1] + }`, + // [{}] style - array of objects with vector field (chunks pattern) + "doc5": `{ + "chunks": [ + {"vec": [10, 10, 10], "text": "chunk1"}, + {"vec": [20, 20, 20], "text": "chunk2"}, + {"vec": [30, 30, 30], "text": "chunk3"}, + {"vec": [40, 40, 40], "text": "chunk4"} + ] + }`, + "doc6": `{ + "chunks": [ + {"vec": [[10, 10, 10],[20, 20, 20]], "text": "chunk1"}, + {"vec": [[30, 30, 30],[40, 40, 40]], "text": "chunk2"} + ] + }`, + } + + // Parse JSON documents + dataset := make(map[string]map[string]interface{}) + for docID, jsonStr := range docs { + var doc map[string]interface{} + if err := json.Unmarshal([]byte(jsonStr), &doc); err != nil { + t.Fatalf("failed to unmarshal %s: %v", docID, err) + } + dataset[docID] = doc + } - // Create index with cosine similarity + // Index mapping indexMapping := NewIndexMapping() - vecFieldMapping := mapping.NewVectorFieldMapping() - vecFieldMapping.Dims = dims - vecFieldMapping.Similarity = index.CosineSimilarity - indexMapping.DefaultMapping.AddFieldMappingsAt("vec", vecFieldMapping) - // Multi-vector field - vecFieldMappingNested := mapping.NewVectorFieldMapping() - vecFieldMappingNested.Dims = dims - vecFieldMappingNested.Similarity = index.CosineSimilarity - indexMapping.DefaultMapping.AddFieldMappingsAt("vec_nested", vecFieldMappingNested) + vecMapping := mapping.NewVectorFieldMapping() + vecMapping.Dims = 3 + vecMapping.Similarity = index.InnerProduct + indexMapping.DefaultMapping.AddFieldMappingsAt("vec", vecMapping) + indexMapping.DefaultMapping.AddFieldMappingsAt("vecB", vecMapping) + + // Nested chunks mapping for [{}] style + chunksMapping := mapping.NewDocumentMapping() + chunksMapping.AddFieldMappingsAt("vec", vecMapping) + indexMapping.DefaultMapping.AddSubDocumentMapping("chunks", chunksMapping) + // Create and populate index idx, err := New(tmpIndexPath, indexMapping) if err != nil { 
t.Fatal(err) } defer func() { - err := idx.Close() - if err != nil { + if err := idx.Close(); err != nil { t.Fatal(err) } }() - docsString := []string{ - `{"vec": [3, 0, 0]}`, - `{"vec": [0, 4, 0]}`, - `{"vec_nested": [[3, 0, 0], [0, 4, 0]]}`, + batch := idx.NewBatch() + for docID, doc := range dataset { + if err := batch.Index(docID, doc); err != nil { + t.Fatal(err) + } + } + if err := idx.Batch(batch); err != nil { + t.Fatal(err) } - for i, docStr := range docsString { - var doc map[string]interface{} - err = json.Unmarshal([]byte(docStr), &doc) + // Test: Single KNN query - basic functionality + t.Run("VecFieldSingle", func(t *testing.T) { + searchReq := NewSearchRequest(query.NewMatchNoneQuery()) + searchReq.AddKNN("vec", []float32{1, 1, 1}, 20, 1.0) + res, err := idx.Search(searchReq) if err != nil { t.Fatal(err) } - err = idx.Index(fmt.Sprintf("doc%d", i+1), doc) + // Inner product: score = sum(query_i * doc_i) + // doc1 vec=[10,10,10]: 1*10*3 = 30 + // doc2 vec best is [500,500,500]: 1*500*3 = 1500 + // doc3 vec best is [400,400,400]: 1*400*3 = 1200 + // doc4 vec=[1000,1000,1000]: 1*1000*3 = 3000 + expectedResult := []struct { + docID string + expectedScore float64 + }{ + {docID: "doc4", expectedScore: 3000}, + {docID: "doc2", expectedScore: 1500}, + {docID: "doc3", expectedScore: 1200}, + {docID: "doc1", expectedScore: 30}, + } + + if len(res.Hits) != len(expectedResult) { + t.Fatalf("expected %d hits, got %d", len(expectedResult), len(res.Hits)) + } + + for i, expected := range expectedResult { + if res.Hits[i].ID != expected.docID { + t.Fatalf("at rank %d, expected docID %s, got %s", i+1, expected.docID, res.Hits[i].ID) + } + if res.Hits[i].Score != expected.expectedScore { + t.Fatalf("at rank %d, expected score %v, got %v", i+1, expected.expectedScore, res.Hits[i].Score) + } + } + }) + + // Test: Single KNN query on vecB field + t.Run("VecBFieldSingle", func(t *testing.T) { + searchReq := NewSearchRequest(query.NewMatchNoneQuery()) + searchReq.AddKNN("vecB", []float32{1000, 1000, 1000}, 20, 1.0) + res, err := idx.Search(searchReq) if err != nil { t.Fatal(err) } - } + // Inner product: score = sum(query_i * doc_i) for each dimension + // doc1: vecB=[100,100,100] -> 1000*100*3 = 300,000 + // doc2: vecB best is [990,990,990] -> 1000*990*3 = 2,970,000 + // doc3: vecB best is [850,850,850] -> 1000*850*3 = 2,550,000 + // doc4: vecB=[1,1,1] -> 1000*1*3 = 3,000 + expectedResult := []struct { + docID string + expectedScore float64 + }{ + {docID: "doc2", expectedScore: 2970000}, + {docID: "doc3", expectedScore: 2550000}, + {docID: "doc1", expectedScore: 300000}, + {docID: "doc4", expectedScore: 3000}, + } - // Query for X direction [1,0,0] - searchReq := NewSearchRequest(query.NewMatchNoneQuery()) - searchReq.AddKNN("vec", []float32{1, 0, 0}, 3, 1.0) - res, err := idx.Search(searchReq) - if err != nil { - t.Fatal(err) - } - if len(res.Hits) != 2 { - t.Fatalf("expected 2 hits, got %d", len(res.Hits)) - } - // Hit 1 should be doc1 with score 1.0 (perfect match) - if res.Hits[0].ID != "doc1" { - t.Fatalf("expected doc1 as first hit, got %s", res.Hits[0].ID) - } - if math.Abs(float64(res.Hits[0].Score-1.0)) > 1e-6 { - t.Fatalf("expected score 1.0, got %f", res.Hits[0].Score) - } - // Hit 2 should be doc2 with a score of 0.0 (orthogonal) - if res.Hits[1].ID != "doc2" { - t.Fatalf("expected doc2 as second hit, got %s", res.Hits[1].ID) - } - if math.Abs(float64(res.Hits[1].Score-0.0)) > 1e-6 { - t.Fatalf("expected score 0.0, got %f", res.Hits[1].Score) - } + if len(res.Hits) != 
len(expectedResult) { + t.Fatalf("expected %d hits, got %d", len(expectedResult), len(res.Hits)) + } - // Query for Y direction [0,1,0] - searchReq = NewSearchRequest(query.NewMatchNoneQuery()) - searchReq.AddKNN("vec", []float32{0, 1, 0}, 3, 1.0) - res, err = idx.Search(searchReq) - if err != nil { - t.Fatal(err) - } - if len(res.Hits) != 2 { - t.Fatalf("expected 2 hits, got %d", len(res.Hits)) - } - // Hit 1 should be doc2 with score 1.0 (perfect match) - if res.Hits[0].ID != "doc2" { - t.Fatalf("expected doc2 as first hit, got %s", res.Hits[0].ID) - } - if math.Abs(float64(res.Hits[0].Score-1.0)) > 1e-6 { - t.Fatalf("expected score 1.0, got %f", res.Hits[0].Score) - } - // Hit 2 should be doc1 with a score of 0.0 (orthogonal) - if res.Hits[1].ID != "doc1" { - t.Fatalf("expected doc1 as second hit, got %s", res.Hits[1].ID) - } - if math.Abs(float64(res.Hits[1].Score-0.0)) > 1e-6 { - t.Fatalf("expected score 0.0, got %f", res.Hits[1].Score) - } + for i, expected := range expectedResult { + if res.Hits[i].ID != expected.docID { + t.Fatalf("at rank %d, expected docID %s, got %s", i+1, expected.docID, res.Hits[i].ID) + } + if res.Hits[i].Score != expected.expectedScore { + t.Fatalf("at rank %d, expected score %v, got %v", i+1, expected.expectedScore, res.Hits[i].Score) + } + } + }) - // Now test querying the nested multi-vector field - searchReq = NewSearchRequest(query.NewMatchNoneQuery()) - searchReq.AddKNN("vec_nested", []float32{1, 0, 0}, 3, 1.0) - res, err = idx.Search(searchReq) - if err != nil { - t.Fatal(err) - } - if len(res.Hits) != 1 { - t.Fatalf("expected 1 hit, got %d", len(res.Hits)) - } - // Hit should be doc3 with score 1.0 (perfect match on first sub-vector) - if res.Hits[0].ID != "doc3" { - t.Fatalf("expected doc3 as first hit, got %s", res.Hits[0].ID) - } - if math.Abs(float64(res.Hits[0].Score-1.0)) > 1e-6 { - t.Fatalf("expected score 1.0, got %f", res.Hits[0].Score) - } - // Query for Y direction [0,1,0] on nested field - searchReq = NewSearchRequest(query.NewMatchNoneQuery()) - searchReq.AddKNN("vec_nested", []float32{0, 1, 0}, 3, 1.0) - res, err = idx.Search(searchReq) - if err != nil { - t.Fatal(err) - } - if len(res.Hits) != 1 { - t.Fatalf("expected 1 hit, got %d", len(res.Hits)) - } - // Hit should be doc3 with score 1.0 (perfect match on second sub-vector) - if res.Hits[0].ID != "doc3" { - t.Fatalf("expected doc3 as first hit, got %s", res.Hits[0].ID) - } - if math.Abs(float64(res.Hits[0].Score-1.0)) > 1e-6 { - t.Fatalf("expected score 1.0, got %f", res.Hits[0].Score) - } + // Test: Single KNN query on nested chunks.vec field + t.Run("ChunksVecFieldSingle", func(t *testing.T) { + searchReq := NewSearchRequest(query.NewMatchNoneQuery()) + searchReq.AddKNN("chunks.vec", []float32{1, 1, 1}, 20, 1.0) + searchReq.SortBy([]string{"_score", "docID"}) + res, err := idx.Search(searchReq) + if err != nil { + t.Fatal(err) + } + + // Only doc5 and doc6 have chunks.vec + // doc5 chunks: [10,10,10], [20,20,20], [30,30,30], [40,40,40] + // Best score: 1*40*3 = 120 + // doc6 chunks: [[10,10,10],[20,20,20]], [[30,30,30],[40,40,40]] + // Best score: 1*40*3 = 120 + if len(res.Hits) != 2 { + t.Fatalf("expected 2 hits, got %d", len(res.Hits)) + } + + // Both should have score 120 + for _, hit := range res.Hits { + if hit.ID != "doc5" && hit.ID != "doc6" { + t.Fatalf("unexpected docID %s, expected doc5 or doc6", hit.ID) + } + if hit.Score != 120 { + t.Fatalf("for %s, expected score 120, got %v", hit.ID, hit.Score) + } + } + }) } // TestMultiVectorCosineNormalization verifies that 
multi-vector fields are