Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
79 commits
Select commit Hold shift + click to select a range
16093d1
rebase
CascadingRadium Nov 17, 2025
abaddc7
minor UT change
CascadingRadium Nov 17, 2025
f8f4061
revert gomod change
CascadingRadium Nov 17, 2025
b6ac3e1
typos
CascadingRadium Nov 17, 2025
a5fab17
frankUT
CascadingRadium Nov 17, 2025
2be276f
fix array of arrays
CascadingRadium Nov 19, 2025
9b68719
bug fixes and UTs
CascadingRadium Nov 20, 2025
dddf0ab
update readme, add docs
CascadingRadium Nov 22, 2025
7883233
fix render
CascadingRadium Nov 22, 2025
7b058a6
lint fix
CascadingRadium Nov 22, 2025
311f578
bug fixes
CascadingRadium Nov 25, 2025
106dcb3
fix bugs
CascadingRadium Nov 26, 2025
0d5bafc
perf optimization
CascadingRadium Nov 26, 2025
9da53cf
perf optimization
CascadingRadium Nov 26, 2025
53a5699
perf optimiztion 2
CascadingRadium Nov 26, 2025
32bfd15
fix prefilter case
CascadingRadium Nov 27, 2025
e341e38
cleanup
CascadingRadium Nov 27, 2025
b94610d
cleanup
CascadingRadium Nov 27, 2025
204aa23
performance optimization 3
CascadingRadium Nov 27, 2025
056d963
fix knn
CascadingRadium Nov 27, 2025
e13bcc5
add vector use case in md
CascadingRadium Nov 27, 2025
7c58c80
perf optimization 4
CascadingRadium Nov 27, 2025
9ee54b1
use clear
CascadingRadium Nov 27, 2025
2f6769a
perf opt 7
CascadingRadium Nov 27, 2025
4841173
rewire api to avoid alloc
CascadingRadium Nov 28, 2025
2e318dc
use new API
CascadingRadium Nov 28, 2025
fa38d98
reuse descendantIDs
CascadingRadium Nov 28, 2025
7857fef
fix bug
CascadingRadium Nov 28, 2025
5eb7766
heuristic for Advance()
CascadingRadium Nov 29, 2025
77f4b32
trivial fix
CascadingRadium Nov 29, 2025
f4b8c1f
fix String() method
CascadingRadium Nov 30, 2025
7215a0a
performance optimization
CascadingRadium Dec 2, 2025
63edd06
Merge branch 'master' into nested
CascadingRadium Dec 2, 2025
f347198
fix merge
CascadingRadium Dec 2, 2025
933a97c
fix merge 2
CascadingRadium Dec 2, 2025
0edb0ad
hybrid search fix part 1
CascadingRadium Dec 2, 2025
35ba4ff
Fix duplicate results when performing KNN search
CascadingRadium Dec 3, 2025
234b4aa
fix duplicate issue
CascadingRadium Dec 3, 2025
3b5d30c
code review
CascadingRadium Dec 3, 2025
43a4845
Merge branch 'knnDup' into nested
CascadingRadium Dec 3, 2025
c2cc750
fix stat
CascadingRadium Dec 3, 2025
e6cd8ea
Fix vector field aliase validation
CascadingRadium Dec 3, 2025
fc13aca
unit tests
CascadingRadium Dec 3, 2025
56e46fa
Fix vector normalization to handle multi-vectors correctly
CascadingRadium Dec 4, 2025
781335b
Fix vector normalization to handle multi-vectors correctly
CascadingRadium Dec 4, 2025
4e3891f
UT
CascadingRadium Dec 4, 2025
9dae832
merge conflict
CascadingRadium Dec 4, 2025
d654a70
Fix duplicate results when performing KNN search
CascadingRadium Dec 3, 2025
7e65ecd
code review
CascadingRadium Dec 3, 2025
8838f89
fix dedup logic
CascadingRadium Dec 4, 2025
9850084
Merge branch 'knnDup' into nested
CascadingRadium Dec 4, 2025
42c98f1
unit test
CascadingRadium Dec 4, 2025
c4dd9d4
fix
CascadingRadium Dec 4, 2025
351d8be
Apply suggestions from code review
CascadingRadium Dec 4, 2025
2db8199
go fmt ./...
abhinavdangeti Dec 4, 2025
a5fd255
fix total calc
CascadingRadium Dec 4, 2025
b914204
fix edge case
CascadingRadium Dec 4, 2025
6b153a0
fix test
CascadingRadium Dec 4, 2025
9ac8392
Merge branch 'knnDup' into cosineFix
CascadingRadium Dec 5, 2025
68760c2
use normalizeVector for base64
CascadingRadium Dec 5, 2025
99e2120
fix merge conflict
CascadingRadium Dec 5, 2025
b1f596c
Merge branch 'cosineFix' into nested
CascadingRadium Dec 5, 2025
8721d16
Fix interface
CascadingRadium Dec 5, 2025
70798cc
Merge branch 'knnDup' into nested
CascadingRadium Dec 5, 2025
f3ed293
fix KNN case
CascadingRadium Dec 5, 2025
a233b67
MB-69655: Fix vector normalization to handle multi-vectors correctly …
CascadingRadium Dec 8, 2025
dd2422d
revert
CascadingRadium Dec 8, 2025
f3540a6
fix
CascadingRadium Dec 8, 2025
fcb0d76
fix
CascadingRadium Dec 8, 2025
fc32bb8
Revert "MB-69655: Fix vector normalization to handle multi-vectors co…
abhinavdangeti Dec 8, 2025
8a4e70e
MB-69655: Fix vector normalization to handle multi-vectors correctly …
CascadingRadium Dec 8, 2025
0250c8f
revert again
CascadingRadium Dec 10, 2025
d2faeb6
fix test
CascadingRadium Dec 10, 2025
dd7d1b2
remove newline
CascadingRadium Dec 10, 2025
2d00d0b
Merge branch 'master' into nested
CascadingRadium Dec 11, 2025
210416b
Merge remote-tracking branch 'origin/cosineFix' into nested
CascadingRadium Dec 11, 2025
281e784
Merge branch 'knnDup' into nested
CascadingRadium Dec 11, 2025
d8aafea
finally
CascadingRadium Dec 11, 2025
835b142
fix test
CascadingRadium Dec 12, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ A modern indexing + search library in GO
* [geo spatial search](https://github.com/blevesearch/bleve/blob/master/geo/README.md)
* approximate k-nearest neighbors via [vector search](https://github.com/blevesearch/bleve/blob/master/docs/vectors.md)
* [synonym search](https://github.com/blevesearch/bleve/blob/master/docs/synonyms.md)
* [hierarchy search](https://github.com/blevesearch/bleve/blob/master/docs/hierarchy.md)
* [tf-idf](https://github.com/blevesearch/bleve/blob/master/docs/scoring.md#tf-idf) / [bm25](https://github.com/blevesearch/bleve/blob/master/docs/scoring.md#bm25) scoring models
* Hybrid search: exact + semantic
* Supports [RRF (Reciprocal Rank Fusion) and RSF (Relative Score Fusion)](docs/score_fusion.md)
Expand Down
376 changes: 376 additions & 0 deletions docs/hierarchy.md

Large diffs are not rendered by default.

37 changes: 35 additions & 2 deletions document/document.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
"fmt"
"reflect"

"github.com/blevesearch/bleve/v2/search"
"github.com/blevesearch/bleve/v2/size"
index "github.com/blevesearch/bleve_index_api"
)
Expand All @@ -30,8 +31,9 @@
}

type Document struct {
id string `json:"id"`
Fields []Field `json:"fields"`
id string
Fields []Field `json:"fields"`
NestedDocuments []*Document `json:"nested_documents"`
CompositeFields []*CompositeField
StoredFieldsSize uint64
indexed bool
Expand Down Expand Up @@ -157,3 +159,34 @@
func (d *Document) Indexed() bool {
return d.indexed
}

// AddNestedDocument appends doc as an immediate nested (child) document of d.
func (d *Document) AddNestedDocument(doc *Document) {
	d.NestedDocuments = append(d.NestedDocuments, doc)
}

func (d *Document) NestedFields() search.FieldSet {
if len(d.NestedDocuments) == 0 {
return nil
}
fieldSet := search.NewFieldSet()
var collectFields func(index.Document)
collectFields = func(doc index.Document) {
// Add all field names from this nested document
doc.VisitFields(func(field index.Field) {
fieldSet.AddField(field.Name())
})
// Recursively collect from this document's nested documents
if nd, ok := doc.(index.NestedDocument); ok {

Check failure on line 179 in document/document.go

View workflow job for this annotation

GitHub Actions / coverage

undefined: index.NestedDocument

Check failure on line 179 in document/document.go

View workflow job for this annotation

GitHub Actions / test (1.23.x, ubuntu-latest)

undefined: index.NestedDocument

Check failure on line 179 in document/document.go

View workflow job for this annotation

GitHub Actions / test (1.24.x, ubuntu-latest)

undefined: index.NestedDocument

Check failure on line 179 in document/document.go

View workflow job for this annotation

GitHub Actions / test (1.25.x, ubuntu-latest)

undefined: index.NestedDocument

Check failure on line 179 in document/document.go

View workflow job for this annotation

GitHub Actions / test (1.25.x, windows-latest)

undefined: index.NestedDocument

Check failure on line 179 in document/document.go

View workflow job for this annotation

GitHub Actions / test (1.24.x, windows-latest)

undefined: index.NestedDocument

Check failure on line 179 in document/document.go

View workflow job for this annotation

GitHub Actions / test (1.23.x, macos-latest)

undefined: index.NestedDocument

Check failure on line 179 in document/document.go

View workflow job for this annotation

GitHub Actions / test (1.25.x, macos-latest)

undefined: index.NestedDocument

Check failure on line 179 in document/document.go

View workflow job for this annotation

GitHub Actions / test (1.24.x, macos-latest)

undefined: index.NestedDocument
nd.VisitNestedDocuments(collectFields)
}
}
// Start collection from nested documents only (not root document)
d.VisitNestedDocuments(collectFields)
return fieldSet
}

// VisitNestedDocuments invokes visitor once for each immediate nested
// document of d, in insertion order. It does not recurse on its own; a
// visitor that needs deeper traversal must recurse itself.
func (d *Document) VisitNestedDocuments(visitor func(doc index.Document)) {
	for idx := range d.NestedDocuments {
		visitor(d.NestedDocuments[idx])
	}
}
5 changes: 5 additions & 0 deletions index/scorch/introducer.go
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,11 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error {
newss.deleted = nil
}

// update the deleted bitmap to include any nested/sub-documents as well
// if the segment supports that
if ns, ok := newss.segment.(segment.NestedSegment); ok {
newss.deleted = ns.AddNestedDocuments(newss.deleted)
}
// check for live size before copying
if newss.LiveSize() > 0 {
newSnapshot.segment = append(newSnapshot.segment, newss)
Expand Down
6 changes: 6 additions & 0 deletions index/scorch/scorch.go
Original file line number Diff line number Diff line change
Expand Up @@ -799,6 +799,12 @@ func analyze(d index.Document, fn customAnalyzerPluginInitFunc) {
}
}
})
if nd, ok := d.(index.NestedDocument); ok {
nd.VisitNestedDocuments(func(doc index.Document) {
doc.AddIDField()
analyze(doc, fn)
})
}
}

func (s *Scorch) AddEligibleForRemoval(epoch uint64) {
Expand Down
59 changes: 29 additions & 30 deletions index/scorch/snapshot_index.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ package scorch
import (
"container/heap"
"context"
"encoding/binary"
"fmt"
"os"
"path/filepath"
Expand All @@ -42,9 +41,8 @@ type asynchSegmentResult struct {
dict segment.TermDictionary
dictItr segment.DictionaryIterator

cardinality int
index int
docs *roaring.Bitmap
index int
docs *roaring.Bitmap

thesItr segment.ThesaurusIterator

Expand All @@ -59,11 +57,11 @@ func init() {
var err error
lb1, err = lev.NewLevenshteinAutomatonBuilder(1, true)
if err != nil {
panic(fmt.Errorf("Levenshtein automaton ed1 builder err: %v", err))
panic(fmt.Errorf("levenshtein automaton ed1 builder err: %v", err))
}
lb2, err = lev.NewLevenshteinAutomatonBuilder(2, true)
if err != nil {
panic(fmt.Errorf("Levenshtein automaton ed2 builder err: %v", err))
panic(fmt.Errorf("levenshtein automaton ed2 builder err: %v", err))
}
}

Expand Down Expand Up @@ -474,7 +472,7 @@ func (is *IndexSnapshot) GetInternal(key []byte) ([]byte, error) {
func (is *IndexSnapshot) DocCount() (uint64, error) {
var rv uint64
for _, segment := range is.segment {
rv += segment.Count()
rv += segment.CountRoot()
}
return rv, nil
}
Expand All @@ -501,7 +499,7 @@ func (is *IndexSnapshot) Document(id string) (rv index.Document, err error) {
return nil, nil
}

docNum, err := docInternalToNumber(next.ID)
docNum, err := next.ID.Value()
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -571,7 +569,7 @@ func (is *IndexSnapshot) segmentIndexAndLocalDocNumFromGlobal(docNum uint64) (in
}

func (is *IndexSnapshot) ExternalID(id index.IndexInternalID) (string, error) {
docNum, err := docInternalToNumber(id)
docNum, err := id.Value()
if err != nil {
return "", err
}
Expand All @@ -589,7 +587,7 @@ func (is *IndexSnapshot) ExternalID(id index.IndexInternalID) (string, error) {
}

func (is *IndexSnapshot) segmentIndexAndLocalDocNum(id index.IndexInternalID) (int, uint64, error) {
docNum, err := docInternalToNumber(id)
docNum, err := id.Value()
if err != nil {
return 0, 0, err
}
Expand Down Expand Up @@ -776,25 +774,6 @@ func (is *IndexSnapshot) recycleTermFieldReader(tfr *IndexSnapshotTermFieldReade
is.m2.Unlock()
}

// docNumberToBytes encodes the document number in as 8 big-endian bytes,
// reusing buf's backing storage when it already has capacity for 8 bytes.
func docNumberToBytes(buf []byte, in uint64) []byte {
	switch {
	case len(buf) == 8:
		// already the right length; write in place
	case cap(buf) >= 8:
		buf = buf[:8]
	default:
		buf = make([]byte, 8)
	}
	binary.BigEndian.PutUint64(buf, in)
	return buf
}

// docInternalToNumber decodes an 8-byte big-endian internal ID back into a
// document number, returning an error for any other length.
func docInternalToNumber(in index.IndexInternalID) (uint64, error) {
	if len(in) == 8 {
		return binary.BigEndian.Uint64(in), nil
	}
	return 0, fmt.Errorf("wrong len for IndexInternalID: %q", in)
}

func (is *IndexSnapshot) documentVisitFieldTermsOnSegment(
segmentIndex int, localDocNum uint64, fields []string, cFields []string,
visitor index.DocValueVisitor, dvs segment.DocVisitState) (
Expand Down Expand Up @@ -897,7 +876,7 @@ func (dvr *DocValueReader) BytesRead() uint64 {
func (dvr *DocValueReader) VisitDocValues(id index.IndexInternalID,
visitor index.DocValueVisitor,
) (err error) {
docNum, err := docInternalToNumber(id)
docNum, err := id.Value()
if err != nil {
return err
}
Expand Down Expand Up @@ -1297,3 +1276,23 @@ func (is *IndexSnapshot) TermFrequencies(field string, limit int, descending boo

return termFreqs[:limit], nil
}

// Ancestors returns the ancestor IDs for the given document ID. The prealloc
// slice can be provided to avoid allocations downstream, and MUST be empty.
func (is *IndexSnapshot) Ancestors(ID index.IndexInternalID, prealloc []index.AncestorID) ([]index.AncestorID, error) {
	// resolve the segment and the segment-local doc number for the ID
	seg, ldoc, err := is.segmentIndexAndLocalDocNum(ID)
	if err != nil {
		return nil, err
	}
	// collect the ancestors local to that segment
	prealloc = is.segment[seg].Ancestors(ldoc, prealloc)
	// translate segment-local doc numbers to global doc numbers by applying
	// the segment's offset within the snapshot (correcting factor for
	// multi-segment indexes)
	globalOffset := is.offsets[seg]
	for idx := range prealloc {
		prealloc[idx] = prealloc[idx].Add(globalOffset)
	}
	return prealloc, nil
}
5 changes: 2 additions & 3 deletions index/scorch/snapshot_index_doc.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
package scorch

import (
"bytes"
"reflect"

"github.com/RoaringBitmap/roaring/v2"
Expand Down Expand Up @@ -49,7 +48,7 @@ func (i *IndexSnapshotDocIDReader) Next() (index.IndexInternalID, error) {
next := i.iterators[i.segmentOffset].Next()
// make segment number into global number by adding offset
globalOffset := i.snapshot.offsets[i.segmentOffset]
return docNumberToBytes(nil, uint64(next)+globalOffset), nil
return index.NewIndexInternalID(nil, uint64(next)+globalOffset), nil
}
return nil, nil
}
Expand All @@ -63,7 +62,7 @@ func (i *IndexSnapshotDocIDReader) Advance(ID index.IndexInternalID) (index.Inde
if next == nil {
return nil, nil
}
for bytes.Compare(next, ID) < 0 {
for next.Compare(ID) < 0 {
next, err = i.Next()
if err != nil {
return nil, err
Expand Down
9 changes: 4 additions & 5 deletions index/scorch/snapshot_index_tfr.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
package scorch

import (
"bytes"
"context"
"fmt"
"reflect"
Expand Down Expand Up @@ -94,7 +93,7 @@ func (i *IndexSnapshotTermFieldReader) Next(preAlloced *index.TermFieldDoc) (*in
// make segment number into global number by adding offset
globalOffset := i.snapshot.offsets[i.segmentOffset]
nnum := next.Number()
rv.ID = docNumberToBytes(rv.ID, nnum+globalOffset)
rv.ID = index.NewIndexInternalID(rv.ID, nnum+globalOffset)
i.postingToTermFieldDoc(next, rv)

i.currID = rv.ID
Expand Down Expand Up @@ -146,7 +145,7 @@ func (i *IndexSnapshotTermFieldReader) postingToTermFieldDoc(next segment.Postin
func (i *IndexSnapshotTermFieldReader) Advance(ID index.IndexInternalID, preAlloced *index.TermFieldDoc) (*index.TermFieldDoc, error) {
// FIXME do something better
// for now, if we need to seek backwards, then restart from the beginning
if i.currPosting != nil && bytes.Compare(i.currID, ID) >= 0 {
if i.currPosting != nil && i.currID.Compare(ID) >= 0 {
// Check if the TFR is a special unadorned composite optimization.
// Such a TFR will NOT have a valid `term` or `field` set, making it
// impossible for the TFR to replace itself with a new one.
Expand All @@ -171,7 +170,7 @@ func (i *IndexSnapshotTermFieldReader) Advance(ID index.IndexInternalID, preAllo
}
}
}
num, err := docInternalToNumber(ID)
num, err := ID.Value()
if err != nil {
return nil, fmt.Errorf("error converting to doc number % x - %v", ID, err)
}
Expand All @@ -196,7 +195,7 @@ func (i *IndexSnapshotTermFieldReader) Advance(ID index.IndexInternalID, preAllo
if preAlloced == nil {
preAlloced = &index.TermFieldDoc{}
}
preAlloced.ID = docNumberToBytes(preAlloced.ID, next.Number()+
preAlloced.ID = index.NewIndexInternalID(preAlloced.ID, next.Number()+
i.snapshot.offsets[segIndex])
i.postingToTermFieldDoc(next, preAlloced)
i.currID = preAlloced.ID
Expand Down
9 changes: 4 additions & 5 deletions index/scorch/snapshot_index_vr.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
package scorch

import (
"bytes"
"context"
"encoding/json"
"fmt"
Expand Down Expand Up @@ -96,7 +95,7 @@ func (i *IndexSnapshotVectorReader) Next(preAlloced *index.VectorDoc) (
// make segment number into global number by adding offset
globalOffset := i.snapshot.offsets[i.segmentOffset]
nnum := next.Number()
rv.ID = docNumberToBytes(rv.ID, nnum+globalOffset)
rv.ID = index.NewIndexInternalID(rv.ID, nnum+globalOffset)
rv.Score = float64(next.Score())

i.currID = rv.ID
Expand All @@ -113,7 +112,7 @@ func (i *IndexSnapshotVectorReader) Next(preAlloced *index.VectorDoc) (
func (i *IndexSnapshotVectorReader) Advance(ID index.IndexInternalID,
preAlloced *index.VectorDoc) (*index.VectorDoc, error) {

if i.currPosting != nil && bytes.Compare(i.currID, ID) >= 0 {
if i.currPosting != nil && i.currID.Compare(ID) >= 0 {
i2, err := i.snapshot.VectorReader(i.ctx, i.vector, i.field, i.k,
i.searchParams, i.eligibleSelector)
if err != nil {
Expand All @@ -124,7 +123,7 @@ func (i *IndexSnapshotVectorReader) Advance(ID index.IndexInternalID,
*i = *(i2.(*IndexSnapshotVectorReader))
}

num, err := docInternalToNumber(ID)
num, err := ID.Value()
if err != nil {
return nil, fmt.Errorf("error converting to doc number % x - %v", ID, err)
}
Expand All @@ -149,7 +148,7 @@ func (i *IndexSnapshotVectorReader) Advance(ID index.IndexInternalID,
if preAlloced == nil {
preAlloced = &index.VectorDoc{}
}
preAlloced.ID = docNumberToBytes(preAlloced.ID, next.Number()+
preAlloced.ID = index.NewIndexInternalID(preAlloced.ID, next.Number()+
i.snapshot.offsets[segIndex])
i.currID = preAlloced.ID
i.currPosting = next
Expand Down
21 changes: 21 additions & 0 deletions index/scorch/snapshot_segment.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,19 @@ func (s *SegmentSnapshot) Count() uint64 {
return rv
}

// CountRoot returns the number of live root documents in this segment. It
// differs from Count(), which counts all live documents including nested
// children; CountRoot counts only live root documents. Segments without
// nested-document support have no children, so Count() is used directly.
func (s *SegmentSnapshot) CountRoot() uint64 {
	var rv uint64
	if nsb, ok := s.segment.(segment.NestedSegment); ok {
		rv = nsb.CountRoot(s.deleted)
	} else {
		rv = s.Count()
	}
	return rv
}

func (s *SegmentSnapshot) DocNumbers(docIDs []string) (*roaring.Bitmap, error) {
rv, err := s.segment.DocNumbers(docIDs)
if err != nil {
Expand Down Expand Up @@ -361,3 +374,11 @@ func (c *cachedMeta) fetchMeta(field string) (rv interface{}) {
c.m.RUnlock()
return rv
}

// Ancestors appends the ancestor IDs of the given segment-local doc number to
// prealloc and returns the result. For segments that do not support nested
// documents, the document is treated as its own sole ancestor.
func (s *SegmentSnapshot) Ancestors(docNum uint64, prealloc []index.AncestorID) []index.AncestorID {
	nsb, ok := s.segment.(segment.NestedSegment)
	if !ok {
		return append(prealloc, index.NewAncestorID(docNum))
	}
	return nsb.Ancestors(docNum, prealloc)
}
Loading
Loading