Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 74 additions & 0 deletions centroid_index_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
//go:build vectors
// +build vectors

package bleve

import (
	"encoding/json"
	"fmt"
	"os"
	"path/filepath"
	"testing"

	"github.com/blevesearch/bleve/v2/analysis/lang/en"
	"github.com/blevesearch/bleve/v2/mapping"
	index "github.com/blevesearch/bleve_index_api"
)

// loadSiftData reads fts/data/datasets/vec-sift-bucket.json from the current
// user's home directory and decodes it into a slice of JSON objects, one map
// per document.
//
// The home directory is resolved explicitly with os.UserHomeDir: a leading
// "~" is a shell expansion that Go's os package does not perform, so passing
// "~/..." to os.ReadFile looks for a directory literally named "~" under the
// working directory.
//
// It returns an error if the home directory cannot be determined, the file
// cannot be read, or its contents cannot be decoded into the result type.
func loadSiftData() ([]map[string]interface{}, error) {
	home, err := os.UserHomeDir()
	if err != nil {
		return nil, fmt.Errorf("resolving home directory: %w", err)
	}
	path := filepath.Join(home, "fts", "data", "datasets", "vec-sift-bucket.json")

	// os.ReadFile errors already carry the path (*fs.PathError).
	fileContent, err := os.ReadFile(path)
	if err != nil {
		return nil, fmt.Errorf("reading sift dataset: %w", err)
	}

	var documents []map[string]interface{}
	if err := json.Unmarshal(fileContent, &documents); err != nil {
		return nil, fmt.Errorf("decoding sift dataset %q: %w", path, err)
	}
	return documents, nil
}

// TestCentroidIndex creates an index whose default mapping has an English
// analyzed text field ("content") and a 128-dimensional Euclidean vector
// field ("vector"), batches up to maxTrainDocs documents returned by
// loadSiftData under ids prefixed with index.TrainDataPrefix, and hands that
// batch to idx.Train.
//
// The test fails if either dataset cannot be loaded, if the index cannot be
// created or closed, or if batching or training returns an error.
func TestCentroidIndex(t *testing.T) {
	// Upper bound on the number of documents placed in the training batch.
	const maxTrainDocs = 100000

	// Only the error is inspected; the returned dataset and queries are unused.
	if _, _, err := readDatasetAndQueries(testInputCompressedFile); err != nil {
		t.Fatal(err)
	}

	documents, err := loadSiftData()
	if err != nil {
		t.Fatal(err)
	}

	contentFieldMapping := NewTextFieldMapping()
	contentFieldMapping.Analyzer = en.AnalyzerName

	vecFieldMappingL2 := mapping.NewVectorFieldMapping()
	vecFieldMappingL2.Dims = 128
	vecFieldMappingL2.Similarity = index.EuclideanDistance

	indexMappingL2Norm := NewIndexMapping()
	indexMappingL2Norm.DefaultMapping.AddFieldMappingsAt("content", contentFieldMapping)
	indexMappingL2Norm.DefaultMapping.AddFieldMappingsAt("vector", vecFieldMappingL2)

	idx, err := newIndexUsing(t.TempDir(), indexMappingL2Norm, Config.DefaultIndexType, Config.DefaultKVStore, nil)
	if err != nil {
		t.Fatal(err)
	}
	defer func() {
		err := idx.Close()
		if err != nil {
			t.Fatal(err)
		}
	}()

	// Clamp instead of slicing blindly: documents[:maxTrainDocs] panics when
	// the dataset holds fewer entries than the cap.
	trainDocs := documents
	if len(trainDocs) > maxTrainDocs {
		trainDocs = trainDocs[:maxTrainDocs]
	}

	batch := idx.NewBatch()
	for _, doc := range trainDocs {
		// doc["id"] is an interface{} decoded from JSON; %v renders both
		// string and numeric ids, whereas %s yields "%!s(float64=...)" for
		// the latter.
		docID := fmt.Sprintf("%s:%v", index.TrainDataPrefix, doc["id"])
		if err := batch.Index(docID, doc); err != nil {
			t.Fatal(err)
		}
	}

	if err := idx.Train(batch); err != nil {
		t.Fatal(err)
	}
}
22 changes: 22 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,25 @@ require (
github.com/spf13/pflag v1.0.6 // indirect
golang.org/x/sys v0.29.0 // indirect
)

// NOTE(review): every replace directive below points at an absolute path
// under /Users/thejas.orkombu, so this module only resolves on a machine
// that has those checkouts. Presumably these are temporary overrides for
// local development of this draft — confirm, and drop them (restoring the
// corresponding go.sum entries) before merging.
replace github.com/blevesearch/bleve/v2 => /Users/thejas.orkombu/fts/blevesearch/bleve

replace github.com/blevesearch/zapx/v11 => /Users/thejas.orkombu/fts/blevesearch/zapx11

replace github.com/blevesearch/zapx/v12 => /Users/thejas.orkombu/fts/blevesearch/zapx12

replace github.com/blevesearch/zapx/v13 => /Users/thejas.orkombu/fts/blevesearch/zapx13

replace github.com/blevesearch/zapx/v14 => /Users/thejas.orkombu/fts/blevesearch/zapx14

replace github.com/blevesearch/zapx/v15 => /Users/thejas.orkombu/fts/blevesearch/zapx15

replace github.com/blevesearch/zapx/v16 => /Users/thejas.orkombu/fts/blevesearch/zapx

replace github.com/blevesearch/scorch_segment_api/v2 => /Users/thejas.orkombu/fts/blevesearch/scorch_segment_api

replace github.com/blevesearch/go-faiss => /Users/thejas.orkombu/fts/blevesearch/go-faiss

replace github.com/blevesearch/bleve_index_api => /Users/thejas.orkombu/fts/blevesearch/bleve_index_api

replace github.com/blevesearch/sear => /Users/thejas.orkombu/fts/blevesearch/sear
18 changes: 0 additions & 18 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,8 @@ github.com/RoaringBitmap/roaring/v2 v2.4.5/go.mod h1:FiJcsfkGje/nZBZgCu0ZxCPOKD/
github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
github.com/bits-and-blooms/bitset v1.22.0 h1:Tquv9S8+SGaS3EhyA+up3FXzmkhxPGjQQCkcs2uw7w4=
github.com/bits-and-blooms/bitset v1.22.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
github.com/blevesearch/bleve_index_api v1.2.11 h1:bXQ54kVuwP8hdrXUSOnvTQfgK0KI1+f9A0ITJT8tX1s=
github.com/blevesearch/bleve_index_api v1.2.11/go.mod h1:rKQDl4u51uwafZxFrPD1R7xFOwKnzZW7s/LSeK4lgo0=
github.com/blevesearch/geo v0.2.4 h1:ECIGQhw+QALCZaDcogRTNSJYQXRtC8/m8IKiA706cqk=
github.com/blevesearch/geo v0.2.4/go.mod h1:K56Q33AzXt2YExVHGObtmRSFYZKYGv0JEN5mdacJJR8=
github.com/blevesearch/go-faiss v1.0.26 h1:4dRLolFgjPyjkaXwff4NfbZFdE/dfywbzDqporeQvXI=
github.com/blevesearch/go-faiss v1.0.26/go.mod h1:OMGQwOaRRYxrmeNdMrXJPvVx8gBnvE5RYrr0BahNnkk=
github.com/blevesearch/go-metrics v0.0.0-20201227073835-cf1acfcdf475 h1:kDy+zgJFJJoJYBvdfBSiZYBbdsUL0XcjHYWezpQBGPA=
github.com/blevesearch/go-metrics v0.0.0-20201227073835-cf1acfcdf475/go.mod h1:9eJDeqxJ3E7WnLebQUlPD7ZjSce7AnDb9vjGmMCbD0A=
github.com/blevesearch/go-porterstemmer v1.0.3 h1:GtmsqID0aZdCSNiY8SkuPJ12pD4jI+DdXTAn4YRcHCo=
Expand All @@ -20,8 +16,6 @@ github.com/blevesearch/gtreap v0.1.1/go.mod h1:QaQyDRAT51sotthUWAH4Sj08awFSSWzgY
github.com/blevesearch/mmap-go v1.0.2/go.mod h1:ol2qBqYaOUsGdm7aRMRrYGgPvnwLe6Y+7LMvAB5IbSA=
github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCDPWmc=
github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs=
github.com/blevesearch/scorch_segment_api/v2 v2.3.13 h1:ZPjv/4VwWvHJZKeMSgScCapOy8+DdmsmRyLmSB88UoY=
github.com/blevesearch/scorch_segment_api/v2 v2.3.13/go.mod h1:ENk2LClTehOuMS8XzN3UxBEErYmtwkE7MAArFTXs9Vc=
github.com/blevesearch/segment v0.9.1 h1:+dThDy+Lvgj5JMxhmOVlgFfkUtZV2kw49xax4+jTfSU=
github.com/blevesearch/segment v0.9.1/go.mod h1:zN21iLm7+GnBHWTao9I+Au/7MBiL8pPFtJBJTsk6kQw=
github.com/blevesearch/snowball v0.6.1 h1:cDYjn/NCH+wwt2UdehaLpr2e4BwLIjN4V/TdLsL+B5A=
Expand All @@ -34,18 +28,6 @@ github.com/blevesearch/upsidedown_store_api v1.0.2 h1:U53Q6YoWEARVLd1OYNc9kvhBMG
github.com/blevesearch/upsidedown_store_api v1.0.2/go.mod h1:M01mh3Gpfy56Ps/UXHjEO/knbqyQ1Oamg8If49gRwrQ=
github.com/blevesearch/vellum v1.1.0 h1:CinkGyIsgVlYf8Y2LUQHvdelgXr6PYuvoDIajq6yR9w=
github.com/blevesearch/vellum v1.1.0/go.mod h1:QgwWryE8ThtNPxtgWJof5ndPfx0/YMBh+W2weHKPw8Y=
github.com/blevesearch/zapx/v11 v11.4.2 h1:l46SV+b0gFN+Rw3wUI1YdMWdSAVhskYuvxlcgpQFljs=
github.com/blevesearch/zapx/v11 v11.4.2/go.mod h1:4gdeyy9oGa/lLa6D34R9daXNUvfMPZqUYjPwiLmekwc=
github.com/blevesearch/zapx/v12 v12.4.2 h1:fzRbhllQmEMUuAQ7zBuMvKRlcPA5ESTgWlDEoB9uQNE=
github.com/blevesearch/zapx/v12 v12.4.2/go.mod h1:TdFmr7afSz1hFh/SIBCCZvcLfzYvievIH6aEISCte58=
github.com/blevesearch/zapx/v13 v13.4.2 h1:46PIZCO/ZuKZYgxI8Y7lOJqX3Irkc3N8W82QTK3MVks=
github.com/blevesearch/zapx/v13 v13.4.2/go.mod h1:knK8z2NdQHlb5ot/uj8wuvOq5PhDGjNYQQy0QDnopZk=
github.com/blevesearch/zapx/v14 v14.4.2 h1:2SGHakVKd+TrtEqpfeq8X+So5PShQ5nW6GNxT7fWYz0=
github.com/blevesearch/zapx/v14 v14.4.2/go.mod h1:rz0XNb/OZSMjNorufDGSpFpjoFKhXmppH9Hi7a877D8=
github.com/blevesearch/zapx/v15 v15.4.2 h1:sWxpDE0QQOTjyxYbAVjt3+0ieu8NCE0fDRaFxEsp31k=
github.com/blevesearch/zapx/v15 v15.4.2/go.mod h1:1pssev/59FsuWcgSnTa0OeEpOzmhtmr/0/11H0Z8+Nw=
github.com/blevesearch/zapx/v16 v16.2.7 h1:xcgFRa7f/tQXOwApVq7JWgPYSlzyUMmkuYa54tMDuR0=
github.com/blevesearch/zapx/v16 v16.2.7/go.mod h1:murSoCJPCk25MqURrcJaBQ1RekuqSCSfMjXH4rHyA14=
github.com/couchbase/ghistogram v0.1.0 h1:b95QcQTCzjTUocDXp/uMgSNQi8oj1tGwnJ4bODWZnps=
github.com/couchbase/ghistogram v0.1.0/go.mod h1:s1Jhy76zqfEecpNWJfWUiKZookAFaiGOEoyzgHt9i7k=
github.com/couchbase/moss v0.2.0 h1:VCYrMzFwEryyhRSeI+/b3tRBSeTpi/8gn5Kf6dxqn+o=
Expand Down
11 changes: 11 additions & 0 deletions index.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,12 @@ func (b *Batch) Index(id string, data interface{}) error {
eventIndex.FireIndexEvent()
}
doc := document.NewDocument(id)
// fmt.Printf("data is before mapping %#v\n", data)
err := b.index.Mapping().MapDocument(doc, data)
if err != nil {
return err
}
// fmt.Printf("data is after mapping %#v\n", doc)
b.internal.Update(doc)

b.lastDocSize = uint64(doc.Size() +
Expand Down Expand Up @@ -353,6 +355,11 @@ type IndexCopyable interface {
CopyTo(d index.Directory) error
}

// IndexFileCopyable declares two file-level operations: UpdateFileInBolt,
// which takes a key and a value as byte slices, and CopyFile, which takes a
// file name and a destination index.IndexDirectory. Both report failure
// through their error result.
//
// NOTE(review): the precise semantics (what is stored in bolt, whether file
// is a bare name or a path) are not visible from this declaration — confirm
// against the implementation.
type IndexFileCopyable interface {
	// UpdateFileInBolt presumably records value under key in the index's
	// bolt store; verify against the implementation.
	UpdateFileInBolt(key, value []byte) error

	// CopyFile presumably copies the named file into d; verify against the
	// implementation.
	CopyFile(file string, d index.IndexDirectory) error
}

// FileSystemDirectory is the default implementation for the
// index.Directory interface.
type FileSystemDirectory string
Expand Down Expand Up @@ -396,3 +403,7 @@ type InsightsIndex interface {
// CentroidCardinalities returns the centroids (clusters) from IVF indexes ordered by data density.
CentroidCardinalities(field string, limit int, desceding bool) ([]index.CentroidCardinality, error)
}
// VectorIndex is an Index that additionally exposes Train, which accepts a
// *Batch and reports failure through its error result.
//
// NOTE(review): presumably the batch carries the documents used to train the
// vector index — confirm against the implementation.
type VectorIndex interface {
	Index

	Train(batch *Batch) error
}
11 changes: 6 additions & 5 deletions index/scorch/merge.go
Original file line number Diff line number Diff line change
Expand Up @@ -360,8 +360,9 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context,

atomic.AddUint64(&s.stats.TotFileMergeZapBeg, 1)
prevBytesReadTotal := cumulateBytesRead(segmentsToMerge)
newDocNums, _, err := s.segPlugin.Merge(segmentsToMerge, docsToDrop, path,
cw.cancelCh, s)

newDocNums, _, err := s.segPlugin.MergeEx(segmentsToMerge, docsToDrop, path,
cw.cancelCh, s, s.segmentConfig)
atomic.AddUint64(&s.stats.TotFileMergeZapEnd, 1)

fileMergeZapTime := uint64(time.Since(fileMergeZapStartTime))
Expand All @@ -379,7 +380,7 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context,
return fmt.Errorf("merging failed: %v", err)
}

seg, err = s.segPlugin.Open(path)
seg, err = s.segPlugin.OpenEx(path, s.segmentConfig)
if err != nil {
s.unmarkIneligibleForRemoval(filename)
atomic.AddUint64(&s.stats.TotFileMergePlanTasksErr, 1)
Expand Down Expand Up @@ -528,7 +529,7 @@ func (s *Scorch) mergeAndPersistInMemorySegments(snapshot *IndexSnapshot,
// the newly merged segment is already flushed out to disk, just needs
// to be opened using mmap.
newDocIDs, _, err :=
s.segPlugin.Merge(segsBatch, dropsBatch, path, s.closeCh, s)
s.segPlugin.MergeEx(segsBatch, dropsBatch, path, s.closeCh, s, s.segmentConfig)
if err != nil {
em.Lock()
errs = append(errs, err)
Expand All @@ -543,7 +544,7 @@ func (s *Scorch) mergeAndPersistInMemorySegments(snapshot *IndexSnapshot,
s.markIneligibleForRemoval(filename)
newMergedSegmentIDs[id] = newSegmentID
newDocIDsSet[id] = newDocIDs
newMergedSegments[id], err = s.segPlugin.Open(path)
newMergedSegments[id], err = s.segPlugin.OpenEx(path, s.segmentConfig)
if err != nil {
em.Lock()
errs = append(errs, err)
Expand Down
32 changes: 30 additions & 2 deletions index/scorch/persister.go
Original file line number Diff line number Diff line change
Expand Up @@ -793,7 +793,7 @@ func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot, exclude map[uint
}
}()
for segmentID, path := range newSegmentPaths {
newSegments[segmentID], err = s.segPlugin.Open(path)
newSegments[segmentID], err = s.segPlugin.OpenEx(path, s.segmentConfig)
if err != nil {
return fmt.Errorf("error opening new segment at %s, %v", path, err)
}
Expand Down Expand Up @@ -842,6 +842,22 @@ func zapFileName(epoch uint64) string {
return fmt.Sprintf("%012x.zap", epoch)
}

// updateCentroidIndex loads the segment described by bucket via loadSegment
// and installs the resulting snapshot as s.centroidIndex while holding
// s.rootLock.
//
// A nil bucket is a no-op: nil is returned and s.centroidIndex is left
// untouched. An error from loadSegment is returned unchanged, also leaving
// s.centroidIndex untouched.
//
// NOTE(review): a previously installed centroidIndex is overwritten without
// being released here — confirm whether the old snapshot needs closing.
func (s *Scorch) updateCentroidIndex(bucket *bolt.Bucket) error {
	if bucket == nil {
		return nil
	}

	// Load before taking the lock so the root lock is only held for the
	// pointer swap.
	segmentSnapshot, err := s.loadSegment(bucket)
	if err != nil {
		return err
	}

	s.rootLock.Lock()
	defer s.rootLock.Unlock()
	s.centroidIndex = segmentSnapshot
	return nil
}

// bolt snapshot code

func (s *Scorch) loadFromBolt() error {
Expand All @@ -862,6 +878,12 @@ func (s *Scorch) loadFromBolt() error {
s.AddEligibleForRemoval(snapshotEpoch)
continue
}
// fmt.Println("loadFromBolt key %s", k)
// if k[0] == util.BoltCentroidIndexKey[0] {
// fmt.Println("loadFromBolt centroid index key", string(k))

// continue
// }
snapshot := snapshots.Bucket(k)
if snapshot == nil {
log.Printf("snapshot key, but bucket missing %x, continuing", k)
Expand Down Expand Up @@ -893,6 +915,12 @@ func (s *Scorch) loadFromBolt() error {

foundRoot = true
}

centroidIndexBucket := snapshots.Bucket(util.BoltCentroidIndexKey)
err := s.updateCentroidIndex(centroidIndexBucket)
if err != nil {
return err
}
return nil
})
if err != nil {
Expand Down Expand Up @@ -1005,7 +1033,7 @@ func (s *Scorch) loadSegment(segmentBucket *bolt.Bucket) (*SegmentSnapshot, erro
return nil, fmt.Errorf("segment path missing")
}
segmentPath := s.path + string(os.PathSeparator) + string(pathBytes)
seg, err := s.segPlugin.Open(segmentPath)
seg, err := s.segPlugin.OpenEx(segmentPath, s.segmentConfig)
if err != nil {
return nil, fmt.Errorf("error opening bolt segment: %v", err)
}
Expand Down
Loading