diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go
index 3f2a330c5..664e6dfe9 100644
--- a/index/scorch/snapshot_index.go
+++ b/index/scorch/snapshot_index.go
@@ -815,6 +815,10 @@ func (is *IndexSnapshot) documentVisitFieldTermsOnSegment(
 	// Filter out fields that have been completely deleted or had their
 	// docvalues data deleted from both visitable fields and required fields
 	filterUpdatedFields := func(fields []string) []string {
+		// fast path: if no updatedFields just return the input
+		if len(is.updatedFields) == 0 {
+			return fields
+		}
 		filteredFields := make([]string, 0, len(fields))
 		for _, field := range fields {
 			if info, ok := is.updatedFields[field]; ok &&
@@ -826,8 +830,8 @@ func (is *IndexSnapshot) documentVisitFieldTermsOnSegment(
 		return filteredFields
 	}
 
-	fieldsFiltered := filterUpdatedFields(fields)
-	vFieldsFiltered := filterUpdatedFields(vFields)
+	fields = filterUpdatedFields(fields)
+	vFields = filterUpdatedFields(vFields)
 
 	var errCh chan error
 
@@ -836,9 +840,9 @@ func (is *IndexSnapshot) documentVisitFieldTermsOnSegment(
 	// if the caller happens to know we're on the same segmentIndex
 	// from a previous invocation
 	if cFields == nil {
-		cFields = subtractStrings(fieldsFiltered, vFieldsFiltered)
+		cFields = subtractStrings(fields, vFields)
 
-		if !ss.cachedDocs.hasFields(cFields) {
+		if len(cFields) > 0 && !ss.cachedDocs.hasFields(cFields) {
 			errCh = make(chan error, 1)
 
 			go func() {
@@ -851,8 +855,8 @@ func (is *IndexSnapshot) documentVisitFieldTermsOnSegment(
 		}
 	}
 
-	if ssvOk && ssv != nil && len(vFieldsFiltered) > 0 {
-		dvs, err = ssv.VisitDocValues(localDocNum, fieldsFiltered, visitor, dvs)
+	if ssvOk && ssv != nil && len(vFields) > 0 {
+		dvs, err = ssv.VisitDocValues(localDocNum, fields, visitor, dvs)
 		if err != nil {
 			return nil, nil, err
 		}
@@ -980,17 +984,15 @@ func subtractStrings(a, b []string) []string {
 		return a
 	}
 
-	// Create a map for O(1) lookups
-	bMap := make(map[string]struct{}, len(b))
-	for _, bs := range b {
-		bMap[bs] = struct{}{}
-	}
-
 	rv := make([]string, 0, len(a))
+OUTER:
 	for _, as := range a {
-		if _, exists := bMap[as]; !exists {
-			rv = append(rv, as)
+		for _, bs := range b {
+			if as == bs {
+				continue OUTER
+			}
 		}
+		rv = append(rv, as)
 	}
 	return rv
 }
diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go
index c6f3584cc..1421d10d0 100644
--- a/index/scorch/snapshot_segment.go
+++ b/index/scorch/snapshot_segment.go
@@ -241,7 +241,7 @@ func (cfd *cachedFieldDocs) prepareField(field string, ss *SegmentSnapshot) {
 
 type cachedDocs struct {
 	size  uint64
-	m     sync.Mutex // As the cache is asynchronously prepared, need a lock
+	m     sync.RWMutex // As the cache is asynchronously prepared, need a lock
 	cache map[string]*cachedFieldDocs // Keyed by field
 }
 
@@ -283,14 +283,14 @@ func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) e
 
 // hasFields returns true if the cache has all the given fields
 func (c *cachedDocs) hasFields(fields []string) bool {
-	c.m.Lock()
+	c.m.RLock()
 	for _, field := range fields {
 		if _, exists := c.cache[field]; !exists {
-			c.m.Unlock()
+			c.m.RUnlock()
 			return false // found a field not in cache
 		}
 	}
-	c.m.Unlock()
+	c.m.RUnlock()
 	return true
 }
 
@@ -311,13 +311,13 @@ func (c *cachedDocs) updateSizeLOCKED() {
 
 func (c *cachedDocs) visitDoc(localDocNum uint64,
 	fields []string, visitor index.DocValueVisitor) {
-	c.m.Lock()
+	c.m.RLock()
 
 	for _, field := range fields {
 		if cachedFieldDocs, exists := c.cache[field]; exists {
-			c.m.Unlock()
+			c.m.RUnlock()
 			<-cachedFieldDocs.readyCh
-			c.m.Lock()
+			c.m.RLock()
 
 			if tlist, exists := cachedFieldDocs.docs[localDocNum]; exists {
 				for {
@@ -332,7 +332,7 @@
 		}
 	}
 
-	c.m.Unlock()
+	c.m.RUnlock()
 }
 
 // the purpose of the cachedMeta is to simply allow the user of this type to record
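
The subtractStrings rewrite drops the map because the inputs here are tiny field lists: a nested scan allocates nothing for the lookup side, while the map version paid a make plus one hash insert per element. A standalone sketch of the same pattern (names hypothetical, not part of the patch):

package main

import "fmt"

// subtract returns the elements of a not present in b. For small
// slices the O(len(a)*len(b)) scan beats building a map: no map
// allocation, no hashing, and the labeled continue skips to the
// next candidate as soon as a match is found.
func subtract(a, b []string) []string {
	rv := make([]string, 0, len(a))
OUTER:
	for _, as := range a {
		for _, bs := range b {
			if as == bs {
				continue OUTER
			}
		}
		rv = append(rv, as)
	}
	return rv
}

func main() {
	fmt.Println(subtract([]string{"city", "geo", "name"}, []string{"geo"}))
	// prints [city name]
}

The same read-mostly reasoning motivates the sync.RWMutex switch above: hasFields and visitDoc only read the cache map, so concurrent readers no longer serialize on the lock, which only needs to be exclusive while prepareFields populates the cache.
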
diff --git a/numeric/prefix_coded.go b/numeric/prefix_coded.go
index 29bd0fc5c..03ba043e3 100644
--- a/numeric/prefix_coded.go
+++ b/numeric/prefix_coded.go
@@ -66,6 +66,14 @@ func MustNewPrefixCodedInt64(in int64, shift uint) PrefixCoded {
 	return rv
 }
 
+func MustNewPrefixCodedInt64Prealloc(in int64, shift uint, prealloc []byte) PrefixCoded {
+	rv, _, err := NewPrefixCodedInt64Prealloc(in, shift, prealloc)
+	if err != nil {
+		panic(err)
+	}
+	return rv
+}
+
 // Shift returns the number of bits shifted
 // returns 0 if in uninitialized state
 func (p PrefixCoded) Shift() (uint, error) {
diff --git a/search/query/boolean.go b/search/query/boolean.go
index 3bf6f9145..96df4a6d8 100644
--- a/search/query/boolean.go
+++ b/search/query/boolean.go
@@ -204,6 +204,8 @@ func (q *BooleanQuery) Searcher(ctx context.Context, i index.IndexReader, m mapp
 			// Compare document IDs
 			cmp := refDoc.IndexInternalID.Compare(d.IndexInternalID)
 			if cmp < 0 {
+				// recycle refDoc now that we do not need it
+				sctx.DocumentMatchPool.Put(refDoc)
 				// filterSearcher is behind the current document, Advance() it
 				refDoc, err = filterSearcher.Advance(sctx, d.IndexInternalID)
 				if err != nil || refDoc == nil {
diff --git a/search/searcher/search_filter.go b/search/searcher/search_filter.go
index 97d706b5f..ef070c73f 100644
--- a/search/searcher/search_filter.go
+++ b/search/searcher/search_filter.go
@@ -60,6 +60,9 @@ func (f *FilteringSearcher) Next(ctx *search.SearchContext) (*search.DocumentMat
 		if f.accept(ctx, next) {
 			return next, nil
 		}
+		// recycle this document match now, since
+		// we do not need it anymore
+		ctx.DocumentMatchPool.Put(next)
 		next, err = f.child.Next(ctx)
 	}
 	return nil, err
@@ -76,6 +79,9 @@ func (f *FilteringSearcher) Advance(ctx *search.SearchContext, ID index.IndexInt
 	if f.accept(ctx, adv) {
 		return adv, nil
 	}
+	// recycle this document match now, since
+	// we do not need it anymore
+	ctx.DocumentMatchPool.Put(adv)
 	return f.Next(ctx)
 }
 
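
The boolean.go and search_filter.go hunks apply one contract: a DocumentMatch handed out by the pool must be returned with Put the moment a searcher discards it, otherwise every rejected match is a fresh allocation that lives until GC. A minimal sketch of that get/put discipline, using a hypothetical free-list pool in place of bleve's DocumentMatchPool:

package main

import "fmt"

type DocumentMatch struct{ ID string }

// matchPool is a hypothetical stand-in for bleve's DocumentMatchPool:
// a free list that hands out zeroed matches and takes rejects back.
type matchPool struct{ free []*DocumentMatch }

func (p *matchPool) Get() *DocumentMatch {
	if n := len(p.free); n > 0 {
		dm := p.free[n-1]
		p.free = p.free[:n-1]
		return dm
	}
	return &DocumentMatch{}
}

func (p *matchPool) Put(dm *DocumentMatch) {
	*dm = DocumentMatch{} // reset before reuse
	p.free = append(p.free, dm)
}

func main() {
	pool := &matchPool{}
	accept := func(dm *DocumentMatch) bool { return dm.ID == "keep" }

	for _, id := range []string{"drop", "keep"} {
		dm := pool.Get()
		dm.ID = id
		if accept(dm) {
			fmt.Println("matched", dm.ID)
			continue
		}
		// the fix in the hunks above: recycle instead of leaking to GC
		pool.Put(dm)
	}
}

In FilteringSearcher.Next the rejected match is recycled before the child searcher is asked for the next candidate, so the pool can hand the same object straight back.
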
diff --git a/search/searcher/search_geoboundingbox.go b/search/searcher/search_geoboundingbox.go
index c2551a871..a146fa654 100644
--- a/search/searcher/search_geoboundingbox.go
+++ b/search/searcher/search_geoboundingbox.go
@@ -53,7 +53,7 @@ func NewGeoBoundingBoxSearcher(ctx context.Context, indexReader index.IndexReade
 		}
 
 		return NewFilteringSearcher(ctx, boxSearcher, buildRectFilter(ctx, dvReader,
-			field, minLon, minLat, maxLon, maxLat)), nil
+			minLon, minLat, maxLon, maxLat)), nil
 	}
 }
 
@@ -88,7 +88,7 @@ func NewGeoBoundingBoxSearcher(ctx context.Context, indexReader index.IndexReade
 		}
 		// add filter to check points near the boundary
 		onBoundarySearcher = NewFilteringSearcher(ctx, rawOnBoundarySearcher,
-			buildRectFilter(ctx, dvReader, field, minLon, minLat, maxLon, maxLat))
+			buildRectFilter(ctx, dvReader, minLon, minLat, maxLon, maxLat))
 		openedSearchers = append(openedSearchers, onBoundarySearcher)
 	}
 
@@ -205,28 +205,35 @@ func buildIsIndexedFunc(ctx context.Context, indexReader index.IndexReader, fiel
 	return isIndexed, closeF, err
 }
 
-func buildRectFilter(ctx context.Context, dvReader index.DocValueReader, field string,
+func buildRectFilter(ctx context.Context, dvReader index.DocValueReader,
 	minLon, minLat, maxLon, maxLat float64,
 ) FilterFunc {
+	// reuse the following for each document match that is checked using the filter
+	var lons, lats []float64
+	var found bool
+	dvVisitor := func(_ string, term []byte) {
+		if found {
+			// avoid redundant work if already found
+			return
+		}
+		// only consider the values which are shifted 0
+		prefixCoded := numeric.PrefixCoded(term)
+		shift, err := prefixCoded.Shift()
+		if err == nil && shift == 0 {
+			var i64 int64
+			i64, err = prefixCoded.Int64()
+			if err == nil {
+				lons = append(lons, geo.MortonUnhashLon(uint64(i64)))
+				lats = append(lats, geo.MortonUnhashLat(uint64(i64)))
+				found = true
+			}
+		}
+	}
 	return func(sctx *search.SearchContext, d *search.DocumentMatch) bool {
 		// check geo matches against all numeric type terms indexed
-		var lons, lats []float64
-		var found bool
-		err := dvReader.VisitDocValues(d.IndexInternalID, func(field string, term []byte) {
-			// only consider the values which are shifted 0
-			prefixCoded := numeric.PrefixCoded(term)
-			shift, err := prefixCoded.Shift()
-			if err == nil && shift == 0 {
-				var i64 int64
-				i64, err = prefixCoded.Int64()
-				if err == nil {
-					lons = append(lons, geo.MortonUnhashLon(uint64(i64)))
-					lats = append(lats, geo.MortonUnhashLat(uint64(i64)))
-					found = true
-				}
-			}
-		})
-		if err == nil && found {
+		lons, lats = lons[:0], lats[:0]
+		found = false
+		if err := dvReader.VisitDocValues(d.IndexInternalID, dvVisitor); err == nil && found {
 			bytes := dvReader.BytesRead()
 			if bytes > 0 {
 				reportIOStats(ctx, bytes)
diff --git a/search/searcher/search_geopointdistance.go b/search/searcher/search_geopointdistance.go
index 357ac4de3..7591bcc60 100644
--- a/search/searcher/search_geopointdistance.go
+++ b/search/searcher/search_geopointdistance.go
@@ -66,7 +66,7 @@ func NewGeoPointDistanceSearcher(ctx context.Context, indexReader index.IndexRea
 
 	// wrap it in a filtering searcher which checks the actual distance
 	return NewFilteringSearcher(ctx, rectSearcher,
-		buildDistFilter(ctx, dvReader, field, centerLon, centerLat, dist)), nil
+		buildDistFilter(ctx, dvReader, centerLon, centerLat, dist)), nil
 }
 
 // boxSearcher builds a searcher for the described bounding box
@@ -113,27 +113,33 @@ func boxSearcher(ctx context.Context, indexReader index.IndexReader,
 	return boxSearcher, nil
 }
 
-func buildDistFilter(ctx context.Context, dvReader index.DocValueReader, field string,
+func buildDistFilter(ctx context.Context, dvReader index.DocValueReader,
 	centerLon, centerLat, maxDist float64) FilterFunc {
+	// reuse the following for each document match that is checked using the filter
+	var lons, lats []float64
+	var found bool
+	dvVisitor := func(_ string, term []byte) {
+		if found {
+			// avoid redundant work if already found
+			return
+		}
+		// only consider the values which are shifted 0
+		prefixCoded := numeric.PrefixCoded(term)
+		shift, err := prefixCoded.Shift()
+		if err == nil && shift == 0 {
+			i64, err := prefixCoded.Int64()
+			if err == nil {
+				lons = append(lons, geo.MortonUnhashLon(uint64(i64)))
+				lats = append(lats, geo.MortonUnhashLat(uint64(i64)))
+				found = true
+			}
+		}
+	}
 	return func(sctx *search.SearchContext, d *search.DocumentMatch) bool {
 		// check geo matches against all numeric type terms indexed
-		var lons, lats []float64
-		var found bool
-
-		err := dvReader.VisitDocValues(d.IndexInternalID, func(field string, term []byte) {
-			// only consider the values which are shifted 0
-			prefixCoded := numeric.PrefixCoded(term)
-			shift, err := prefixCoded.Shift()
-			if err == nil && shift == 0 {
-				i64, err := prefixCoded.Int64()
-				if err == nil {
-					lons = append(lons, geo.MortonUnhashLon(uint64(i64)))
-					lats = append(lats, geo.MortonUnhashLat(uint64(i64)))
-					found = true
-				}
-			}
-		})
-		if err == nil && found {
+		lons, lats = lons[:0], lats[:0]
+		found = false
+		if err := dvReader.VisitDocValues(d.IndexInternalID, dvVisitor); err == nil && found {
 			bytes := dvReader.BytesRead()
 			if bytes > 0 {
 				reportIOStats(ctx, bytes)
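
In both geo filters the change is identical: the docvalue visitor closure and the lons/lats scratch slices move out of the per-document FilterFunc into buildRectFilter/buildDistFilter, so they are allocated once per searcher and merely reset per document, and the early return on found skips decoding once one point has matched. A reduced sketch of the hoisting pattern (the visitor API here is a stand-in, not bleve's), valid under the patch's implicit assumption that a FilterFunc is never invoked concurrently:

package main

import "fmt"

// visitDocValues stands in for index.DocValueReader.VisitDocValues:
// it feeds every stored term for one document to the visitor.
func visitDocValues(doc []string, visitor func(term string)) error {
	for _, t := range doc {
		visitor(t)
	}
	return nil
}

// buildFilter hoists the visitor and its scratch state out of the
// returned per-document function, mirroring the patch: one closure
// allocation per searcher instead of one per document.
func buildFilter() func(doc []string) bool {
	var terms []string
	var found bool
	visitor := func(term string) {
		if found {
			return // already matched, skip further decoding
		}
		terms = append(terms, term)
		found = true
	}
	return func(doc []string) bool {
		terms, found = terms[:0], false // reset, keep capacity
		if err := visitDocValues(doc, visitor); err == nil && found {
			return true
		}
		return false
	}
}

func main() {
	f := buildFilter()
	fmt.Println(f([]string{"42"}), f(nil)) // true false
}
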
diff --git a/search/searcher/search_geopolygon.go b/search/searcher/search_geopolygon.go
index dc04bb66a..fb6e09be4 100644
--- a/search/searcher/search_geopolygon.go
+++ b/search/searcher/search_geopolygon.go
@@ -85,28 +85,37 @@ func almostEqual(a, b float64) bool {
 // here: https://wrf.ecse.rpi.edu/nikola/pubdetails/pnpoly.html
 func buildPolygonFilter(ctx context.Context, dvReader index.DocValueReader, field string,
 	coordinates []geo.Point) FilterFunc {
+	// reuse the following for each document match that is checked using the filter
+	var lons, lats []float64
+	var found bool
+	dvVisitor := func(_ string, term []byte) {
+		if found {
+			// avoid redundant work if already found
+			return
+		}
+		// only consider the values which are shifted 0
+		prefixCoded := numeric.PrefixCoded(term)
+		shift, err := prefixCoded.Shift()
+		if err == nil && shift == 0 {
+			i64, err := prefixCoded.Int64()
+			if err == nil {
+				lons = append(lons, geo.MortonUnhashLon(uint64(i64)))
+				lats = append(lats, geo.MortonUnhashLat(uint64(i64)))
+				found = true
+			}
+		}
+	}
+	rayIntersectsSegment := func(point, a, b geo.Point) bool {
+		return (a.Lat > point.Lat) != (b.Lat > point.Lat) &&
+			point.Lon < (b.Lon-a.Lon)*(point.Lat-a.Lat)/(b.Lat-a.Lat)+a.Lon
+	}
 	return func(sctx *search.SearchContext, d *search.DocumentMatch) bool {
 		// check geo matches against all numeric type terms indexed
-		var lons, lats []float64
-		var found bool
-
-		err := dvReader.VisitDocValues(d.IndexInternalID, func(field string, term []byte) {
-			// only consider the values which are shifted 0
-			prefixCoded := numeric.PrefixCoded(term)
-			shift, err := prefixCoded.Shift()
-			if err == nil && shift == 0 {
-				i64, err := prefixCoded.Int64()
-				if err == nil {
-					lons = append(lons, geo.MortonUnhashLon(uint64(i64)))
-					lats = append(lats, geo.MortonUnhashLat(uint64(i64)))
-					found = true
-				}
-			}
-		})
-
+		lons, lats = lons[:0], lats[:0]
+		found = false
 		// Note: this approach works for points which are strictly inside
 		// the polygon. ie it might fail for certain points on the polygon boundaries.
-		if err == nil && found {
+		if err := dvReader.VisitDocValues(d.IndexInternalID, dvVisitor); err == nil && found {
 			bytes := dvReader.BytesRead()
 			if bytes > 0 {
 				reportIOStats(ctx, bytes)
@@ -116,10 +125,6 @@ func buildPolygonFilter(ctx context.Context, dvReader index.DocValueReader, fiel
 			if len(coordinates) < 3 {
 				return false
 			}
-			rayIntersectsSegment := func(point, a, b geo.Point) bool {
-				return (a.Lat > point.Lat) != (b.Lat > point.Lat) &&
-					point.Lon < (b.Lon-a.Lon)*(point.Lat-a.Lat)/(b.Lat-a.Lat)+a.Lon
-			}
 
 			for i := range lons {
 				pt := geo.Point{Lon: lons[i], Lat: lats[i]}
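
buildPolygonFilter gets the same treatment and additionally hoists rayIntersectsSegment, which had been re-created on every matching document. The helper is the crossing test of the even-odd rule: a point is strictly inside the polygon iff a horizontal ray from it crosses an odd number of edges. A self-contained sketch of that rule using the identical predicate (types and driver are illustrative only):

package main

import "fmt"

type point struct{ lon, lat float64 }

// rayIntersectsSegment mirrors the helper hoisted in the hunk above:
// does a horizontal ray from pt cross the segment ab?
func rayIntersectsSegment(pt, a, b point) bool {
	return (a.lat > pt.lat) != (b.lat > pt.lat) &&
		pt.lon < (b.lon-a.lon)*(pt.lat-a.lat)/(b.lat-a.lat)+a.lon
}

// inside applies the even-odd rule: an odd number of edge
// crossings means the point is strictly inside the polygon.
func inside(pt point, poly []point) bool {
	in := false
	for i := range poly {
		a, b := poly[i], poly[(i+1)%len(poly)]
		if rayIntersectsSegment(pt, a, b) {
			in = !in
		}
	}
	return in
}

func main() {
	square := []point{{0, 0}, {4, 0}, {4, 4}, {0, 4}}
	fmt.Println(inside(point{2, 2}, square)) // true: one crossing
	fmt.Println(inside(point{5, 2}, square)) // false: zero crossings
}
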
diff --git a/search/searcher/search_geoshape.go b/search/searcher/search_geoshape.go
index 703693d78..4f90808a1 100644
--- a/search/searcher/search_geoshape.go
+++ b/search/searcher/search_geoshape.go
@@ -69,7 +69,7 @@ func buildRelationFilterOnShapes(ctx context.Context, dvReader index.DocValueRea
 	// this is for accumulating the shape's actual complete value
 	// spread across multiple docvalue visitor callbacks.
 	var dvShapeValue []byte
-	var startReading, finishReading bool
+	var startReading, finishReading, found bool
 	var reader *bytes.Reader
 
 	var bufPool *s2.GeoBufferPool
@@ -77,51 +77,58 @@ func buildRelationFilterOnShapes(ctx context.Context, dvReader index.DocValueRea
 		bufPool = bufPoolCallback()
 	}
 
-	return func(sctx *search.SearchContext, d *search.DocumentMatch) bool {
-		var found bool
-
-		err := dvReader.VisitDocValues(d.IndexInternalID,
-			func(field string, term []byte) {
-				// only consider the values which are GlueBytes prefixed or
-				// if it had already started reading the shape bytes from previous callbacks.
-				if startReading || len(term) > geo.GlueBytesOffset {
-
-					if !startReading && bytes.Equal(geo.GlueBytes, term[:geo.GlueBytesOffset]) {
-						startReading = true
-
-						if bytes.Equal(geo.GlueBytes, term[len(term)-geo.GlueBytesOffset:]) {
-							term = term[:len(term)-geo.GlueBytesOffset]
-							finishReading = true
-						}
-
-						dvShapeValue = append(dvShapeValue, term[geo.GlueBytesOffset:]...)
-
-					} else if startReading && !finishReading {
-						if len(term) > geo.GlueBytesOffset &&
-							bytes.Equal(geo.GlueBytes, term[len(term)-geo.GlueBytesOffset:]) {
-							term = term[:len(term)-geo.GlueBytesOffset]
-							finishReading = true
-						}
-
-						term = append(termSeparatorSplitSlice, term...)
-						dvShapeValue = append(dvShapeValue, term...)
-					}
-
-					// apply the filter once the entire docvalue is finished reading.
-					if finishReading {
-						v, err := geojson.FilterGeoShapesOnRelation(shape, dvShapeValue, relation, &reader, bufPool)
-						if err == nil && v {
-							found = true
-						}
-
-						dvShapeValue = dvShapeValue[:0]
-						startReading = false
-						finishReading = false
-					}
+	dvVisitor := func(_ string, term []byte) {
+		if found {
+			// avoid redundant work if already found
+			return
+		}
+		tl := len(term)
+		// only consider the values which are GlueBytes prefixed or
+		// if it had already started reading the shape bytes from previous callbacks.
+		if startReading || tl > geo.GlueBytesOffset {
+
+			if !startReading && bytes.Equal(geo.GlueBytes, term[:geo.GlueBytesOffset]) {
+				startReading = true
+
+				if bytes.Equal(geo.GlueBytes, term[tl-geo.GlueBytesOffset:]) {
+					term = term[:tl-geo.GlueBytesOffset]
+					finishReading = true
+				}
+
+				dvShapeValue = append(dvShapeValue, term[geo.GlueBytesOffset:]...)
+
+			} else if startReading && !finishReading {
+				if tl > geo.GlueBytesOffset &&
+					bytes.Equal(geo.GlueBytes, term[tl-geo.GlueBytesOffset:]) {
+					term = term[:tl-geo.GlueBytesOffset]
+					finishReading = true
+				}
+
+				dvShapeValue = append(dvShapeValue, termSeparatorSplitSlice...)
+				dvShapeValue = append(dvShapeValue, term...)
+			}
+
+			// apply the filter once the entire docvalue is finished reading.
+			if finishReading {
+				v, err := geojson.FilterGeoShapesOnRelation(shape, dvShapeValue, relation, &reader, bufPool)
+				if err == nil && v {
+					found = true
 				}
-			})
 
-		if err == nil && found {
+				dvShapeValue = dvShapeValue[:0]
+				startReading = false
+				finishReading = false
+			}
+		}
+	}
+
+	return func(sctx *search.SearchContext, d *search.DocumentMatch) bool {
+		// reset state variables for each document
+		found = false
+		startReading = false
+		finishReading = false
+		dvShapeValue = dvShapeValue[:0]
+		if err := dvReader.VisitDocValues(d.IndexInternalID, dvVisitor); err == nil && found {
 			bytes := dvReader.BytesRead()
 			if bytes > 0 {
 				reportIOStats(ctx, bytes)
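
The geoshape visitor is a small state machine: one shape's encoded bytes arrive spread over several docvalue terms, framed by geo.GlueBytes markers, and startReading/finishReading track whether a value is mid-assembly. Hoisting the closure means those flags must now be reset per document, which the new return func does explicitly. A simplified sketch of glue-framed reassembly (hypothetical marker bytes, and without the term-separator handling the real code performs):

package main

import (
	"bytes"
	"fmt"
)

var glue = []byte{0xFF, 0xFE} // hypothetical marker; bleve uses geo.GlueBytes

// assemble stitches glue-framed chunks back into complete values,
// the way the geoshape filter's visitor does across callbacks.
func assemble(terms [][]byte, emit func([]byte)) {
	var buf []byte
	var started bool
	for _, term := range terms {
		if !started && bytes.HasPrefix(term, glue) {
			started = true
			term = term[len(glue):]
		} else if !started {
			continue // not part of a framed value
		}
		finished := bytes.HasSuffix(term, glue)
		if finished {
			term = term[:len(term)-len(glue)]
		}
		buf = append(buf, term...)
		if finished {
			emit(buf)
			buf, started = buf[:0], false
		}
	}
}

func main() {
	terms := [][]byte{
		append(append([]byte{}, glue...), []byte("{\"type\":")...),
		append([]byte("\"Point\"}"), glue...),
	}
	assemble(terms, func(v []byte) { fmt.Printf("%s\n", v) })
	// prints {"type":"Point"}
}
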
diff --git a/search/sort.go b/search/sort.go
index 44e5cd91c..64230c116 100644
--- a/search/sort.go
+++ b/search/sort.go
@@ -683,29 +683,29 @@ type SortGeoDistance struct {
 	Field    string
 	Desc     bool
 	Unit     string
-	values   []string
+	values   [][]byte
 	Lon      float64
 	Lat      float64
 	unitMult float64
+	tmp      []byte
 }
 
 // UpdateVisitor notifies this sort field that in this document
 // this field has the specified term
 func (s *SortGeoDistance) UpdateVisitor(field string, term []byte) {
 	if field == s.Field {
-		s.values = append(s.values, string(term))
+		s.values = append(s.values, term)
	}
 }
 
 // Value returns the sort value of the DocumentMatch
-// it also resets the state of this SortField for
+// it also resets the state of this SortGeoDistance for
 // processing the next document
 func (s *SortGeoDistance) Value(i *DocumentMatch) string {
-	iTerms := s.filterTermsByType(s.values)
-	iTerm := s.filterTermsByMode(iTerms)
+	iTerm := s.findPrefixCodedNumericTerm(s.values)
 	s.values = s.values[:0]
 
-	if iTerm == "" {
+	if iTerm == nil {
 		return maxDistance
 	}
@@ -723,7 +723,8 @@ func (s *SortGeoDistance) Value(i *DocumentMatch) string {
 		dist /= s.unitMult
 	}
 	distInt64 := numeric.Float64ToInt64(dist)
-	return string(numeric.MustNewPrefixCodedInt64(distInt64, 0))
+	s.tmp = numeric.MustNewPrefixCodedInt64Prealloc(distInt64, 0, s.tmp)
+	return string(s.tmp)
 }
 
 func (s *SortGeoDistance) DecodeValue(value string) string {
@@ -739,25 +740,16 @@ func (s *SortGeoDistance) Descending() bool {
 	return s.Desc
 }
 
-func (s *SortGeoDistance) filterTermsByMode(terms []string) string {
-	if len(terms) >= 1 {
-		return terms[0]
-	}
-
-	return ""
-}
-
-// filterTermsByType attempts to make one pass on the terms
-// return only valid prefix coded numbers with shift of 0
-func (s *SortGeoDistance) filterTermsByType(terms []string) []string {
-	var termsWithShiftZero []string
+// findPrefixCodedNumericTerm looks through the provided terms
+// and returns the first valid prefix coded numeric term with shift of 0
+func (s *SortGeoDistance) findPrefixCodedNumericTerm(terms [][]byte) []byte {
 	for _, term := range terms {
-		valid, shift := numeric.ValidPrefixCodedTerm(term)
+		valid, shift := numeric.ValidPrefixCodedTermBytes(term)
 		if valid && shift == 0 {
-			termsWithShiftZero = append(termsWithShiftZero, term)
+			return term
 		}
 	}
-	return termsWithShiftZero
+	return nil
 }
 
 // RequiresDocID says this SearchSort does not require the DocID be loaded
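
SortGeoDistance.Value previously allocated a fresh prefix-coded term per document; with MustNewPrefixCodedInt64Prealloc it encodes into s.tmp and reuses that buffer's capacity on the next call. The append-into-a-caller-buffer contract in isolation (the varint encoding below is only a stand-in for the real prefix coding):

package main

import (
	"encoding/binary"
	"fmt"
)

// encodeInto appends the encoding of v into buf[:0], reusing buf's
// capacity across calls -- the contract SortGeoDistance.Value relies
// on when it does s.tmp = MustNewPrefixCodedInt64Prealloc(..., s.tmp).
func encodeInto(v int64, buf []byte) []byte {
	return binary.AppendVarint(buf[:0], v)
}

func main() {
	var tmp []byte
	for _, v := range []int64{7, 42, 1 << 40} {
		tmp = encodeInto(v, tmp)
		fmt.Printf("% x\n", tmp) // tmp is only valid until the next call
	}
}

The string(s.tmp) conversion still copies, but the encoder-side allocation is gone. The switch of values to [][]byte likewise drops one string copy per visited term, relying on the visited term bytes staying valid until Value runs.
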