From a6d4a3668d56fcf7a2dd3f2e4702e2491b895b4f Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Thu, 26 Mar 2026 22:05:06 +0800 Subject: [PATCH 01/16] feat: support segmented inverted index build and search --- rust/lance-index/src/scalar/inverted/index.rs | 64 +-- .../lance-index/src/scalar/inverted/scorer.rs | 17 + rust/lance/src/dataset/scanner.rs | 24 +- rust/lance/src/index/api.rs | 6 +- rust/lance/src/index/create.rs | 66 ++- rust/lance/src/index/scalar.rs | 2 + rust/lance/src/index/scalar/inverted.rs | 110 +++++ rust/lance/src/io/exec/fts.rs | 411 ++++++++++++------ 8 files changed, 512 insertions(+), 188 deletions(-) create mode 100644 rust/lance/src/index/scalar/inverted.rs diff --git a/rust/lance-index/src/scalar/inverted/index.rs b/rust/lance-index/src/scalar/inverted/index.rs index c07d26c74c5..3dceef87c2d 100644 --- a/rust/lance-index/src/scalar/inverted/index.rs +++ b/rust/lance-index/src/scalar/inverted/index.rs @@ -65,7 +65,6 @@ use super::{ use crate::Index; use crate::frag_reuse::FragReuseIndex; use crate::pbold; -use crate::scalar::inverted::lance_tokenizer::TextTokenizer; use crate::scalar::inverted::scorer::MemBM25Scorer; use crate::scalar::inverted::tokenizer::lance_tokenizer::LanceTokenizer; use crate::scalar::{ @@ -445,7 +444,6 @@ impl InvertedIndex { pub fn partition_count(&self) -> usize { self.partitions.len() } - /// Returns the set of fragments which are contained in the index, but no longer in the dataset. /// /// Most other indices remove data from deleted fragments when the index updates (copy-on-write). @@ -455,11 +453,19 @@ impl InvertedIndex { &self.deleted_fragments } - // search the documents that contain the query - // return the row ids of the documents sorted by bm25 score - // ref: https://en.wikipedia.org/wiki/Okapi_BM25 - // we first calculate in-partition BM25 scores, - // then re-calculate the scores for the top k documents across all partitions + pub fn bm25_base_scorer(&self, query_tokens: &Tokens) -> MemBM25Scorer { + let scorer = IndexBM25Scorer::new(self.partitions.iter().map(|part| part.as_ref())); + let token_docs = query_tokens + .into_iter() + .map(|token| (token.to_string(), scorer.num_docs_containing_token(token))) + .collect::>(); + MemBM25Scorer::new(scorer.total_tokens(), scorer.num_docs(), token_docs) + } + + /// Search documents that match the query and return row ids sorted by BM25 score. + /// + /// When `base_scorer` is provided, search uses those corpus-level BM25 statistics + /// instead of deriving them from this segment alone. #[instrument(level = "debug", skip_all)] pub async fn bm25_search( &self, @@ -468,7 +474,16 @@ impl InvertedIndex { operator: Operator, prefilter: Arc, metrics: Arc, + base_scorer: Option<&MemBM25Scorer>, ) -> Result<(Vec, Vec)> { + let local_scorer; + let scorer: &dyn Scorer = if let Some(base_scorer) = base_scorer { + base_scorer + } else { + local_scorer = IndexBM25Scorer::new(self.partitions.iter().map(|part| part.as_ref())); + &local_scorer + }; + let limit = params.limit.unwrap_or(usize::MAX); if limit == 0 { return Ok((Vec::new(), Vec::new())); @@ -523,7 +538,6 @@ impl InvertedIndex { }) .collect::>(); let mut parts = stream::iter(parts).buffer_unordered(get_num_compute_intensive_cpus()); - let scorer = IndexBM25Scorer::new(self.partitions.iter().map(|part| part.as_ref())); let mut idf_cache: HashMap = HashMap::new(); while let Some(res) = parts.try_next().await? { if res.candidates.is_empty() { @@ -800,6 +814,7 @@ impl InvertedIndex { Operator::And, Arc::new(NoFilter), Arc::new(NoOpMetricsCollector), + None, ) .boxed() .await?; @@ -3922,7 +3937,7 @@ async fn tokenize_and_count( /// In order to calculate BM25 scores we need to know token counts for the entire corpus. We extract these from the /// counted input of the flat search combined with any counts recorded for the indexed portion. fn initialize_scorer( - index: &Option, + base_scorer: Option<&MemBM25Scorer>, query_tokens: &Tokens, counted_input: &RecordBatch, ) -> MemBM25Scorer { @@ -3930,14 +3945,12 @@ fn initialize_scorer( let mut num_docs = 0; let mut all_token_counts = vec![0; query_tokens.len()]; - if let Some(index) = index { - let index_bm25_scorer = IndexBM25Scorer::new(index.partitions.iter().map(|p| p.as_ref())); + if let Some(base_scorer) = base_scorer { + total_tokens += base_scorer.total_tokens; + num_docs += base_scorer.num_docs; for (token_index, token) in query_tokens.into_iter().enumerate() { - let token_nq = index_bm25_scorer.num_docs_containing_token(token); - all_token_counts[token_index] = token_nq as u64; + all_token_counts[token_index] = base_scorer.num_docs_containing_token(token) as u64; } - total_tokens += index_bm25_scorer.total_tokens(); - num_docs += index_bm25_scorer.num_docs(); } num_docs += counted_input.num_rows(); @@ -4039,18 +4052,11 @@ pub async fn flat_bm25_search_stream( input: SendableRecordBatchStream, doc_col: String, query: String, - index: &Option, + tokenizer: Box, + base_scorer: Option, target_batch_size: usize, ) -> DataFusionResult { - let mut tokenizer = match index { - Some(index) => index.tokenizer(), - None => Box::new(TextTokenizer::new( - tantivy::tokenizer::TextAnalyzer::builder( - tantivy::tokenizer::SimpleTokenizer::default(), - ) - .build(), - )), - }; + let mut tokenizer = tokenizer; let query_tokens = Arc::new(collect_query_tokens(&query, &mut tokenizer)); let input_schema = input.schema(); @@ -4078,7 +4084,7 @@ pub async fn flat_bm25_search_stream( tokenize_and_count(chunked, tokenizer, query_tokens.clone(), doc_col_idx).await?; // Phase 3 - Calculate final scores (this is fairly cheap, probably don't need to parallelize) - let scorer = initialize_scorer(index, query_tokens.as_ref(), &counted_input); + let scorer = initialize_scorer(base_scorer.as_ref(), query_tokens.as_ref(), &counted_input); let scores = flat_bm25_score(query_tokens.as_ref(), &counted_input, &scorer)?; // Finally we emit batches according to the target batch size @@ -4739,7 +4745,7 @@ mod tests { let metrics = Arc::new(NoOpMetricsCollector); let (row_ids, scores) = index - .bm25_search(tokens, params, Operator::Or, prefilter, metrics) + .bm25_search(tokens, params, Operator::Or, prefilter, metrics, None) .await .unwrap(); @@ -4924,7 +4930,7 @@ mod tests { let metrics = Arc::new(NoOpMetricsCollector); let (row_ids, scores) = index - .bm25_search(tokens, params, Operator::Or, prefilter, metrics) + .bm25_search(tokens, params, Operator::Or, prefilter, metrics, None) .await .unwrap(); @@ -5019,7 +5025,7 @@ mod tests { let metrics = Arc::new(NoOpMetricsCollector); let (row_ids, _scores) = index - .bm25_search(tokens, params, Operator::And, prefilter, metrics) + .bm25_search(tokens, params, Operator::And, prefilter, metrics, None) .await .unwrap(); diff --git a/rust/lance-index/src/scalar/inverted/scorer.rs b/rust/lance-index/src/scalar/inverted/scorer.rs index 58c0471d262..e3fb81871ef 100644 --- a/rust/lance-index/src/scalar/inverted/scorer.rs +++ b/rust/lance-index/src/scalar/inverted/scorer.rs @@ -67,6 +67,23 @@ impl MemBM25Scorer { } } +impl Scorer for MemBM25Scorer { + fn query_weight(&self, token: &str) -> f32 { + let token_docs = self.num_docs_containing_token(token); + if token_docs == 0 { + return 0.0; + } + idf(token_docs, self.num_docs) + } + + fn doc_weight(&self, freq: u32, doc_tokens: u32) -> f32 { + let freq = freq as f32; + let doc_tokens = doc_tokens as f32; + let doc_norm = K1 * (1.0 - B + B * doc_tokens / self.avg_doc_length()); + (K1 + 1.0) * freq / (freq + doc_norm) + } +} + pub struct IndexBM25Scorer<'a> { partitions: Vec<&'a InvertedPartition>, num_docs: usize, diff --git a/rust/lance/src/dataset/scanner.rs b/rust/lance/src/dataset/scanner.rs index 34483bfd847..d38668c49c5 100644 --- a/rust/lance/src/dataset/scanner.rs +++ b/rust/lance/src/dataset/scanner.rs @@ -85,6 +85,7 @@ use crate::index::vector::utils::{ use crate::io::exec::filtered_read::{FilteredReadExec, FilteredReadOptions}; use crate::io::exec::fts::{ BoostQueryExec, FlatMatchFilterExec, FlatMatchQueryExec, MatchQueryExec, PhraseQueryExec, + load_fts_segment_details, load_fts_segments, }; use crate::io::exec::knn::MultivectorScoringExec; use crate::io::exec::scalar_index::{MaterializeIndexExec, ScalarIndexExec}; @@ -3231,20 +3232,15 @@ impl Scanner { "the column must be specified in the query".to_string(), ))?; - let index_meta = self - .dataset - .load_scalar_index(IndexCriteria::default().for_column(&column).supports_fts()) - .await? - .ok_or(Error::invalid_input(format!( - "No Inverted index found for column {}", - column - )))?; - - let details_any = - crate::index::scalar::fetch_index_details(&self.dataset, &column, &index_meta).await?; - let details = details_any - .as_ref() - .to_msg::()?; + let segments = + load_fts_segments(&self.dataset, &column) + .await? + .ok_or(Error::invalid_input(format!( + "No Inverted index found for column {}", + column + )))?; + let details = load_fts_segment_details(&self.dataset, &column, &segments).await?; + if !details.with_position { return Err(Error::invalid_input("position is not found but required for phrase queries, try recreating the index with position" .to_string())); diff --git a/rust/lance/src/index/api.rs b/rust/lance/src/index/api.rs index f8e7ee7d012..e13b73d68af 100644 --- a/rust/lance/src/index/api.rs +++ b/rust/lance/src/index/api.rs @@ -81,7 +81,7 @@ impl IndexSegment { } /// A plan for building one physical segment from one or more existing -/// vector index segments. +/// uncommitted index segments. #[derive(Debug, Clone, PartialEq)] pub struct IndexSegmentPlan { segment: IndexSegment, @@ -149,13 +149,13 @@ pub trait DatasetIndexExt { ) -> Self::IndexBuilder<'a>; /// Create a builder for building physical index segments from uncommitted - /// vector index outputs. + /// index outputs. /// /// The caller supplies the uncommitted index metadata returned by /// `execute_uncommitted()` so the builder can plan segment grouping without /// rediscovering fragment coverage. /// - /// This is the canonical entry point for distributed vector segment build. + /// This is the canonical entry point for segment-based index build. /// After building the physical segments, publish them as a /// logical index with [`Self::commit_existing_index_segments`]. fn create_index_segment_builder<'a>(&'a self) -> Self::IndexSegmentBuilder<'a>; diff --git a/rust/lance/src/index/create.rs b/rust/lance/src/index/create.rs index 0bf9fdd283c..bef5373f070 100644 --- a/rust/lance/src/index/create.rs +++ b/rust/lance/src/index/create.rs @@ -9,7 +9,7 @@ use crate::{ }, index::{ DatasetIndexExt, DatasetIndexInternalExt, build_index_metadata_from_segments, - scalar::build_scalar_index, + scalar::{IndexDetails, build_scalar_index}, vector::{ LANCE_VECTOR_INDEX, VectorIndexParams, build_distributed_vector_index, build_empty_vector_index, build_vector_index, @@ -556,7 +556,7 @@ impl<'a> IntoFuture for CreateIndexBuilder<'a> { } } -/// Build physical index segments from previously-written vector segment outputs. +/// Build physical index segments from previously-written uncommitted index outputs. /// /// Use [`DatasetIndexExt::create_index_segment_builder`] and then either: /// @@ -565,7 +565,7 @@ impl<'a> IntoFuture for CreateIndexBuilder<'a> { /// /// This builder only builds physical segments. Publishing those segments as /// a logical index still requires [`DatasetIndexExt::commit_existing_index_segments`]. -/// Together these two APIs form the canonical distributed vector segment build workflow. +/// Together these two APIs form the canonical segment-based index build workflow. #[derive(Clone)] pub struct IndexSegmentBuilder<'a> { dataset: &'a Dataset, @@ -611,18 +611,62 @@ impl<'a> IndexSegmentBuilder<'a> { )); } - crate::index::vector::ivf::plan_segments(&self.segments, None, self.target_segment_bytes) - .await + let index_details = self.segments[0] + .index_details + .as_ref() + .ok_or_else(|| { + Error::invalid_input("input segment is missing index details".to_string()) + })? + .clone(); + let details = IndexDetails(index_details); + let index_type = if details.supports_fts() { + IndexType::Inverted + } else if details.is_vector() { + IndexType::Vector + } else { + return Err(Error::invalid_input( + "IndexSegmentBuilder only supports vector and FTS segments".to_string(), + )); + }; + match index_type { + IndexType::Inverted => crate::index::scalar::inverted::plan_segments( + &self.segments, + self.target_segment_bytes, + ), + IndexType::Vector => { + crate::index::vector::ivf::plan_segments( + &self.segments, + None, + self.target_segment_bytes, + ) + .await + } + unsupported => Err(Error::invalid_input(format!( + "IndexSegmentBuilder does not support planning segments for index type {}", + unsupported + ))), + } } /// Build one segment from a previously-generated plan. pub async fn build(&self, plan: &IndexSegmentPlan) -> Result { - crate::index::vector::ivf::build_segment( - self.dataset.object_store(), - &self.dataset.indices_dir(), - plan, - ) - .await + match plan.requested_index_type().unwrap_or(IndexType::Vector) { + IndexType::Inverted => { + crate::index::scalar::inverted::build_segment(self.dataset, plan).await + } + IndexType::Vector => { + crate::index::vector::ivf::build_segment( + self.dataset.object_store(), + &self.dataset.indices_dir(), + plan, + ) + .await + } + unsupported => Err(Error::invalid_input(format!( + "IndexSegmentBuilder does not support building segments for index type {}", + unsupported + ))), + } } /// Plan and build all segments from the provided inputs. diff --git a/rust/lance/src/index/scalar.rs b/rust/lance/src/index/scalar.rs index 44739454bec..f9e17af88bf 100644 --- a/rust/lance/src/index/scalar.rs +++ b/rust/lance/src/index/scalar.rs @@ -4,6 +4,8 @@ //! Utilities for integrating scalar indices with datasets //! +pub(crate) mod inverted; + use std::sync::{Arc, LazyLock}; use crate::index::DatasetIndexExt; diff --git a/rust/lance/src/index/scalar/inverted.rs b/rust/lance/src/index/scalar/inverted.rs new file mode 100644 index 00000000000..4b0e671575f --- /dev/null +++ b/rust/lance/src/index/scalar/inverted.rs @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use std::sync::Arc; + +use lance_index::{IndexType, scalar::lance_format::LanceIndexStore}; +use lance_table::format::IndexMetadata; + +use crate::{ + Dataset, Error, Result, + dataset::index::LanceIndexStoreExt, + index::{IndexSegment, IndexSegmentPlan}, +}; + +/// Plan physical segments for staged inverted-index outputs. +/// +/// Each staged inverted root remains its own physical segment for now. +pub(crate) fn plan_segments( + segments: &[IndexMetadata], + target_segment_bytes: Option, +) -> Result> { + if let Some(0) = target_segment_bytes { + return Err(Error::invalid_input( + "target_segment_bytes must be greater than zero".to_string(), + )); + } + if target_segment_bytes.is_some() && segments.len() > 1 { + // TODO: Support merging multiple staged inverted roots into one segment. + return Err(Error::invalid_input( + "Inverted segment builder does not yet support merging multiple source segments" + .to_string(), + )); + } + + segments + .iter() + .map(|segment| { + let fragment_bitmap = segment.fragment_bitmap.as_ref().ok_or_else(|| { + Error::index(format!( + "Segment '{}' is missing fragment coverage", + segment.uuid + )) + })?; + let index_details = segment.index_details.as_ref().ok_or_else(|| { + Error::index(format!( + "Segment '{}' is missing index details", + segment.uuid + )) + })?; + let built_segment = IndexSegment::new( + segment.uuid, + fragment_bitmap.iter(), + index_details.clone(), + segment.index_version, + ); + let estimated_bytes = segment + .files + .as_ref() + .map(|files| files.iter().map(|file| file.size_bytes).sum()) + .unwrap_or(0); + Ok(IndexSegmentPlan::new( + built_segment, + vec![segment.clone()], + estimated_bytes, + Some(IndexType::Inverted), + )) + }) + .collect() +} + +/// Finalize one staged inverted root into a commit-ready physical segment. +pub(crate) async fn build_segment( + dataset: &Dataset, + segment_plan: &IndexSegmentPlan, +) -> Result { + let built_segment = segment_plan.segment().clone(); + let source_segments = segment_plan.segments(); + if source_segments.len() != 1 { + // TODO: Support building one segment from multiple staged inverted roots. + return Err(Error::invalid_input( + "Inverted segment builder does not yet support merging multiple source segments" + .to_string(), + )); + } + let source_segment = &source_segments[0]; + if source_segment.uuid != built_segment.uuid() { + return Err(Error::invalid_input( + "Inverted segment builder requires the built segment UUID to match the staged source UUID" + .to_string(), + )); + } + + let index_dir = dataset.indices_dir().child(source_segment.uuid.to_string()); + let metadata_path = index_dir.child(lance_index::scalar::inverted::METADATA_FILE); + if dataset.object_store().exists(&metadata_path).await? { + return Ok(built_segment); + } + + let store = Arc::new(LanceIndexStore::from_dataset_for_new( + dataset, + &source_segment.uuid.to_string(), + )?); + lance_index::scalar::inverted::builder::merge_index_files( + dataset.object_store(), + &index_dir, + store, + ) + .await?; + Ok(built_segment) +} diff --git a/rust/lance/src/io/exec/fts.rs b/rust/lance/src/io/exec/fts.rs index b40ad145802..3fae89fb4e5 100644 --- a/rust/lance/src/io/exec/fts.rs +++ b/rust/lance/src/io/exec/fts.rs @@ -18,17 +18,21 @@ use datafusion::physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan, Pla use datafusion_physical_expr::{Distribution, EquivalenceProperties, Partitioning}; use datafusion_physical_plan::metrics::{BaselineMetrics, Count}; use futures::stream::{self}; -use futures::{FutureExt, Stream, StreamExt, TryStreamExt}; +use futures::{Stream, StreamExt, TryStreamExt}; use itertools::Itertools; -use lance_core::{ROW_ID, utils::tracing::StreamTracingExt}; +use lance_core::{Error, ROW_ID, Result, utils::tracing::StreamTracingExt}; use lance_datafusion::utils::{ExecutionPlanMetricsSetExt, MetricsExt, PARTITIONS_SEARCHED_METRIC}; +use lance_table::format::IndexMetadata; use super::PreFilterSource; use super::utils::{IndexMetrics, InstrumentedRecordBatchStreamAdapter, build_prefilter}; use crate::index::DatasetIndexExt; +use crate::index::scalar::fetch_index_details; use crate::{Dataset, index::DatasetIndexInternalExt}; use lance_index::IndexCriteria; use lance_index::metrics::MetricsCollector; +use lance_index::pbold::InvertedIndexDetails; +use lance_index::scalar::inverted::builder::ScoredDoc; use lance_index::scalar::inverted::builder::document_input; use lance_index::scalar::inverted::lance_tokenizer::{DocType, JsonTokenizer, LanceTokenizer}; use lance_index::scalar::inverted::query::{ @@ -42,6 +46,94 @@ use lance_index::scalar::inverted::{ use lance_index::{prefilter::PreFilter, scalar::inverted::query::BooleanQuery}; use tracing::instrument; +/// Load all FTS segments that belong to the same named index for a column. +pub(crate) async fn load_fts_segments( + dataset: &Dataset, + column: &str, +) -> Result>> { + let Some(index_meta) = dataset + .load_scalar_index(IndexCriteria::default().for_column(column).supports_fts()) + .await? + else { + return Ok(None); + }; + + let indices = dataset.load_indices_by_name(&index_meta.name).await?; + if indices.is_empty() { + return Ok(None); + } + + let expected_fields = indices[0].fields.clone(); + for meta in &indices { + if meta.fields != expected_fields { + return Err(Error::invalid_input(format!( + "FTS index {} has inconsistent fields across segments", + index_meta.name + ))); + } + } + + Ok(Some(indices)) +} + +/// Load and validate the shared FTS details across a set of segments. +pub(crate) async fn load_fts_segment_details( + dataset: &Dataset, + column: &str, + segments: &[IndexMetadata], +) -> Result { + let mut expected_details: Option = None; + for meta in segments { + let details_any = fetch_index_details(dataset, column, meta).await?; + let details = details_any.as_ref().to_msg::()?; + match &expected_details { + Some(expected) if expected != &details => { + return Err(Error::invalid_input(format!( + "FTS index {} has inconsistent inverted index details across segments", + meta.name + ))); + } + Some(_) => {} + None => expected_details = Some(details), + } + } + expected_details.ok_or_else(|| { + Error::invalid_input(format!( + "FTS index for column {} requires at least one segment", + column + )) + }) +} + +/// Open one FTS segment as an [`InvertedIndex`]. +async fn open_fts_segment( + dataset: &Dataset, + column: &str, + segment: &IndexMetadata, + metrics: &IndexMetrics, +) -> Result> { + let uuid = segment.uuid.to_string(); + let index = dataset.open_generic_index(column, &uuid, metrics).await?; + let inverted = index + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::invalid_input(format!( + "Index for column {} and segment {} is not an inverted index", + column, uuid + )) + })?; + Ok(Arc::new(inverted.clone())) +} + +/// Fall back to the default simple tokenizer when no on-disk FTS segment exists. +fn default_text_tokenizer() -> Box { + Box::new(TextTokenizer::new( + tantivy::tokenizer::TextAnalyzer::builder(tantivy::tokenizer::SimpleTokenizer::default()) + .build(), + )) +} + pub struct FtsIndexMetrics { index_metrics: IndexMetrics, partitions_searched: Count, @@ -231,54 +323,53 @@ impl ExecutionPlan for MatchQueryExec { )))?; let stream = stream::once(async move { let _timer = metrics.baseline_metrics.elapsed_compute().timer(); - let index_meta = ds - .load_scalar_index(IndexCriteria::default().for_column(&column).supports_fts()) - .await? - .ok_or(DataFusionError::Execution(format!( - "No Inverted index found for column {}", - column, - )))?; - let uuid = index_meta.uuid.to_string(); - let index = ds - .open_generic_index(&column, &uuid, &metrics.index_metrics) - .await?; - - let mut pre_filter = build_prefilter( - context.clone(), - partition, - &prefilter_source, - ds, - &[index_meta], - )?; - - let inverted_idx = index - .as_any() - .downcast_ref::() - .ok_or_else(|| { - DataFusionError::Execution(format!( - "Index for column {} is not an inverted index", + let segments = + load_fts_segments(&ds, &column) + .await? + .ok_or(DataFusionError::Execution(format!( + "No Inverted index found for column {}", column, - )) - })?; - if !inverted_idx.deleted_fragments().is_empty() { + )))?; + let _details = load_fts_segment_details(&ds, &column, &segments).await?; + let mut indices = Vec::with_capacity(segments.len()); + for segment in &segments { + indices + .push(open_fts_segment(&ds, &column, segment, &metrics.index_metrics).await?); + } + + let mut pre_filter = + build_prefilter(context.clone(), partition, &prefilter_source, ds, &segments)?; + let deleted_fragments = + indices + .iter() + .fold(roaring::RoaringBitmap::new(), |mut deleted, index| { + deleted |= index.deleted_fragments().clone(); + deleted + }); + if !deleted_fragments.is_empty() { Arc::get_mut(&mut pre_filter) .expect("prefilter just created") - .set_deleted_fragments(inverted_idx.deleted_fragments().clone()); + .set_deleted_fragments(deleted_fragments); } - metrics.record_parts_searched(inverted_idx.partition_count()); + metrics + .record_parts_searched(indices.iter().map(|index| index.partition_count()).sum()); let is_fuzzy = matches!(query.fuzziness, Some(n) if n != 0); let params = params .with_fuzziness(query.fuzziness) .with_max_expansions(query.max_expansions) .with_prefix_length(query.prefix_length); + let first_index = indices.first().ok_or(DataFusionError::Execution(format!( + "FTS index for column {} has no segments", + column + )))?; let mut tokenizer = match is_fuzzy { - false => inverted_idx.tokenizer(), + false => first_index.tokenizer(), true => { let tokenizer = tantivy::tokenizer::TextAnalyzer::from( tantivy::tokenizer::SimpleTokenizer::default(), ); - match inverted_idx.tokenizer().doc_type() { + match first_index.tokenizer().doc_type() { DocType::Text => { Box::new(TextTokenizer::new(tokenizer)) as Box } @@ -289,18 +380,46 @@ impl ExecutionPlan for MatchQueryExec { } }; let tokens = collect_query_tokens(&query.terms, &mut tokenizer); + let mut base_scorer = first_index.bm25_base_scorer(&tokens); + for index in indices.iter().skip(1) { + let segment_scorer = index.bm25_base_scorer(&tokens); + base_scorer.total_tokens += segment_scorer.total_tokens; + base_scorer.num_docs += segment_scorer.num_docs; + for (token, count) in segment_scorer.token_docs { + *base_scorer.token_docs.entry(token).or_insert(0) += count; + } + } pre_filter.wait_for_ready().await?; - let (doc_ids, mut scores) = inverted_idx - .bm25_search( - Arc::new(tokens), - params.into(), - query.operator, - pre_filter, - metrics.clone(), - ) - .boxed() - .await?; + let tokens = Arc::new(tokens); + let params = Arc::new(params); + let limit = params.limit.unwrap_or(usize::MAX); + let mut candidates = std::collections::BinaryHeap::new(); + for index in &indices { + let (doc_ids, scores) = index + .bm25_search( + tokens.clone(), + params.clone(), + query.operator, + pre_filter.clone(), + metrics.clone(), + Some(&base_scorer), + ) + .await?; + for (row_id, score) in doc_ids.into_iter().zip(scores.into_iter()) { + if candidates.len() < limit { + candidates.push(std::cmp::Reverse(ScoredDoc::new(row_id, score))); + } else if candidates.peek().unwrap().0.score.0 < score { + candidates.pop(); + candidates.push(std::cmp::Reverse(ScoredDoc::new(row_id, score))); + } + } + } + let (doc_ids, mut scores): (Vec, Vec) = candidates + .into_sorted_vec() + .into_iter() + .map(|std::cmp::Reverse(doc)| (doc.row_id, doc.score.0)) + .unzip(); scores.iter_mut().for_each(|s| { *s *= query.boost; }); @@ -379,29 +498,19 @@ impl FlatMatchFilterExec { column: &str, metrics: &IndexMetrics, ) -> DataFusionResult> { - let index_meta = dataset - .load_scalar_index(IndexCriteria::default().for_column(column).supports_fts()) - .await?; - - if let Some(index_meta) = index_meta { - let uuid = index_meta.uuid.to_string(); - let index = dataset.open_generic_index(column, &uuid, metrics).await?; - if let Some(index) = index.as_any().downcast_ref::() { - return Ok(index.tokenizer()); - } else { - return Err(DataFusionError::Execution(format!( - "Index for column {} is not an inverted index", - column, - ))); - } - } // Else, no index, use text tokenzier + if let Some(segments) = load_fts_segments(dataset, column).await? { + let index_meta = segments.first().ok_or_else(|| { + DataFusionError::Execution(format!( + "FTS index for column {} has no segments", + column + )) + })?; + return Ok(open_fts_segment(dataset, column, index_meta, metrics) + .await? + .tokenizer()); + } - Ok(Box::new(TextTokenizer::new( - tantivy::tokenizer::TextAnalyzer::builder( - tantivy::tokenizer::SimpleTokenizer::default(), - ) - .build(), - ))) + Ok(default_text_tokenizer()) } pub fn new( @@ -675,28 +784,44 @@ impl ExecutionPlan for FlatMatchQueryExec { document_input(self.unindexed_input.execute(partition, context)?, &column)?; let stream = stream::once(async move { - let index_meta = ds - .load_scalar_index(IndexCriteria::default().for_column(&column).supports_fts()) - .await?; - let inverted_idx = match index_meta { - Some(index_meta) => { - let uuid = index_meta.uuid.to_string(); - let index = ds - .open_generic_index(&column, &uuid, &metrics.index_metrics) - .await?; - index.as_any().downcast_ref::().cloned() + let segments = load_fts_segments(&ds, &column).await?; + let (tokenizer, base_scorer) = match segments { + Some(segments) => { + let _details = load_fts_segment_details(&ds, &column, &segments).await?; + let mut indices = Vec::with_capacity(segments.len()); + for segment in &segments { + indices.push( + open_fts_segment(&ds, &column, segment, &metrics.index_metrics).await?, + ); + } + metrics.record_parts_searched( + indices.iter().map(|index| index.partition_count()).sum(), + ); + let first_index = indices.first().ok_or(DataFusionError::Execution( + format!("FTS index for column {} has no segments", column), + ))?; + let mut tokenizer = first_index.tokenizer(); + let query_tokens = collect_query_tokens(&query.terms, &mut tokenizer); + let mut base_scorer = first_index.bm25_base_scorer(&query_tokens); + for index in indices.iter().skip(1) { + let segment_scorer = index.bm25_base_scorer(&query_tokens); + base_scorer.total_tokens += segment_scorer.total_tokens; + base_scorer.num_docs += segment_scorer.num_docs; + for (token, count) in segment_scorer.token_docs { + *base_scorer.token_docs.entry(token).or_insert(0) += count; + } + } + (tokenizer, Some(base_scorer)) } - None => None, + None => (default_text_tokenizer(), None), }; - if let Some(index) = inverted_idx.as_ref() { - metrics.record_parts_searched(index.partition_count()); - } flat_bm25_search_stream( unindexed_input, column, query.terms, - &inverted_idx, + tokenizer, + base_scorer, target_batch_size, ) .await @@ -878,63 +1003,87 @@ impl ExecutionPlan for PhraseQueryExec { let metrics = Arc::new(FtsIndexMetrics::new(&self.metrics, partition)); let stream = stream::once(async move { let _timer = metrics.baseline_metrics.elapsed_compute().timer(); - let column = query - .column - .clone() - .ok_or(DataFusionError::Execution(format!( - "column not set for PhraseQuery {}", - query.terms - )))?; - let index_meta = ds - .load_scalar_index(IndexCriteria::default().for_column(&column).supports_fts()) - .await? - .ok_or(DataFusionError::Execution(format!( - "No Inverted index found for column {}", - column, - )))?; - let uuid = index_meta.uuid.to_string(); - let index = ds - .open_generic_index(&column, &uuid, &metrics.index_metrics) - .await?; - - let mut pre_filter = build_prefilter( - context.clone(), - partition, - &prefilter_source, - ds.clone(), - &[index_meta], - )?; - - let index = index - .as_any() - .downcast_ref::() - .ok_or_else(|| { - DataFusionError::Execution(format!( - "Index for column {} is not an inverted index", + let column = query.column.ok_or(DataFusionError::Execution(format!( + "column not set for PhraseQuery {}", + query.terms + )))?; + let segments = + load_fts_segments(&ds, &column) + .await? + .ok_or(DataFusionError::Execution(format!( + "No Inverted index found for column {}", column, - )) - })?; - if !index.deleted_fragments().is_empty() { + )))?; + let _details = load_fts_segment_details(&ds, &column, &segments).await?; + let mut indices = Vec::with_capacity(segments.len()); + for segment in &segments { + indices + .push(open_fts_segment(&ds, &column, segment, &metrics.index_metrics).await?); + } + + let mut pre_filter = + build_prefilter(context.clone(), partition, &prefilter_source, ds, &segments)?; + let deleted_fragments = + indices + .iter() + .fold(roaring::RoaringBitmap::new(), |mut deleted, index| { + deleted |= index.deleted_fragments().clone(); + deleted + }); + if !deleted_fragments.is_empty() { Arc::get_mut(&mut pre_filter) .expect("prefilter just created") - .set_deleted_fragments(index.deleted_fragments().clone()); + .set_deleted_fragments(deleted_fragments); } - metrics.record_parts_searched(index.partition_count()); + metrics + .record_parts_searched(indices.iter().map(|index| index.partition_count()).sum()); - let mut tokenizer = index.tokenizer(); + let first_index = indices.first().ok_or(DataFusionError::Execution(format!( + "FTS index for column {} has no segments", + column + )))?; + let mut tokenizer = first_index.tokenizer(); let tokens = collect_query_tokens(&query.terms, &mut tokenizer); + let mut base_scorer = first_index.bm25_base_scorer(&tokens); + for index in indices.iter().skip(1) { + let segment_scorer = index.bm25_base_scorer(&tokens); + base_scorer.total_tokens += segment_scorer.total_tokens; + base_scorer.num_docs += segment_scorer.num_docs; + for (token, count) in segment_scorer.token_docs { + *base_scorer.token_docs.entry(token).or_insert(0) += count; + } + } pre_filter.wait_for_ready().await?; - let (doc_ids, scores) = index - .bm25_search( - Arc::new(tokens), - params.into(), - lance_index::scalar::inverted::query::Operator::And, - pre_filter, - metrics.clone(), - ) - .boxed() - .await?; + let tokens = Arc::new(tokens); + let params = Arc::new(params); + let limit = params.limit.unwrap_or(usize::MAX); + let mut candidates = std::collections::BinaryHeap::new(); + for index in &indices { + let (doc_ids, scores) = index + .bm25_search( + tokens.clone(), + params.clone(), + lance_index::scalar::inverted::query::Operator::And, + pre_filter.clone(), + metrics.clone(), + Some(&base_scorer), + ) + .await?; + for (row_id, score) in doc_ids.into_iter().zip(scores.into_iter()) { + if candidates.len() < limit { + candidates.push(std::cmp::Reverse(ScoredDoc::new(row_id, score))); + } else if candidates.peek().unwrap().0.score.0 < score { + candidates.pop(); + candidates.push(std::cmp::Reverse(ScoredDoc::new(row_id, score))); + } + } + } + let (doc_ids, scores): (Vec, Vec) = candidates + .into_sorted_vec() + .into_iter() + .map(|std::cmp::Reverse(doc)| (doc.row_id, doc.score.0)) + .unzip(); metrics.baseline_metrics.record_output(doc_ids.len()); let batch = RecordBatch::try_new( FTS_SCHEMA.clone(), From 1efcfa9a501c55641e4eb9703151ebfede107320 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Thu, 26 Mar 2026 22:05:55 +0800 Subject: [PATCH 02/16] test: cover segmented inverted index workflows --- rust/lance/src/index/create.rs | 64 ++++++++ rust/lance/src/io/exec/fts.rs | 69 +++++++- rust/lance/tests/query/inverted.rs | 256 ++++++++++++++++++++++++++++- 3 files changed, 386 insertions(+), 3 deletions(-) diff --git a/rust/lance/src/index/create.rs b/rust/lance/src/index/create.rs index bef5373f070..34ba1012657 100644 --- a/rust/lance/src/index/create.rs +++ b/rust/lance/src/index/create.rs @@ -1341,6 +1341,70 @@ mod tests { assert!(result.num_rows() > 0); } + #[tokio::test] + async fn test_index_segment_builder_fts_commits_multi_segment_logical_index() { + let tmpdir = TempStrDir::default(); + let dataset_uri = format!("file://{}", tmpdir.as_str()); + + let batch1 = create_text_batch(0, 10); + let batch2 = create_text_batch(10, 20); + let batch3 = create_text_batch(20, 30); + + let batches = RecordBatchIterator::new( + vec![Ok(batch1), Ok(batch2), Ok(batch3)], + create_text_batch(0, 1).schema(), + ); + let mut dataset = Dataset::write( + batches, + &dataset_uri, + Some(WriteParams { + max_rows_per_file: 10, + max_rows_per_group: 5, + mode: WriteMode::Overwrite, + ..Default::default() + }), + ) + .await + .unwrap(); + + let params = InvertedIndexParams::default(); + let mut input_segments = Vec::new(); + for fragment in dataset.get_fragments() { + let segment = + CreateIndexBuilder::new(&mut dataset, &["text"], IndexType::Inverted, ¶ms) + .name("text_idx".to_string()) + .fragments(vec![fragment.id() as u32]) + .execute_uncommitted() + .await + .unwrap(); + input_segments.push(segment); + } + + let segments = dataset + .create_index_segment_builder() + .with_segments(input_segments.clone()) + .build_all() + .await + .unwrap(); + assert_eq!(segments.len(), input_segments.len()); + + for segment in &segments { + let metadata_path = dataset + .indices_dir() + .child(segment.uuid().to_string()) + .child(lance_index::scalar::inverted::METADATA_FILE); + assert!(dataset.object_store().exists(&metadata_path).await.unwrap()); + } + + dataset + .commit_existing_index_segments("text_idx", "text", segments) + .await + .unwrap(); + + let indices = dataset.load_indices_by_name("text_idx").await.unwrap(); + assert_eq!(indices.len(), input_segments.len()); + } + #[tokio::test] async fn test_commit_existing_index_supports_local_hnsw_segments() { let tmpdir = TempStrDir::default(); diff --git a/rust/lance/src/io/exec/fts.rs b/rust/lance/src/io/exec/fts.rs index 3fae89fb4e5..dc898ec5b50 100644 --- a/rust/lance/src/io/exec/fts.rs +++ b/rust/lance/src/io/exec/fts.rs @@ -1548,6 +1548,7 @@ pub mod tests { use std::sync::{Arc, Mutex}; use crate::index::DatasetIndexExt; + use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet; use datafusion::{execution::TaskContext, physical_plan::ExecutionPlan}; use lance_datafusion::datagen::DatafusionDatagenExt; use lance_datafusion::exec::{ExecutionStatsCallback, ExecutionSummaryCounts}; @@ -1557,18 +1558,23 @@ pub mod tests { use lance_index::scalar::inverted::InvertedIndex; use lance_index::scalar::inverted::query::{ BooleanQuery, BoostQuery, FtsQuery, FtsSearchParams, MatchQuery, Occur, Operator, - PhraseQuery, + PhraseQuery, collect_query_tokens, has_query_token, }; use lance_index::scalar::{FullTextSearchQuery, InvertedIndexParams}; use lance_index::{IndexCriteria, IndexType}; + use tantivy::tokenizer::Language; use crate::{ + dataset::transaction::{Operation, TransactionBuilder}, index::DatasetIndexInternalExt, io::exec::PreFilterSource, utils::test::{DatagenExt, FragmentCount, FragmentRowCount, NoContextTestFixture}, }; - use super::{BoostQueryExec, FlatMatchQueryExec, MatchQueryExec, PhraseQueryExec}; + use super::{ + BoostQueryExec, FlatMatchFilterExec, FlatMatchQueryExec, MatchQueryExec, PhraseQueryExec, + }; + use crate::io::exec::utils::IndexMetrics; #[derive(Default)] struct StatsHolder { @@ -1672,6 +1678,65 @@ pub mod tests { assert!(metrics.elapsed_compute().unwrap() > 0); } + #[tokio::test] + async fn test_flat_match_filter_load_tokenizer_uses_on_disk_params_when_details_missing() { + let mut dataset = lance_datagen::gen_batch() + .col( + "text", + lance_datagen::array::cycle_utf8_literals(&["hello", "HELLO"]), + ) + .into_ram_dataset(FragmentCount::from(1), FragmentRowCount::from(2)) + .await + .unwrap(); + + let params = InvertedIndexParams::new("simple".to_string(), Language::English) + .with_position(false) + .lower_case(false) + .stem(false) + .remove_stop_words(false) + .ascii_folding(false) + .max_token_length(None); + dataset + .create_index(&["text"], IndexType::Inverted, None, ¶ms, true) + .await + .unwrap(); + + let index_meta = dataset + .load_scalar_index(IndexCriteria::default().for_column("text").supports_fts()) + .await + .unwrap() + .unwrap(); + let mut legacy_index_meta = index_meta.clone(); + legacy_index_meta.index_details = None; + let transaction = TransactionBuilder::new( + dataset.manifest.version, + Operation::CreateIndex { + new_indices: vec![legacy_index_meta], + removed_indices: vec![index_meta], + }, + ) + .build(); + dataset + .apply_commit(transaction, &Default::default(), &Default::default()) + .await + .unwrap(); + + let metrics = IndexMetrics::new(&ExecutionPlanMetricsSet::new(), 0); + let mut tokenizer = FlatMatchFilterExec::load_tokenizer(&dataset, "text", &metrics) + .await + .unwrap(); + let query_tokens = collect_query_tokens("hello", &mut tokenizer); + + let mut tokenizer = FlatMatchFilterExec::load_tokenizer(&dataset, "text", &metrics) + .await + .unwrap(); + assert!(has_query_token("hello", &mut tokenizer, &query_tokens)); + assert!( + !has_query_token("HELLO", &mut tokenizer, &query_tokens), + "legacy FTS indices should continue using on-disk tokenizer params" + ); + } + #[tokio::test] async fn test_parts_searched_metrics() { let mut dataset = lance_datagen::gen_batch() diff --git a/rust/lance/tests/query/inverted.rs b/rust/lance/tests/query/inverted.rs index c9ce1231d92..f209e6f6c28 100644 --- a/rust/lance/tests/query/inverted.rs +++ b/rust/lance/tests/query/inverted.rs @@ -3,7 +3,9 @@ use std::sync::Arc; -use arrow_array::{ArrayRef, Int32Array, RecordBatch, StringArray, UInt32Array}; +use arrow_array::{ + ArrayRef, Int32Array, RecordBatch, RecordBatchIterator, StringArray, UInt32Array, +}; use lance::Dataset; use lance::dataset::scanner::ColumnOrdering; use lance::dataset::{InsertBuilder, WriteParams}; @@ -11,6 +13,7 @@ use lance::index::DatasetIndexExt; use lance_index::IndexType; use lance_index::scalar::inverted::query::{FtsQuery, PhraseQuery}; use lance_index::scalar::{FullTextSearchQuery, InvertedIndexParams}; +use lance_table::format::IndexMetadata; use tantivy::tokenizer::Language; use super::{strip_score_column, test_fts, test_scan, test_take}; @@ -146,6 +149,257 @@ async fn test_inverted_phrase_query_with_positions() { .await; } +#[tokio::test] +async fn test_segmented_inverted_match_query() { + let test_dir = tempfile::tempdir().unwrap(); + let test_uri = test_dir.path().to_str().unwrap(); + + let batches = vec![ + RecordBatch::try_from_iter(vec![ + ("id", Arc::new(Int32Array::from(vec![0, 1])) as ArrayRef), + ( + "text", + Arc::new(StringArray::from(vec![Some("alpha lance"), Some("beta")])) as ArrayRef, + ), + ]) + .unwrap(), + RecordBatch::try_from_iter(vec![ + ("id", Arc::new(Int32Array::from(vec![2, 3])) as ArrayRef), + ( + "text", + Arc::new(StringArray::from(vec![Some("lance delta"), Some("gamma")])) as ArrayRef, + ), + ]) + .unwrap(), + RecordBatch::try_from_iter(vec![ + ("id", Arc::new(Int32Array::from(vec![4, 5])) as ArrayRef), + ( + "text", + Arc::new(StringArray::from(vec![Some("omega"), Some("lance omega")])) as ArrayRef, + ), + ]) + .unwrap(), + ]; + let schema = batches[0].schema(); + let original = arrow_select::concat::concat_batches(&schema, &batches).unwrap(); + + let reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema.clone()); + let mut ds = Dataset::write( + reader, + test_uri, + Some(WriteParams { + max_rows_per_file: 2, + max_rows_per_group: 2, + ..Default::default() + }), + ) + .await + .unwrap(); + + let params = base_inverted_params(false); + let fragment_ids = ds + .get_fragments() + .iter() + .map(|fragment| fragment.id() as u32) + .collect::>(); + let mut metadatas = Vec::::with_capacity(fragment_ids.len()); + for fragment_id in fragment_ids { + let mut builder = ds + .create_index_builder(&["text"], IndexType::Inverted, ¶ms) + .name("segmented_fts".to_string()) + .fragments(vec![fragment_id]); + metadatas.push(builder.execute_uncommitted().await.unwrap()); + } + let segments = ds + .create_index_segment_builder() + .with_segments(metadatas.clone()) + .build_all() + .await + .unwrap(); + ds.commit_existing_index_segments("segmented_fts", "text", segments) + .await + .unwrap(); + assert!(metadatas.len() >= 2); + assert_eq!( + ds.load_indices_by_name("segmented_fts") + .await + .unwrap() + .len(), + metadatas.len() + ); + + let query = FullTextSearchQuery::new("lance".to_string()) + .with_column("text".to_string()) + .unwrap(); + assert_fts_expected(&original, &ds, query.clone(), None, &[0, 2, 5]).await; + test_fts(&original, &ds, "text", "lance", None, true, false).await; +} + +#[tokio::test] +async fn test_segmented_inverted_phrase_query() { + let test_dir = tempfile::tempdir().unwrap(); + let test_uri = test_dir.path().to_str().unwrap(); + + let batches = vec![ + RecordBatch::try_from_iter(vec![ + ("id", Arc::new(Int32Array::from(vec![0, 1])) as ArrayRef), + ( + "text", + Arc::new(StringArray::from(vec![ + Some("lance database"), + Some("database lance"), + ])) as ArrayRef, + ), + ]) + .unwrap(), + RecordBatch::try_from_iter(vec![ + ("id", Arc::new(Int32Array::from(vec![2, 3])) as ArrayRef), + ( + "text", + Arc::new(StringArray::from(vec![ + Some("lance database query"), + Some("lance and database"), + ])) as ArrayRef, + ), + ]) + .unwrap(), + ]; + let schema = batches[0].schema(); + let original = arrow_select::concat::concat_batches(&schema, &batches).unwrap(); + + let reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema.clone()); + let mut ds = Dataset::write( + reader, + test_uri, + Some(WriteParams { + max_rows_per_file: 2, + max_rows_per_group: 2, + ..Default::default() + }), + ) + .await + .unwrap(); + + let params = base_inverted_params(true); + let fragment_ids = ds + .get_fragments() + .iter() + .map(|fragment| fragment.id() as u32) + .collect::>(); + let mut metadatas = Vec::::with_capacity(fragment_ids.len()); + for fragment_id in fragment_ids { + let mut builder = ds + .create_index_builder(&["text"], IndexType::Inverted, ¶ms) + .name("segmented_phrase_fts".to_string()) + .fragments(vec![fragment_id]); + metadatas.push(builder.execute_uncommitted().await.unwrap()); + } + let segments = ds + .create_index_segment_builder() + .with_segments(metadatas) + .build_all() + .await + .unwrap(); + ds.commit_existing_index_segments("segmented_phrase_fts", "text", segments) + .await + .unwrap(); + + let phrase = + PhraseQuery::new("lance database".to_string()).with_column(Some("text".to_string())); + let query = FullTextSearchQuery::new_query(FtsQuery::Phrase(phrase)); + assert_fts_expected(&original, &ds, query, None, &[0, 2]).await; + test_fts(&original, &ds, "text", "lance database", None, true, true).await; +} + +#[tokio::test] +async fn test_segmented_inverted_match_query_with_unindexed_fragments() { + let test_dir = tempfile::tempdir().unwrap(); + let test_uri = test_dir.path().to_str().unwrap(); + + let initial_batches = vec![ + RecordBatch::try_from_iter(vec![ + ("id", Arc::new(Int32Array::from(vec![0, 1])) as ArrayRef), + ( + "text", + Arc::new(StringArray::from(vec![Some("lance zero"), Some("alpha")])) as ArrayRef, + ), + ]) + .unwrap(), + RecordBatch::try_from_iter(vec![ + ("id", Arc::new(Int32Array::from(vec![2, 3])) as ArrayRef), + ( + "text", + Arc::new(StringArray::from(vec![Some("beta"), Some("lance three")])) as ArrayRef, + ), + ]) + .unwrap(), + ]; + let schema = initial_batches[0].schema(); + let reader = + RecordBatchIterator::new(initial_batches.clone().into_iter().map(Ok), schema.clone()); + let mut ds = Dataset::write( + reader, + test_uri, + Some(WriteParams { + max_rows_per_file: 2, + max_rows_per_group: 2, + ..Default::default() + }), + ) + .await + .unwrap(); + + let params = base_inverted_params(false); + let fragment_ids = ds + .get_fragments() + .iter() + .map(|fragment| fragment.id() as u32) + .collect::>(); + let mut metadatas = Vec::::with_capacity(fragment_ids.len()); + for fragment_id in fragment_ids { + let mut builder = ds + .create_index_builder(&["text"], IndexType::Inverted, ¶ms) + .name("segmented_mixed_fts".to_string()) + .fragments(vec![fragment_id]); + metadatas.push(builder.execute_uncommitted().await.unwrap()); + } + let segments = ds + .create_index_segment_builder() + .with_segments(metadatas) + .build_all() + .await + .unwrap(); + ds.commit_existing_index_segments("segmented_mixed_fts", "text", segments) + .await + .unwrap(); + + let appended = RecordBatch::try_from_iter(vec![ + ("id", Arc::new(Int32Array::from(vec![4, 5])) as ArrayRef), + ( + "text", + Arc::new(StringArray::from(vec![Some("lance four"), Some("omega")])) as ArrayRef, + ), + ]) + .unwrap(); + let appended_reader = RecordBatchIterator::new(vec![Ok(appended.clone())], appended.schema()); + ds.append(appended_reader, None).await.unwrap(); + + let original = arrow_select::concat::concat_batches( + &schema, + &[ + initial_batches[0].clone(), + initial_batches[1].clone(), + appended, + ], + ) + .unwrap(); + let query = FullTextSearchQuery::new("lance".to_string()) + .with_column("text".to_string()) + .unwrap(); + assert_fts_expected(&original, &ds, query.clone(), None, &[0, 3, 4]).await; + test_fts(&original, &ds, "text", "lance", None, true, false).await; +} + #[tokio::test] // Validate filters are applied alongside inverted index search results. async fn test_inverted_with_filter() { From b71d333350af2a1ac271f56f27487909f34bfddf Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Thu, 26 Mar 2026 22:35:32 +0800 Subject: [PATCH 03/16] fix: address segmented inverted index review issues --- rust/lance-index/src/scalar/inverted/index.rs | 28 ++- rust/lance/src/dataset/scanner.rs | 17 +- rust/lance/src/index/create.rs | 227 +++++++++++++++-- rust/lance/src/index/scalar/inverted.rs | 66 ++++- rust/lance/src/io/exec/fts.rs | 229 ++++++++---------- rust/lance/tests/query/inverted.rs | 78 ++++++ 6 files changed, 489 insertions(+), 156 deletions(-) diff --git a/rust/lance-index/src/scalar/inverted/index.rs b/rust/lance-index/src/scalar/inverted/index.rs index 3dceef87c2d..1298cffa09d 100644 --- a/rust/lance-index/src/scalar/inverted/index.rs +++ b/rust/lance-index/src/scalar/inverted/index.rs @@ -8,7 +8,11 @@ use std::{ cmp::{Reverse, min}, collections::BinaryHeap, }; -use std::{collections::HashMap, ops::Range, time::Instant}; +use std::{ + collections::{HashMap, HashSet}, + ops::Range, + time::Instant, +}; use crate::metrics::NoOpMetricsCollector; use crate::prefilter::NoFilter; @@ -462,6 +466,28 @@ impl InvertedIndex { MemBM25Scorer::new(scorer.total_tokens(), scorer.num_docs(), token_docs) } + /// Expand fuzzy query tokens against all partitions in this segment. + pub fn expand_fuzzy_tokens(&self, tokens: &Tokens, params: &FtsSearchParams) -> Result { + let mut expanded_tokens = Vec::new(); + let mut expanded_positions = Vec::new(); + let mut seen = HashSet::new(); + for partition in &self.partitions { + let expanded = partition.expand_fuzzy(tokens, params)?; + for idx in 0..expanded.len() { + let token = expanded.get_token(idx); + if seen.insert(token.to_string()) { + expanded_tokens.push(token.to_string()); + expanded_positions.push(expanded.position(idx)); + } + } + } + Ok(Tokens::with_positions( + expanded_tokens, + expanded_positions, + tokens.token_type().clone(), + )) + } + /// Search documents that match the query and return row ids sorted by BM25 score. /// /// When `base_scorer` is provided, search uses those corpus-level BM25 statistics diff --git a/rust/lance/src/dataset/scanner.rs b/rust/lance/src/dataset/scanner.rs index d38668c49c5..b3b9e3ab09b 100644 --- a/rust/lance/src/dataset/scanner.rs +++ b/rust/lance/src/dataset/scanner.rs @@ -79,13 +79,13 @@ use super::Dataset; use crate::dataset::row_offsets_to_row_addresses; use crate::dataset::utils::SchemaAdapter; use crate::index::DatasetIndexInternalExt; +use crate::index::scalar::inverted::{load_segment_details, load_segments}; use crate::index::vector::utils::{ default_distance_type_for, get_vector_dim, get_vector_type, validate_distance_type_for, }; use crate::io::exec::filtered_read::{FilteredReadExec, FilteredReadOptions}; use crate::io::exec::fts::{ BoostQueryExec, FlatMatchFilterExec, FlatMatchQueryExec, MatchQueryExec, PhraseQueryExec, - load_fts_segment_details, load_fts_segments, }; use crate::io::exec::knn::MultivectorScoringExec; use crate::io::exec::scalar_index::{MaterializeIndexExec, ScalarIndexExec}; @@ -3232,14 +3232,13 @@ impl Scanner { "the column must be specified in the query".to_string(), ))?; - let segments = - load_fts_segments(&self.dataset, &column) - .await? - .ok_or(Error::invalid_input(format!( - "No Inverted index found for column {}", - column - )))?; - let details = load_fts_segment_details(&self.dataset, &column, &segments).await?; + let segments = load_segments(&self.dataset, &column) + .await? + .ok_or(Error::invalid_input(format!( + "No Inverted index found for column {}", + column + )))?; + let details = load_segment_details(&self.dataset, &column, &segments).await?; if !details.with_position { return Err(Error::invalid_input("position is not found but required for phrase queries, try recreating the index with position" diff --git a/rust/lance/src/index/create.rs b/rust/lance/src/index/create.rs index 34ba1012657..ca352940764 100644 --- a/rust/lance/src/index/create.rs +++ b/rust/lance/src/index/create.rs @@ -26,7 +26,11 @@ use lance_index::{ scalar::{LANCE_SCALAR_INDEX, ScalarIndexParams, inverted::tokenizer::InvertedIndexParams}, }; use lance_table::format::{IndexMetadata, list_index_files_with_sizes}; -use std::{collections::HashMap, future::IntoFuture, sync::Arc}; +use std::{ + collections::{HashMap, HashSet}, + future::IntoFuture, + sync::Arc, +}; use tracing::instrument; use uuid::Uuid; @@ -611,23 +615,41 @@ impl<'a> IndexSegmentBuilder<'a> { )); } - let index_details = self.segments[0] - .index_details - .as_ref() - .ok_or_else(|| { + let infer_index_type = |segment: &IndexMetadata| -> Result { + let index_details = segment.index_details.as_ref().ok_or_else(|| { Error::invalid_input("input segment is missing index details".to_string()) - })? - .clone(); - let details = IndexDetails(index_details); - let index_type = if details.supports_fts() { - IndexType::Inverted - } else if details.is_vector() { - IndexType::Vector - } else { - return Err(Error::invalid_input( - "IndexSegmentBuilder only supports vector and FTS segments".to_string(), - )); + })?; + let details = IndexDetails(index_details.clone()); + if details.supports_fts() { + Ok(IndexType::Inverted) + } else if details.is_vector() { + Ok(IndexType::Vector) + } else { + Err(Error::invalid_input( + "IndexSegmentBuilder only supports vector and FTS segments".to_string(), + )) + } }; + + let index_type = infer_index_type(&self.segments[0])?; + let mut seen_segment_ids = HashSet::with_capacity(self.segments.len()); + for segment in &self.segments { + if !seen_segment_ids.insert(segment.uuid) { + return Err(Error::invalid_input(format!( + "IndexSegmentBuilder received duplicate segment uuid {}", + segment.uuid + ))); + } + let segment_index_type = infer_index_type(segment)?; + if segment_index_type != index_type { + return Err(Error::invalid_input(format!( + "IndexSegmentBuilder requires all input segments to have the same index type; \ + expected {}, got {} for segment {}", + index_type, segment_index_type, segment.uuid + ))); + } + } + match index_type { IndexType::Inverted => crate::index::scalar::inverted::plan_segments( &self.segments, @@ -636,7 +658,7 @@ impl<'a> IndexSegmentBuilder<'a> { IndexType::Vector => { crate::index::vector::ivf::plan_segments( &self.segments, - None, + Some(IndexType::Vector), self.target_segment_bytes, ) .await @@ -650,7 +672,12 @@ impl<'a> IndexSegmentBuilder<'a> { /// Build one segment from a previously-generated plan. pub async fn build(&self, plan: &IndexSegmentPlan) -> Result { - match plan.requested_index_type().unwrap_or(IndexType::Vector) { + match plan.requested_index_type().ok_or_else(|| { + Error::invalid_input( + "IndexSegmentBuilder requires planned segments to declare an index type" + .to_string(), + ) + })? { IndexType::Inverted => { crate::index::scalar::inverted::build_segment(self.dataset, plan).await } @@ -1405,6 +1432,170 @@ mod tests { assert_eq!(indices.len(), input_segments.len()); } + #[tokio::test] + async fn test_index_segment_builder_rejects_duplicate_segment_uuids() { + let tmpdir = TempStrDir::default(); + let dataset_uri = format!("file://{}", tmpdir.as_str()); + + let batches = RecordBatchIterator::new( + vec![Ok(create_text_batch(0, 10))], + create_text_batch(0, 1).schema(), + ); + let mut dataset = Dataset::write( + batches, + &dataset_uri, + Some(WriteParams { + max_rows_per_file: 10, + mode: WriteMode::Overwrite, + ..Default::default() + }), + ) + .await + .unwrap(); + + let params = InvertedIndexParams::default(); + let segment = + CreateIndexBuilder::new(&mut dataset, &["text"], IndexType::Inverted, ¶ms) + .name("text_idx".to_string()) + .fragments(vec![0]) + .execute_uncommitted() + .await + .unwrap(); + + let err = dataset + .create_index_segment_builder() + .with_segments(vec![segment.clone(), segment]) + .build_all() + .await + .unwrap_err(); + assert!( + err.to_string().contains("duplicate segment uuid"), + "unexpected error: {err}" + ); + } + + #[tokio::test] + async fn test_index_segment_builder_rejects_mixed_index_types() { + let text_tmpdir = TempStrDir::default(); + let text_dataset_uri = format!("file://{}", text_tmpdir.as_str()); + let text_batches = RecordBatchIterator::new( + vec![Ok(create_text_batch(0, 10))], + create_text_batch(0, 1).schema(), + ); + let mut text_dataset = Dataset::write( + text_batches, + &text_dataset_uri, + Some(WriteParams { + max_rows_per_file: 10, + mode: WriteMode::Overwrite, + ..Default::default() + }), + ) + .await + .unwrap(); + let inverted_params = InvertedIndexParams::default(); + let text_segment = CreateIndexBuilder::new( + &mut text_dataset, + &["text"], + IndexType::Inverted, + &inverted_params, + ) + .name("text_idx".to_string()) + .fragments(vec![0]) + .execute_uncommitted() + .await + .unwrap(); + + let vector_tmpdir = TempStrDir::default(); + let vector_dataset_uri = format!("file://{}", vector_tmpdir.as_str()); + let reader = gen_batch() + .col("id", lance_datagen::array::step::()) + .col( + "vector", + lance_datagen::array::rand_vec::(lance_datagen::Dimension::from(8)), + ) + .into_reader_rows( + lance_datagen::RowCount::from(64), + lance_datagen::BatchCount::from(1), + ); + let mut vector_dataset = Dataset::write( + reader, + &vector_dataset_uri, + Some(WriteParams { + max_rows_per_file: 64, + mode: WriteMode::Overwrite, + ..Default::default() + }), + ) + .await + .unwrap(); + let vector_params = VectorIndexParams::with_ivf_flat_params( + DistanceType::L2, + prepare_vector_ivf(&vector_dataset, "vector").await, + ); + let vector_segment = CreateIndexBuilder::new( + &mut vector_dataset, + &["vector"], + IndexType::Vector, + &vector_params, + ) + .name("vector_idx".to_string()) + .fragments(vec![0]) + .execute_uncommitted() + .await + .unwrap(); + + let err = text_dataset + .create_index_segment_builder() + .with_segments(vec![text_segment, vector_segment]) + .plan() + .await + .unwrap_err(); + assert!( + err.to_string().contains("same index type"), + "unexpected error: {err}" + ); + } + + #[tokio::test] + async fn test_index_segment_builder_requires_requested_index_type() { + let tmpdir = TempStrDir::default(); + let dataset_uri = format!("file://{}", tmpdir.as_str()); + + let batches = RecordBatchIterator::new( + vec![Ok(create_text_batch(0, 10))], + create_text_batch(0, 1).schema(), + ); + let dataset = Dataset::write( + batches, + &dataset_uri, + Some(WriteParams { + max_rows_per_file: 10, + mode: WriteMode::Overwrite, + ..Default::default() + }), + ) + .await + .unwrap(); + + let segment = IndexSegment::new( + Uuid::new_v4(), + [0_u32], + Arc::new(prost_types::Any::default()), + 0, + ); + let plan = IndexSegmentPlan::new(segment, Vec::new(), 0, None); + let err = dataset + .create_index_segment_builder() + .build(&plan) + .await + .unwrap_err(); + assert!( + err.to_string().contains("declare an index type"), + "unexpected error: {err}" + ); + } + #[tokio::test] async fn test_commit_existing_index_supports_local_hnsw_segments() { let tmpdir = TempStrDir::default(); diff --git a/rust/lance/src/index/scalar/inverted.rs b/rust/lance/src/index/scalar/inverted.rs index 4b0e671575f..9519aeb478a 100644 --- a/rust/lance/src/index/scalar/inverted.rs +++ b/rust/lance/src/index/scalar/inverted.rs @@ -3,13 +3,14 @@ use std::sync::Arc; +use lance_index::pbold::InvertedIndexDetails; use lance_index::{IndexType, scalar::lance_format::LanceIndexStore}; use lance_table::format::IndexMetadata; use crate::{ Dataset, Error, Result, dataset::index::LanceIndexStoreExt, - index::{IndexSegment, IndexSegmentPlan}, + index::{DatasetIndexExt, IndexSegment, IndexSegmentPlan, scalar::fetch_index_details}, }; /// Plan physical segments for staged inverted-index outputs. @@ -108,3 +109,66 @@ pub(crate) async fn build_segment( .await?; Ok(built_segment) } + +/// Load all committed inverted-index segments that belong to the same named index. +pub(crate) async fn load_segments( + dataset: &Dataset, + column: &str, +) -> Result>> { + let Some(index_meta) = dataset + .load_scalar_index( + lance_index::IndexCriteria::default() + .for_column(column) + .supports_fts(), + ) + .await? + else { + return Ok(None); + }; + + let indices = dataset.load_indices_by_name(&index_meta.name).await?; + if indices.is_empty() { + return Ok(None); + } + + let expected_fields = indices[0].fields.clone(); + for meta in &indices { + if meta.fields != expected_fields { + return Err(Error::invalid_input(format!( + "FTS index {} has inconsistent fields across segments", + index_meta.name + ))); + } + } + + Ok(Some(indices)) +} + +/// Load and validate the shared inverted-index details across committed segments. +pub(crate) async fn load_segment_details( + dataset: &Dataset, + column: &str, + segments: &[IndexMetadata], +) -> Result { + let mut expected_details: Option = None; + for meta in segments { + let details_any = fetch_index_details(dataset, column, meta).await?; + let details = details_any.as_ref().to_msg::()?; + match &expected_details { + Some(expected) if expected != &details => { + return Err(Error::invalid_input(format!( + "FTS index {} has inconsistent inverted index details across segments", + meta.name + ))); + } + Some(_) => {} + None => expected_details = Some(details), + } + } + expected_details.ok_or_else(|| { + Error::invalid_input(format!( + "FTS index for column {} requires at least one segment", + column + )) + }) +} diff --git a/rust/lance/src/io/exec/fts.rs b/rust/lance/src/io/exec/fts.rs index dc898ec5b50..64d9d0155cf 100644 --- a/rust/lance/src/io/exec/fts.rs +++ b/rust/lance/src/io/exec/fts.rs @@ -1,7 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::sync::Arc; use arrow::array::{AsArray, BooleanBuilder}; @@ -17,6 +17,7 @@ use datafusion::physical_plan::stream::RecordBatchStreamAdapter; use datafusion::physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties}; use datafusion_physical_expr::{Distribution, EquivalenceProperties, Partitioning}; use datafusion_physical_plan::metrics::{BaselineMetrics, Count}; +use futures::future::try_join_all; use futures::stream::{self}; use futures::{Stream, StreamExt, TryStreamExt}; use itertools::Itertools; @@ -26,12 +27,9 @@ use lance_table::format::IndexMetadata; use super::PreFilterSource; use super::utils::{IndexMetrics, InstrumentedRecordBatchStreamAdapter, build_prefilter}; -use crate::index::DatasetIndexExt; -use crate::index::scalar::fetch_index_details; +use crate::index::scalar::inverted::{load_segment_details, load_segments}; use crate::{Dataset, index::DatasetIndexInternalExt}; -use lance_index::IndexCriteria; use lance_index::metrics::MetricsCollector; -use lance_index::pbold::InvertedIndexDetails; use lance_index::scalar::inverted::builder::ScoredDoc; use lance_index::scalar::inverted::builder::document_input; use lance_index::scalar::inverted::lance_tokenizer::{DocType, JsonTokenizer, LanceTokenizer}; @@ -41,70 +39,11 @@ use lance_index::scalar::inverted::query::{ }; use lance_index::scalar::inverted::tokenizer::lance_tokenizer::TextTokenizer; use lance_index::scalar::inverted::{ - FTS_SCHEMA, InvertedIndex, SCORE_COL, flat_bm25_search_stream, + FTS_SCHEMA, InvertedIndex, MemBM25Scorer, SCORE_COL, flat_bm25_search_stream, }; use lance_index::{prefilter::PreFilter, scalar::inverted::query::BooleanQuery}; use tracing::instrument; -/// Load all FTS segments that belong to the same named index for a column. -pub(crate) async fn load_fts_segments( - dataset: &Dataset, - column: &str, -) -> Result>> { - let Some(index_meta) = dataset - .load_scalar_index(IndexCriteria::default().for_column(column).supports_fts()) - .await? - else { - return Ok(None); - }; - - let indices = dataset.load_indices_by_name(&index_meta.name).await?; - if indices.is_empty() { - return Ok(None); - } - - let expected_fields = indices[0].fields.clone(); - for meta in &indices { - if meta.fields != expected_fields { - return Err(Error::invalid_input(format!( - "FTS index {} has inconsistent fields across segments", - index_meta.name - ))); - } - } - - Ok(Some(indices)) -} - -/// Load and validate the shared FTS details across a set of segments. -pub(crate) async fn load_fts_segment_details( - dataset: &Dataset, - column: &str, - segments: &[IndexMetadata], -) -> Result { - let mut expected_details: Option = None; - for meta in segments { - let details_any = fetch_index_details(dataset, column, meta).await?; - let details = details_any.as_ref().to_msg::()?; - match &expected_details { - Some(expected) if expected != &details => { - return Err(Error::invalid_input(format!( - "FTS index {} has inconsistent inverted index details across segments", - meta.name - ))); - } - Some(_) => {} - None => expected_details = Some(details), - } - } - expected_details.ok_or_else(|| { - Error::invalid_input(format!( - "FTS index for column {} requires at least one segment", - column - )) - }) -} - /// Open one FTS segment as an [`InvertedIndex`]. async fn open_fts_segment( dataset: &Dataset, @@ -126,6 +65,77 @@ async fn open_fts_segment( Ok(Arc::new(inverted.clone())) } +/// Open all committed FTS segments for a column. +/// +/// Exact multi-segment BM25 still needs every segment's local corpus statistics, so the +/// current correctness-first path opens each committed segment before scoring. +async fn open_fts_segments( + dataset: &Dataset, + column: &str, + segments: &[IndexMetadata], + metrics: &IndexMetrics, +) -> Result>> { + try_join_all( + segments + .iter() + .map(|segment| open_fts_segment(dataset, column, segment, metrics)), + ) + .await +} + +/// Expand fuzzy query tokens across all segments so the shared BM25 scorer sees every term +/// that any segment-local search may score. +fn scorer_tokens( + indices: &[Arc], + query_tokens: &Tokens, + params: &FtsSearchParams, +) -> Result { + if !matches!(params.fuzziness, Some(n) if n != 0) { + return Ok(query_tokens.clone()); + } + + let mut tokens = Vec::new(); + let mut positions = Vec::new(); + let mut seen = HashSet::new(); + for index in indices { + let expanded = index.expand_fuzzy_tokens(query_tokens, params)?; + for idx in 0..expanded.len() { + let token = expanded.get_token(idx); + if seen.insert(token.to_string()) { + tokens.push(token.to_string()); + positions.push(expanded.position(idx)); + } + } + } + Ok(Tokens::with_positions( + tokens, + positions, + query_tokens.token_type().clone(), + )) +} + +/// Build a shared BM25 scorer for a set of committed FTS segments. +fn build_global_bm25_scorer( + indices: &[Arc], + query_tokens: &Tokens, + params: &FtsSearchParams, +) -> Result { + let scorer_tokens = scorer_tokens(indices, query_tokens, params)?; + let first_index = indices.first().ok_or_else(|| { + Error::invalid_input("FTS index requires at least one segment".to_string()) + })?; + let mut base_scorer = first_index.bm25_base_scorer(&scorer_tokens); + for index in indices.iter().skip(1) { + let segment_scorer = index.bm25_base_scorer(&scorer_tokens); + base_scorer.total_tokens += segment_scorer.total_tokens; + base_scorer.num_docs += segment_scorer.num_docs; + for (token, count) in segment_scorer.token_docs { + *base_scorer.token_docs.entry(token).or_insert(0) += count; + } + } + Ok(base_scorer) +} + /// Fall back to the default simple tokenizer when no on-disk FTS segment exists. fn default_text_tokenizer() -> Box { Box::new(TextTokenizer::new( @@ -323,19 +333,15 @@ impl ExecutionPlan for MatchQueryExec { )))?; let stream = stream::once(async move { let _timer = metrics.baseline_metrics.elapsed_compute().timer(); - let segments = - load_fts_segments(&ds, &column) - .await? - .ok_or(DataFusionError::Execution(format!( - "No Inverted index found for column {}", - column, - )))?; - let _details = load_fts_segment_details(&ds, &column, &segments).await?; - let mut indices = Vec::with_capacity(segments.len()); - for segment in &segments { - indices - .push(open_fts_segment(&ds, &column, segment, &metrics.index_metrics).await?); - } + let segments = load_segments(&ds, &column) + .await? + .ok_or(DataFusionError::Execution(format!( + "No Inverted index found for column {}", + column, + )))?; + let _details = load_segment_details(&ds, &column, &segments).await?; + let indices = + open_fts_segments(&ds, &column, &segments, &metrics.index_metrics).await?; let mut pre_filter = build_prefilter(context.clone(), partition, &prefilter_source, ds, &segments)?; @@ -380,15 +386,7 @@ impl ExecutionPlan for MatchQueryExec { } }; let tokens = collect_query_tokens(&query.terms, &mut tokenizer); - let mut base_scorer = first_index.bm25_base_scorer(&tokens); - for index in indices.iter().skip(1) { - let segment_scorer = index.bm25_base_scorer(&tokens); - base_scorer.total_tokens += segment_scorer.total_tokens; - base_scorer.num_docs += segment_scorer.num_docs; - for (token, count) in segment_scorer.token_docs { - *base_scorer.token_docs.entry(token).or_insert(0) += count; - } - } + let base_scorer = build_global_bm25_scorer(&indices, &tokens, ¶ms)?; pre_filter.wait_for_ready().await?; let tokens = Arc::new(tokens); @@ -498,7 +496,7 @@ impl FlatMatchFilterExec { column: &str, metrics: &IndexMetrics, ) -> DataFusionResult> { - if let Some(segments) = load_fts_segments(dataset, column).await? { + if let Some(segments) = load_segments(dataset, column).await? { let index_meta = segments.first().ok_or_else(|| { DataFusionError::Execution(format!( "FTS index for column {} has no segments", @@ -784,16 +782,12 @@ impl ExecutionPlan for FlatMatchQueryExec { document_input(self.unindexed_input.execute(partition, context)?, &column)?; let stream = stream::once(async move { - let segments = load_fts_segments(&ds, &column).await?; + let segments = load_segments(&ds, &column).await?; let (tokenizer, base_scorer) = match segments { Some(segments) => { - let _details = load_fts_segment_details(&ds, &column, &segments).await?; - let mut indices = Vec::with_capacity(segments.len()); - for segment in &segments { - indices.push( - open_fts_segment(&ds, &column, segment, &metrics.index_metrics).await?, - ); - } + let _details = load_segment_details(&ds, &column, &segments).await?; + let indices = + open_fts_segments(&ds, &column, &segments, &metrics.index_metrics).await?; metrics.record_parts_searched( indices.iter().map(|index| index.partition_count()).sum(), ); @@ -802,15 +796,8 @@ impl ExecutionPlan for FlatMatchQueryExec { ))?; let mut tokenizer = first_index.tokenizer(); let query_tokens = collect_query_tokens(&query.terms, &mut tokenizer); - let mut base_scorer = first_index.bm25_base_scorer(&query_tokens); - for index in indices.iter().skip(1) { - let segment_scorer = index.bm25_base_scorer(&query_tokens); - base_scorer.total_tokens += segment_scorer.total_tokens; - base_scorer.num_docs += segment_scorer.num_docs; - for (token, count) in segment_scorer.token_docs { - *base_scorer.token_docs.entry(token).or_insert(0) += count; - } - } + let base_scorer = + build_global_bm25_scorer(&indices, &query_tokens, &FtsSearchParams::new())?; (tokenizer, Some(base_scorer)) } None => (default_text_tokenizer(), None), @@ -1007,19 +994,15 @@ impl ExecutionPlan for PhraseQueryExec { "column not set for PhraseQuery {}", query.terms )))?; - let segments = - load_fts_segments(&ds, &column) - .await? - .ok_or(DataFusionError::Execution(format!( - "No Inverted index found for column {}", - column, - )))?; - let _details = load_fts_segment_details(&ds, &column, &segments).await?; - let mut indices = Vec::with_capacity(segments.len()); - for segment in &segments { - indices - .push(open_fts_segment(&ds, &column, segment, &metrics.index_metrics).await?); - } + let segments = load_segments(&ds, &column) + .await? + .ok_or(DataFusionError::Execution(format!( + "No Inverted index found for column {}", + column, + )))?; + let _details = load_segment_details(&ds, &column, &segments).await?; + let indices = + open_fts_segments(&ds, &column, &segments, &metrics.index_metrics).await?; let mut pre_filter = build_prefilter(context.clone(), partition, &prefilter_source, ds, &segments)?; @@ -1044,15 +1027,7 @@ impl ExecutionPlan for PhraseQueryExec { )))?; let mut tokenizer = first_index.tokenizer(); let tokens = collect_query_tokens(&query.terms, &mut tokenizer); - let mut base_scorer = first_index.bm25_base_scorer(&tokens); - for index in indices.iter().skip(1) { - let segment_scorer = index.bm25_base_scorer(&tokens); - base_scorer.total_tokens += segment_scorer.total_tokens; - base_scorer.num_docs += segment_scorer.num_docs; - for (token, count) in segment_scorer.token_docs { - *base_scorer.token_docs.entry(token).or_insert(0) += count; - } - } + let base_scorer = build_global_bm25_scorer(&indices, &tokens, ¶ms)?; pre_filter.wait_for_ready().await?; let tokens = Arc::new(tokens); diff --git a/rust/lance/tests/query/inverted.rs b/rust/lance/tests/query/inverted.rs index f209e6f6c28..3a1e475e39a 100644 --- a/rust/lance/tests/query/inverted.rs +++ b/rust/lance/tests/query/inverted.rs @@ -3,6 +3,7 @@ use std::sync::Arc; +use arrow_array::cast::AsArray; use arrow_array::{ ArrayRef, Int32Array, RecordBatch, RecordBatchIterator, StringArray, UInt32Array, }; @@ -235,6 +236,83 @@ async fn test_segmented_inverted_match_query() { test_fts(&original, &ds, "text", "lance", None, true, false).await; } +#[tokio::test] +async fn test_segmented_inverted_fuzzy_match_uses_global_idf() { + let test_dir = tempfile::tempdir().unwrap(); + let test_uri = test_dir.path().to_str().unwrap(); + + let batches = vec![ + RecordBatch::try_from_iter(vec![ + ("id", Arc::new(Int32Array::from(vec![0])) as ArrayRef), + ( + "text", + Arc::new(StringArray::from(vec![Some("lance")])) as ArrayRef, + ), + ]) + .unwrap(), + RecordBatch::try_from_iter(vec![ + ("id", Arc::new(Int32Array::from(vec![1])) as ArrayRef), + ( + "text", + Arc::new(StringArray::from(vec![Some("lance lance lance")])) as ArrayRef, + ), + ]) + .unwrap(), + ]; + let schema = batches[0].schema(); + let reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema); + let mut ds = Dataset::write( + reader, + test_uri, + Some(WriteParams { + max_rows_per_file: 1, + max_rows_per_group: 1, + ..Default::default() + }), + ) + .await + .unwrap(); + + let params = base_inverted_params(false); + let fragment_ids = ds + .get_fragments() + .iter() + .map(|fragment| fragment.id() as u32) + .collect::>(); + let mut metadatas = Vec::::with_capacity(fragment_ids.len()); + for fragment_id in fragment_ids { + let mut builder = ds + .create_index_builder(&["text"], IndexType::Inverted, ¶ms) + .name("segmented_fuzzy".to_string()) + .fragments(vec![fragment_id]); + metadatas.push(builder.execute_uncommitted().await.unwrap()); + } + let segments = ds + .create_index_segment_builder() + .with_segments(metadatas) + .build_all() + .await + .unwrap(); + ds.commit_existing_index_segments("segmented_fuzzy", "text", segments) + .await + .unwrap(); + + let batch = ds + .scan() + .full_text_search( + FullTextSearchQuery::new_fuzzy("lnce".to_string(), Some(1)) + .with_column("text".to_string()) + .unwrap() + .limit(Some(1)), + ) + .unwrap() + .try_into_batch() + .await + .unwrap(); + let ids = batch["id"].as_primitive::(); + assert_eq!(ids.values(), &[1]); +} + #[tokio::test] async fn test_segmented_inverted_phrase_query() { let test_dir = tempfile::tempdir().unwrap(); From f69eb663e7f3288f6851cc574fd5f745140c64c4 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Thu, 26 Mar 2026 22:39:30 +0800 Subject: [PATCH 04/16] fix: update inverted bench for shared bm25 scorer --- rust/lance-index/benches/inverted.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rust/lance-index/benches/inverted.rs b/rust/lance-index/benches/inverted.rs index bce3fac4414..a51822f0695 100644 --- a/rust/lance-index/benches/inverted.rs +++ b/rust/lance-index/benches/inverted.rs @@ -188,6 +188,7 @@ fn bench_inverted(c: &mut Criterion) { Operator::Or, no_filter.clone(), Arc::new(NoOpMetricsCollector), + None, ) .await .unwrap(), @@ -226,6 +227,7 @@ fn bench_inverted(c: &mut Criterion) { Operator::And, no_filter.clone(), Arc::new(NoOpMetricsCollector), + None, ) .await .unwrap(), From ea80a5bf14b3b3b6b456d139df14c6f4bb6ae990 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Thu, 26 Mar 2026 22:46:38 +0800 Subject: [PATCH 05/16] fix: satisfy clippy visibility checks --- rust/lance/src/index/scalar/inverted.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rust/lance/src/index/scalar/inverted.rs b/rust/lance/src/index/scalar/inverted.rs index 9519aeb478a..e23ce5db364 100644 --- a/rust/lance/src/index/scalar/inverted.rs +++ b/rust/lance/src/index/scalar/inverted.rs @@ -1,6 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors +#![allow(clippy::redundant_pub_crate)] + use std::sync::Arc; use lance_index::pbold::InvertedIndexDetails; From 28e963b485217f62d76a5bb58e34b0f451babc71 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Thu, 26 Mar 2026 23:18:31 +0800 Subject: [PATCH 06/16] fix: restore vector segment builder compatibility --- rust/lance/src/index/create.rs | 111 ++++++++++++++++++++++++++++----- 1 file changed, 95 insertions(+), 16 deletions(-) diff --git a/rust/lance/src/index/create.rs b/rust/lance/src/index/create.rs index ca352940764..9d1d770b341 100644 --- a/rust/lance/src/index/create.rs +++ b/rust/lance/src/index/create.rs @@ -616,18 +616,30 @@ impl<'a> IndexSegmentBuilder<'a> { } let infer_index_type = |segment: &IndexMetadata| -> Result { - let index_details = segment.index_details.as_ref().ok_or_else(|| { - Error::invalid_input("input segment is missing index details".to_string()) - })?; - let details = IndexDetails(index_details.clone()); - if details.supports_fts() { + if let Some(index_details) = segment.index_details.as_ref() { + let details = IndexDetails(index_details.clone()); + if details.supports_fts() { + return Ok(IndexType::Inverted); + } + if details.is_vector() { + return Ok(IndexType::Vector); + } + } + + if segment.files.as_ref().is_some_and(|files| { + files.iter().any(|file| { + file.path == lance_index::scalar::inverted::METADATA_FILE + || file.path.starts_with("part_") + }) + }) { Ok(IndexType::Inverted) - } else if details.is_vector() { + } else if segment.files.is_some() { Ok(IndexType::Vector) } else { - Err(Error::invalid_input( - "IndexSegmentBuilder only supports vector and FTS segments".to_string(), - )) + Err(Error::invalid_input(format!( + "IndexSegmentBuilder could not infer the index type for segment {}", + segment.uuid + ))) } }; @@ -658,7 +670,7 @@ impl<'a> IndexSegmentBuilder<'a> { IndexType::Vector => { crate::index::vector::ivf::plan_segments( &self.segments, - Some(IndexType::Vector), + None, self.target_segment_bytes, ) .await @@ -672,12 +684,23 @@ impl<'a> IndexSegmentBuilder<'a> { /// Build one segment from a previously-generated plan. pub async fn build(&self, plan: &IndexSegmentPlan) -> Result { - match plan.requested_index_type().ok_or_else(|| { - Error::invalid_input( - "IndexSegmentBuilder requires planned segments to declare an index type" - .to_string(), - ) - })? { + let index_type = if let Some(index_type) = plan.requested_index_type() { + index_type + } else { + let details = IndexDetails(plan.segment().index_details().clone()); + if details.supports_fts() { + IndexType::Inverted + } else if details.is_vector() { + IndexType::Vector + } else { + return Err(Error::invalid_input( + "IndexSegmentBuilder requires planned segments to declare an index type" + .to_string(), + )); + } + }; + + match index_type { IndexType::Inverted => { crate::index::scalar::inverted::build_segment(self.dataset, plan).await } @@ -1368,6 +1391,62 @@ mod tests { assert!(result.num_rows() > 0); } + #[tokio::test] + async fn test_index_segment_builder_vector_segments_without_index_details() { + let tmpdir = TempStrDir::default(); + let dataset_uri = format!("file://{}", tmpdir.as_str()); + + let reader = gen_batch() + .col("id", lance_datagen::array::step::()) + .col( + "vector", + lance_datagen::array::rand_vec::(lance_datagen::Dimension::from(16)), + ) + .into_reader_rows( + lance_datagen::RowCount::from(256), + lance_datagen::BatchCount::from(4), + ); + let mut dataset = Dataset::write( + reader, + &dataset_uri, + Some(WriteParams { + max_rows_per_file: 64, + mode: WriteMode::Overwrite, + ..Default::default() + }), + ) + .await + .unwrap(); + + let fragments = dataset.get_fragments(); + assert!(fragments.len() >= 2); + let params = VectorIndexParams::with_ivf_flat_params( + DistanceType::L2, + prepare_vector_ivf(&dataset, "vector").await, + ); + let mut input_segments = Vec::new(); + + for fragment in fragments.iter().take(2) { + let mut segment = + CreateIndexBuilder::new(&mut dataset, &["vector"], IndexType::Vector, ¶ms) + .name("vector_idx".to_string()) + .fragments(vec![fragment.id() as u32]) + .execute_uncommitted() + .await + .unwrap(); + segment.index_details = None; + input_segments.push(segment); + } + + let segments = dataset + .create_index_segment_builder() + .with_segments(input_segments) + .build_all() + .await + .unwrap(); + assert_eq!(segments.len(), 2); + } + #[tokio::test] async fn test_index_segment_builder_fts_commits_multi_segment_logical_index() { let tmpdir = TempStrDir::default(); From f9a7c71428cd8d12e7446c0b8d39d6790d062d07 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Fri, 27 Mar 2026 00:00:20 +0800 Subject: [PATCH 07/16] fix: require explicit index type for segment builder --- python/python/lance/dataset.py | 11 +- python/python/lance/lance/__init__.pyi | 9 +- python/python/tests/test_vector_index.py | 41 ++++- python/src/dataset.rs | 29 ++++ rust/lance/src/index/api.rs | 3 +- rust/lance/src/index/create.rs | 212 ++++++++++------------- rust/lance/tests/query/inverted.rs | 4 + 7 files changed, 174 insertions(+), 135 deletions(-) diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index fd2db9de351..ba14e24b799 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -3260,7 +3260,7 @@ def create_index( This enables distributed/fragment-level indexing. When provided, the method creates one segment but does not commit the index to the dataset. The returned metadata can be passed to - ``create_index_segment_builder().with_segments(...)`` + ``create_index_segment_builder().with_index_type(...).with_segments(...)`` and then committed with ``commit_existing_index_segments(...)``. index_uuid : str, optional A UUID to use for the segment written by this call. @@ -3439,8 +3439,10 @@ def create_index_uncommitted( 1. run :meth:`create_index_uncommitted` on each worker with that worker's assigned ``fragment_ids`` 2. collect the returned :class:`Index` objects - 3. pass them to :meth:`IndexSegmentBuilder.with_segments` - 4. build one or more physical segments and commit them with + 3. call :meth:`IndexSegmentBuilder.with_index_type` with the concrete + distributed index type + 4. pass them to :meth:`IndexSegmentBuilder.with_segments` + 5. build one or more physical segments and commit them with :meth:`commit_existing_index_segments` Parameters are the same as :meth:`create_index`, with one additional @@ -3558,7 +3560,8 @@ def create_index_segment_builder(self): Provide the segment metadata returned by :meth:`create_index_uncommitted` through - :meth:`IndexSegmentBuilder.with_segments`. + :meth:`IndexSegmentBuilder.with_segments`, and declare the segment type + with :meth:`IndexSegmentBuilder.with_index_type`. """ return self._ds.create_index_segment_builder() diff --git a/python/python/lance/lance/__init__.pyi b/python/python/lance/lance/__init__.pyi index e2f70a853a1..5c4c587c050 100644 --- a/python/python/lance/lance/__init__.pyi +++ b/python/python/lance/lance/__init__.pyi @@ -188,9 +188,8 @@ class _Session: def size_bytes(self) -> int: ... class IndexSegmentBuilder: - @property - def staging_index_uuid(self) -> str: ... - def with_partial_indices(self, partial_indices: List[Index]) -> Self: ... + def with_index_type(self, index_type: str) -> Self: ... + def with_segments(self, segments: List[Index]) -> Self: ... def with_target_segment_bytes(self, bytes: int) -> Self: ... def plan(self) -> List[IndexSegmentPlan]: ... def build(self, plan: IndexSegmentPlan) -> IndexSegment: ... @@ -371,9 +370,7 @@ class _Dataset: def merge_index_metadata( self, index_uuid: str, index_type: str, batch_readhead: Optional[int] = None ): ... - def create_index_segment_builder( - self, staging_index_uuid: str - ) -> IndexSegmentBuilder: ... + def create_index_segment_builder(self) -> IndexSegmentBuilder: ... def commit_existing_index_segments( self, index_name: str, column: str, segments: List[IndexSegment] ) -> None: ... diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index 2451f4684b4..eb9799d9daf 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -2133,7 +2133,10 @@ def build_distributed_vector_index( ) segments = ( - dataset.create_index_segment_builder().with_segments(segments).build_all() + dataset.create_index_segment_builder() + .with_index_type(index_type) + .with_segments(segments) + .build_all() ) return dataset.commit_existing_index_segments(f"{column}_idx", column, segments) @@ -2506,7 +2509,12 @@ def test_metadata_merge_pq_success(tmp_path): ivf_centroids=pre["ivf_centroids"], pq_codebook=pre["pq_codebook"], ) - segments = ds.create_index_segment_builder().with_segments(segments).build_all() + segments = ( + ds.create_index_segment_builder() + .with_index_type("IVF_PQ") + .with_segments(segments) + .build_all() + ) ds = _commit_segments_helper(ds, segments, "vector") q = np.random.rand(128).astype(np.float32) results = ds.to_table(nearest={"column": "vector", "q": q, "k": 10}) @@ -2545,7 +2553,12 @@ def test_distributed_workflow_merge_and_search(tmp_path): ivf_centroids=pre["ivf_centroids"], pq_codebook=pre["pq_codebook"], ) - segments = ds.create_index_segment_builder().with_segments(segments).build_all() + segments = ( + ds.create_index_segment_builder() + .with_index_type("IVF_PQ") + .with_segments(segments) + .build_all() + ) ds = _commit_segments_helper(ds, segments, "vector") q = np.random.rand(128).astype(np.float32) results = ds.to_table(nearest={"column": "vector", "q": q, "k": 10}) @@ -2581,7 +2594,12 @@ def test_vector_merge_two_shards_success_flat(tmp_path): ivf_centroids=preprocessed["ivf_centroids"], pq_codebook=preprocessed["pq_codebook"], ) - segments = ds.create_index_segment_builder().with_segments(segments).build_all() + segments = ( + ds.create_index_segment_builder() + .with_index_type("IVF_FLAT") + .with_segments(segments) + .build_all() + ) ds = _commit_segments_helper(ds, segments, column="vector") q = np.random.rand(128).astype(np.float32) result = ds.to_table(nearest={"column": "vector", "q": q, "k": 5}) @@ -2634,7 +2652,12 @@ def test_distributed_ivf_parameterized(tmp_path, index_type, num_sub_vectors): ds.create_index_uncommitted(**kwargs1), ds.create_index_uncommitted(**kwargs2), ] - segments = ds.create_index_segment_builder().with_segments(segments).build_all() + segments = ( + ds.create_index_segment_builder() + .with_index_type(index_type) + .with_segments(segments) + .build_all() + ) ds = _commit_segments_helper(ds, segments, "vector") q = np.random.rand(128).astype(np.float32) @@ -2697,6 +2720,7 @@ def test_merge_two_shards_parameterized(tmp_path, index_type, num_sub_vectors): segments = ( ds.create_index_segment_builder() + .with_index_type(index_type) .with_segments([segment1, segment2]) .build_all() ) @@ -2735,7 +2759,11 @@ def test_index_segment_builder_builds_vector_segments(tmp_path): for fragment in frags[:2] ] - segment_builder = ds.create_index_segment_builder().with_segments(segments) + segment_builder = ( + ds.create_index_segment_builder() + .with_index_type("IVF_FLAT") + .with_segments(segments) + ) plans = segment_builder.plan() assert len(plans) == 2 assert all(len(plan.segments) == 1 for plan in plans) @@ -2805,6 +2833,7 @@ def build_distributed_ivf_pq(ds_copy, shard_order): ) segments = ( ds_copy.create_index_segment_builder() + .with_index_type("IVF_PQ") .with_segments(segments) .build_all() ) diff --git a/python/src/dataset.rs b/python/src/dataset.rs index 402bc010268..5e41705a2a4 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -327,6 +327,7 @@ impl MergeInsertBuilder { #[derive(Clone)] pub struct PyIndexSegmentBuilder { dataset: Arc, + index_type: Option, segments: Vec, target_segment_bytes: Option, } @@ -337,6 +338,9 @@ impl PyIndexSegmentBuilder { .dataset .create_index_segment_builder() .with_segments(self.segments.clone()); + if let Some(index_type) = self.index_type { + builder = builder.with_index_type(index_type); + } if let Some(target_segment_bytes) = self.target_segment_bytes { builder = builder.with_target_segment_bytes(target_segment_bytes); } @@ -346,6 +350,30 @@ impl PyIndexSegmentBuilder { #[pymethods] impl PyIndexSegmentBuilder { + fn with_index_type<'a>( + mut slf: PyRefMut<'a, Self>, + index_type: &str, + ) -> PyResult> { + let normalized = index_type.to_uppercase(); + slf.index_type = Some(match normalized.as_str() { + "INVERTED" | "FTS" => IndexType::Inverted, + "VECTOR" => IndexType::Vector, + "IVF_FLAT" => IndexType::IvfFlat, + "IVF_PQ" => IndexType::IvfPq, + "IVF_SQ" => IndexType::IvfSq, + "IVF_RQ" => IndexType::IvfRq, + "IVF_HNSW_FLAT" => IndexType::IvfHnswFlat, + "IVF_HNSW_PQ" => IndexType::IvfHnswPq, + "IVF_HNSW_SQ" => IndexType::IvfHnswSq, + _ => { + return Err(PyValueError::new_err(format!( + "Unsupported index type for segment builder: {index_type}" + ))); + } + }); + Ok(slf) + } + fn with_segments<'a>( mut slf: PyRefMut<'a, Self>, segments: &Bound<'_, PyAny>, @@ -2094,6 +2122,7 @@ impl Dataset { fn create_index_segment_builder(&self) -> PyResult { Ok(PyIndexSegmentBuilder { dataset: self.ds.clone(), + index_type: None, segments: Vec::new(), target_segment_bytes: None, }) diff --git a/rust/lance/src/index/api.rs b/rust/lance/src/index/api.rs index e13b73d68af..b022d6cb905 100644 --- a/rust/lance/src/index/api.rs +++ b/rust/lance/src/index/api.rs @@ -152,7 +152,8 @@ pub trait DatasetIndexExt { /// index outputs. /// /// The caller supplies the uncommitted index metadata returned by - /// `execute_uncommitted()` so the builder can plan segment grouping without + /// `execute_uncommitted()` and then declares the concrete index type with + /// `with_index_type(...)` so the builder can plan segment grouping without /// rediscovering fragment coverage. /// /// This is the canonical entry point for segment-based index build. diff --git a/rust/lance/src/index/create.rs b/rust/lance/src/index/create.rs index 9d1d770b341..10ee397625e 100644 --- a/rust/lance/src/index/create.rs +++ b/rust/lance/src/index/create.rs @@ -9,7 +9,7 @@ use crate::{ }, index::{ DatasetIndexExt, DatasetIndexInternalExt, build_index_metadata_from_segments, - scalar::{IndexDetails, build_scalar_index}, + scalar::build_scalar_index, vector::{ LANCE_VECTOR_INDEX, VectorIndexParams, build_distributed_vector_index, build_empty_vector_index, build_vector_index, @@ -487,9 +487,33 @@ impl<'a> CreateIndexBuilder<'a> { new_idx.name )) })?; + let segment_index_type = match self.index_type { + IndexType::Vector + | IndexType::IvfPq + | IndexType::IvfSq + | IndexType::IvfFlat + | IndexType::IvfRq + | IndexType::IvfHnswFlat + | IndexType::IvfHnswPq + | IndexType::IvfHnswSq => self + .params + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::index("Vector index type must take a VectorIndexParams".to_string()) + })? + .index_type(), + unsupported => { + return Err(Error::internal(format!( + "Segment commit path does not support index type {}", + unsupported + ))); + } + }; let segments = self .dataset .create_index_segment_builder() + .with_index_type(segment_index_type) .with_segments(vec![new_idx.clone()]) .build_all() .await?; @@ -564,6 +588,7 @@ impl<'a> IntoFuture for CreateIndexBuilder<'a> { /// /// Use [`DatasetIndexExt::create_index_segment_builder`] and then either: /// +/// - call [`Self::with_index_type`] with the concrete segment type first, then /// - call [`Self::plan`] and orchestrate individual segment builds externally, or /// - call [`Self::build_all`] to build all segments on the current node. /// @@ -573,6 +598,7 @@ impl<'a> IntoFuture for CreateIndexBuilder<'a> { #[derive(Clone)] pub struct IndexSegmentBuilder<'a> { dataset: &'a Dataset, + index_type: Option, segments: Vec, target_segment_bytes: Option, } @@ -581,11 +607,18 @@ impl<'a> IndexSegmentBuilder<'a> { pub(crate) fn new(dataset: &'a Dataset) -> Self { Self { dataset, + index_type: None, segments: Vec::new(), target_segment_bytes: None, } } + /// Declare the concrete index type of the staged segments. + pub fn with_index_type(mut self, index_type: IndexType) -> Self { + self.index_type = Some(index_type); + self + } + /// Provide the segment metadata returned by `execute_uncommitted()`. /// /// These segments must already exist in storage and must not have been @@ -614,36 +647,12 @@ impl<'a> IndexSegmentBuilder<'a> { .to_string(), )); } - - let infer_index_type = |segment: &IndexMetadata| -> Result { - if let Some(index_details) = segment.index_details.as_ref() { - let details = IndexDetails(index_details.clone()); - if details.supports_fts() { - return Ok(IndexType::Inverted); - } - if details.is_vector() { - return Ok(IndexType::Vector); - } - } - - if segment.files.as_ref().is_some_and(|files| { - files.iter().any(|file| { - file.path == lance_index::scalar::inverted::METADATA_FILE - || file.path.starts_with("part_") - }) - }) { - Ok(IndexType::Inverted) - } else if segment.files.is_some() { - Ok(IndexType::Vector) - } else { - Err(Error::invalid_input(format!( - "IndexSegmentBuilder could not infer the index type for segment {}", - segment.uuid - ))) - } - }; - - let index_type = infer_index_type(&self.segments[0])?; + let index_type = self.index_type.ok_or_else(|| { + Error::invalid_input( + "IndexSegmentBuilder requires an explicit index type; call with_index_type(...)" + .to_string(), + ) + })?; let mut seen_segment_ids = HashSet::with_capacity(self.segments.len()); for segment in &self.segments { if !seen_segment_ids.insert(segment.uuid) { @@ -652,14 +661,6 @@ impl<'a> IndexSegmentBuilder<'a> { segment.uuid ))); } - let segment_index_type = infer_index_type(segment)?; - if segment_index_type != index_type { - return Err(Error::invalid_input(format!( - "IndexSegmentBuilder requires all input segments to have the same index type; \ - expected {}, got {} for segment {}", - index_type, segment_index_type, segment.uuid - ))); - } } match index_type { @@ -670,7 +671,21 @@ impl<'a> IndexSegmentBuilder<'a> { IndexType::Vector => { crate::index::vector::ivf::plan_segments( &self.segments, - None, + Some(index_type), + self.target_segment_bytes, + ) + .await + } + IndexType::IvfFlat + | IndexType::IvfPq + | IndexType::IvfSq + | IndexType::IvfRq + | IndexType::IvfHnswFlat + | IndexType::IvfHnswPq + | IndexType::IvfHnswSq => { + crate::index::vector::ivf::plan_segments( + &self.segments, + Some(index_type), self.target_segment_bytes, ) .await @@ -684,27 +699,23 @@ impl<'a> IndexSegmentBuilder<'a> { /// Build one segment from a previously-generated plan. pub async fn build(&self, plan: &IndexSegmentPlan) -> Result { - let index_type = if let Some(index_type) = plan.requested_index_type() { - index_type - } else { - let details = IndexDetails(plan.segment().index_details().clone()); - if details.supports_fts() { - IndexType::Inverted - } else if details.is_vector() { - IndexType::Vector - } else { - return Err(Error::invalid_input( - "IndexSegmentBuilder requires planned segments to declare an index type" - .to_string(), - )); - } - }; - - match index_type { + match plan.requested_index_type().ok_or_else(|| { + Error::invalid_input( + "IndexSegmentBuilder requires planned segments to declare an index type" + .to_string(), + ) + })? { IndexType::Inverted => { crate::index::scalar::inverted::build_segment(self.dataset, plan).await } - IndexType::Vector => { + IndexType::Vector + | IndexType::IvfFlat + | IndexType::IvfPq + | IndexType::IvfSq + | IndexType::IvfRq + | IndexType::IvfHnswFlat + | IndexType::IvfHnswPq + | IndexType::IvfHnswSq => { crate::index::vector::ivf::build_segment( self.dataset.object_store(), &self.dataset.indices_dir(), @@ -1247,6 +1258,7 @@ mod tests { let segments = dataset .create_index_segment_builder() + .with_index_type(params.index_type()) .with_segments(input_segments.clone()) .build_all() .await @@ -1342,6 +1354,7 @@ mod tests { let segments = dataset .create_index_segment_builder() + .with_index_type(params.index_type()) .with_segments(input_segments) .build_all() .await @@ -1440,6 +1453,7 @@ mod tests { let segments = dataset .create_index_segment_builder() + .with_index_type(params.index_type()) .with_segments(input_segments) .build_all() .await @@ -1488,6 +1502,7 @@ mod tests { let segments = dataset .create_index_segment_builder() + .with_index_type(IndexType::Inverted) .with_segments(input_segments.clone()) .build_all() .await @@ -1543,6 +1558,7 @@ mod tests { let err = dataset .create_index_segment_builder() + .with_index_type(IndexType::Inverted) .with_segments(vec![segment.clone(), segment]) .build_all() .await @@ -1554,16 +1570,17 @@ mod tests { } #[tokio::test] - async fn test_index_segment_builder_rejects_mixed_index_types() { - let text_tmpdir = TempStrDir::default(); - let text_dataset_uri = format!("file://{}", text_tmpdir.as_str()); - let text_batches = RecordBatchIterator::new( + async fn test_index_segment_builder_requires_explicit_index_type() { + let tmpdir = TempStrDir::default(); + let dataset_uri = format!("file://{}", tmpdir.as_str()); + + let batches = RecordBatchIterator::new( vec![Ok(create_text_batch(0, 10))], create_text_batch(0, 1).schema(), ); - let mut text_dataset = Dataset::write( - text_batches, - &text_dataset_uri, + let mut dataset = Dataset::write( + batches, + &dataset_uri, Some(WriteParams { max_rows_per_file: 10, mode: WriteMode::Overwrite, @@ -1572,66 +1589,24 @@ mod tests { ) .await .unwrap(); - let inverted_params = InvertedIndexParams::default(); - let text_segment = CreateIndexBuilder::new( - &mut text_dataset, - &["text"], - IndexType::Inverted, - &inverted_params, - ) - .name("text_idx".to_string()) - .fragments(vec![0]) - .execute_uncommitted() - .await - .unwrap(); - let vector_tmpdir = TempStrDir::default(); - let vector_dataset_uri = format!("file://{}", vector_tmpdir.as_str()); - let reader = gen_batch() - .col("id", lance_datagen::array::step::()) - .col( - "vector", - lance_datagen::array::rand_vec::(lance_datagen::Dimension::from(8)), - ) - .into_reader_rows( - lance_datagen::RowCount::from(64), - lance_datagen::BatchCount::from(1), - ); - let mut vector_dataset = Dataset::write( - reader, - &vector_dataset_uri, - Some(WriteParams { - max_rows_per_file: 64, - mode: WriteMode::Overwrite, - ..Default::default() - }), - ) - .await - .unwrap(); - let vector_params = VectorIndexParams::with_ivf_flat_params( - DistanceType::L2, - prepare_vector_ivf(&vector_dataset, "vector").await, - ); - let vector_segment = CreateIndexBuilder::new( - &mut vector_dataset, - &["vector"], - IndexType::Vector, - &vector_params, - ) - .name("vector_idx".to_string()) - .fragments(vec![0]) - .execute_uncommitted() - .await - .unwrap(); + let params = InvertedIndexParams::default(); + let segment = + CreateIndexBuilder::new(&mut dataset, &["text"], IndexType::Inverted, ¶ms) + .name("text_idx".to_string()) + .fragments(vec![0]) + .execute_uncommitted() + .await + .unwrap(); - let err = text_dataset + let err = dataset .create_index_segment_builder() - .with_segments(vec![text_segment, vector_segment]) + .with_segments(vec![segment]) .plan() .await .unwrap_err(); assert!( - err.to_string().contains("same index type"), + err.to_string().contains("requires an explicit index type"), "unexpected error: {err}" ); } @@ -1666,6 +1641,7 @@ mod tests { let plan = IndexSegmentPlan::new(segment, Vec::new(), 0, None); let err = dataset .create_index_segment_builder() + .with_index_type(IndexType::Inverted) .build(&plan) .await .unwrap_err(); diff --git a/rust/lance/tests/query/inverted.rs b/rust/lance/tests/query/inverted.rs index 3a1e475e39a..eef190a8a29 100644 --- a/rust/lance/tests/query/inverted.rs +++ b/rust/lance/tests/query/inverted.rs @@ -213,6 +213,7 @@ async fn test_segmented_inverted_match_query() { } let segments = ds .create_index_segment_builder() + .with_index_type(IndexType::Inverted) .with_segments(metadatas.clone()) .build_all() .await @@ -289,6 +290,7 @@ async fn test_segmented_inverted_fuzzy_match_uses_global_idf() { } let segments = ds .create_index_segment_builder() + .with_index_type(IndexType::Inverted) .with_segments(metadatas) .build_all() .await @@ -374,6 +376,7 @@ async fn test_segmented_inverted_phrase_query() { } let segments = ds .create_index_segment_builder() + .with_index_type(IndexType::Inverted) .with_segments(metadatas) .build_all() .await @@ -443,6 +446,7 @@ async fn test_segmented_inverted_match_query_with_unindexed_fragments() { } let segments = ds .create_index_segment_builder() + .with_index_type(IndexType::Inverted) .with_segments(metadatas) .build_all() .await From 6d242dfce72b5d5779ce337b30c87604cb047aca Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Fri, 27 Mar 2026 00:10:06 +0800 Subject: [PATCH 08/16] fix: require explicit segment types in java builder --- java/lance-jni/src/blocking_dataset.rs | 5 ++++ java/src/main/java/org/lance/Dataset.java | 24 ++++++++++++++++--- .../java/org/lance/index/VectorIndexTest.java | 9 ++++--- 3 files changed, 32 insertions(+), 6 deletions(-) diff --git a/java/lance-jni/src/blocking_dataset.rs b/java/lance-jni/src/blocking_dataset.rs index e705c0eda1d..103adaae593 100644 --- a/java/lance-jni/src/blocking_dataset.rs +++ b/java/lance-jni/src/blocking_dataset.rs @@ -1075,6 +1075,7 @@ pub extern "system" fn Java_org_lance_Dataset_nativeBuildIndexSegments<'local>( mut env: JNIEnv<'local>, java_dataset: JObject, java_segments: JObject, + index_type: jint, target_segment_bytes_jobj: JObject, ) -> JObject<'local> { ok_or_throw!( @@ -1083,6 +1084,7 @@ pub extern "system" fn Java_org_lance_Dataset_nativeBuildIndexSegments<'local>( &mut env, java_dataset, java_segments, + index_type, target_segment_bytes_jobj ) ) @@ -1092,9 +1094,11 @@ fn inner_build_index_segments<'local>( env: &mut JNIEnv<'local>, java_dataset: JObject, java_segments: JObject, + index_type: jint, target_segment_bytes_jobj: JObject, ) -> Result> { let segments = import_vec_to_rust(env, &java_segments, |env, obj| obj.extract_object(env))?; + let index_type = IndexType::try_from(index_type)?; let target_segment_bytes = env .get_long_opt(&target_segment_bytes_jobj)? .map(|v| v as u64); @@ -1106,6 +1110,7 @@ fn inner_build_index_segments<'local>( let mut builder = dataset_guard .inner .create_index_segment_builder() + .with_index_type(index_type) .with_segments(segments); if let Some(target_segment_bytes) = target_segment_bytes { builder = builder.with_target_segment_bytes(target_segment_bytes); diff --git a/java/src/main/java/org/lance/Dataset.java b/java/src/main/java/org/lance/Dataset.java index 506827be902..67eb6892a86 100644 --- a/java/src/main/java/org/lance/Dataset.java +++ b/java/src/main/java/org/lance/Dataset.java @@ -1024,20 +1024,38 @@ private native void innerMergeIndexMetadata( * * @param segments segment metadata returned by {@link #createIndex(IndexOptions)} when * fragmentIds are provided + * @param indexType concrete index type for the staged segments * @param targetSegmentBytes optional size target for merged physical segments * @return built physical segment metadata */ - public List buildIndexSegments(List segments, Optional targetSegmentBytes) { + public List buildIndexSegments( + List segments, IndexType indexType, Optional targetSegmentBytes) { Preconditions.checkNotNull(segments, "segments cannot be null"); Preconditions.checkArgument(!segments.isEmpty(), "segments cannot be empty"); + Preconditions.checkNotNull(indexType, "indexType cannot be null"); try (LockManager.WriteLock writeLock = lockManager.acquireWriteLock()) { Preconditions.checkArgument(nativeDatasetHandle != 0, "Dataset is closed"); - return nativeBuildIndexSegments(segments, targetSegmentBytes); + return nativeBuildIndexSegments(segments, indexType.getValue(), targetSegmentBytes); } } + /** + * Build physical vector index segments from previously-created fragment-level index outputs. + * + * @param segments segment metadata returned by {@link #createIndex(IndexOptions)} when + * fragmentIds are provided + * @param targetSegmentBytes optional size target for merged physical segments + * @return built physical segment metadata + */ + @Deprecated + public List buildIndexSegments(List segments, Optional targetSegmentBytes) { + throw new IllegalArgumentException( + "buildIndexSegments now requires an explicit index type; call " + + "buildIndexSegments(segments, indexType, targetSegmentBytes)"); + } + private native List nativeBuildIndexSegments( - List segments, Optional targetSegmentBytes); + List segments, int indexType, Optional targetSegmentBytes); /** * Publish one or more existing physical index segments as a logical index. diff --git a/java/src/test/java/org/lance/index/VectorIndexTest.java b/java/src/test/java/org/lance/index/VectorIndexTest.java index a96b6593d30..e60a0f517b4 100755 --- a/java/src/test/java/org/lance/index/VectorIndexTest.java +++ b/java/src/test/java/org/lance/index/VectorIndexTest.java @@ -96,7 +96,8 @@ public void testCreateIvfFlatIndexDistributively(@TempDir Path tempDir) throws E "Partially created IVF_FLAT index should not present before commit"); List builtSegments = - dataset.buildIndexSegments(List.of(firstSegment, secondSegment), Optional.empty()); + dataset.buildIndexSegments( + List.of(firstSegment, secondSegment), IndexType.IVF_FLAT, Optional.empty()); assertEquals(2, builtSegments.size()); List committed = @@ -188,7 +189,8 @@ public void testCreateIvfPqIndexDistributively(@TempDir Path tempDir) throws Exc "Partially created IVF_PQ index should not present before commit"); List builtSegments = - dataset.buildIndexSegments(List.of(firstSegment, secondSegment), Optional.empty()); + dataset.buildIndexSegments( + List.of(firstSegment, secondSegment), IndexType.IVF_PQ, Optional.empty()); assertEquals(2, builtSegments.size()); List committed = @@ -264,7 +266,8 @@ public void testCreateIvfSqIndexDistributively(@TempDir Path tempDir) throws Exc "Partially created IVF_SQ index should not present before commit"); List builtSegments = - dataset.buildIndexSegments(List.of(firstSegment, secondSegment), Optional.empty()); + dataset.buildIndexSegments( + List.of(firstSegment, secondSegment), IndexType.IVF_SQ, Optional.empty()); assertEquals(2, builtSegments.size()); List committed = From 5fd5794c2e6fad598fc711d2f187e21cca7bf13a Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Fri, 27 Mar 2026 00:23:11 +0800 Subject: [PATCH 09/16] fix: skip segment commit path for vector extensions --- rust/lance/src/index/create.rs | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/rust/lance/src/index/create.rs b/rust/lance/src/index/create.rs index 10ee397625e..a3be4cb7fc0 100644 --- a/rust/lance/src/index/create.rs +++ b/rust/lance/src/index/create.rs @@ -480,7 +480,7 @@ impl<'a> CreateIndexBuilder<'a> { } else { vec![] }; - let transaction = if uses_segment_commit_path(self.index_type) { + let transaction = if uses_segment_commit_path(self.index_type, &new_idx.name, self.params) { let field_id = *new_idx.fields.first().ok_or_else(|| { Error::internal(format!( "Index '{}' is missing field ids after build", @@ -561,7 +561,15 @@ impl<'a> CreateIndexBuilder<'a> { } } -fn uses_segment_commit_path(index_type: IndexType) -> bool { +fn uses_segment_commit_path( + index_type: IndexType, + index_name: &str, + params: &dyn IndexParams, +) -> bool { + if index_name != LANCE_VECTOR_INDEX { + return false; + } + matches!( index_type, IndexType::Vector @@ -572,7 +580,7 @@ fn uses_segment_commit_path(index_type: IndexType) -> bool { | IndexType::IvfHnswFlat | IndexType::IvfHnswPq | IndexType::IvfHnswSq - ) + ) && params.as_any().is::() } impl<'a> IntoFuture for CreateIndexBuilder<'a> { From 88dab0e2926d2b5731eb17814e8453f41d7a7372 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Fri, 27 Mar 2026 03:53:11 +0800 Subject: [PATCH 10/16] fix: decode inverted index details from payload --- rust/lance/src/index/scalar/inverted.rs | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/rust/lance/src/index/scalar/inverted.rs b/rust/lance/src/index/scalar/inverted.rs index e23ce5db364..0629477cc53 100644 --- a/rust/lance/src/index/scalar/inverted.rs +++ b/rust/lance/src/index/scalar/inverted.rs @@ -8,6 +8,7 @@ use std::sync::Arc; use lance_index::pbold::InvertedIndexDetails; use lance_index::{IndexType, scalar::lance_format::LanceIndexStore}; use lance_table::format::IndexMetadata; +use prost::Message; use crate::{ Dataset, Error, Result, @@ -155,7 +156,12 @@ pub(crate) async fn load_segment_details( let mut expected_details: Option = None; for meta in segments { let details_any = fetch_index_details(dataset, column, meta).await?; - let details = details_any.as_ref().to_msg::()?; + let details = + InvertedIndexDetails::decode(details_any.value.as_slice()).map_err(|err| { + Error::io(format!( + "failed to decode InvertedIndexDetails payload: {err}" + )) + })?; match &expected_details { Some(expected) if expected != &details => { return Err(Error::invalid_input(format!( @@ -174,3 +180,17 @@ pub(crate) async fn load_segment_details( )) }) } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn decode_legacy_inverted_details_type_url() { + let mut details_any = prost_types::Any::from_msg(&InvertedIndexDetails::default()).unwrap(); + details_any.type_url = "/lance.index.pb.InvertedIndexDetails".to_string(); + + let decoded = InvertedIndexDetails::decode(details_any.value.as_slice()).unwrap(); + assert_eq!(decoded, InvertedIndexDetails::default()); + } +} From f5e60abe5072ce7cbb4929d1b79a76e56c9e9a11 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Thu, 2 Apr 2026 03:47:35 +0800 Subject: [PATCH 11/16] feat: benchmark and parallelize segmented fts search --- rust/lance/benches/fts_search.rs | 158 ++++++++++++++++++++++++++++- rust/lance/src/index/create.rs | 3 +- rust/lance/src/index/vector/ivf.rs | 11 +- rust/lance/src/io/exec/fts.rs | 130 ++++++++++++++---------- 4 files changed, 239 insertions(+), 63 deletions(-) diff --git a/rust/lance/benches/fts_search.rs b/rust/lance/benches/fts_search.rs index 7ea96bf29b4..ba7bfd97673 100644 --- a/rust/lance/benches/fts_search.rs +++ b/rust/lance/benches/fts_search.rs @@ -9,15 +9,25 @@ /// /// This benchmark is primarily intended for developers to use for profiling and debugging. The python /// benchmark is more comprehensive and will cover regression testing. +use std::{env, sync::Arc}; + +use arrow_array::{ArrayRef, Int32Array, RecordBatch, RecordBatchIterator, StringArray}; use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; use futures::TryStreamExt; -use lance::Dataset; -use lance_index::scalar::FullTextSearchQuery; +use lance::{Dataset, dataset::WriteParams, index::DatasetIndexExt}; +use lance_index::{ + IndexType, + scalar::{FullTextSearchQuery, inverted::tokenizer::InvertedIndexParams}, +}; #[cfg(target_os = "linux")] use pprof::criterion::{Output, PProfProfiler}; -use std::env; +use tempfile::TempDir; const WIKIPEDIA_DATASET_ENV_VAR: &str = "LANCE_WIKIPEDIA_DATASET_PATH"; +const INDEX_NAME: &str = "segmented_fts"; +const INDEXED_FRAGMENT_COUNT: usize = 12; +const UNINDEXED_FRAGMENT_COUNT: usize = 1; +const ROWS_PER_FRAGMENT: usize = 64; /// Get the Wikipedia dataset path from environment variable. /// Panics if the environment variable is not set. @@ -30,6 +40,96 @@ fn get_wikipedia_dataset_path() -> String { }) } +struct BenchDataset { + _tmpdir: TempDir, + dataset: Dataset, +} + +fn create_fragment_batch(fragment_id: usize) -> RecordBatch { + let start = (fragment_id * ROWS_PER_FRAGMENT) as i32; + let ids = Arc::new(Int32Array::from_iter_values( + start..start + ROWS_PER_FRAGMENT as i32, + )); + let texts = Arc::new(StringArray::from_iter_values((0..ROWS_PER_FRAGMENT).map( + |row| { + let term = match (fragment_id + row) % 4 { + 0 => "alpha", + 1 => "beta", + 2 => "gamma", + _ => "delta", + }; + format!("shared {term} fragment-{fragment_id} row-{row}") + }, + ))); + RecordBatch::try_from_iter(vec![("id", ids as ArrayRef), ("text", texts as ArrayRef)]).unwrap() +} + +fn grouped_fragment_ids(segment_count: usize) -> Vec> { + let fragments_per_segment = INDEXED_FRAGMENT_COUNT / segment_count; + (0..segment_count) + .map(|segment_idx| { + let start = segment_idx * fragments_per_segment; + let end = start + fragments_per_segment; + (start..end).map(|fragment_id| fragment_id as u32).collect() + }) + .collect() +} + +async fn build_segmented_fts_dataset(segment_count: usize) -> BenchDataset { + let tmpdir = TempDir::new().unwrap(); + let uri = format!("file://{}", tmpdir.path().display()); + let batches = RecordBatchIterator::new( + (0..(INDEXED_FRAGMENT_COUNT + UNINDEXED_FRAGMENT_COUNT)) + .map(|fragment_id| Ok(create_fragment_batch(fragment_id))) + .collect::>(), + create_fragment_batch(0).schema(), + ); + let mut dataset = Dataset::write( + batches, + &uri, + Some(WriteParams { + max_rows_per_file: ROWS_PER_FRAGMENT, + ..Default::default() + }), + ) + .await + .unwrap(); + + assert_eq!( + dataset.get_fragments().len(), + INDEXED_FRAGMENT_COUNT + UNINDEXED_FRAGMENT_COUNT + ); + + let params = InvertedIndexParams::default(); + let mut staged_segments = Vec::with_capacity(segment_count); + for fragment_ids in grouped_fragment_ids(segment_count) { + let segment = dataset + .create_index_builder(&["text"], IndexType::Inverted, ¶ms) + .name(INDEX_NAME.to_string()) + .fragments(fragment_ids) + .execute_uncommitted() + .await + .unwrap(); + staged_segments.push(segment); + } + let segments = dataset + .create_index_segment_builder() + .with_index_type(IndexType::Inverted) + .with_segments(staged_segments) + .build_all() + .await + .unwrap(); + dataset + .commit_existing_index_segments(INDEX_NAME, "text", segments) + .await + .unwrap(); + + BenchDataset { + _tmpdir: tmpdir, + dataset, + } +} + /// Benchmark full text search on Wikipedia dataset with different K values fn bench_fts_search(c: &mut Criterion) { let rt = tokio::runtime::Runtime::new().unwrap(); @@ -86,19 +186,67 @@ fn bench_fts_search(c: &mut Criterion) { group.finish(); } +fn bench_segmented_fts_search(c: &mut Criterion) { + let rt = tokio::runtime::Runtime::new().unwrap(); + let bench_datasets = [1_usize, 2, 4, 6] + .into_iter() + .map(|segment_count| { + ( + segment_count, + rt.block_on(build_segmented_fts_dataset(segment_count)), + ) + }) + .collect::>(); + + let mut group = c.benchmark_group("fts_search_segment_count"); + for (segment_count, bench_dataset) in &bench_datasets { + group.bench_with_input( + BenchmarkId::from_parameter(segment_count), + segment_count, + |b, _| { + b.iter(|| { + rt.block_on(async { + let mut scanner = bench_dataset.dataset.scan(); + let query = FullTextSearchQuery::new("shared alpha".to_string()) + .with_column("text".to_string()) + .unwrap(); + let mut stream = scanner + .full_text_search(query) + .unwrap() + .limit(Some(20), None) + .unwrap() + .project(&["_rowid"]) + .unwrap() + .try_into_stream() + .await + .unwrap(); + + let mut num_rows = 0; + while let Some(batch) = stream.try_next().await.unwrap() { + num_rows += batch.num_rows(); + } + assert!(num_rows <= 20); + }) + }); + }, + ); + } + group.finish(); +} + #[cfg(target_os = "linux")] criterion_group!( name=benches; config = Criterion::default().significance_level(0.1).sample_size(10) .with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); - targets = bench_fts_search + targets = bench_fts_search, bench_segmented_fts_search ); #[cfg(not(target_os = "linux"))] criterion_group!( name=benches; config = Criterion::default().significance_level(0.1).sample_size(10); - targets = bench_fts_search + targets = bench_fts_search, bench_segmented_fts_search ); criterion_main!(benches); diff --git a/rust/lance/src/index/create.rs b/rust/lance/src/index/create.rs index 04144b04808..2046038db88 100644 --- a/rust/lance/src/index/create.rs +++ b/rust/lance/src/index/create.rs @@ -8,7 +8,8 @@ use crate::{ transaction::{Operation, TransactionBuilder}, }, index::{ - DatasetIndexExt, DatasetIndexInternalExt, api::{IndexSegment, IndexSegmentPlan}, + DatasetIndexExt, DatasetIndexInternalExt, + api::{IndexSegment, IndexSegmentPlan}, build_index_metadata_from_segments, scalar::build_scalar_index, vector::{ diff --git a/rust/lance/src/index/vector/ivf.rs b/rust/lance/src/index/vector/ivf.rs index a7c974db73d..19f0d04d26e 100644 --- a/rust/lance/src/index/vector/ivf.rs +++ b/rust/lance/src/index/vector/ivf.rs @@ -14,7 +14,10 @@ use crate::index::vector::utils::{get_vector_dim, get_vector_type}; use crate::{ dataset::Dataset, index::{ - INDEX_FILE_NAME, api::{IndexSegment, IndexSegmentPlan}, pb, prefilter::PreFilter, + INDEX_FILE_NAME, + api::{IndexSegment, IndexSegmentPlan}, + pb, + prefilter::PreFilter, vector::ivf::io::write_pq_partitions, }, }; @@ -99,7 +102,11 @@ use prost::Message; use roaring::RoaringBitmap; use serde::Serialize; use serde_json::json; -use std::{any::Any, collections::{HashMap, HashSet}, sync::Arc}; +use std::{ + any::Any, + collections::{HashMap, HashSet}, + sync::Arc, +}; use tokio::sync::mpsc; use tracing::instrument; use uuid::Uuid; diff --git a/rust/lance/src/io/exec/fts.rs b/rust/lance/src/io/exec/fts.rs index 7e4001521c8..f5823c5997d 100644 --- a/rust/lance/src/io/exec/fts.rs +++ b/rust/lance/src/io/exec/fts.rs @@ -21,7 +21,10 @@ use futures::future::try_join_all; use futures::stream::{self}; use futures::{Stream, StreamExt, TryStreamExt}; use itertools::Itertools; -use lance_core::{Error, ROW_ID, Result, utils::tracing::StreamTracingExt}; +use lance_core::{ + Error, ROW_ID, Result, + utils::{tokio::get_num_compute_intensive_cpus, tracing::StreamTracingExt}, +}; use lance_datafusion::utils::{ExecutionPlanMetricsSetExt, MetricsExt, PARTITIONS_SEARCHED_METRIC}; use lance_table::format::IndexMetadata; @@ -136,6 +139,57 @@ fn build_global_bm25_scorer( Ok(base_scorer) } +async fn search_segments( + indices: &[Arc], + tokens: Arc, + params: Arc, + operator: lance_index::scalar::inverted::query::Operator, + pre_filter: Arc, + metrics: Arc, + base_scorer: Arc, +) -> Result<(Vec, Vec)> { + let limit = params.limit.unwrap_or(usize::MAX); + let mut candidates = std::collections::BinaryHeap::new(); + let searches = stream::iter(indices.iter().cloned().map(|index| { + let tokens = tokens.clone(); + let params = params.clone(); + let pre_filter = pre_filter.clone(); + let metrics = metrics.clone(); + let base_scorer = base_scorer.clone(); + async move { + index + .bm25_search( + tokens, + params, + operator, + pre_filter, + metrics, + Some(base_scorer.as_ref()), + ) + .await + } + })) + .buffer_unordered(get_num_compute_intensive_cpus()); + let mut searches = searches; + + while let Some((doc_ids, scores)) = searches.try_next().await? { + for (row_id, score) in doc_ids.into_iter().zip(scores.into_iter()) { + if candidates.len() < limit { + candidates.push(std::cmp::Reverse(ScoredDoc::new(row_id, score))); + } else if candidates.peek().unwrap().0.score.0 < score { + candidates.pop(); + candidates.push(std::cmp::Reverse(ScoredDoc::new(row_id, score))); + } + } + } + + Ok(candidates + .into_sorted_vec() + .into_iter() + .map(|std::cmp::Reverse(doc)| (doc.row_id, doc.score.0)) + .unzip()) +} + /// Fall back to the default simple tokenizer when no on-disk FTS segment exists. fn default_text_tokenizer() -> Box { Box::new(TextTokenizer::new( @@ -391,33 +445,16 @@ impl ExecutionPlan for MatchQueryExec { pre_filter.wait_for_ready().await?; let tokens = Arc::new(tokens); let params = Arc::new(params); - let limit = params.limit.unwrap_or(usize::MAX); - let mut candidates = std::collections::BinaryHeap::new(); - for index in &indices { - let (doc_ids, scores) = index - .bm25_search( - tokens.clone(), - params.clone(), - query.operator, - pre_filter.clone(), - metrics.clone(), - Some(&base_scorer), - ) - .await?; - for (row_id, score) in doc_ids.into_iter().zip(scores.into_iter()) { - if candidates.len() < limit { - candidates.push(std::cmp::Reverse(ScoredDoc::new(row_id, score))); - } else if candidates.peek().unwrap().0.score.0 < score { - candidates.pop(); - candidates.push(std::cmp::Reverse(ScoredDoc::new(row_id, score))); - } - } - } - let (doc_ids, mut scores): (Vec, Vec) = candidates - .into_sorted_vec() - .into_iter() - .map(|std::cmp::Reverse(doc)| (doc.row_id, doc.score.0)) - .unzip(); + let (doc_ids, mut scores) = search_segments( + &indices, + tokens, + params, + query.operator, + pre_filter, + metrics.clone(), + Arc::new(base_scorer), + ) + .await?; scores.iter_mut().for_each(|s| { *s *= query.boost; }); @@ -1032,33 +1069,16 @@ impl ExecutionPlan for PhraseQueryExec { pre_filter.wait_for_ready().await?; let tokens = Arc::new(tokens); let params = Arc::new(params); - let limit = params.limit.unwrap_or(usize::MAX); - let mut candidates = std::collections::BinaryHeap::new(); - for index in &indices { - let (doc_ids, scores) = index - .bm25_search( - tokens.clone(), - params.clone(), - lance_index::scalar::inverted::query::Operator::And, - pre_filter.clone(), - metrics.clone(), - Some(&base_scorer), - ) - .await?; - for (row_id, score) in doc_ids.into_iter().zip(scores.into_iter()) { - if candidates.len() < limit { - candidates.push(std::cmp::Reverse(ScoredDoc::new(row_id, score))); - } else if candidates.peek().unwrap().0.score.0 < score { - candidates.pop(); - candidates.push(std::cmp::Reverse(ScoredDoc::new(row_id, score))); - } - } - } - let (doc_ids, scores): (Vec, Vec) = candidates - .into_sorted_vec() - .into_iter() - .map(|std::cmp::Reverse(doc)| (doc.row_id, doc.score.0)) - .unzip(); + let (doc_ids, scores) = search_segments( + &indices, + tokens, + params, + lance_index::scalar::inverted::query::Operator::And, + pre_filter, + metrics.clone(), + Arc::new(base_scorer), + ) + .await?; metrics.baseline_metrics.record_output(doc_ids.len()); let batch = RecordBatch::try_new( FTS_SCHEMA.clone(), From e85f6d60798a4e8e8b975f394168406d673c12f4 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Thu, 2 Apr 2026 19:04:49 +0800 Subject: [PATCH 12/16] perf: reduce segmented fts scorer setup overhead --- rust/lance-index/src/scalar/inverted/index.rs | 9 +++ rust/lance/src/io/exec/fts.rs | 58 +++++++++++-------- 2 files changed, 44 insertions(+), 23 deletions(-) diff --git a/rust/lance-index/src/scalar/inverted/index.rs b/rust/lance-index/src/scalar/inverted/index.rs index 263ecbcb44a..62a92e6c7ec 100644 --- a/rust/lance-index/src/scalar/inverted/index.rs +++ b/rust/lance-index/src/scalar/inverted/index.rs @@ -466,6 +466,15 @@ impl InvertedIndex { MemBM25Scorer::new(scorer.total_tokens(), scorer.num_docs(), token_docs) } + pub fn bm25_stats_for_terms(&self, terms: &[String]) -> (u64, usize, Vec) { + let scorer = IndexBM25Scorer::new(self.partitions.iter().map(|part| part.as_ref())); + let token_docs = terms + .iter() + .map(|term| scorer.num_docs_containing_token(term)) + .collect(); + (scorer.total_tokens(), scorer.num_docs(), token_docs) + } + /// Expand fuzzy query tokens against all partitions in this segment. pub fn expand_fuzzy_tokens(&self, tokens: &Tokens, params: &FtsSearchParams) -> Result { let mut expanded_tokens = Vec::new(); diff --git a/rust/lance/src/io/exec/fts.rs b/rust/lance/src/io/exec/fts.rs index f5823c5997d..f9f972d5bbc 100644 --- a/rust/lance/src/io/exec/fts.rs +++ b/rust/lance/src/io/exec/fts.rs @@ -86,35 +86,37 @@ async fn open_fts_segments( .await } -/// Expand fuzzy query tokens across all segments so the shared BM25 scorer sees every term -/// that any segment-local search may score. -fn scorer_tokens( +/// Collect the unique terms needed to build a shared BM25 scorer. +/// +/// The scorer only needs corpus-level document frequencies, so we keep a deduplicated +/// term list here instead of constructing a full `Tokens` object with positions. +fn scorer_terms( indices: &[Arc], query_tokens: &Tokens, params: &FtsSearchParams, -) -> Result { +) -> Result> { + let mut terms = Vec::new(); + let mut seen = HashSet::new(); + if !matches!(params.fuzziness, Some(n) if n != 0) { - return Ok(query_tokens.clone()); + for token in query_tokens { + if seen.insert(token.to_string()) { + terms.push(token.to_string()); + } + } + return Ok(terms); } - let mut tokens = Vec::new(); - let mut positions = Vec::new(); - let mut seen = HashSet::new(); for index in indices { let expanded = index.expand_fuzzy_tokens(query_tokens, params)?; for idx in 0..expanded.len() { let token = expanded.get_token(idx); if seen.insert(token.to_string()) { - tokens.push(token.to_string()); - positions.push(expanded.position(idx)); + terms.push(token.to_string()); } } } - Ok(Tokens::with_positions( - tokens, - positions, - query_tokens.token_type().clone(), - )) + Ok(terms) } /// Build a shared BM25 scorer for a set of committed FTS segments. @@ -123,20 +125,30 @@ fn build_global_bm25_scorer( query_tokens: &Tokens, params: &FtsSearchParams, ) -> Result { - let scorer_tokens = scorer_tokens(indices, query_tokens, params)?; + let terms = scorer_terms(indices, query_tokens, params)?; let first_index = indices.first().ok_or_else(|| { Error::invalid_input("FTS index requires at least one segment".to_string()) })?; - let mut base_scorer = first_index.bm25_base_scorer(&scorer_tokens); + let (mut total_tokens, mut num_docs, first_token_docs) = + first_index.bm25_stats_for_terms(&terms); + let mut token_docs = HashMap::with_capacity(terms.len()); + for (term, count) in terms.iter().cloned().zip(first_token_docs.into_iter()) { + token_docs.insert(term, count); + } + for index in indices.iter().skip(1) { - let segment_scorer = index.bm25_base_scorer(&scorer_tokens); - base_scorer.total_tokens += segment_scorer.total_tokens; - base_scorer.num_docs += segment_scorer.num_docs; - for (token, count) in segment_scorer.token_docs { - *base_scorer.token_docs.entry(token).or_insert(0) += count; + let (segment_total_tokens, segment_num_docs, segment_token_docs) = + index.bm25_stats_for_terms(&terms); + total_tokens += segment_total_tokens; + num_docs += segment_num_docs; + for (term, count) in terms.iter().zip(segment_token_docs.into_iter()) { + *token_docs + .get_mut(term) + .expect("global scorer terms should already be initialized") += count; } } - Ok(base_scorer) + + Ok(MemBM25Scorer::new(total_tokens, num_docs, token_docs)) } async fn search_segments( From 2ec97711fdea770fd79565df588951237d3171aa Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Tue, 14 Apr 2026 21:01:06 +0800 Subject: [PATCH 13/16] bench: compare segmented and partitioned inverted search --- rust/lance/benches/fts_search.rs | 232 +++++++++++++++++++++++- rust/lance/src/index/scalar/inverted.rs | 1 + 2 files changed, 227 insertions(+), 6 deletions(-) diff --git a/rust/lance/benches/fts_search.rs b/rust/lance/benches/fts_search.rs index ba7bfd97673..d56dec66c84 100644 --- a/rust/lance/benches/fts_search.rs +++ b/rust/lance/benches/fts_search.rs @@ -9,7 +9,7 @@ /// /// This benchmark is primarily intended for developers to use for profiling and debugging. The python /// benchmark is more comprehensive and will cover regression testing. -use std::{env, sync::Arc}; +use std::{collections::BTreeSet, env, sync::Arc}; use arrow_array::{ArrayRef, Int32Array, RecordBatch, RecordBatchIterator, StringArray}; use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; @@ -25,9 +25,14 @@ use tempfile::TempDir; const WIKIPEDIA_DATASET_ENV_VAR: &str = "LANCE_WIKIPEDIA_DATASET_PATH"; const INDEX_NAME: &str = "segmented_fts"; +const PARTITION_COMPARE_INDEX_NAME: &str = "partition_shape_fts"; const INDEXED_FRAGMENT_COUNT: usize = 12; const UNINDEXED_FRAGMENT_COUNT: usize = 1; const ROWS_PER_FRAGMENT: usize = 64; +const PARTITION_COMPARE_INDEXED_FRAGMENT_COUNT: usize = 64; +const PARTITION_COMPARE_UNINDEXED_FRAGMENT_COUNT: usize = 1; +const PARTITION_COMPARE_ROWS_PER_FRAGMENT: usize = 512; +const PARTITION_COMPARE_TOKEN_REPEAT: usize = 1024; /// Get the Wikipedia dataset path from environment variable. /// Panics if the environment variable is not set. @@ -64,8 +69,34 @@ fn create_fragment_batch(fragment_id: usize) -> RecordBatch { RecordBatch::try_from_iter(vec![("id", ids as ArrayRef), ("text", texts as ArrayRef)]).unwrap() } -fn grouped_fragment_ids(segment_count: usize) -> Vec> { - let fragments_per_segment = INDEXED_FRAGMENT_COUNT / segment_count; +fn create_partition_compare_fragment_batch(fragment_id: usize) -> RecordBatch { + let start = (fragment_id * PARTITION_COMPARE_ROWS_PER_FRAGMENT) as i32; + let ids = Arc::new(Int32Array::from_iter_values( + start..start + PARTITION_COMPARE_ROWS_PER_FRAGMENT as i32, + )); + let texts = Arc::new(StringArray::from_iter_values( + (0..PARTITION_COMPARE_ROWS_PER_FRAGMENT).map(|row| { + let term = match (fragment_id + row) % 4 { + 0 => "alpha", + 1 => "beta", + 2 => "gamma", + _ => "delta", + }; + let unique = format!("fragment-{fragment_id} row-{row}"); + let repeated = std::iter::repeat_n( + format!("shared {term} {unique}"), + PARTITION_COMPARE_TOKEN_REPEAT, + ) + .collect::>() + .join(" "); + format!("{repeated} tail-{term}") + }), + )); + RecordBatch::try_from_iter(vec![("id", ids as ArrayRef), ("text", texts as ArrayRef)]).unwrap() +} + +fn grouped_fragment_ids(total_fragments: usize, segment_count: usize) -> Vec> { + let fragments_per_segment = total_fragments / segment_count; (0..segment_count) .map(|segment_idx| { let start = segment_idx * fragments_per_segment; @@ -102,7 +133,7 @@ async fn build_segmented_fts_dataset(segment_count: usize) -> BenchDataset { let params = InvertedIndexParams::default(); let mut staged_segments = Vec::with_capacity(segment_count); - for fragment_ids in grouped_fragment_ids(segment_count) { + for fragment_ids in grouped_fragment_ids(INDEXED_FRAGMENT_COUNT, segment_count) { let segment = dataset .create_index_builder(&["text"], IndexType::Inverted, ¶ms) .name(INDEX_NAME.to_string()) @@ -130,6 +161,145 @@ async fn build_segmented_fts_dataset(segment_count: usize) -> BenchDataset { } } +fn count_partitions(segment: &lance_table::format::IndexMetadata) -> usize { + segment + .files + .as_ref() + .map(|files| { + files + .iter() + .filter_map(|file| { + file.path + .strip_prefix("part_") + .and_then(|path| path.split_once('_')) + .map(|(partition_id, _)| partition_id.to_string()) + }) + .collect::>() + .len() + }) + .unwrap_or(0) +} + +async fn build_partition_compare_dataset_with_memory_limit( + partition_count: usize, + segmented: bool, + memory_limit_mb: u64, +) -> BenchDataset { + let tmpdir = TempDir::new().unwrap(); + let uri = format!("file://{}", tmpdir.path().display()); + let batches = RecordBatchIterator::new( + (0..(PARTITION_COMPARE_INDEXED_FRAGMENT_COUNT + + PARTITION_COMPARE_UNINDEXED_FRAGMENT_COUNT)) + .map(|fragment_id| Ok(create_partition_compare_fragment_batch(fragment_id))) + .collect::>(), + create_partition_compare_fragment_batch(0).schema(), + ); + let mut dataset = Dataset::write( + batches, + &uri, + Some(WriteParams { + max_rows_per_file: PARTITION_COMPARE_ROWS_PER_FRAGMENT, + ..Default::default() + }), + ) + .await + .unwrap(); + + let fragment_groups = if segmented { + grouped_fragment_ids(PARTITION_COMPARE_INDEXED_FRAGMENT_COUNT, partition_count) + } else { + vec![(0..PARTITION_COMPARE_INDEXED_FRAGMENT_COUNT as u32).collect()] + }; + let params = InvertedIndexParams::default() + .with_position(true) + .num_workers(1) + .memory_limit_mb(memory_limit_mb); + + let mut staged_segments = Vec::with_capacity(fragment_groups.len()); + for fragment_ids in fragment_groups { + let segment = dataset + .create_index_builder(&["text"], IndexType::Inverted, ¶ms) + .name(PARTITION_COMPARE_INDEX_NAME.to_string()) + .fragments(fragment_ids) + .execute_uncommitted() + .await + .unwrap(); + staged_segments.push(segment); + } + let segments = dataset + .create_index_segment_builder() + .with_index_type(IndexType::Inverted) + .with_segments(staged_segments) + .build_all() + .await + .unwrap(); + dataset + .commit_existing_index_segments(PARTITION_COMPARE_INDEX_NAME, "text", segments) + .await + .unwrap(); + + let committed_segments = dataset + .load_indices_by_name(PARTITION_COMPARE_INDEX_NAME) + .await + .unwrap(); + if segmented { + assert_eq!(committed_segments.len(), partition_count); + for segment in &committed_segments { + assert_eq!( + count_partitions(segment), + 1, + "expected each segmented FTS segment to have exactly one partition" + ); + } + } + + BenchDataset { + _tmpdir: tmpdir, + dataset, + } +} + +async fn build_partition_compare_dataset(partition_count: usize, segmented: bool) -> BenchDataset { + if segmented { + return build_partition_compare_dataset_with_memory_limit(partition_count, true, 512).await; + } + + let mut observed = Vec::new(); + for memory_limit_mb in [ + 512, 256, 192, 160, 128, 96, 80, 64, 56, 48, 40, 36, 32, 28, 24, 20, 18, 16, 14, 12, 10, 9, + 8, 7, 6, 5, 4, 3, 2, 1, + ] { + let bench_dataset = build_partition_compare_dataset_with_memory_limit( + partition_count, + false, + memory_limit_mb, + ) + .await; + let committed_segments = bench_dataset + .dataset + .load_indices_by_name(PARTITION_COMPARE_INDEX_NAME) + .await + .unwrap(); + let actual_partition_count = if committed_segments.len() == 1 { + count_partitions(&committed_segments[0]) + } else { + 0 + }; + observed.push(( + memory_limit_mb, + committed_segments.len(), + actual_partition_count, + )); + if committed_segments.len() == 1 && actual_partition_count == partition_count { + return bench_dataset; + } + } + + panic!( + "failed to build 1 segment x {partition_count} partitions for partition-shape benchmark: {observed:?}" + ); +} + /// Benchmark full text search on Wikipedia dataset with different K values fn bench_fts_search(c: &mut Criterion) { let rt = tokio::runtime::Runtime::new().unwrap(); @@ -234,19 +404,69 @@ fn bench_segmented_fts_search(c: &mut Criterion) { group.finish(); } +fn bench_fts_segment_vs_partition_shape(c: &mut Criterion) { + let rt = tokio::runtime::Runtime::new().unwrap(); + let bench_datasets = [16_usize] + .into_iter() + .flat_map(|count| { + [ + ( + format!("{count}_segments_x_1_partition"), + rt.block_on(build_partition_compare_dataset(count, true)), + ), + ( + format!("1_segment_x_{count}_partitions"), + rt.block_on(build_partition_compare_dataset(count, false)), + ), + ] + }) + .collect::>(); + + let mut group = c.benchmark_group("fts_search_segment_vs_partition_shape"); + for (shape, bench_dataset) in &bench_datasets { + group.bench_with_input(BenchmarkId::from_parameter(shape), shape, |b, _| { + b.iter(|| { + rt.block_on(async { + let mut scanner = bench_dataset.dataset.scan(); + let query = FullTextSearchQuery::new("shared alpha".to_string()) + .with_column("text".to_string()) + .unwrap(); + let mut stream = scanner + .full_text_search(query) + .unwrap() + .limit(Some(20), None) + .unwrap() + .project(&["_rowid"]) + .unwrap() + .try_into_stream() + .await + .unwrap(); + + let mut num_rows = 0; + while let Some(batch) = stream.try_next().await.unwrap() { + num_rows += batch.num_rows(); + } + assert!(num_rows <= 20); + }) + }); + }); + } + group.finish(); +} + #[cfg(target_os = "linux")] criterion_group!( name=benches; config = Criterion::default().significance_level(0.1).sample_size(10) .with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); - targets = bench_fts_search, bench_segmented_fts_search + targets = bench_fts_search, bench_segmented_fts_search, bench_fts_segment_vs_partition_shape ); #[cfg(not(target_os = "linux"))] criterion_group!( name=benches; config = Criterion::default().significance_level(0.1).sample_size(10); - targets = bench_fts_search, bench_segmented_fts_search + targets = bench_fts_search, bench_segmented_fts_search, bench_fts_segment_vs_partition_shape ); criterion_main!(benches); diff --git a/rust/lance/src/index/scalar/inverted.rs b/rust/lance/src/index/scalar/inverted.rs index 51fc3e19335..c9094f6f015 100644 --- a/rust/lance/src/index/scalar/inverted.rs +++ b/rust/lance/src/index/scalar/inverted.rs @@ -112,6 +112,7 @@ pub(crate) async fn build_segment( dataset.object_store(), &index_dir, store, + lance_index::progress::noop_progress(), ) .await?; Ok(built_segment) From 83f53a006311fa73b015f67d1149667149e50b2d Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Tue, 14 Apr 2026 21:04:43 +0800 Subject: [PATCH 14/16] fix: silence unused vector segment parameter --- rust/lance/src/index/vector/ivf.rs | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/rust/lance/src/index/vector/ivf.rs b/rust/lance/src/index/vector/ivf.rs index c9101cea896..eb3910aa5b7 100644 --- a/rust/lance/src/index/vector/ivf.rs +++ b/rust/lance/src/index/vector/ivf.rs @@ -2195,8 +2195,15 @@ pub(crate) async fn merge_segments_with_progress( let index_version = infer_source_index_version(&segments)?; let segment_uuid = Uuid::new_v4(); let final_dir = indices_dir.child(segment_uuid.to_string()); - merge_segments_to_dir(object_store, indices_dir, &final_dir, &segments, None, progress) - .await?; + merge_segments_to_dir( + object_store, + indices_dir, + &final_dir, + &segments, + None, + progress, + ) + .await?; let files = list_index_files_with_sizes(object_store, &final_dir).await?; merged_segment = TableIndexMetadata { @@ -2222,7 +2229,7 @@ async fn merge_segments_to_dir( indices_dir: &Path, final_dir: &Path, segments: &[TableIndexMetadata], - requested_index_type: Option, + _requested_index_type: Option, progress: Arc, ) -> Result<()> { reset_final_segment_dir(object_store, final_dir).await?; From c3dfd98a0d3d2a0f73dc176edb1ab86e6366d664 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Wed, 15 Apr 2026 00:02:00 +0800 Subject: [PATCH 15/16] fix: restore segment builder call sites after main merge --- rust/lance/src/dataset/index.rs | 2 +- rust/lance/src/dataset/scanner.rs | 7 +++++ rust/lance/src/index.rs | 7 +++++ rust/lance/src/index/vector/ivf/v2.rs | 9 +++++- rust/lance/src/io/exec/fts.rs | 44 +++++++++++++++------------ 5 files changed, 47 insertions(+), 22 deletions(-) diff --git a/rust/lance/src/dataset/index.rs b/rust/lance/src/dataset/index.rs index 80d5e6dc0b1..b3b235d8015 100644 --- a/rust/lance/src/dataset/index.rs +++ b/rust/lance/src/dataset/index.rs @@ -241,7 +241,7 @@ mod tests { .unwrap(); } - let segments = vec![ + let segments = [ IndexMetadata { uuid: first_segment_uuid, fragment_bitmap: Some(std::iter::once(target_fragments[0].id() as u32).collect()), diff --git a/rust/lance/src/dataset/scanner.rs b/rust/lance/src/dataset/scanner.rs index 795ec2e0791..1617d2dcaff 100644 --- a/rust/lance/src/dataset/scanner.rs +++ b/rust/lance/src/dataset/scanner.rs @@ -4885,6 +4885,13 @@ pub mod test_dataset { .iter() .map(|segment| segment.uuid) .collect::>(); + let segments = self + .dataset + .create_index_segment_builder() + .with_index_type(params.index_type()) + .with_segments(segments) + .build_all() + .await?; self.dataset .commit_existing_index_segments("idx", "vec", segments) .await?; diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index f14be3b9e82..c6d1f3c6dad 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -2521,6 +2521,13 @@ mod tests { .iter() .map(|segment| segment.uuid) .collect::>(); + let segments = dataset + .create_index_segment_builder() + .with_index_type(params.index_type()) + .with_segments(segments) + .build_all() + .await + .unwrap(); dataset .commit_existing_index_segments(index_name, column, segments) .await diff --git a/rust/lance/src/index/vector/ivf/v2.rs b/rust/lance/src/index/vector/ivf/v2.rs index 4e0d117b874..1ca573bfe58 100644 --- a/rust/lance/src/index/vector/ivf/v2.rs +++ b/rust/lance/src/index/vector/ivf/v2.rs @@ -2875,8 +2875,15 @@ mod tests { ) .await .unwrap(); + let merged_segment = dataset + .create_index_segment_builder() + .with_index_type(params.index_type()) + .with_segments(vec![merged_segment]) + .build_all() + .await + .unwrap(); dataset - .commit_existing_index_segments(INDEX_NAME, "vector", vec![merged_segment]) + .commit_existing_index_segments(INDEX_NAME, "vector", merged_segment) .await .unwrap(); diff --git a/rust/lance/src/io/exec/fts.rs b/rust/lance/src/io/exec/fts.rs index f9f972d5bbc..2add8f7c407 100644 --- a/rust/lance/src/io/exec/fts.rs +++ b/rust/lance/src/io/exec/fts.rs @@ -162,26 +162,30 @@ async fn search_segments( ) -> Result<(Vec, Vec)> { let limit = params.limit.unwrap_or(usize::MAX); let mut candidates = std::collections::BinaryHeap::new(); - let searches = stream::iter(indices.iter().cloned().map(|index| { - let tokens = tokens.clone(); - let params = params.clone(); - let pre_filter = pre_filter.clone(); - let metrics = metrics.clone(); - let base_scorer = base_scorer.clone(); - async move { - index - .bm25_search( - tokens, - params, - operator, - pre_filter, - metrics, - Some(base_scorer.as_ref()), - ) - .await - } - })) - .buffer_unordered(get_num_compute_intensive_cpus()); + let searches = indices + .iter() + .map(|index| { + let index = Arc::clone(index); + let tokens = tokens.clone(); + let params = params.clone(); + let pre_filter = pre_filter.clone(); + let metrics = metrics.clone(); + let base_scorer = base_scorer.clone(); + async move { + index + .bm25_search( + tokens, + params, + operator, + pre_filter, + metrics, + Some(base_scorer.as_ref()), + ) + .await + } + }) + .collect::>(); + let searches = stream::iter(searches).buffer_unordered(get_num_compute_intensive_cpus()); let mut searches = searches; while let Some((doc_ids, scores)) = searches.try_next().await? { From c78f29e8a3340528192c87957504b070e2e728f1 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Mon, 20 Apr 2026 11:36:44 +0800 Subject: [PATCH 16/16] fix: restore binding support after tokenizer merge --- .../python/lance/lance/indices/__init__.pyi | 15 +++ python/src/indices.rs | 111 +++++++++++++++++- rust/lance-index/src/scalar/inverted/index.rs | 2 - rust/lance/src/io/exec/fts.rs | 2 +- 4 files changed, 125 insertions(+), 5 deletions(-) diff --git a/python/python/lance/lance/indices/__init__.pyi b/python/python/lance/lance/indices/__init__.pyi index 152ea1d10b2..fc5d03b80bd 100644 --- a/python/python/lance/lance/indices/__init__.pyi +++ b/python/python/lance/lance/indices/__init__.pyi @@ -21,6 +21,21 @@ class IndexConfig: index_type: str config: str +class IndexSegment: + uuid: str + fragment_ids: set[int] + index_version: int + + def __repr__(self) -> str: ... + +class IndexSegmentPlan: + segment: IndexSegment + segments: list[object] + estimated_bytes: int + requested_index_type: Optional[str] + + def __repr__(self) -> str: ... + def train_ivf_model( dataset, column: str, diff --git a/python/src/indices.rs b/python/src/indices.rs index 62f3c0c64ec..a778abff758 100644 --- a/python/src/indices.rs +++ b/python/src/indices.rs @@ -11,6 +11,7 @@ use arrow_data::ArrayData; use chrono::{DateTime, Utc}; use lance::dataset::Dataset as LanceDataset; use lance::index::DatasetIndexExt; +use lance::index::{IndexSegment, IndexSegmentPlan}; use lance::index::vector::ivf::builder::write_vector_storage; use lance::index::vector::pq::build_pq_model_in_fragments; use lance::io::ObjectStore; @@ -35,7 +36,7 @@ use pyo3::{ use lance::index::DatasetIndexInternalExt; use crate::fragment::FileFragment; -use crate::utils::PyJson; +use crate::utils::{PyJson, PyLance}; use crate::{ dataset::Dataset, error::PythonErrorExt, file::object_store_from_uri_or_path_no_options, rt, }; @@ -61,6 +62,96 @@ impl PyIndexConfig { } } +#[pyclass(name = "IndexSegment", module = "lance.indices")] +#[derive(Debug, Clone)] +pub struct PyIndexSegment { + pub(crate) inner: IndexSegment, +} + +impl PyIndexSegment { + pub(crate) fn from_inner(inner: IndexSegment) -> Self { + Self { inner } + } +} + +#[pymethods] +impl PyIndexSegment { + #[getter] + fn uuid(&self) -> String { + self.inner.uuid().to_string() + } + + #[getter] + fn fragment_ids(&self) -> HashSet { + self.inner.fragment_bitmap().iter().collect() + } + + #[getter] + fn index_version(&self) -> i32 { + self.inner.index_version() + } + + fn __repr__(&self) -> String { + format!( + "IndexSegment(uuid={}, fragment_ids={:?}, index_version={})", + self.uuid(), + self.fragment_ids(), + self.index_version() + ) + } +} + +#[pyclass(name = "IndexSegmentPlan", module = "lance.indices")] +#[derive(Debug, Clone)] +pub struct PyIndexSegmentPlan { + pub(crate) inner: IndexSegmentPlan, +} + +impl PyIndexSegmentPlan { + pub(crate) fn from_inner(inner: IndexSegmentPlan) -> Self { + Self { inner } + } +} + +#[pymethods] +impl PyIndexSegmentPlan { + #[getter] + fn segment(&self) -> PyIndexSegment { + PyIndexSegment::from_inner(self.inner.segment().clone()) + } + + #[getter] + fn segments(&self) -> Vec> { + self.inner + .segments() + .iter() + .cloned() + .map(PyLance) + .collect() + } + + #[getter] + fn estimated_bytes(&self) -> u64 { + self.inner.estimated_bytes() + } + + #[getter] + fn requested_index_type(&self) -> Option { + self.inner + .requested_index_type() + .map(|index_type| index_type.to_string()) + } + + fn __repr__(&self) -> String { + format!( + "IndexSegmentPlan(segments={}, estimated_bytes={}, requested_index_type={:?})", + self.inner.segments().len(), + self.estimated_bytes(), + self.requested_index_type() + ) + } +} + #[pyclass(name = "IvfModel", module = "lance.indices")] #[derive(Debug, Clone)] pub struct PyIvfModel { @@ -449,7 +540,21 @@ async fn do_load_shuffled_vectors( base_id: None, files: Some(files), }; - ds.commit_existing_index_segments(index_name, column, vec![metadata]) + let segment = IndexSegment::new( + metadata.uuid, + metadata + .fragment_bitmap + .as_ref() + .expect("vector metadata should include fragment coverage") + .iter(), + metadata + .index_details + .as_ref() + .expect("vector metadata should include index details") + .clone(), + metadata.index_version, + ); + ds.commit_existing_index_segments(index_name, column, vec![segment]) .await .infer_error()?; @@ -643,6 +748,8 @@ pub fn register_indices(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { indices.add_wrapped(wrap_pyfunction!(load_shuffled_vectors))?; indices.add_class::()?; indices.add_class::()?; + indices.add_class::()?; + indices.add_class::()?; indices.add_class::()?; indices.add_class::()?; indices.add_wrapped(wrap_pyfunction!(get_ivf_model))?; diff --git a/rust/lance-index/src/scalar/inverted/index.rs b/rust/lance-index/src/scalar/inverted/index.rs index d20436b62a5..cfec52b0692 100644 --- a/rust/lance-index/src/scalar/inverted/index.rs +++ b/rust/lance-index/src/scalar/inverted/index.rs @@ -68,7 +68,6 @@ use super::{ }; use crate::frag_reuse::FragReuseIndex; use crate::pbold; -use crate::scalar::inverted::document_tokenizer::TextTokenizer; use crate::scalar::inverted::scorer::MemBM25Scorer; use crate::scalar::inverted::tokenizer::document_tokenizer::LanceTokenizer; use crate::scalar::{ @@ -77,7 +76,6 @@ use crate::scalar::{ }; use crate::{FtsPrewarmOptions, Index}; use crate::{prefilter::PreFilter, scalar::inverted::iter::take_fst_keys}; -use lance_tokenizer::{SimpleTokenizer, TextAnalyzer}; use std::str::FromStr; // Version 0: Arrow TokenSetFormat (legacy) diff --git a/rust/lance/src/io/exec/fts.rs b/rust/lance/src/io/exec/fts.rs index cf8440808d9..411a0f2ba57 100644 --- a/rust/lance/src/io/exec/fts.rs +++ b/rust/lance/src/io/exec/fts.rs @@ -1564,13 +1564,13 @@ mod tests { use lance_datagen::{BatchCount, ByteCount, RowCount}; use lance_index::metrics::NoOpMetricsCollector; use lance_index::scalar::inverted::InvertedIndex; + use lance_index::scalar::inverted::Language; use lance_index::scalar::inverted::query::{ BooleanQuery, BoostQuery, FtsQuery, FtsSearchParams, MatchQuery, Occur, Operator, PhraseQuery, collect_query_tokens, has_query_token, }; use lance_index::scalar::{FullTextSearchQuery, InvertedIndexParams}; use lance_index::{IndexCriteria, IndexType}; - use tantivy::tokenizer::Language; use crate::{ dataset::transaction::{Operation, TransactionBuilder},