From 8fc6f89927f57a26dea7c8565c40918a32a5c327 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Mon, 17 Nov 2025 15:47:02 +0800 Subject: [PATCH 01/12] refactor: write bitmap index statistics in file instead Signed-off-by: Xuanwo --- rust/lance-index/src/scalar/bitmap.rs | 175 ++++++++++++++++++++- rust/lance-index/src/scalar/bloomfilter.rs | 12 ++ rust/lance-index/src/scalar/btree.rs | 4 + rust/lance-index/src/scalar/inverted.rs | 5 + rust/lance-index/src/scalar/json.rs | 4 + rust/lance-index/src/scalar/label_list.rs | 4 + rust/lance-index/src/scalar/ngram.rs | 4 + rust/lance-index/src/scalar/registry.rs | 13 ++ rust/lance-index/src/scalar/zonemap.rs | 4 + rust/lance/src/index.rs | 57 ++++--- 10 files changed, 253 insertions(+), 29 deletions(-) diff --git a/rust/lance-index/src/scalar/bitmap.rs b/rust/lance-index/src/scalar/bitmap.rs index 9f3779668f1..1d68049452f 100644 --- a/rust/lance-index/src/scalar/bitmap.rs +++ b/rust/lance-index/src/scalar/bitmap.rs @@ -48,6 +48,7 @@ use crate::{metrics::MetricsCollector, Index, IndexType}; use crate::{scalar::expression::ScalarQueryParser, scalar::IndexReader}; pub const BITMAP_LOOKUP_NAME: &str = "bitmap_page_lookup.lance"; +pub const INDEX_STATS_METADATA_KEY: &str = "lance:index_stats"; const MAX_BITMAP_ARRAY_LENGTH: usize = i32::MAX as usize - 1024 * 1024; // leave headroom @@ -601,6 +602,7 @@ impl BitmapIndexPlugin { index_store: &dyn IndexStore, value_type: &DataType, ) -> Result<()> { + let num_bitmaps = state.len(); let schema = Arc::new(Schema::new(vec![ Field::new("keys", value_type.clone(), true), Field::new("bitmaps", DataType::Binary, true), @@ -653,8 +655,17 @@ impl BitmapIndexPlugin { bitmap_index_file.write_record_batch(record_batch).await?; } - // Finish file once at the end - this creates the file even if we wrote no batches - bitmap_index_file.finish().await?; + // Finish file with metadata that allows lightweight statistics reads + let stats_json = serde_json::to_string(&BitmapStatistics { num_bitmaps 
}).map_err(|e| { + Error::Internal { + message: format!("failed to serialize bitmap statistics: {e}"), + location: location!(), + } + })?; + let mut metadata = HashMap::new(); + metadata.insert(INDEX_STATS_METADATA_KEY.to_string(), stats_json); + + bitmap_index_file.finish_with_metadata(metadata).await?; Ok(()) } @@ -715,6 +726,10 @@ impl ScalarIndexPlugin for BitmapIndexPlugin { true } + fn index_type(&self) -> IndexType { + IndexType::Bitmap + } + fn version(&self) -> u32 { BITMAP_INDEX_VERSION } @@ -759,19 +774,169 @@ impl ScalarIndexPlugin for BitmapIndexPlugin { ) -> Result> { Ok(BitmapIndex::load(index_store, frag_reuse_index, cache).await? as Arc) } + + async fn load_statistics( + &self, + index_store: Arc, + _index_details: &prost_types::Any, + ) -> Result> { + let reader = index_store.open_index_file(BITMAP_LOOKUP_NAME).await?; + if let Some(value) = reader.schema().metadata.get(INDEX_STATS_METADATA_KEY) { + let stats = serde_json::from_str(value).map_err(|e| Error::Internal { + message: format!("failed to parse bitmap statistics metadata: {e}"), + location: location!(), + })?; + Ok(Some(stats)) + } else { + Ok(None) + } + } } #[cfg(test)] pub mod tests { use super::*; use crate::metrics::NoOpMetricsCollector; - use crate::scalar::lance_format::LanceIndexStore; + use crate::scalar::{lance_format::LanceIndexStore, IndexStore, IndexWriter}; use arrow_array::{RecordBatch, StringArray, UInt64Array}; - use arrow_schema::{Field, Schema}; + use arrow_schema::{DataType, Field, Schema}; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; + use deepsize::DeepSizeOf; use futures::stream; - use lance_core::utils::{address::RowAddress, tempfile::TempObjDir}; + use lance_core::{ + datatypes::Schema as LanceSchema, + utils::{address::RowAddress, tempfile::TempObjDir}, + }; use lance_io::object_store::ObjectStore; + use std::{any::Any, collections::HashMap}; + + #[derive(Debug)] + struct MetadataOnlyStore { + schema: Arc, + } + + impl DeepSizeOf for 
MetadataOnlyStore { + fn deep_size_of_children(&self, ctx: &mut deepsize::Context) -> usize { + self.schema.deep_size_of_children(ctx) + } + } + + #[derive(Debug)] + struct MetadataOnlyReader { + schema: Arc, + } + + #[async_trait] + impl IndexReader for MetadataOnlyReader { + async fn read_record_batch(&self, _offset: u64, _batch_size: u64) -> Result { + panic!("metadata reader should not read record batches") + } + + async fn read_range( + &self, + _range: std::ops::Range, + _projection: Option<&[&str]>, + ) -> Result { + panic!("metadata reader should not read ranges") + } + + async fn num_batches(&self, _batch_size: u64) -> u32 { + 0 + } + + fn num_rows(&self) -> usize { + 0 + } + + fn schema(&self) -> &LanceSchema { + &self.schema + } + } + + #[async_trait] + impl IndexStore for MetadataOnlyStore { + fn as_any(&self) -> &dyn Any { + self + } + + fn io_parallelism(&self) -> usize { + 1 + } + + async fn new_index_file( + &self, + _name: &str, + _schema: Arc, + ) -> Result> { + panic!("metadata store does not support writing") + } + + async fn open_index_file(&self, _name: &str) -> Result> { + Ok(Arc::new(MetadataOnlyReader { + schema: self.schema.clone(), + })) + } + + async fn copy_index_file(&self, _name: &str, _dest_store: &dyn IndexStore) -> Result<()> { + panic!("metadata store does not support copy") + } + + async fn rename_index_file(&self, _name: &str, _new_name: &str) -> Result<()> { + panic!("metadata store does not support rename") + } + + async fn delete_index_file(&self, _name: &str) -> Result<()> { + panic!("metadata store does not support delete") + } + } + + #[tokio::test] + async fn test_bitmap_metadata_statistics_minimal_io() { + let tmpdir = TempObjDir::default(); + let store = Arc::new(LanceIndexStore::new( + Arc::new(ObjectStore::local()), + tmpdir.clone(), + Arc::new(LanceCache::no_cache()), + )); + + let colors = vec![ + "red", "blue", "green", "red", "yellow", "blue", "red", "green", + ]; + let row_ids = (0u64..colors.len() as 
u64).collect::>(); + let schema = Arc::new(Schema::new(vec![ + Field::new("value", DataType::Utf8, false), + Field::new("_rowid", DataType::UInt64, false), + ])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(StringArray::from(colors.clone())), + Arc::new(UInt64Array::from(row_ids.clone())), + ], + ) + .unwrap(); + let stream = stream::once(async move { Ok(batch) }); + let stream = Box::pin(RecordBatchStreamAdapter::new(schema, stream)); + BitmapIndexPlugin::train_bitmap_index(stream, store.as_ref()) + .await + .unwrap(); + + let reader = store.open_index_file(BITMAP_LOOKUP_NAME).await.unwrap(); + let schema = Arc::new(reader.schema().clone()); + + let metadata_store = MetadataOnlyStore { schema }; + let stats = BitmapIndexPlugin + .load_statistics(Arc::new(metadata_store), &prost_types::Any::default()) + .await + .unwrap() + .expect("bitmap metadata statistics should exist"); + + assert_eq!( + stats.get("num_bitmaps").and_then(|v| v.as_u64()).unwrap(), + 4, + "num_bitmaps should equal number of distinct values", + ); + } #[tokio::test] async fn test_bitmap_lazy_loading_and_cache() { diff --git a/rust/lance-index/src/scalar/bloomfilter.rs b/rust/lance-index/src/scalar/bloomfilter.rs index 7fef76136e2..1d000061029 100644 --- a/rust/lance-index/src/scalar/bloomfilter.rs +++ b/rust/lance-index/src/scalar/bloomfilter.rs @@ -1235,6 +1235,10 @@ impl ScalarIndexPlugin for BloomFilterIndexPlugin { false } + fn index_type(&self) -> IndexType { + IndexType::BloomFilter + } + fn version(&self) -> u32 { BLOOMFILTER_INDEX_VERSION } @@ -1259,6 +1263,14 @@ impl ScalarIndexPlugin for BloomFilterIndexPlugin { as Arc, ) } + + async fn load_statistics( + &self, + _index_store: Arc, + _index_details: &prost_types::Any, + ) -> Result> { + Ok(None) + } } #[derive(Debug)] diff --git a/rust/lance-index/src/scalar/btree.rs b/rust/lance-index/src/scalar/btree.rs index a9cd73dc5e5..a8f7d45e1ab 100644 --- a/rust/lance-index/src/scalar/btree.rs +++ 
b/rust/lance-index/src/scalar/btree.rs @@ -1941,6 +1941,10 @@ impl ScalarIndexPlugin for BTreeIndexPlugin { true } + fn index_type(&self) -> IndexType { + IndexType::BTree + } + fn version(&self) -> u32 { BTREE_INDEX_VERSION } diff --git a/rust/lance-index/src/scalar/inverted.rs b/rust/lance-index/src/scalar/inverted.rs index fb5c3cd7a68..da65bed0416 100644 --- a/rust/lance-index/src/scalar/inverted.rs +++ b/rust/lance-index/src/scalar/inverted.rs @@ -35,6 +35,7 @@ use crate::{ registry::{ScalarIndexPlugin, TrainingCriteria, TrainingOrdering, TrainingRequest}, CreatedIndex, ScalarIndex, }, + IndexType, }; use super::IndexStore; @@ -133,6 +134,10 @@ impl ScalarIndexPlugin for InvertedIndexPlugin { false } + fn index_type(&self) -> IndexType { + IndexType::Inverted + } + fn version(&self) -> u32 { INVERTED_INDEX_VERSION } diff --git a/rust/lance-index/src/scalar/json.rs b/rust/lance-index/src/scalar/json.rs index c400aec036b..2e2aa7ec21e 100644 --- a/rust/lance-index/src/scalar/json.rs +++ b/rust/lance-index/src/scalar/json.rs @@ -740,6 +740,10 @@ impl ScalarIndexPlugin for JsonIndexPlugin { true } + fn index_type(&self) -> IndexType { + IndexType::Scalar + } + fn attach_registry(&self, registry: Arc) { let mut reg_ref = self.registry.lock().unwrap(); *reg_ref = Some(registry); diff --git a/rust/lance-index/src/scalar/label_list.rs b/rust/lance-index/src/scalar/label_list.rs index b22a12f8e4a..9f8303428a6 100644 --- a/rust/lance-index/src/scalar/label_list.rs +++ b/rust/lance-index/src/scalar/label_list.rs @@ -382,6 +382,10 @@ impl ScalarIndexPlugin for LabelListIndexPlugin { true } + fn index_type(&self) -> IndexType { + IndexType::LabelList + } + fn version(&self) -> u32 { LABEL_LIST_INDEX_VERSION } diff --git a/rust/lance-index/src/scalar/ngram.rs b/rust/lance-index/src/scalar/ngram.rs index 00a2f7da5d9..210eae83724 100644 --- a/rust/lance-index/src/scalar/ngram.rs +++ b/rust/lance-index/src/scalar/ngram.rs @@ -1274,6 +1274,10 @@ impl ScalarIndexPlugin for 
NGramIndexPlugin { false } + fn index_type(&self) -> IndexType { + IndexType::NGram + } + fn version(&self) -> u32 { NGRAM_INDEX_VERSION } diff --git a/rust/lance-index/src/scalar/registry.rs b/rust/lance-index/src/scalar/registry.rs index a36e221f6a0..17a99786509 100644 --- a/rust/lance-index/src/scalar/registry.rs +++ b/rust/lance-index/src/scalar/registry.rs @@ -19,6 +19,7 @@ use crate::{ label_list::LabelListIndexPlugin, ngram::NGramIndexPlugin, zonemap::ZoneMapIndexPlugin, CreatedIndex, IndexStore, ScalarIndex, }, + IndexType, }; pub const VALUE_COLUMN_NAME: &str = "value"; @@ -126,6 +127,9 @@ pub trait ScalarIndexPlugin: Send + Sync + std::fmt::Debug { /// Returns true if the index returns an exact answer (e.g. not AtMost) fn provides_exact_answer(&self) -> bool; + /// Returns the index type for this plugin + fn index_type(&self) -> IndexType; + /// The version of the index plugin /// /// We assume that indexes are not forwards compatible. If an index was written with a @@ -153,6 +157,15 @@ pub trait ScalarIndexPlugin: Send + Sync + std::fmt::Debug { cache: &LanceCache, ) -> Result>; + /// Optional hook allowing a plugin to provide statistics without loading the index. 
+ async fn load_statistics( + &self, + _index_store: Arc, + _index_details: &prost_types::Any, + ) -> Result> { + Ok(None) + } + /// Optional hook that plugins can use if they need to be aware of the registry fn attach_registry(&self, _registry: Arc) {} } diff --git a/rust/lance-index/src/scalar/zonemap.rs b/rust/lance-index/src/scalar/zonemap.rs index 7b6e6078310..9dc8974a653 100644 --- a/rust/lance-index/src/scalar/zonemap.rs +++ b/rust/lance-index/src/scalar/zonemap.rs @@ -969,6 +969,10 @@ impl ScalarIndexPlugin for ZoneMapIndexPlugin { false } + fn index_type(&self) -> IndexType { + IndexType::ZoneMap + } + fn version(&self) -> u32 { ZONEMAP_INDEX_VERSION } diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index c24c0744c98..4966d108371 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -11,7 +11,7 @@ use arrow_schema::{DataType, Schema}; use async_trait::async_trait; use datafusion::execution::SendableRecordBatchStream; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; -use futures::{stream, StreamExt, TryStreamExt}; +use futures::stream; use itertools::Itertools; use lance_core::cache::{CacheKey, UnsizedCacheKey}; use lance_core::utils::address::RowAddress; @@ -726,25 +726,35 @@ impl DatasetIndexExt for Dataset { let field_id = metadatas[0].fields[0]; let field_path = self.schema().field_path(field_id)?; - // Open all delta indices - let indices = stream::iter(metadatas.iter()) - .then(|m| { - let field_path = field_path.clone(); - async move { - self.open_generic_index(&field_path, &m.uuid.to_string(), &NoOpMetricsCollector) - .await - } - }) - .try_collect::>() - .await?; + let mut indices_stats = Vec::with_capacity(metadatas.len()); + let mut index_type: Option = None; - // Stastistics for each delta index. 
- let indices_stats = indices - .iter() - .map(|idx| idx.statistics()) - .collect::>>()?; + for meta in metadatas.iter() { + let index_store = Arc::new(LanceIndexStore::from_dataset_for_existing(self, meta)?); + let index_details = scalar::fetch_index_details(self, &field_path, meta).await?; + let index_details_wrapper = scalar::IndexDetails(index_details.clone()); + let plugin = index_details_wrapper.get_plugin()?; + + if index_type.is_none() { + index_type = Some(plugin.index_type().to_string()); + } + + if let Some(stats) = plugin + .load_statistics(index_store.clone(), index_details.as_ref()) + .await? + { + indices_stats.push(stats); + continue; + } + + let index = self + .open_generic_index(&field_path, &meta.uuid.to_string(), &NoOpMetricsCollector) + .await?; + + indices_stats.push(index.statistics()?); + } - let index_type = indices[0].index_type().to_string(); + let index_type = index_type.unwrap_or_else(|| "Unknown".to_string()); let indexed_fragments_per_delta = self.indexed_fragments(index_name).await?; @@ -1604,24 +1614,21 @@ fn is_vector_field(data_type: DataType) -> bool { #[cfg(test)] mod tests { + use super::*; use crate::dataset::builder::DatasetBuilder; use crate::dataset::optimize::{compact_files, CompactionOptions}; use crate::dataset::{ReadParams, WriteMode, WriteParams}; use crate::index::vector::VectorIndexParams; use crate::session::Session; use crate::utils::test::{copy_test_data_to_tmp, DatagenExt, FragmentCount, FragmentRowCount}; - use arrow_array::Int32Array; - use lance_io::utils::tracking_store::IOTracker; - use lance_io::{assert_io_eq, assert_io_lt}; - - use super::*; - use arrow::array::AsArray; use arrow::datatypes::{Float32Type, Int32Type}; + use arrow_array::Int32Array; use arrow_array::{ FixedSizeListArray, Float32Array, RecordBatch, RecordBatchIterator, StringArray, }; use arrow_schema::{Field, Schema}; + use futures::stream::TryStreamExt; use lance_arrow::*; use lance_core::utils::tempfile::TempStrDir; use 
lance_datagen::gen_batch; @@ -1631,6 +1638,8 @@ mod tests { hnsw::builder::HnswBuildParams, ivf::IvfBuildParams, sq::builder::SQBuildParams, }; use lance_io::object_store::ObjectStoreParams; + use lance_io::utils::tracking_store::IOTracker; + use lance_io::{assert_io_eq, assert_io_lt}; use lance_linalg::distance::{DistanceType, MetricType}; use lance_testing::datagen::generate_random_array; use rstest::rstest; From feaa02e593323017d08a19c006362e4762b0690d Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Mon, 17 Nov 2025 17:28:23 +0800 Subject: [PATCH 02/12] Add fallback for non-existent plugin Signed-off-by: Xuanwo --- rust/lance/src/index.rs | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index 4966d108371..24ca7deabc6 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -733,24 +733,29 @@ impl DatasetIndexExt for Dataset { let index_store = Arc::new(LanceIndexStore::from_dataset_for_existing(self, meta)?); let index_details = scalar::fetch_index_details(self, &field_path, meta).await?; let index_details_wrapper = scalar::IndexDetails(index_details.clone()); - let plugin = index_details_wrapper.get_plugin()?; - if index_type.is_none() { - index_type = Some(plugin.index_type().to_string()); - } + if let Ok(plugin) = index_details_wrapper.get_plugin() { + if index_type.is_none() { + index_type = Some(plugin.index_type().to_string()); + } - if let Some(stats) = plugin - .load_statistics(index_store.clone(), index_details.as_ref()) - .await? - { - indices_stats.push(stats); - continue; + if let Some(stats) = plugin + .load_statistics(index_store.clone(), index_details.as_ref()) + .await? 
+ { + indices_stats.push(stats); + continue; + } } let index = self .open_generic_index(&field_path, &meta.uuid.to_string(), &NoOpMetricsCollector) .await?; + if index_type.is_none() { + index_type = Some(index.index_type().to_string()); + } + indices_stats.push(index.statistics()?); } From b4e2ea2c1a5d488706749b1307ae8b9ba375dca6 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Tue, 18 Nov 2025 18:27:45 +0800 Subject: [PATCH 03/12] refactor tests Signed-off-by: Xuanwo --- rust/lance-index/src/scalar/bitmap.rs | 138 +------------------------- rust/lance/src/index.rs | 93 ++++++++++++++++- 2 files changed, 94 insertions(+), 137 deletions(-) diff --git a/rust/lance-index/src/scalar/bitmap.rs b/rust/lance-index/src/scalar/bitmap.rs index 1d68049452f..1e85745d3bb 100644 --- a/rust/lance-index/src/scalar/bitmap.rs +++ b/rust/lance-index/src/scalar/bitmap.rs @@ -797,146 +797,14 @@ impl ScalarIndexPlugin for BitmapIndexPlugin { pub mod tests { use super::*; use crate::metrics::NoOpMetricsCollector; - use crate::scalar::{lance_format::LanceIndexStore, IndexStore, IndexWriter}; + use crate::scalar::lance_format::LanceIndexStore; use arrow_array::{RecordBatch, StringArray, UInt64Array}; use arrow_schema::{DataType, Field, Schema}; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; - use deepsize::DeepSizeOf; use futures::stream; - use lance_core::{ - datatypes::Schema as LanceSchema, - utils::{address::RowAddress, tempfile::TempObjDir}, - }; + use lance_core::utils::{address::RowAddress, tempfile::TempObjDir}; use lance_io::object_store::ObjectStore; - use std::{any::Any, collections::HashMap}; - - #[derive(Debug)] - struct MetadataOnlyStore { - schema: Arc, - } - - impl DeepSizeOf for MetadataOnlyStore { - fn deep_size_of_children(&self, ctx: &mut deepsize::Context) -> usize { - self.schema.deep_size_of_children(ctx) - } - } - - #[derive(Debug)] - struct MetadataOnlyReader { - schema: Arc, - } - - #[async_trait] - impl IndexReader for MetadataOnlyReader { - async 
fn read_record_batch(&self, _offset: u64, _batch_size: u64) -> Result { - panic!("metadata reader should not read record batches") - } - - async fn read_range( - &self, - _range: std::ops::Range, - _projection: Option<&[&str]>, - ) -> Result { - panic!("metadata reader should not read ranges") - } - - async fn num_batches(&self, _batch_size: u64) -> u32 { - 0 - } - - fn num_rows(&self) -> usize { - 0 - } - - fn schema(&self) -> &LanceSchema { - &self.schema - } - } - - #[async_trait] - impl IndexStore for MetadataOnlyStore { - fn as_any(&self) -> &dyn Any { - self - } - - fn io_parallelism(&self) -> usize { - 1 - } - - async fn new_index_file( - &self, - _name: &str, - _schema: Arc, - ) -> Result> { - panic!("metadata store does not support writing") - } - - async fn open_index_file(&self, _name: &str) -> Result> { - Ok(Arc::new(MetadataOnlyReader { - schema: self.schema.clone(), - })) - } - - async fn copy_index_file(&self, _name: &str, _dest_store: &dyn IndexStore) -> Result<()> { - panic!("metadata store does not support copy") - } - - async fn rename_index_file(&self, _name: &str, _new_name: &str) -> Result<()> { - panic!("metadata store does not support rename") - } - - async fn delete_index_file(&self, _name: &str) -> Result<()> { - panic!("metadata store does not support delete") - } - } - - #[tokio::test] - async fn test_bitmap_metadata_statistics_minimal_io() { - let tmpdir = TempObjDir::default(); - let store = Arc::new(LanceIndexStore::new( - Arc::new(ObjectStore::local()), - tmpdir.clone(), - Arc::new(LanceCache::no_cache()), - )); - - let colors = vec![ - "red", "blue", "green", "red", "yellow", "blue", "red", "green", - ]; - let row_ids = (0u64..colors.len() as u64).collect::>(); - let schema = Arc::new(Schema::new(vec![ - Field::new("value", DataType::Utf8, false), - Field::new("_rowid", DataType::UInt64, false), - ])); - let batch = RecordBatch::try_new( - schema.clone(), - vec![ - Arc::new(StringArray::from(colors.clone())), - 
Arc::new(UInt64Array::from(row_ids.clone())), - ], - ) - .unwrap(); - let stream = stream::once(async move { Ok(batch) }); - let stream = Box::pin(RecordBatchStreamAdapter::new(schema, stream)); - BitmapIndexPlugin::train_bitmap_index(stream, store.as_ref()) - .await - .unwrap(); - - let reader = store.open_index_file(BITMAP_LOOKUP_NAME).await.unwrap(); - let schema = Arc::new(reader.schema().clone()); - - let metadata_store = MetadataOnlyStore { schema }; - let stats = BitmapIndexPlugin - .load_statistics(Arc::new(metadata_store), &prost_types::Any::default()) - .await - .unwrap() - .expect("bitmap metadata statistics should exist"); - - assert_eq!( - stats.get("num_bitmaps").and_then(|v| v.as_u64()).unwrap(), - 4, - "num_bitmaps should equal number of distinct values", - ); - } + use std::collections::HashMap; #[tokio::test] async fn test_bitmap_lazy_loading_and_cache() { diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index 24ca7deabc6..15774a61e22 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -1632,13 +1632,16 @@ mod tests { use arrow_array::{ FixedSizeListArray, Float32Array, RecordBatch, RecordBatchIterator, StringArray, }; - use arrow_schema::{Field, Schema}; + use arrow_schema::{DataType, Field, Schema}; use futures::stream::TryStreamExt; use lance_arrow::*; use lance_core::utils::tempfile::TempStrDir; use lance_datagen::gen_batch; use lance_datagen::{array, BatchCount, Dimension, RowCount}; - use lance_index::scalar::{FullTextSearchQuery, InvertedIndexParams, ScalarIndexParams}; + use lance_index::scalar::bitmap::BITMAP_LOOKUP_NAME; + use lance_index::scalar::{ + BuiltinIndexType, FullTextSearchQuery, InvertedIndexParams, ScalarIndexParams, + }; use lance_index::vector::{ hnsw::builder::HnswBuildParams, ivf::IvfBuildParams, sq::builder::SQBuildParams, }; @@ -1709,6 +1712,92 @@ mod tests { .is_err()); } + #[tokio::test] + async fn test_bitmap_index_statistics_minimal_io_via_dataset() { + const NUM_ROWS: usize = 
500_000; + let test_dir = TempStrDir::default(); + let schema = Arc::new(Schema::new(vec![Field::new( + "status", + DataType::Int32, + false, + )])); + let values: Vec<i32> = (0..NUM_ROWS as i32).collect(); + let batch = + RecordBatch::try_new(schema.clone(), vec![Arc::new(Int32Array::from(values))]).unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + + let io_tracker = Arc::new(IOTracker::default()); + let write_params = WriteParams { + store_params: Some(ObjectStoreParams { + object_store_wrapper: Some(io_tracker.clone()), + ..Default::default() + }), + ..WriteParams::default() + }; + + Dataset::write(reader, &test_dir, Some(write_params)) + .await + .unwrap(); + + let read_params = ReadParams { + store_options: Some(ObjectStoreParams { + object_store_wrapper: Some(io_tracker.clone()), + ..Default::default() + }), + ..ReadParams::default() + }; + + let mut dataset = DatasetBuilder::from_uri(&test_dir) + .with_read_params(read_params) + .load() + .await + .unwrap(); + + let params = ScalarIndexParams::for_builtin(BuiltinIndexType::Bitmap); + dataset + .create_index( + &["status"], + IndexType::Bitmap, + Some("status_idx".to_string()), + &params, + true, + ) + .await + .unwrap(); + + let indices = dataset.load_indices().await.unwrap(); + let index_meta = indices + .iter() + .find(|idx| idx.name == "status_idx") + .expect("status_idx should exist"); + let lookup_path = dataset + .indice_files_dir(index_meta) + .unwrap() + .child(index_meta.uuid.to_string()) + .child(BITMAP_LOOKUP_NAME); + let meta = dataset.object_store.inner.head(&lookup_path).await.unwrap(); + assert!( + meta.size >= 1_000_000, + "bitmap index should be large enough to fail without metadata path, size={} bytes", + meta.size + ); + + io_tracker.incremental_stats(); + + dataset.index_statistics("status_idx").await.unwrap(); + + let stats = io_tracker.incremental_stats(); + assert!( + stats.read_bytes < 1024, + "index_statistics should only read metadata, read {} bytes", 
stats.read_bytes + ); + assert_eq!( + stats.write_bytes, 0, + "index_statistics should not perform writes" + ); + } + fn sample_vector_field() -> Field { let dimensions = 16; let column_name = "vec"; From d80c5fd67efb92e39e23d33379ea836e6dbfa70e Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Fri, 21 Nov 2025 18:28:39 +0800 Subject: [PATCH 04/12] Fix ReadParams import and written_bytes field name in test Signed-off-by: Xuanwo --- rust/lance/src/index.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index b01f963a430..f87dac2a53c 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -1800,7 +1800,7 @@ mod tests { use super::*; use crate::dataset::builder::DatasetBuilder; use crate::dataset::optimize::{compact_files, CompactionOptions}; - use crate::dataset::{WriteMode, WriteParams}; + use crate::dataset::{ReadParams, WriteMode, WriteParams}; use crate::index::vector::VectorIndexParams; use crate::session::Session; use crate::utils::test::{copy_test_data_to_tmp, DatagenExt, FragmentCount, FragmentRowCount}; @@ -1971,7 +1971,7 @@ mod tests { stats.read_bytes ); assert_eq!( - stats.write_bytes, 0, + stats.written_bytes, 0, "index_statistics should not perform writes" ); } From 22206e743ef8d845e644f2a0c3adfd1a91adc5f8 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Sat, 22 Nov 2025 02:18:43 +0800 Subject: [PATCH 05/12] Apply suggestions from code review Co-authored-by: Will Jones --- rust/lance/src/index.rs | 38 ++++++++++---------------------------- 1 file changed, 10 insertions(+), 28 deletions(-) diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index f87dac2a53c..e94428c012f 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -1904,32 +1904,10 @@ mod tests { RecordBatch::try_new(schema.clone(), vec![Arc::new(Int32Array::from(values))]).unwrap(); let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); - let io_tracker = Arc::new(IOTracker::default()); - let write_params = WriteParams { - 
store_params: Some(ObjectStoreParams { - object_store_wrapper: Some(io_tracker.clone()), - ..Default::default() - }), - ..WriteParams::default() - }; - - Dataset::write(reader, &test_dir, Some(write_params)) - .await - .unwrap(); - - let read_params = ReadParams { - store_options: Some(ObjectStoreParams { - object_store_wrapper: Some(io_tracker.clone()), - ..Default::default() - }), - ..ReadParams::default() - }; - - let mut dataset = DatasetBuilder::from_uri(&test_dir) - .with_read_params(read_params) - .load() + let mut dataset = Dataset::write(reader, &test_dir, Some(write_params)) .await .unwrap(); + let io_tracker = dataset.object_store().io_tracker().clone(); let params = ScalarIndexParams::for_builtin(BuiltinIndexType::Bitmap); dataset @@ -1965,13 +1943,17 @@ mod tests { dataset.index_statistics("status_idx").await.unwrap(); let stats = io_tracker.incremental_stats(); - assert!( - stats.read_bytes < 1024, + assert_io_lt!( + stats, + read_bytes, + 1024, "index_statistics should only read metadata, read {} bytes", stats.read_bytes ); - assert_eq!( - stats.written_bytes, 0, + assert_io_eq!( + stats, + written_bytes, + 0, "index_statistics should not perform writes" ); } From 5426f1f8e36b6102ca3006d8e59d9f9f6fdd87a3 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Sat, 22 Nov 2025 02:24:50 +0800 Subject: [PATCH 06/12] Fix build Signed-off-by: Xuanwo --- rust/lance/src/index.rs | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index e94428c012f..89b557856d3 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -1800,7 +1800,7 @@ mod tests { use super::*; use crate::dataset::builder::DatasetBuilder; use crate::dataset::optimize::{compact_files, CompactionOptions}; - use crate::dataset::{ReadParams, WriteMode, WriteParams}; + use crate::dataset::{WriteMode, WriteParams}; use crate::index::vector::VectorIndexParams; use crate::session::Session; use 
crate::utils::test::{copy_test_data_to_tmp, DatagenExt, FragmentCount, FragmentRowCount}; @@ -1823,8 +1823,6 @@ mod tests { use lance_index::vector::{ hnsw::builder::HnswBuildParams, ivf::IvfBuildParams, sq::builder::SQBuildParams, }; - use lance_io::object_store::ObjectStoreParams; - use lance_io::utils::tracking_store::IOTracker; use lance_io::{assert_io_eq, assert_io_lt}; use lance_linalg::distance::{DistanceType, MetricType}; use lance_testing::datagen::generate_random_array; @@ -1904,9 +1902,7 @@ mod tests { RecordBatch::try_new(schema.clone(), vec![Arc::new(Int32Array::from(values))]).unwrap(); let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); - let mut dataset = Dataset::write(reader, &test_dir, Some(write_params)) - .await - .unwrap(); + let mut dataset = Dataset::write(reader, &test_dir, None).await.unwrap(); let io_tracker = dataset.object_store().io_tracker().clone(); let params = ScalarIndexParams::for_builtin(BuiltinIndexType::Bitmap); @@ -1938,6 +1934,7 @@ mod tests { meta.size ); + // Reset stats collected during index creation io_tracker.incremental_stats(); dataset.index_statistics("status_idx").await.unwrap(); From 5bfbf4df9b2e0f26477d066300fc8e84b70a6a27 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Mon, 24 Nov 2025 14:36:12 +0800 Subject: [PATCH 07/12] Fix CI Signed-off-by: Xuanwo --- rust/lance/src/index.rs | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index 89b557856d3..a4b6ddddc0c 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -1940,13 +1940,20 @@ mod tests { dataset.index_statistics("status_idx").await.unwrap(); let stats = io_tracker.incremental_stats(); - assert_io_lt!( + assert_io_eq!( stats, read_bytes, - 1024, - "index_statistics should only read metadata, read {} bytes", + 4096, + "index_statistics should only read the index footer; got {} bytes", stats.read_bytes ); + assert_io_lt!( + stats, + read_iops, + 3, 
+ "index_statistics should only require a head plus one range read; got {} ops", + stats.read_iops + ); assert_io_eq!( stats, written_bytes, From 417971ea87d2ade7387b9a5e1d9b0118ecace975 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Wed, 26 Nov 2025 19:05:38 +0800 Subject: [PATCH 08/12] Address comments Signed-off-by: Xuanwo --- rust/lance/src/index.rs | 52 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index a4b6ddddc0c..c04c6f6a5fd 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -196,6 +196,33 @@ fn auto_migrate_corruption() -> bool { }) } +/// Derive a friendly (but not necessarily unique) type name from a type URL. +fn friendly_type_name_from_uri(index_uri: &str) -> String { + let type_name = index_uri.rsplit('/').next().unwrap_or(index_uri); + type_name + .strip_suffix("IndexDetails") + .unwrap_or(type_name) + .to_string() +} + +/// Legacy mapping from type URL to the old IndexType string for backwards compatibility. +fn legacy_type_name(index_uri: &str) -> String { + let type_name = index_uri.rsplit('/').next().unwrap_or(index_uri); + match type_name { + "BTreeIndexDetails" => IndexType::BTree.to_string(), + "BitmapIndexDetails" => IndexType::Bitmap.to_string(), + "LabelListIndexDetails" => IndexType::LabelList.to_string(), + "NGramIndexDetails" => IndexType::NGram.to_string(), + "ZoneMapIndexDetails" => IndexType::ZoneMap.to_string(), + "BloomFilterIndexDetails" => IndexType::BloomFilter.to_string(), + "InvertedIndexDetails" => IndexType::Inverted.to_string(), + "JsonIndexDetails" => IndexType::Scalar.to_string(), + "FlatIndexDetails" => IndexType::Vector.to_string(), + "VectorIndexDetails" => IndexType::Vector.to_string(), + _ => "N/A".to_string(), + } +} + /// Builds index. 
#[async_trait] pub trait IndexBuilder { @@ -905,16 +932,20 @@ impl DatasetIndexExt for Dataset { let field_path = self.schema().field_path(field_id)?; let mut indices_stats = Vec::with_capacity(metadatas.len()); - let mut index_type: Option = None; + let mut index_uri: Option = None; + let mut index_typename: Option = None; for meta in metadatas.iter() { let index_store = Arc::new(LanceIndexStore::from_dataset_for_existing(self, meta)?); let index_details = scalar::fetch_index_details(self, &field_path, meta).await?; + if index_uri.is_none() { + index_uri = Some(index_details.type_url.clone()); + } let index_details_wrapper = scalar::IndexDetails(index_details.clone()); if let Ok(plugin) = index_details_wrapper.get_plugin() { - if index_type.is_none() { - index_type = Some(plugin.index_type().to_string()); + if index_typename.is_none() { + index_typename = Some(plugin.name().to_string()); } if let Some(stats) = plugin @@ -930,14 +961,21 @@ impl DatasetIndexExt for Dataset { .open_generic_index(&field_path, &meta.uuid.to_string(), &NoOpMetricsCollector) .await?; - if index_type.is_none() { - index_type = Some(index.index_type().to_string()); + if index_typename.is_none() { + // Fall back to a friendly name from the type URL if the plugin is unknown + let uri = index_uri + .as_deref() + .unwrap_or_else(|| index_details.type_url.as_str()); + index_typename = Some(friendly_type_name_from_uri(uri)); } indices_stats.push(index.statistics()?); } - let index_type = index_type.unwrap_or_else(|| "Unknown".to_string()); + let index_uri = index_uri.unwrap_or_else(|| "unknown".to_string()); + let index_typename = + index_typename.unwrap_or_else(|| friendly_type_name_from_uri(&index_uri)); + let index_type = legacy_type_name(&index_uri); let indexed_fragments_per_delta = self.indexed_fragments(index_name).await?; @@ -1016,6 +1054,8 @@ impl DatasetIndexExt for Dataset { let stats = json!({ "index_type": index_type, + "index_uri": index_uri, + "index_typename": index_typename, 
"name": index_name, "num_indices": metadatas.len(), "indices": indices_stats, From b1b86285db91c705ce6a2997deb5d3f6f534599e Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Mon, 8 Dec 2025 21:24:37 +0800 Subject: [PATCH 09/12] Remove not needed plugin type api Signed-off-by: Xuanwo --- rust/lance-index/src/scalar/bitmap.rs | 4 ---- rust/lance-index/src/scalar/bloomfilter.rs | 4 ---- rust/lance-index/src/scalar/btree.rs | 4 ---- rust/lance-index/src/scalar/inverted.rs | 5 ----- rust/lance-index/src/scalar/json.rs | 4 ---- rust/lance-index/src/scalar/label_list.rs | 4 ---- rust/lance-index/src/scalar/ngram.rs | 4 ---- rust/lance-index/src/scalar/registry.rs | 4 ---- rust/lance-index/src/scalar/zonemap.rs | 4 ---- 9 files changed, 37 deletions(-) diff --git a/rust/lance-index/src/scalar/bitmap.rs b/rust/lance-index/src/scalar/bitmap.rs index b8ca825729e..8d9eb8fe60e 100644 --- a/rust/lance-index/src/scalar/bitmap.rs +++ b/rust/lance-index/src/scalar/bitmap.rs @@ -749,10 +749,6 @@ impl ScalarIndexPlugin for BitmapIndexPlugin { true } - fn index_type(&self) -> IndexType { - IndexType::Bitmap - } - fn version(&self) -> u32 { BITMAP_INDEX_VERSION } diff --git a/rust/lance-index/src/scalar/bloomfilter.rs b/rust/lance-index/src/scalar/bloomfilter.rs index c6e86cc23fc..f29d18a6095 100644 --- a/rust/lance-index/src/scalar/bloomfilter.rs +++ b/rust/lance-index/src/scalar/bloomfilter.rs @@ -1285,10 +1285,6 @@ impl ScalarIndexPlugin for BloomFilterIndexPlugin { false } - fn index_type(&self) -> IndexType { - IndexType::BloomFilter - } - fn version(&self) -> u32 { BLOOMFILTER_INDEX_VERSION } diff --git a/rust/lance-index/src/scalar/btree.rs b/rust/lance-index/src/scalar/btree.rs index bc593c102ac..291e32169ad 100644 --- a/rust/lance-index/src/scalar/btree.rs +++ b/rust/lance-index/src/scalar/btree.rs @@ -1939,10 +1939,6 @@ impl ScalarIndexPlugin for BTreeIndexPlugin { true } - fn index_type(&self) -> IndexType { - IndexType::BTree - } - fn version(&self) -> u32 { BTREE_INDEX_VERSION } 
diff --git a/rust/lance-index/src/scalar/inverted.rs b/rust/lance-index/src/scalar/inverted.rs index ecd44473f2d..e8644600513 100644 --- a/rust/lance-index/src/scalar/inverted.rs +++ b/rust/lance-index/src/scalar/inverted.rs @@ -35,7 +35,6 @@ use crate::{ registry::{ScalarIndexPlugin, TrainingCriteria, TrainingOrdering, TrainingRequest}, CreatedIndex, ScalarIndex, }, - IndexType, }; use super::IndexStore; @@ -138,10 +137,6 @@ impl ScalarIndexPlugin for InvertedIndexPlugin { false } - fn index_type(&self) -> IndexType { - IndexType::Inverted - } - fn version(&self) -> u32 { INVERTED_INDEX_VERSION } diff --git a/rust/lance-index/src/scalar/json.rs b/rust/lance-index/src/scalar/json.rs index 45aca4f7a3b..82501444291 100644 --- a/rust/lance-index/src/scalar/json.rs +++ b/rust/lance-index/src/scalar/json.rs @@ -742,10 +742,6 @@ impl ScalarIndexPlugin for JsonIndexPlugin { true } - fn index_type(&self) -> IndexType { - IndexType::Scalar - } - fn attach_registry(&self, registry: Arc) { let mut reg_ref = self.registry.lock().unwrap(); *reg_ref = Some(registry); diff --git a/rust/lance-index/src/scalar/label_list.rs b/rust/lance-index/src/scalar/label_list.rs index 5b7c9d90c4d..91d3a9063fe 100644 --- a/rust/lance-index/src/scalar/label_list.rs +++ b/rust/lance-index/src/scalar/label_list.rs @@ -386,10 +386,6 @@ impl ScalarIndexPlugin for LabelListIndexPlugin { true } - fn index_type(&self) -> IndexType { - IndexType::LabelList - } - fn version(&self) -> u32 { LABEL_LIST_INDEX_VERSION } diff --git a/rust/lance-index/src/scalar/ngram.rs b/rust/lance-index/src/scalar/ngram.rs index d81f9a121e3..aec9cc29dcc 100644 --- a/rust/lance-index/src/scalar/ngram.rs +++ b/rust/lance-index/src/scalar/ngram.rs @@ -1278,10 +1278,6 @@ impl ScalarIndexPlugin for NGramIndexPlugin { false } - fn index_type(&self) -> IndexType { - IndexType::NGram - } - fn version(&self) -> u32 { NGRAM_INDEX_VERSION } diff --git a/rust/lance-index/src/scalar/registry.rs b/rust/lance-index/src/scalar/registry.rs 
index e1439faa322..76b088518e3 100644 --- a/rust/lance-index/src/scalar/registry.rs +++ b/rust/lance-index/src/scalar/registry.rs @@ -12,7 +12,6 @@ use crate::registry::IndexPluginRegistry; use crate::{ frag_reuse::FragReuseIndex, scalar::{expression::ScalarQueryParser, CreatedIndex, IndexStore, ScalarIndex}, - IndexType, }; pub const VALUE_COLUMN_NAME: &str = "value"; @@ -130,9 +129,6 @@ pub trait ScalarIndexPlugin: Send + Sync + std::fmt::Debug { /// Returns true if the index returns an exact answer (e.g. not AtMost) fn provides_exact_answer(&self) -> bool; - /// Returns the index type for this plugin - fn index_type(&self) -> IndexType; - /// The version of the index plugin /// /// We assume that indexes are not forwards compatible. If an index was written with a diff --git a/rust/lance-index/src/scalar/zonemap.rs b/rust/lance-index/src/scalar/zonemap.rs index 0459c2e84fb..0b4b94e7e30 100644 --- a/rust/lance-index/src/scalar/zonemap.rs +++ b/rust/lance-index/src/scalar/zonemap.rs @@ -1023,10 +1023,6 @@ impl ScalarIndexPlugin for ZoneMapIndexPlugin { false } - fn index_type(&self) -> IndexType { - IndexType::ZoneMap - } - fn version(&self) -> u32 { ZONEMAP_INDEX_VERSION } From ebda2af775eb5b1e08ac2b41da38b11f104e7c55 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Mon, 8 Dec 2025 22:32:26 +0800 Subject: [PATCH 10/12] Fix build Signed-off-by: Xuanwo --- rust/lance/src/index.rs | 42 +++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index 52cd94c265c..74cd5f7b8f1 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -197,28 +197,31 @@ fn auto_migrate_corruption() -> bool { } /// Derive a friendly (but not necessarily unique) type name from a type URL. -fn friendly_type_name_from_uri(index_uri: &str) -> String { +/// Extract a human-friendly type name from a type URL. 
+/// +/// Strips prefixes like `type.googleapis.com/` and package names, then removes +/// trailing `IndexDetails` / `Index` so callers get a concise display name. +fn type_name_from_uri(index_uri: &str) -> String { let type_name = index_uri.rsplit('/').next().unwrap_or(index_uri); - type_name - .strip_suffix("IndexDetails") - .unwrap_or(type_name) - .to_string() + let type_name = type_name.rsplit('.').next().unwrap_or(type_name); + type_name.trim_end_matches("IndexDetails").to_string() } /// Legacy mapping from type URL to the old IndexType string for backwards compatibility. fn legacy_type_name(index_uri: &str) -> String { - let type_name = index_uri.rsplit('/').next().unwrap_or(index_uri); - match type_name { - "BTreeIndexDetails" => IndexType::BTree.to_string(), - "BitmapIndexDetails" => IndexType::Bitmap.to_string(), - "LabelListIndexDetails" => IndexType::LabelList.to_string(), - "NGramIndexDetails" => IndexType::NGram.to_string(), - "ZoneMapIndexDetails" => IndexType::ZoneMap.to_string(), - "BloomFilterIndexDetails" => IndexType::BloomFilter.to_string(), - "InvertedIndexDetails" => IndexType::Inverted.to_string(), - "JsonIndexDetails" => IndexType::Scalar.to_string(), - "FlatIndexDetails" => IndexType::Vector.to_string(), - "VectorIndexDetails" => IndexType::Vector.to_string(), + let base = type_name_from_uri(index_uri); + + match base.as_str() { + "BTree" => IndexType::BTree.to_string(), + "Bitmap" => IndexType::Bitmap.to_string(), + "LabelList" => IndexType::LabelList.to_string(), + "NGram" => IndexType::NGram.to_string(), + "ZoneMap" => IndexType::ZoneMap.to_string(), + "BloomFilter" => IndexType::BloomFilter.to_string(), + "Inverted" => IndexType::Inverted.to_string(), + "Json" => IndexType::Scalar.to_string(), + "Flat" | "Vector" => IndexType::Vector.to_string(), + other if other.contains("Vector") => IndexType::Vector.to_string(), _ => "N/A".to_string(), } } @@ -966,15 +969,14 @@ impl DatasetIndexExt for Dataset { let uri = index_uri .as_deref() 
.unwrap_or_else(|| index_details.type_url.as_str()); - index_typename = Some(friendly_type_name_from_uri(uri)); + index_typename = Some(type_name_from_uri(uri)); } indices_stats.push(index.statistics()?); } let index_uri = index_uri.unwrap_or_else(|| "unknown".to_string()); - let index_typename = - index_typename.unwrap_or_else(|| friendly_type_name_from_uri(&index_uri)); + let index_typename = index_typename.unwrap_or_else(|| type_name_from_uri(&index_uri)); let index_type = legacy_type_name(&index_uri); let indexed_fragments_per_delta = self.indexed_fragments(index_name).await?; From 36c20c25f95ff6b4de463db2d53c9244fac53f9d Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Mon, 8 Dec 2025 23:16:10 +0800 Subject: [PATCH 11/12] remove not used info Signed-off-by: Xuanwo --- rust/lance/src/index.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index 74cd5f7b8f1..dc408b8b0a4 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -976,7 +976,6 @@ impl DatasetIndexExt for Dataset { } let index_uri = index_uri.unwrap_or_else(|| "unknown".to_string()); - let index_typename = index_typename.unwrap_or_else(|| type_name_from_uri(&index_uri)); let index_type = legacy_type_name(&index_uri); let indexed_fragments_per_delta = self.indexed_fragments(index_name).await?; @@ -1056,8 +1055,6 @@ impl DatasetIndexExt for Dataset { let stats = json!({ "index_type": index_type, - "index_uri": index_uri, - "index_typename": index_typename, "name": index_name, "num_indices": metadatas.len(), "indices": indices_stats, From a22ef3b6477e4fcff2fc67e0baac42cd719742e2 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Mon, 8 Dec 2025 23:32:56 +0800 Subject: [PATCH 12/12] Make sure vector index type returned correctly Signed-off-by: Xuanwo --- rust/lance/src/index.rs | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index dc408b8b0a4..522bd9a2aa9 100644 
--- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -208,7 +208,16 @@ fn type_name_from_uri(index_uri: &str) -> String { } /// Legacy mapping from type URL to the old IndexType string for backwards compatibility. -fn legacy_type_name(index_uri: &str) -> String { +/// Legacy mapping from type URL to the old IndexType string for backwards compatibility. +/// +/// If `index_type_hint` is provided (e.g. parsed from the index statistics of a concrete +/// index instance), it takes precedence so callers can surface the exact index type even +/// when the type URL alone is too generic (such as VectorIndexDetails). +fn legacy_type_name(index_uri: &str, index_type_hint: Option<&str>) -> String { + if let Some(hint) = index_type_hint { + return hint.to_string(); + } + let base = type_name_from_uri(index_uri); match base.as_str() { @@ -976,7 +985,11 @@ impl DatasetIndexExt for Dataset { } let index_uri = index_uri.unwrap_or_else(|| "unknown".to_string()); - let index_type = legacy_type_name(&index_uri); + let index_type_hint = indices_stats + .first() + .and_then(|stats| stats.get("index_type")) + .and_then(|v| v.as_str()); + let index_type = legacy_type_name(&index_uri, index_type_hint); let indexed_fragments_per_delta = self.indexed_fragments(index_name).await?;