From 936dd22aa9a155b1abf9ee1cd3eb5dcb73e12d9a Mon Sep 17 00:00:00 2001 From: ztorchan <976762403@qq.com> Date: Fri, 21 Nov 2025 00:10:07 +0800 Subject: [PATCH 1/3] fix: update btree index with its own zone size instead of DEFAULT_BTREE_BATCH_SIZE --- rust/lance-index/src/scalar/btree.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rust/lance-index/src/scalar/btree.rs b/rust/lance-index/src/scalar/btree.rs index 080f0cb4a34..9bdbee58411 100644 --- a/rust/lance-index/src/scalar/btree.rs +++ b/rust/lance-index/src/scalar/btree.rs @@ -1226,13 +1226,13 @@ impl ScalarIndex for BTreeIndex { // Merge the existing index data with the new data and then retrain the index on the merged stream let merged_data_source = self .clone() - .combine_old_new(new_data, DEFAULT_BTREE_BATCH_SIZE) + .combine_old_new(new_data, self.batch_size) .await?; train_btree_index( merged_data_source, self.sub_index.as_ref(), dest_store, - DEFAULT_BTREE_BATCH_SIZE, + self.batch_size, None, ) .await?; From 024d7437e4ec7ba0fb5e12ae862327a9c2dacd2f Mon Sep 17 00:00:00 2001 From: ztorchan <976762403@qq.com> Date: Sun, 23 Nov 2025 14:37:51 +0800 Subject: [PATCH 2/3] test: add a unit test for optimize btree index --- rust/lance/src/index/scalar.rs | 134 ++++++++++++++++++++++++++++++++- 1 file changed, 133 insertions(+), 1 deletion(-) diff --git a/rust/lance/src/index/scalar.rs b/rust/lance/src/index/scalar.rs index 76fc909b2a8..879c9739cb4 100644 --- a/rust/lance/src/index/scalar.rs +++ b/rust/lance/src/index/scalar.rs @@ -537,7 +537,7 @@ mod tests { use lance_core::utils::tempfile::TempStrDir; use lance_core::{datatypes::Field, utils::address::RowAddress}; use lance_datagen::array; - use lance_index::IndexType; + use lance_index::{optimize::OptimizeOptions, IndexType}; use lance_index::{pbold::NGramIndexDetails, scalar::BuiltinIndexType}; use lance_table::format::pb::VectorIndexDetails; @@ -854,6 +854,138 @@ mod tests { } } + #[tokio::test] + async fn 
test_optimize_scalar_index_btree() { + use crate::dataset::Dataset; + use arrow_array::types::Float32Type; + use lance_datagen::{array, BatchCount, RowCount}; + use lance_index::metrics::NoOpMetricsCollector; + use lance_index::scalar::ScalarIndexParams; + use lance_index::DatasetIndexExt; + + let test_dir = TempStrDir::default(); + let uri = format!("{}/source", test_dir.as_str()); + + // Create source dataset with BTree index + let reader = lance_datagen::gen_batch() + .col("id", array::step::()) + .col("value", array::rand::()) + .into_reader_rows(RowCount::from(100), BatchCount::from(1)); + let mut dataset = Dataset::write(reader, &uri, None) + .await + .unwrap(); + + // Create BTree index on source with custom zone_size + use lance_index::scalar::btree::BTreeParameters; + + let btree_params = BTreeParameters { + zone_size: Some(50), + }; + let params_json = serde_json::to_value(&btree_params).unwrap(); + let index_params = + ScalarIndexParams::for_builtin(lance_index::scalar::BuiltinIndexType::BTree) + .with_params(&params_json); + + dataset + .create_index( + &["id"], + IndexType::BTree, + Some("id_btree".to_string()), + &index_params, + false, + ) + .await + .unwrap(); + + // Verify index was created + let indices = dataset.load_indices().await.unwrap(); + assert_eq!(indices.len(), 1, "Target should have 1 index"); + assert_eq!( + indices[0].name, "id_btree", + "Index name should match" + ); + assert_eq!( + indices[0].fields, + vec![0], + "Index should be on field 0 (id)" + ); + + // Verify the index type is correct + let scalar_index = dataset + .open_scalar_index( + "id", + &indices[0].uuid.to_string(), + &NoOpMetricsCollector, + ) + .await + .unwrap(); + + assert_eq!( + scalar_index.index_type(), + IndexType::BTree, + "Index type should be BTree" + ); + + // Verify BTree parameters are preserved + let derived_params = scalar_index.derive_index_params().unwrap(); + if let Some(params_json) = derived_params.params { + let params: BTreeParameters = 
serde_json::from_str(&params_json).unwrap(); + assert_eq!(params.zone_size, Some(50), "BTree zone_size should be 50"); + } else { + panic!("BTree index should have parameters"); + } + + // Append more data to dataset + let reader = lance_datagen::gen_batch() + .col("id", array::step::()) + .col("value", array::rand::()) + .into_reader_rows(RowCount::from(200), BatchCount::from(1)); + dataset.append (reader, None).await.unwrap(); + + // Optimize BTree index + let optimize_index_options = OptimizeOptions::new().index_names(vec!["id_btree".to_string()]); + dataset + .optimize_indices(&optimize_index_options) + .await + .unwrap(); + + // Verify BTree parameters are same after optimization + let indices = dataset.load_indices().await.unwrap(); + assert_eq!(indices.len(), 1, "Target should have 1 index"); + assert_eq!( + indices[0].name, "id_btree", + "Index name should match" + ); + assert_eq!( + indices[0].fields, + vec![0], + "Index should be on field 0 (id)" + ); + + let scalar_index = dataset + .open_scalar_index( + "id", + &indices[0].uuid.to_string(), + &NoOpMetricsCollector, + ) + .await + .unwrap(); + + assert_eq!( + scalar_index.index_type(), + IndexType::BTree, + "Index type should be BTree" + ); + + let derived_params = scalar_index.derive_index_params().unwrap(); + if let Some(params_json) = derived_params.params { + let params: BTreeParameters = serde_json::from_str(&params_json).unwrap(); + assert_eq!(params.zone_size, Some(50), "BTree zone_size should be 50"); + } else { + panic!("BTree index should have parameters"); + } + } + #[tokio::test] async fn test_initialize_scalar_index_bitmap() { use crate::dataset::Dataset; From 38531713266d0269b21fd876a3e03114167a151c Mon Sep 17 00:00:00 2001 From: Will Jones Date: Mon, 24 Nov 2025 09:29:39 -0800 Subject: [PATCH 3/3] format --- rust/lance/src/index/scalar.rs | 35 ++++++++++------------------------ 1 file changed, 10 insertions(+), 25 deletions(-) diff --git a/rust/lance/src/index/scalar.rs 
b/rust/lance/src/index/scalar.rs index 879c9739cb4..25b60761139 100644 --- a/rust/lance/src/index/scalar.rs +++ b/rust/lance/src/index/scalar.rs @@ -871,9 +871,7 @@ mod tests { .col("id", array::step::()) .col("value", array::rand::()) .into_reader_rows(RowCount::from(100), BatchCount::from(1)); - let mut dataset = Dataset::write(reader, &uri, None) - .await - .unwrap(); + let mut dataset = Dataset::write(reader, &uri, None).await.unwrap(); // Create BTree index on source with custom zone_size use lance_index::scalar::btree::BTreeParameters; @@ -896,14 +894,11 @@ mod tests { ) .await .unwrap(); - + // Verify index was created let indices = dataset.load_indices().await.unwrap(); assert_eq!(indices.len(), 1, "Target should have 1 index"); - assert_eq!( - indices[0].name, "id_btree", - "Index name should match" - ); + assert_eq!(indices[0].name, "id_btree", "Index name should match"); assert_eq!( indices[0].fields, vec![0], @@ -912,11 +907,7 @@ mod tests { // Verify the index type is correct let scalar_index = dataset - .open_scalar_index( - "id", - &indices[0].uuid.to_string(), - &NoOpMetricsCollector, - ) + .open_scalar_index("id", &indices[0].uuid.to_string(), &NoOpMetricsCollector) .await .unwrap(); @@ -940,22 +931,20 @@ mod tests { .col("id", array::step::()) .col("value", array::rand::()) .into_reader_rows(RowCount::from(200), BatchCount::from(1)); - dataset.append (reader, None).await.unwrap(); + dataset.append(reader, None).await.unwrap(); // Optimize BTree index - let optimize_index_options = OptimizeOptions::new().index_names(vec!["id_btree".to_string()]); + let optimize_index_options = + OptimizeOptions::new().index_names(vec!["id_btree".to_string()]); dataset .optimize_indices(&optimize_index_options) .await .unwrap(); - + // Verify BTree parameters are same after optimization let indices = dataset.load_indices().await.unwrap(); assert_eq!(indices.len(), 1, "Target should have 1 index"); - assert_eq!( - indices[0].name, "id_btree", - "Index name should 
match" - ); + assert_eq!(indices[0].name, "id_btree", "Index name should match"); assert_eq!( indices[0].fields, vec![0], @@ -963,11 +952,7 @@ mod tests { ); let scalar_index = dataset - .open_scalar_index( - "id", - &indices[0].uuid.to_string(), - &NoOpMetricsCollector, - ) + .open_scalar_index("id", &indices[0].uuid.to_string(), &NoOpMetricsCollector) .await .unwrap();