diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index 09143a6adf0..5bb30513354 100644 --- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -3469,6 +3469,7 @@ dependencies = [ "arrow-buffer", "arrow-cast", "arrow-data", + "arrow-ipc", "arrow-ord", "arrow-schema", "arrow-select", diff --git a/java/lance-jni/src/utils.rs b/java/lance-jni/src/utils.rs index 3d7ef2ebdaf..79f9ba25ffe 100644 --- a/java/lance-jni/src/utils.rs +++ b/java/lance-jni/src/utils.rs @@ -426,6 +426,7 @@ pub fn get_vector_index_params( stages, version: IndexFileVersion::V3, skip_transpose: false, + runtime_hints: Default::default(), }) }, )?; diff --git a/protos/index.proto b/protos/index.proto index 1fb51f3291c..5c095c73a91 100644 --- a/protos/index.proto +++ b/protos/index.proto @@ -184,6 +184,67 @@ message VectorIndex { VectorMetricType metric_type = 4; } +// Details for vector indexes, stored in the manifest's index_details field. +message VectorIndexDetails { + VectorMetricType metric_type = 1; + + // The target number of vectors per partition. + // 0 means unset. + uint64 target_partition_size = 2; + + // Optional HNSW index configuration. If set, the index has an HNSW layer. + optional HnswParameters hnsw_index_config = 3; + + message ProductQuantization { + uint32 num_bits = 1; + uint32 num_sub_vectors = 2; + } + message ScalarQuantization { + uint32 num_bits = 1; + } + message RabitQuantization { + enum RotationType { + FAST = 0; + MATRIX = 1; + } + uint32 num_bits = 1; + RotationType rotation_type = 2; + } + + // No quantization; vectors are stored as-is. + message FlatCompression {} + + oneof compression { + ProductQuantization pq = 4; + ScalarQuantization sq = 5; + RabitQuantization rq = 6; + FlatCompression flat = 8; + } + + // The version of the index file format. Useful for maintaining backwards + // compatibility when introducing breaking changes to the index format. + // 0 means unset (legacy index). 
+ uint32 index_version = 7; + + // Runtime hints: optional build preferences that don't affect index structure. + // Keys use reverse-DNS namespacing (e.g., "lance.ivf.max_iters", "lancedb.accelerator"). + // Unrecognized keys must be silently ignored by all runtimes. + map<string, string> runtime_hints = 9; +} + +// Hierarchical Navigable Small World (HNSW) parameters, used as an optional configuration for IVF indexes. +message HnswParameters { + // The maximum number of outgoing edges per node in the HNSW graph. Higher values + // mean more connections, better recall, but more memory and slower builds. + // Referred to as "M" in the HNSW literature. + uint32 max_connections = 1; + // "construction exploration factor": The size of the dynamic list used during + // index construction. + uint32 construction_ef = 2; + // The maximum number of levels in the HNSW graph. + uint32 max_level = 3; +} + message JsonIndexDetails { string path = 1; google.protobuf.Any target_details = 2; diff --git a/protos/table.proto b/protos/table.proto index e73d22b6b93..2b0f768857e 100644 --- a/protos/table.proto +++ b/protos/table.proto @@ -474,8 +474,7 @@ message ExternalFile { uint64 size = 3; } -// Empty details messages for older indexes that don't take advantage of the details field. 
-message VectorIndexDetails {} +// VectorIndexDetails and HnswParameters (formerly HnswIndexDetails) moved to index.proto message FragmentReuseIndexDetails { diff --git a/python/python/tests/compat/test_vector_indices.py b/python/python/tests/compat/test_vector_indices.py index 194435c095a..b98ffdf63e3 100644 --- a/python/python/tests/compat/test_vector_indices.py +++ b/python/python/tests/compat/test_vector_indices.py @@ -71,6 +71,17 @@ def check_read(self): ) assert result.num_rows == 4 + if hasattr(ds, "describe_indices"): + indices = ds.describe_indices() + assert len(indices) >= 1 + name = indices[0].name + elif self.compat_version >= "0.39.0": + indices = ds.list_indices() + assert len(indices) >= 1 + name = indices[0]["name"] + stats = ds.stats.index_stats(name) + assert stats["num_indexed_rows"] > 0 + def check_write(self): """Verify can insert vectors and rebuild index.""" ds = lance.dataset(self.path) @@ -140,6 +151,18 @@ def check_read(self): ) assert result.num_rows == 4 + if hasattr(ds, "describe_indices"): + indices = ds.describe_indices() + assert len(indices) >= 1 + name = indices[0].name + else: + indices = ds.list_indices() + assert len(indices) >= 1 + name = indices[0]["name"] + + stats = ds.stats.index_stats(name) + assert stats["num_indexed_rows"] > 0 + def check_write(self): """Verify can insert vectors and rebuild index.""" ds = lance.dataset(self.path) @@ -209,6 +232,18 @@ def check_read(self): ) assert result.num_rows == 4 + if hasattr(ds, "describe_indices"): + indices = ds.describe_indices() + assert len(indices) >= 1 + name = indices[0].name + else: + indices = ds.list_indices() + assert len(indices) >= 1 + name = indices[0]["name"] + + stats = ds.stats.index_stats(name) + assert stats["num_indexed_rows"] > 0 + def check_write(self): """Verify can insert vectors and rebuild index.""" ds = lance.dataset(self.path) @@ -226,9 +261,9 @@ def check_write(self): ds.optimize.compact_files() -@compat_test(min_version="0.39.0") 
+@compat_test(min_version="4.0.0-beta.8") class IvfRqVectorIndex(UpgradeDowngradeTest): - """Test IVF_RQ vector index compatibility.""" + """Test IVF_RQ vector index compatibility. V2 was introduced in v4.0.0-beta.8""" def __init__(self, path: Path): self.path = path @@ -273,6 +308,18 @@ def check_read(self): ) assert result.num_rows == 4 + if hasattr(ds, "describe_indices"): + indices = ds.describe_indices() + assert len(indices) >= 1 + name = indices[0].name + else: + indices = ds.list_indices() + assert len(indices) >= 1 + name = indices[0]["name"] + + stats = ds.stats.index_stats(name) + assert stats["num_indexed_rows"] > 0 + def check_write(self): """Verify can insert vectors and run optimize workflows.""" ds = lance.dataset(self.path) diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index ebaa8534abc..cb29cd10a33 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -1677,7 +1677,7 @@ def test_describe_vector_index(indexed_dataset: LanceDataset): info = indexed_dataset.describe_indices()[0] assert info.name == "vector_idx" - assert info.type_url == "/lance.table.VectorIndexDetails" + assert info.type_url == "/lance.index.pb.VectorIndexDetails" assert info.index_type == "IVF_PQ" assert info.num_rows_indexed == 1000 assert info.fields == [0] @@ -1688,6 +1688,44 @@ def test_describe_vector_index(indexed_dataset: LanceDataset): assert info.segments[0].index_version == 1 assert info.segments[0].created_at is not None + details = info.details + assert details["metric_type"] == "L2" + assert details["compression"]["type"] == "pq" + assert details["compression"]["num_bits"] == 8 + assert details["compression"]["num_sub_vectors"] == 16 + + +def test_describe_index_runtime_hints_stored(tmp_path): + tbl = create_table(nvec=300, ndim=16) + dataset = lance.write_dataset(tbl, tmp_path) + dataset = dataset.create_index( + "vector", + index_type="IVF_PQ", + num_partitions=4, 
num_sub_vectors=4, + max_iters=100, + sample_rate=512, + ) + details = dataset.describe_indices()[0].details + hints = details.get("runtime_hints", {}) + assert hints.get("lance.ivf.max_iters") == "100" + assert hints.get("lance.ivf.sample_rate") == "512" + assert hints.get("lance.pq.max_iters") == "100" + assert hints.get("lance.pq.sample_rate") == "512" + + +def test_describe_index_runtime_hints_defaults_omitted(tmp_path): + tbl = create_table(nvec=300, ndim=16) + dataset = lance.write_dataset(tbl, tmp_path) + dataset = dataset.create_index( + "vector", + index_type="IVF_PQ", + num_partitions=4, + num_sub_vectors=4, + ) + details = dataset.describe_indices()[0].details + assert "runtime_hints" not in details + def test_optimize_indices(indexed_dataset): data = create_table() diff --git a/python/src/dataset.rs b/python/src/dataset.rs index 8838b89bc0a..8614e1f6ba5 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -3569,6 +3569,12 @@ fn prepare_vector_index_params( sq_params.sample_rate = sample_rate; } + if let Some(max_iters) = kwargs.get_item("max_iters")? { + let max_iters: usize = max_iters.extract()?; + ivf_params.max_iters = max_iters; + pq_params.max_iters = max_iters; + } + // Parse IVF params if let Some(n) = kwargs.get_item("num_partitions")? { ivf_params.num_partitions = Some(n.extract()?) @@ -3731,6 +3737,13 @@ fn prepare_vector_index_params( }?; params.version(index_file_version); params.skip_transpose(skip_transpose); + if let Some(kwargs) = kwargs + && let Some(acc) = kwargs.get_item("accelerator")? 
+ { + params + .runtime_hints + .insert("lancedb.accelerator".to_string(), acc.to_string()); + } Ok(params) } diff --git a/python/src/indices.rs b/python/src/indices.rs index 62f3c0c64ec..6ff95fbe151 100644 --- a/python/src/indices.rs +++ b/python/src/indices.rs @@ -441,8 +441,7 @@ async fn do_load_shuffled_vectors( dataset_version: ds.manifest.version, fragment_bitmap: Some(ds.fragments().iter().map(|f| f.id as u32).collect()), index_details: Some(Arc::new( - prost_types::Any::from_msg(&lance_table::format::pb::VectorIndexDetails::default()) - .unwrap(), + prost_types::Any::from_msg(&lance_index::pb::VectorIndexDetails::default()).unwrap(), )), index_version: IndexType::IvfPq.version(), created_at: Some(Utc::now()), diff --git a/rust/lance-index/src/vector/bq.rs b/rust/lance-index/src/vector/bq.rs index d56bfdcafc6..a0a16b22169 100644 --- a/rust/lance-index/src/vector/bq.rs +++ b/rust/lance-index/src/vector/bq.rs @@ -7,6 +7,7 @@ use std::iter::once; use std::str::FromStr; use std::sync::Arc; +use crate::pb::vector_index_details::RabitQuantization; use arrow_array::types::Float32Type; use arrow_array::{Array, ArrayRef, UInt8Array, cast::AsArray}; use lance_core::{Error, Result}; @@ -121,6 +122,19 @@ impl RQBuildParams { } } +impl From<&RQBuildParams> for RabitQuantization { + fn from(value: &RQBuildParams) -> Self { + use crate::pb::vector_index_details::rabit_quantization::RotationType; + Self { + num_bits: value.num_bits as u32, + rotation_type: match value.rotation_type { + RQRotationType::Fast => RotationType::Fast as i32, + RQRotationType::Matrix => RotationType::Matrix as i32, + }, + } + } +} + impl QuantizerBuildParams for RQBuildParams { fn sample_size(&self) -> usize { 0 diff --git a/rust/lance-index/src/vector/hnsw/builder.rs b/rust/lance-index/src/vector/hnsw/builder.rs index 6de41fbc162..93ed6c93d4c 100644 --- a/rust/lance-index/src/vector/hnsw/builder.rs +++ b/rust/lance-index/src/vector/hnsw/builder.rs @@ -60,6 +60,16 @@ pub struct HnswBuildParams { 
pub prefetch_distance: Option, } +impl From<&HnswBuildParams> for crate::pb::HnswParameters { + fn from(params: &HnswBuildParams) -> Self { + Self { + max_connections: params.m as u32, + construction_ef: params.ef_construction as u32, + max_level: params.max_level as u32, + } + } +} + impl Default for HnswBuildParams { fn default() -> Self { Self { diff --git a/rust/lance-index/src/vector/pq/builder.rs b/rust/lance-index/src/vector/pq/builder.rs index 1768e9fe8f0..c4dad4a6a3e 100644 --- a/rust/lance-index/src/vector/pq/builder.rs +++ b/rust/lance-index/src/vector/pq/builder.rs @@ -44,6 +44,15 @@ pub struct PQBuildParams { pub sample_rate: usize, } +impl From<&PQBuildParams> for crate::pb::vector_index_details::ProductQuantization { + fn from(params: &PQBuildParams) -> Self { + Self { + num_bits: params.num_bits as u32, + num_sub_vectors: params.num_sub_vectors as u32, + } + } +} + impl Default for PQBuildParams { fn default() -> Self { Self { diff --git a/rust/lance-index/src/vector/sq/builder.rs b/rust/lance-index/src/vector/sq/builder.rs index 913751062cf..359765040dd 100644 --- a/rust/lance-index/src/vector/sq/builder.rs +++ b/rust/lance-index/src/vector/sq/builder.rs @@ -12,6 +12,14 @@ pub struct SQBuildParams { pub sample_rate: usize, } +impl From<&SQBuildParams> for crate::pb::vector_index_details::ScalarQuantization { + fn from(params: &SQBuildParams) -> Self { + Self { + num_bits: params.num_bits as u32, + } + } +} + impl Default for SQBuildParams { fn default() -> Self { Self { diff --git a/rust/lance/src/dataset/index.rs b/rust/lance/src/dataset/index.rs index 856f7361892..4796273bc16 100644 --- a/rust/lance/src/dataset/index.rs +++ b/rust/lance/src/dataset/index.rs @@ -17,9 +17,9 @@ use async_trait::async_trait; use lance_core::{Error, Result}; use lance_encoding::version::LanceFileVersion; use lance_index::frag_reuse::FRAG_REUSE_INDEX_NAME; +use lance_index::pb::VectorIndexDetails; use lance_index::scalar::lance_format::LanceIndexStore; use 
lance_table::format::IndexMetadata; -use lance_table::format::pb::VectorIndexDetails; use serde::{Deserialize, Serialize}; use super::optimize::{IndexRemapper, IndexRemapperOptions}; diff --git a/rust/lance/src/dataset/optimize.rs b/rust/lance/src/dataset/optimize.rs index c70eb93bbcd..de73228a843 100644 --- a/rust/lance/src/dataset/optimize.rs +++ b/rust/lance/src/dataset/optimize.rs @@ -4091,6 +4091,7 @@ mod tests { ], version: crate::index::vector::IndexFileVersion::V3, skip_transpose: false, + runtime_hints: Default::default(), }, false, ) diff --git a/rust/lance/src/dataset/scanner.rs b/rust/lance/src/dataset/scanner.rs index b7f5094ac35..3fba2fc305d 100644 --- a/rust/lance/src/dataset/scanner.rs +++ b/rust/lance/src/dataset/scanner.rs @@ -3529,18 +3529,23 @@ impl Scanner { } } } else if let Some(index) = indices.iter().find(|i| i.fields.contains(&column_id)) { - // TODO: Once we do https://github.com/lance-format/lance/issues/5231, we - // should be able to get the metric type directly from the index metadata, - // at least for newer indexes. 
- let idx = self - .dataset - .open_vector_index( - q.column.as_str(), - &index.uuid.to_string(), - &NoOpMetricsCollector, - ) - .await?; - let index_metric = idx.metric_type(); + // Try to get metric type from index metadata first (fast path for newer indices) + let index_metric = if let Some(metric) = + crate::index::vector::details::metric_type_from_index_metadata(index) + { + metric + } else { + // Fall back to opening the index for legacy indices without details + let idx = self + .dataset + .open_vector_index( + q.column.as_str(), + &index.uuid.to_string(), + &NoOpMetricsCollector, + ) + .await?; + idx.metric_type() + }; let use_this_index = match q.metric_type { Some(user_metric) => { @@ -9504,6 +9509,7 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") ], version: crate::index::vector::IndexFileVersion::Legacy, skip_transpose: false, + runtime_hints: Default::default(), }, false, ) diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index 400a1be9244..b246476f52e 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -63,6 +63,12 @@ use scalar::index_matches_criteria; use serde_json::json; use tracing::{info, instrument}; use uuid::Uuid; +use vector::details::{ + derive_vector_index_type, infer_missing_vector_details, vector_details_as_json, +}; +pub(crate) use vector::details::{ + vector_index_details, vector_index_details_default, vector_params_from_details, +}; use vector::ivf::v2::{IVFIndex, IvfStateEntryBox}; use vector::utils::get_vector_type; @@ -464,7 +470,7 @@ pub(crate) async fn remap_index( CreatedIndex { index_details: prost_types::Any::from_msg( - &lance_table::format::pb::VectorIndexDetails::default(), + &lance_index::pb::VectorIndexDetails::default(), ) .unwrap(), index_version, @@ -515,11 +521,6 @@ async fn open_index_proto(reader: &dyn Reader) -> Result { Ok(proto) } -fn vector_index_details() -> prost_types::Any { - let details = 
lance_table::format::pb::VectorIndexDetails::default(); - prost_types::Any::from_msg(&details).unwrap() -} - struct IndexDescriptionImpl { name: String, field_ids: Vec, @@ -572,35 +573,11 @@ impl IndexDescriptionImpl { let details = IndexDetails(index_details.clone()); let mut rows_indexed = 0; - // Vector indices need to be opened to get the correct type let index_type = if details.is_vector() { - let column = field_ids - .first() - .and_then(|id| dataset.schema().field_by_id(*id)) - .map(|f| f.name.clone()) - .ok_or_else(|| { - Error::index("Cannot determine column name for vector index".to_string()) - })?; - - match dataset - .open_generic_index( - &column, - &example_metadata.uuid.to_string(), - &NoOpMetricsCollector, - ) - .await - { - Ok(idx) => idx.index_type().to_string(), - Err(e) => { - log::warn!( - "Failed to open vector index {} to determine type: {}", - name, - e - ); - "Unknown".to_string() - } - } + derive_vector_index_type(index_details) } else { + // We attempted to infer the index type when we loaded the indices, + // so if we hit this branch the index type is truly unknown. 
details .get_plugin() .map(|p| p.name().to_string()) @@ -657,10 +634,14 @@ impl IndexDescription for IndexDescriptionImpl { } fn details(&self) -> Result { - let plugin = self.details.get_plugin()?; - plugin - .details_as_json(&self.details.0) - .map(|v| v.to_string()) + if self.details.is_vector() { + vector_details_as_json(&self.details.0) + } else { + let plugin = self.details.get_plugin()?; + plugin + .details_as_json(&self.details.0) + .map(|v| v.to_string()) + } } fn total_size_bytes(&self) -> Option { @@ -830,7 +811,7 @@ impl DatasetIndexExt for Dataset { let metadata_key = IndexMetadataKey { version: self.version().version, }; - let indices = match self.index_cache.get_with_key(&metadata_key).await { + let mut indices = match self.index_cache.get_with_key(&metadata_key).await { Some(indices) => indices, None => { let mut loaded_indices = read_manifest_indexes( @@ -848,6 +829,20 @@ impl DatasetIndexExt for Dataset { } }; + // Infer details for legacy vector indices (once per index name, concurrently). + // This may run on indices that were opportunistically cached during Dataset::open + // before the full Dataset was available for inference. 
+ { + let mut updated = indices.as_ref().clone(); + infer_missing_vector_details(self, &mut updated).await; + if updated != *indices { + indices = Arc::new(updated); + self.index_cache + .insert_with_key(&metadata_key, indices.clone()) + .await; + } + } + if let Some(frag_reuse_index_meta) = indices.iter().find(|idx| idx.name == FRAG_REUSE_INDEX_NAME) { @@ -2257,7 +2252,7 @@ mod tests { fields: vec![field_id], dataset_version: dataset.manifest.version, fragment_bitmap: Some(fragment_bitmap.into_iter().collect()), - index_details: Some(Arc::new(vector_index_details())), + index_details: Some(Arc::new(vector_index_details_default())), index_version: IndexType::Vector.version(), created_at: Some(chrono::Utc::now()), base_id: None, @@ -3848,6 +3843,114 @@ mod tests { "updated_at_timestamp_ms should be null when no indices have created_at timestamps" ); } + + #[tokio::test] + async fn test_legacy_vector_index_details_inferred_on_load_and_migration() { + use lance_linalg::distance::DistanceType; + + // Create a fresh dataset with IVF_HNSW_SQ so inference produces non-default + // details (HNSW config + SQ compression) that survive proto serialization. + let test_dir = lance_core::utils::tempfile::TempDir::default(); + let test_uri = test_dir.path_str(); + let data = gen_batch() + .col("i", array::step::()) + .col("vec", array::rand_vec::(16.into())) + .into_reader_rows(RowCount::from(1024), BatchCount::from(1)); + let mut dataset = Dataset::write(data, &test_uri, None).await.unwrap(); + + let params = VectorIndexParams::with_ivf_hnsw_sq_params( + DistanceType::Cosine, + IvfBuildParams { + num_partitions: Some(2), + ..Default::default() + }, + HnswBuildParams::default(), + SQBuildParams::default(), + ); + dataset + .create_index(&["vec"], IndexType::Vector, None, &params, true) + .await + .unwrap(); + + // Verify the index has populated details. 
+ let descriptions = dataset.describe_indices(None).await.unwrap(); + assert_eq!(descriptions.len(), 1); + assert_eq!(descriptions[0].index_type(), "IVF_HNSW_SQ"); + + // Simulate a legacy dataset by clearing details from the manifest. + // Write a new manifest with empty VectorIndexDetails value bytes. + let mut indices = dataset.load_indices().await.unwrap().as_ref().clone(); + for idx in &mut indices { + if let Some(details) = idx.index_details.as_ref() + && details.type_url.ends_with("VectorIndexDetails") + { + idx.index_details = Some(Arc::new(vector_index_details_default())); + } + } + // Write back via a no-op commit that carries the cleared indices. + // We commit by doing a delete("false") after replacing the cached indices. + let metadata_key = crate::session::index_caches::IndexMetadataKey { + version: dataset.version().version, + }; + dataset + .index_cache + .insert_with_key(&metadata_key, Arc::new(indices)) + .await; + dataset.delete("false").await.unwrap(); + + // -- Part 1: Inference on load -- + // Open with a fresh session so nothing is cached. + let dataset = DatasetBuilder::from_uri(&test_uri) + .with_session(Arc::new(Session::default())) + .load() + .await + .unwrap(); + + // load_indices should detect empty details and infer from index files. + let indices = dataset.load_indices().await.unwrap(); + assert_eq!(indices.len(), 1); + let details = indices[0].index_details.as_ref().unwrap(); + assert!( + !details.value.is_empty(), + "Details should have been inferred from index files" + ); + + // describe_indices should return a real type (not generic "Vector"). 
+ let descriptions = dataset.describe_indices(None).await.unwrap(); + assert_eq!(descriptions.len(), 1); + assert_ne!( + descriptions[0].index_type(), + "Vector", + "Should have inferred a specific index type" + ); + let inferred_type = descriptions[0].index_type().to_string(); + let details_json: serde_json::Value = + serde_json::from_str(&descriptions[0].details().unwrap()).unwrap(); + assert_eq!(details_json["metric_type"], "COSINE"); + + // -- Part 2: Migration persists inferred details -- + let mut dataset = dataset; + dataset.delete("false").await.unwrap(); + + // Open with yet another fresh session. + let dataset = DatasetBuilder::from_uri(&test_uri) + .with_session(Arc::new(Session::default())) + .load() + .await + .unwrap(); + + // The migrated manifest should have non-empty details without + // needing to read index files again. + let indices = dataset.load_indices().await.unwrap(); + assert_eq!(indices.len(), 1); + assert!( + !indices[0].index_details.as_ref().unwrap().value.is_empty(), + "Migrated manifest should have non-empty details" + ); + let descriptions = dataset.describe_indices(None).await.unwrap(); + assert_eq!(descriptions[0].index_type(), inferred_type); + } + #[rstest] #[case::btree("i", IndexType::BTree, Box::new(ScalarIndexParams::default()))] #[case::bitmap("i", IndexType::Bitmap, Box::new(ScalarIndexParams::default()))] diff --git a/rust/lance/src/index/append.rs b/rust/lance/src/index/append.rs index b93e325d472..518e16a0e81 100644 --- a/rust/lance/src/index/append.rs +++ b/rust/lance/src/index/append.rs @@ -26,7 +26,7 @@ use crate::dataset::Dataset; use crate::dataset::index::LanceIndexStoreExt; use crate::dataset::rowids::load_row_id_sequences; use crate::index::scalar::load_training_data; -use crate::index::vector_index_details; +use crate::index::vector_index_details_default; #[derive(Debug, Clone)] pub struct IndexMergeResults<'a> { @@ -249,7 +249,7 @@ pub async fn merge_indices_with_unindexed_frags<'a>( vec![removed_segment], 
new_fragment_bitmap, CreatedIndex { - index_details: vector_index_details(), + index_details: vector_index_details_default(), index_version: lance_index::IndexType::Vector.version() as u32, files: Some(files), }, @@ -294,6 +294,16 @@ pub async fn merge_indices_with_unindexed_frags<'a>( } } + // Carry forward existing index details, preferring the last segment + // (old_indices is scanned in reverse) that has populated (non-empty) details. + let index_details = old_indices + .iter() + .rev() + .filter_map(|idx| idx.index_details.as_ref()) + .find(|d| !d.value.is_empty()) + .map(|d| d.as_ref().clone()) + .unwrap_or_else(vector_index_details_default); + let index_dir = dataset.indices_dir().child(new_uuid.to_string()); let files = list_index_files_with_sizes(&dataset.object_store, &index_dir).await?; @@ -302,7 +312,10 @@ pub async fn merge_indices_with_unindexed_frags<'a>( removed_indices, frag_bitmap, CreatedIndex { - index_details: vector_index_details(), + index_details, + // retain_supported_indices guarantees all old_indices have + // index_version <= our max supported version, so we can safely + // write the current library's version for this index type. 
index_version: lance_index::IndexType::Vector.version() as u32, files: Some(files), }, diff --git a/rust/lance/src/index/create.rs b/rust/lance/src/index/create.rs index 8aed939787a..9916a995306 100644 --- a/rust/lance/src/index/create.rs +++ b/rust/lance/src/index/create.rs @@ -14,7 +14,7 @@ use crate::{ LANCE_VECTOR_INDEX, VectorIndexParams, build_distributed_vector_index, build_empty_vector_index, build_vector_index, }, - vector_index_details, + vector_index_details, vector_index_details_default, }, }; use futures::future::BoxFuture; @@ -380,7 +380,7 @@ impl<'a> CreateIndexBuilder<'a> { let files = list_index_files_with_sizes(&self.dataset.object_store, &index_dir).await?; CreatedIndex { - index_details: vector_index_details(), + index_details: vector_index_details(vec_params), index_version, files: Some(files), } @@ -419,7 +419,7 @@ impl<'a> CreateIndexBuilder<'a> { let files = list_index_files_with_sizes(&self.dataset.object_store, &index_dir).await?; CreatedIndex { - index_details: vector_index_details(), + index_details: vector_index_details_default(), index_version: self.index_type.version() as u32, files: Some(files), } diff --git a/rust/lance/src/index/scalar.rs b/rust/lance/src/index/scalar.rs index 44739454bec..8a44968c040 100644 --- a/rust/lance/src/index/scalar.rs +++ b/rust/lance/src/index/scalar.rs @@ -603,9 +603,9 @@ mod tests { use lance_core::utils::tempfile::TempStrDir; use lance_core::{datatypes::Field, utils::address::RowAddress}; use lance_datagen::array; + use lance_index::pb::VectorIndexDetails; use lance_index::{IndexType, optimize::OptimizeOptions}; use lance_index::{pbold::NGramIndexDetails, scalar::BuiltinIndexType}; - use lance_table::format::pb::VectorIndexDetails; fn make_index_metadata( name: &str, diff --git a/rust/lance/src/index/vector.rs b/rust/lance/src/index/vector.rs index e137237b9a0..437e4333343 100644 --- a/rust/lance/src/index/vector.rs +++ b/rust/lance/src/index/vector.rs @@ -8,6 +8,7 @@ use std::sync::Arc; use 
std::{any::Any, collections::HashMap}; pub mod builder; +pub(crate) mod details; pub mod ivf; pub mod pq; pub mod utils; @@ -59,7 +60,7 @@ use tracing::instrument; use utils::get_vector_type; use uuid::Uuid; -use super::{DatasetIndexExt, DatasetIndexInternalExt, IndexParams, pb, vector_index_details}; +use super::{DatasetIndexExt, DatasetIndexInternalExt, IndexParams, pb}; use crate::dataset::index::dataset_format_version; use crate::dataset::transaction::{Operation, Transaction}; use crate::{Error, Result, dataset::Dataset, index::pb::vector_index_stage::Stage}; @@ -265,6 +266,11 @@ pub struct VectorIndexParams { /// Skip transpose / packing for PQ and RQ storage. pub skip_transpose: bool, + + /// Runtime hints: optional build preferences stored in the index manifest. + /// Keys use reverse-DNS namespacing (e.g., "lance.ivf.max_iters"). + /// Populated by the build path and merged into VectorIndexDetails at creation time. + pub runtime_hints: HashMap, } impl VectorIndexParams { @@ -286,6 +292,7 @@ impl VectorIndexParams { metric_type, version: IndexFileVersion::V3, skip_transpose: false, + runtime_hints: HashMap::new(), } } @@ -296,6 +303,7 @@ impl VectorIndexParams { metric_type, version: IndexFileVersion::V3, skip_transpose: false, + runtime_hints: HashMap::new(), } } @@ -330,6 +338,7 @@ impl VectorIndexParams { metric_type, version: IndexFileVersion::V3, skip_transpose: false, + runtime_hints: HashMap::new(), } } @@ -356,6 +365,7 @@ impl VectorIndexParams { metric_type: distance_type, version: IndexFileVersion::V3, skip_transpose: false, + runtime_hints: HashMap::new(), } } @@ -371,6 +381,7 @@ impl VectorIndexParams { metric_type, version: IndexFileVersion::V3, skip_transpose: false, + runtime_hints: HashMap::new(), } } @@ -385,6 +396,7 @@ impl VectorIndexParams { metric_type, version: IndexFileVersion::V3, skip_transpose: false, + runtime_hints: HashMap::new(), } } @@ -399,6 +411,7 @@ impl VectorIndexParams { metric_type, version: IndexFileVersion::V3, 
skip_transpose: false, + runtime_hints: HashMap::new(), } } @@ -413,6 +426,7 @@ impl VectorIndexParams { metric_type: distance_type, version: IndexFileVersion::V3, skip_transpose: false, + runtime_hints: HashMap::new(), } } @@ -434,6 +448,7 @@ impl VectorIndexParams { metric_type, version: IndexFileVersion::V3, skip_transpose: false, + runtime_hints: HashMap::new(), } } @@ -455,6 +470,7 @@ impl VectorIndexParams { metric_type, version: IndexFileVersion::V3, skip_transpose: false, + runtime_hints: HashMap::new(), } } @@ -1790,7 +1806,7 @@ pub async fn initialize_vector_index( fields: vec![field.id], dataset_version: target_dataset.manifest.version, fragment_bitmap, - index_details: Some(Arc::new(vector_index_details())), + index_details: source_index.index_details.clone(), index_version: source_index.index_version, created_at: Some(chrono::Utc::now()), base_id: None, diff --git a/rust/lance/src/index/vector/details.rs b/rust/lance/src/index/vector/details.rs new file mode 100644 index 00000000000..dba82997d93 --- /dev/null +++ b/rust/lance/src/index/vector/details.rs @@ -0,0 +1,1154 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Serialization and deserialization of [`VectorIndexDetails`] proto messages. +//! +//! This module handles: +//! - Populating `VectorIndexDetails` from build params at index creation time +//! - Deriving a human-readable index type string (e.g., "IVF_PQ") from details +//! - Serializing details as JSON for `describe_indices()` +//! 
- Inferring details from index files on disk (fallback for legacy indices) + +use std::collections::HashMap; +use std::str::FromStr; +use std::sync::Arc; + +use lance_file::reader::FileReaderOptions; +use lance_index::pb::VectorIndexDetails; +use lance_index::pb::VectorMetricType; +use lance_index::pb::index::Implementation; +use lance_index::pb::vector_index_details::{Compression, FlatCompression, rabit_quantization}; +use lance_index::{INDEX_FILE_NAME, INDEX_METADATA_SCHEMA_KEY, pb}; +use lance_io::scheduler::{ScanScheduler, SchedulerConfig}; +use lance_io::traits::Reader; +use lance_io::utils::{CachedFileSize, read_last_block, read_version}; +use lance_linalg::distance::DistanceType; +use lance_table::format::IndexMetadata; +use serde::Serialize; + +use lance_index::vector::bq::{RQBuildParams, RQRotationType}; +use lance_index::vector::hnsw::builder::HnswBuildParams; +use lance_index::vector::ivf::IvfBuildParams; +use lance_index::vector::pq::PQBuildParams; +use lance_index::vector::sq::builder::SQBuildParams; + +use super::{StageParams, VectorIndexParams}; +use crate::dataset::Dataset; +use crate::index::open_index_proto; +use crate::{Error, Result}; + +// Private structs for JSON serialization of VectorIndexDetails. +// Changes to field names or structure are backwards-incompatible for users +// parsing the JSON output of describe_indices(). See snapshot tests below. 
+ +#[derive(Serialize)] +struct VectorDetailsJson { + metric_type: &'static str, + #[serde(skip_serializing_if = "Option::is_none")] + target_partition_size: Option, + #[serde(skip_serializing_if = "Option::is_none")] + hnsw: Option, + #[serde(skip_serializing_if = "Option::is_none")] + compression: Option, + #[serde(skip_serializing_if = "HashMap::is_empty")] + runtime_hints: HashMap, +} + +#[derive(Serialize)] +struct HnswDetailsJson { + max_connections: u32, + construction_ef: u32, + #[serde(skip_serializing_if = "is_zero")] + max_level: u32, +} + +fn is_zero(v: &u32) -> bool { + *v == 0 +} + +#[derive(Serialize)] +#[serde(tag = "type", rename_all = "lowercase")] +enum CompressionDetailsJson { + Pq { + num_bits: u32, + num_sub_vectors: u32, + }, + Sq { + num_bits: u32, + }, + Rq { + num_bits: u32, + rotation_type: &'static str, + }, +} + +/// Build a `VectorIndexDetails` proto from build params at index creation time. +pub fn vector_index_details(params: &VectorIndexParams) -> prost_types::Any { + let metric_type = match params.metric_type { + lance_linalg::distance::DistanceType::L2 => VectorMetricType::L2, + lance_linalg::distance::DistanceType::Cosine => VectorMetricType::Cosine, + lance_linalg::distance::DistanceType::Dot => VectorMetricType::Dot, + lance_linalg::distance::DistanceType::Hamming => VectorMetricType::Hamming, + }; + + let mut target_partition_size = 0u64; + let mut hnsw_index_config = None; + let mut compression = None; + let mut runtime_hints: HashMap = params.runtime_hints.clone(); + + // Only write hints that differ from their defaults, keeping the map minimal. + // Absence of a key means "use your default". 
+ for stage in ¶ms.stages { + match stage { + StageParams::Ivf(ivf) => { + if let Some(tps) = ivf.target_partition_size { + target_partition_size = tps as u64; + } + if ivf.max_iters != 50 { + runtime_hints + .insert("lance.ivf.max_iters".to_string(), ivf.max_iters.to_string()); + } + if ivf.sample_rate != 256 { + runtime_hints.insert( + "lance.ivf.sample_rate".to_string(), + ivf.sample_rate.to_string(), + ); + } + if ivf.shuffle_partition_batches != 1024 * 10 { + runtime_hints.insert( + "lance.ivf.shuffle_partition_batches".to_string(), + ivf.shuffle_partition_batches.to_string(), + ); + } + if ivf.shuffle_partition_concurrency != 2 { + runtime_hints.insert( + "lance.ivf.shuffle_partition_concurrency".to_string(), + ivf.shuffle_partition_concurrency.to_string(), + ); + } + } + StageParams::Hnsw(hnsw) => { + hnsw_index_config = Some(hnsw.into()); + let default_prefetch: Option = Some(2); + if hnsw.prefetch_distance != default_prefetch { + let val = match hnsw.prefetch_distance { + Some(v) => v.to_string(), + None => "none".to_string(), + }; + runtime_hints.insert("lance.hnsw.prefetch_distance".to_string(), val); + } + } + StageParams::PQ(pq) => { + compression = Some(Compression::Pq(pq.into())); + if pq.max_iters != 50 { + runtime_hints + .insert("lance.pq.max_iters".to_string(), pq.max_iters.to_string()); + } + if pq.sample_rate != 256 { + runtime_hints.insert( + "lance.pq.sample_rate".to_string(), + pq.sample_rate.to_string(), + ); + } + if pq.kmeans_redos != 1 { + runtime_hints.insert( + "lance.pq.kmeans_redos".to_string(), + pq.kmeans_redos.to_string(), + ); + } + } + StageParams::SQ(sq) => { + compression = Some(Compression::Sq(sq.into())); + if sq.sample_rate != 256 { + runtime_hints.insert( + "lance.sq.sample_rate".to_string(), + sq.sample_rate.to_string(), + ); + } + } + StageParams::RQ(rq) => { + compression = Some(Compression::Rq(rq.into())); + } + } + } + + if params.skip_transpose { + runtime_hints.insert("lance.skip_transpose".to_string(), 
"true".to_string()); + } + + let compression = compression.or(Some(Compression::Flat(FlatCompression {}))); + let index_version = params.index_type().version() as u32; + + let details = VectorIndexDetails { + metric_type: metric_type.into(), + target_partition_size, + hnsw_index_config, + compression, + index_version, + runtime_hints, + }; + prost_types::Any::from_msg(&details).unwrap() +} + +pub fn vector_index_details_default() -> prost_types::Any { + let details = lance_index::pb::VectorIndexDetails::default(); + prost_types::Any::from_msg(&details).unwrap() +} + +/// Apply stored runtime hints from `VectorIndexDetails` back into build params. +/// +/// Known `lance.*` keys are parsed and applied to the appropriate stage. Unknown +/// keys (e.g., from other runtimes) are silently ignored. Malformed values are +/// also silently ignored — the stage keeps its existing default. +pub fn apply_runtime_hints(hints: &HashMap, params: &mut VectorIndexParams) { + fn parse(hints: &HashMap, key: &str) -> Option { + hints.get(key)?.parse().ok() + } + + for stage in &mut params.stages { + match stage { + StageParams::Ivf(ivf) => { + if let Some(v) = parse(hints, "lance.ivf.max_iters") { + ivf.max_iters = v; + } + if let Some(v) = parse(hints, "lance.ivf.sample_rate") { + ivf.sample_rate = v; + } + if let Some(v) = parse(hints, "lance.ivf.shuffle_partition_batches") { + ivf.shuffle_partition_batches = v; + } + if let Some(v) = parse(hints, "lance.ivf.shuffle_partition_concurrency") { + ivf.shuffle_partition_concurrency = v; + } + } + StageParams::Hnsw(hnsw) => { + if let Some(raw) = hints.get("lance.hnsw.prefetch_distance") { + hnsw.prefetch_distance = if raw == "none" { + None + } else { + raw.parse().ok() + }; + } + } + StageParams::PQ(pq) => { + if let Some(v) = parse(hints, "lance.pq.max_iters") { + pq.max_iters = v; + } + if let Some(v) = parse(hints, "lance.pq.sample_rate") { + pq.sample_rate = v; + } + if let Some(v) = parse(hints, "lance.pq.kmeans_redos") { + 
pq.kmeans_redos = v; + } + } + StageParams::SQ(sq) => { + if let Some(v) = parse(hints, "lance.sq.sample_rate") { + sq.sample_rate = v; + } + } + StageParams::RQ(_) => {} + } + } + + if hints + .get("lance.skip_transpose") + .map(|v| v == "true") + .unwrap_or(false) + { + params.skip_transpose = true; + } +} + +/// Reconstruct `VectorIndexParams` from a stored `VectorIndexDetails` proto. +/// +/// Returns `None` for legacy indices (empty details) or if the proto is malformed. +/// Runtime hints are applied on top of the reconstructed spec. +pub fn vector_params_from_details(details: &prost_types::Any) -> Option { + if details.value.is_empty() { + return None; + } + let d = details.to_msg::().ok()?; + + let metric = DistanceType::from(VectorMetricType::try_from(d.metric_type).ok()?); + + let mut ivf = IvfBuildParams::default(); + if d.target_partition_size > 0 { + ivf.target_partition_size = Some(d.target_partition_size as usize); + } + + let hnsw = d.hnsw_index_config.map(|h| HnswBuildParams { + m: h.max_connections as usize, + ef_construction: h.construction_ef as usize, + max_level: h.max_level as u16, + ..Default::default() + }); + + let mut params = match (hnsw, d.compression) { + (None, Some(Compression::Pq(pq))) => VectorIndexParams::with_ivf_pq_params( + metric, + ivf, + PQBuildParams { + num_bits: pq.num_bits as usize, + num_sub_vectors: pq.num_sub_vectors as usize, + ..Default::default() + }, + ), + (None, Some(Compression::Sq(sq))) => VectorIndexParams::with_ivf_sq_params( + metric, + ivf, + SQBuildParams { + num_bits: sq.num_bits as u16, + ..Default::default() + }, + ), + (None, Some(Compression::Rq(rq))) => { + let rotation_type = + match rabit_quantization::RotationType::try_from(rq.rotation_type).ok()? 
{ + rabit_quantization::RotationType::Matrix => RQRotationType::Matrix, + rabit_quantization::RotationType::Fast => RQRotationType::Fast, + }; + VectorIndexParams::with_ivf_rq_params( + metric, + ivf, + RQBuildParams::with_rotation_type(rq.num_bits as u8, rotation_type), + ) + } + (Some(hnsw), Some(Compression::Pq(pq))) => VectorIndexParams::with_ivf_hnsw_pq_params( + metric, + ivf, + hnsw, + PQBuildParams { + num_bits: pq.num_bits as usize, + num_sub_vectors: pq.num_sub_vectors as usize, + ..Default::default() + }, + ), + (Some(hnsw), Some(Compression::Sq(sq))) => VectorIndexParams::with_ivf_hnsw_sq_params( + metric, + ivf, + hnsw, + SQBuildParams { + num_bits: sq.num_bits as u16, + ..Default::default() + }, + ), + (Some(hnsw), _) => VectorIndexParams::ivf_hnsw(metric, ivf, hnsw), + _ => VectorIndexParams::with_ivf_flat_params(metric, ivf), + }; + + apply_runtime_hints(&d.runtime_hints, &mut params); + Some(params) +} + +/// Extract metric type from index metadata without opening the index file. +/// +/// For newer indices with populated `VectorIndexDetails`, returns the metric type directly. +/// For legacy indices without details, returns `None` and caller should fall back to opening the index. +/// +/// # Arguments +/// * `index` - The index metadata containing serialized VectorIndexDetails +/// +/// # Returns +/// * `Some(DistanceType)` if details are present and valid +/// * `None` if details are absent or empty (legacy index without details) +pub fn metric_type_from_index_metadata(index: &IndexMetadata) -> Option { + let index_details = index.index_details.as_ref()?; + + // Empty value bytes indicates legacy index that needs to be opened for details + if index_details.value.is_empty() { + return None; + } + + let details = index_details.to_msg::().ok()?; + + // Try to convert the metric_type field. This works even if metric_type is 0 (L2), + // since L2 is a valid metric type. 
+ let metric_enum = VectorMetricType::try_from(details.metric_type).ok()?; + Some(DistanceType::from(metric_enum)) +} + +/// Returns true if the proto value represents a "truly empty" VectorIndexDetails +/// (i.e., a legacy index that was created before we populated this field). +fn is_empty_vector_details(details: &prost_types::Any) -> bool { + details.value.is_empty() +} + +/// Returns true if this is a vector index whose details need to be inferred from disk. +/// +/// This covers two legacy cases: +/// - Very old indices (<=0.19.2) where `index_details` is `None` but the indexed +/// field is a vector type +/// - Newer pre-details indices where `index_details` has a VectorIndexDetails +/// type_url but empty value bytes +pub fn needs_vector_details_inference( + index: &IndexMetadata, + schema: &lance_core::datatypes::Schema, +) -> bool { + match &index.index_details { + Some(d) => d.type_url.ends_with("VectorIndexDetails") && d.value.is_empty(), + None => index.fields.iter().any(|&field_id| { + schema + .field_by_id(field_id) + .map(|f| matches!(f.data_type(), arrow_schema::DataType::FixedSizeList(_, _))) + .unwrap_or(false) + }), + } +} + +/// Infer missing vector index details for all indices that need it. +/// +/// Runs inference once per unique index name, concurrently across names. +/// Applies the inferred details back to all matching indices in the slice. 
+pub async fn infer_missing_vector_details(dataset: &Dataset, indices: &mut [IndexMetadata]) { + let schema = dataset.schema(); + let needs_inference: HashMap<&str, &IndexMetadata> = indices + .iter() + .filter(|idx| needs_vector_details_inference(idx, schema)) + .map(|idx| (idx.name.as_str(), idx)) + .collect(); + if needs_inference.is_empty() { + return; + } + let inferred: HashMap> = + futures::future::join_all(needs_inference.into_iter().map( + |(name, representative)| async move { + let result = infer_vector_index_details(dataset, representative).await; + (name.to_string(), result) + }, + )) + .await + .into_iter() + .filter_map(|(name, result)| match result { + Ok(details) => Some((name, Arc::new(details))), + Err(err) => { + tracing::warn!("Could not infer vector index details for {}: {}", name, err); + None + } + }) + .collect(); + for index in indices.iter_mut() { + if let Some(details) = inferred.get(&index.name) { + index.index_details = Some(details.clone()); + } + } +} + +/// Derive a human-readable index type string from VectorIndexDetails. +pub fn derive_vector_index_type(details: &prost_types::Any) -> String { + if is_empty_vector_details(details) { + return "Vector".to_string(); + } + + let Ok(d) = details.to_msg::() else { + return "Vector".to_string(); + }; + let mut index_type = "IVF_".to_string(); + if d.hnsw_index_config.is_some() { + index_type.push_str("HNSW_"); + } + match d.compression { + None | Some(Compression::Flat(_)) => index_type.push_str("FLAT"), + Some(Compression::Pq(_)) => index_type.push_str("PQ"), + Some(Compression::Sq(_)) => index_type.push_str("SQ"), + Some(Compression::Rq(_)) => index_type.push_str("RQ"), + } + index_type +} + +/// Serialize VectorIndexDetails as a JSON string. 
+pub fn vector_details_as_json(details: &prost_types::Any) -> Result { + if is_empty_vector_details(details) { + return Ok("{}".to_string()); + } + + let d = details + .to_msg::() + .map_err(|e| Error::index(format!("Failed to deserialize VectorIndexDetails: {}", e)))?; + + let metric_type = match VectorMetricType::try_from(d.metric_type) { + Ok(VectorMetricType::L2) => "L2", + Ok(VectorMetricType::Cosine) => "COSINE", + Ok(VectorMetricType::Dot) => "DOT", + Ok(VectorMetricType::Hamming) => "HAMMING", + Err(_) => "UNKNOWN", + }; + + let hnsw = d.hnsw_index_config.map(|h| HnswDetailsJson { + max_connections: h.max_connections, + construction_ef: h.construction_ef, + max_level: h.max_level, + }); + + let compression = d.compression.and_then(|c| match c { + Compression::Flat(_) => None, + Compression::Pq(pq) => Some(CompressionDetailsJson::Pq { + num_bits: pq.num_bits, + num_sub_vectors: pq.num_sub_vectors, + }), + Compression::Sq(sq) => Some(CompressionDetailsJson::Sq { + num_bits: sq.num_bits, + }), + Compression::Rq(rq) => { + let rotation_type = match rabit_quantization::RotationType::try_from(rq.rotation_type) { + Ok(rabit_quantization::RotationType::Matrix) => "matrix", + _ => "fast", + }; + Some(CompressionDetailsJson::Rq { + num_bits: rq.num_bits, + rotation_type, + }) + } + }); + + let json = VectorDetailsJson { + metric_type, + target_partition_size: if d.target_partition_size > 0 { + Some(d.target_partition_size) + } else { + None + }, + hnsw, + compression, + runtime_hints: d.runtime_hints, + }; + + serde_json::to_string(&json).map_err(|e| Error::index(format!("Failed to serialize: {}", e))) +} + +/// Infer VectorIndexDetails from index files on disk. +/// Used as a fallback for legacy indices where the manifest doesn't have populated details. 
+pub async fn infer_vector_index_details( + dataset: &Dataset, + index: &IndexMetadata, +) -> Result { + let uuid = index.uuid.to_string(); + let index_dir = dataset.indice_files_dir(index)?; + let index_file = index_dir.child(uuid.as_str()).child(INDEX_FILE_NAME); + let reader: Arc = dataset.object_store.open(&index_file).await?.into(); + + let tailing_bytes = read_last_block(reader.as_ref()).await?; + let (major_version, minor_version) = read_version(&tailing_bytes)?; + + match (major_version, minor_version) { + (0, 1) | (0, 0) => { + // Legacy v0.1: read pb::Index, extract VectorIndex stages + let proto = open_index_proto(reader.as_ref()).await?; + convert_legacy_proto_to_details(&proto) + } + _ => { + // v0.2+/v0.3: read lance file schema metadata + convert_v3_metadata_to_details(dataset, &index_file).await + } + } +} + +fn convert_legacy_proto_to_details(proto: &pb::Index) -> Result { + use lance_index::pb::VectorIndexDetails; + use lance_index::pb::vector_index_details::*; + use pb::vector_index_stage::Stage; + + let Some(Implementation::VectorIndex(vector_index)) = &proto.implementation else { + return Ok(vector_index_details_default()); + }; + + let metric_type = pb::VectorMetricType::try_from(vector_index.metric_type) + .unwrap_or(pb::VectorMetricType::L2); + + let mut compression: Option = None; + for stage in &vector_index.stages { + if let Some(Stage::Pq(pq)) = &stage.stage { + compression = Some(Compression::Pq(ProductQuantization { + num_bits: pq.num_bits, + num_sub_vectors: pq.num_sub_vectors, + })); + } + } + let compression = compression.or(Some(Compression::Flat(FlatCompression {}))); + + let details = VectorIndexDetails { + metric_type: metric_type.into(), + target_partition_size: 0, + hnsw_index_config: None, + compression, + index_version: 0, + runtime_hints: Default::default(), + }; + Ok(prost_types::Any::from_msg(&details).unwrap()) +} + +async fn convert_v3_metadata_to_details( + dataset: &Dataset, + index_file: &object_store::path::Path, +) 
-> Result { + use lance_index::pb::vector_index_details::*; + use lance_index::pb::{HnswParameters, VectorIndexDetails}; + use lance_index::vector::bq::storage::RABIT_METADATA_KEY; + use lance_index::vector::hnsw::HnswMetadata; + use lance_index::vector::ivf::storage::IVF_PARTITION_KEY; + use lance_index::vector::pq::storage::{PQ_METADATA_KEY, ProductQuantizationMetadata}; + use lance_index::vector::sq::storage::{SQ_METADATA_KEY, ScalarQuantizationMetadata}; + + let scheduler = ScanScheduler::new( + dataset.object_store.clone(), + SchedulerConfig::max_bandwidth(&dataset.object_store), + ); + let file = scheduler + .open_file(index_file, &CachedFileSize::unknown()) + .await?; + let reader = lance_file::reader::FileReader::try_open( + file, + None, + Default::default(), + &dataset.metadata_cache.file_metadata_cache(index_file), + FileReaderOptions::default(), + ) + .await?; + + let metadata = &reader.schema().metadata; + + // Get distance_type from index metadata + let metric_type = if let Some(idx_meta_str) = metadata.get(INDEX_METADATA_SCHEMA_KEY) { + let idx_meta: lance_index::IndexMetadata = serde_json::from_str(idx_meta_str)?; + match idx_meta.distance_type.to_uppercase().as_str() { + "L2" | "EUCLIDEAN" => VectorMetricType::L2, + "COSINE" => VectorMetricType::Cosine, + "DOT" => VectorMetricType::Dot, + "HAMMING" => VectorMetricType::Hamming, + _ => VectorMetricType::L2, + } + } else { + VectorMetricType::L2 + }; + + // Check for compression + let compression = if let Some(pq_str) = metadata.get(PQ_METADATA_KEY) { + let pq_meta: ProductQuantizationMetadata = serde_json::from_str(pq_str)?; + Some(Compression::Pq(ProductQuantization { + num_bits: pq_meta.nbits as u32, + num_sub_vectors: pq_meta.num_sub_vectors as u32, + })) + } else if let Some(sq_str) = metadata.get(SQ_METADATA_KEY) { + let sq_meta: ScalarQuantizationMetadata = serde_json::from_str(sq_str)?; + Some(Compression::Sq(ScalarQuantization { + num_bits: sq_meta.num_bits as u32, + })) + } else if let 
Some(rq_str) = metadata.get(RABIT_METADATA_KEY) { + let rq_meta: lance_index::vector::bq::storage::RabitQuantizationMetadata = + serde_json::from_str(rq_str)?; + let rotation_type = match rq_meta.rotation_type { + lance_index::vector::bq::RQRotationType::Fast => rabit_quantization::RotationType::Fast, + lance_index::vector::bq::RQRotationType::Matrix => { + rabit_quantization::RotationType::Matrix + } + }; + Some(Compression::Rq(RabitQuantization { + num_bits: rq_meta.num_bits as u32, + rotation_type: rotation_type.into(), + })) + } else { + Some(Compression::Flat(FlatCompression {})) + }; + + // Check for HNSW + let hnsw_index_config = if let Some(partition_str) = metadata.get(IVF_PARTITION_KEY) { + let partitions: Vec = serde_json::from_str(partition_str)?; + partitions.first().map(|hnsw| HnswParameters { + max_connections: hnsw.params.m as u32, + construction_ef: hnsw.params.ef_construction as u32, + max_level: hnsw.params.max_level as u32, + }) + } else { + None + }; + + let details = VectorIndexDetails { + metric_type: metric_type.into(), + target_partition_size: 0, + hnsw_index_config, + compression, + index_version: 0, + runtime_hints: Default::default(), + }; + Ok(prost_types::Any::from_msg(&details).unwrap()) +} + +#[cfg(test)] +mod tests { + use super::*; + use lance_index::pb::vector_index_details::*; + use lance_index::pb::{HnswParameters, VectorIndexDetails}; + + fn make_details( + metric: VectorMetricType, + hnsw: Option, + compression: Option, + ) -> prost_types::Any { + let details = VectorIndexDetails { + metric_type: metric.into(), + target_partition_size: 0, + hnsw_index_config: hnsw, + compression, + index_version: 0, + runtime_hints: Default::default(), + }; + prost_types::Any::from_msg(&details).unwrap() + } + + #[test] + fn test_derive_index_type_without_hnsw() { + // Note: (None, "IVF_FLAT") is not testable here because a proto with + // all defaults serializes to empty bytes, which is treated as a legacy index. 
+ let cases: [(Option, &str); 3] = [ + ( + Some(Compression::Pq(ProductQuantization { + num_bits: 8, + num_sub_vectors: 16, + })), + "IVF_PQ", + ), + ( + Some(Compression::Sq(ScalarQuantization { num_bits: 8 })), + "IVF_SQ", + ), + ( + Some(Compression::Rq(RabitQuantization { + num_bits: 1, + rotation_type: 0, + })), + "IVF_RQ", + ), + ]; + for (compression, expected) in cases { + let details = make_details(VectorMetricType::L2, None, compression); + assert_eq!(derive_vector_index_type(&details), expected); + } + } + + #[test] + fn test_derive_index_type_with_hnsw() { + let hnsw = Some(HnswParameters { + max_connections: 20, + construction_ef: 150, + max_level: 7, + }); + assert_eq!( + derive_vector_index_type(&make_details(VectorMetricType::L2, hnsw, None)), + "IVF_HNSW_FLAT" + ); + assert_eq!( + derive_vector_index_type(&make_details( + VectorMetricType::L2, + hnsw, + Some(Compression::Pq(ProductQuantization { + num_bits: 8, + num_sub_vectors: 16, + })) + )), + "IVF_HNSW_PQ" + ); + assert_eq!( + derive_vector_index_type(&make_details( + VectorMetricType::L2, + hnsw, + Some(Compression::Sq(ScalarQuantization { num_bits: 8 })) + )), + "IVF_HNSW_SQ" + ); + } + + #[test] + fn test_derive_index_type_empty_details() { + let details = vector_index_details_default(); + assert_eq!(derive_vector_index_type(&details), "Vector"); + } + + // Snapshot tests for JSON serialization. These guard backwards compatibility + // of the JSON format returned by describe_indices(). 
+ + #[test] + fn test_json_ivf_pq() { + let details = make_details( + VectorMetricType::L2, + None, + Some(Compression::Pq(ProductQuantization { + num_bits: 8, + num_sub_vectors: 16, + })), + ); + assert_eq!( + vector_details_as_json(&details).unwrap(), + r#"{"metric_type":"L2","compression":{"type":"pq","num_bits":8,"num_sub_vectors":16}}"# + ); + } + + #[test] + fn test_json_ivf_hnsw_sq() { + let details = make_details( + VectorMetricType::Cosine, + Some(HnswParameters { + max_connections: 30, + construction_ef: 200, + max_level: 8, + }), + Some(Compression::Sq(ScalarQuantization { num_bits: 4 })), + ); + assert_eq!( + vector_details_as_json(&details).unwrap(), + r#"{"metric_type":"COSINE","hnsw":{"max_connections":30,"construction_ef":200,"max_level":8},"compression":{"type":"sq","num_bits":4}}"# + ); + } + + #[test] + fn test_json_ivf_rq_with_rotation() { + let details = make_details( + VectorMetricType::Dot, + None, + Some(Compression::Rq(RabitQuantization { + num_bits: 1, + rotation_type: rabit_quantization::RotationType::Matrix as i32, + })), + ); + assert_eq!( + vector_details_as_json(&details).unwrap(), + r#"{"metric_type":"DOT","compression":{"type":"rq","num_bits":1,"rotation_type":"matrix"}}"# + ); + } + + #[test] + fn test_json_ivf_rq_fast_rotation() { + let details = make_details( + VectorMetricType::L2, + None, + Some(Compression::Rq(RabitQuantization { + num_bits: 1, + rotation_type: rabit_quantization::RotationType::Fast as i32, + })), + ); + assert_eq!( + vector_details_as_json(&details).unwrap(), + r#"{"metric_type":"L2","compression":{"type":"rq","num_bits":1,"rotation_type":"fast"}}"# + ); + } + + #[test] + fn test_json_with_target_partition_size() { + let details = { + let d = VectorIndexDetails { + metric_type: VectorMetricType::L2.into(), + target_partition_size: 5000, + hnsw_index_config: None, + compression: None, + index_version: 0, + runtime_hints: Default::default(), + }; + prost_types::Any::from_msg(&d).unwrap() + }; + assert_eq!( + 
vector_details_as_json(&details).unwrap(), + r#"{"metric_type":"L2","target_partition_size":5000}"# + ); + } + + #[test] + fn test_json_empty_details() { + let details = vector_index_details_default(); + assert_eq!(vector_details_as_json(&details).unwrap(), "{}"); + } + + #[test] + fn test_metric_type_from_index_metadata_populated() { + // Test that populated details return the metric type. + // Note: We add a non-default compression field so the proto doesn't serialize to empty bytes. + let details = make_details( + VectorMetricType::L2, + None, + Some(Compression::Pq(ProductQuantization { + num_bits: 8, + num_sub_vectors: 16, + })), + ); + let index_details = Some(std::sync::Arc::new(details)); + let index = IndexMetadata { + uuid: uuid::Uuid::new_v4(), + fields: vec![0], + name: "test_index".to_string(), + dataset_version: 1, + fragment_bitmap: None, + index_details, + index_version: 1, + created_at: None, + base_id: None, + files: None, + }; + + let metric = metric_type_from_index_metadata(&index); + assert_eq!(metric, Some(DistanceType::L2)); + } + + #[test] + fn test_metric_type_from_index_metadata_empty() { + // Test that empty details return None (legacy index) + let details = vector_index_details_default(); + let index_details = Some(std::sync::Arc::new(details)); + let index = IndexMetadata { + uuid: uuid::Uuid::new_v4(), + fields: vec![0], + name: "test_index".to_string(), + dataset_version: 1, + fragment_bitmap: None, + index_details, + index_version: 1, + created_at: None, + base_id: None, + files: None, + }; + + let metric = metric_type_from_index_metadata(&index); + assert_eq!(metric, None); + } + + #[test] + fn test_metric_type_from_index_metadata_none() { + // Test that missing details return None + let index = IndexMetadata { + uuid: uuid::Uuid::new_v4(), + fields: vec![0], + name: "test_index".to_string(), + dataset_version: 1, + fragment_bitmap: None, + index_details: None, + index_version: 1, + created_at: None, + base_id: None, + files: None, 
+ }; + + let metric = metric_type_from_index_metadata(&index); + assert_eq!(metric, None); + } + + #[test] + fn test_metric_type_from_index_metadata_all_metrics() { + // Test all supported metric types. + // Note: We add a non-default compression field so the proto doesn't serialize to empty bytes. + let metrics = [ + VectorMetricType::L2, + VectorMetricType::Cosine, + VectorMetricType::Dot, + VectorMetricType::Hamming, + ]; + let expected = [ + DistanceType::L2, + DistanceType::Cosine, + DistanceType::Dot, + DistanceType::Hamming, + ]; + + for (metric_enum, expected_distance) in metrics.iter().zip(expected.iter()) { + let details = make_details( + *metric_enum, + None, + Some(Compression::Sq(ScalarQuantization { num_bits: 8 })), + ); + let index_details = Some(std::sync::Arc::new(details)); + let index = IndexMetadata { + uuid: uuid::Uuid::new_v4(), + fields: vec![0], + name: "test_index".to_string(), + dataset_version: 1, + fragment_bitmap: None, + index_details, + index_version: 1, + created_at: None, + base_id: None, + files: None, + }; + + let metric = metric_type_from_index_metadata(&index); + assert_eq!(metric, Some(*expected_distance)); + } + } + + #[test] + fn test_runtime_hints_roundtrip() { + use crate::index::vector::{StageParams, VectorIndexParams}; + use lance_index::vector::ivf::builder::IvfBuildParams; + use lance_index::vector::pq::builder::PQBuildParams; + use lance_linalg::distance::DistanceType; + + // Non-default values for IVF and PQ hints + let params = VectorIndexParams::with_ivf_pq_params( + DistanceType::L2, + IvfBuildParams { + max_iters: 100, + sample_rate: 512, + shuffle_partition_batches: 2048, + shuffle_partition_concurrency: 4, + ..Default::default() + }, + PQBuildParams { + num_sub_vectors: 8, + num_bits: 8, + max_iters: 75, + kmeans_redos: 3, + sample_rate: 128, + ..Default::default() + }, + ); + + let any = vector_index_details(¶ms); + let details = any.to_msg::().unwrap(); + assert_eq!( + details + .runtime_hints + 
.get("lance.ivf.max_iters") + .map(|s| s.as_str()), + Some("100") + ); + assert_eq!( + details + .runtime_hints + .get("lance.ivf.sample_rate") + .map(|s| s.as_str()), + Some("512") + ); + assert_eq!( + details + .runtime_hints + .get("lance.ivf.shuffle_partition_batches") + .map(|s| s.as_str()), + Some("2048") + ); + assert_eq!( + details + .runtime_hints + .get("lance.ivf.shuffle_partition_concurrency") + .map(|s| s.as_str()), + Some("4") + ); + assert_eq!( + details + .runtime_hints + .get("lance.pq.max_iters") + .map(|s| s.as_str()), + Some("75") + ); + assert_eq!( + details + .runtime_hints + .get("lance.pq.sample_rate") + .map(|s| s.as_str()), + Some("128") + ); + assert_eq!( + details + .runtime_hints + .get("lance.pq.kmeans_redos") + .map(|s| s.as_str()), + Some("3") + ); + // Default values should not appear in the map + assert!( + !details + .runtime_hints + .contains_key("lance.hnsw.prefetch_distance") + ); + assert!(!details.runtime_hints.contains_key("lance.skip_transpose")); + + // Roundtrip: apply hints back to a fresh params struct + let mut restored = VectorIndexParams::with_ivf_pq_params( + DistanceType::L2, + IvfBuildParams::default(), + PQBuildParams { + num_sub_vectors: 8, + num_bits: 8, + ..Default::default() + }, + ); + apply_runtime_hints(&details.runtime_hints, &mut restored); + let StageParams::Ivf(ivf) = &restored.stages[0] else { + panic!() + }; + assert_eq!(ivf.max_iters, 100); + assert_eq!(ivf.sample_rate, 512); + assert_eq!(ivf.shuffle_partition_batches, 2048); + assert_eq!(ivf.shuffle_partition_concurrency, 4); + let StageParams::PQ(pq) = &restored.stages[1] else { + panic!() + }; + assert_eq!(pq.max_iters, 75); + assert_eq!(pq.sample_rate, 128); + assert_eq!(pq.kmeans_redos, 3); + } + + #[test] + fn test_runtime_hints_defaults_omitted() { + use crate::index::vector::VectorIndexParams; + use lance_index::vector::ivf::builder::IvfBuildParams; + use lance_index::vector::pq::builder::PQBuildParams; + use 
lance_linalg::distance::DistanceType; + + // All defaults — hints map should be empty + let params = VectorIndexParams::with_ivf_pq_params( + DistanceType::L2, + IvfBuildParams::default(), + PQBuildParams { + num_sub_vectors: 8, + num_bits: 8, + ..Default::default() + }, + ); + let any = vector_index_details(¶ms); + let details = any.to_msg::().unwrap(); + assert!(details.runtime_hints.is_empty()); + } + + #[test] + fn test_runtime_hints_in_json() { + use crate::index::vector::VectorIndexParams; + use lance_index::vector::ivf::builder::IvfBuildParams; + use lance_index::vector::pq::builder::PQBuildParams; + use lance_linalg::distance::DistanceType; + + let params = VectorIndexParams::with_ivf_pq_params( + DistanceType::L2, + IvfBuildParams { + max_iters: 100, + ..Default::default() + }, + PQBuildParams { + num_sub_vectors: 8, + num_bits: 8, + ..Default::default() + }, + ); + let any = vector_index_details(¶ms); + let json = vector_details_as_json(&any).unwrap(); + let parsed: serde_json::Value = serde_json::from_str(&json).unwrap(); + assert_eq!(parsed["runtime_hints"]["lance.ivf.max_iters"], "100"); + } + + #[test] + fn test_apply_runtime_hints_ignores_unknown_keys() { + use crate::index::vector::VectorIndexParams; + use lance_index::vector::ivf::builder::IvfBuildParams; + use lance_linalg::distance::DistanceType; + + let hints: HashMap = [ + ("lancedb.accelerator".to_string(), "cuda".to_string()), + ("unknown.vendor.key".to_string(), "value".to_string()), + ("lance.ivf.max_iters".to_string(), "99".to_string()), + ] + .into(); + + let mut params = + VectorIndexParams::with_ivf_flat_params(DistanceType::L2, IvfBuildParams::default()); + apply_runtime_hints(&hints, &mut params); + + let StageParams::Ivf(ivf) = ¶ms.stages[0] else { + panic!() + }; + assert_eq!(ivf.max_iters, 99); + // Unknown keys silently ignored — no panic + } +} diff --git a/rust/lance/src/index/vector/ivf.rs b/rust/lance/src/index/vector/ivf.rs index 9f226a42db5..ef899274a15 100644 --- 
a/rust/lance/src/index/vector/ivf.rs +++ b/rust/lance/src/index/vector/ivf.rs @@ -17,7 +17,10 @@ use crate::index::DatasetIndexInternalExt; use crate::index::vector::utils::{get_vector_dim, get_vector_type}; use crate::{ dataset::Dataset, - index::{INDEX_FILE_NAME, pb, prefilter::PreFilter, vector::ivf::io::write_pq_partitions}, + index::{ + INDEX_FILE_NAME, pb, prefilter::PreFilter, vector::ivf::io::write_pq_partitions, + vector_params_from_details, + }, }; use crate::{dataset::builder::DatasetBuilder, index::vector::IndexFileVersion}; use arrow::datatypes::UInt8Type; @@ -385,12 +388,22 @@ pub(crate) async fn optimize_vector_indices( // try cast to v1 IVFIndex, // fallback to v2 IVFIndex if it's not v1 IVFIndex if !existing_indices[0].as_any().is::() { + // Restore skip_transpose from stored details so incremental rebuilds + // honour the original preference rather than silently reverting to false. + let skip_transpose = logical_index + .segments() + .next() + .and_then(|(meta, _)| meta.index_details.as_deref()) + .and_then(vector_params_from_details) + .map(|p| p.skip_transpose) + .unwrap_or(false); return optimize_vector_indices_v2( &dataset, unindexed, vector_column, &existing_indices, options, + skip_transpose, ) .await; } @@ -461,6 +474,7 @@ pub(crate) async fn optimize_vector_indices_v2( vector_column: &str, existing_indices: &[Arc], options: &OptimizeOptions, + skip_transpose: bool, ) -> Result<(Uuid, usize)> { // Sanity check the indices if existing_indices.is_empty() { @@ -503,6 +517,7 @@ pub(crate) async fn optimize_vector_indices_v2( .with_ivf(ivf_model.clone()) .with_quantizer(quantizer.try_into()?) .with_existing_indices(existing_indices.clone()) + .with_transpose(!skip_transpose) .shuffle_data(unindexed) .await? .build() @@ -521,6 +536,7 @@ pub(crate) async fn optimize_vector_indices_v2( .with_ivf(ivf_model.clone()) .with_quantizer(quantizer.try_into()?) 
.with_existing_indices(existing_indices.clone()) + .with_transpose(!skip_transpose) .shuffle_data(unindexed) .await? .build() @@ -542,6 +558,7 @@ pub(crate) async fn optimize_vector_indices_v2( .with_ivf(ivf_model.clone()) .with_quantizer(quantizer.try_into()?) .with_existing_indices(existing_indices.clone()) + .with_transpose(!skip_transpose) .shuffle_data(unindexed) .await? .build() @@ -562,6 +579,7 @@ pub(crate) async fn optimize_vector_indices_v2( .with_ivf(ivf_model.clone()) .with_quantizer(quantizer.try_into()?) .with_existing_indices(existing_indices.clone()) + .with_transpose(!skip_transpose) .shuffle_data(unindexed) .await? .build() @@ -581,6 +599,7 @@ pub(crate) async fn optimize_vector_indices_v2( .with_ivf(ivf_model.clone()) .with_quantizer(quantizer.try_into()?) .with_existing_indices(existing_indices.clone()) + .with_transpose(!skip_transpose) .shuffle_data(unindexed) .await? .build() @@ -602,6 +621,7 @@ pub(crate) async fn optimize_vector_indices_v2( .with_ivf(ivf_model.clone()) .with_quantizer(quantizer.try_into()?) .with_existing_indices(existing_indices.clone()) + .with_transpose(!skip_transpose) .shuffle_data(unindexed) .await? .build() @@ -620,6 +640,7 @@ pub(crate) async fn optimize_vector_indices_v2( .with_ivf(ivf_model.clone()) .with_quantizer(quantizer.try_into()?) .with_existing_indices(existing_indices.clone()) + .with_transpose(!skip_transpose) .shuffle_data(unindexed) .await? .build() @@ -641,6 +662,7 @@ pub(crate) async fn optimize_vector_indices_v2( .with_ivf(ivf_model.clone()) .with_quantizer(quantizer.try_into()?) .with_existing_indices(existing_indices.clone()) + .with_transpose(!skip_transpose) .shuffle_data(unindexed) .await? .build() @@ -661,6 +683,7 @@ pub(crate) async fn optimize_vector_indices_v2( .with_ivf(ivf_model.clone()) .with_quantizer(quantizer.try_into()?) .with_existing_indices(existing_indices.clone()) + .with_transpose(!skip_transpose) .shuffle_data(unindexed) .await? 
.build() @@ -2049,7 +2072,7 @@ pub(crate) async fn merge_segments_with_progress( merged_segment = TableIndexMetadata { uuid: segment_uuid, fragment_bitmap: Some(fragment_bitmap), - index_details: Some(Arc::new(crate::index::vector_index_details())), + index_details: Some(Arc::new(crate::index::vector_index_details_default())), index_version, created_at: Some(chrono::Utc::now()), base_id: None, @@ -2462,7 +2485,7 @@ mod tests { use crate::dataset::{InsertBuilder, WriteMode, WriteParams}; use crate::index::prefilter::DatasetPreFilter; use crate::index::vector::IndexFileVersion; - use crate::index::vector_index_details; + use crate::index::vector_index_details_default; use crate::index::{DatasetIndexExt, DatasetIndexInternalExt, vector::VectorIndexParams}; use crate::utils::test::copy_test_data_to_tmp; @@ -2847,7 +2870,7 @@ mod tests { fields: vec![field.id], name: INDEX_NAME.to_string(), fragment_bitmap: Some(dataset.fragment_bitmap.as_ref().clone()), - index_details: Some(Arc::new(vector_index_details())), + index_details: Some(Arc::new(vector_index_details_default())), index_version: VECTOR_INDEX_VERSION as i32, created_at: Some(chrono::Utc::now()), base_id: None, @@ -2886,7 +2909,7 @@ mod tests { fields: Vec::new(), name: INDEX_NAME.to_string(), fragment_bitmap: None, - index_details: Some(Arc::new(vector_index_details())), + index_details: Some(Arc::new(vector_index_details_default())), index_version: VECTOR_INDEX_VERSION as i32, created_at: None, // Test index, not setting timestamp base_id: None, @@ -2946,7 +2969,7 @@ mod tests { fields: vec![field.id], name: format!("{}_remapped", INDEX_NAME), fragment_bitmap: Some(dataset_mut.fragment_bitmap.as_ref().clone()), - index_details: Some(Arc::new(vector_index_details())), + index_details: Some(Arc::new(vector_index_details_default())), index_version: VECTOR_INDEX_VERSION as i32, created_at: Some(chrono::Utc::now()), base_id: None, diff --git a/rust/lance/src/io/commit.rs b/rust/lance/src/io/commit.rs index 
7c35957fd75..39f61162a1c 100644 --- a/rust/lance/src/io/commit.rs +++ b/rust/lance/src/io/commit.rs @@ -628,6 +628,8 @@ fn must_recalculate_fragment_bitmap( /// Indices might be missing `fragment_bitmap`, so this function will add it. /// Indices might also be missing `files` (file sizes), so this function will collect them. async fn migrate_indices(dataset: &Dataset, indices: &mut [IndexMetadata]) -> Result<()> { + use crate::index::vector::details::infer_missing_vector_details; + infer_missing_vector_details(dataset, indices).await; let needs_recalculating = match detect_overlapping_fragments(indices) { Ok(()) => vec![], Err(BadFragmentBitmapError { bad_indices }) => {