From b606ea163f3100ff96a40caee1ceb70f5ef4592a Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Wed, 26 Nov 2025 18:02:21 +0800 Subject: [PATCH 1/3] feat: return Precondition error while expected error happened Signed-off-by: Xuanwo --- rust/lance-core/src/error.rs | 2 ++ rust/lance-index/src/vector/kmeans.rs | 9 ++++-- rust/lance/src/index.rs | 4 +-- rust/lance/src/index/vector/pq.rs | 46 ++++++++++++++++++++++++--- 4 files changed, 51 insertions(+), 10 deletions(-) diff --git a/rust/lance-core/src/error.rs b/rust/lance-core/src/error.rs index 8e02d2964c7..7d058b525f8 100644 --- a/rust/lance-core/src/error.rs +++ b/rust/lance-core/src/error.rs @@ -63,6 +63,8 @@ pub enum Error { Internal { message: String, location: Location }, #[snafu(display("A prerequisite task failed: {message}, {location}"))] PrerequisiteFailed { message: String, location: Location }, + #[snafu(display("Precondition failed: {message}, {location}"))] + Precondition { message: String, location: Location }, #[snafu(display("LanceError(Arrow): {message}, {location}"))] Arrow { message: String, location: Location }, #[snafu(display("LanceError(Schema): {message}, {location}"))] diff --git a/rust/lance-index/src/vector/kmeans.rs b/rust/lance-index/src/vector/kmeans.rs index 48c61bcdbe4..548b5b5ac55 100644 --- a/rust/lance-index/src/vector/kmeans.rs +++ b/rust/lance-index/src/vector/kmeans.rs @@ -1319,9 +1319,12 @@ where { let num_rows = array.len() / dimension; if num_rows < k { - return Err(Error::Index{message: format!( - "KMeans: can not train {k} centroids with {num_rows} vectors, choose a smaller K (< {num_rows}) instead" - ),location: location!()}); + return Err(Error::Precondition { + message: format!( + "KMeans cannot train {k} centroids with {num_rows} vectors; choose a smaller K (< {num_rows})" + ), + location: location!(), + }); } // Only sample sample_rate * num_clusters. 
See Faiss diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index 60342a00d93..1ae5b56ed13 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -2491,8 +2491,8 @@ mod tests { .create_index(&["vector"], IndexType::Vector, None, &params, false) .await; - assert!(matches!(result, Err(Error::Index { .. }))); - if let Error::Index { message, .. } = result.unwrap_err() { + assert!(matches!(result, Err(Error::Precondition { .. }))); + if let Error::Precondition { message, .. } = result.unwrap_err() { assert_eq!( message, "Not enough rows to train PQ. Requires 256 rows but only 100 available", diff --git a/rust/lance/src/index/vector/pq.rs b/rust/lance/src/index/vector/pq.rs index c6167876455..d2f748cfab2 100644 --- a/rust/lance/src/index/vector/pq.rs +++ b/rust/lance/src/index/vector/pq.rs @@ -508,6 +508,8 @@ pub async fn build_pq_model( params: &PQBuildParams, ivf: Option<&IvfModel>, ) -> Result { + let num_codes = 2_usize.pow(params.num_bits as u32); + if let Some(codebook) = &params.codebook { let dt = if metric_type == MetricType::Cosine { info!("Normalize training data for PQ training: Cosine"); @@ -577,12 +579,15 @@ pub async fn build_pq_model( training_data }; - let num_codes = 2_usize.pow(params.num_bits as u32); if training_data.len() < num_codes { - return Err(Error::Index { + warn!( + "Skip PQ training: only {} rows available, needs >= {}", + training_data.len(), + num_codes + ); + return Err(Error::Precondition { message: format!( - "Not enough rows to train PQ. Requires {:?} rows but only {:?} available", - num_codes, + "Not enough rows to train PQ. 
Requires {num_codes} rows but only {} available", training_data.len() ), location: location!(), @@ -637,7 +642,9 @@ mod tests { use crate::index::vector::ivf::build_ivf_model; use lance_core::utils::mask::RowIdMask; use lance_index::vector::ivf::IvfBuildParams; - use lance_testing::datagen::generate_random_array_with_range; + use lance_testing::datagen::{ + generate_random_array_with_range, generate_random_array_with_seed, + }; const DIM: usize = 128; async fn generate_dataset( @@ -761,6 +768,35 @@ mod tests { ); } + #[tokio::test] + async fn test_build_pq_model_insufficient_rows_returns_prereq() { + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + + let dim = 16; + let schema = Arc::new(Schema::new(vec![Field::new( + "vector", + DataType::FixedSizeList( + Arc::new(Field::new("item", DataType::Float32, true)), + dim as i32, + ), + false, + )])); + + let vectors = generate_random_array_with_seed::(dim * 10, [11u8; 32]); + let fsl = FixedSizeListArray::try_new_from_values(vectors, dim as i32).unwrap(); + let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(fsl)]).unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + let dataset = Dataset::write(reader, test_uri, None).await.unwrap(); + + let params = PQBuildParams::new(16, 8); + let err = build_pq_model(&dataset, "vector", dim, MetricType::L2, &params, None) + .await + .unwrap_err(); + + assert!(matches!(err, Error::Precondition { .. 
})); + } + struct TestPreFilter { row_ids: Vec, } From 19c5bdb3d23c73875386c660e7796cf32878e3c8 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Wed, 26 Nov 2025 19:12:47 +0800 Subject: [PATCH 2/3] Fix Build Signed-off-by: Xuanwo --- rust/lance-index/src/vector/pq/builder.rs | 5 ++--- rust/lance/src/index/vector/pq.rs | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/rust/lance-index/src/vector/pq/builder.rs b/rust/lance-index/src/vector/pq/builder.rs index 5b17e2f1224..250dc628726 100644 --- a/rust/lance-index/src/vector/pq/builder.rs +++ b/rust/lance-index/src/vector/pq/builder.rs @@ -171,10 +171,9 @@ impl PQBuildParams { let num_centroids = 2_usize.pow(self.num_bits as u32); if data.len() < num_centroids { - return Err(Error::Index { + return Err(Error::Precondition { message: format!( - "Not enough rows to train PQ. Requires {:?} rows but only {:?} available", - num_centroids, + "Not enough rows to train PQ. Requires {num_centroids} rows but only {} available", data.len() ), location: location!(), diff --git a/rust/lance/src/index/vector/pq.rs b/rust/lance/src/index/vector/pq.rs index d2f748cfab2..4b5aeebe40f 100644 --- a/rust/lance/src/index/vector/pq.rs +++ b/rust/lance/src/index/vector/pq.rs @@ -587,8 +587,8 @@ pub async fn build_pq_model( ); return Err(Error::Precondition { message: format!( - "Not enough rows to train PQ. Requires {num_codes} rows but only {} available", - training_data.len() + "Not enough rows to train PQ. 
Requires {num_codes} rows but only {available} available", + available = training_data.len() ), location: location!(), }); From bac0f4dc899bd1817df76f48222295518ab3063d Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Thu, 27 Nov 2025 14:42:40 +0800 Subject: [PATCH 3/3] Use better names Signed-off-by: Xuanwo --- rust/lance-core/src/error.rs | 4 ++-- rust/lance-index/src/vector/kmeans.rs | 2 +- rust/lance-index/src/vector/pq/builder.rs | 2 +- rust/lance/src/index.rs | 4 ++-- rust/lance/src/index/vector/pq.rs | 4 ++-- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/rust/lance-core/src/error.rs b/rust/lance-core/src/error.rs index 7d058b525f8..48150db4354 100644 --- a/rust/lance-core/src/error.rs +++ b/rust/lance-core/src/error.rs @@ -63,8 +63,8 @@ pub enum Error { Internal { message: String, location: Location }, #[snafu(display("A prerequisite task failed: {message}, {location}"))] PrerequisiteFailed { message: String, location: Location }, - #[snafu(display("Precondition failed: {message}, {location}"))] - Precondition { message: String, location: Location }, + #[snafu(display("Unprocessable: {message}, {location}"))] + Unprocessable { message: String, location: Location }, #[snafu(display("LanceError(Arrow): {message}, {location}"))] Arrow { message: String, location: Location }, #[snafu(display("LanceError(Schema): {message}, {location}"))] diff --git a/rust/lance-index/src/vector/kmeans.rs b/rust/lance-index/src/vector/kmeans.rs index 548b5b5ac55..be76fade6f6 100644 --- a/rust/lance-index/src/vector/kmeans.rs +++ b/rust/lance-index/src/vector/kmeans.rs @@ -1319,7 +1319,7 @@ where { let num_rows = array.len() / dimension; if num_rows < k { - return Err(Error::Precondition { + return Err(Error::Unprocessable { message: format!( "KMeans cannot train {k} centroids with {num_rows} vectors; choose a smaller K (< {num_rows})" ), diff --git a/rust/lance-index/src/vector/pq/builder.rs b/rust/lance-index/src/vector/pq/builder.rs index 250dc628726..d44d86e4f31 
100644 --- a/rust/lance-index/src/vector/pq/builder.rs +++ b/rust/lance-index/src/vector/pq/builder.rs @@ -171,7 +171,7 @@ impl PQBuildParams { let num_centroids = 2_usize.pow(self.num_bits as u32); if data.len() < num_centroids { - return Err(Error::Precondition { + return Err(Error::Unprocessable { message: format!( "Not enough rows to train PQ. Requires {num_centroids} rows but only {} available", data.len() diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index 1ae5b56ed13..2a3ad508483 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -2491,8 +2491,8 @@ mod tests { .create_index(&["vector"], IndexType::Vector, None, &params, false) .await; - assert!(matches!(result, Err(Error::Precondition { .. }))); - if let Error::Precondition { message, .. } = result.unwrap_err() { + assert!(matches!(result, Err(Error::Unprocessable { .. }))); + if let Error::Unprocessable { message, .. } = result.unwrap_err() { assert_eq!( message, "Not enough rows to train PQ. Requires 256 rows but only 100 available", diff --git a/rust/lance/src/index/vector/pq.rs b/rust/lance/src/index/vector/pq.rs index 4b5aeebe40f..6c55f50f7af 100644 --- a/rust/lance/src/index/vector/pq.rs +++ b/rust/lance/src/index/vector/pq.rs @@ -585,7 +585,7 @@ pub async fn build_pq_model( training_data.len(), num_codes ); - return Err(Error::Precondition { + return Err(Error::Unprocessable { message: format!( "Not enough rows to train PQ. Requires {num_codes} rows but only {available} available", available = training_data.len() @@ -794,7 +794,7 @@ mod tests { .await .unwrap_err(); - assert!(matches!(err, Error::Precondition { .. })); + assert!(matches!(err, Error::Unprocessable { .. })); } struct TestPreFilter {