From 53028c094c1411fd303ddc2e3f7ff2e4966ecef7 Mon Sep 17 00:00:00 2001 From: Hemantsudarshan Date: Sun, 22 Feb 2026 01:45:44 +0530 Subject: [PATCH 1/3] feat(core): add Levenshtein-based suggestions to not-found errors in schema --- rust/lance-core/src/datatypes/schema.rs | 86 +++++++++++++-- rust/lance-core/src/error.rs | 53 ++++++++++ rust/lance-core/src/levenshtein.rs | 133 ++++++++++++++++++++++++ rust/lance-core/src/lib.rs | 1 + 4 files changed, 265 insertions(+), 8 deletions(-) create mode 100644 rust/lance-core/src/levenshtein.rs diff --git a/rust/lance-core/src/datatypes/schema.rs b/rust/lance-core/src/datatypes/schema.rs index 65e44e0d38d..fdddf5cbca8 100644 --- a/rust/lance-core/src/datatypes/schema.rs +++ b/rust/lance-core/src/datatypes/schema.rs @@ -54,10 +54,9 @@ impl FieldRef<'_> { Ok(id) } FieldRef::ByPath(path) => { - let field = schema.field(path).ok_or_else(|| Error::InvalidInput { - source: format!("Field '{}' not found in schema", path).into(), - location: location!(), - })?; + let field = schema + .field(path) + .ok_or_else(|| Error::field_not_found(path, schema.field_paths()))?; Ok(field.id) } } @@ -267,10 +266,7 @@ impl Schema { } } } else if err_on_missing { - return Err(Error::Schema { - message: format!("Column {} does not exist", col.as_ref()), - location: location!(), - }); + return Err(Error::field_not_found(col.as_ref(), self.field_paths())); } } @@ -384,6 +380,27 @@ impl Schema { SchemaFieldIterPreOrder::new(self) } + /// Get all field paths in the schema as a list of strings. + /// + /// This returns all field paths in the schema, including nested fields. + /// For example, if there's a struct field "user" with a field "name", + /// this will return "user.name" as one of the paths. + pub fn field_paths(&self) -> Vec { + let mut paths = Vec::new(); + for field in self.fields_pre_order() { + let ancestry = self.field_ancestry_by_id(field.id); + if let Some(ancestry) = ancestry { + let path = ancestry + .iter() + .map(|f| f.name.as_str()) + .collect::>() + .join("."); + paths.push(path); + } + } + paths + } + /// Returns a new schema that only contains the fields in `column_ids`. /// /// This projection can filter out both top-level and nested fields @@ -2772,4 +2789,57 @@ mod tests { assert_eq!(pk_fields[1].name, "e"); assert_eq!(pk_fields[2].name, "g"); } + + #[test] + fn test_project_with_suggestion() { + let arrow_schema = ArrowSchema::new(vec![ + ArrowField::new("vector", ArrowDataType::Float32, false), + ArrowField::new("label", ArrowDataType::Utf8, true), + ArrowField::new("score", ArrowDataType::Float64, false), + ]); + let schema = Schema::try_from(&arrow_schema).unwrap(); + + // Typo: "vectr" is close to "vector" → should get suggestion + let err = schema.project(&["vectr"]).unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("Did you mean 'vector'?"), + "Expected suggestion for 'vectr', got: {}", + msg + ); + // Should also list available fields + assert!( + msg.contains("Available fields:"), + "Expected available fields list, got: {}", + msg + ); + + // Completely wrong name → no suggestion but still lists fields + let err = schema.project(&["nonexistent_column"]).unwrap_err(); + let msg = err.to_string(); + assert!( + !msg.contains("Did you mean"), + "Should not suggest for completely different name, got: {}", + msg + ); + assert!( + msg.contains("Available fields:"), + "Expected available fields list even without suggestion, got: {}", + msg + ); + } + + #[test] + fn test_field_paths() { + let arrow_schema = ArrowSchema::new(vec![ + ArrowField::new("id", ArrowDataType::Int32, false), + ArrowField::new("vector", ArrowDataType::Float32, false), + ArrowField::new("name", ArrowDataType::Utf8, true), + ]); + let schema = Schema::try_from(&arrow_schema).unwrap(); + let paths = schema.field_paths(); + assert!(paths.contains(&"id".to_string())); + assert!(paths.contains(&"vector".to_string())); + assert!(paths.contains(&"name".to_string())); + } } diff --git a/rust/lance-core/src/error.rs b/rust/lance-core/src/error.rs index fe943e03a1d..72bb865061a 100644 --- a/rust/lance-core/src/error.rs +++ b/rust/lance-core/src/error.rs @@ -1,11 +1,50 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors +use std::fmt; + use arrow_schema::ArrowError; use snafu::{Location, Snafu}; type BoxedError = Box; +/// Error for when a requested field is not found in a schema. +/// +/// This error computes suggestions lazily (only when displayed) to avoid +/// computing Levenshtein distance when the error is created but never shown. +#[derive(Debug)] +pub struct FieldNotFoundError { + pub field_name: String, + pub candidates: Vec, +} + +impl fmt::Display for FieldNotFoundError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "Field '{}' not found.", self.field_name)?; + let suggestion = + crate::levenshtein::find_best_suggestion(&self.field_name, &self.candidates); + if let Some(suggestion) = suggestion { + write!(f, " Did you mean '{}'?", suggestion)?; + } + write!(f, "\nAvailable fields: [")?; + for (i, candidate) in self.candidates.iter().take(10).enumerate() { + if i > 0 { + write!(f, ", ")?; + } + write!(f, "'{}'", candidate)?; + } + if self.candidates.len() > 10 { + let remaining = self.candidates.len() - 10; + write!(f, ", ... and {} more]", remaining)?; + } else { + write!(f, "]")?; + } + Ok(()) + } +} + +impl std::error::Error for FieldNotFoundError {} + /// Allocates error on the heap and then places `e` into it. #[inline] pub fn box_error(e: impl std::error::Error + Send + Sync + 'static) -> BoxedError { @@ -125,6 +164,10 @@ pub enum Error { /// or inspected using [`Error::external_source`]. #[snafu(transparent)] External { source: BoxedError }, + + /// A requested field was not found in a schema. + #[snafu(transparent)] + FieldNotFound { source: FieldNotFoundError }, } impl Error { @@ -197,6 +240,16 @@ impl Error { Self::External { source } } + /// Create a FieldNotFound error with the given field name and available candidates. + pub fn field_not_found(field_name: impl Into, candidates: Vec) -> Self { + Self::FieldNotFound { + source: FieldNotFoundError { + field_name: field_name.into(), + candidates, + }, + } + } + /// Returns a reference to the external error source if this is an `External` variant. /// /// This allows downcasting to recover the original error type. diff --git a/rust/lance-core/src/levenshtein.rs b/rust/lance-core/src/levenshtein.rs new file mode 100644 index 00000000000..ebf5d890127 --- /dev/null +++ b/rust/lance-core/src/levenshtein.rs @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +/// Calculate the Levenshtein distance between two strings. +/// +/// The Levenshtein distance is a measure of the number of single-character edits +/// (insertions, deletions, or substitutions) required to change one word into the other. +/// +/// # Examples +/// +/// ``` +/// use lance_core::levenshtein::levenshtein_distance; +/// +/// assert_eq!(levenshtein_distance("kitten", "sitting"), 3); +/// assert_eq!(levenshtein_distance("hello", "hello"), 0); +/// ``` +pub fn levenshtein_distance(s1: &str, s2: &str) -> usize { + let s1_chars: Vec = s1.chars().collect(); + let s2_chars: Vec = s2.chars().collect(); + let m = s1_chars.len(); + let n = s2_chars.len(); + + if m == 0 { + return n; + } + if n == 0 { + return m; + } + + // Use two rows instead of full matrix for space efficiency + let mut prev_row: Vec = (0..=n).collect(); + let mut curr_row: Vec = vec![0; n + 1]; + + for (i, s1_char) in s1_chars.iter().enumerate() { + curr_row[0] = i + 1; + for (j, s2_char) in s2_chars.iter().enumerate() { + let cost = if s1_char == s2_char { 0 } else { 1 }; + curr_row[j + 1] = (prev_row[j + 1] + 1) + .min(curr_row[j] + 1) + .min(prev_row[j] + cost); + } + std::mem::swap(&mut prev_row, &mut curr_row); + } + + prev_row[n] +} + +/// Find the best suggestion from a list of options based on Levenshtein distance. +/// +/// Returns `Some(suggestion)` if there's an option where the Levenshtein distance +/// is at most 1/3 of the length of the input string (integer division). +/// Otherwise returns `None`. +/// +/// # Examples +/// +/// ``` +/// use lance_core::levenshtein::find_best_suggestion; +/// +/// let options = vec!["vector", "id", "name"]; +/// assert_eq!(find_best_suggestion("vacter", &options), Some("vector")); +/// assert_eq!(find_best_suggestion("hello", &options), None); +/// ``` +pub fn find_best_suggestion<'a, 'b>( + input: &'a str, + options: &'b [impl AsRef], +) -> Option<&'b str> { + let input_len = input.chars().count(); + if input_len == 0 { + return None; + } + + let threshold = input_len / 3; + let mut best_option: Option<(&'b str, usize)> = None; + for option in options { + let distance = levenshtein_distance(input, option.as_ref()); + if distance <= threshold { + match &best_option { + None => best_option = Some((option.as_ref(), distance)), + Some((_, best_distance)) => { + if distance < *best_distance { + best_option = Some((option.as_ref(), distance)); + } + } + } + } + } + + best_option.map(|(option, _)| option) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_levenshtein_distance() { + assert_eq!(levenshtein_distance("", ""), 0); + assert_eq!(levenshtein_distance("a", ""), 1); + assert_eq!(levenshtein_distance("", "a"), 1); + assert_eq!(levenshtein_distance("abc", "abc"), 0); + assert_eq!(levenshtein_distance("abc", ""), 3); + assert_eq!(levenshtein_distance("", "abc"), 3); + assert_eq!(levenshtein_distance("kitten", "sitting"), 3); + assert_eq!(levenshtein_distance("saturday", "sunday"), 3); + assert_eq!(levenshtein_distance("vector", "vectr"), 1); + assert_eq!(levenshtein_distance("vector", "vextor"), 1); + assert_eq!(levenshtein_distance("vector", "vvector"), 1); + assert_eq!(levenshtein_distance("abc", "xyz"), 3); + } + + #[test] + fn test_find_best_suggestion() { + let options = vec!["vector", "id", "name", "column", "table"]; + + assert_eq!(find_best_suggestion("vacter", &options), Some("vector")); + assert_eq!(find_best_suggestion("vectr", &options), Some("vector")); + assert_eq!(find_best_suggestion("tble", &options), Some("table")); + + // Should return None if no good match + assert_eq!(find_best_suggestion("hello", &options), None); + assert_eq!(find_best_suggestion("xyz", &options), None); + + // Should return None if input is too short + assert_eq!(find_best_suggestion("v", &options), None); + assert_eq!(find_best_suggestion("", &options), None); + + // Picks closest when multiple are close + assert_eq!( + find_best_suggestion("vecor", &["vector", "vendor"]), + Some("vector") + ); + } +} diff --git a/rust/lance-core/src/lib.rs b/rust/lance-core/src/lib.rs index 8c669eda223..0860f710c84 100644 --- a/rust/lance-core/src/lib.rs +++ b/rust/lance-core/src/lib.rs @@ -9,6 +9,7 @@ pub mod cache; pub mod container; pub mod datatypes; pub mod error; +pub mod levenshtein; pub mod traits; pub mod utils; From dcb5fc28492e8be04ac9058b3f877b302fbe4ebd Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 25 Feb 2026 09:11:00 -0800 Subject: [PATCH 2/3] fix test --- rust/lance/src/dataset/metadata.rs | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/rust/lance/src/dataset/metadata.rs b/rust/lance/src/dataset/metadata.rs index d800ccce61f..48dcdd8eaee 100644 --- a/rust/lance/src/dataset/metadata.rs +++ b/rust/lance/src/dataset/metadata.rs @@ -529,10 +529,13 @@ mod tests { assert!(result.is_err()); let err = result.unwrap_err(); - assert!(matches!(err, Error::InvalidInput { .. })); - assert!(err - .to_string() - .contains("Field 'non_existent_field' not found in schema")); + assert!(matches!(err, Error::FieldNotFound { .. })); + assert!( + err.to_string() + .contains("Field 'non_existent_field' not found"), + "Expected error message to contain field name, got: {}", + err.to_string() + ); } #[tokio::test] From a36aef1fd93c56b4babb28b689b0da7ecef20d24 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 25 Feb 2026 09:25:35 -0800 Subject: [PATCH 3/3] clippy --- rust/lance/src/dataset/metadata.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/lance/src/dataset/metadata.rs b/rust/lance/src/dataset/metadata.rs index 48dcdd8eaee..0149675df39 100644 --- a/rust/lance/src/dataset/metadata.rs +++ b/rust/lance/src/dataset/metadata.rs @@ -534,7 +534,7 @@ mod tests { err.to_string() .contains("Field 'non_existent_field' not found"), "Expected error message to contain field name, got: {}", - err.to_string() + err ); }