Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 78 additions & 8 deletions rust/lance-core/src/datatypes/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,9 @@ impl FieldRef<'_> {
Ok(id)
}
FieldRef::ByPath(path) => {
let field = schema.field(path).ok_or_else(|| Error::InvalidInput {
source: format!("Field '{}' not found in schema", path).into(),
location: location!(),
})?;
let field = schema
.field(path)
.ok_or_else(|| Error::field_not_found(path, schema.field_paths()))?;
Ok(field.id)
}
}
Expand Down Expand Up @@ -267,10 +266,7 @@ impl Schema {
}
}
} else if err_on_missing {
return Err(Error::Schema {
message: format!("Column {} does not exist", col.as_ref()),
location: location!(),
});
return Err(Error::field_not_found(col.as_ref(), self.field_paths()));
}
}

Expand Down Expand Up @@ -384,6 +380,27 @@ impl Schema {
SchemaFieldIterPreOrder::new(self)
}

/// Collect the dotted path of every field in the schema.
///
/// Fields are visited in the order yielded by [`Self::fields_pre_order`], so
/// nested fields are included alongside top-level ones. Each path is built by
/// joining the names along the field's ancestry with `.`; for example a
/// struct field "user" containing a field "name" contributes "user.name".
///
/// Fields for which [`Self::field_ancestry_by_id`] returns `None` are skipped.
pub fn field_paths(&self) -> Vec<String> {
    self.fields_pre_order()
        .filter_map(|field| {
            // No ancestry means no path can be built for this field.
            let lineage = self.field_ancestry_by_id(field.id)?;
            let segments: Vec<&str> = lineage.iter().map(|f| f.name.as_str()).collect();
            Some(segments.join("."))
        })
        .collect()
}

/// Returns a new schema that only contains the fields in `column_ids`.
///
/// This projection can filter out both top-level and nested fields
Expand Down Expand Up @@ -2772,4 +2789,57 @@ mod tests {
assert_eq!(pk_fields[1].name, "e");
assert_eq!(pk_fields[2].name, "g");
}

/// Projecting a missing column must report the available fields, and must
/// offer a "Did you mean" hint only when a close match exists.
#[test]
fn test_project_with_suggestion() {
    let columns = vec![
        ArrowField::new("vector", ArrowDataType::Float32, false),
        ArrowField::new("label", ArrowDataType::Utf8, true),
        ArrowField::new("score", ArrowDataType::Float64, false),
    ];
    let arrow = ArrowSchema::new(columns);
    let schema = Schema::try_from(&arrow).unwrap();

    // Near miss: "vectr" is one edit away from "vector", so a hint is expected.
    let typo_msg = schema.project(&["vectr"]).unwrap_err().to_string();
    assert!(
        typo_msg.contains("Did you mean 'vector'?"),
        "Expected suggestion for 'vectr', got: {}",
        typo_msg
    );
    // The candidate list is printed alongside the hint.
    assert!(
        typo_msg.contains("Available fields:"),
        "Expected available fields list, got: {}",
        typo_msg
    );

    // Nothing in the schema resembles this name: no hint, but the list remains.
    let unknown_msg = schema
        .project(&["nonexistent_column"])
        .unwrap_err()
        .to_string();
    assert!(
        !unknown_msg.contains("Did you mean"),
        "Should not suggest for completely different name, got: {}",
        unknown_msg
    );
    assert!(
        unknown_msg.contains("Available fields:"),
        "Expected available fields list even without suggestion, got: {}",
        unknown_msg
    );
}

/// Every top-level field of a flat schema must appear in `field_paths()`.
#[test]
fn test_field_paths() {
    let arrow = ArrowSchema::new(vec![
        ArrowField::new("id", ArrowDataType::Int32, false),
        ArrowField::new("vector", ArrowDataType::Float32, false),
        ArrowField::new("name", ArrowDataType::Utf8, true),
    ]);
    let paths = Schema::try_from(&arrow).unwrap().field_paths();

    for expected in ["id", "vector", "name"] {
        assert!(paths.iter().any(|path| path.as_str() == expected));
    }
}
}
53 changes: 53 additions & 0 deletions rust/lance-core/src/error.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,50 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

use std::fmt;

use arrow_schema::ArrowError;
use snafu::{Location, Snafu};

type BoxedError = Box<dyn std::error::Error + Send + Sync + 'static>;

/// Error for when a requested field is not found in a schema.
///
/// This error computes suggestions lazily (only when displayed) to avoid
/// computing Levenshtein distance when the error is created but never shown.
///
/// The struct itself only stores the raw inputs; the "Did you mean ...?" hint
/// and the "Available fields" listing are produced by its `Display` impl.
#[derive(Debug)]
pub struct FieldNotFoundError {
    /// The name that was looked up and not found; echoed verbatim in the message.
    pub field_name: String,
    /// Field names that were available for the lookup. Used both as the pool
    /// for the closest-match suggestion and for the listing in the message.
    pub candidates: Vec<String>,
}

impl fmt::Display for FieldNotFoundError {
    /// Render the error as:
    ///
    /// ```text
    /// Field '<name>' not found.[ Did you mean '<closest>'?]
    /// Available fields: ['a', 'b', ...]
    /// ```
    ///
    /// The suggestion is computed here, at display time, from the stored
    /// candidates. At most ten candidates are listed; any further ones are
    /// summarised as `... and N more`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Cap on how many candidate names are spelled out in the message.
        const MAX_LISTED: usize = 10;

        write!(f, "Field '{}' not found.", self.field_name)?;
        if let Some(closest) =
            crate::levenshtein::find_best_suggestion(&self.field_name, &self.candidates)
        {
            write!(f, " Did you mean '{}'?", closest)?;
        }

        write!(f, "\nAvailable fields: [")?;
        let listed = self
            .candidates
            .iter()
            .take(MAX_LISTED)
            .map(|name| format!("'{}'", name))
            .collect::<Vec<_>>()
            .join(", ");
        f.write_str(&listed)?;

        // Number of candidates that did not fit in the listing.
        let hidden = self.candidates.len().saturating_sub(MAX_LISTED);
        if hidden == 0 {
            write!(f, "]")
        } else {
            write!(f, ", ... and {} more]", hidden)
        }
    }
}

// Empty impl: the trait's default methods are used as-is. Implementing
// `std::error::Error` is what allows this type to be carried as the `source`
// of `Error::FieldNotFound`.
impl std::error::Error for FieldNotFoundError {}

/// Allocates error on the heap and then places `e` into it.
#[inline]
pub fn box_error(e: impl std::error::Error + Send + Sync + 'static) -> BoxedError {
Expand Down Expand Up @@ -125,6 +164,10 @@ pub enum Error {
/// or inspected using [`Error::external_source`].
#[snafu(transparent)]
External { source: BoxedError },

/// A requested field was not found in a schema.
#[snafu(transparent)]
FieldNotFound { source: FieldNotFoundError },
}

impl Error {
Expand Down Expand Up @@ -197,6 +240,16 @@ impl Error {
Self::External { source }
}

/// Build an [`Error::FieldNotFound`] for a failed field lookup.
///
/// `field_name` is the name that was requested; `candidates` are the field
/// names that were available. Both are stored as-is in the wrapped
/// [`FieldNotFoundError`].
pub fn field_not_found(field_name: impl Into<String>, candidates: Vec<String>) -> Self {
    let source = FieldNotFoundError {
        candidates,
        field_name: field_name.into(),
    };
    Self::FieldNotFound { source }
}

/// Returns a reference to the external error source if this is an `External` variant.
///
/// This allows downcasting to recover the original error type.
Expand Down
133 changes: 133 additions & 0 deletions rust/lance-core/src/levenshtein.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

/// Compute the Levenshtein (edit) distance between `s1` and `s2`.
///
/// The result is the minimum number of single-character insertions,
/// deletions, or substitutions needed to turn one string into the other.
/// Strings are compared by `char`, not by byte.
///
/// # Examples
///
/// ```
/// use lance_core::levenshtein::levenshtein_distance;
///
/// assert_eq!(levenshtein_distance("kitten", "sitting"), 3);
/// assert_eq!(levenshtein_distance("hello", "hello"), 0);
/// ```
pub fn levenshtein_distance(s1: &str, s2: &str) -> usize {
    let source: Vec<char> = s1.chars().collect();
    let target: Vec<char> = s2.chars().collect();

    // Against an empty string the distance is simply the other string's length.
    if source.is_empty() {
        return target.len();
    }
    if target.is_empty() {
        return source.len();
    }

    // One rolling row of the DP table: before processing a source character,
    // `costs[j]` holds the distance between the source prefix consumed so far
    // and the first `j` characters of `target`.
    let mut costs: Vec<usize> = (0..=target.len()).collect();

    for (row, &from) in source.iter().enumerate() {
        // `diagonal` carries the previous row's value one column to the left.
        let mut diagonal = costs[0];
        costs[0] = row + 1;

        for (col, &to) in target.iter().enumerate() {
            let above = costs[col + 1];
            let substitute = diagonal + usize::from(from != to);
            let delete = above + 1;
            let insert = costs[col] + 1;

            costs[col + 1] = delete.min(insert).min(substitute);
            diagonal = above;
        }
    }

    costs[target.len()]
}

/// Pick the option closest to `input` by Levenshtein distance, if any is
/// close enough.
///
/// An option qualifies when its distance to `input` is at most one third of
/// the number of characters in `input` (integer division). Among qualifying
/// options the one with the smallest distance is returned; on a tie the
/// earliest one in `options` wins. Returns `None` when `input` is empty or
/// no option qualifies.
///
/// # Examples
///
/// ```
/// use lance_core::levenshtein::find_best_suggestion;
///
/// let options = vec!["vector", "id", "name"];
/// assert_eq!(find_best_suggestion("vacter", &options), Some("vector"));
/// assert_eq!(find_best_suggestion("hello", &options), None);
/// ```
pub fn find_best_suggestion<'a, 'b>(
    input: &'a str,
    options: &'b [impl AsRef<str>],
) -> Option<&'b str> {
    if input.is_empty() {
        return None;
    }

    // Largest edit distance still considered a plausible typo.
    let max_edits = input.chars().count() / 3;

    options
        .iter()
        .map(|option| {
            let candidate: &'b str = option.as_ref();
            (candidate, levenshtein_distance(input, candidate))
        })
        .filter(|&(_, edits)| edits <= max_edits)
        // `min_by_key` keeps the first of several equal minima, matching a
        // "replace only on strictly smaller distance" scan.
        .min_by_key(|&(_, edits)| edits)
        .map(|(candidate, _)| candidate)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Known edit distances, including empty-string and single-edit cases.
    #[test]
    fn test_levenshtein_distance() {
        // (left, right, expected distance)
        let cases = [
            ("", "", 0),
            ("a", "", 1),
            ("", "a", 1),
            ("abc", "abc", 0),
            ("abc", "", 3),
            ("", "abc", 3),
            ("kitten", "sitting", 3),
            ("saturday", "sunday", 3),
            ("vector", "vectr", 1),
            ("vector", "vextor", 1),
            ("vector", "vvector", 1),
            ("abc", "xyz", 3),
        ];

        for (left, right, expected) in cases {
            assert_eq!(
                levenshtein_distance(left, right),
                expected,
                "distance between {:?} and {:?}",
                left,
                right
            );
        }
    }

    /// Suggestions are offered only within the distance threshold, and the
    /// closest qualifying option is chosen.
    #[test]
    fn test_find_best_suggestion() {
        let options = vec!["vector", "id", "name", "column", "table"];

        // (input, expected suggestion)
        let cases = [
            // Close enough to an option.
            ("vacter", Some("vector")),
            ("vectr", Some("vector")),
            ("tble", Some("table")),
            // Should return None if no good match
            ("hello", None),
            ("xyz", None),
            // Should return None if input is too short
            ("v", None),
            ("", None),
        ];

        for (input, expected) in cases {
            assert_eq!(
                find_best_suggestion(input, &options),
                expected,
                "suggestion for {:?}",
                input
            );
        }

        // Picks closest when multiple are close
        let similar = ["vector", "vendor"];
        assert_eq!(find_best_suggestion("vecor", &similar), Some("vector"));
    }
}
1 change: 1 addition & 0 deletions rust/lance-core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ pub mod cache;
pub mod container;
pub mod datatypes;
pub mod error;
pub mod levenshtein;
pub mod traits;
pub mod utils;

Expand Down
11 changes: 7 additions & 4 deletions rust/lance/src/dataset/metadata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -529,10 +529,13 @@ mod tests {

assert!(result.is_err());
let err = result.unwrap_err();
assert!(matches!(err, Error::InvalidInput { .. }));
assert!(err
.to_string()
.contains("Field 'non_existent_field' not found in schema"));
assert!(matches!(err, Error::FieldNotFound { .. }));
assert!(
err.to_string()
.contains("Field 'non_existent_field' not found"),
"Expected error message to contain field name, got: {}",
err
);
}

#[tokio::test]
Expand Down