From 965864fb2b4505f4d3c4b9969a99ca5d3b320fe8 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Wed, 26 Nov 2025 18:11:24 +0800 Subject: [PATCH 1/3] refactor: allow datafiles to contain columns without field id Signed-off-by: Xuanwo --- rust/lance-table/src/format/fragment.rs | 29 +++++++++++++++++++++++-- rust/lance/src/dataset/fragment.rs | 2 +- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/rust/lance-table/src/format/fragment.rs b/rust/lance-table/src/format/fragment.rs index 4d935411575..87135d7fb01 100644 --- a/rust/lance-table/src/format/fragment.rs +++ b/rust/lance-table/src/format/fragment.rs @@ -33,6 +33,9 @@ pub struct DataFile { /// /// Note that -1 is a possibility and it indices that the field has /// no top-level column in the file. + /// + /// Columns that lack a field id may still exist as extra entries in + /// `column_indices`; such columns are ignored by field-id–based projection. #[serde(default)] pub column_indices: Vec<i32>, /// The major version of the file format used to write this file. @@ -139,10 +142,12 @@ impl DataFile { location!(), )); } - } else if self.fields.len() != self.column_indices.len() { + } else if self.column_indices.len() < self.fields.len() { + // Every recorded field id must have a column index, but not every column needs + // to be associated with a field id (extra columns are allowed). 
return Err(Error::corrupt_file( base_path.child(self.path.clone()), - "contained an unequal number of fields / column_indices", + "contained fewer column_indices than fields", location!(), )); } @@ -531,6 +536,7 @@ mod tests { use arrow_schema::{ DataType, Field as ArrowField, Fields as ArrowFields, Schema as ArrowSchema, }; + use object_store::path::Path; use serde_json::{json, Value}; #[test] @@ -618,4 +624,23 @@ mod tests { let frag2 = Fragment::from_json(&json).unwrap(); assert_eq!(fragment, frag2); } + + #[test] + fn data_file_validate_allows_extra_columns() { + let data_file = DataFile { + path: "foo.lance".to_string(), + fields: vec![1, 2], + // One extra column without a field id mapping + column_indices: vec![0, 1, 2], + file_major_version: MAJOR_VERSION as u32, + file_minor_version: MINOR_VERSION as u32, + file_size_bytes: Default::default(), + base_id: None, + }; + + let base_path = Path::from("base"); + data_file + .validate(&base_path) + .expect("validation should allow extra columns without field ids"); + } } diff --git a/rust/lance/src/dataset/fragment.rs b/rust/lance/src/dataset/fragment.rs index 5e5f0520691..62a9ce71521 100644 --- a/rust/lance/src/dataset/fragment.rs +++ b/rust/lance/src/dataset/fragment.rs @@ -1195,7 +1195,7 @@ impl FileFragment { /// * All field ids in the fragment are distinct /// * Within each data file, field ids are in increasing order /// * All fields in the schema have a corresponding field in one of the data - /// files + /// files (extra columns without field ids are allowed and ignored) /// * All data files exist and have the same length /// * Field ids are distinct between data files. 
/// * Deletion file exists and has rowids in the correct range From 6fd1f4b589e6ecc32a733e1c7bb6a6f3614dfc6a Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Wed, 26 Nov 2025 23:21:35 +0800 Subject: [PATCH 2/3] Update rust/lance-table/src/format/fragment.rs Co-authored-by: Weston Pace --- rust/lance-table/src/format/fragment.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rust/lance-table/src/format/fragment.rs b/rust/lance-table/src/format/fragment.rs index 87135d7fb01..c039279b55f 100644 --- a/rust/lance-table/src/format/fragment.rs +++ b/rust/lance-table/src/format/fragment.rs @@ -36,6 +36,8 @@ pub struct DataFile { /// /// Columns that lack a field id may still exist as extra entries in /// `column_indices`; such columns are ignored by field-id–based projection. + /// For example, some fields, such as blob fields, occupy multiple + /// columns in the file but only have a single field id. #[serde(default)] pub column_indices: Vec<i32>, /// The major version of the file format used to write this file. From a6773d29146b298ca732e418c0efe24474586d71 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Wed, 26 Nov 2025 23:22:26 +0800 Subject: [PATCH 3/3] Remove wrong comment Signed-off-by: Xuanwo --- rust/lance/src/dataset/fragment.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/rust/lance/src/dataset/fragment.rs b/rust/lance/src/dataset/fragment.rs index 62a9ce71521..15c77a8c2a8 100644 --- a/rust/lance/src/dataset/fragment.rs +++ b/rust/lance/src/dataset/fragment.rs @@ -1194,8 +1194,6 @@ impl FileFragment { /// Verifies: /// * All field ids in the fragment are distinct /// * Within each data file, field ids are in increasing order - /// * All fields in the schema have a corresponding field in one of the data - /// files (extra columns without field ids are allowed and ignored) /// * All data files exist and have the same length /// * Field ids are distinct between data files. /// * Deletion file exists and has rowids in the correct range