From 4b718f9c70cfbfd56a6b6e025a6d0cb78a54b7b5 Mon Sep 17 00:00:00 2001 From: Heng Ge Date: Sat, 10 Jan 2026 16:55:45 -0800 Subject: [PATCH 01/10] Add order to primary key --- docs/src/format/table/index.md | 26 ++++- protos/file.proto | 5 + rust/lance-core/src/datatypes/field.rs | 42 ++++++-- rust/lance-core/src/datatypes/schema.rs | 135 +++++++++++++++++++++++- rust/lance-file/src/datatypes.rs | 11 +- 5 files changed, 203 insertions(+), 16 deletions(-) diff --git a/docs/src/format/table/index.md b/docs/src/format/table/index.md index 0114feeb0a1..4fc463fff5c 100644 --- a/docs/src/format/table/index.md +++ b/docs/src/format/table/index.md @@ -25,7 +25,7 @@ a monotonically increasing version number, and an optional reference to the inde ## Schema & Fields -The schema of the table is written as a series of fields, plus a schema metadata map. +The schema of the table is written as a series of fields, plus a schema metadata map. The data types generally have a 1-1 correspondence with the Apache Arrow data types. Each field, including nested fields, have a unique integer id. At initial table creation time, fields are assigned ids in depth-first order. Afterwards, field IDs are assigned incrementally for newly added fields. @@ -42,6 +42,30 @@ See [File Format Encoding Specification](../file/encoding.md) for details on ava +### Unenforced Primary Key + +Lance supports defining an unenforced primary key through field metadata. +This is useful for deduplication during merge-insert operations and other use cases that benefit from logical row identity. +The primary key is "unenforced" meaning Lance does not always validate uniqueness constraints. +Users can use specific workloads like merge-insert to enforce it if necessary. + +A primary key field must satisfy: + +- The field, and all its ancestors, must not be nullable. +- The field must be a leaf field (primitive data type without children). +- The field must not be within a list or map type. 
+ +To mark a field as part of the primary key, add the following metadata to the Arrow field: + +- `lance-schema:unenforced-primary-key`: Set to `true`, `1`, or `yes` (case-insensitive) to indicate the field is part of the primary key. +- `lance-schema:unenforced-primary-key:position` (optional): A 1-based integer specifying the field's position within a composite primary key. + +For composite primary keys with multiple columns, the position determines the column ordering: + +- When positions are specified, fields are ordered by their position values (1, 2, 3, ...). +- When positions are not specified, fields are ordered by their lance schema field id. +- Fields with explicit positions are ordered before fields without explicit positions. + ## Fragments ![Fragment Structure](../../images/fragment_structure.png) diff --git a/protos/file.proto b/protos/file.proto index 4245b354a21..78e6328fcb8 100644 --- a/protos/file.proto +++ b/protos/file.proto @@ -166,6 +166,11 @@ message Field { bool unenforced_primary_key = 12; + // Position of this field in the primary key (1-based). + // 0 means the field is part of the primary key but no explicit position is set. + // When set to a positive value, primary key fields are ordered by this position. + uint32 unenforced_primary_key_position = 13; + // DEPRECATED ---------------------------------------------------------------- // Deprecated: Only used in V1 file format. V2 uses variable encodings defined diff --git a/rust/lance-core/src/datatypes/field.rs b/rust/lance-core/src/datatypes/field.rs index 1df60d65611..d29dbcc6148 100644 --- a/rust/lance-core/src/datatypes/field.rs +++ b/rust/lance-core/src/datatypes/field.rs @@ -42,6 +42,13 @@ use crate::{ /// (3) The field must not be within a list type. pub const LANCE_UNENFORCED_PRIMARY_KEY: &str = "lance-schema:unenforced-primary-key"; +/// Use this config key in Arrow field metadata to specify the position of a primary key column. 
+/// The value is a 1-based integer indicating the order within the composite primary key. +/// When specified, primary key fields are ordered by this position. +/// When not specified, primary key fields are ordered by their lance schema field id. +pub const LANCE_UNENFORCED_PRIMARY_KEY_POSITION: &str = + "lance-schema:unenforced-primary-key:position"; + fn has_blob_v2_extension(field: &ArrowField) -> bool { field .metadata() @@ -148,7 +155,11 @@ pub struct Field { /// Dictionary value array if this field is dictionary. pub dictionary: Option, - pub unenforced_primary_key: bool, + + /// Position of this field in the primary key (1-based). + /// None means the field is not part of the primary key. + /// Some(n) means this field is the nth column in the primary key. + pub unenforced_primary_key_position: Option, } impl Field { @@ -574,7 +585,7 @@ impl Field { nullable: self.nullable, children: vec![], dictionary: self.dictionary.clone(), - unenforced_primary_key: self.unenforced_primary_key, + unenforced_primary_key_position: self.unenforced_primary_key_position, }; if path_components.is_empty() { // Project stops here, copy all the remaining children. @@ -845,7 +856,7 @@ impl Field { nullable: self.nullable, children, dictionary: self.dictionary.clone(), - unenforced_primary_key: self.unenforced_primary_key, + unenforced_primary_key_position: self.unenforced_primary_key_position, }; return Ok(f); } @@ -908,7 +919,7 @@ impl Field { nullable: self.nullable, children, dictionary: self.dictionary.clone(), - unenforced_primary_key: self.unenforced_primary_key, + unenforced_primary_key_position: self.unenforced_primary_key_position, }) } } @@ -1038,6 +1049,11 @@ impl Field { pub fn is_leaf(&self) -> bool { self.children.is_empty() } + + /// Return true if the field is part of the (unenforced) primary key. 
+ pub fn is_unenforced_primary_key(&self) -> bool { + self.unenforced_primary_key_position.is_some() + } } impl fmt::Display for Field { @@ -1114,10 +1130,18 @@ impl TryFrom<&ArrowField> for Field { } _ => vec![], }; - let unenforced_primary_key = metadata - .get(LANCE_UNENFORCED_PRIMARY_KEY) - .map(|s| matches!(s.to_lowercase().as_str(), "true" | "1" | "yes")) - .unwrap_or(false); + // Parse primary key position: first try explicit position, then fall back to boolean flag + let unenforced_primary_key_position = metadata + .get(LANCE_UNENFORCED_PRIMARY_KEY_POSITION) + .and_then(|s| s.parse::().ok()) + .or_else(|| { + // Backward compatibility: if only the boolean flag is set, use 0 to indicate + // "is PK but no explicit position" (will be ordered by field id) + metadata + .get(LANCE_UNENFORCED_PRIMARY_KEY) + .filter(|s| matches!(s.to_lowercase().as_str(), "true" | "1" | "yes")) + .map(|_| 0) + }); let is_blob_v2 = has_blob_v2_extension(field); if is_blob_v2 { @@ -1154,7 +1178,7 @@ impl TryFrom<&ArrowField> for Field { nullable: field.is_nullable(), children, dictionary: None, - unenforced_primary_key, + unenforced_primary_key_position, }) } } diff --git a/rust/lance-core/src/datatypes/schema.rs b/rust/lance-core/src/datatypes/schema.rs index 242dea3315b..fc4c072b4ba 100644 --- a/rust/lance-core/src/datatypes/schema.rs +++ b/rust/lance-core/src/datatypes/schema.rs @@ -111,11 +111,31 @@ impl<'a> Iterator for SchemaFieldIterPreOrder<'a> { } impl Schema { - /// The unenforced primary key fields in the schema + /// The unenforced primary key fields in the schema, ordered by position. + /// + /// Fields with explicit positions (1, 2, 3, ...) are ordered by their position. + /// Fields without explicit positions (using the legacy boolean flag) are ordered + /// by their schema field id and come after explicitly positioned fields. 
pub fn unenforced_primary_key(&self) -> Vec<&Field> { - self.fields_pre_order() - .filter(|f| f.unenforced_primary_key) - .collect::>() + let mut pk_fields: Vec<&Field> = self + .fields_pre_order() + .filter(|f| f.is_unenforced_primary_key()) + .collect(); + + // Sort by position, with fields lacking explicit position (position=0) + // coming after explicitly positioned fields, sorted by field id + pk_fields.sort_by_key(|f| { + let pos = f.unenforced_primary_key_position.unwrap_or(0); + if pos > 0 { + // Explicit position: sort by position, then by field id for stability + (false, pos as i32, f.id) + } else { + // No explicit position: sort by field id, after explicit positions + (true, f.id, f.id) + } + }); + + pk_fields } pub fn compare_with_options(&self, expected: &Self, options: &SchemaCompareOptions) -> bool { @@ -2599,4 +2619,111 @@ mod tests { .contains(error_message_contains[idx])); } } + + #[test] + fn test_schema_unenforced_primary_key_ordering() { + use crate::datatypes::field::LANCE_UNENFORCED_PRIMARY_KEY_POSITION; + + // Test 1: Explicit positions should order by position + let arrow_schema = ArrowSchema::new(vec![ + ArrowField::new("a", DataType::Int32, false).with_metadata( + vec![ + ( + "lance-schema:unenforced-primary-key".to_owned(), + "true".to_owned(), + ), + ( + LANCE_UNENFORCED_PRIMARY_KEY_POSITION.to_owned(), + "2".to_owned(), + ), + ] + .into_iter() + .collect::>(), + ), + ArrowField::new("b", DataType::Int64, false).with_metadata( + vec![ + ( + "lance-schema:unenforced-primary-key".to_owned(), + "true".to_owned(), + ), + ( + LANCE_UNENFORCED_PRIMARY_KEY_POSITION.to_owned(), + "1".to_owned(), + ), + ] + .into_iter() + .collect::>(), + ), + ]); + let schema = Schema::try_from(&arrow_schema).unwrap(); + let pk_fields = schema.unenforced_primary_key(); + assert_eq!(pk_fields.len(), 2); + assert_eq!(pk_fields[0].name, "b"); // position 1 + assert_eq!(pk_fields[1].name, "a"); // position 2 + + // Test 2: No explicit positions should order by 
field id + let arrow_schema = ArrowSchema::new(vec![ + ArrowField::new("c", DataType::Int32, false).with_metadata( + vec![( + "lance-schema:unenforced-primary-key".to_owned(), + "true".to_owned(), + )] + .into_iter() + .collect::>(), + ), + ArrowField::new("d", DataType::Int64, false).with_metadata( + vec![( + "lance-schema:unenforced-primary-key".to_owned(), + "true".to_owned(), + )] + .into_iter() + .collect::>(), + ), + ]); + let schema = Schema::try_from(&arrow_schema).unwrap(); + let pk_fields = schema.unenforced_primary_key(); + assert_eq!(pk_fields.len(), 2); + assert_eq!(pk_fields[0].name, "c"); // field_id 0 + assert_eq!(pk_fields[1].name, "d"); // field_id 1 + + // Test 3: Mixed - explicit positions come before fields without explicit positions + let arrow_schema = ArrowSchema::new(vec![ + ArrowField::new("e", DataType::Int32, false).with_metadata( + vec![( + "lance-schema:unenforced-primary-key".to_owned(), + "true".to_owned(), + )] + .into_iter() + .collect::>(), + ), + ArrowField::new("f", DataType::Int64, false).with_metadata( + vec![ + ( + "lance-schema:unenforced-primary-key".to_owned(), + "true".to_owned(), + ), + ( + LANCE_UNENFORCED_PRIMARY_KEY_POSITION.to_owned(), + "1".to_owned(), + ), + ] + .into_iter() + .collect::>(), + ), + ArrowField::new("g", DataType::Utf8, false).with_metadata( + vec![( + "lance-schema:unenforced-primary-key".to_owned(), + "true".to_owned(), + )] + .into_iter() + .collect::>(), + ), + ]); + let schema = Schema::try_from(&arrow_schema).unwrap(); + let pk_fields = schema.unenforced_primary_key(); + assert_eq!(pk_fields.len(), 3); + assert_eq!(pk_fields[0].name, "f"); // explicit position 1 + assert_eq!(pk_fields[1].name, "e"); // no explicit position, field_id 0 + assert_eq!(pk_fields[2].name, "g"); // no explicit position, field_id 2 + } } diff --git a/rust/lance-file/src/datatypes.rs b/rust/lance-file/src/datatypes.rs index 09c5076f86d..c0966bf0268 100644 --- a/rust/lance-file/src/datatypes.rs +++ 
b/rust/lance-file/src/datatypes.rs @@ -45,7 +45,13 @@ impl From<&pb::Field> for Field { nullable: field.nullable, children: vec![], dictionary: field.dictionary.as_ref().map(Dictionary::from), - unenforced_primary_key: field.unenforced_primary_key, + unenforced_primary_key_position: if field.unenforced_primary_key_position > 0 { + Some(field.unenforced_primary_key_position) + } else if field.unenforced_primary_key { + Some(0) + } else { + None + }, } } } @@ -77,7 +83,8 @@ impl From<&Field> for pb::Field { .map(|name| name.to_owned()) .unwrap_or_default(), r#type: 0, - unenforced_primary_key: field.unenforced_primary_key, + unenforced_primary_key: field.unenforced_primary_key_position.is_some(), + unenforced_primary_key_position: field.unenforced_primary_key_position.unwrap_or(0), } } } From 5d58e9ed970e1bbaccf2b38987094f1423f9a0bf Mon Sep 17 00:00:00 2001 From: Heng Ge Date: Sat, 10 Jan 2026 19:19:44 -0800 Subject: [PATCH 02/10] refactor: rename primary key position to field_id Address review feedback to use field_id instead of position for consistency with other specs like partitioning and region specs. Co-Authored-By: Claude Opus 4.5 --- docs/src/format/table/index.md | 10 ++++----- protos/file.proto | 8 +++---- rust/lance-core/src/datatypes/field.rs | 26 ++++++++++----------- rust/lance-core/src/datatypes/schema.rs | 30 ++++++++++++------------- rust/lance-file/src/datatypes.rs | 8 +++---- 5 files changed, 41 insertions(+), 41 deletions(-) diff --git a/docs/src/format/table/index.md b/docs/src/format/table/index.md index 4fc463fff5c..4b8ec7c643d 100644 --- a/docs/src/format/table/index.md +++ b/docs/src/format/table/index.md @@ -58,13 +58,13 @@ A primary key field must satisfy: To mark a field as part of the primary key, add the following metadata to the Arrow field: - `lance-schema:unenforced-primary-key`: Set to `true`, `1`, or `yes` (case-insensitive) to indicate the field is part of the primary key. 
-- `lance-schema:unenforced-primary-key:position` (optional): A 1-based integer specifying the field's position within a composite primary key. +- `lance-schema:unenforced-primary-key:field-id` (optional): A 1-based integer specifying the field's ID within a composite primary key. -For composite primary keys with multiple columns, the position determines the column ordering: +For composite primary keys with multiple columns, the field ID determines the column ordering: -- When positions are specified, fields are ordered by their position values (1, 2, 3, ...). -- When positions are not specified, fields are ordered by their lance schema field id. -- Fields with explicit positions are ordered before fields without explicit positions. +- When field IDs are specified, fields are ordered by their field ID values (1, 2, 3, ...). +- When field IDs are not specified, fields are ordered by their lance schema field id. +- Fields with explicit field IDs are ordered before fields without explicit field IDs. ## Fragments diff --git a/protos/file.proto b/protos/file.proto index 78e6328fcb8..c8d94a27e4f 100644 --- a/protos/file.proto +++ b/protos/file.proto @@ -166,10 +166,10 @@ message Field { bool unenforced_primary_key = 12; - // Position of this field in the primary key (1-based). - // 0 means the field is part of the primary key but no explicit position is set. - // When set to a positive value, primary key fields are ordered by this position. - uint32 unenforced_primary_key_position = 13; + // Field ID of this field in the primary key (1-based). + // 0 means the field is part of the primary key but no explicit field ID is set. + // When set to a positive value, primary key fields are ordered by this field ID. 
+ uint32 unenforced_primary_key_field_id = 13; // DEPRECATED ---------------------------------------------------------------- diff --git a/rust/lance-core/src/datatypes/field.rs b/rust/lance-core/src/datatypes/field.rs index d29dbcc6148..487f22dff56 100644 --- a/rust/lance-core/src/datatypes/field.rs +++ b/rust/lance-core/src/datatypes/field.rs @@ -42,12 +42,12 @@ use crate::{ /// (3) The field must not be within a list type. pub const LANCE_UNENFORCED_PRIMARY_KEY: &str = "lance-schema:unenforced-primary-key"; -/// Use this config key in Arrow field metadata to specify the position of a primary key column. +/// Use this config key in Arrow field metadata to specify the field ID of a primary key column. /// The value is a 1-based integer indicating the order within the composite primary key. -/// When specified, primary key fields are ordered by this position. +/// When specified, primary key fields are ordered by this field ID. /// When not specified, primary key fields are ordered by their lance schema field id. -pub const LANCE_UNENFORCED_PRIMARY_KEY_POSITION: &str = - "lance-schema:unenforced-primary-key:position"; +pub const LANCE_UNENFORCED_PRIMARY_KEY_FIELD_ID: &str = + "lance-schema:unenforced-primary-key:field-id"; fn has_blob_v2_extension(field: &ArrowField) -> bool { field @@ -156,10 +156,10 @@ pub struct Field { /// Dictionary value array if this field is dictionary. pub dictionary: Option, - /// Position of this field in the primary key (1-based). + /// Field ID of this field in the primary key (1-based). /// None means the field is not part of the primary key. /// Some(n) means this field is the nth column in the primary key. 
- pub unenforced_primary_key_position: Option, + pub unenforced_primary_key_field_id: Option, } impl Field { @@ -585,7 +585,7 @@ impl Field { nullable: self.nullable, children: vec![], dictionary: self.dictionary.clone(), - unenforced_primary_key_position: self.unenforced_primary_key_position, + unenforced_primary_key_field_id: self.unenforced_primary_key_field_id, }; if path_components.is_empty() { // Project stops here, copy all the remaining children. @@ -856,7 +856,7 @@ impl Field { nullable: self.nullable, children, dictionary: self.dictionary.clone(), - unenforced_primary_key_position: self.unenforced_primary_key_position, + unenforced_primary_key_field_id: self.unenforced_primary_key_field_id, }; return Ok(f); } @@ -919,7 +919,7 @@ impl Field { nullable: self.nullable, children, dictionary: self.dictionary.clone(), - unenforced_primary_key_position: self.unenforced_primary_key_position, + unenforced_primary_key_field_id: self.unenforced_primary_key_field_id, }) } } @@ -1052,7 +1052,7 @@ impl Field { /// Return true if the field is part of the (unenforced) primary key. 
pub fn is_unenforced_primary_key(&self) -> bool { - self.unenforced_primary_key_position.is_some() + self.unenforced_primary_key_field_id.is_some() } } @@ -1131,8 +1131,8 @@ impl TryFrom<&ArrowField> for Field { _ => vec![], }; // Parse primary key position: first try explicit position, then fall back to boolean flag - let unenforced_primary_key_position = metadata - .get(LANCE_UNENFORCED_PRIMARY_KEY_POSITION) + let unenforced_primary_key_field_id = metadata + .get(LANCE_UNENFORCED_PRIMARY_KEY_FIELD_ID) .and_then(|s| s.parse::().ok()) .or_else(|| { // Backward compatibility: if only the boolean flag is set, use 0 to indicate @@ -1178,7 +1178,7 @@ impl TryFrom<&ArrowField> for Field { nullable: field.is_nullable(), children, dictionary: None, - unenforced_primary_key_position, + unenforced_primary_key_field_id, }) } } diff --git a/rust/lance-core/src/datatypes/schema.rs b/rust/lance-core/src/datatypes/schema.rs index fc4c072b4ba..d101e5cdca0 100644 --- a/rust/lance-core/src/datatypes/schema.rs +++ b/rust/lance-core/src/datatypes/schema.rs @@ -111,26 +111,26 @@ impl<'a> Iterator for SchemaFieldIterPreOrder<'a> { } impl Schema { - /// The unenforced primary key fields in the schema, ordered by position. + /// The unenforced primary key fields in the schema, ordered by field ID. /// - /// Fields with explicit positions (1, 2, 3, ...) are ordered by their position. - /// Fields without explicit positions (using the legacy boolean flag) are ordered - /// by their schema field id and come after explicitly positioned fields. + /// Fields with explicit field IDs (1, 2, 3, ...) are ordered by their field ID. + /// Fields without explicit field IDs (using the legacy boolean flag) are ordered + /// by their schema field id and come after fields with explicit field IDs. 
pub fn unenforced_primary_key(&self) -> Vec<&Field> { let mut pk_fields: Vec<&Field> = self .fields_pre_order() .filter(|f| f.is_unenforced_primary_key()) .collect(); - // Sort by position, with fields lacking explicit position (position=0) - // coming after explicitly positioned fields, sorted by field id + // Sort by field ID, with fields lacking explicit field ID (field_id=0) + // coming after fields with explicit field IDs, sorted by schema field id pk_fields.sort_by_key(|f| { - let pos = f.unenforced_primary_key_position.unwrap_or(0); - if pos > 0 { - // Explicit position: sort by position, then by field id for stability - (false, pos as i32, f.id) + let pk_field_id = f.unenforced_primary_key_field_id.unwrap_or(0); + if pk_field_id > 0 { + // Explicit field ID: sort by field ID, then by schema field id for stability + (false, pk_field_id as i32, f.id) } else { - // No explicit position: sort by field id, after explicit positions + // No explicit field ID: sort by schema field id, after explicit field IDs (true, f.id, f.id) } }); @@ -2622,7 +2622,7 @@ mod tests { #[test] fn test_schema_unenforced_primary_key_ordering() { - use crate::datatypes::field::LANCE_UNENFORCED_PRIMARY_KEY_POSITION; + use crate::datatypes::field::LANCE_UNENFORCED_PRIMARY_KEY_FIELD_ID; // Test 1: Explicit positions should order by position let arrow_schema = ArrowSchema::new(vec![ @@ -2633,7 +2633,7 @@ mod tests { "true".to_owned(), ), ( - LANCE_UNENFORCED_PRIMARY_KEY_POSITION.to_owned(), + LANCE_UNENFORCED_PRIMARY_KEY_FIELD_ID.to_owned(), "2".to_owned(), ), ] @@ -2647,7 +2647,7 @@ mod tests { "true".to_owned(), ), ( - LANCE_UNENFORCED_PRIMARY_KEY_POSITION.to_owned(), + LANCE_UNENFORCED_PRIMARY_KEY_FIELD_ID.to_owned(), "1".to_owned(), ), ] @@ -2703,7 +2703,7 @@ mod tests { "true".to_owned(), ), ( - LANCE_UNENFORCED_PRIMARY_KEY_POSITION.to_owned(), + LANCE_UNENFORCED_PRIMARY_KEY_FIELD_ID.to_owned(), "1".to_owned(), ), ] diff --git a/rust/lance-file/src/datatypes.rs 
b/rust/lance-file/src/datatypes.rs index c0966bf0268..0fb80ba1f4a 100644 --- a/rust/lance-file/src/datatypes.rs +++ b/rust/lance-file/src/datatypes.rs @@ -45,8 +45,8 @@ impl From<&pb::Field> for Field { nullable: field.nullable, children: vec![], dictionary: field.dictionary.as_ref().map(Dictionary::from), - unenforced_primary_key_position: if field.unenforced_primary_key_position > 0 { - Some(field.unenforced_primary_key_position) + unenforced_primary_key_field_id: if field.unenforced_primary_key_field_id > 0 { + Some(field.unenforced_primary_key_field_id) } else if field.unenforced_primary_key { Some(0) } else { @@ -83,8 +83,8 @@ impl From<&Field> for pb::Field { .map(|name| name.to_owned()) .unwrap_or_default(), r#type: 0, - unenforced_primary_key: field.unenforced_primary_key_position.is_some(), - unenforced_primary_key_position: field.unenforced_primary_key_position.unwrap_or(0), + unenforced_primary_key: field.unenforced_primary_key_field_id.is_some(), + unenforced_primary_key_field_id: field.unenforced_primary_key_field_id.unwrap_or(0), } } } From 2aaf44848fe916d94172bfe0fad44ed2fb9df299 Mon Sep 17 00:00:00 2001 From: Heng Ge Date: Sat, 10 Jan 2026 19:27:57 -0800 Subject: [PATCH 03/10] fix: address review comments - Add note that primary key is fixed after initial setting - Change "column ordering" to "primary key field ordering" in docs - Simplify inline comments - Update test comments to match doc terminology Co-Authored-By: Claude Opus 4.5 --- docs/src/format/table/index.md | 3 ++- rust/lance-core/src/datatypes/field.rs | 4 +--- rust/lance-core/src/datatypes/schema.rs | 24 ++++++++++-------------- 3 files changed, 13 insertions(+), 18 deletions(-) diff --git a/docs/src/format/table/index.md b/docs/src/format/table/index.md index 4b8ec7c643d..7be421aaa5b 100644 --- a/docs/src/format/table/index.md +++ b/docs/src/format/table/index.md @@ -48,6 +48,7 @@ Lance supports defining an unenforced primary key through field metadata. 
This is useful for deduplication during merge-insert operations and other use cases that benefit from logical row identity. The primary key is "unenforced" meaning Lance does not always validate uniqueness constraints. Users can use specific workloads like merge-insert to enforce it if necessary. +The primary key is fixed after initial setting and must not be updated or removed. A primary key field must satisfy: @@ -60,7 +61,7 @@ To mark a field as part of the primary key, add the following metadata to the Ar - `lance-schema:unenforced-primary-key`: Set to `true`, `1`, or `yes` (case-insensitive) to indicate the field is part of the primary key. - `lance-schema:unenforced-primary-key:field-id` (optional): A 1-based integer specifying the field's ID within a composite primary key. -For composite primary keys with multiple columns, the field ID determines the column ordering: +For composite primary keys with multiple columns, the field ID determines the primary key field ordering: - When field IDs are specified, fields are ordered by their field ID values (1, 2, 3, ...). - When field IDs are not specified, fields are ordered by their lance schema field id. 
diff --git a/rust/lance-core/src/datatypes/field.rs b/rust/lance-core/src/datatypes/field.rs index 487f22dff56..d746ebd9a8c 100644 --- a/rust/lance-core/src/datatypes/field.rs +++ b/rust/lance-core/src/datatypes/field.rs @@ -1130,13 +1130,11 @@ impl TryFrom<&ArrowField> for Field { } _ => vec![], }; - // Parse primary key position: first try explicit position, then fall back to boolean flag let unenforced_primary_key_field_id = metadata .get(LANCE_UNENFORCED_PRIMARY_KEY_FIELD_ID) .and_then(|s| s.parse::().ok()) .or_else(|| { - // Backward compatibility: if only the boolean flag is set, use 0 to indicate - // "is PK but no explicit position" (will be ordered by field id) + // Backward compatibility: use 0 for legacy boolean flag metadata .get(LANCE_UNENFORCED_PRIMARY_KEY) .filter(|s| matches!(s.to_lowercase().as_str(), "true" | "1" | "yes")) diff --git a/rust/lance-core/src/datatypes/schema.rs b/rust/lance-core/src/datatypes/schema.rs index d101e5cdca0..31049619c4b 100644 --- a/rust/lance-core/src/datatypes/schema.rs +++ b/rust/lance-core/src/datatypes/schema.rs @@ -122,15 +122,11 @@ impl Schema { .filter(|f| f.is_unenforced_primary_key()) .collect(); - // Sort by field ID, with fields lacking explicit field ID (field_id=0) - // coming after fields with explicit field IDs, sorted by schema field id pk_fields.sort_by_key(|f| { let pk_field_id = f.unenforced_primary_key_field_id.unwrap_or(0); if pk_field_id > 0 { - // Explicit field ID: sort by field ID, then by schema field id for stability (false, pk_field_id as i32, f.id) } else { - // No explicit field ID: sort by schema field id, after explicit field IDs (true, f.id, f.id) } }); @@ -2624,7 +2620,7 @@ mod tests { fn test_schema_unenforced_primary_key_ordering() { use crate::datatypes::field::LANCE_UNENFORCED_PRIMARY_KEY_FIELD_ID; - // Test 1: Explicit positions should order by position + // When field IDs are specified, fields are ordered by their field ID values let arrow_schema = ArrowSchema::new(vec![ 
ArrowField::new("a", DataType::Int32, false).with_metadata( vec![ @@ -2658,10 +2654,10 @@ mod tests { let schema = Schema::try_from(&arrow_schema).unwrap(); let pk_fields = schema.unenforced_primary_key(); assert_eq!(pk_fields.len(), 2); - assert_eq!(pk_fields[0].name, "b"); // position 1 - assert_eq!(pk_fields[1].name, "a"); // position 2 + assert_eq!(pk_fields[0].name, "b"); + assert_eq!(pk_fields[1].name, "a"); - // Test 2: No explicit positions should order by field id + // When field IDs are not specified, fields are ordered by their lance schema field id let arrow_schema = ArrowSchema::new(vec![ ArrowField::new("c", DataType::Int32, false).with_metadata( vec![( @@ -2683,10 +2679,10 @@ mod tests { let schema = Schema::try_from(&arrow_schema).unwrap(); let pk_fields = schema.unenforced_primary_key(); assert_eq!(pk_fields.len(), 2); - assert_eq!(pk_fields[0].name, "c"); // field_id 0 - assert_eq!(pk_fields[1].name, "d"); // field_id 1 + assert_eq!(pk_fields[0].name, "c"); + assert_eq!(pk_fields[1].name, "d"); - // Test 3: Mixed - explicit positions come before fields without explicit positions + // Fields with explicit field IDs are ordered before fields without explicit field IDs let arrow_schema = ArrowSchema::new(vec![ ArrowField::new("e", DataType::Int32, false).with_metadata( vec![( @@ -2722,8 +2718,8 @@ mod tests { let schema = Schema::try_from(&arrow_schema).unwrap(); let pk_fields = schema.unenforced_primary_key(); assert_eq!(pk_fields.len(), 3); - assert_eq!(pk_fields[0].name, "f"); // explicit position 1 - assert_eq!(pk_fields[1].name, "e"); // no explicit position, field_id 0 - assert_eq!(pk_fields[2].name, "g"); // no explicit position, field_id 2 + assert_eq!(pk_fields[0].name, "f"); + assert_eq!(pk_fields[1].name, "e"); + assert_eq!(pk_fields[2].name, "g"); } } From e3f8f9fbf32dc81f0eb2e64a0ea9c6dc3cbe2721 Mon Sep 17 00:00:00 2001 From: Heng Ge Date: Sat, 10 Jan 2026 19:56:42 -0800 Subject: [PATCH 04/10] fix: clarify primary key field id vs schema 
field id terminology Co-Authored-By: Claude Opus 4.5 --- docs/src/format/table/index.md | 10 +++++----- rust/lance-core/src/datatypes/schema.rs | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/src/format/table/index.md b/docs/src/format/table/index.md index 7be421aaa5b..fdb9e974073 100644 --- a/docs/src/format/table/index.md +++ b/docs/src/format/table/index.md @@ -59,13 +59,13 @@ A primary key field must satisfy: To mark a field as part of the primary key, add the following metadata to the Arrow field: - `lance-schema:unenforced-primary-key`: Set to `true`, `1`, or `yes` (case-insensitive) to indicate the field is part of the primary key. -- `lance-schema:unenforced-primary-key:field-id` (optional): A 1-based integer specifying the field's ID within a composite primary key. +- `lance-schema:unenforced-primary-key:field-id` (optional): A 1-based integer specifying the primary key field id within a composite primary key. -For composite primary keys with multiple columns, the field ID determines the primary key field ordering: +For composite primary keys with multiple columns, the primary key field id determines the primary key field ordering: -- When field IDs are specified, fields are ordered by their field ID values (1, 2, 3, ...). -- When field IDs are not specified, fields are ordered by their lance schema field id. -- Fields with explicit field IDs are ordered before fields without explicit field IDs. +- When primary key field ids are specified, fields are ordered by their primary key field id values (1, 2, 3, ...). +- When primary key field ids are not specified, fields are ordered by their lance schema field id. +- Fields with explicit primary key field ids are ordered before fields without. 
## Fragments diff --git a/rust/lance-core/src/datatypes/schema.rs b/rust/lance-core/src/datatypes/schema.rs index 31049619c4b..7f8c3cfab69 100644 --- a/rust/lance-core/src/datatypes/schema.rs +++ b/rust/lance-core/src/datatypes/schema.rs @@ -111,11 +111,11 @@ impl<'a> Iterator for SchemaFieldIterPreOrder<'a> { } impl Schema { - /// The unenforced primary key fields in the schema, ordered by field ID. + /// The unenforced primary key fields in the schema, ordered by primary key field id. /// - /// Fields with explicit field IDs (1, 2, 3, ...) are ordered by their field ID. - /// Fields without explicit field IDs (using the legacy boolean flag) are ordered - /// by their schema field id and come after fields with explicit field IDs. + /// Fields with explicit primary key field ids (1, 2, 3, ...) are ordered by their primary key field id. + /// Fields without explicit primary key field ids (using the legacy boolean flag) are ordered + /// by their schema field id and come after fields with explicit primary key field ids. pub fn unenforced_primary_key(&self) -> Vec<&Field> { let mut pk_fields: Vec<&Field> = self .fields_pre_order() From a9f92ca52b2ae1ed15a69d71ce5ad9604223988f Mon Sep 17 00:00:00 2001 From: Heng Ge Date: Sat, 10 Jan 2026 19:59:33 -0800 Subject: [PATCH 05/10] docs: clarify primary key field id vs schema field id in proto comment Co-Authored-By: Claude Opus 4.5 --- protos/file.proto | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/protos/file.proto b/protos/file.proto index c8d94a27e4f..515fc2e8dc5 100644 --- a/protos/file.proto +++ b/protos/file.proto @@ -166,9 +166,9 @@ message Field { bool unenforced_primary_key = 12; - // Field ID of this field in the primary key (1-based). - // 0 means the field is part of the primary key but no explicit field ID is set. - // When set to a positive value, primary key fields are ordered by this field ID. + // Primary key field id (1-based), distinct from the schema field id. 
+ // 0 means the field is part of the primary key but uses schema field id for ordering. + // When set to a positive value, primary key fields are ordered by this primary key field id. uint32 unenforced_primary_key_field_id = 13; // DEPRECATED ---------------------------------------------------------------- From 101576d13a3ad00a3af4227a66b1486b8f4e307c Mon Sep 17 00:00:00 2001 From: Heng Ge Date: Sun, 11 Jan 2026 11:39:44 -0800 Subject: [PATCH 06/10] refactor: rename primary key field-id to position Address review feedback to avoid confusion between "schema field id" and "primary key field id" by using "position" terminology instead. Co-Authored-By: Claude Opus 4.5 --- docs/src/format/table/index.md | 10 ++++----- protos/file.proto | 6 +++--- rust/lance-core/src/datatypes/field.rs | 28 ++++++++++++------------- rust/lance-core/src/datatypes/schema.rs | 22 +++++++++---------- rust/lance-file/src/datatypes.rs | 8 +++---- 5 files changed, 37 insertions(+), 37 deletions(-) diff --git a/docs/src/format/table/index.md b/docs/src/format/table/index.md index fdb9e974073..c2263716c9f 100644 --- a/docs/src/format/table/index.md +++ b/docs/src/format/table/index.md @@ -59,13 +59,13 @@ A primary key field must satisfy: To mark a field as part of the primary key, add the following metadata to the Arrow field: - `lance-schema:unenforced-primary-key`: Set to `true`, `1`, or `yes` (case-insensitive) to indicate the field is part of the primary key. -- `lance-schema:unenforced-primary-key:field-id` (optional): A 1-based integer specifying the primary key field id within a composite primary key. +- `lance-schema:unenforced-primary-key:position` (optional): A 1-based integer specifying the position within a composite primary key. 
-For composite primary keys with multiple columns, the primary key field id determines the primary key field ordering: +For composite primary keys with multiple columns, the position determines the primary key field ordering: -- When primary key field ids are specified, fields are ordered by their primary key field id values (1, 2, 3, ...). -- When primary key field ids are not specified, fields are ordered by their lance schema field id. -- Fields with explicit primary key field ids are ordered before fields without. +- When positions are specified, fields are ordered by their position values (1, 2, 3, ...). +- When positions are not specified, fields are ordered by their schema field id. +- Fields with explicit positions are ordered before fields without. ## Fragments diff --git a/protos/file.proto b/protos/file.proto index 515fc2e8dc5..db5971fe61d 100644 --- a/protos/file.proto +++ b/protos/file.proto @@ -166,10 +166,10 @@ message Field { bool unenforced_primary_key = 12; - // Primary key field id (1-based), distinct from the schema field id. + // Position of this field in the primary key (1-based). // 0 means the field is part of the primary key but uses schema field id for ordering. - // When set to a positive value, primary key fields are ordered by this primary key field id. - uint32 unenforced_primary_key_field_id = 13; + // When set to a positive value, primary key fields are ordered by this position. + uint32 unenforced_primary_key_position = 13; // DEPRECATED ---------------------------------------------------------------- diff --git a/rust/lance-core/src/datatypes/field.rs b/rust/lance-core/src/datatypes/field.rs index d746ebd9a8c..62c97a3914c 100644 --- a/rust/lance-core/src/datatypes/field.rs +++ b/rust/lance-core/src/datatypes/field.rs @@ -42,12 +42,12 @@ use crate::{ /// (3) The field must not be within a list type. 
pub const LANCE_UNENFORCED_PRIMARY_KEY: &str = "lance-schema:unenforced-primary-key"; -/// Use this config key in Arrow field metadata to specify the field ID of a primary key column. +/// Use this config key in Arrow field metadata to specify the position of a primary key column. /// The value is a 1-based integer indicating the order within the composite primary key. -/// When specified, primary key fields are ordered by this field ID. -/// When not specified, primary key fields are ordered by their lance schema field id. -pub const LANCE_UNENFORCED_PRIMARY_KEY_FIELD_ID: &str = - "lance-schema:unenforced-primary-key:field-id"; +/// When specified, primary key fields are ordered by this position value. +/// When not specified, primary key fields are ordered by their schema field id. +pub const LANCE_UNENFORCED_PRIMARY_KEY_POSITION: &str = + "lance-schema:unenforced-primary-key:position"; fn has_blob_v2_extension(field: &ArrowField) -> bool { field @@ -156,10 +156,10 @@ pub struct Field { /// Dictionary value array if this field is dictionary. pub dictionary: Option<Dictionary>, - /// Field ID of this field in the primary key (1-based). + /// Position of this field in the primary key (1-based). /// None means the field is not part of the primary key. /// Some(n) means this field is the nth column in the primary key. - pub unenforced_primary_key_field_id: Option<u32>, + pub unenforced_primary_key_position: Option<u32>, } impl Field { @@ -585,7 +585,7 @@ impl Field { nullable: self.nullable, children: vec![], dictionary: self.dictionary.clone(), - unenforced_primary_key_field_id: self.unenforced_primary_key_field_id, + unenforced_primary_key_position: self.unenforced_primary_key_position, }; if path_components.is_empty() { // Project stops here, copy all the remaining children.
@@ -856,7 +856,7 @@ impl Field { nullable: self.nullable, children, dictionary: self.dictionary.clone(), - unenforced_primary_key_field_id: self.unenforced_primary_key_field_id, + unenforced_primary_key_position: self.unenforced_primary_key_position, }; return Ok(f); } @@ -919,7 +919,7 @@ impl Field { nullable: self.nullable, children, dictionary: self.dictionary.clone(), - unenforced_primary_key_field_id: self.unenforced_primary_key_field_id, + unenforced_primary_key_position: self.unenforced_primary_key_position, }) } } @@ -1052,7 +1052,7 @@ impl Field { /// Return true if the field is part of the (unenforced) primary key. pub fn is_unenforced_primary_key(&self) -> bool { - self.unenforced_primary_key_field_id.is_some() + self.unenforced_primary_key_position.is_some() } } @@ -1130,8 +1130,8 @@ impl TryFrom<&ArrowField> for Field { } _ => vec![], }; - let unenforced_primary_key_field_id = metadata - .get(LANCE_UNENFORCED_PRIMARY_KEY_FIELD_ID) + let unenforced_primary_key_position = metadata + .get(LANCE_UNENFORCED_PRIMARY_KEY_POSITION) .and_then(|s| s.parse::<u32>().ok()) .or_else(|| { // Backward compatibility: use 0 for legacy boolean flag @@ -1176,7 +1176,7 @@ impl TryFrom<&ArrowField> for Field { nullable: field.is_nullable(), children, dictionary: None, - unenforced_primary_key_field_id, + unenforced_primary_key_position, }) } } diff --git a/rust/lance-core/src/datatypes/schema.rs b/rust/lance-core/src/datatypes/schema.rs index 7f8c3cfab69..5cfb83dbb8b 100644 --- a/rust/lance-core/src/datatypes/schema.rs +++ b/rust/lance-core/src/datatypes/schema.rs @@ -111,11 +111,11 @@ impl<'a> Iterator for SchemaFieldIterPreOrder<'a> { } impl Schema { - /// The unenforced primary key fields in the schema, ordered by primary key field id. + /// The unenforced primary key fields in the schema, ordered by position. /// - /// Fields with explicit primary key field ids (1, 2, 3, ...) are ordered by their primary key field id.
- /// Fields without explicit primary key field ids (using the legacy boolean flag) are ordered - /// by their schema field id and come after fields with explicit primary key field ids. + /// Fields with explicit positions (1, 2, 3, ...) are ordered by their position value. + /// Fields without explicit positions (using the legacy boolean flag) are ordered + /// by their schema field id and come after fields with explicit positions. pub fn unenforced_primary_key(&self) -> Vec<&Field> { let mut pk_fields: Vec<&Field> = self .fields_pre_order() @@ -123,9 +123,9 @@ impl Schema { .collect(); pk_fields.sort_by_key(|f| { - let pk_field_id = f.unenforced_primary_key_field_id.unwrap_or(0); - if pk_field_id > 0 { - (false, pk_field_id as i32, f.id) + let pk_position = f.unenforced_primary_key_position.unwrap_or(0); + if pk_position > 0 { + (false, pk_position as i32, f.id) } else { (true, f.id, f.id) } @@ -2618,7 +2618,7 @@ mod tests { #[test] fn test_schema_unenforced_primary_key_ordering() { - use crate::datatypes::field::LANCE_UNENFORCED_PRIMARY_KEY_FIELD_ID; + use crate::datatypes::field::LANCE_UNENFORCED_PRIMARY_KEY_POSITION; // When field IDs are specified, fields are ordered by their field ID values let arrow_schema = ArrowSchema::new(vec![ @@ -2629,7 +2629,7 @@ mod tests { "true".to_owned(), ), ( - LANCE_UNENFORCED_PRIMARY_KEY_FIELD_ID.to_owned(), + LANCE_UNENFORCED_PRIMARY_KEY_POSITION.to_owned(), "2".to_owned(), ), ] @@ -2643,7 +2643,7 @@ mod tests { "true".to_owned(), ), ( - LANCE_UNENFORCED_PRIMARY_KEY_FIELD_ID.to_owned(), + LANCE_UNENFORCED_PRIMARY_KEY_POSITION.to_owned(), "1".to_owned(), ), ] @@ -2699,7 +2699,7 @@ mod tests { "true".to_owned(), ), ( - LANCE_UNENFORCED_PRIMARY_KEY_FIELD_ID.to_owned(), + LANCE_UNENFORCED_PRIMARY_KEY_POSITION.to_owned(), "1".to_owned(), ), ] diff --git a/rust/lance-file/src/datatypes.rs b/rust/lance-file/src/datatypes.rs index 0fb80ba1f4a..c0966bf0268 100644 --- a/rust/lance-file/src/datatypes.rs +++ 
b/rust/lance-file/src/datatypes.rs @@ -45,8 +45,8 @@ impl From<&pb::Field> for Field { nullable: field.nullable, children: vec![], dictionary: field.dictionary.as_ref().map(Dictionary::from), - unenforced_primary_key_field_id: if field.unenforced_primary_key_field_id > 0 { - Some(field.unenforced_primary_key_field_id) + unenforced_primary_key_position: if field.unenforced_primary_key_position > 0 { + Some(field.unenforced_primary_key_position) } else if field.unenforced_primary_key { Some(0) } else { @@ -83,8 +83,8 @@ impl From<&Field> for pb::Field { .map(|name| name.to_owned()) .unwrap_or_default(), r#type: 0, - unenforced_primary_key: field.unenforced_primary_key_field_id.is_some(), - unenforced_primary_key_field_id: field.unenforced_primary_key_field_id.unwrap_or(0), + unenforced_primary_key: field.unenforced_primary_key_position.is_some(), + unenforced_primary_key_position: field.unenforced_primary_key_position.unwrap_or(0), } } } From 9ddc2e8228a6c4eeab58a5a013eae3ac95d131de Mon Sep 17 00:00:00 2001 From: Heng Ge Date: Sun, 11 Jan 2026 11:41:46 -0800 Subject: [PATCH 07/10] docs: clarify Arrow schema context for primary key metadata Co-Authored-By: Claude Opus 4.5 --- docs/src/format/table/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/format/table/index.md b/docs/src/format/table/index.md index c2263716c9f..a1105592ae6 100644 --- a/docs/src/format/table/index.md +++ b/docs/src/format/table/index.md @@ -56,7 +56,7 @@ A primary key field must satisfy: - The field must be a leaf field (primitive data type without children). - The field must not be within a list or map type. 
-To mark a field as part of the primary key, add the following metadata to the Arrow field: +When using an Arrow schema to create a Lance table, add the following metadata to the Arrow field to mark it as part of the primary key: - `lance-schema:unenforced-primary-key`: Set to `true`, `1`, or `yes` (case-insensitive) to indicate the field is part of the primary key. - `lance-schema:unenforced-primary-key:position` (optional): A 1-based integer specifying the position within a composite primary key. From ab70d05ec7efc0236d3ad34f5509eb5b7067d831 Mon Sep 17 00:00:00 2001 From: Heng Ge Date: Sun, 11 Jan 2026 11:46:13 -0800 Subject: [PATCH 08/10] fix: update test comments to use position terminology Co-Authored-By: Claude Opus 4.5 --- rust/lance-core/src/datatypes/schema.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rust/lance-core/src/datatypes/schema.rs b/rust/lance-core/src/datatypes/schema.rs index 5cfb83dbb8b..f0cffefe271 100644 --- a/rust/lance-core/src/datatypes/schema.rs +++ b/rust/lance-core/src/datatypes/schema.rs @@ -2620,7 +2620,7 @@ mod tests { fn test_schema_unenforced_primary_key_ordering() { use crate::datatypes::field::LANCE_UNENFORCED_PRIMARY_KEY_POSITION; - // When field IDs are specified, fields are ordered by their field ID values + // When positions are specified, fields are ordered by their position values let arrow_schema = ArrowSchema::new(vec![ ArrowField::new("a", DataType::Int32, false).with_metadata( vec![ @@ -2657,7 +2657,7 @@ mod tests { assert_eq!(pk_fields[0].name, "b"); assert_eq!(pk_fields[1].name, "a"); - // When field IDs are not specified, fields are ordered by their lance schema field id + // When positions are not specified, fields are ordered by their schema field id let arrow_schema = ArrowSchema::new(vec![ ArrowField::new("c", DataType::Int32, false).with_metadata( vec![( @@ -2682,7 +2682,7 @@ mod tests { assert_eq!(pk_fields[0].name, "c"); assert_eq!(pk_fields[1].name, "d"); - // Fields with 
explicit field IDs are ordered before fields without explicit field IDs + // Fields with explicit positions are ordered before fields without let arrow_schema = ArrowSchema::new(vec![ ArrowField::new("e", DataType::Int32, false).with_metadata( vec![( From 7590e3fbf10990d3ec9ffec8191a0952f4ce585f Mon Sep 17 00:00:00 2001 From: Heng Ge Date: Sun, 11 Jan 2026 12:08:18 -0800 Subject: [PATCH 09/10] fix: update JNI code for unenforced_primary_key field change Use is_unenforced_primary_key() method instead of the removed unenforced_primary_key field. Co-Authored-By: Claude Opus 4.5 --- java/lance-jni/src/schema.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/lance-jni/src/schema.rs b/java/lance-jni/src/schema.rs index b9c3d70ef83..9c318afa82f 100644 --- a/java/lance-jni/src/schema.rs +++ b/java/lance-jni/src/schema.rs @@ -57,7 +57,7 @@ pub fn convert_to_java_field<'local>( JValue::Object(&JObject::null()), JValue::Object(&metadata), JValue::Object(&children), - JValue::Bool(lance_field.unenforced_primary_key as jboolean), + JValue::Bool(lance_field.is_unenforced_primary_key() as jboolean), ], )?; From 118407a476942a38574780e4f1d486470a41e056 Mon Sep 17 00:00:00 2001 From: Heng Ge Date: Sun, 11 Jan 2026 12:36:47 -0800 Subject: [PATCH 10/10] feat: add unenforced primary key position to Java and Python bindings - Java: Add unenforcedPrimaryKeyPosition field and getter to LanceField - Python: Add is_unenforced_primary_key() and unenforced_primary_key_position() methods to LanceField Co-Authored-By: Claude Opus 4.5 --- java/lance-jni/src/schema.rs | 4 +++- .../java/org/lance/schema/LanceField.java | 19 ++++++++++++++++++- python/python/lance/lance/schema.pyi | 4 +++- python/src/schema.rs | 15 +++++++++++++++ 4 files changed, 39 insertions(+), 3 deletions(-) diff --git a/java/lance-jni/src/schema.rs b/java/lance-jni/src/schema.rs index 9c318afa82f..d0330d1bf25 100644 --- a/java/lance-jni/src/schema.rs +++ b/java/lance-jni/src/schema.rs @@ -44,7 +44,8 
@@ pub fn convert_to_java_field<'local>( + "ZLorg/apache/arrow/vector/types/pojo/ArrowType;" + "Lorg/apache/arrow/vector/types/pojo/DictionaryEncoding;" + "Ljava/util/Map;" - + "Ljava/util/List;Z)V"; + + "Ljava/util/List;ZI)V"; + let pk_position = lance_field.unenforced_primary_key_position.unwrap_or(0) as jint; let field_obj = env.new_object( "org/lance/schema/LanceField", ctor_sig.as_str(), @@ -58,6 +59,7 @@ pub fn convert_to_java_field<'local>( JValue::Object(&metadata), JValue::Object(&children), JValue::Bool(lance_field.is_unenforced_primary_key() as jboolean), + JValue::Int(pk_position), ], )?; diff --git a/java/src/main/java/org/lance/schema/LanceField.java b/java/src/main/java/org/lance/schema/LanceField.java index 4ede9ccb864..08df8c07158 100644 --- a/java/src/main/java/org/lance/schema/LanceField.java +++ b/java/src/main/java/org/lance/schema/LanceField.java @@ -22,6 +22,7 @@ import java.util.List; import java.util.Map; import java.util.Optional; +import java.util.OptionalInt; import java.util.stream.Collectors; public class LanceField { @@ -34,6 +35,7 @@ public class LanceField { private final Map metadata; private final List children; private final boolean isUnenforcedPrimaryKey; + private final int unenforcedPrimaryKeyPosition; LanceField( int id, @@ -44,7 +46,8 @@ public class LanceField { DictionaryEncoding dictionaryEncoding, Map metadata, List children, - boolean isUnenforcedPrimaryKey) { + boolean isUnenforcedPrimaryKey, + int unenforcedPrimaryKeyPosition) { this.id = id; this.parentId = parentId; this.name = name; @@ -54,6 +57,7 @@ public class LanceField { this.metadata = metadata; this.children = children; this.isUnenforcedPrimaryKey = isUnenforcedPrimaryKey; + this.unenforcedPrimaryKeyPosition = unenforcedPrimaryKeyPosition; } public int getId() { @@ -92,6 +96,18 @@ public boolean isUnenforcedPrimaryKey() { return isUnenforcedPrimaryKey; } + /** + * Get the position of this field within a composite primary key. 
+ * + * @return the 1-based position if explicitly set, or empty if using schema field id ordering + */ + public OptionalInt getUnenforcedPrimaryKeyPosition() { + if (unenforcedPrimaryKeyPosition > 0) { + return OptionalInt.of(unenforcedPrimaryKeyPosition); + } + return OptionalInt.empty(); + } + public Field asArrowField() { List arrowChildren = children.stream().map(LanceField::asArrowField).collect(Collectors.toList()); @@ -110,6 +126,7 @@ public String toString() { .add("dictionaryEncoding", dictionaryEncoding) .add("children", children) .add("isUnenforcedPrimaryKey", isUnenforcedPrimaryKey) + .add("unenforcedPrimaryKeyPosition", unenforcedPrimaryKeyPosition) .add("metadata", metadata) .toString(); } diff --git a/python/python/lance/lance/schema.pyi b/python/python/lance/lance/schema.pyi index 6bbb54a4b4d..51a1459779d 100644 --- a/python/python/lance/lance/schema.pyi +++ b/python/python/lance/lance/schema.pyi @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright The Lance Authors -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional import pyarrow as pa @@ -9,6 +9,8 @@ class LanceField: def name(self) -> str: ... def id(self) -> int: ... def children(self) -> List[LanceField]: ... + def is_unenforced_primary_key(self) -> bool: ... + def unenforced_primary_key_position(self) -> Optional[int]: ... class LanceSchema: def fields(self) -> List[LanceField]: ... diff --git a/python/src/schema.rs b/python/src/schema.rs index 1a54618416e..d257c009a72 100644 --- a/python/src/schema.rs +++ b/python/src/schema.rs @@ -57,6 +57,21 @@ impl LanceField { Ok(self.0.metadata.clone()) } + /// Check if this field is part of an unenforced primary key. + pub fn is_unenforced_primary_key(&self) -> bool { + self.0.is_unenforced_primary_key() + } + + /// Get the position of this field within a composite primary key. 
+ /// + /// Returns the 1-based position if explicitly set, or None if not part of + /// a primary key or using schema field id ordering. + pub fn unenforced_primary_key_position(&self) -> Option<u32> { + self.0 + .unenforced_primary_key_position + .filter(|&pos| pos > 0) + } + pub fn to_arrow(&self) -> PyArrowType { PyArrowType((&self.0).into()) }