diff --git a/docs/src/format/table/index.md b/docs/src/format/table/index.md index 0114feeb0a1..a1105592ae6 100644 --- a/docs/src/format/table/index.md +++ b/docs/src/format/table/index.md @@ -25,7 +25,7 @@ a monotonically increasing version number, and an optional reference to the inde ## Schema & Fields -The schema of the table is written as a series of fields, plus a schema metadata map. +The schema of the table is written as a series of fields, plus a schema metadata map. The data types generally have a 1-1 correspondence with the Apache Arrow data types. Each field, including nested fields, have a unique integer id. At initial table creation time, fields are assigned ids in depth-first order. Afterwards, field IDs are assigned incrementally for newly added fields. @@ -42,6 +42,31 @@ See [File Format Encoding Specification](../file/encoding.md) for details on ava +### Unenforced Primary Key + +Lance supports defining an unenforced primary key through field metadata. +This is useful for deduplication during merge-insert operations and other use cases that benefit from logical row identity. +The primary key is "unenforced" meaning Lance does not always validate uniqueness constraints. +Users can use specific workloads like merge-insert to enforce it if necessary. +The primary key is fixed after initial setting and must not be updated or removed. + +A primary key field must satisfy: + +- The field, and all its ancestors, must not be nullable. +- The field must be a leaf field (primitive data type without children). +- The field must not be within a list or map type. + +When using an Arrow schema to create a Lance table, add the following metadata to the Arrow field to mark it as part of the primary key: + +- `lance-schema:unenforced-primary-key`: Set to `true`, `1`, or `yes` (case-insensitive) to indicate the field is part of the primary key. 
+- `lance-schema:unenforced-primary-key:position` (optional): A 1-based integer specifying the position within a composite primary key. + +For composite primary keys with multiple columns, the position determines the primary key field ordering: + +- When positions are specified, fields are ordered by their position values (1, 2, 3, ...). +- When positions are not specified, fields are ordered by their schema field id. +- Fields with explicit positions are ordered before fields without. + ## Fragments ![Fragment Structure](../../images/fragment_structure.png) diff --git a/java/lance-jni/src/schema.rs b/java/lance-jni/src/schema.rs index b9c3d70ef83..d0330d1bf25 100644 --- a/java/lance-jni/src/schema.rs +++ b/java/lance-jni/src/schema.rs @@ -44,7 +44,8 @@ pub fn convert_to_java_field<'local>( + "ZLorg/apache/arrow/vector/types/pojo/ArrowType;" + "Lorg/apache/arrow/vector/types/pojo/DictionaryEncoding;" + "Ljava/util/Map;" - + "Ljava/util/List;Z)V"; + + "Ljava/util/List;ZI)V"; + let pk_position = lance_field.unenforced_primary_key_position.unwrap_or(0) as jint; let field_obj = env.new_object( "org/lance/schema/LanceField", ctor_sig.as_str(), @@ -57,7 +58,8 @@ pub fn convert_to_java_field<'local>( JValue::Object(&JObject::null()), JValue::Object(&metadata), JValue::Object(&children), - JValue::Bool(lance_field.unenforced_primary_key as jboolean), + JValue::Bool(lance_field.is_unenforced_primary_key() as jboolean), + JValue::Int(pk_position), ], )?; diff --git a/java/src/main/java/org/lance/schema/LanceField.java b/java/src/main/java/org/lance/schema/LanceField.java index 4ede9ccb864..08df8c07158 100644 --- a/java/src/main/java/org/lance/schema/LanceField.java +++ b/java/src/main/java/org/lance/schema/LanceField.java @@ -22,6 +22,7 @@ import java.util.List; import java.util.Map; import java.util.Optional; +import java.util.OptionalInt; import java.util.stream.Collectors; public class LanceField { @@ -34,6 +35,7 @@ public class LanceField { private final Map metadata; 
private final List children; private final boolean isUnenforcedPrimaryKey; + private final int unenforcedPrimaryKeyPosition; LanceField( int id, @@ -44,7 +46,8 @@ public class LanceField { DictionaryEncoding dictionaryEncoding, Map metadata, List children, - boolean isUnenforcedPrimaryKey) { + boolean isUnenforcedPrimaryKey, + int unenforcedPrimaryKeyPosition) { this.id = id; this.parentId = parentId; this.name = name; @@ -54,6 +57,7 @@ public class LanceField { this.metadata = metadata; this.children = children; this.isUnenforcedPrimaryKey = isUnenforcedPrimaryKey; + this.unenforcedPrimaryKeyPosition = unenforcedPrimaryKeyPosition; } public int getId() { @@ -92,6 +96,18 @@ public boolean isUnenforcedPrimaryKey() { return isUnenforcedPrimaryKey; } + /** + * Get the position of this field within a composite primary key. + * + * @return the 1-based position if explicitly set, or empty if using schema field id ordering + */ + public OptionalInt getUnenforcedPrimaryKeyPosition() { + if (unenforcedPrimaryKeyPosition > 0) { + return OptionalInt.of(unenforcedPrimaryKeyPosition); + } + return OptionalInt.empty(); + } + public Field asArrowField() { List arrowChildren = children.stream().map(LanceField::asArrowField).collect(Collectors.toList()); @@ -110,6 +126,7 @@ public String toString() { .add("dictionaryEncoding", dictionaryEncoding) .add("children", children) .add("isUnenforcedPrimaryKey", isUnenforcedPrimaryKey) + .add("unenforcedPrimaryKeyPosition", unenforcedPrimaryKeyPosition) .add("metadata", metadata) .toString(); } diff --git a/protos/file.proto b/protos/file.proto index 4245b354a21..db5971fe61d 100644 --- a/protos/file.proto +++ b/protos/file.proto @@ -166,6 +166,11 @@ message Field { bool unenforced_primary_key = 12; + // Position of this field in the primary key (1-based). + // 0 means no explicit position: a primary key field then falls back to schema field id ordering. + // When set to a positive value, primary key fields are ordered by this position. 
+ uint32 unenforced_primary_key_position = 13; + // DEPRECATED ---------------------------------------------------------------- // Deprecated: Only used in V1 file format. V2 uses variable encodings defined diff --git a/python/python/lance/lance/schema.pyi b/python/python/lance/lance/schema.pyi index 6bbb54a4b4d..51a1459779d 100644 --- a/python/python/lance/lance/schema.pyi +++ b/python/python/lance/lance/schema.pyi @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright The Lance Authors -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional import pyarrow as pa @@ -9,6 +9,8 @@ class LanceField: def name(self) -> str: ... def id(self) -> int: ... def children(self) -> List[LanceField]: ... + def is_unenforced_primary_key(self) -> bool: ... + def unenforced_primary_key_position(self) -> Optional[int]: ... class LanceSchema: def fields(self) -> List[LanceField]: ... diff --git a/python/src/schema.rs b/python/src/schema.rs index 1a54618416e..d257c009a72 100644 --- a/python/src/schema.rs +++ b/python/src/schema.rs @@ -57,6 +57,21 @@ impl LanceField { Ok(self.0.metadata.clone()) } + /// Check if this field is part of an unenforced primary key. + pub fn is_unenforced_primary_key(&self) -> bool { + self.0.is_unenforced_primary_key() + } + + /// Get the position of this field within a composite primary key. + /// + /// Returns the 1-based position if explicitly set, or None if not part of + /// a primary key or using schema field id ordering. 
+ pub fn unenforced_primary_key_position(&self) -> Option { + self.0 + .unenforced_primary_key_position + .filter(|&pos| pos > 0) + } + pub fn to_arrow(&self) -> PyArrowType { PyArrowType((&self.0).into()) } diff --git a/rust/lance-core/src/datatypes/field.rs b/rust/lance-core/src/datatypes/field.rs index 1df60d65611..62c97a3914c 100644 --- a/rust/lance-core/src/datatypes/field.rs +++ b/rust/lance-core/src/datatypes/field.rs @@ -42,6 +42,13 @@ use crate::{ /// (3) The field must not be within a list type. pub const LANCE_UNENFORCED_PRIMARY_KEY: &str = "lance-schema:unenforced-primary-key"; +/// Use this config key in Arrow field metadata to specify the position of a primary key column. +/// The value is a 1-based integer indicating the order within the composite primary key. +/// When specified, primary key fields are ordered by this position value. +/// When not specified, primary key fields are ordered by their schema field id. +pub const LANCE_UNENFORCED_PRIMARY_KEY_POSITION: &str = + "lance-schema:unenforced-primary-key:position"; + fn has_blob_v2_extension(field: &ArrowField) -> bool { field .metadata() @@ -148,7 +155,11 @@ pub struct Field { /// Dictionary value array if this field is dictionary. pub dictionary: Option, - pub unenforced_primary_key: bool, + + /// Position of this field in the primary key (1-based). + /// None means the field is not part of the primary key. + /// Some(0) is a key field with no explicit position; Some(n), n >= 1, is the nth key column. + pub unenforced_primary_key_position: Option, } impl Field { @@ -574,7 +585,7 @@ impl Field { nullable: self.nullable, children: vec![], dictionary: self.dictionary.clone(), - unenforced_primary_key: self.unenforced_primary_key, + unenforced_primary_key_position: self.unenforced_primary_key_position, }; if path_components.is_empty() { // Project stops here, copy all the remaining children. 
@@ -845,7 +856,7 @@ impl Field { nullable: self.nullable, children, dictionary: self.dictionary.clone(), - unenforced_primary_key: self.unenforced_primary_key, + unenforced_primary_key_position: self.unenforced_primary_key_position, }; return Ok(f); } @@ -908,7 +919,7 @@ impl Field { nullable: self.nullable, children, dictionary: self.dictionary.clone(), - unenforced_primary_key: self.unenforced_primary_key, + unenforced_primary_key_position: self.unenforced_primary_key_position, }) } } @@ -1038,6 +1049,11 @@ impl Field { pub fn is_leaf(&self) -> bool { self.children.is_empty() } + + /// Return true if the field is part of the (unenforced) primary key. + pub fn is_unenforced_primary_key(&self) -> bool { + self.unenforced_primary_key_position.is_some() + } } impl fmt::Display for Field { @@ -1114,10 +1130,16 @@ impl TryFrom<&ArrowField> for Field { } _ => vec![], }; - let unenforced_primary_key = metadata - .get(LANCE_UNENFORCED_PRIMARY_KEY) - .map(|s| matches!(s.to_lowercase().as_str(), "true" | "1" | "yes")) - .unwrap_or(false); + let unenforced_primary_key_position = metadata + .get(LANCE_UNENFORCED_PRIMARY_KEY_POSITION) + .and_then(|s| s.parse::().ok()) + .or_else(|| { + // Backward compatibility: use 0 for legacy boolean flag + metadata + .get(LANCE_UNENFORCED_PRIMARY_KEY) + .filter(|s| matches!(s.to_lowercase().as_str(), "true" | "1" | "yes")) + .map(|_| 0) + }); let is_blob_v2 = has_blob_v2_extension(field); if is_blob_v2 { @@ -1154,7 +1176,7 @@ impl TryFrom<&ArrowField> for Field { nullable: field.is_nullable(), children, dictionary: None, - unenforced_primary_key, + unenforced_primary_key_position, }) } } diff --git a/rust/lance-core/src/datatypes/schema.rs b/rust/lance-core/src/datatypes/schema.rs index 242dea3315b..f0cffefe271 100644 --- a/rust/lance-core/src/datatypes/schema.rs +++ b/rust/lance-core/src/datatypes/schema.rs @@ -111,11 +111,27 @@ impl<'a> Iterator for SchemaFieldIterPreOrder<'a> { } impl Schema { - /// The unenforced primary key fields in 
the schema + /// The unenforced primary key fields in the schema, ordered by position. + /// + /// Fields with explicit positions (1, 2, 3, ...) are ordered by their position value. + /// Fields without explicit positions (using the legacy boolean flag) are ordered + /// by their schema field id and come after fields with explicit positions. pub fn unenforced_primary_key(&self) -> Vec<&Field> { - self.fields_pre_order() - .filter(|f| f.unenforced_primary_key) - .collect::>() + let mut pk_fields: Vec<&Field> = self + .fields_pre_order() + .filter(|f| f.is_unenforced_primary_key()) + .collect(); + + pk_fields.sort_by_key(|f| { + let pk_position = f.unenforced_primary_key_position.unwrap_or(0); + if pk_position > 0 { + (false, pk_position as i32, f.id) + } else { + (true, f.id, f.id) + } + }); + + pk_fields } pub fn compare_with_options(&self, expected: &Self, options: &SchemaCompareOptions) -> bool { @@ -2599,4 +2615,111 @@ mod tests { .contains(error_message_contains[idx])); } } + + #[test] + fn test_schema_unenforced_primary_key_ordering() { + use crate::datatypes::field::LANCE_UNENFORCED_PRIMARY_KEY_POSITION; + + // When positions are specified, fields are ordered by their position values + let arrow_schema = ArrowSchema::new(vec![ + ArrowField::new("a", DataType::Int32, false).with_metadata( + vec![ + ( + "lance-schema:unenforced-primary-key".to_owned(), + "true".to_owned(), + ), + ( + LANCE_UNENFORCED_PRIMARY_KEY_POSITION.to_owned(), + "2".to_owned(), + ), + ] + .into_iter() + .collect::>(), + ), + ArrowField::new("b", DataType::Int64, false).with_metadata( + vec![ + ( + "lance-schema:unenforced-primary-key".to_owned(), + "true".to_owned(), + ), + ( + LANCE_UNENFORCED_PRIMARY_KEY_POSITION.to_owned(), + "1".to_owned(), + ), + ] + .into_iter() + .collect::>(), + ), + ]); + let schema = Schema::try_from(&arrow_schema).unwrap(); + let pk_fields = schema.unenforced_primary_key(); + assert_eq!(pk_fields.len(), 2); + assert_eq!(pk_fields[0].name, "b"); + 
assert_eq!(pk_fields[1].name, "a"); + + // When positions are not specified, fields are ordered by their schema field id + let arrow_schema = ArrowSchema::new(vec![ + ArrowField::new("c", DataType::Int32, false).with_metadata( + vec![( + "lance-schema:unenforced-primary-key".to_owned(), + "true".to_owned(), + )] + .into_iter() + .collect::>(), + ), + ArrowField::new("d", DataType::Int64, false).with_metadata( + vec![( + "lance-schema:unenforced-primary-key".to_owned(), + "true".to_owned(), + )] + .into_iter() + .collect::>(), + ), + ]); + let schema = Schema::try_from(&arrow_schema).unwrap(); + let pk_fields = schema.unenforced_primary_key(); + assert_eq!(pk_fields.len(), 2); + assert_eq!(pk_fields[0].name, "c"); + assert_eq!(pk_fields[1].name, "d"); + + // Fields with explicit positions are ordered before fields without + let arrow_schema = ArrowSchema::new(vec![ + ArrowField::new("e", DataType::Int32, false).with_metadata( + vec![( + "lance-schema:unenforced-primary-key".to_owned(), + "true".to_owned(), + )] + .into_iter() + .collect::>(), + ), + ArrowField::new("f", DataType::Int64, false).with_metadata( + vec![ + ( + "lance-schema:unenforced-primary-key".to_owned(), + "true".to_owned(), + ), + ( + LANCE_UNENFORCED_PRIMARY_KEY_POSITION.to_owned(), + "1".to_owned(), + ), + ] + .into_iter() + .collect::>(), + ), + ArrowField::new("g", DataType::Utf8, false).with_metadata( + vec![( + "lance-schema:unenforced-primary-key".to_owned(), + "true".to_owned(), + )] + .into_iter() + .collect::>(), + ), + ]); + let schema = Schema::try_from(&arrow_schema).unwrap(); + let pk_fields = schema.unenforced_primary_key(); + assert_eq!(pk_fields.len(), 3); + assert_eq!(pk_fields[0].name, "f"); + assert_eq!(pk_fields[1].name, "e"); + assert_eq!(pk_fields[2].name, "g"); + } } diff --git a/rust/lance-file/src/datatypes.rs b/rust/lance-file/src/datatypes.rs index 09c5076f86d..c0966bf0268 100644 --- a/rust/lance-file/src/datatypes.rs +++ b/rust/lance-file/src/datatypes.rs @@ -45,7 
+45,13 @@ impl From<&pb::Field> for Field { nullable: field.nullable, children: vec![], dictionary: field.dictionary.as_ref().map(Dictionary::from), - unenforced_primary_key: field.unenforced_primary_key, + unenforced_primary_key_position: if field.unenforced_primary_key_position > 0 { + Some(field.unenforced_primary_key_position) + } else if field.unenforced_primary_key { + Some(0) + } else { + None + }, } } } @@ -77,7 +83,8 @@ impl From<&Field> for pb::Field { .map(|name| name.to_owned()) .unwrap_or_default(), r#type: 0, - unenforced_primary_key: field.unenforced_primary_key, + unenforced_primary_key: field.unenforced_primary_key_position.is_some(), + unenforced_primary_key_position: field.unenforced_primary_key_position.unwrap_or(0), } } }