Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
852 changes: 852 additions & 0 deletions python/python/tests/test_map_type.py

Large diffs are not rendered by default.

19 changes: 19 additions & 0 deletions rust/lance-core/src/datatypes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,10 @@ impl LogicalType {
fn is_blob(&self) -> bool {
self.0 == BLOB_LOGICAL_TYPE
}

fn is_map(&self) -> bool {
self.0 == "map"
}
}

impl From<&str> for LogicalType {
Expand Down Expand Up @@ -196,6 +200,21 @@ impl TryFrom<&DataType> for LogicalType {
}
}
DataType::FixedSizeBinary(len) => format!("fixed_size_binary:{}", *len),
DataType::Map(_, keys_sorted) => {
// TODO: We only support keys_sorted=false for now,
// because converting a rust arrow map field to the python arrow field will
// lose the keys_sorted property.
if *keys_sorted {
return Err(Error::Schema {
message: format!(
"Unsupported map data type with keys_sorted=true: {:?}",
dt
),
location: location!(),
});
}
"map".to_string()
}
_ => {
return Err(Error::Schema {
message: format!("Unsupported data type: {:?}", dt),
Expand Down
170 changes: 160 additions & 10 deletions rust/lance-core/src/datatypes/field.rs
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,9 @@ impl Field {
lt if lt.is_struct() => {
DataType::Struct(self.children.iter().map(ArrowField::from).collect())
}
lt if lt.is_map() => {
DataType::Map(Arc::new(ArrowField::from(&self.children[0])), false)
}
lt => DataType::try_from(lt).unwrap(),
}
}
Expand Down Expand Up @@ -250,11 +253,17 @@ impl Field {
}

pub fn apply_projection(&self, projection: &Projection) -> Option<Self> {
let children = self
.children
.iter()
.filter_map(|c| c.apply_projection(projection))
.collect::<Vec<_>>();
// For Map types, we must preserve ALL children (entries struct with key/value)
// Map internal structure should not be subject to projection filtering
let children = if self.logical_type.is_map() {
// Map field: keep all children intact (entries struct and its key/value fields)
self.children.clone()
} else {
self.children
.iter()
.filter_map(|c| c.apply_projection(projection))
.collect::<Vec<_>>()
};

// The following case is invalid:
// - This is a nested field (has children)
Expand Down Expand Up @@ -691,7 +700,8 @@ impl Field {
Ok(cloned)
}
(DataType::List(_), DataType::List(_))
| (DataType::LargeList(_), DataType::LargeList(_)) => {
| (DataType::LargeList(_), DataType::LargeList(_))
| (DataType::Map(_, _), DataType::Map(_, _)) => {
let projected =
self.children[0].project_by_field(&other.children[0], on_type_mismatch)?;
let mut cloned = self.clone();
Expand Down Expand Up @@ -769,7 +779,9 @@ impl Field {

if matches!(
(&self_type, &other_type),
(DataType::Struct(_), DataType::Struct(_)) | (DataType::List(_), DataType::List(_))
(DataType::Struct(_), DataType::Struct(_))
| (DataType::List(_), DataType::List(_))
| (DataType::Map(_, _), DataType::Map(_, _))
) {
// Blob v2 uses a struct logical type for descriptors, which differs from the logical
// input struct (data/uri). When intersecting schemas for projection we want to keep
Expand Down Expand Up @@ -1023,16 +1035,52 @@ impl TryFrom<&ArrowField> for Field {
type Error = Error;

fn try_from(field: &ArrowField) -> Result<Self> {
let mut metadata = field.metadata().clone();
let children = match field.data_type() {
DataType::Struct(children) => children
.iter()
.map(|f| Self::try_from(f.as_ref()))
.collect::<Result<_>>()?,
DataType::List(item) => vec![Self::try_from(item.as_ref())?],
DataType::LargeList(item) => vec![Self::try_from(item.as_ref())?],
DataType::Map(entries, keys_sorted) => {
// TODO: We only support keys_sorted=false for now,
// because converting a rust arrow map field to the python arrow field will
// lose the keys_sorted property.
if *keys_sorted {
Comment thread
Xuanwo marked this conversation as resolved.
return Err(Error::Schema {
message: "Unsupported map field with keys_sorted=true".to_string(),
location: location!(),
});
}
// Validate Map entries follow Arrow specification
let DataType::Struct(struct_fields) = entries.data_type() else {
return Err(Error::Schema {
message: "Map entries field must be a Struct<key, value>".to_string(),
location: location!(),
});
};
if struct_fields.len() < 2 {
return Err(Error::Schema {
message: "Map entries struct must contain both key and value fields"
.to_string(),
location: location!(),
});
}
let key_field = &struct_fields[0];
if key_field.is_nullable() {
return Err(Error::Schema {
message: format!(
"Map key field '{}' must be non-nullable according to Arrow Map specification",
key_field.name()
),
location: location!(),
});
}
vec![Self::try_from(entries.as_ref())?]
}
_ => vec![],
};
let mut metadata = field.metadata().clone();
let unenforced_primary_key = metadata
.get(LANCE_UNENFORCED_PRIMARY_KEY)
.map(|s| matches!(s.to_lowercase().as_str(), "true" | "1" | "yes"))
Expand Down Expand Up @@ -1063,8 +1111,10 @@ impl TryFrom<&ArrowField> for Field {
dt if dt.is_fixed_stride() => Some(Encoding::Plain),
dt if dt.is_binary_like() => Some(Encoding::VarBinary),
DataType::Dictionary(_, _) => Some(Encoding::Dictionary),
// Use plain encoder to store the offsets of list.
DataType::List(_) | DataType::LargeList(_) => Some(Encoding::Plain),
// Use plain encoder to store the offsets of list and map.
DataType::List(_) | DataType::LargeList(_) | DataType::Map(_, _) => {
Some(Encoding::Plain)
}
_ => None,
},
metadata,
Expand Down Expand Up @@ -1206,6 +1256,23 @@ mod tests {
.0,
"struct"
);

assert_eq!(
LogicalType::try_from(&DataType::Map(
Arc::new(ArrowField::new(
"entries",
DataType::Struct(Fields::from(vec![
ArrowField::new("key", DataType::Utf8, false),
ArrowField::new("value", DataType::Int32, true),
])),
true
)),
false
))
.unwrap()
.0,
"map"
);
}

#[test]
Expand All @@ -1225,6 +1292,89 @@ mod tests {
assert_eq!(ArrowField::from(&field), arrow_field);
}

#[test]
fn map_key_must_be_non_nullable() {
let entries_field = Arc::new(ArrowField::new(
"entries",
DataType::Struct(Fields::from(vec![
ArrowField::new("key", DataType::Utf8, true), // invalid: nullable key
ArrowField::new("value", DataType::Int32, true),
])),
false,
));
let arrow_field = ArrowField::new("props", DataType::Map(entries_field, false), true);

let result = Field::try_from(&arrow_field);
assert!(result.is_err(), "Nullable map key should be rejected");
}

#[test]
fn map_keys_sorted_unsupported() {
let entries_field = Arc::new(ArrowField::new(
"entries",
DataType::Struct(Fields::from(vec![
ArrowField::new("key", DataType::Utf8, false),
ArrowField::new("value", DataType::Int32, true),
])),
false,
));

// Test that keys_sorted=true is rejected
let arrow_field_sorted = ArrowField::new(
"map_field",
DataType::Map(entries_field.clone(), true),
true,
);
let result = Field::try_from(&arrow_field_sorted);
assert!(result.is_err(), "keys_sorted=true should be rejected");
assert!(result.unwrap_err().to_string().contains("keys_sorted=true"));

// Test that keys_sorted=false is supported
let arrow_field_unsorted =
ArrowField::new("map_field", DataType::Map(entries_field, false), true);
let lance_field_unsorted = Field::try_from(&arrow_field_unsorted).unwrap();

// Verify conversion back to ArrowField preserves keys_sorted=false
let converted_field_unsorted = ArrowField::from(&lance_field_unsorted);
match converted_field_unsorted.data_type() {
DataType::Map(_, keys_sorted) => assert!(!keys_sorted, "keys_sorted should be false"),
_ => panic!("Expected Map type"),
}
}

#[test]
fn map_entries_must_be_struct() {
let entries_field = Arc::new(ArrowField::new("entries", DataType::Utf8, false));
let arrow_field = ArrowField::new("map_field", DataType::Map(entries_field, false), true);

let err = Field::try_from(&arrow_field).unwrap_err();
assert!(
err.to_string()
.contains("Map entries field must be a Struct"),
"Expected struct requirement error, got {err}"
);
}

#[test]
fn map_entries_struct_needs_key_and_value() {
let entries_field = Arc::new(ArrowField::new(
"entries",
DataType::Struct(Fields::from(vec![ArrowField::new(
"key",
DataType::Utf8,
false,
)])),
false,
));
let arrow_field = ArrowField::new("map_field", DataType::Map(entries_field, false), true);

let err = Field::try_from(&arrow_field).unwrap_err();
assert!(
err.to_string().contains("must contain both key and value"),
"Expected both fields requirement error, got {err}"
);
}

#[test]
fn test_project_by_field_null_type() {
let f1: Field = ArrowField::new("a", DataType::Null, true)
Expand Down
10 changes: 10 additions & 0 deletions rust/lance-core/src/datatypes/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -697,6 +697,16 @@ impl TryFrom<&ArrowSchema> for Schema {
location: location!(),
});
}

if ancestor.logical_type.is_map() {
return Err(Error::Schema {
message: format!(
"Primary key column must not be in a map type: {}",
ancestor
),
location: location!(),
});
}
}
}
}
Expand Down
Loading