diff --git a/native/Cargo.lock b/native/Cargo.lock index 55c648edb9..43e37c8b42 100644 --- a/native/Cargo.lock +++ b/native/Cargo.lock @@ -1539,6 +1539,7 @@ dependencies = [ "tikv-jemallocator", "tokio", "url", + "uuid", "zstd", ] diff --git a/native/core/Cargo.toml b/native/core/Cargo.toml index c3d7dac843..92af3e2388 100644 --- a/native/core/Cargo.toml +++ b/native/core/Cargo.toml @@ -77,6 +77,7 @@ reqwest = { version = "0.12", default-features = false, features = ["rustls-tls- object_store_opendal = {version = "0.54.0", optional = true} hdfs-sys = {version = "0.3", optional = true, features = ["hdfs_3_3"]} opendal = { version ="0.54.1", optional = true, features = ["services-hdfs"] } +uuid = "1.0" [target.'cfg(target_os = "linux")'.dependencies] procfs = "0.18.0" diff --git a/native/core/src/parquet/parquet_support.rs b/native/core/src/parquet/parquet_support.rs index 00208e3161..0b5c45d24d 100644 --- a/native/core/src/parquet/parquet_support.rs +++ b/native/core/src/parquet/parquet_support.rs @@ -16,7 +16,7 @@ // under the License. use crate::execution::operators::ExecutionError; -use arrow::array::{ListArray, MapArray}; +use arrow::array::{FixedSizeBinaryArray, ListArray, MapArray, StringArray}; use arrow::buffer::NullBuffer; use arrow::compute::can_cast_types; use arrow::datatypes::{FieldRef, Fields}; @@ -200,6 +200,28 @@ fn parquet_convert_array( (Map(_, ordered_from), Map(_, ordered_to)) if ordered_from == ordered_to => parquet_convert_map_to_map(array.as_map(), to_type, parquet_options, *ordered_to) , + // Iceberg stores UUIDs as 16-byte fixed binary but Spark expects string representation. + // Arrow doesn't support casting FixedSizeBinary to Utf8, so we handle it manually. 
+ (FixedSizeBinary(16), Utf8) => { + let binary_array = array + .as_any() + .downcast_ref::<FixedSizeBinaryArray>() + .expect("Expected a FixedSizeBinaryArray"); + + let string_array: StringArray = binary_array + .iter() + .map(|opt_bytes| { + opt_bytes.map(|bytes| { + let uuid = uuid::Uuid::from_bytes( + bytes.try_into().expect("Expected 16 bytes") + ); + uuid.to_string() + }) + }) + .collect(); + + Ok(Arc::new(string_array)) + } // If Arrow cast supports the cast, delegate the cast to Arrow _ if can_cast_types(from_type, to_type) => { Ok(cast_with_options(&array, to_type, &PARQUET_OPTIONS)?)