From e08bcaaa8a6e0aa7c8e7644cd24515311831ca00 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Wed, 24 Sep 2025 10:03:35 -0700 Subject: [PATCH 01/10] [Variant] Fix several incorrect variant integration test cases --- parquet-variant-compute/src/variant_array.rs | 75 ++++++++------------ parquet/tests/variant_integration.rs | 16 ++--- 2 files changed, 37 insertions(+), 54 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index ed4b6fe37e47..e6ee7eb9ced1 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -23,7 +23,6 @@ use arrow::buffer::NullBuffer; use arrow::compute::cast; use arrow::datatypes::{ Date32Type, Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, - UInt16Type, UInt32Type, UInt64Type, UInt8Type, }; use arrow_schema::extension::ExtensionType; use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields}; @@ -353,37 +352,20 @@ impl VariantArray { /// Note: Does not do deep validation of the [`Variant`], so it is up to the /// caller to ensure that the metadata and value were constructed correctly. pub fn value(&self, index: usize) -> Variant<'_, '_> { - match &self.shredding_state { - ShreddingState::Unshredded { value, .. } => { - // Unshredded case - Variant::new(self.metadata.value(index), value.value(index)) - } - ShreddingState::Typed { typed_value, .. } => { - // Typed case (formerly PerfectlyShredded) - if typed_value.is_null(index) { - Variant::Null - } else { - typed_value_to_variant(typed_value, index) - } + let typed_value = self.typed_value_field(); + let value = self.value_field(); + match (typed_value, value) { + // Always prefer typed_value, if available + (Some(typed_value), value) if typed_value.is_valid(index) => { + typed_value_to_variant(typed_value, value, index) } - ShreddingState::PartiallyShredded { - value, typed_value, .. - } => { - // PartiallyShredded case (formerly ImperfectlyShredded) - if typed_value.is_null(index) { - Variant::new(self.metadata.value(index), value.value(index)) - } else { - typed_value_to_variant(typed_value, index) - } - } - ShreddingState::AllNull => { - // AllNull case: neither value nor typed_value fields exist - // NOTE: This handles the case where neither value nor typed_value fields exist. - // For top-level variants, this returns Variant::Null (JSON null). - // For shredded object fields, this technically should indicate SQL NULL, - // but the current API cannot distinguish these contexts. - Variant::Null + // Otherwise fall back to value, if available + (_, Some(value)) if value.is_valid(index) => { + Variant::new(self.metadata.value(index), value.value(index)) } + // It is technically invalid for neither value nor typed_value fields to be available, + // but the spec specifically requires readers to return Variant::Null in this case. + _ => Variant::Null, } } @@ -796,8 +778,17 @@ impl StructArrayBuilder { } /// returns the non-null element at index as a Variant -fn typed_value_to_variant(typed_value: &ArrayRef, index: usize) -> Variant<'_, '_> { - match typed_value.data_type() { +fn typed_value_to_variant<'a>( + typed_value: &'a ArrayRef, + value: Option<&BinaryViewArray>, + index: usize, +) -> Variant<'a, 'a> { + let data_type = typed_value.data_type(); + if value.is_some_and(|v| !matches!(data_type, DataType::Struct(_)) && v.is_valid(index)) { + // Only a partially shredded struct is allowed to have values for both columns + panic!("Invalid variant, conflicting value and typed_value"); + } + match data_type { DataType::Boolean => { let boolean_array = typed_value.as_boolean(); let value = boolean_array.value(index); @@ -843,18 +834,6 @@ fn typed_value_to_variant(typed_value: &ArrayRef, index: usize) -> Variant<'_, ' DataType::Int64 => { primitive_conversion_single_value!(Int64Type, typed_value, index) } - DataType::UInt8 => { - primitive_conversion_single_value!(UInt8Type, typed_value, index) - } - DataType::UInt16 => { - primitive_conversion_single_value!(UInt16Type, typed_value, index) - } - DataType::UInt32 => { - primitive_conversion_single_value!(UInt32Type, typed_value, index) - } - DataType::UInt64 => { - primitive_conversion_single_value!(UInt64Type, typed_value, index) - } DataType::Float16 => { primitive_conversion_single_value!(Float16Type, typed_value, index) } @@ -898,6 +877,14 @@ fn cast_to_binary_view_arrays(array: &dyn Array) -> Result /// replaces all instances of Binary with BinaryView in a DataType fn rewrite_to_view_types(data_type: &DataType) -> DataType { match data_type { + // Unsigned integers are not allowed at all + DataType::UInt8 | DataType::UInt16 | DataType::UInt32 | DataType::UInt64 => { + panic!("Illegal shredded value type: {data_type:?}"); + } + // UUID maps to 16-byte fixed-size binary; no other width is allowed + DataType::FixedSizeBinary(n) if *n != 16 => { + panic!("Illegal shredded value type: {data_type:?}"); + } DataType::Binary => DataType::BinaryView, DataType::List(field) => DataType::List(rewrite_field_type(field)), DataType::Struct(fields) => { diff --git a/parquet/tests/variant_integration.rs b/parquet/tests/variant_integration.rs index 5e5c3d944c34..807f3585f41d 100644 --- a/parquet/tests/variant_integration.rs +++ b/parquet/tests/variant_integration.rs @@ -144,10 +144,7 @@ variant_test_case!(39); variant_test_case!(40, "Unsupported typed_value type: List("); variant_test_case!(41, "Unsupported typed_value type: List(Field"); // Is an error case (should be failing as the expected error message indicates) -variant_test_case!( - 42, - "Expected an error 'Invalid variant, conflicting value and typed_value`, but got no error" -); +variant_test_case!(42, "Invalid variant, conflicting value and typed_value"); // https://github.com/apache/arrow-rs/issues/8336 variant_test_case!(43, "Unsupported typed_value type: Struct([Field"); variant_test_case!(44, "Unsupported typed_value type: Struct([Field"); @@ -197,6 +194,7 @@ variant_test_case!(84, "Unsupported typed_value type: Struct([Field"); variant_test_case!(85, "Unsupported typed_value type: List(Field"); variant_test_case!(86, "Unsupported typed_value type: List(Field"); // Is an error case (should be failing as the expected error message indicates) +// TODO: Once structs are supported, expect "Invalid variant, non-object value with shredded fields" variant_test_case!(87, "Unsupported typed_value type: Struct([Field"); variant_test_case!(88, "Unsupported typed_value type: List(Field"); variant_test_case!(89); @@ -238,13 +236,11 @@ variant_test_case!(124); variant_test_case!(125, "Unsupported typed_value type: Struct"); variant_test_case!(126, "Unsupported typed_value type: List("); // Is an error case (should be failing as the expected error message indicates) -variant_test_case!( - 127, - "Invalid variant data: InvalidArgumentError(\"Received empty bytes\")" -); +variant_test_case!(127, "Illegal shredded value type: UInt32"); // Is an error case (should be failing as the expected error message indicates) +// TODO: Once structs are supported, expect "Invalid variant, non-object value with shredded fields" variant_test_case!(128, "Unsupported typed_value type: Struct([Field"); -variant_test_case!(129, "Invalid variant data: InvalidArgumentError("); +variant_test_case!(129); variant_test_case!(130, "Unsupported typed_value type: Struct([Field"); variant_test_case!(131); variant_test_case!(132, "Unsupported typed_value type: Struct([Field"); @@ -252,7 +248,7 @@ variant_test_case!(133, "Unsupported typed_value type: Struct([Field"); variant_test_case!(134, "Unsupported typed_value type: Struct([Field"); variant_test_case!(135); variant_test_case!(136, "Unsupported typed_value type: List(Field "); -variant_test_case!(137, "Invalid variant data: InvalidArgumentError("); +variant_test_case!(137, "Illegal shredded value type: FixedSizeBinary(4)"); variant_test_case!(138, "Unsupported typed_value type: Struct([Field"); /// Test case definition structure matching the format from From 4f45954824ba313810e0f7363f6abc3e82a91300 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Wed, 24 Sep 2025 10:13:38 -0700 Subject: [PATCH 02/10] more fixes; remove illegal unit tests --- parquet-variant-compute/src/variant_array.rs | 12 +- parquet-variant-compute/src/variant_get.rs | 191 +------------------ 2 files changed, 7 insertions(+), 196 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index e6ee7eb9ced1..da1ea4a57036 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -800,17 +800,11 @@ fn typed_value_to_variant<'a>( let date = Date32Type::to_naive_date(value); Variant::from(date) } - DataType::FixedSizeBinary(binary_len) => { + // 16-byte FixedSizeBinary is always UUID; all others illegal. + DataType::FixedSizeBinary(16) => { let array = typed_value.as_fixed_size_binary(); - // Try to treat 16 byte FixedSizeBinary as UUID let value = array.value(index); - if *binary_len == 16 { - if let Ok(uuid) = Uuid::from_slice(value) { - return Variant::from(uuid); - } - } - let value = array.value(index); - Variant::from(value) + Uuid::from_slice(value).map_or(Variant::Null, Variant::from) } DataType::BinaryView => { let array = typed_value.as_binary_view(); diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index ef602e84f1bf..bd2208f3c3fe 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -297,13 +297,13 @@ mod test { use std::sync::Arc; use arrow::array::{ - Array, ArrayRef, AsArray, BinaryViewArray, BooleanArray, Date32Array, FixedSizeBinaryArray, - Float16Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, - StringArray, StructArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, + Array, ArrayRef, AsArray, BinaryViewArray, BooleanArray, Date32Array, Float16Array, + Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, StringArray, + StructArray, }; use arrow::buffer::NullBuffer; use arrow::compute::CastOptions; - use arrow::datatypes::DataType::{Int16, Int32, Int64, UInt16, UInt32, UInt64, UInt8}; + use arrow::datatypes::DataType::{Int16, Int32, Int64}; use arrow_schema::{DataType, Field, FieldRef, Fields}; use parquet_variant::{Variant, VariantPath, EMPTY_VARIANT_METADATA_BYTES}; @@ -438,26 +438,6 @@ mod test { numeric_partially_shredded_test!(i64, partially_shredded_int64_variant_array); } - #[test] - fn get_variant_partially_shredded_uint8_as_variant() { - numeric_partially_shredded_test!(u8, partially_shredded_uint8_variant_array); - } - - #[test] - fn get_variant_partially_shredded_uint16_as_variant() { - numeric_partially_shredded_test!(u16, partially_shredded_uint16_variant_array); - } - - #[test] - fn get_variant_partially_shredded_uint32_as_variant() { - numeric_partially_shredded_test!(u32, partially_shredded_uint32_variant_array); - } - - #[test] - fn get_variant_partially_shredded_uint64_as_variant() { - numeric_partially_shredded_test!(u64, partially_shredded_uint64_variant_array); - } - #[test] fn get_variant_partially_shredded_float16_as_variant() { numeric_partially_shredded_test!(half::f16, partially_shredded_float16_variant_array); @@ -490,23 +470,6 @@ mod test { assert_eq!(result.value(3), Variant::from(false)); } - #[test] - fn get_variant_partially_shredded_fixed_size_binary_as_variant() { - let array = partially_shredded_fixed_size_binary_variant_array(); - let options = GetOptions::new(); - let result = variant_get(&array, options).unwrap(); - - // expect the result is a VariantArray - let result = VariantArray::try_new(&result).unwrap(); - assert_eq!(result.len(), 4); - - // Expect the values are the same as the original values - assert_eq!(result.value(0), Variant::from(&[1u8, 2u8, 3u8][..])); - assert!(!result.is_valid(1)); - assert_eq!(result.value(2), Variant::from("n/a")); - assert_eq!(result.value(3), Variant::from(&[4u8, 5u8, 6u8][..])); - } - #[test] fn get_variant_partially_shredded_utf8_as_variant() { let array = partially_shredded_utf8_variant_array(); @@ -645,26 +608,6 @@ mod test { numeric_perfectly_shredded_test!(i64, perfectly_shredded_int64_variant_array); } - #[test] - fn get_variant_perfectly_shredded_uint8_as_variant() { - numeric_perfectly_shredded_test!(u8, perfectly_shredded_uint8_variant_array); - } - - #[test] - fn get_variant_perfectly_shredded_uint16_as_variant() { - numeric_perfectly_shredded_test!(u16, perfectly_shredded_uint16_variant_array); - } - - #[test] - fn get_variant_perfectly_shredded_uint32_as_variant() { - numeric_perfectly_shredded_test!(u32, perfectly_shredded_uint32_variant_array); - } - - #[test] - fn get_variant_perfectly_shredded_uint64_as_variant() { - numeric_perfectly_shredded_test!(u64, perfectly_shredded_uint64_variant_array); - } - #[test] fn get_variant_perfectly_shredded_float16_as_variant() { numeric_perfectly_shredded_test!(half::f16, perfectly_shredded_float16_variant_array); @@ -749,34 +692,6 @@ mod test { Int64Array::from(vec![Some(1), Some(2), Some(3)]) ); - perfectly_shredded_to_arrow_primitive_test!( - get_variant_perfectly_shredded_uint8_as_int8, - UInt8, - perfectly_shredded_uint8_variant_array, - UInt8Array::from(vec![Some(1), Some(2), Some(3)]) - ); - - perfectly_shredded_to_arrow_primitive_test!( - get_variant_perfectly_shredded_uint16_as_uint16, - UInt16, - perfectly_shredded_uint16_variant_array, - UInt16Array::from(vec![Some(1), Some(2), Some(3)]) - ); - - perfectly_shredded_to_arrow_primitive_test!( - get_variant_perfectly_shredded_uint32_as_uint32, - UInt32, - perfectly_shredded_uint32_variant_array, - UInt32Array::from(vec![Some(1), Some(2), Some(3)]) - ); - - perfectly_shredded_to_arrow_primitive_test!( - get_variant_perfectly_shredded_uint64_as_uint64, - UInt64, - perfectly_shredded_uint64_variant_array, - UInt64Array::from(vec![Some(1), Some(2), Some(3)]) - ); - /// Return a VariantArray that represents a perfectly "shredded" variant /// for the given typed value. /// @@ -835,26 +750,6 @@ mod test { Int64Array, i64 ); - numeric_perfectly_shredded_variant_array_fn!( - perfectly_shredded_uint8_variant_array, - UInt8Array, - u8 - ); - numeric_perfectly_shredded_variant_array_fn!( - perfectly_shredded_uint16_variant_array, - UInt16Array, - u16 - ); - numeric_perfectly_shredded_variant_array_fn!( - perfectly_shredded_uint32_variant_array, - UInt32Array, - u32 - ); - numeric_perfectly_shredded_variant_array_fn!( - perfectly_shredded_uint64_variant_array, - UInt64Array, - u64 - ); numeric_perfectly_shredded_variant_array_fn!( perfectly_shredded_float16_variant_array, Float16Array, @@ -963,26 +858,6 @@ mod test { Int64Array, i64 ); - numeric_partially_shredded_variant_array_fn!( - partially_shredded_uint8_variant_array, - UInt8Array, - u8 - ); - numeric_partially_shredded_variant_array_fn!( - partially_shredded_uint16_variant_array, - UInt16Array, - u16 - ); - numeric_partially_shredded_variant_array_fn!( - partially_shredded_uint32_variant_array, - UInt32Array, - u32 - ); - numeric_partially_shredded_variant_array_fn!( - partially_shredded_uint64_variant_array, - UInt64Array, - u64 - ); numeric_partially_shredded_variant_array_fn!( partially_shredded_float16_variant_array, Float16Array, @@ -1043,64 +918,6 @@ mod test { Arc::new(struct_array) } - /// Return a VariantArray that represents a partially "shredded" variant for fixed size binary - fn partially_shredded_fixed_size_binary_variant_array() -> ArrayRef { - let (metadata, string_value) = { - let mut builder = parquet_variant::VariantBuilder::new(); - builder.append_value("n/a"); - builder.finish() - }; - - // Create the null buffer for the overall array - let nulls = NullBuffer::from(vec![ - true, // row 0 non null - false, // row 1 is null - true, // row 2 non null - true, // row 3 non null - ]); - - // metadata is the same for all rows - let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 4)); - - // See https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?disco=AAABml8WQrY - // about why row1 is an empty but non null, value. - let values = BinaryViewArray::from(vec![ - None, // row 0 is shredded, so no value - Some(b"" as &[u8]), // row 1 is null, so empty value - Some(&string_value), // copy the string value "N/A" - None, // row 3 is shredded, so no value - ]); - - // Create fixed size binary array with 3-byte values - let data = vec![ - 1u8, 2u8, 3u8, // row 0 is shredded - 0u8, 0u8, 0u8, // row 1 is null (value doesn't matter) - 0u8, 0u8, 0u8, // row 2 is a string (value doesn't matter) - 4u8, 5u8, 6u8, // row 3 is shredded - ]; - let typed_value_nulls = arrow::buffer::NullBuffer::from(vec![ - true, // row 0 has value - false, // row 1 is null - false, // row 2 is string - true, // row 3 has value - ]); - let typed_value = FixedSizeBinaryArray::try_new( - 3, // byte width - arrow::buffer::Buffer::from(data), - Some(typed_value_nulls), - ) - .expect("should create fixed size binary array"); - - let struct_array = StructArrayBuilder::new() - .with_field("metadata", Arc::new(metadata), true) - .with_field("typed_value", Arc::new(typed_value), true) - .with_field("value", Arc::new(values), true) - .with_nulls(nulls) - .build(); - - Arc::new(struct_array) - } - /// Return a VariantArray that represents a partially "shredded" variant for UTF8 fn partially_shredded_utf8_variant_array() -> ArrayRef { let (metadata, string_value) = { From 127e3aef84b0e5d44a4eda5038fb5c3a10f4be5b Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Wed, 24 Sep 2025 11:20:41 -0700 Subject: [PATCH 03/10] self review --- parquet-variant-compute/src/variant_array.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index 42a9081f4826..a68ceef56322 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -352,9 +352,7 @@ impl VariantArray { /// Note: Does not do deep validation of the [`Variant`], so it is up to the /// caller to ensure that the metadata and value were constructed correctly. pub fn value(&self, index: usize) -> Variant<'_, '_> { - let typed_value = self.typed_value_field(); - let value = self.value_field(); - match (typed_value, value) { + match (self.typed_value_field(), self.value_field()) { // Always prefer typed_value, if available (Some(typed_value), value) if typed_value.is_valid(index) => { typed_value_to_variant(typed_value, value, index) @@ -800,7 +798,7 @@ fn typed_value_to_variant<'a>( let date = Date32Type::to_naive_date(value); Variant::from(date) } - // 16-byte FixedSizeBinary is always UUID; all others illegal. + // 16-byte FixedSizeBinary is always UUID; all other sizes are illegal. DataType::FixedSizeBinary(16) => { let array = typed_value.as_fixed_size_binary(); let value = array.value(index); From c98edee4e63fad2497f230f4a5fa5236a55dfa3f Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Wed, 24 Sep 2025 13:34:47 -0700 Subject: [PATCH 04/10] review feedback --- parquet-variant-compute/src/variant_array.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index a68ceef56322..ae0cc918a1de 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -798,11 +798,11 @@ fn typed_value_to_variant<'a>( let date = Date32Type::to_naive_date(value); Variant::from(date) } - // 16-byte FixedSizeBinary is always UUID; all other sizes are illegal. + // 16-byte FixedSizeBinary is alway corresponds to a UUID; all other sizes are illegal. DataType::FixedSizeBinary(16) => { let array = typed_value.as_fixed_size_binary(); let value = array.value(index); - Uuid::from_slice(value).map_or(Variant::Null, Variant::from) + Uuid::from_slice(value).unwrap().into() // unwrap is safe: slice is always 16 bytes } DataType::BinaryView => { let array = typed_value.as_binary_view(); From ab129d9f8b339ff6e90ff2f70a74d7c98607ce7b Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Wed, 24 Sep 2025 14:41:05 -0700 Subject: [PATCH 05/10] tighten up illegal shredding type checks --- parquet-variant-compute/src/variant_array.rs | 116 +++++++++++++++---- 1 file changed, 94 insertions(+), 22 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index ae0cc918a1de..f90e0250fa7e 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -25,9 +25,11 @@ use arrow::datatypes::{ Date32Type, Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, }; use arrow_schema::extension::ExtensionType; -use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields}; +use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields, TimeUnit}; use parquet_variant::Uuid; use parquet_variant::Variant; + +use std::borrow::Cow; use std::sync::Arc; /// Arrow Variant [`ExtensionType`]. @@ -862,36 +864,106 @@ fn typed_value_to_variant<'a>( /// /// So cast them to get the right type. fn cast_to_binary_view_arrays(array: &dyn Array) -> Result { - let new_type = rewrite_to_view_types(array.data_type()); - cast(array, &new_type) + let new_type = canonicalize_and_verify_data_type(array.data_type())?; + cast(array, new_type.as_ref()) } +fn is_valid_variant_decimal(p: &u8, s: &i8, max_precision: u8) -> bool { + *p <= max_precision && (0..=*p as i8).contains(s) +} /// replaces all instances of Binary with BinaryView in a DataType -fn rewrite_to_view_types(data_type: &DataType) -> DataType { - match data_type { +fn canonicalize_and_verify_data_type( + data_type: &DataType, +) -> Result, ArrowError> { + use DataType::*; + + // helper macros + macro_rules! fail { + () => { + return Err(ArrowError::InvalidArgumentError(format!( + "Illegal shredded value type: {data_type}" + ))) + }; + } + macro_rules! borrow { + () => { + Cow::Borrowed(data_type) + }; + } + + let new_data_type = match data_type { + // Primitive arrow types that have a direct variant counterpart are allowed + Null | Boolean => borrow!(), + Int8 | Int16 | Int32 | Int64 | Float32 | Float64 => borrow!(), + // Unsigned integers are not allowed at all - DataType::UInt8 | DataType::UInt16 | DataType::UInt32 | DataType::UInt64 => { - panic!("Illegal shredded value type: {data_type:?}"); - } + UInt8 | UInt16 | UInt32 | UInt64 | Float16 => fail!(), + + // Most decimal types are allowed, with restrictions on precision and scale + Decimal32(p, s) if is_valid_variant_decimal(p, s, 9) => borrow!(), + Decimal64(p, s) if is_valid_variant_decimal(p, s, 18) => borrow!(), + Decimal128(p, s) if is_valid_variant_decimal(p, s, 38) => borrow!(), + Decimal32(..) | Decimal64(..) | Decimal128(..) | Decimal256(..) => fail!(), + + // Only micro and nano timestamps are supported + Timestamp(TimeUnit::Microsecond | TimeUnit::Nanosecond, _) => borrow!(), + Timestamp(TimeUnit::Millisecond | TimeUnit::Second, _) => fail!(), + + // Only 32-bit dates and 64-bit microsecond time are supported. + Date32 | Time64(TimeUnit::Microsecond) => borrow!(), + Date64 | Time32(_) | Time64(_) | Duration(_) | Interval(_) => fail!(), + + // Binary and string are allowed + Binary => Cow::Owned(DataType::BinaryView), + BinaryView | Utf8 => borrow!(), + // UUID maps to 16-byte fixed-size binary; no other width is allowed - DataType::FixedSizeBinary(n) if *n != 16 => { - panic!("Illegal shredded value type: {data_type:?}"); + FixedSizeBinary(16) => borrow!(), + FixedSizeBinary(_) | FixedSizeList(..) => fail!(), + + // We can _possibly_ support (some of) these some day? + LargeBinary | LargeUtf8 | Utf8View | ListView(_) | LargeList(_) | LargeListView(_) => { + fail!() } - DataType::Binary => DataType::BinaryView, - DataType::List(field) => DataType::List(rewrite_field_type(field)), - DataType::Struct(fields) => { - DataType::Struct(fields.iter().map(rewrite_field_type).collect()) + + // Lists and struct are supported, maps and unions are not + List(field) => match canonicalize_and_verify_field(field)? { + Cow::Borrowed(_) => borrow!(), + Cow::Owned(new_field) => Cow::Owned(DataType::List(new_field)), + }, + Struct(fields) => { + // Avoid allocation unless at least one field was rewritten + let mut new_fields = std::collections::HashMap::new(); + for (i, field) in fields.iter().enumerate() { + if let Cow::Owned(new_field) = canonicalize_and_verify_field(field)? { + new_fields.insert(i, new_field); + } + } + + if new_fields.is_empty() { + borrow!() + } else { + let new_fields = fields + .iter() + .enumerate() + .map(|(i, field)| new_fields.remove(&i).unwrap_or_else(|| field.clone())); + Cow::Owned(DataType::Struct(new_fields.collect())) + } } - _ => data_type.clone(), - } + Map(..) | Union(..) => fail!(), + + // We can _possibly_ support (some of) these some day? + Dictionary(..) | RunEndEncoded(..) => fail!(), + }; + Ok(new_data_type) } -fn rewrite_field_type(field: impl AsRef) -> Arc { - let field = field.as_ref(); - let new_field = field - .clone() - .with_data_type(rewrite_to_view_types(field.data_type())); - Arc::new(new_field) +fn canonicalize_and_verify_field(field: &Arc) -> Result>, ArrowError> { + let Cow::Owned(new_data_type) = canonicalize_and_verify_data_type(field.data_type())? else { + return Ok(Cow::Borrowed(field)); + }; + let new_field = field.as_ref().clone().with_data_type(new_data_type); + Ok(Cow::Owned(Arc::new(new_field))) } #[cfg(test)] From 54056fe6fda1561a18108c0a1ed45db903e6ab74 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Wed, 24 Sep 2025 14:44:06 -0700 Subject: [PATCH 06/10] comment --- parquet-variant-compute/src/variant_array.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index f90e0250fa7e..72b5b603543b 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -868,9 +868,11 @@ fn cast_to_binary_view_arrays(array: &dyn Array) -> Result cast(array, new_type.as_ref()) } +/// Validates whether a given arrow decimal is a valid variant decimal fn is_valid_variant_decimal(p: &u8, s: &i8, max_precision: u8) -> bool { *p <= max_precision && (0..=*p as i8).contains(s) } + /// replaces all instances of Binary with BinaryView in a DataType fn canonicalize_and_verify_data_type( data_type: &DataType, @@ -913,7 +915,8 @@ fn canonicalize_and_verify_data_type( Date32 | Time64(TimeUnit::Microsecond) => borrow!(), Date64 | Time32(_) | Time64(_) | Duration(_) | Interval(_) => fail!(), - // Binary and string are allowed + // Binary and string are allowed. + // NOTE: We force Binary to BinaryView because that's what the parquet reader returns. Binary => Cow::Owned(DataType::BinaryView), BinaryView | Utf8 => borrow!(), From 8a2e4afdaed2a86b32bdce3d960aeb288a2862b7 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Wed, 24 Sep 2025 15:21:15 -0700 Subject: [PATCH 07/10] forgot to delete Float16 tests --- parquet-variant-compute/src/variant_get.rs | 25 ++-------------------- 1 file changed, 2 insertions(+), 23 deletions(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 92d8abf8031f..49f56af57327 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -297,9 +297,8 @@ mod test { use std::sync::Arc; use arrow::array::{ - Array, ArrayRef, AsArray, BinaryViewArray, BooleanArray, Date32Array, Float16Array, - Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, StringArray, - StructArray, + Array, ArrayRef, AsArray, BinaryViewArray, BooleanArray, Date32Array, Float32Array, + Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, StringArray, StructArray, }; use arrow::buffer::NullBuffer; use arrow::compute::CastOptions; @@ -438,11 +437,6 @@ mod test { numeric_partially_shredded_test!(i64, partially_shredded_int64_variant_array); } - #[test] - fn get_variant_partially_shredded_float16_as_variant() { - numeric_partially_shredded_test!(half::f16, partially_shredded_float16_variant_array); - } - #[test] fn get_variant_partially_shredded_float32_as_variant() { numeric_partially_shredded_test!(f32, partially_shredded_float32_variant_array); @@ -608,11 +602,6 @@ mod test { numeric_perfectly_shredded_test!(i64, perfectly_shredded_int64_variant_array); } - #[test] - fn get_variant_perfectly_shredded_float16_as_variant() { - numeric_perfectly_shredded_test!(half::f16, perfectly_shredded_float16_variant_array); - } - #[test] fn get_variant_perfectly_shredded_float32_as_variant() { numeric_perfectly_shredded_test!(f32, perfectly_shredded_float32_variant_array); @@ -750,11 +739,6 @@ mod test { Int64Array, i64 ); - numeric_perfectly_shredded_variant_array_fn!( - perfectly_shredded_float16_variant_array, - Float16Array, - half::f16 - ); numeric_perfectly_shredded_variant_array_fn!( perfectly_shredded_float32_variant_array, Float32Array, @@ -858,11 +842,6 @@ mod test { Int64Array, i64 ); - numeric_partially_shredded_variant_array_fn!( - partially_shredded_float16_variant_array, - Float16Array, - half::f16 - ); numeric_partially_shredded_variant_array_fn!( partially_shredded_float32_variant_array, Float32Array, From edc7ecd80922285bd3019498ca4acb30a3a2972c Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Wed, 24 Sep 2025 16:22:40 -0700 Subject: [PATCH 08/10] decimal comment --- parquet-variant-compute/src/variant_array.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index 72b5b603543b..63c23a02379d 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -869,6 +869,13 @@ fn cast_to_binary_view_arrays(array: &dyn Array) -> Result } /// Validates whether a given arrow decimal is a valid variant decimal +/// +/// NOTE: By a strict reading of the "decimal table" in the [shredding spec], each decimal type +/// should have a lower bound on precision as well as an upper bound (i.e. Decimal16 with precision +/// 5 is invalid because Decimal4 can cover it). But the variant shredding integration tests +/// specifically expect such cases to succeed, so we only enforce the upper bound here. +/// +/// [shredding spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#encoding-types fn is_valid_variant_decimal(p: &u8, s: &i8, max_precision: u8) -> bool { *p <= max_precision && (0..=*p as i8).contains(s) } From 5de34894e5fae08bb23e31ec554685583045bc17 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Thu, 25 Sep 2025 05:05:49 -0700 Subject: [PATCH 09/10] fix comments --- parquet-variant-compute/src/variant_array.rs | 24 ++++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index 63c23a02379d..96bf1189c826 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -800,7 +800,7 @@ fn typed_value_to_variant<'a>( let date = Date32Type::to_naive_date(value); Variant::from(date) } - // 16-byte FixedSizeBinary is alway corresponds to a UUID; all other sizes are illegal. + // 16-byte FixedSizeBinary alway corresponds to a UUID; all other sizes are illegal. DataType::FixedSizeBinary(16) => { let array = typed_value.as_fixed_size_binary(); let value = array.value(index); @@ -880,7 +880,9 @@ fn is_valid_variant_decimal(p: &u8, s: &i8, max_precision: u8) -> bool { *p <= max_precision && (0..=*p as i8).contains(s) } -/// replaces all instances of Binary with BinaryView in a DataType +/// Recursively visits a data type, ensuring that it only contains data types that can legally +/// appear in a (possibly shredded) variant array. It also replaces Binary fields with BinaryView, +/// since that's what comes back from the parquet reader and what the variant code expects to find. fn canonicalize_and_verify_data_type( data_type: &DataType, ) -> Result, ArrowError> { @@ -905,7 +907,7 @@ fn canonicalize_and_verify_data_type( Null | Boolean => borrow!(), Int8 | Int16 | Int32 | Int64 | Float32 | Float64 => borrow!(), - // Unsigned integers are not allowed at all + // Unsigned integers and half-float are not allowed UInt8 | UInt16 | UInt32 | UInt64 | Float16 => fail!(), // Most decimal types are allowed, with restrictions on precision and scale @@ -914,16 +916,16 @@ fn canonicalize_and_verify_data_type( Decimal128(p, s) if is_valid_variant_decimal(p, s, 38) => borrow!(), Decimal32(..) | Decimal64(..) | Decimal128(..) | Decimal256(..) => fail!(), - // Only micro and nano timestamps are supported + // Only micro and nano timestamps are allowed Timestamp(TimeUnit::Microsecond | TimeUnit::Nanosecond, _) => borrow!(), Timestamp(TimeUnit::Millisecond | TimeUnit::Second, _) => fail!(), - // Only 32-bit dates and 64-bit microsecond time are supported. + // Only 32-bit dates and 64-bit microsecond time are allowed. Date32 | Time64(TimeUnit::Microsecond) => borrow!(), Date64 | Time32(_) | Time64(_) | Duration(_) | Interval(_) => fail!(), - // Binary and string are allowed. - // NOTE: We force Binary to BinaryView because that's what the parquet reader returns. + // Binary and string are allowed. Force Binary to BinaryView because that's what the parquet + // reader returns and what the rest of the variant code expects. Binary => Cow::Owned(DataType::BinaryView), BinaryView | Utf8 => borrow!(), @@ -931,18 +933,20 @@ fn canonicalize_and_verify_data_type( FixedSizeBinary(16) => borrow!(), FixedSizeBinary(_) | FixedSizeList(..) => fail!(), - // We can _possibly_ support (some of) these some day? + // We can _possibly_ allow (some of) these some day? LargeBinary | LargeUtf8 | Utf8View | ListView(_) | LargeList(_) | LargeListView(_) => { fail!() } - // Lists and struct are supported, maps and unions are not + // Lists and struct are allowed, maps and unions are not List(field) => match canonicalize_and_verify_field(field)? { Cow::Borrowed(_) => borrow!(), Cow::Owned(new_field) => Cow::Owned(DataType::List(new_field)), }, + // Struct is used by the internal layout, and can also represent a shredded variant object. Struct(fields) => { - // Avoid allocation unless at least one field was rewritten + // Avoid allocation unless at least one field changes, to avoid unnecessary deep cloning + // of the data type. Even if some fields change, the others are shallow arc clones. let mut new_fields = std::collections::HashMap::new(); for (i, field) in fields.iter().enumerate() { if let Cow::Owned(new_field) = canonicalize_and_verify_field(field)? { From 8dd3ab7c6c19368d56c7b63df436ca8c60de17a8 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Thu, 25 Sep 2025 05:24:13 -0700 Subject: [PATCH 10/10] decimal precision fix --- parquet-variant-compute/src/variant_array.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index 96bf1189c826..16dbff4c341a 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -871,13 +871,13 @@ fn cast_to_binary_view_arrays(array: &dyn Array) -> Result /// Validates whether a given arrow decimal is a valid variant decimal /// /// NOTE: By a strict reading of the "decimal table" in the [shredding spec], each decimal type -/// should have a lower bound on precision as well as an upper bound (i.e. Decimal16 with precision -/// 5 is invalid because Decimal4 can cover it). But the variant shredding integration tests -/// specifically expect such cases to succeed, so we only enforce the upper bound here. +/// should have a width-dependent lower bound on precision as well as an upper bound (i.e. Decimal16 +/// with precision 5 is invalid because Decimal4 "covers" it). But the variant shredding integration +/// tests specifically expect such cases to succeed, so we only enforce the upper bound here. /// /// [shredding spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#encoding-types fn is_valid_variant_decimal(p: &u8, s: &i8, max_precision: u8) -> bool { - *p <= max_precision && (0..=*p as i8).contains(s) + (1..=max_precision).contains(p) && (0..=*p as i8).contains(s) } /// Recursively visits a data type, ensuring that it only contains data types that can legally