From 340f9706973997aa079e4e2cccff4d03ba760458 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Tue, 24 Mar 2026 13:33:51 +0000 Subject: [PATCH 1/8] Initial work Signed-off-by: Adam Gutglick --- parquet-variant-compute/src/shred_variant.rs | 161 +++++++------ .../src/unshred_variant.rs | 102 ++++---- parquet-variant-compute/src/variant_array.rs | 221 +++++++++++------- .../src/variant_array_builder.rs | 2 +- parquet-variant-compute/src/variant_get.rs | 59 +++-- .../src/variant_to_arrow.rs | 19 +- 6 files changed, 334 insertions(+), 230 deletions(-) diff --git a/parquet-variant-compute/src/shred_variant.rs b/parquet-variant-compute/src/shred_variant.rs index 7b181179d3d6..48a6b12bf062 100644 --- a/parquet-variant-compute/src/shred_variant.rs +++ b/parquet-variant-compute/src/shred_variant.rs @@ -96,7 +96,7 @@ pub fn shred_variant(array: &VariantArray, as_type: &DataType) -> Result VariantToShreddedObjectVariantRowBuilder<'a> { let mut builder = StructArrayBuilder::new(); for (field_name, typed_value_builder) in self.typed_value_builders { let (value, typed_value, nulls) = typed_value_builder.finish()?; - let array = - ShreddedVariantFieldArray::from_parts(Some(value), Some(typed_value), nulls); + let array = ShreddedVariantFieldArray::from_parts( + Some(Arc::new(value) as ArrayRef), + Some(typed_value), + nulls, + ); builder = builder.with_field(field_name, ArrayRef::from(array), false); } if let Some(nulls) = self.typed_value_nulls.finish() { @@ -689,6 +692,7 @@ impl VariantSchemaNode { mod tests { use super::*; use crate::VariantArrayBuilder; + use crate::variant_array::binary_array_value; use arrow::array::{ Array, BinaryViewArray, FixedSizeBinaryArray, Float64Array, GenericListArray, GenericListViewArray, Int64Array, LargeBinaryArray, LargeStringArray, ListArray, @@ -867,7 +871,8 @@ mod tests { ) { assert_eq!(array.len(), expected_len); - let fallbacks = (array.value_field().unwrap(), Some(array.metadata_field())); + let fallback_value = array.value_field().unwrap(); + let fallback_metadata = array.metadata_field(); let array = downcast_list_like_array::(array); assert_eq!( @@ -887,7 +892,7 @@ mod tests { ); assert_eq!( array.len(), - fallbacks.0.len(), + fallback_value.len(), "fallbacks value field should match array length" ); @@ -902,7 +907,7 @@ mod tests { // Successfully shredded: typed list value present, no fallback value assert!(array.is_valid(idx)); assert_eq!(array.value_size(idx), *len); - assert!(fallbacks.0.is_null(idx)); + assert!(fallback_value.is_null(idx)); } None => { // Unable to shred: typed list value absent, fallback should carry the variant @@ -910,20 +915,25 @@ mod tests { assert_eq!(array.value_size(idx), O::zero()); match expected_fallback { Some(expected_variant) => { - assert!(fallbacks.0.is_valid(idx)); - let metadata_bytes = fallbacks - .1 - .filter(|m| m.is_valid(idx)) - .map(|m| m.value(idx)) - .filter(|bytes| !bytes.is_empty()) - .unwrap_or(EMPTY_VARIANT_METADATA_BYTES); + assert!(fallback_value.is_valid(idx)); + let metadata_bytes = + binary_array_value(fallback_metadata.as_ref(), idx); + let metadata_bytes = + if fallback_metadata.is_valid(idx) && !metadata_bytes.is_empty() { + metadata_bytes + } else { + EMPTY_VARIANT_METADATA_BYTES + }; assert_eq!( - Variant::new(metadata_bytes, fallbacks.0.value(idx)), + Variant::new( + metadata_bytes, + binary_array_value(fallback_value.as_ref(), idx) + ), expected_variant.clone() ); } None => { - assert!(fallbacks.0.is_null(idx)); + assert!(fallback_value.is_null(idx)); } } } @@ -983,7 +993,10 @@ mod tests { Some(expected_variant) => { assert!(element_fallbacks.is_valid(idx)); assert_eq!( - Variant::new(EMPTY_VARIANT_METADATA_BYTES, element_fallbacks.value(idx)), + Variant::new( + EMPTY_VARIANT_METADATA_BYTES, + binary_array_value(element_fallbacks.as_ref(), idx) + ), expected_variant.clone() ); } @@ -1129,7 +1142,7 @@ mod tests { #[test] fn test_all_null_input() { // Create VariantArray with no value field (all null case) - let metadata = BinaryViewArray::from_iter_values([&[1u8, 0u8]]); // minimal valid metadata + let metadata = Arc::new(BinaryViewArray::from_iter_values([&[1u8, 0u8]])) as ArrayRef; // minimal valid metadata let all_null_array = VariantArray::from_parts(metadata, None, None, None); let result = shred_variant(&all_null_array, &DataType::Int64).unwrap(); @@ -1243,7 +1256,10 @@ mod tests { assert!(!value_field.is_null(1)); // value should contain original assert!(typed_value_field.is_null(1)); // typed_value should be null assert_eq!( - Variant::new(metadata_field.value(1), value_field.value(1)), + Variant::new( + binary_array_value(metadata_field.as_ref(), 1), + binary_array_value(value_field.as_ref(), 1) + ), Variant::from("hello") ); @@ -1259,7 +1275,10 @@ mod tests { assert!(!result.is_null(4)); assert!(!value_field.is_null(4)); // should contain Variant::Null assert_eq!( - Variant::new(metadata_field.value(4), value_field.value(4)), + Variant::new( + binary_array_value(metadata_field.as_ref(), 4), + binary_array_value(value_field.as_ref(), 4) + ), Variant::Null ); assert!(typed_value_field.is_null(4)); @@ -1336,7 +1355,10 @@ mod tests { assert!(value.is_valid(1)); assert!(typed_value.is_null(1)); assert_eq!( - Variant::new(metadata.value(1), value.value(1)), + Variant::new( + binary_array_value(metadata.as_ref(), 1), + binary_array_value(value.as_ref(), 1) + ), Variant::from(42i64) ); @@ -1350,7 +1372,10 @@ mod tests { assert!(value.is_valid(3)); assert!(typed_value.is_null(3)); assert_eq!( - Variant::new(metadata.value(3), value.value(3)), + Variant::new( + binary_array_value(metadata.as_ref(), 3), + binary_array_value(value.as_ref(), 3) + ), Variant::Null ); @@ -1392,7 +1417,10 @@ mod tests { assert!(value.is_valid(1)); assert!(typed_value.is_null(1)); assert_eq!( - Variant::new(metadata.value(1), value.value(1)), + Variant::new( + binary_array_value(metadata.as_ref(), 1), + binary_array_value(value.as_ref(), 1) + ), Variant::from("not_binary") ); @@ -1406,7 +1434,10 @@ mod tests { assert!(value.is_valid(3)); assert!(typed_value.is_null(3)); assert_eq!( - Variant::new(metadata.value(3), value.value(3)), + Variant::new( + binary_array_value(metadata.as_ref(), 3), + binary_array_value(value.as_ref(), 3) + ), Variant::Null ); @@ -1682,14 +1713,14 @@ mod tests { .unwrap(); let outer_fallbacks = outer_elements.value_field().unwrap(); - let outer_metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n( + let outer_metadata = Arc::new(BinaryViewArray::from_iter_values(std::iter::repeat_n( EMPTY_VARIANT_METADATA_BYTES, outer_elements.len(), - )); + ))) as ArrayRef; let outer_variant = VariantArray::from_parts( outer_metadata, Some(outer_fallbacks.clone()), - Some(Arc::new(outer_values.clone())), + Some(Arc::new(outer_values.clone()) as ArrayRef), None, ); @@ -1792,7 +1823,10 @@ mod tests { // null is stored as Variant::Null in values assert!(id_values.is_valid(1)); assert_eq!( - Variant::new(EMPTY_VARIANT_METADATA_BYTES, id_values.value(1)), + Variant::new( + EMPTY_VARIANT_METADATA_BYTES, + binary_array_value(id_values.as_ref(), 1) + ), Variant::Null ); assert!(id_typed_values.is_null(1)); @@ -1866,7 +1900,6 @@ mod tests { assert_eq!(result.len(), 9); let metadata = result.metadata_field(); - let value = result.value_field().unwrap(); let typed_value = result .typed_value_field() @@ -1882,24 +1915,14 @@ mod tests { let age_field = ShreddedVariantFieldArray::try_new(typed_value.column_by_name("age").unwrap()).unwrap(); - let score_value = score_field - .value_field() - .unwrap() - .as_any() - .downcast_ref::() - .unwrap(); + let score_value = score_field.value_field().unwrap(); let score_typed_value = score_field .typed_value_field() .unwrap() .as_any() .downcast_ref::() .unwrap(); - let age_value = age_field - .value_field() - .unwrap() - .as_any() - .downcast_ref::() - .unwrap(); + let age_value = age_field.value_field().unwrap(); let age_typed_value = age_field .typed_value_field() .unwrap() @@ -1918,10 +1941,13 @@ mod tests { } fn get_value<'m, 'v>( i: usize, - metadata: &'m BinaryViewArray, - value: &'v BinaryViewArray, + metadata: &'m dyn Array, + value: &'v dyn Array, ) -> Variant<'m, 'v> { - Variant::new(metadata.value(i), value.value(i)) + Variant::new( + binary_array_value(metadata, i), + binary_array_value(value, i), + ) } let expect = |i, expected_result: Option>| { match expected_result { @@ -1933,7 +1959,10 @@ mod tests { match expected_value { Some(expected_value) => { assert!(value.is_valid(i)); - assert_eq!(expected_value, get_value(i, metadata, value)); + assert_eq!( + expected_value, + get_value(i, metadata.as_ref(), value.as_ref()) + ); } None => { assert!(value.is_null(i)); @@ -1952,7 +1981,7 @@ mod tests { assert!(score_value.is_valid(i)); assert_eq!( expected_score_value, - get_value(i, metadata, score_value) + get_value(i, metadata.as_ref(), score_value.as_ref()) ); } None => { @@ -1973,7 +2002,7 @@ mod tests { assert!(age_value.is_valid(i)); assert_eq!( expected_age_value, - get_value(i, metadata, age_value) + get_value(i, metadata.as_ref(), age_value.as_ref()) ); } None => { @@ -2114,7 +2143,7 @@ mod tests { // Helper to correctly create a variant object using a row's existing metadata let object_with_foo_field = |i| { use parquet_variant::{ParentState, ValueBuilder, VariantMetadata}; - let metadata = VariantMetadata::new(metadata.value(i)); + let metadata = VariantMetadata::new(binary_array_value(metadata.as_ref(), i)); let mut metadata_builder = ReadOnlyMetadataBuilder::new(&metadata); let mut value_builder = ValueBuilder::new(); let state = ParentState::variant(&mut value_builder, &mut metadata_builder); @@ -2213,7 +2242,10 @@ mod tests { assert!(value_field.is_null(2)); assert!(value_field.is_valid(3)); assert_eq!( - Variant::new(result.metadata_field().value(3), value_field.value(3)), + Variant::new( + binary_array_value(result.metadata_field().as_ref(), 3), + binary_array_value(value_field.as_ref(), 3) + ), Variant::from("not an object") ); assert!(value_field.is_null(4)); @@ -2231,10 +2263,10 @@ mod tests { .unwrap(); assert_list_structure_and_elements::( &VariantArray::from_parts( - BinaryViewArray::from_iter_values(std::iter::repeat_n( + Arc::new(BinaryViewArray::from_iter_values(std::iter::repeat_n( EMPTY_VARIANT_METADATA_BYTES, scores_field.len(), - )), + ))) as ArrayRef, Some(scores_field.value_field().unwrap().clone()), Some(scores_field.typed_value_field().unwrap().clone()), None, @@ -2350,24 +2382,14 @@ mod tests { ShreddedVariantFieldArray::try_new(typed_value.column_by_name("session_id").unwrap()) .unwrap(); - let id_value = id_field - .value_field() - .unwrap() - .as_any() - .downcast_ref::() - .unwrap(); + let id_value = id_field.value_field().unwrap(); let id_typed_value = id_field .typed_value_field() .unwrap() .as_any() .downcast_ref::() .unwrap(); - let session_id_value = session_id_field - .value_field() - .unwrap() - .as_any() - .downcast_ref::() - .unwrap(); + let session_id_value = session_id_field.value_field().unwrap(); let session_id_typed_value = session_id_field .typed_value_field() .unwrap() @@ -2404,7 +2426,10 @@ mod tests { assert_eq!(session_id_typed_value.value(1), mock_uuid_3.as_bytes()); // Verify the value field contains the name field - let row_1_variant = Variant::new(metadata.value(1), value.value(1)); + let row_1_variant = Variant::new( + binary_array_value(metadata.as_ref(), 1), + binary_array_value(value.as_ref(), 1), + ); let Variant::Object(obj) = row_1_variant else { panic!("Expected object"); }; @@ -2436,7 +2461,10 @@ mod tests { assert!(session_id_value.is_valid(3)); // type mismatch, stored in value assert!(session_id_typed_value.is_null(3)); - let session_id_variant = Variant::new(metadata.value(3), session_id_value.value(3)); + let session_id_variant = Variant::new( + binary_array_value(metadata.as_ref(), 3), + binary_array_value(session_id_value.as_ref(), 3), + ); assert_eq!(session_id_variant, Variant::from("not-a-uuid")); // Row 4: Type mismatch - id is int64, not UUID @@ -2447,7 +2475,10 @@ mod tests { assert!(id_value.is_valid(4)); // type mismatch, stored in value assert!(id_typed_value.is_null(4)); - let id_variant = Variant::new(metadata.value(4), id_value.value(4)); + let id_variant = Variant::new( + binary_array_value(metadata.as_ref(), 4), + binary_array_value(id_value.as_ref(), 4), + ); assert_eq!(id_variant, Variant::from(12345i64)); assert!(session_id_value.is_null(4)); diff --git a/parquet-variant-compute/src/unshred_variant.rs b/parquet-variant-compute/src/unshred_variant.rs index 2df36fa63f02..15aaa671d5c7 100644 --- a/parquet-variant-compute/src/unshred_variant.rs +++ b/parquet-variant-compute/src/unshred_variant.rs @@ -17,11 +17,13 @@ //! Module for unshredding VariantArray by folding typed_value columns back into the value column. +use crate::variant_array::binary_array_value; use crate::{BorrowedShreddingState, VariantArray, VariantValueArrayBuilder}; use arrow::array::{ - Array, AsArray as _, BinaryArray, BinaryViewArray, BooleanArray, FixedSizeBinaryArray, - FixedSizeListArray, GenericListArray, GenericListViewArray, LargeBinaryArray, LargeStringArray, - ListLikeArray, PrimitiveArray, StringArray, StringViewArray, StructArray, + Array, ArrayRef, AsArray as _, BinaryArray, BinaryViewArray, BooleanArray, + FixedSizeBinaryArray, FixedSizeListArray, GenericListArray, GenericListViewArray, + LargeBinaryArray, LargeStringArray, ListLikeArray, PrimitiveArray, StringArray, + StringViewArray, StructArray, }; use arrow::buffer::NullBuffer; use arrow::datatypes::{ @@ -38,6 +40,7 @@ use parquet_variant::{ VariantDecimal16, VariantDecimalType, VariantMetadata, }; use std::marker::PhantomData; +use std::sync::Arc; use uuid::Uuid; /// Removes all (nested) typed_value columns from a VariantArray by converting them back to binary @@ -73,7 +76,8 @@ pub fn unshred_variant(array: &VariantArray) -> Result { if array.is_null(i) { value_builder.append_null(); } else { - let metadata = VariantMetadata::new(metadata.value(i)); + let metadata_bytes = binary_array_value(metadata.as_ref(), i); + let metadata = VariantMetadata::new(metadata_bytes); let mut value_builder = value_builder.builder_ext(&metadata); row_builder.append_row(&mut value_builder, &metadata, i)?; } @@ -82,7 +86,7 @@ pub fn unshred_variant(array: &VariantArray) -> Result { let value = value_builder.build()?; Ok(VariantArray::from_parts( metadata.clone(), - Some(value), + Some(Arc::new(value)), None, nulls.cloned(), )) @@ -308,11 +312,11 @@ impl<'a> NullUnshredVariantBuilder<'a> { /// Builder for arrays that only have value column (already unshredded) struct ValueOnlyUnshredVariantBuilder<'a> { - value: &'a arrow::array::BinaryViewArray, + value: &'a ArrayRef, } impl<'a> ValueOnlyUnshredVariantBuilder<'a> { - fn new(value: &'a BinaryViewArray) -> Self { + fn new(value: &'a ArrayRef) -> Self { Self { value } } @@ -325,7 +329,8 @@ impl<'a> ValueOnlyUnshredVariantBuilder<'a> { if self.value.is_null(index) { builder.append_null(); } else { - let variant = Variant::new_with_metadata(metadata.clone(), self.value.value(index)); + let value_bytes = binary_array_value(self.value.as_ref(), index); + let variant = Variant::new_with_metadata(metadata.clone(), value_bytes); builder.append_value(variant); } Ok(()) @@ -347,7 +352,10 @@ trait AppendToVariantBuilder: Array { macro_rules! handle_unshredded_case { ($self:expr, $builder:expr, $metadata:expr, $index:expr, $partial_shredding:expr) => {{ let value = $self.value.as_ref().filter(|v| v.is_valid($index)); - let value = value.map(|v| Variant::new_with_metadata($metadata.clone(), v.value($index))); + let value = value.map(|v| { + let bytes = binary_array_value(v.as_ref(), $index); + Variant::new_with_metadata($metadata.clone(), bytes) + }); // If typed_value is null, handle unshredded case and return early if $self.typed_value.is_null($index) { @@ -372,12 +380,12 @@ macro_rules! handle_unshredded_case { /// Generic unshred builder that works with any Array implementing AppendToVariantBuilder struct UnshredPrimitiveRowBuilder<'a, T> { - value: Option<&'a BinaryViewArray>, + value: Option<&'a ArrayRef>, typed_value: &'a T, } impl<'a, T: AppendToVariantBuilder> UnshredPrimitiveRowBuilder<'a, T> { - fn new(value: Option<&'a BinaryViewArray>, typed_value: &'a T) -> Self { + fn new(value: Option<&'a ArrayRef>, typed_value: &'a T) -> Self { Self { value, typed_value } } @@ -475,17 +483,13 @@ impl TimestampType for TimestampNanosecondType { /// Generic builder for timestamp types that handles timezone-aware conversion struct TimestampUnshredRowBuilder<'a, T: TimestampType> { - value: Option<&'a BinaryViewArray>, + value: Option<&'a ArrayRef>, typed_value: &'a PrimitiveArray, has_timezone: bool, } impl<'a, T: TimestampType> TimestampUnshredRowBuilder<'a, T> { - fn new( - value: Option<&'a BinaryViewArray>, - typed_value: &'a dyn Array, - has_timezone: bool, - ) -> Self { + fn new(value: Option<&'a ArrayRef>, typed_value: &'a dyn Array, has_timezone: bool) -> Self { Self { value, typed_value: typed_value.as_primitive(), @@ -518,7 +522,7 @@ struct DecimalUnshredRowBuilder<'a, A: DecimalType, V> where V: VariantDecimalType, { - value: Option<&'a BinaryViewArray>, + value: Option<&'a ArrayRef>, typed_value: &'a PrimitiveArray, scale: i8, _phantom: PhantomData, @@ -528,7 +532,7 @@ impl<'a, A: DecimalType, V> DecimalUnshredRowBuilder<'a, A, V> where V: VariantDecimalType, { - fn new(value: Option<&'a BinaryViewArray>, typed_value: &'a dyn Array, scale: i8) -> Self { + fn new(value: Option<&'a ArrayRef>, typed_value: &'a dyn Array, scale: i8) -> Self { Self { value, typed_value: typed_value.as_primitive(), @@ -554,13 +558,13 @@ where /// Builder for unshredding struct/object types with nested fields struct StructUnshredVariantBuilder<'a> { - value: Option<&'a arrow::array::BinaryViewArray>, + value: Option<&'a ArrayRef>, typed_value: &'a arrow::array::StructArray, field_unshredders: IndexMap<&'a str, Option>>, } impl<'a> StructUnshredVariantBuilder<'a> { - fn try_new(value: Option<&'a BinaryViewArray>, typed_value: &'a StructArray) -> Result { + fn try_new(value: Option<&'a ArrayRef>, typed_value: &'a StructArray) -> Result { // Create unshredders for each field in constructor let mut field_unshredders = IndexMap::new(); for (field, field_array) in typed_value.fields().iter().zip(typed_value.columns()) { @@ -626,13 +630,13 @@ impl<'a> StructUnshredVariantBuilder<'a> { /// Builder for unshredding list/array types with recursive element processing struct ListUnshredVariantBuilder<'a, L: ListLikeArray> { - value: Option<&'a BinaryViewArray>, + value: Option<&'a ArrayRef>, typed_value: &'a L, element_unshredder: Box>, } impl<'a, L: ListLikeArray> ListUnshredVariantBuilder<'a, L> { - fn try_new(value: Option<&'a BinaryViewArray>, typed_value: &'a L) -> Result { + fn try_new(value: Option<&'a ArrayRef>, typed_value: &'a L) -> Result { // Create a recursive unshredder for the list elements // The element type comes from the values array of the list let element_values = typed_value.values(); @@ -684,16 +688,18 @@ impl<'a, L: ListLikeArray> ListUnshredVariantBuilder<'a, L> { mod tests { use crate::VariantArray; use arrow::array::{ - BinaryArray, BinaryViewArray, LargeBinaryArray, LargeStringArray, StringViewArray, + ArrayRef, BinaryArray, BinaryViewArray, LargeBinaryArray, LargeStringArray, StringViewArray, }; use parquet_variant::Variant; + use std::sync::Arc; #[test] fn test_unshred_utf8view_typed_value() { let metadata_bytes: &[u8] = &[0x01, 0x00, 0x00]; - let metadata = BinaryViewArray::from_iter_values(vec![metadata_bytes; 3]); + let metadata: ArrayRef = + Arc::new(BinaryViewArray::from_iter_values(vec![metadata_bytes; 3])); - let typed_value: arrow::array::ArrayRef = std::sync::Arc::new(StringViewArray::from(vec![ + let typed_value: ArrayRef = Arc::new(StringViewArray::from(vec![ Some("hello"), Some("middle"), Some("world"), @@ -712,14 +718,14 @@ mod tests { #[test] fn test_unshred_largeutf8_typed_value() { let metadata_bytes: &[u8] = &[0x01, 0x00, 0x00]; - let metadata = BinaryViewArray::from_iter_values(vec![metadata_bytes; 3]); + let metadata: ArrayRef = + Arc::new(BinaryViewArray::from_iter_values(vec![metadata_bytes; 3])); - let typed_value: arrow::array::ArrayRef = - std::sync::Arc::new(LargeStringArray::from(vec![ - Some("hello"), - Some("middle"), - Some("world"), - ])); + let typed_value: ArrayRef = Arc::new(LargeStringArray::from(vec![ + Some("hello"), + Some("middle"), + Some("world"), + ])); let variant_array = VariantArray::from_parts(metadata, None, Some(typed_value), None); @@ -734,14 +740,14 @@ mod tests { #[test] fn test_unshred_binary_typed_value() { let metadata_bytes: &[u8] = &[0x01, 0x00, 0x00]; - let metadata = BinaryViewArray::from_iter_values(vec![metadata_bytes; 3]); + let metadata: ArrayRef = + Arc::new(BinaryViewArray::from_iter_values(vec![metadata_bytes; 3])); - let typed_value: arrow::array::ArrayRef = - std::sync::Arc::new(BinaryArray::from_iter_values(vec![ - &b"\x00\x01\x02"[..], - &b"\xff\xaa"[..], - &b"\xde\xad\xbe\xef"[..], - ])); + let typed_value: ArrayRef = Arc::new(BinaryArray::from_iter_values(vec![ + &b"\x00\x01\x02"[..], + &b"\xff\xaa"[..], + &b"\xde\xad\xbe\xef"[..], + ])); let variant_array = VariantArray::from_parts(metadata, None, Some(typed_value), None); @@ -756,14 +762,14 @@ mod tests { #[test] fn test_unshred_largebinary_typed_value() { let metadata_bytes: &[u8] = &[0x01, 0x00, 0x00]; - let metadata = BinaryViewArray::from_iter_values(vec![metadata_bytes; 3]); - - let typed_value: arrow::array::ArrayRef = - std::sync::Arc::new(LargeBinaryArray::from_iter_values(vec![ - &b"\x00\x01\x02"[..], - &b"\xff\xaa"[..], - &b"\xde\xad\xbe\xef"[..], - ])); + let metadata: ArrayRef = + Arc::new(BinaryViewArray::from_iter_values(vec![metadata_bytes; 3])); + + let typed_value: ArrayRef = Arc::new(LargeBinaryArray::from_iter_values(vec![ + &b"\x00\x01\x02"[..], + &b"\xff\xaa"[..], + &b"\xde\xad\xbe\xef"[..], + ])); let variant_array = VariantArray::from_parts(metadata, None, Some(typed_value), None); diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index 145de5edfb70..e2c35a44e0bc 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -22,7 +22,7 @@ use crate::type_conversion::{ generic_conversion_single_value, generic_conversion_single_value_with_result, primitive_conversion_single_value, }; -use arrow::array::{Array, ArrayRef, AsArray, BinaryViewArray, StructArray}; +use arrow::array::{Array, ArrayRef, AsArray, StructArray}; use arrow::buffer::NullBuffer; use arrow::compute::cast; use arrow::datatypes::{ @@ -41,6 +41,40 @@ use parquet_variant::{ use std::borrow::Cow; use std::sync::Arc; +/// Returns the raw bytes at the given index from a binary-like array. +/// +/// # Panics +/// Panics if the array is not `Binary`, `LargeBinary`, or `BinaryView`, +/// or if `index` is out of bounds. +pub(crate) fn binary_array_value(array: &dyn Array, index: usize) -> &[u8] { + match array.data_type() { + DataType::Binary => array.as_binary::().value(index), + DataType::LargeBinary => array.as_binary::().value(index), + DataType::BinaryView => array.as_binary_view().value(index), + other => panic!("Expected Binary, LargeBinary, or BinaryView array, got {other}"), + } +} + +/// Returns `true` if the data type is `Binary`, `LargeBinary`, or `BinaryView`. +fn is_binary_like(dt: &DataType) -> bool { + matches!( + dt, + DataType::Binary | DataType::LargeBinary | DataType::BinaryView + ) +} + +/// Validates that an array has a binary-like data type. +fn validate_binary_array(array: &dyn Array, field_name: &str) -> Result<()> { + if is_binary_like(array.data_type()) { + Ok(()) + } else { + Err(ArrowError::InvalidArgumentError(format!( + "VariantArray '{field_name}' field must be Binary, LargeBinary, or BinaryView, got {}", + array.data_type() + ))) + } +} + /// Arrow Variant [`ExtensionType`]. /// /// Represents the canonical Arrow Extension Type for storing variants. @@ -213,13 +247,13 @@ impl ExtensionType for VariantType { /// assert_eq!(variant_array.value(0), Variant::from("such wow")); /// ``` /// -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone)] pub struct VariantArray { /// Reference to the underlying StructArray inner: StructArray, - /// The metadata column of this variant - metadata: BinaryViewArray, + /// The metadata column of this variant (Binary, LargeBinary, or BinaryView) + metadata: ArrayRef, /// how is this variant array shredded? shredding_state: ShreddingState, @@ -252,11 +286,9 @@ impl VariantArray { /// Dictionary-Encoded, preferably (but not required) with an index type of /// int8. /// - /// Currently, only [`BinaryViewArray`] are supported. pub fn try_new(inner: &dyn Array) -> Result { - // Workaround lack of support for Binary - // https://github.com/apache/arrow-rs/issues/8387 - let inner = cast_to_binary_view_arrays(inner)?; + // Canonicalize shredded typed_value fields (e.g. decimal narrowing) + let inner = canonicalize_shredded_types(inner)?; let Some(inner) = inner.as_struct_opt() else { return Err(ArrowError::InvalidArgumentError( @@ -266,37 +298,31 @@ impl VariantArray { // Note the specification allows for any order so we must search by name - // Ensure the StructArray has a metadata field of BinaryView - let Some(metadata_field) = inner.column_by_name("metadata") else { + // Ensure the StructArray has a metadata field that is a binary type + let Some(metadata_col) = inner.column_by_name("metadata") else { return Err(ArrowError::InvalidArgumentError( "Invalid VariantArray: StructArray must contain a 'metadata' field".to_string(), )); }; - let Some(metadata) = metadata_field.as_binary_view_opt() else { - return Err(ArrowError::NotYetImplemented(format!( - "VariantArray 'metadata' field must be BinaryView, got {}", - metadata_field.data_type() - ))); - }; + validate_binary_array(metadata_col.as_ref(), "metadata")?; // Note these clones are cheap, they just bump the ref count Ok(Self { inner: inner.clone(), - metadata: metadata.clone(), + metadata: metadata_col.clone(), shredding_state: ShreddingState::try_from(inner)?, }) } pub(crate) fn from_parts( - metadata: BinaryViewArray, - value: Option, + metadata: ArrayRef, + value: Option, typed_value: Option, nulls: Option, ) -> Self { - let mut builder = - StructArrayBuilder::new().with_field("metadata", Arc::new(metadata.clone()), false); + let mut builder = StructArrayBuilder::new().with_field("metadata", metadata.clone(), false); if let Some(value) = value.clone() { - builder = builder.with_field("value", Arc::new(value), true); + builder = builder.with_field("value", value, true); } if let Some(typed_value) = typed_value.clone() { builder = builder.with_field("typed_value", typed_value, true); @@ -375,7 +401,9 @@ impl VariantArray { } // Otherwise fall back to value, if available (_, Some(value)) if value.is_valid(index) => { - Ok(Variant::new(self.metadata.value(index), value.value(index))) + let metadata = binary_array_value(self.metadata.as_ref(), index); + let value = binary_array_value(value.as_ref(), index); + Ok(Variant::new(metadata, value)) } // It is technically invalid for neither value nor typed_value fields to be available, // but the spec specifically requires readers to return Variant::Null in this case. @@ -384,12 +412,12 @@ impl VariantArray { } /// Return a reference to the metadata field of the [`StructArray`] - pub fn metadata_field(&self) -> &BinaryViewArray { + pub fn metadata_field(&self) -> &ArrayRef { &self.metadata } /// Return a reference to the value field of the `StructArray` - pub fn value_field(&self) -> Option<&BinaryViewArray> { + pub fn value_field(&self) -> Option<&ArrayRef> { self.shredding_state.value_field() } @@ -453,6 +481,12 @@ impl VariantArray { } } +impl PartialEq for VariantArray { + fn eq(&self, other: &Self) -> bool { + self.inner == other.inner + } +} + impl From for StructArray { fn from(variant_array: VariantArray) -> Self { variant_array.into_inner() @@ -626,7 +660,6 @@ impl ShreddedVariantFieldArray { /// 2. An optional field named `typed_value` which can be any primitive type /// or be a list, large_list, list_view or struct /// - /// Currently, only `value` columns of type [`BinaryViewArray`] are supported. pub fn try_new(inner: &dyn Array) -> Result { let Some(inner_struct) = inner.as_struct_opt() else { return Err(ArrowError::InvalidArgumentError( @@ -647,7 +680,7 @@ impl ShreddedVariantFieldArray { } /// Return a reference to the value field of the `StructArray` - pub fn value_field(&self) -> Option<&BinaryViewArray> { + pub fn value_field(&self) -> Option<&ArrayRef> { self.shredding_state.value_field() } @@ -662,13 +695,13 @@ impl ShreddedVariantFieldArray { } pub(crate) fn from_parts( - value: Option, + value: Option, typed_value: Option, nulls: Option, ) -> Self { let mut builder = StructArrayBuilder::new(); if let Some(value) = value.clone() { - builder = builder.with_field("value", Arc::new(value), true); + builder = builder.with_field("value", value, true); } if let Some(typed_value) = typed_value.clone() { builder = builder.with_field("typed_value", typed_value, true); @@ -766,9 +799,9 @@ impl From for StructArray { /// (partial shredding). /// /// [Parquet Variant Shredding Spec]: https://github.com/apache/parquet-format/blob/master/VariantShredding.md#value-shredding -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone)] pub struct ShreddingState { - value: Option, + value: Option, typed_value: Option, } @@ -787,12 +820,12 @@ impl ShreddingState { /// let struct_array: StructArray = get_struct_array(); /// let shredding_state = ShreddingState::try_from(&struct_array).unwrap(); /// ``` - pub fn new(value: Option, typed_value: Option) -> Self { + pub fn new(value: Option, typed_value: Option) -> Self { Self { value, typed_value } } /// Return a reference to the value field, if present - pub fn value_field(&self) -> Option<&BinaryViewArray> { + pub fn value_field(&self) -> Option<&ArrayRef> { self.value.as_ref() } @@ -822,7 +855,7 @@ impl ShreddingState { /// for avoiding clone operations when the caller does not need a self-standing shredding state. #[derive(Clone, Debug)] pub struct BorrowedShreddingState<'a> { - value: Option<&'a BinaryViewArray>, + value: Option<&'a ArrayRef>, typed_value: Option<&'a ArrayRef>, } @@ -841,12 +874,12 @@ impl<'a> BorrowedShreddingState<'a> { /// let struct_array: StructArray = get_struct_array(); /// let shredding_state = BorrowedShreddingState::try_from(&struct_array).unwrap(); /// ``` - pub fn new(value: Option<&'a BinaryViewArray>, typed_value: Option<&'a ArrayRef>) -> Self { + pub fn new(value: Option<&'a ArrayRef>, typed_value: Option<&'a ArrayRef>) -> Self { Self { value, typed_value } } /// Return a reference to the value field, if present - pub fn value_field(&self) -> Option<&'a BinaryViewArray> { + pub fn value_field(&self) -> Option<&'a ArrayRef> { self.value } @@ -860,15 +893,10 @@ impl<'a> TryFrom<&'a StructArray> for BorrowedShreddingState<'a> { type Error = ArrowError; fn try_from(inner_struct: &'a StructArray) -> Result { - // The `value` column need not exist, but if it does it must be a binary view. + // The `value` column need not exist, but if it does it must be a binary type. let value = if let Some(value_col) = inner_struct.column_by_name("value") { - let Some(binary_view) = value_col.as_binary_view_opt() else { - return Err(ArrowError::NotYetImplemented(format!( - "VariantArray 'value' field must be BinaryView, got {}", - value_col.data_type() - ))); - }; - Some(binary_view) + validate_binary_array(value_col.as_ref(), "value")?; + Some(value_col) } else { None }; @@ -936,7 +964,7 @@ impl StructArrayBuilder { /// returns the non-null element at index as a Variant fn typed_value_to_variant<'a>( typed_value: &'a ArrayRef, - value: Option<&BinaryViewArray>, + value: Option<&'a ArrayRef>, index: usize, ) -> Result> { let data_type = typed_value.data_type(); @@ -957,9 +985,8 @@ fn typed_value_to_variant<'a>( let value = array.value(index); Ok(Uuid::from_slice(value).unwrap().into()) // unwrap is safe: slice is always 16 bytes } - DataType::BinaryView => { - let array = typed_value.as_binary_view(); - let value = array.value(index); + DataType::Binary | DataType::LargeBinary | DataType::BinaryView => { + let value = binary_array_value(typed_value.as_ref(), index); Ok(Variant::from(value)) } DataType::Utf8 => { @@ -1099,17 +1126,9 @@ fn typed_value_to_variant<'a>( } } -/// Workaround for lack of direct support for BinaryArray -/// -/// -/// The values are read as -/// * `StructArray` -/// -/// but VariantArray needs them as -/// * `StructArray` -/// -/// So cast them to get the right type. -fn cast_to_binary_view_arrays(array: &dyn Array) -> Result { +/// Canonicalize shredded typed_value fields (e.g. decimal narrowing) and +/// verify that all data types in the struct are legal for a variant array. +fn canonicalize_shredded_types(array: &dyn Array) -> Result { let new_type = canonicalize_and_verify_data_type(array.data_type())?; if let Cow::Borrowed(_) = new_type { if let Some(array) = array.as_struct_opt() { @@ -1120,8 +1139,8 @@ fn cast_to_binary_view_arrays(array: &dyn Array) -> Result { } /// Recursively visits a data type, ensuring that it only contains data types that can legally -/// appear in a (possibly shredded) variant array. It also replaces Binary fields with BinaryView, -/// since that's what comes back from the parquet reader and what the variant code expects to find. +/// appear in a (possibly shredded) variant array. It also narrows decimal types to the smallest +/// valid precision (e.g. Decimal128 -> Decimal32 when the precision fits). fn canonicalize_and_verify_data_type(data_type: &DataType) -> Result> { use DataType::*; @@ -1172,10 +1191,8 @@ fn canonicalize_and_verify_data_type(data_type: &DataType) -> Result borrow!(), Date64 | Time32(_) | Time64(_) | Duration(_) | Interval(_) => fail!(), - // Binary and string are allowed. Force Binary/LargeBinary to BinaryView because that's what the parquet - // reader returns and what the rest of the variant code expects. - Binary | LargeBinary => Cow::Owned(BinaryView), - BinaryView | Utf8 | LargeUtf8 | Utf8View => borrow!(), + // Binary, string, and their view counterparts are allowed. + Binary | LargeBinary | BinaryView | Utf8 | LargeUtf8 | Utf8View => borrow!(), // UUID maps to 16-byte fixed-size binary; no other width is allowed FixedSizeBinary(16) => borrow!(), @@ -1242,8 +1259,9 @@ mod test { use super::*; use arrow::array::{ - BinaryViewArray, Decimal32Array, Decimal64Array, Decimal128Array, Int32Array, Int64Array, - LargeListArray, LargeListViewArray, ListArray, ListViewArray, Time64MicrosecondArray, + BinaryArray, BinaryViewArray, Decimal32Array, Decimal64Array, Decimal128Array, Int32Array, + Int64Array, LargeBinaryArray, LargeListArray, LargeListViewArray, ListArray, ListViewArray, + Time64MicrosecondArray, }; use arrow::buffer::{OffsetBuffer, ScalarBuffer}; use arrow_schema::{Field, Fields}; @@ -1313,7 +1331,7 @@ mod test { let err = VariantArray::try_new(&array); assert_eq!( err.unwrap_err().to_string(), - "Not yet implemented: VariantArray 'metadata' field must be BinaryView, got Int32" + "Invalid argument error: VariantArray 'metadata' field must be Binary, LargeBinary, or BinaryView, got Int32" ); } @@ -1321,7 +1339,7 @@ mod test { fn invalid_value_field_type() { let fields = Fields::from(vec![ Field::new("metadata", DataType::BinaryView, true), - Field::new("value", DataType::Int32, true), // Not yet supported + Field::new("value", DataType::Int32, true), ]); let array = StructArray::new( fields, @@ -1331,7 +1349,7 @@ mod test { let err = VariantArray::try_new(&array); assert_eq!( err.unwrap_err().to_string(), - "Not yet implemented: VariantArray 'value' field must be BinaryView, got Int32" + "Invalid argument error: VariantArray 'value' field must be Binary, LargeBinary, or BinaryView, got Int32" ); } @@ -1445,27 +1463,28 @@ mod test { // use Parquet LIST encoding, but those fixtures do not cover Arrow-specific list container // variants (`LargeList`, `ListView`, `LargeListView`) accepted by `VariantArray::try_new`. let make_item_binary = || Arc::new(Field::new("item", DataType::Binary, true)); + let make_large_binary = || Arc::new(Field::new("item", DataType::LargeBinary, true)); let make_item_binary_view = || Arc::new(Field::new("item", DataType::BinaryView, true)); let cases = vec![ - ( - DataType::LargeList(make_item_binary()), - DataType::LargeList(make_item_binary_view()), - ), - ( - DataType::ListView(make_item_binary()), - DataType::ListView(make_item_binary_view()), - ), - ( - DataType::LargeListView(make_item_binary()), - DataType::LargeListView(make_item_binary_view()), - ), + // Binary item + DataType::LargeList(make_item_binary()), + DataType::ListView(make_item_binary()), + DataType::LargeListView(make_item_binary()), + // Large binary item + DataType::LargeList(make_large_binary()), + DataType::ListView(make_large_binary()), + DataType::LargeListView(make_large_binary()), + // Binary view item + DataType::LargeList(make_item_binary_view()), + DataType::ListView(make_item_binary_view()), + DataType::LargeListView(make_item_binary_view()), ]; - for (input, expected) in cases { + for input in cases { assert_eq!( canonicalize_and_verify_data_type(&input).unwrap().as_ref(), - &expected + &input ); } } @@ -1666,6 +1685,40 @@ mod test { } } + #[test] + fn binary_typed_value_roundtrips() { + // Verify that a shredded variant with Binary typed_value can be read back + let metadata: ArrayRef = Arc::new(BinaryViewArray::from_iter_values([ + EMPTY_VARIANT_METADATA_BYTES, + ])); + let typed_value: ArrayRef = Arc::new(BinaryArray::from(vec![b"hello" as &[u8]])); + + let struct_array = StructArrayBuilder::new() + .with_field("metadata", metadata, false) + .with_field("typed_value", typed_value, true) + .build(); + + let variant_array = VariantArray::try_new(&struct_array).unwrap(); + assert_eq!(variant_array.value(0), Variant::from(b"hello" as &[u8])); + } + + #[test] + fn large_binary_typed_value_roundtrips() { + // Verify that a shredded variant with LargeBinary typed_value can be read back + let metadata: ArrayRef = Arc::new(BinaryViewArray::from_iter_values([ + EMPTY_VARIANT_METADATA_BYTES, + ])); + let typed_value: ArrayRef = Arc::new(LargeBinaryArray::from(vec![b"world" as &[u8]])); + + let struct_array = StructArrayBuilder::new() + .with_field("metadata", metadata, false) + .with_field("typed_value", typed_value, true) + .build(); + + let variant_array = VariantArray::try_new(&struct_array).unwrap(); + assert_eq!(variant_array.value(0), Variant::from(b"world" as &[u8])); + } + macro_rules! invalid_variant_array_test { ($fn_name: ident, $invalid_typed_value: expr, $error_msg: literal) => { #[test] diff --git a/parquet-variant-compute/src/variant_array_builder.rs b/parquet-variant-compute/src/variant_array_builder.rs index 86ece0010042..2ef96180c357 100644 --- a/parquet-variant-compute/src/variant_array_builder.rs +++ b/parquet-variant-compute/src/variant_array_builder.rs @@ -609,7 +609,7 @@ mod test { let array2 = VariantArray::from_parts( array.metadata_field().clone(), - Some(value_builder.build().unwrap()), + Some(Arc::new(value_builder.build().unwrap()) as ArrayRef), None, None, ); diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 29e28c850be6..be1aae21e580 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. use arrow::{ - array::{self, Array, ArrayRef, BinaryViewArray, StructArray}, + array::{self, Array, ArrayRef, StructArray}, compute::CastOptions, datatypes::Field, error::Result, @@ -121,7 +121,7 @@ fn shredded_get_path( // Helper that creates a new VariantArray from the given nested value and typed_value columns, // properly accounting for accumulated nulls from path traversal let make_target_variant = - |value: Option, + |value: Option, typed_value: Option, accumulated_nulls: Option| { let metadata = input.metadata_field().clone(); @@ -1055,7 +1055,13 @@ mod test { EMPTY_VARIANT_METADATA_BYTES, typed_value.len(), )); - VariantArray::from_parts(metadata, None, Some(typed_value), None).into() + VariantArray::from_parts( + Arc::new(metadata) as ArrayRef, + None, + Some(typed_value), + None, + ) + .into() } }; } @@ -1723,7 +1729,12 @@ mod test { let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(EMPTY_VARIANT_METADATA_BYTES, 3)); - ArrayRef::from(VariantArray::from_parts(metadata, None, None, Some(nulls))) + ArrayRef::from(VariantArray::from_parts( + Arc::new(metadata) as ArrayRef, + None, + None, + Some(nulls), + )) } /// This test manually constructs a shredded variant array representing objects /// like {"x": 1, "y": "foo"} and {"x": 42} and tests extracting the "x" field @@ -1829,8 +1840,8 @@ mod test { // Create the main VariantArray ArrayRef::from(VariantArray::from_parts( - metadata_array, - Some(value_array), + Arc::new(metadata_array) as ArrayRef, + Some(Arc::new(value_array) as ArrayRef), Some(Arc::new(typed_value_struct)), None, )) @@ -2206,8 +2217,8 @@ mod test { // Build final VariantArray ArrayRef::from(VariantArray::from_parts( - metadata_array, - Some(value_array), + Arc::new(metadata_array) as ArrayRef, + Some(Arc::new(value_array) as ArrayRef), Some(Arc::new(typed_value_struct)), None, )) @@ -2297,7 +2308,7 @@ mod test { .unwrap(), ) as ArrayRef; let a_field_shredded = ShreddedVariantFieldArray::from_parts( - Some(a_value_array), + Some(Arc::new(a_value_array) as ArrayRef), Some(a_inner_typed_value), None, ); @@ -2317,8 +2328,8 @@ mod test { // Build final VariantArray ArrayRef::from(VariantArray::from_parts( - metadata_array, - Some(value_array), + Arc::new(metadata_array) as ArrayRef, + Some(Arc::new(value_array) as ArrayRef), Some(Arc::new(typed_value_struct)), None, )) @@ -2399,7 +2410,7 @@ mod test { .unwrap(), ) as ArrayRef; let b_field_shredded = ShreddedVariantFieldArray::from_parts( - Some(b_value_array), + Some(Arc::new(b_value_array) as ArrayRef), Some(b_inner_typed_value), None, ); @@ -2428,7 +2439,7 @@ mod test { .unwrap(), ) as ArrayRef; let a_field_shredded = ShreddedVariantFieldArray::from_parts( - Some(a_value_array), + Some(Arc::new(a_value_array) as ArrayRef), Some(a_inner_typed_value), None, ); @@ -2448,8 +2459,8 @@ mod test { // Build final VariantArray ArrayRef::from(VariantArray::from_parts( - metadata_array, - Some(value_array), + Arc::new(metadata_array) as ArrayRef, + Some(Arc::new(value_array) as ArrayRef), Some(Arc::new(typed_value_struct)), None, )) @@ -3262,7 +3273,7 @@ mod test { // Build final VariantArray with top-level nulls ArrayRef::from(VariantArray::from_parts( - metadata_array, + Arc::new(metadata_array) as ArrayRef, None, Some(Arc::new(typed_value_struct)), Some(nulls), @@ -3321,7 +3332,7 @@ mod test { false, // row 3: top-level NULL ]); ArrayRef::from(VariantArray::from_parts( - metadata_array, + Arc::new(metadata_array) as ArrayRef, None, Some(Arc::new(typed_value)), Some(nulls), @@ -3390,8 +3401,8 @@ mod test { // Top-level null is encoded in the main StructArray's null mask let variant_nulls = NullBuffer::from(vec![true, true, true, false]); // Row 3 is top-level null ArrayRef::from(VariantArray::from_parts( - metadata_array, - Some(value_array), + Arc::new(metadata_array) as ArrayRef, + Some(Arc::new(value_array) as ArrayRef), Some(Arc::new(typed_value_struct)), Some(variant_nulls), )) @@ -4068,9 +4079,13 @@ mod test { EMPTY_VARIANT_METADATA_BYTES, all_nulls_values.len(), )); - let variant_array: ArrayRef = - VariantArray::from_parts(metadata, None, Some(Arc::new(typed_value_struct)), None) - .into(); + let variant_array: ArrayRef = VariantArray::from_parts( + Arc::new(metadata) as ArrayRef, + None, + Some(Arc::new(typed_value_struct)), + None, + ) + .into(); // Case 1: all-null primitive column should reuse the typed_value Arc directly let all_nulls_field_ref = FieldRef::from(Field::new("result", DataType::Int32, true)); diff --git a/parquet-variant-compute/src/variant_to_arrow.rs b/parquet-variant-compute/src/variant_to_arrow.rs index dd054a5f7d82..6d1626640c10 100644 --- a/parquet-variant-compute/src/variant_to_arrow.rs +++ b/parquet-variant-compute/src/variant_to_arrow.rs @@ -26,11 +26,10 @@ use crate::type_conversion::{ use crate::variant_array::ShreddedVariantFieldArray; use crate::{VariantArray, VariantValueArrayBuilder}; use arrow::array::{ - ArrayRef, ArrowNativeTypeOp, BinaryBuilder, BinaryLikeArrayBuilder, BinaryViewArray, - BinaryViewBuilder, BooleanBuilder, FixedSizeBinaryBuilder, GenericListArray, - GenericListViewArray, LargeBinaryBuilder, LargeStringBuilder, NullArray, NullBufferBuilder, - OffsetSizeTrait, PrimitiveBuilder, StringBuilder, StringLikeArrayBuilder, StringViewBuilder, - StructArray, + ArrayRef, ArrowNativeTypeOp, BinaryBuilder, BinaryLikeArrayBuilder, BinaryViewBuilder, + BooleanBuilder, FixedSizeBinaryBuilder, GenericListArray, GenericListViewArray, + LargeBinaryBuilder, LargeStringBuilder, NullArray, NullBufferBuilder, OffsetSizeTrait, + PrimitiveBuilder, StringBuilder, StringLikeArrayBuilder, StringViewBuilder, StructArray, }; use arrow::buffer::{OffsetBuffer, ScalarBuffer}; use arrow::compute::{CastOptions, DecimalCast}; @@ -119,7 +118,7 @@ fn make_typed_variant_to_arrow_row_builder<'a>( } pub(crate) fn make_variant_to_arrow_row_builder<'a>( - metadata: &BinaryViewArray, + metadata: &ArrayRef, path: VariantPath<'a>, data_type: Option<&'a DataType>, cast_options: &'a CastOptions, @@ -924,7 +923,7 @@ impl<'a> ListElementBuilder<'a> { Self::Shredded(b) => { let (value, typed_value, nulls) = b.finish()?; Ok(ArrayRef::from(ShreddedVariantFieldArray::from_parts( - Some(value), + Some(Arc::new(value)), Some(typed_value), nulls, ))) @@ -1052,13 +1051,13 @@ where /// Builder for creating VariantArray output (for path extraction without type conversion) pub(crate) struct VariantToBinaryVariantArrowRowBuilder { - metadata: BinaryViewArray, + metadata: ArrayRef, builder: VariantValueArrayBuilder, nulls: NullBufferBuilder, } impl VariantToBinaryVariantArrowRowBuilder { - fn new(metadata: BinaryViewArray, capacity: usize) -> Self { + fn new(metadata: ArrayRef, capacity: usize) -> Self { Self { metadata, builder: VariantValueArrayBuilder::new(capacity), @@ -1083,7 +1082,7 @@ impl VariantToBinaryVariantArrowRowBuilder { fn finish(mut self) -> Result { let variant_array = VariantArray::from_parts( self.metadata, - Some(self.builder.build()?), + Some(Arc::new(self.builder.build()?)), None, // no typed_value column self.nulls.finish(), ); From ac09ef5e41ad0faf34cf99836adbfc8fc050f9a5 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Mon, 30 Mar 2026 14:08:41 +0100 Subject: [PATCH 2/8] CR comment - option instead of panic --- parquet-variant-compute/src/shred_variant.rs | 54 +++++++++---------- .../src/unshred_variant.rs | 9 ++-- parquet-variant-compute/src/variant_array.rs | 25 +++++---- 3 files changed, 45 insertions(+), 43 deletions(-) diff --git a/parquet-variant-compute/src/shred_variant.rs b/parquet-variant-compute/src/shred_variant.rs index 48a6b12bf062..934a71aa67a7 100644 --- a/parquet-variant-compute/src/shred_variant.rs +++ b/parquet-variant-compute/src/shred_variant.rs @@ -917,7 +917,7 @@ mod tests { Some(expected_variant) => { assert!(fallback_value.is_valid(idx)); let metadata_bytes = - binary_array_value(fallback_metadata.as_ref(), idx); + binary_array_value(fallback_metadata.as_ref(), idx).unwrap(); let metadata_bytes = if fallback_metadata.is_valid(idx) && !metadata_bytes.is_empty() { metadata_bytes @@ -927,7 +927,7 @@ mod tests { assert_eq!( Variant::new( metadata_bytes, - binary_array_value(fallback_value.as_ref(), idx) + binary_array_value(fallback_value.as_ref(), idx).unwrap() ), expected_variant.clone() ); @@ -995,7 +995,7 @@ mod tests { assert_eq!( Variant::new( EMPTY_VARIANT_METADATA_BYTES, - binary_array_value(element_fallbacks.as_ref(), idx) + binary_array_value(element_fallbacks.as_ref(), idx).unwrap() ), expected_variant.clone() ); @@ -1257,8 +1257,8 @@ mod tests { assert!(typed_value_field.is_null(1)); // typed_value should be null assert_eq!( Variant::new( - binary_array_value(metadata_field.as_ref(), 1), - binary_array_value(value_field.as_ref(), 1) + binary_array_value(metadata_field.as_ref(), 1).unwrap(), + binary_array_value(value_field.as_ref(), 1).unwrap() ), Variant::from("hello") ); @@ -1276,8 +1276,8 @@ mod tests { assert!(!value_field.is_null(4)); // should contain Variant::Null assert_eq!( Variant::new( - binary_array_value(metadata_field.as_ref(), 4), - binary_array_value(value_field.as_ref(), 4) + binary_array_value(metadata_field.as_ref(), 4).unwrap(), + binary_array_value(value_field.as_ref(), 4).unwrap() ), Variant::Null ); @@ -1356,8 +1356,8 @@ mod tests { assert!(typed_value.is_null(1)); assert_eq!( Variant::new( - binary_array_value(metadata.as_ref(), 1), - binary_array_value(value.as_ref(), 1) + binary_array_value(metadata.as_ref(), 1).unwrap(), + binary_array_value(value.as_ref(), 1).unwrap() ), Variant::from(42i64) ); @@ -1373,8 +1373,8 @@ mod tests { assert!(typed_value.is_null(3)); assert_eq!( Variant::new( - binary_array_value(metadata.as_ref(), 3), - binary_array_value(value.as_ref(), 3) + binary_array_value(metadata.as_ref(), 3).unwrap(), + binary_array_value(value.as_ref(), 3).unwrap() ), Variant::Null ); @@ -1418,8 +1418,8 @@ mod tests { assert!(typed_value.is_null(1)); assert_eq!( Variant::new( - binary_array_value(metadata.as_ref(), 1), - binary_array_value(value.as_ref(), 1) + binary_array_value(metadata.as_ref(), 1).unwrap(), + binary_array_value(value.as_ref(), 1).unwrap() ), Variant::from("not_binary") ); @@ -1435,8 +1435,8 @@ mod tests { assert!(typed_value.is_null(3)); assert_eq!( Variant::new( - binary_array_value(metadata.as_ref(), 3), - binary_array_value(value.as_ref(), 3) + binary_array_value(metadata.as_ref(), 3).unwrap(), + binary_array_value(value.as_ref(), 3).unwrap() ), Variant::Null ); @@ -1825,7 +1825,7 @@ mod tests { assert_eq!( Variant::new( EMPTY_VARIANT_METADATA_BYTES, - binary_array_value(id_values.as_ref(), 1) + binary_array_value(id_values.as_ref(), 1).unwrap() ), Variant::Null ); @@ -1945,8 +1945,8 @@ mod tests { value: &'v dyn Array, ) -> Variant<'m, 'v> { Variant::new( - binary_array_value(metadata, i), - binary_array_value(value, i), + binary_array_value(metadata, i).unwrap(), + binary_array_value(value, i).unwrap(), ) } let expect = |i, expected_result: Option>| { @@ -2143,7 +2143,7 @@ mod tests { // Helper to correctly create a variant object using a row's existing metadata let object_with_foo_field = |i| { use parquet_variant::{ParentState, ValueBuilder, VariantMetadata}; - let metadata = VariantMetadata::new(binary_array_value(metadata.as_ref(), i)); + let metadata = VariantMetadata::new(binary_array_value(metadata.as_ref(), i).unwrap()); let mut metadata_builder = ReadOnlyMetadataBuilder::new(&metadata); let mut value_builder = ValueBuilder::new(); let state = ParentState::variant(&mut value_builder, &mut metadata_builder); @@ -2243,8 +2243,8 @@ mod tests { assert!(value_field.is_valid(3)); assert_eq!( Variant::new( - binary_array_value(result.metadata_field().as_ref(), 3), - binary_array_value(value_field.as_ref(), 3) + binary_array_value(result.metadata_field().as_ref(), 3).unwrap(), + binary_array_value(value_field.as_ref(), 3).unwrap() ), Variant::from("not an object") ); @@ -2427,8 +2427,8 @@ mod tests { // Verify the value field contains the name field let row_1_variant = Variant::new( - binary_array_value(metadata.as_ref(), 1), - binary_array_value(value.as_ref(), 1), + binary_array_value(metadata.as_ref(), 1).unwrap(), + binary_array_value(value.as_ref(), 1).unwrap(), ); let Variant::Object(obj) = row_1_variant else { panic!("Expected object"); @@ -2462,8 +2462,8 @@ mod tests { assert!(session_id_value.is_valid(3)); // type mismatch, stored in value assert!(session_id_typed_value.is_null(3)); let session_id_variant = Variant::new( - binary_array_value(metadata.as_ref(), 3), - binary_array_value(session_id_value.as_ref(), 3), + binary_array_value(metadata.as_ref(), 3).unwrap(), + binary_array_value(session_id_value.as_ref(), 3).unwrap(), ); assert_eq!(session_id_variant, Variant::from("not-a-uuid")); @@ -2476,8 +2476,8 @@ mod tests { assert!(id_value.is_valid(4)); // type mismatch, stored in value assert!(id_typed_value.is_null(4)); let id_variant = Variant::new( - binary_array_value(metadata.as_ref(), 4), - binary_array_value(id_value.as_ref(), 4), + binary_array_value(metadata.as_ref(), 4).unwrap(), + binary_array_value(id_value.as_ref(), 4).unwrap(), ); assert_eq!(id_variant, Variant::from(12345i64)); diff --git a/parquet-variant-compute/src/unshred_variant.rs b/parquet-variant-compute/src/unshred_variant.rs index 15aaa671d5c7..a05ee348d57e 100644 --- a/parquet-variant-compute/src/unshred_variant.rs +++ b/parquet-variant-compute/src/unshred_variant.rs @@ -76,7 +76,8 @@ pub fn unshred_variant(array: &VariantArray) -> Result { if array.is_null(i) { value_builder.append_null(); } else { - let metadata_bytes = binary_array_value(metadata.as_ref(), i); + let metadata_bytes = binary_array_value(metadata.as_ref(), i) + .expect("metadata field must be a binary-like array"); let metadata = VariantMetadata::new(metadata_bytes); let mut value_builder = value_builder.builder_ext(&metadata); row_builder.append_row(&mut value_builder, &metadata, i)?; @@ -329,7 +330,8 @@ impl<'a> ValueOnlyUnshredVariantBuilder<'a> { if self.value.is_null(index) { builder.append_null(); } else { - let value_bytes = binary_array_value(self.value.as_ref(), index); + let value_bytes = binary_array_value(self.value.as_ref(), index) + .expect("value field must be a binary-like array"); let variant = Variant::new_with_metadata(metadata.clone(), value_bytes); builder.append_value(variant); } @@ -353,7 +355,8 @@ macro_rules! handle_unshredded_case { ($self:expr, $builder:expr, $metadata:expr, $index:expr, $partial_shredding:expr) => {{ let value = $self.value.as_ref().filter(|v| v.is_valid($index)); let value = value.map(|v| { - let bytes = binary_array_value(v.as_ref(), $index); + let bytes = binary_array_value(v.as_ref(), $index) + .expect("value field must be a binary-like array"); Variant::new_with_metadata($metadata.clone(), bytes) }); diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index e2c35a44e0bc..4d231374f965 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -41,17 +41,13 @@ use parquet_variant::{ use std::borrow::Cow; use std::sync::Arc; -/// Returns the raw bytes at the given index from a binary-like array. -/// -/// # Panics -/// Panics if the array is not `Binary`, `LargeBinary`, or `BinaryView`, -/// or if `index` is out of bounds. -pub(crate) fn binary_array_value(array: &dyn Array, index: usize) -> &[u8] { +/// Returns the raw bytes at the given index from a binary-like array, return `None` if the array isn't binary-like. +pub(crate) fn binary_array_value(array: &dyn Array, index: usize) -> Option<&[u8]> { match array.data_type() { - DataType::Binary => array.as_binary::().value(index), - DataType::LargeBinary => array.as_binary::().value(index), - DataType::BinaryView => array.as_binary_view().value(index), - other => panic!("Expected Binary, LargeBinary, or BinaryView array, got {other}"), + DataType::Binary => Some(array.as_binary::().value(index)), + DataType::LargeBinary => Some(array.as_binary::().value(index)), + DataType::BinaryView => Some(array.as_binary_view().value(index)), + _ => None, } } @@ -401,8 +397,10 @@ impl VariantArray { } // Otherwise fall back to value, if available (_, Some(value)) if value.is_valid(index) => { - let metadata = binary_array_value(self.metadata.as_ref(), index); - let value = binary_array_value(value.as_ref(), index); + let metadata = binary_array_value(self.metadata.as_ref(), index) + .expect("metadata field must be a binary-like array"); + let value = binary_array_value(value.as_ref(), index) + .expect("value field must be a binary-like array"); Ok(Variant::new(metadata, value)) } // It is technically invalid for neither value nor typed_value fields to be available, @@ -986,7 +984,8 @@ fn typed_value_to_variant<'a>( Ok(Uuid::from_slice(value).unwrap().into()) // unwrap is safe: slice is always 16 bytes } DataType::Binary | DataType::LargeBinary | DataType::BinaryView => { - let value = binary_array_value(typed_value.as_ref(), index); + let value = binary_array_value(typed_value.as_ref(), index) + .expect("match arm guarantees the array is binary-like"); Ok(Variant::from(value)) } DataType::Utf8 => { From 4d518d63b55c4a70cabe235c5df577c7a1c18490 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Wed, 1 Apr 2026 13:31:10 +0100 Subject: [PATCH 3/8] Erro handling Signed-off-by: Adam Gutglick --- .../src/unshred_variant.rs | 30 ++++++++---- parquet-variant-compute/src/variant_array.rs | 46 ++++++++++++------- 2 files changed, 51 insertions(+), 25 deletions(-) diff --git a/parquet-variant-compute/src/unshred_variant.rs b/parquet-variant-compute/src/unshred_variant.rs index a05ee348d57e..ecffd48bc41e 100644 --- a/parquet-variant-compute/src/unshred_variant.rs +++ b/parquet-variant-compute/src/unshred_variant.rs @@ -76,8 +76,11 @@ pub fn unshred_variant(array: &VariantArray) -> Result { if array.is_null(i) { value_builder.append_null(); } else { - let metadata_bytes = binary_array_value(metadata.as_ref(), i) - .expect("metadata field must be a binary-like array"); + let metadata_bytes = binary_array_value(metadata.as_ref(), i).ok_or_else(|| { + ArrowError::InvalidArgumentError( + "metadata field must be a binary-like array".to_string(), + ) + })?; let metadata = VariantMetadata::new(metadata_bytes); let mut value_builder = value_builder.builder_ext(&metadata); row_builder.append_row(&mut value_builder, &metadata, i)?; @@ -330,8 +333,11 @@ impl<'a> ValueOnlyUnshredVariantBuilder<'a> { if self.value.is_null(index) { builder.append_null(); } else { - let value_bytes = binary_array_value(self.value.as_ref(), index) - .expect("value field must be a binary-like array"); + let value_bytes = binary_array_value(self.value.as_ref(), index).ok_or_else(|| { + ArrowError::InvalidArgumentError( + "value field must be a binary-like array".to_string(), + ) + })?; let variant = Variant::new_with_metadata(metadata.clone(), value_bytes); builder.append_value(variant); } @@ -354,11 +360,17 @@ trait AppendToVariantBuilder: Array { macro_rules! handle_unshredded_case { ($self:expr, $builder:expr, $metadata:expr, $index:expr, $partial_shredding:expr) => {{ let value = $self.value.as_ref().filter(|v| v.is_valid($index)); - let value = value.map(|v| { - let bytes = binary_array_value(v.as_ref(), $index) - .expect("value field must be a binary-like array"); - Variant::new_with_metadata($metadata.clone(), bytes) - }); + let value = value + .map(|v| { + let bytes = binary_array_value(v.as_ref(), $index).ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "value field must be a binary-like array, instead got {}", + v.data_type(), + )) + })?; + Result::Ok(Variant::new_with_metadata($metadata.clone(), bytes)) + }) + .transpose()?; // If typed_value is null, handle unshredded case and return early if $self.typed_value.is_null($index) { diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index 4d231374f965..5c2412328916 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -51,17 +51,12 @@ pub(crate) fn binary_array_value(array: &dyn Array, index: usize) -> Option<&[u8 } } -/// Returns `true` if the data type is `Binary`, `LargeBinary`, or `BinaryView`. -fn is_binary_like(dt: &DataType) -> bool { - matches!( - dt, - DataType::Binary | DataType::LargeBinary | DataType::BinaryView - ) -} - /// Validates that an array has a binary-like data type. fn validate_binary_array(array: &dyn Array, field_name: &str) -> Result<()> { - if is_binary_like(array.data_type()) { + if matches!( + array.data_type(), + DataType::Binary | DataType::LargeBinary | DataType::BinaryView + ) { Ok(()) } else { Err(ArrowError::InvalidArgumentError(format!( @@ -397,10 +392,19 @@ impl VariantArray { } // Otherwise fall back to value, if available (_, Some(value)) if value.is_valid(index) => { - let metadata = binary_array_value(self.metadata.as_ref(), index) - .expect("metadata field must be a binary-like array"); - let value = binary_array_value(value.as_ref(), index) - .expect("value field must be a binary-like array"); + let metadata = + binary_array_value(self.metadata.as_ref(), index).ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "metadata field must be a binary-like array, instead got {}", + self.metadata.data_type(), + )) + })?; + let value = binary_array_value(value.as_ref(), index).ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "value field must be a binary-like array, instead got {}", + value.data_type(), + )) + })?; Ok(Variant::new(metadata, value)) } // It is technically invalid for neither value nor typed_value fields to be available, @@ -983,9 +987,19 @@ fn typed_value_to_variant<'a>( let value = array.value(index); Ok(Uuid::from_slice(value).unwrap().into()) // unwrap is safe: slice is always 16 bytes } - DataType::Binary | DataType::LargeBinary | DataType::BinaryView => { - let value = binary_array_value(typed_value.as_ref(), index) - .expect("match arm guarantees the array is binary-like"); + DataType::Binary => { + let array = typed_value.as_binary::(); + let value = array.value(index); + Ok(Variant::from(value)) + } + DataType::LargeBinary => { + let array = typed_value.as_binary::(); + let value = array.value(index); + Ok(Variant::from(value)) + } + DataType::BinaryView => { + let array = typed_value.as_binary_view(); + let value = array.value(index); Ok(Variant::from(value)) } DataType::Utf8 => { From cf0d40e3500eb3588fbc91051164c71b734a25cc Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Thu, 9 Apr 2026 13:52:07 +0100 Subject: [PATCH 4/8] Fix small thing --- parquet-variant-compute/src/variant_get.rs | 67 +++++++++++++++++++--- 1 file changed, 59 insertions(+), 8 deletions(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index be1aae21e580..6a5c46233694 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -15,7 +15,8 @@ // specific language governing permissions and limitations // under the License. use arrow::{ - array::{self, Array, ArrayRef, StructArray}, + array::{self, Array, ArrayRef, StructArray, make_array}, + buffer::NullBuffer, compute::CastOptions, datatypes::Field, error::Result, @@ -123,7 +124,7 @@ fn shredded_get_path( let make_target_variant = |value: Option, typed_value: Option, - accumulated_nulls: Option| { + accumulated_nulls: Option| { let metadata = input.metadata_field().clone(); VariantArray::from_parts(metadata, value, typed_value, accumulated_nulls) }; @@ -168,10 +169,8 @@ fn shredded_get_path( ShreddedPathStep::Success(state) => { // Union nulls from the typed_value we just accessed if let Some(typed_value) = shredding_state.typed_value_field() { - accumulated_nulls = arrow::buffer::NullBuffer::union( - accumulated_nulls.as_ref(), - typed_value.nulls(), - ); + accumulated_nulls = + NullBuffer::union(accumulated_nulls.as_ref(), typed_value.nulls()); } shredding_state = state; path_index += 1; @@ -258,6 +257,7 @@ fn try_perfect_shredding(variant_array: &VariantArray, as_field: &Field) -> Opti return None; } let typed_value = variant_array.typed_value_field()?; + if typed_value.data_type() == as_field.data_type() && variant_array .value_field() @@ -268,9 +268,25 @@ fn try_perfect_shredding(variant_array: &VariantArray, as_field: &Field) -> Opti // 2. If every row in the `value` column is null // This is a perfect shredding, where the value is entirely shredded out, - // so we can just return the typed value. - return Some(typed_value.clone()); + // so we can just return the typed value after merging the accumulated nulls. + let parent_nulls = variant_array.nulls(); + + let target_array = if parent_nulls.is_none() { + typed_value.clone() + } else { + let merged_nulls = NullBuffer::union(parent_nulls, typed_value.nulls()); + let data = typed_value + .to_data() + .into_builder() + .nulls(merged_nulls) + .build() + .ok()?; + make_array(data) + }; + + return Some(target_array); } + None } @@ -1702,6 +1718,41 @@ mod test { ]) ); + #[test] + fn test_variant_get_perfectly_shredded_binary_preserves_top_level_nulls() { + let metadata = + BinaryViewArray::from_iter_values(std::iter::repeat_n(EMPTY_VARIANT_METADATA_BYTES, 3)); + let typed_value: ArrayRef = Arc::new(BinaryArray::from(vec![ + Some(b"Apache" as &[u8]), + Some(b"masked-null" as &[u8]), + Some(b"Parquet-variant" as &[u8]), + ])); + let variant_array: ArrayRef = VariantArray::from_parts( + Arc::new(metadata) as _, + None, + Some(typed_value), + Some(NullBuffer::from(vec![true, false, true])), + ) + .into(); + + let result = variant_get( + &variant_array, + GetOptions::new().with_as_type(Some(FieldRef::from(Field::new( + "result", + DataType::Binary, + true, + )))), + ) + .unwrap(); + + let result = result.as_binary::(); + assert_eq!(result.len(), 3); + assert_eq!(result.null_count(), 1); + assert_eq!(result.value(0), b"Apache"); + assert!(result.is_null(1)); + assert_eq!(result.value(2), b"Parquet-variant"); + } + /// Return a VariantArray that represents an "all null" variant /// for the following example (3 null values): /// From 7635dd6887f7c5141af981e68ab17d80f5deae1c Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Mon, 13 Apr 2026 10:45:32 +0100 Subject: [PATCH 5/8] Update parquet-variant-compute/src/variant_array.rs Co-authored-by: Ryan Johnson --- parquet-variant-compute/src/variant_array.rs | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index 5c2412328916..0e890c841192 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -53,16 +53,12 @@ pub(crate) fn binary_array_value(array: &dyn Array, index: usize) -> Option<&[u8 /// Validates that an array has a binary-like data type. fn validate_binary_array(array: &dyn Array, field_name: &str) -> Result<()> { - if matches!( - array.data_type(), - DataType::Binary | DataType::LargeBinary | DataType::BinaryView - ) { - Ok(()) - } else { - Err(ArrowError::InvalidArgumentError(format!( + match array.data_type() { + DataType::Binary | DataType::LargeBinary | DataType::BinaryView => Ok(()), + _ => Err(ArrowError::InvalidArgumentError(format!( "VariantArray '{field_name}' field must be Binary, LargeBinary, or BinaryView, got {}", array.data_type() - ))) + ))), } } From 3890e891fdc1e879aef12637522387689f528ef8 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Mon, 13 Apr 2026 18:49:46 +0100 Subject: [PATCH 6/8] More CR comments + pull some stuff to other PR --- parquet-variant-compute/src/shred_variant.rs | 37 ++------ parquet-variant-compute/src/variant_array.rs | 40 +++++---- parquet-variant-compute/src/variant_get.rs | 92 ++++---------------- 3 files changed, 50 insertions(+), 119 deletions(-) diff --git a/parquet-variant-compute/src/shred_variant.rs b/parquet-variant-compute/src/shred_variant.rs index 934a71aa67a7..98a88c4a0e4c 100644 --- a/parquet-variant-compute/src/shred_variant.rs +++ b/parquet-variant-compute/src/shred_variant.rs @@ -692,7 +692,7 @@ impl VariantSchemaNode { mod tests { use super::*; use crate::VariantArrayBuilder; - use crate::variant_array::binary_array_value; + use crate::variant_array::{binary_array_value, variant_from_arrays_at}; use arrow::array::{ Array, BinaryViewArray, FixedSizeBinaryArray, Float64Array, GenericListArray, GenericListViewArray, Int64Array, LargeBinaryArray, LargeStringArray, ListArray, @@ -1256,10 +1256,7 @@ mod tests { assert!(!value_field.is_null(1)); // value should contain original assert!(typed_value_field.is_null(1)); // typed_value should be null assert_eq!( - Variant::new( - binary_array_value(metadata_field.as_ref(), 1).unwrap(), - binary_array_value(value_field.as_ref(), 1).unwrap() - ), + variant_from_arrays_at(metadata_field, value_field, 1).unwrap(), Variant::from("hello") ); @@ -1275,10 +1272,7 @@ mod tests { assert!(!result.is_null(4)); assert!(!value_field.is_null(4)); // should contain Variant::Null assert_eq!( - Variant::new( - binary_array_value(metadata_field.as_ref(), 4).unwrap(), - binary_array_value(value_field.as_ref(), 4).unwrap() - ), + variant_from_arrays_at(metadata_field, value_field, 4).unwrap(), Variant::Null ); assert!(typed_value_field.is_null(4)); @@ -1355,10 +1349,7 @@ mod tests { assert!(value.is_valid(1)); assert!(typed_value.is_null(1)); assert_eq!( - Variant::new( - binary_array_value(metadata.as_ref(), 1).unwrap(), - binary_array_value(value.as_ref(), 1).unwrap() - ), + variant_from_arrays_at(metadata, value, 1).unwrap(), Variant::from(42i64) ); @@ -1372,10 +1363,7 @@ mod tests { assert!(value.is_valid(3)); assert!(typed_value.is_null(3)); assert_eq!( - Variant::new( - binary_array_value(metadata.as_ref(), 3).unwrap(), - binary_array_value(value.as_ref(), 3).unwrap() - ), + variant_from_arrays_at(metadata, value, 3).unwrap(), Variant::Null ); @@ -1417,10 +1405,7 @@ mod tests { assert!(value.is_valid(1)); assert!(typed_value.is_null(1)); assert_eq!( - Variant::new( - binary_array_value(metadata.as_ref(), 1).unwrap(), - binary_array_value(value.as_ref(), 1).unwrap() - ), + variant_from_arrays_at(metadata, value, 1).unwrap(), Variant::from("not_binary") ); @@ -1434,10 +1419,7 @@ mod tests { assert!(value.is_valid(3)); assert!(typed_value.is_null(3)); assert_eq!( - Variant::new( - binary_array_value(metadata.as_ref(), 3).unwrap(), - binary_array_value(value.as_ref(), 3).unwrap() - ), + variant_from_arrays_at(metadata, value, 3).unwrap(), Variant::Null ); @@ -1944,10 +1926,7 @@ mod tests { metadata: &'m dyn Array, value: &'v dyn Array, ) -> Variant<'m, 'v> { - Variant::new( - binary_array_value(metadata, i).unwrap(), - binary_array_value(value, i).unwrap(), - ) + variant_from_arrays_at(metadata, value, i).unwrap() } let expect = |i, expected_result: Option>| { match expected_result { diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index 0e890c841192..3e7027d0a070 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -51,6 +51,18 @@ pub(crate) fn binary_array_value(array: &dyn Array, index: usize) -> Option<&[u8 } } +/// Returns a [`Variant`] from a `metadata` and `value` byte arrays, returns `None` +/// if one of them is of invalid type. +pub(crate) fn variant_from_arrays_at<'m, 'v>( + metadata: &'m dyn Array, + value: &'v dyn Array, + index: usize, +) -> Option> { + let metadata = binary_array_value(metadata, index)?; + let value = binary_array_value(value, index)?; + Some(Variant::new(metadata, value)) +} + /// Validates that an array has a binary-like data type. fn validate_binary_array(array: &dyn Array, field_name: &str) -> Result<()> { match array.data_type() { @@ -387,22 +399,18 @@ impl VariantArray { typed_value_to_variant(typed_value, value, index) } // Otherwise fall back to value, if available - (_, Some(value)) if value.is_valid(index) => { - let metadata = - binary_array_value(self.metadata.as_ref(), index).ok_or_else(|| { - ArrowError::InvalidArgumentError(format!( - "metadata field must be a binary-like array, instead got {}", - self.metadata.data_type(), - )) - })?; - let value = binary_array_value(value.as_ref(), index).ok_or_else(|| { - ArrowError::InvalidArgumentError(format!( - "value field must be a binary-like array, instead got {}", - value.data_type(), - )) - })?; - Ok(Variant::new(metadata, value)) - } + (_, Some(value)) if value.is_valid(index) => variant_from_arrays_at( + &self.metadata, + value, + index, + ) + .ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "metadata and value fields must be binary-like arrays, instead got {} and {}", + self.metadata.data_type(), + value.data_type() + )) + }), // It is technically invalid for neither value nor typed_value fields to be available, // but the spec specifically requires readers to return Variant::Null in this case. _ => Ok(Variant::Null), diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 6a5c46233694..2b17bb9d8b09 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. use arrow::{ - array::{self, Array, ArrayRef, StructArray, make_array}, + array::{self, Array, ArrayRef, StructArray}, buffer::NullBuffer, compute::CastOptions, datatypes::Field, @@ -268,23 +268,8 @@ fn try_perfect_shredding(variant_array: &VariantArray, as_field: &Field) -> Opti // 2. If every row in the `value` column is null // This is a perfect shredding, where the value is entirely shredded out, - // so we can just return the typed value after merging the accumulated nulls. - let parent_nulls = variant_array.nulls(); - - let target_array = if parent_nulls.is_none() { - typed_value.clone() - } else { - let merged_nulls = NullBuffer::union(parent_nulls, typed_value.nulls()); - let data = typed_value - .to_data() - .into_builder() - .nulls(merged_nulls) - .build() - .ok()?; - make_array(data) - }; - - return Some(target_array); + // so we can just return the typed value. + return Some(typed_value.clone()); } None @@ -1071,13 +1056,7 @@ mod test { EMPTY_VARIANT_METADATA_BYTES, typed_value.len(), )); - VariantArray::from_parts( - Arc::new(metadata) as ArrayRef, - None, - Some(typed_value), - None, - ) - .into() + VariantArray::from_parts(Arc::new(metadata), None, Some(typed_value), None).into() } }; } @@ -1718,41 +1697,6 @@ mod test { ]) ); - #[test] - fn test_variant_get_perfectly_shredded_binary_preserves_top_level_nulls() { - let metadata = - BinaryViewArray::from_iter_values(std::iter::repeat_n(EMPTY_VARIANT_METADATA_BYTES, 3)); - let typed_value: ArrayRef = Arc::new(BinaryArray::from(vec![ - Some(b"Apache" as &[u8]), - Some(b"masked-null" as &[u8]), - Some(b"Parquet-variant" as &[u8]), - ])); - let variant_array: ArrayRef = VariantArray::from_parts( - Arc::new(metadata) as _, - None, - Some(typed_value), - Some(NullBuffer::from(vec![true, false, true])), - ) - .into(); - - let result = variant_get( - &variant_array, - GetOptions::new().with_as_type(Some(FieldRef::from(Field::new( - "result", - DataType::Binary, - true, - )))), - ) - .unwrap(); - - let result = result.as_binary::(); - assert_eq!(result.len(), 3); - assert_eq!(result.null_count(), 1); - assert_eq!(result.value(0), b"Apache"); - assert!(result.is_null(1)); - assert_eq!(result.value(2), b"Parquet-variant"); - } - /// Return a VariantArray that represents an "all null" variant /// for the following example (3 null values): /// @@ -1781,7 +1725,7 @@ mod test { BinaryViewArray::from_iter_values(std::iter::repeat_n(EMPTY_VARIANT_METADATA_BYTES, 3)); ArrayRef::from(VariantArray::from_parts( - Arc::new(metadata) as ArrayRef, + Arc::new(metadata), None, None, Some(nulls), @@ -1891,8 +1835,8 @@ mod test { // Create the main VariantArray ArrayRef::from(VariantArray::from_parts( - Arc::new(metadata_array) as ArrayRef, - Some(Arc::new(value_array) as ArrayRef), + Arc::new(metadata_array), + Some(Arc::new(value_array)), Some(Arc::new(typed_value_struct)), None, )) @@ -2268,8 +2212,8 @@ mod test { // Build final VariantArray ArrayRef::from(VariantArray::from_parts( - Arc::new(metadata_array) as ArrayRef, - Some(Arc::new(value_array) as ArrayRef), + Arc::new(metadata_array), + Some(Arc::new(value_array)), Some(Arc::new(typed_value_struct)), None, )) @@ -2379,8 +2323,8 @@ mod test { // Build final VariantArray ArrayRef::from(VariantArray::from_parts( - Arc::new(metadata_array) as ArrayRef, - Some(Arc::new(value_array) as ArrayRef), + Arc::new(metadata_array), + Some(Arc::new(value_array)), Some(Arc::new(typed_value_struct)), None, )) @@ -2510,8 +2454,8 @@ mod test { // Build final VariantArray ArrayRef::from(VariantArray::from_parts( - Arc::new(metadata_array) as ArrayRef, - Some(Arc::new(value_array) as ArrayRef), + Arc::new(metadata_array), + Some(Arc::new(value_array)), Some(Arc::new(typed_value_struct)), None, )) @@ -3324,7 +3268,7 @@ mod test { // Build final VariantArray with top-level nulls ArrayRef::from(VariantArray::from_parts( - Arc::new(metadata_array) as ArrayRef, + Arc::new(metadata_array), None, Some(Arc::new(typed_value_struct)), Some(nulls), @@ -3383,7 +3327,7 @@ mod test { false, // row 3: top-level NULL ]); ArrayRef::from(VariantArray::from_parts( - Arc::new(metadata_array) as ArrayRef, + Arc::new(metadata_array), None, Some(Arc::new(typed_value)), Some(nulls), @@ -3452,8 +3396,8 @@ mod test { // Top-level null is encoded in the main StructArray's null mask let variant_nulls = NullBuffer::from(vec![true, true, true, false]); // Row 3 is top-level null ArrayRef::from(VariantArray::from_parts( - Arc::new(metadata_array) as ArrayRef, - Some(Arc::new(value_array) as ArrayRef), + Arc::new(metadata_array), + Some(Arc::new(value_array)), Some(Arc::new(typed_value_struct)), Some(variant_nulls), )) @@ -4131,7 +4075,7 @@ mod test { all_nulls_values.len(), )); let variant_array: ArrayRef = VariantArray::from_parts( - Arc::new(metadata) as ArrayRef, + Arc::new(metadata), None, Some(Arc::new(typed_value_struct)), None, From 413a7e7a76983a0e86b641dc58adb938d96b260e Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Mon, 13 Apr 2026 18:53:53 +0100 Subject: [PATCH 7/8] clean up unnecessery as --- parquet-variant-compute/src/shred_variant.rs | 12 ++++++------ parquet-variant-compute/src/variant_get.rs | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/parquet-variant-compute/src/shred_variant.rs b/parquet-variant-compute/src/shred_variant.rs index 98a88c4a0e4c..d7661ccb02a7 100644 --- a/parquet-variant-compute/src/shred_variant.rs +++ b/parquet-variant-compute/src/shred_variant.rs @@ -96,7 +96,7 @@ pub fn shred_variant(array: &VariantArray, as_type: &DataType) -> Result VariantToShreddedObjectVariantRowBuilder<'a> { for (field_name, typed_value_builder) in self.typed_value_builders { let (value, typed_value, nulls) = typed_value_builder.finish()?; let array = ShreddedVariantFieldArray::from_parts( - Some(Arc::new(value) as ArrayRef), + Some(Arc::new(value)), Some(typed_value), nulls, ); @@ -1142,7 +1142,7 @@ mod tests { #[test] fn test_all_null_input() { // Create VariantArray with no value field (all null case) - let metadata = Arc::new(BinaryViewArray::from_iter_values([&[1u8, 0u8]])) as ArrayRef; // minimal valid metadata + let metadata = Arc::new(BinaryViewArray::from_iter_values([&[1u8, 0u8]])); // minimal valid metadata let all_null_array = VariantArray::from_parts(metadata, None, None, None); let result = shred_variant(&all_null_array, &DataType::Int64).unwrap(); @@ -1698,11 +1698,11 @@ mod tests { let outer_metadata = Arc::new(BinaryViewArray::from_iter_values(std::iter::repeat_n( EMPTY_VARIANT_METADATA_BYTES, outer_elements.len(), - ))) as ArrayRef; + ))); let outer_variant = VariantArray::from_parts( outer_metadata, Some(outer_fallbacks.clone()), - Some(Arc::new(outer_values.clone()) as ArrayRef), + Some(Arc::new(outer_values.clone())), None, ); @@ -2245,7 +2245,7 @@ mod tests { Arc::new(BinaryViewArray::from_iter_values(std::iter::repeat_n( EMPTY_VARIANT_METADATA_BYTES, scores_field.len(), - ))) as ArrayRef, + ))), Some(scores_field.value_field().unwrap().clone()), Some(scores_field.typed_value_field().unwrap().clone()), None, diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 2b17bb9d8b09..b543dc584789 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -2303,7 +2303,7 @@ mod test { .unwrap(), ) as ArrayRef; let a_field_shredded = ShreddedVariantFieldArray::from_parts( - Some(Arc::new(a_value_array) as ArrayRef), + Some(Arc::new(a_value_array)), Some(a_inner_typed_value), None, ); @@ -2405,7 +2405,7 @@ mod test { .unwrap(), ) as ArrayRef; let b_field_shredded = ShreddedVariantFieldArray::from_parts( - Some(Arc::new(b_value_array) as ArrayRef), + Some(Arc::new(b_value_array)), Some(b_inner_typed_value), None, ); @@ -2434,7 +2434,7 @@ mod test { .unwrap(), ) as ArrayRef; let a_field_shredded = ShreddedVariantFieldArray::from_parts( - Some(Arc::new(a_value_array) as ArrayRef), + Some(Arc::new(a_value_array)), Some(a_inner_typed_value), None, ); From dfd3fd43e08d0049b3b1b86f2cc83016b90821b3 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Tue, 14 Apr 2026 12:48:32 +0100 Subject: [PATCH 8/8] Last cleanup --- parquet-variant-compute/src/shred_variant.rs | 20 ++++--------------- .../src/variant_array_builder.rs | 2 +- 2 files changed, 5 insertions(+), 17 deletions(-) diff --git a/parquet-variant-compute/src/shred_variant.rs b/parquet-variant-compute/src/shred_variant.rs index d7661ccb02a7..7b919660d69e 100644 --- a/parquet-variant-compute/src/shred_variant.rs +++ b/parquet-variant-compute/src/shred_variant.rs @@ -2221,10 +2221,7 @@ mod tests { assert!(value_field.is_null(2)); assert!(value_field.is_valid(3)); assert_eq!( - Variant::new( - binary_array_value(result.metadata_field().as_ref(), 3).unwrap(), - binary_array_value(value_field.as_ref(), 3).unwrap() - ), + variant_from_arrays_at(result.metadata_field(), value_field, 3).unwrap(), Variant::from("not an object") ); assert!(value_field.is_null(4)); @@ -2405,10 +2402,7 @@ mod tests { assert_eq!(session_id_typed_value.value(1), mock_uuid_3.as_bytes()); // Verify the value field contains the name field - let row_1_variant = Variant::new( - binary_array_value(metadata.as_ref(), 1).unwrap(), - binary_array_value(value.as_ref(), 1).unwrap(), - ); + let row_1_variant = variant_from_arrays_at(metadata, value, 1).unwrap(); let Variant::Object(obj) = row_1_variant else { panic!("Expected object"); }; @@ -2440,10 +2434,7 @@ mod tests { assert!(session_id_value.is_valid(3)); // type mismatch, stored in value assert!(session_id_typed_value.is_null(3)); - let session_id_variant = Variant::new( - binary_array_value(metadata.as_ref(), 3).unwrap(), - binary_array_value(session_id_value.as_ref(), 3).unwrap(), - ); + let session_id_variant = variant_from_arrays_at(metadata, session_id_value, 3).unwrap(); assert_eq!(session_id_variant, Variant::from("not-a-uuid")); // Row 4: Type mismatch - id is int64, not UUID @@ -2454,10 +2445,7 @@ mod tests { assert!(id_value.is_valid(4)); // type mismatch, stored in value assert!(id_typed_value.is_null(4)); - let id_variant = Variant::new( - binary_array_value(metadata.as_ref(), 4).unwrap(), - binary_array_value(id_value.as_ref(), 4).unwrap(), - ); + let id_variant = variant_from_arrays_at(metadata, id_value, 4).unwrap(); assert_eq!(id_variant, Variant::from(12345i64)); assert!(session_id_value.is_null(4)); diff --git a/parquet-variant-compute/src/variant_array_builder.rs b/parquet-variant-compute/src/variant_array_builder.rs index 2ef96180c357..4bea51550ddb 100644 --- a/parquet-variant-compute/src/variant_array_builder.rs +++ b/parquet-variant-compute/src/variant_array_builder.rs @@ -609,7 +609,7 @@ mod test { let array2 = VariantArray::from_parts( array.metadata_field().clone(), - Some(Arc::new(value_builder.build().unwrap()) as ArrayRef), + Some(Arc::new(value_builder.build().unwrap())), None, None, );