From ab77733a648eb4223a6e96ab52c2cac866e2ebdb Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Tue, 16 Sep 2025 04:03:48 -0700 Subject: [PATCH 01/16] [Variant] Add constants for empty variant metadata --- parquet-variant-compute/src/variant_get.rs | 17 +++++------ parquet-variant/src/variant.rs | 2 +- parquet-variant/src/variant/metadata.rs | 33 ++++++++++++++++++++++ 3 files changed, 43 insertions(+), 9 deletions(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 3ac6d2be6c72..bcac113669de 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -304,7 +304,7 @@ mod test { use arrow::buffer::NullBuffer; use arrow::compute::CastOptions; use arrow_schema::{DataType, Field, FieldRef, Fields}; - use parquet_variant::{Variant, VariantPath}; + use parquet_variant::{Variant, VariantPath, EMPTY_VARIANT_METADATA_BYTES}; use crate::json_to_variant; use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder}; @@ -701,8 +701,10 @@ mod test { fn $func() -> ArrayRef { // At the time of writing, the `VariantArrayBuilder` does not support shredding. // so we must construct the array manually. see https://github.com/apache/arrow-rs/issues/7895 - let (metadata, _value) = { parquet_variant::VariantBuilder::new().finish() }; - let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 3)); + let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n( + EMPTY_VARIANT_METADATA_BYTES, + 3, + )); let typed_value = $array_type::from(vec![ Some(<$primitive_type>::try_from(1u8).unwrap()), Some(<$primitive_type>::try_from(2u8).unwrap()), @@ -1032,8 +1034,6 @@ mod test { /// } /// ``` fn all_null_variant_array() -> ArrayRef { - let (metadata, _value) = { parquet_variant::VariantBuilder::new().finish() }; - let nulls = NullBuffer::from(vec![ false, // row 0 is null false, // row 1 is null @@ -1041,7 +1041,8 @@ mod test { ]); // metadata is the same for all rows (though they're all null) - let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 3)); + let metadata = + BinaryViewArray::from_iter_values(std::iter::repeat_n(EMPTY_VARIANT_METADATA_BYTES, 3)); let struct_array = StructArrayBuilder::new() .with_field("metadata", Arc::new(metadata), false) @@ -2502,8 +2503,8 @@ mod test { .build(); // Build final VariantArray with top-level nulls - let (metadata, _) = parquet_variant::VariantBuilder::new().finish(); - let metadata_array = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 4)); + let metadata_array = + BinaryViewArray::from_iter_values(std::iter::repeat_n(EMPTY_VARIANT_METADATA_BYTES, 4)); let nulls = NullBuffer::from(vec![ true, // row 0: inner struct exists with typed_value=42 true, // row 1: inner field NULL diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 3dae4daa0ff8..cc4c3bcadd66 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -17,7 +17,7 @@ pub use self::decimal::{VariantDecimal16, VariantDecimal4, VariantDecimal8}; pub use self::list::VariantList; -pub use self::metadata::VariantMetadata; +pub use self::metadata::{VariantMetadata, EMPTY_VARIANT_METADATA, EMPTY_VARIANT_METADATA_BYTES}; pub use self::object::VariantObject; use crate::decoder::{ self, get_basic_type, get_primitive_type, VariantBasicType, VariantPrimitiveType, diff --git a/parquet-variant/src/variant/metadata.rs b/parquet-variant/src/variant/metadata.rs index 1c9da6bcc0cf..e7cd838eb6cf 100644 --- a/parquet-variant/src/variant/metadata.rs +++ b/parquet-variant/src/variant/metadata.rs @@ -141,6 +141,39 @@ pub struct VariantMetadata<'m> { // could increase the size of Variant. All those size increases could hurt performance. const _: () = crate::utils::expect_size_of::(32); +/// The canonical byte slice corresponding to an empty metadata dictionary. +/// +/// ``` +/// # use parquet_variant::{EMPTY_VARIANT_METADATA_BYTES, VariantMetadata, WritableMetadataBuilder}; +/// let mut metadata_builder = WritableMetadataBuilder::default(); +/// metadata_builder.finish(); +/// let metadata_bytes = metadata_builder.into_inner(); +/// assert_eq!(&metadata_bytes, EMPTY_VARIANT_METADATA_BYTES); +/// ``` +pub const EMPTY_VARIANT_METADATA_BYTES: &[u8] = &[1, 0, 0]; + +/// The empty metadata dictionary. +/// +/// ``` +/// # use parquet_variant::{EMPTY_VARIANT_METADATA, VariantMetadata, WritableMetadataBuilder}; +/// let mut metadata_builder = WritableMetadataBuilder::default(); +/// metadata_builder.finish(); +/// let metadata_bytes = metadata_builder.into_inner(); +/// let empty_metadata = VariantMetadata::try_new(&metadata_bytes).unwrap(); +/// assert_eq!(empty_metadata, EMPTY_VARIANT_METADATA); +/// ``` +pub static EMPTY_VARIANT_METADATA: VariantMetadata = VariantMetadata { + bytes: EMPTY_VARIANT_METADATA_BYTES, + header: VariantMetadataHeader { + version: CORRECT_VERSION_VALUE, + is_sorted: false, + offset_size: OffsetSizeBytes::One, + }, + dictionary_size: 0, + first_value_byte: 3, + validated: true, +}; + impl<'m> VariantMetadata<'m> { /// Attempts to interpret `bytes` as a variant metadata instance, with full [validation] of all /// dictionary entries. From 428aae1f8ed523a702e6d3a188671a5845cdf4f0 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Tue, 16 Sep 2025 04:47:10 -0700 Subject: [PATCH 02/16] make const instead of static --- parquet-variant/src/variant/metadata.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet-variant/src/variant/metadata.rs b/parquet-variant/src/variant/metadata.rs index e7cd838eb6cf..941247c9f23d 100644 --- a/parquet-variant/src/variant/metadata.rs +++ b/parquet-variant/src/variant/metadata.rs @@ -162,7 +162,7 @@ pub const EMPTY_VARIANT_METADATA_BYTES: &[u8] = &[1, 0, 0]; /// let empty_metadata = VariantMetadata::try_new(&metadata_bytes).unwrap(); /// assert_eq!(empty_metadata, EMPTY_VARIANT_METADATA); /// ``` -pub static EMPTY_VARIANT_METADATA: VariantMetadata = VariantMetadata { +pub const EMPTY_VARIANT_METADATA: VariantMetadata = VariantMetadata { bytes: EMPTY_VARIANT_METADATA_BYTES, header: VariantMetadataHeader { version: CORRECT_VERSION_VALUE, From 97f99da9509a18b50b10ffd43bb86df5227453fd Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Wed, 10 Sep 2025 12:13:01 -0700 Subject: [PATCH 03/16] [Variant] Implement new VariantValueArrayBuilder --- .../src/variant_array_builder.rs | 176 +++++++++++++++++- parquet-variant/src/builder.rs | 2 +- parquet-variant/src/variant.rs | 2 +- 3 files changed, 177 insertions(+), 3 deletions(-) diff --git a/parquet-variant-compute/src/variant_array_builder.rs b/parquet-variant-compute/src/variant_array_builder.rs index 9779d4a06d4a..be0be2296e90 100644 --- a/parquet-variant-compute/src/variant_array_builder.rs +++ b/parquet-variant-compute/src/variant_array_builder.rs @@ -22,8 +22,11 @@ use arrow::array::{ArrayRef, BinaryViewArray, BinaryViewBuilder, NullBufferBuild use arrow_schema::{ArrowError, DataType, Field, Fields}; use parquet_variant::{ BuilderSpecificState, ListBuilder, MetadataBuilder, ObjectBuilder, Variant, VariantBuilderExt, + EMPTY_VARIANT_METADATA, +}; +use parquet_variant::{ + ParentState, ReadOnlyMetadataBuilder, ValueBuilder, WritableMetadataBuilder, }; -use parquet_variant::{ParentState, ValueBuilder, WritableMetadataBuilder}; use std::sync::Arc; /// A builder for [`VariantArray`] @@ -205,6 +208,134 @@ impl VariantBuilderExt for VariantArrayBuilder { } } +/// A builder for creating only the value column of a [`VariantArray`] +/// +/// This builder is used when you have existing metadata and only need to build +/// the value column. It's useful for scenarios like variant unshredding, data +/// transformation, or filtering where you want to reuse existing metadata. +/// +/// The builder produces a [`BinaryViewArray`] that can be combined with existing +/// metadata to create a complete [`VariantArray`]. +/// +/// # Example: +/// ``` +/// # use arrow::array::Array; +/// # use parquet_variant::{Variant, EMPTY_VARIANT_METADATA}; +/// # use parquet_variant_compute::VariantValueArrayBuilder; +/// // Create a variant value builder for 10 rows +/// let mut builder = VariantValueArrayBuilder::new(10); +/// +/// // Append some values with their corresponding metadata +/// // In practice, you should use the existing metadata you have access to. +/// builder.append_value(Variant::from(42), EMPTY_VARIANT_METADATA).unwrap(); +/// builder.append_null(); +/// builder.append_value(Variant::from("hello"), EMPTY_VARIANT_METADATA).unwrap(); +/// +/// // Build the final value array +/// let value_array = builder.build(); +/// assert_eq!(value_array.len(), 3); +/// ``` +#[derive(Debug)] +#[allow(unused)] +pub struct VariantValueArrayBuilder { + value_builder: ValueBuilder, + value_offsets: Vec, + nulls: NullBufferBuilder, +} + +#[allow(unused)] +impl VariantValueArrayBuilder { + /// Create a new `VariantValueArrayBuilder` with the specified row capacity + pub fn new(row_capacity: usize) -> Self { + Self { + value_builder: ValueBuilder::new(), + value_offsets: Vec::with_capacity(row_capacity), + nulls: NullBufferBuilder::new(row_capacity), + } + } + + /// Build the final value array + /// + /// Returns a [`BinaryViewArray`] containing the serialized variant values. + /// This can be combined with existing metadata to create a complete [`VariantArray`]. + pub fn build(mut self) -> Result { + let value_buffer = self.value_builder.into_inner(); + let mut array = binary_view_array_from_buffers(value_buffer, self.value_offsets); + if let Some(nulls) = self.nulls.finish() { + let (views, buffers, _) = array.into_parts(); + array = BinaryViewArray::try_new(views, buffers, Some(nulls))?; + } + Ok(array) + } + + /// Append a null row to the builder + /// + /// WARNING: It is only safe to call this method when building the `value` field of a shredded + /// variant column (which is nullable). The `value` field of a binary (unshredded) variant + /// column is non-nullable, and callers should instead invoke [`Self::append_value`] with + /// `Variant::Null`, passing the appropriate metadata value. + pub fn append_null(&mut self) { + self.value_offsets.push(self.value_builder.offset()); + self.nulls.append_null(); + } + + /// Append a variant value with its corresponding metadata + /// + /// # Arguments + /// * `value` - The variant value to append + /// * `metadata` - The metadata dictionary for this variant (used for field name resolution) + /// + /// # Returns + /// * `Ok(())` if the value was successfully appended + /// * `Err(ArrowError)` if the variant contains field names not found in the metadata + /// + /// # Example + /// ``` + /// # use parquet_variant::{Variant, EMPTY_VARIANT_METADATA}; + /// # use parquet_variant_compute::VariantValueArrayBuilder; + /// let mut builder = VariantValueArrayBuilder::new(10); + /// builder.append_value(Variant::from(42), EMPTY_VARIANT_METADATA).unwrap(); + /// ``` + pub fn append_value(&mut self, value: Variant<'_, '_>) { + let metadata = value.metadata().cloned().unwrap_or(EMPTY_VARIANT_METADATA); + let mut metadata_builder = ReadOnlyMetadataBuilder::new(metadata); + ValueBuilder::append_variant_bytes(self.parent_state(&mut metadata_builder), value); + } + + /// Creates a builder-specific parent state + pub fn parent_state<'a>( + &'a mut self, + metadata_builder: &'a mut dyn MetadataBuilder, + ) -> ParentState<'a, ValueArrayBuilderState<'a>> { + let state = ValueArrayBuilderState { + value_offsets: &mut self.value_offsets, + nulls: &mut self.nulls, + }; + + ParentState::new(&mut self.value_builder, metadata_builder, state) + } +} + +/// Builder-specific state for array building that manages array-level offsets and nulls. See +/// [`VariantBuilderExt`] for details. +#[derive(Debug)] +pub struct ValueArrayBuilderState<'a> { + value_offsets: &'a mut Vec, + nulls: &'a mut NullBufferBuilder, +} + +// All changes are pending until finalized +impl BuilderSpecificState for ValueArrayBuilderState<'_> { + fn finish( + &mut self, + _metadata_builder: &mut dyn MetadataBuilder, + value_builder: &mut ValueBuilder, + ) { + self.value_offsets.push(value_builder.offset()); + self.nulls.append_non_null(); + } +} + fn binary_view_array_from_buffers(buffer: Vec, offsets: Vec) -> BinaryViewArray { // All offsets are less than or equal to the buffer length, so we can safely cast all offsets // inside the loop below, as long as the buffer length fits in u32. @@ -228,6 +359,7 @@ fn binary_view_array_from_buffers(buffer: Vec, offsets: Vec) -> Binar mod test { use super::*; use arrow::array::Array; + use parquet_variant::{Variant, VariantBuilder, VariantMetadata}; /// Test that both the metadata and value buffers are non nullable #[test] @@ -288,4 +420,46 @@ mod test { let list = variant.as_list().expect("variant to be a list"); assert_eq!(list.len(), 2); } + + #[test] + fn test_variant_value_array_builder_basic() { + let mut builder = VariantValueArrayBuilder::new(10); + + // Append some values + builder.append_value(Variant::from(42i32)); + builder.append_null(); + builder.append_value(Variant::from("hello")); + + let value_array = builder.build().unwrap(); + assert_eq!(value_array.len(), 3); + } + + #[test] + fn test_variant_value_array_builder_with_objects() { + // Create metadata with field names + let mut metadata_builder = WritableMetadataBuilder::default(); + metadata_builder.upsert_field_name("name"); + metadata_builder.upsert_field_name("age"); + metadata_builder.finish(); + let metadata_bytes = metadata_builder.into_inner(); + let metadata = VariantMetadata::try_new(&metadata_bytes).unwrap(); + + // Create a variant with an object using the same metadata + let mut variant_builder = VariantBuilder::new().with_metadata(metadata); + variant_builder + .new_object() + .with_field("name", "Alice") + .with_field("age", 30i32) + .finish(); + let (_, value_bytes) = variant_builder.finish(); + let variant = Variant::try_new(&metadata_bytes, &value_bytes).unwrap(); + + // Now use the value array builder + let mut builder = VariantValueArrayBuilder::new(10); + builder.append_value(variant); + builder.append_null(); + + let value_array = builder.build().unwrap(); + assert_eq!(value_array.len(), 2); + } } diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index 93e736285853..1480d6400db1 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -562,7 +562,7 @@ pub struct WritableMetadataBuilder { impl WritableMetadataBuilder { /// Upsert field name to dictionary, return its ID - fn upsert_field_name(&mut self, field_name: &str) -> u32 { + pub fn upsert_field_name(&mut self, field_name: &str) -> u32 { let (id, new_entry) = self.field_names.insert_full(field_name.to_string()); if new_entry { diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index cc4c3bcadd66..697b1bbeb944 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -1320,7 +1320,7 @@ impl<'m, 'v> Variant<'m, 'v> { /// Return the metadata associated with this variant, if any. /// /// Returns `Some(&VariantMetadata)` for object and list variants, - pub fn metadata(&self) -> Option<&'m VariantMetadata<'_>> { + pub fn metadata(&self) -> Option<&VariantMetadata<'m>> { match self { Variant::Object(VariantObject { metadata, .. }) | Variant::List(VariantList { metadata, .. }) => Some(metadata), From fb455c0d15d58d0f67a361286f03a566231d8f07 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Tue, 16 Sep 2025 05:28:57 -0700 Subject: [PATCH 04/16] fix doctest --- parquet-variant-compute/src/lib.rs | 2 +- .../src/variant_array_builder.rs | 25 ++++++++++++++++++- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/parquet-variant-compute/src/lib.rs b/parquet-variant-compute/src/lib.rs index 999e118367ac..70fcbdb66f95 100644 --- a/parquet-variant-compute/src/lib.rs +++ b/parquet-variant-compute/src/lib.rs @@ -46,7 +46,7 @@ pub mod variant_get; mod variant_to_arrow; pub use variant_array::{ShreddingState, VariantArray}; -pub use variant_array_builder::VariantArrayBuilder; +pub use variant_array_builder::{VariantArrayBuilder, VariantValueArrayBuilder}; pub use cast_to_variant::{cast_to_variant, cast_to_variant_with_options}; pub use from_json::json_to_variant; diff --git a/parquet-variant-compute/src/variant_array_builder.rs b/parquet-variant-compute/src/variant_array_builder.rs index be0be2296e90..3e24fc823581 100644 --- a/parquet-variant-compute/src/variant_array_builder.rs +++ b/parquet-variant-compute/src/variant_array_builder.rs @@ -302,7 +302,30 @@ impl VariantValueArrayBuilder { ValueBuilder::append_variant_bytes(self.parent_state(&mut metadata_builder), value); } - /// Creates a builder-specific parent state + /// Creates a builder-specific parent state. + /// + /// For example, this can be useful for code that wants to copy a subset of fields from an + /// object `value` as a new row of `value_array_builder`: + /// + /// ```no_run + /// # use parquet_variant::{ObjectBuilder, ReadOnlyMetadataBuilder, Variant}; + /// # use parquet_variant_compute::VariantValueArrayBuilder; + /// # let value = Variant::Null; + /// # let mut value_array_builder = VariantValueArrayBuilder::new(0); + /// # fn should_keep(field_name: &str) -> bool { todo!() }; + /// let Variant::Object(obj) = value else { + /// panic!("Not a variant object"); + /// }; + /// let mut metadata_builder = ReadOnlyMetadataBuilder::new(obj.metadata.clone()); + /// let state = value_array_builder.parent_state(&mut metadata_builder); + /// let mut object_builder = ObjectBuilder::new(state, false); + /// for (field_name, field_value) in obj.iter() { + /// if should_keep(field_name) { + /// object_builder.insert_bytes(field_name, field_value); + /// } + /// } + /// object_builder.finish(); // appends the new object + /// ``` pub fn parent_state<'a>( &'a mut self, metadata_builder: &'a mut dyn MetadataBuilder, From 2238b495cf9d5d76628a0ed61da8724d89abdd4e Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Tue, 16 Sep 2025 05:30:32 -0700 Subject: [PATCH 05/16] remove unneeded unused markers --- parquet-variant-compute/src/variant_array_builder.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/parquet-variant-compute/src/variant_array_builder.rs b/parquet-variant-compute/src/variant_array_builder.rs index 3e24fc823581..3b81725902ab 100644 --- a/parquet-variant-compute/src/variant_array_builder.rs +++ b/parquet-variant-compute/src/variant_array_builder.rs @@ -236,14 +236,12 @@ impl VariantBuilderExt for VariantArrayBuilder { /// assert_eq!(value_array.len(), 3); /// ``` #[derive(Debug)] -#[allow(unused)] pub struct VariantValueArrayBuilder { value_builder: ValueBuilder, value_offsets: Vec, nulls: NullBufferBuilder, } -#[allow(unused)] impl VariantValueArrayBuilder { /// Create a new `VariantValueArrayBuilder` with the specified row capacity pub fn new(row_capacity: usize) -> Self { From 474fa315c4942297be700acba4e811524244cefc Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Tue, 16 Sep 2025 05:44:37 -0700 Subject: [PATCH 06/16] doc fixes --- .../src/variant_array_builder.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/parquet-variant-compute/src/variant_array_builder.rs b/parquet-variant-compute/src/variant_array_builder.rs index 3b81725902ab..d45621691709 100644 --- a/parquet-variant-compute/src/variant_array_builder.rs +++ b/parquet-variant-compute/src/variant_array_builder.rs @@ -220,19 +220,19 @@ impl VariantBuilderExt for VariantArrayBuilder { /// # Example: /// ``` /// # use arrow::array::Array; -/// # use parquet_variant::{Variant, EMPTY_VARIANT_METADATA}; +/// # use parquet_variant::{Variant}; /// # use parquet_variant_compute::VariantValueArrayBuilder; /// // Create a variant value builder for 10 rows /// let mut builder = VariantValueArrayBuilder::new(10); /// /// // Append some values with their corresponding metadata -/// // In practice, you should use the existing metadata you have access to. -/// builder.append_value(Variant::from(42), EMPTY_VARIANT_METADATA).unwrap(); +/// // In practice, some of the variant values would be objects with internal metadata. +/// builder.append_value(Variant::from(42)); /// builder.append_null(); -/// builder.append_value(Variant::from("hello"), EMPTY_VARIANT_METADATA).unwrap(); +/// builder.append_value(Variant::from("hello")); /// /// // Build the final value array -/// let value_array = builder.build(); +/// let value_array = builder.build().unwrap(); /// assert_eq!(value_array.len(), 3); /// ``` #[derive(Debug)] @@ -289,10 +289,10 @@ impl VariantValueArrayBuilder { /// /// # Example /// ``` - /// # use parquet_variant::{Variant, EMPTY_VARIANT_METADATA}; + /// # use parquet_variant::Variant; /// # use parquet_variant_compute::VariantValueArrayBuilder; /// let mut builder = VariantValueArrayBuilder::new(10); - /// builder.append_value(Variant::from(42), EMPTY_VARIANT_METADATA).unwrap(); + /// builder.append_value(Variant::from(42)); /// ``` pub fn append_value(&mut self, value: Variant<'_, '_>) { let metadata = value.metadata().cloned().unwrap_or(EMPTY_VARIANT_METADATA); From f286c52113b1b95c92d6a4fd6afcf8b12a121b44 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Tue, 16 Sep 2025 21:10:41 -0700 Subject: [PATCH 07/16] review feedback --- .../src/variant_array_builder.rs | 95 ++++++++++++++----- 1 file changed, 71 insertions(+), 24 deletions(-) diff --git a/parquet-variant-compute/src/variant_array_builder.rs b/parquet-variant-compute/src/variant_array_builder.rs index d45621691709..2f2f5a16b427 100644 --- a/parquet-variant-compute/src/variant_array_builder.rs +++ b/parquet-variant-compute/src/variant_array_builder.rs @@ -225,8 +225,9 @@ impl VariantBuilderExt for VariantArrayBuilder { /// // Create a variant value builder for 10 rows /// let mut builder = VariantValueArrayBuilder::new(10); /// -/// // Append some values with their corresponding metadata -/// // In practice, some of the variant values would be objects with internal metadata. +/// // Append some values with their corresponding metadata. In practice, some of the variant +/// // values would be objects with internal references to pre-existing metadata, which the +/// // builder takes advantage of to avoid creating new metadata. /// builder.append_value(Variant::from(42)); /// builder.append_null(); /// builder.append_value(Variant::from("hello")); @@ -268,7 +269,7 @@ impl VariantValueArrayBuilder { /// Append a null row to the builder /// - /// WARNING: It is only safe to call this method when building the `value` field of a shredded + /// WARNING: It is only valid to call this method when building the `value` field of a shredded /// variant column (which is nullable). The `value` field of a binary (unshredded) variant /// column is non-nullable, and callers should instead invoke [`Self::append_value`] with /// `Variant::Null`, passing the appropriate metadata value. @@ -322,7 +323,7 @@ impl VariantValueArrayBuilder { /// object_builder.insert_bytes(field_name, field_value); /// } /// } - /// object_builder.finish(); // appends the new object + /// object_builder.finish(); // appends the filtered object /// ``` pub fn parent_state<'a>( &'a mut self, @@ -380,7 +381,7 @@ fn binary_view_array_from_buffers(buffer: Vec, offsets: Vec) -> Binar mod test { use super::*; use arrow::array::Array; - use parquet_variant::{Variant, VariantBuilder, VariantMetadata}; + use parquet_variant::Variant; /// Test that both the metadata and value buffers are non nullable #[test] @@ -457,30 +458,76 @@ mod test { #[test] fn test_variant_value_array_builder_with_objects() { - // Create metadata with field names - let mut metadata_builder = WritableMetadataBuilder::default(); - metadata_builder.upsert_field_name("name"); - metadata_builder.upsert_field_name("age"); - metadata_builder.finish(); - let metadata_bytes = metadata_builder.into_inner(); - let metadata = VariantMetadata::try_new(&metadata_bytes).unwrap(); - - // Create a variant with an object using the same metadata - let mut variant_builder = VariantBuilder::new().with_metadata(metadata); - variant_builder + // Populate a variant array with objects + let mut builder = VariantArrayBuilder::new(3); + builder .new_object() .with_field("name", "Alice") .with_field("age", 30i32) .finish(); - let (_, value_bytes) = variant_builder.finish(); - let variant = Variant::try_new(&metadata_bytes, &value_bytes).unwrap(); - // Now use the value array builder - let mut builder = VariantValueArrayBuilder::new(10); - builder.append_value(variant); - builder.append_null(); + builder + .new_object() + .with_field("name", "Bob") + .with_field("age", 42i32) + .with_field("city", "Wonderland") + .finish(); - let value_array = builder.build().unwrap(); - assert_eq!(value_array.len(), 2); + builder + .new_object() + .with_field("name", "Charlie") + .with_field("age", 1i32) + .finish(); + + let array = builder.build(); + + // Copy (some of) the objects over to the value array builder + // + // NOTE: Because we will reuse the metadata column, we cannot reorder rows. We can only + // filter or manipulate values within a row. + let mut builder = VariantValueArrayBuilder::new(3); + + // straight copy + builder.append_value(array.value(0)); + + // filtering fields takes more work because we need to manually create an object builder + let value = array.value(1); + let mut metadata_builder = ReadOnlyMetadataBuilder::new(value.metadata().unwrap().clone()); + let state = builder.parent_state(&mut metadata_builder); + ObjectBuilder::new(state, false) + .with_field("name", value.get_object_field("name").unwrap()) + .with_field("age", value.get_object_field("age").unwrap()) + .finish(); + + // same bytes, but now nested and duplicated inside a list + let value = array.value(2); + let mut metadata_builder = ReadOnlyMetadataBuilder::new(value.metadata().unwrap().clone()); + let state = builder.parent_state(&mut metadata_builder); + ListBuilder::new(state, false) + .with_value(value.clone()) + .with_value(value.clone()) + .finish(); + + let array2 = VariantArray::from_parts( + array.metadata_field().clone(), + Some(builder.build().unwrap()), + None, + None, + ); + + assert_eq!(array2.len(), 3); + assert_eq!(array.value(0), array2.value(0)); + + assert_eq!( + array.value(1).get_object_field("name"), + array2.value(1).get_object_field("name") + ); + assert_eq!( + array.value(1).get_object_field("age"), + array2.value(1).get_object_field("age") + ); + + assert_eq!(array.value(2), array2.value(2).get_list_element(0).unwrap()); + assert_eq!(array.value(2), array2.value(2).get_list_element(1).unwrap()); } } From a476c7227616e0397329043506f8a5e0809522af Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Tue, 16 Sep 2025 22:20:44 -0700 Subject: [PATCH 08/16] binary variant row builder uses variant value builder --- parquet-variant-compute/src/variant_get.rs | 9 +++- .../src/variant_to_arrow.rs | 41 +++++++++++-------- 2 files changed, 31 insertions(+), 19 deletions(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index a5819fc45937..01af46e564f2 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -135,8 +135,13 @@ fn shredded_get_path( let shred_basic_variant = |target: VariantArray, path: VariantPath<'_>, as_field: Option<&Field>| { let as_type = as_field.map(|f| f.data_type()); - let mut builder = - make_variant_to_arrow_row_builder(path, as_type, cast_options, target.len())?; + let mut builder = make_variant_to_arrow_row_builder( + target.metadata_field(), + path, + as_type, + cast_options, + target.len(), + )?; for i in 0..target.len() { if target.is_null(i) { builder.append_null()?; diff --git a/parquet-variant-compute/src/variant_to_arrow.rs b/parquet-variant-compute/src/variant_to_arrow.rs index 60f74e365dd4..b24f2d805d3e 100644 --- a/parquet-variant-compute/src/variant_to_arrow.rs +++ b/parquet-variant-compute/src/variant_to_arrow.rs @@ -15,14 +15,14 @@ // specific language governing permissions and limitations // under the License. -use arrow::array::{ArrayRef, PrimitiveBuilder}; +use arrow::array::{ArrayRef, BinaryViewArray, NullBufferBuilder, PrimitiveBuilder}; use arrow::compute::CastOptions; use arrow::datatypes::{self, ArrowPrimitiveType, DataType}; use arrow::error::{ArrowError, Result}; use parquet_variant::{Variant, VariantPath}; use crate::type_conversion::VariantAsPrimitive; -use crate::VariantArrayBuilder; +use crate::{VariantArray, VariantValueArrayBuilder}; use std::sync::Arc; @@ -93,7 +93,7 @@ impl<'a> VariantToArrowRowBuilder<'a> { } pub(crate) fn make_variant_to_arrow_row_builder<'a>( - //metadata: &BinaryViewArray, + metadata: &BinaryViewArray, path: VariantPath<'a>, data_type: Option<&'a DataType>, cast_options: &'a CastOptions, @@ -103,7 +103,10 @@ pub(crate) fn make_variant_to_arrow_row_builder<'a>( let mut builder = match data_type { // If no data type was requested, build an unshredded VariantArray. - None => BinaryVariant(VariantToBinaryVariantArrowRowBuilder::new(capacity)), + None => BinaryVariant(VariantToBinaryVariantArrowRowBuilder::new( + metadata.clone(), + capacity, + )), Some(DataType::Int8) => Int8(VariantToPrimitiveArrowRowBuilder::new( cast_options, capacity, @@ -246,13 +249,17 @@ where /// Builder for creating VariantArray output (for path extraction without type conversion) pub(crate) struct VariantToBinaryVariantArrowRowBuilder { - builder: VariantArrayBuilder, + metadata: BinaryViewArray, + builder: VariantValueArrayBuilder, + nulls: NullBufferBuilder, } impl VariantToBinaryVariantArrowRowBuilder { - fn new(capacity: usize) -> Self { + fn new(metadata: BinaryViewArray, capacity: usize) -> Self { Self { - builder: VariantArrayBuilder::new(capacity), + metadata, + builder: VariantValueArrayBuilder::new(capacity), + nulls: NullBufferBuilder::new(capacity), } } } @@ -260,22 +267,22 @@ impl VariantToBinaryVariantArrowRowBuilder { impl VariantToBinaryVariantArrowRowBuilder { fn append_null(&mut self) -> Result<()> { self.builder.append_null(); + self.nulls.append_null(); Ok(()) } fn append_value(&mut self, value: &Variant<'_, '_>) -> Result { - // TODO: We need a way to convert a Variant directly to bytes. In particular, we want to - // just copy across the underlying value byte slice of a `Variant::Object` or - // `Variant::List`, without any interaction with a `VariantMetadata` (because the shredding - // spec requires us to reuse the existing metadata when unshredding). - // - // One could _probably_ emulate this with parquet_variant::VariantBuilder, but it would do a - // lot of unnecessary work and would also create a new metadata column we don't need. - self.builder.append_variant(value.clone()); + self.builder.append_value(value.clone()); + self.nulls.append_non_null(); Ok(true) } - fn finish(self) -> Result { - Ok(Arc::new(self.builder.build())) + fn finish(mut self) -> Result { + Ok(Arc::new(VariantArray::from_parts( + self.metadata, + Some(self.builder.build()?), + None, // no typed_value column + self.nulls.finish(), + ))) } } From 7640d500871efd1defa36910f82842df9615f00d Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 15 Sep 2025 04:26:32 -0700 Subject: [PATCH 09/16] [Variant] Define new shred_variant function --- parquet-variant-compute/Cargo.toml | 1 + parquet-variant-compute/src/lib.rs | 2 + parquet-variant-compute/src/shred_variant.rs | 683 ++++++++++++++++++ parquet-variant-compute/src/variant_array.rs | 26 + parquet-variant-compute/src/variant_get.rs | 2 +- .../src/variant_to_arrow.rs | 149 +++- parquet-variant/src/builder.rs | 2 +- 7 files changed, 823 insertions(+), 42 deletions(-) create mode 100644 parquet-variant-compute/src/shred_variant.rs diff --git a/parquet-variant-compute/Cargo.toml b/parquet-variant-compute/Cargo.toml index 819a131f9c42..feb8172a9407 100644 --- a/parquet-variant-compute/Cargo.toml +++ b/parquet-variant-compute/Cargo.toml @@ -34,6 +34,7 @@ rust-version = { workspace = true } arrow = { workspace = true } arrow-schema = { workspace = true } half = { version = "2.1", default-features = false } +indexmap = "2.10.0" parquet-variant = { workspace = true } parquet-variant-json = { workspace = true } chrono = { workspace = true } diff --git a/parquet-variant-compute/src/lib.rs b/parquet-variant-compute/src/lib.rs index 70fcbdb66f95..b0d4c5ac3d3f 100644 --- a/parquet-variant-compute/src/lib.rs +++ b/parquet-variant-compute/src/lib.rs @@ -38,6 +38,7 @@ mod arrow_to_variant; pub mod cast_to_variant; mod from_json; +mod shred_variant; mod to_json; mod type_conversion; mod variant_array; @@ -50,5 +51,6 @@ pub use variant_array_builder::{VariantArrayBuilder, VariantValueArrayBuilder}; pub use cast_to_variant::{cast_to_variant, cast_to_variant_with_options}; pub use from_json::json_to_variant; +pub use shred_variant::shred_variant; pub use to_json::variant_to_json; pub use type_conversion::CastOptions; diff --git a/parquet-variant-compute/src/shred_variant.rs b/parquet-variant-compute/src/shred_variant.rs new file mode 100644 index 000000000000..84a8bc1f6b29 --- /dev/null +++ b/parquet-variant-compute/src/shred_variant.rs @@ -0,0 +1,683 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Module for shredding VariantArray with a given schema. + +use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder}; +use crate::variant_to_arrow::{ + make_primitive_variant_to_arrow_row_builder, PrimitiveVariantToArrowRowBuilder, +}; +use crate::{VariantArray, VariantValueArrayBuilder}; +use arrow::array::Array as _; +use arrow::array::{ArrayRef, BinaryViewArray, NullBufferBuilder}; +use arrow::buffer::NullBuffer; +use arrow::compute::CastOptions; +use arrow::datatypes::{DataType, Fields}; +use arrow::error::{ArrowError, Result}; +use parquet_variant::{ObjectBuilder, ReadOnlyMetadataBuilder, Variant, EMPTY_VARIANT_METADATA}; + +use indexmap::IndexMap; +use std::sync::Arc; + +pub fn shred_variant(array: &VariantArray, as_type: &DataType) -> Result { + if array.typed_value_field().is_some() { + return Err(ArrowError::InvalidArgumentError( + "Input is already shredded".to_string(), + )); + } + + if array.value_field().is_none() { + // all-null case + return Ok(VariantArray::from_parts( + array.metadata_field().clone(), + None, + None, + None, + )); + }; + + let cast_options = CastOptions::default(); + let mut builder = make_variant_to_shredded_variant_arrow_row_builder( + as_type, + &cast_options, + array.len(), + true, + )?; + for i in 0..array.len() { + if array.is_null(i) { + builder.append_null()?; + } else { + builder.append_value(array.value(i))?; + } + } + let (value, typed_value, nulls) = builder.finish()?; + Ok(VariantArray::from_parts( + array.metadata_field().clone(), + Some(value), + Some(typed_value), + nulls, + )) +} + +pub(crate) fn make_variant_to_shredded_variant_arrow_row_builder<'a>( + data_type: &'a DataType, + cast_options: &'a CastOptions, + len: usize, + top_level: bool, +) -> Result> { + let builder = match data_type { + DataType::Struct(fields) => { + let typed_value_builder = VariantToShreddedObjectVariantRowBuilder::try_new( + fields, + cast_options, + len, + top_level, + )?; + VariantToShreddedVariantRowBuilder::Object(typed_value_builder) + } + DataType::List(_) + | DataType::LargeList(_) + | DataType::ListView(_) + | DataType::LargeListView(_) + | DataType::FixedSizeList(..) => { + // TODO: Special handling for shredded variant arrays + return Err(ArrowError::NotYetImplemented( + "shred_variant not yet implemented for lists".to_string(), + )); + } + _ => { + let builder = + make_primitive_variant_to_arrow_row_builder(data_type, cast_options, len)?; + let typed_value_builder = + VariantToShreddedPrimitiveVariantRowBuilder::new(builder, len, top_level); + VariantToShreddedVariantRowBuilder::Primitive(typed_value_builder) + } + }; + Ok(builder) +} + +pub(crate) enum VariantToShreddedVariantRowBuilder<'a> { + Primitive(VariantToShreddedPrimitiveVariantRowBuilder<'a>), + Object(VariantToShreddedObjectVariantRowBuilder<'a>), +} +impl<'a> VariantToShreddedVariantRowBuilder<'a> { + pub fn append_null(&mut self) -> Result<()> { + use VariantToShreddedVariantRowBuilder::*; + match self { + Primitive(b) => b.append_null(), + Object(b) => b.append_null(), + } + } + + pub fn append_value(&mut self, value: Variant<'_, '_>) -> Result { + use VariantToShreddedVariantRowBuilder::*; + match self { + Primitive(b) => b.append_value(value), + Object(b) => b.append_value(value), + } + } + + pub fn finish(self) -> Result<(BinaryViewArray, ArrayRef, Option)> { + use VariantToShreddedVariantRowBuilder::*; + match self { + Primitive(b) => b.finish(), + Object(b) => b.finish(), + } + } +} + +/// A top-level variant shredder -- appending NULL produces typed_value=NULL and value=Variant::Null +pub(crate) struct VariantToShreddedPrimitiveVariantRowBuilder<'a> { + value_builder: VariantValueArrayBuilder, + typed_value_builder: PrimitiveVariantToArrowRowBuilder<'a>, + nulls: NullBufferBuilder, + top_level: bool, +} + +impl<'a> VariantToShreddedPrimitiveVariantRowBuilder<'a> { + pub(crate) fn new( + typed_value_builder: PrimitiveVariantToArrowRowBuilder<'a>, + len: usize, + top_level: bool, + ) -> Self { + Self { + value_builder: VariantValueArrayBuilder::new(len), + typed_value_builder, + nulls: NullBufferBuilder::new(len), + top_level, + } + } + fn append_null(&mut self) -> Result<()> { + if self.top_level { + self.nulls.append_null(); + self.value_builder.append_null(); + } else { + self.nulls.append_non_null(); + self.value_builder.append_value(Variant::Null); + } + self.typed_value_builder.append_null() + } + fn append_value(&mut self, value: Variant<'_, '_>) -> Result { + self.nulls.append_non_null(); + if self.typed_value_builder.append_value(&value)? { + self.value_builder.append_null(); + } else { + self.value_builder.append_value(value); + } + Ok(true) + } + fn finish(mut self) -> Result<(BinaryViewArray, ArrayRef, Option)> { + Ok(( + self.value_builder.build()?, + self.typed_value_builder.finish()?, + self.nulls.finish(), + )) + } +} + +pub(crate) struct VariantToShreddedObjectVariantRowBuilder<'a> { + value_builder: VariantValueArrayBuilder, + typed_value_builders: IndexMap<&'a str, VariantToShreddedVariantRowBuilder<'a>>, + typed_value_nulls: NullBufferBuilder, + nulls: NullBufferBuilder, + top_level: bool, +} + +impl<'a> VariantToShreddedObjectVariantRowBuilder<'a> { + fn try_new( + fields: &'a Fields, + cast_options: &'a CastOptions, + len: usize, + top_level: bool, + ) -> Result { + let typed_value_builders = fields.iter().map(|field| { + let builder = make_variant_to_shredded_variant_arrow_row_builder( + field.data_type(), + cast_options, + len, + top_level, + )?; + Ok((field.name().as_str(), builder)) + }); + Ok(Self { + value_builder: VariantValueArrayBuilder::new(len), + typed_value_builders: typed_value_builders.collect::>()?, + typed_value_nulls: NullBufferBuilder::new(len), + nulls: NullBufferBuilder::new(len), + top_level, + }) + } + + fn append_null(&mut self) -> Result<()> { + if self.top_level { + self.nulls.append_null(); + self.value_builder.append_null(); + } else { + self.nulls.append_non_null(); + self.value_builder.append_value(Variant::Null); + } + self.typed_value_nulls.append_null(); + for (_, typed_value_builder) in &mut self.typed_value_builders { + typed_value_builder.append_null()?; + } + Ok(()) + } + fn append_value(&mut self, value: Variant<'_, '_>) -> Result { + let Variant::Object(ref obj) = value else { + // Not an object => fall back + self.nulls.append_non_null(); + self.value_builder.append_value(value); + self.typed_value_nulls.append_null(); + for (_, typed_value_builder) in &mut self.typed_value_builders { + typed_value_builder.append_null()?; + } + return Ok(false); + }; + + // Route the object's fields by name as either shredded or unshredded + let metadata = value.metadata().cloned().unwrap_or(EMPTY_VARIANT_METADATA); + let mut metadata_builder = ReadOnlyMetadataBuilder::new(metadata); + let state = self.value_builder.parent_state(&mut metadata_builder); + let mut object_builder = ObjectBuilder::new(state, false); + let mut seen = std::collections::HashSet::new(); + let mut partially_shredded = false; + for (field_name, value) in obj.iter() { + match self.typed_value_builders.get_mut(field_name) { + Some(typed_value_builder) => { + typed_value_builder.append_value(value)?; + seen.insert(field_name); + } + None => { + object_builder.insert_bytes(field_name, value); + partially_shredded = true; + } + } + } + + // Handle missing fields + for (field_name, typed_value_builder) in &mut self.typed_value_builders { + if !seen.contains(field_name) { + typed_value_builder.append_null()?; + } + } + + // Only emit the value if it captured any unshredded object fields + if partially_shredded { + object_builder.finish(); + } else { + drop(object_builder); + self.value_builder.append_null(); + } + + self.typed_value_nulls.append_non_null(); + self.nulls.append_non_null(); + Ok(true) + } + fn finish(mut self) -> Result<(BinaryViewArray, ArrayRef, Option)> { + let mut builder = StructArrayBuilder::new(); + for (field_name, typed_value_builder) in self.typed_value_builders { + let (value, typed_value, nulls) = typed_value_builder.finish()?; + let array = + ShreddedVariantFieldArray::from_parts(Some(value), Some(typed_value), nulls); + builder = builder.with_field(field_name, Arc::new(array), false); + } + if let Some(nulls) = self.typed_value_nulls.finish() { + builder = builder.with_nulls(nulls); + } + Ok(( + self.value_builder.build()?, + Arc::new(builder.build()), + self.nulls.finish(), + )) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::VariantArrayBuilder; + use arrow::array::{Float64Array, Int64Array}; + use arrow::datatypes::{DataType, Field, Fields}; + use parquet_variant::{Variant, VariantBuilderExt}; + use std::sync::Arc; + + fn create_test_variant_array(values: Vec>>) -> VariantArray { + let mut builder = VariantArrayBuilder::new(values.len()); + for value in values { + match value { + Some(v) => builder.append_variant(v), + None => builder.append_null(), + } + } + builder.build() + } + + // Input Validation Tests (3 tests - cannot consolidate) + + #[test] + fn test_already_shredded_input_error() { + // Create a VariantArray that already has typed_value_field + // First create a valid VariantArray, then extract its parts to construct a shredded one + let temp_array = create_test_variant_array(vec![Some(Variant::from("test"))]); + let metadata = temp_array.metadata_field().clone(); + let value = temp_array.value_field().unwrap().clone(); + let typed_value = Arc::new(Int64Array::from(vec![42])) as ArrayRef; + + let shredded_array = + VariantArray::from_parts(metadata, Some(value), Some(typed_value), None); + + let result = shred_variant(&shredded_array, &DataType::Int64); + assert!(matches!( + result.unwrap_err(), + ArrowError::InvalidArgumentError(_) + )); + } + + #[test] + fn test_all_null_input() { + // Create VariantArray with no value field (all null case) + let metadata = BinaryViewArray::from_iter_values([&[1u8, 0u8]]); // minimal valid metadata + let all_null_array = VariantArray::from_parts(metadata, None, None, None); + let result = shred_variant(&all_null_array, &DataType::Int64).unwrap(); + + // Should return array with no value/typed_value fields + assert!(result.value_field().is_none()); + assert!(result.typed_value_field().is_none()); + } + + #[test] + fn test_unsupported_list_schema() { + let input = create_test_variant_array(vec![Some(Variant::from(42))]); + let list_schema = DataType::List(Arc::new(Field::new("item", DataType::Int64, true))); + shred_variant(&input, &list_schema).expect_err("unsupported"); + } + + // Primitive Shredding Tests (2 consolidated tests) + + #[test] + fn test_primitive_shredding_comprehensive() { + // Test mixed scenarios in a single array + let input = create_test_variant_array(vec![ + Some(Variant::from(42i64)), // successful shred + Some(Variant::from("hello")), // failed shred (string) + Some(Variant::from(100i64)), // successful shred + None, // array-level null + Some(Variant::Null), // variant null + Some(Variant::from(3i8)), // successful shred (int8->int64 conversion) + ]); + + let result = shred_variant(&input, &DataType::Int64).unwrap(); + println!("result: {:?}", result); + + // Verify structure + let value_field = result.value_field().unwrap(); + let typed_value_field = result + .typed_value_field() + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + // Check specific outcomes for each row + assert_eq!(result.len(), 6); + + // Row 0: 42 -> should shred successfully + assert!(!result.is_null(0)); + assert!(value_field.is_null(0)); // value should be null when shredded + assert!(!typed_value_field.is_null(0)); + assert_eq!(typed_value_field.value(0), 42); + + // Row 1: "hello" -> should fail to shred + assert!(!result.is_null(1)); + assert!(!value_field.is_null(1)); // value should contain original + assert!(typed_value_field.is_null(1)); // typed_value should be null + + // Row 2: 100 -> should shred successfully + assert!(!result.is_null(2)); + assert!(value_field.is_null(2)); + assert_eq!(typed_value_field.value(2), 100); + + // Row 3: array null -> should be null in result + assert!(result.is_null(3)); + + // Row 4: Variant::Null -> should not shred (it's a null variant, not an integer) + assert!(!result.is_null(4)); + assert!(!value_field.is_null(4)); // should contain Variant::Null + assert!(typed_value_field.is_null(4)); + + // Row 5: 3i8 -> should shred successfully (int8->int64 conversion) + assert!(!result.is_null(5)); + assert!(value_field.is_null(5)); // value should be null when shredded + assert!(!typed_value_field.is_null(5)); + assert_eq!(typed_value_field.value(5), 3); + } + + #[test] + fn test_primitive_different_target_types() { + let input = create_test_variant_array(vec![ + Some(Variant::from(42i32)), + Some(Variant::from(3.15f64)), + Some(Variant::from("not_a_number")), + ]); + + // Test Int32 target + let result_int32 = shred_variant(&input, &DataType::Int32).unwrap(); + let typed_value_int32 = result_int32 + .typed_value_field() + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(typed_value_int32.value(0), 42); + assert!(typed_value_int32.is_null(1)); // float doesn't convert to int32 + assert!(typed_value_int32.is_null(2)); // string doesn't convert to int32 + + // Test Float64 target + let result_float64 = shred_variant(&input, &DataType::Float64).unwrap(); + let typed_value_float64 = result_float64 + .typed_value_field() + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(typed_value_float64.value(0), 42.0); // int converts to float + assert_eq!(typed_value_float64.value(1), 3.15); + assert!(typed_value_float64.is_null(2)); // string doesn't convert + } + + // Object Shredding Tests (2 consolidated tests) + + #[test] + fn test_object_shredding_comprehensive() { + let mut builder = VariantArrayBuilder::new(7); + + // Row 0: Fully shredded object + builder + .new_object() + .with_field("score", 95.5f64) + .with_field("age", 30i64) + .finish(); + + // Row 1: Partially shredded object (extra email field) + builder + .new_object() + .with_field("score", 87.2f64) + .with_field("age", 25i64) + .with_field("email", "bob@example.com") + .finish(); + + // Row 2: Missing field (no score) + builder.new_object().with_field("age", 35i64).finish(); + + // Row 3: Type mismatch (score is string, age is string) + builder + .new_object() + .with_field("score", "ninety-five") + .with_field("age", "thirty") + .finish(); + + // Row 4: Non-object + builder.append_variant(Variant::from("not an object")); + + // Row 5: Empty object + builder.new_object().finish(); + + // Row 6: Null + builder.append_null(); + + let input = builder.build(); + println!("input: {input:?}"); + + // Create target schema: struct + // Both types are supported for shredding + let fields = Fields::from(vec![ + Field::new("score", DataType::Float64, true), + Field::new("age", DataType::Int64, true), + ]); + let target_schema = DataType::Struct(fields); + + let result = shred_variant(&input, &target_schema).unwrap(); + println!("result: {result:?}"); + + // Verify structure + assert!(result.value_field().is_some()); + assert!(result.typed_value_field().is_some()); + assert_eq!(result.len(), 7); + + let value_field = result.value_field().unwrap(); + let typed_value_struct = result + .typed_value_field() + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + // Extract score and age fields from typed_value struct + let score_field_array = typed_value_struct + .column_by_name("score") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let age_field_array = typed_value_struct + .column_by_name("age") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + let score_typed_values = score_field_array + .typed_value_field() + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let age_typed_values = age_field_array + .typed_value_field() + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + // Row 0: Fully shredded - both fields shred successfully + assert!(value_field.is_null(0)); // no unshredded fields + assert_eq!(score_typed_values.value(0), 95.5); // score successfully shredded + assert_eq!(age_typed_values.value(0), 30); // age successfully shredded + + // Row 1: Partially shredded - value contains extra email field + assert!(!value_field.is_null(1)); // contains {"email": "bob@example.com"} + assert_eq!(score_typed_values.value(1), 87.2); // score successfully shredded + assert_eq!(age_typed_values.value(1), 25); // age successfully shredded + + // Row 2: Missing score field + assert!(value_field.is_null(2)); // no unshredded fields + assert!(score_typed_values.is_null(2)); // score is missing + assert_eq!(age_typed_values.value(2), 35); // age successfully shredded + + // Row 3: Type mismatches - both score and age are strings + assert!(value_field.is_null(3)); // no unshredded fields (but both fields have fallback values) + assert!(score_typed_values.is_null(3)); // score failed to shred (string "ninety-five") + assert!(age_typed_values.is_null(3)); // age failed to shred (string "thirty") + // Both should be in their respective field's value arrays (type mismatch fallback) + let score_value_field = score_field_array.value_field().unwrap(); + let age_value_field = age_field_array.value_field().unwrap(); + assert!(!score_value_field.is_null(3)); // contains "ninety-five" as variant + assert!(!age_value_field.is_null(3)); // contains "thirty" as variant + + // Row 4: Non-object - falls back to value field + assert!(!value_field.is_null(4)); // contains "not an object" + assert!(typed_value_struct.is_null(4)); // typed_value is null for non-objects + + // Row 5: Empty object + assert!(value_field.is_null(5)); // no unshredded fields + assert!(score_typed_values.is_null(5)); // score is missing + assert!(age_typed_values.is_null(5)); // age is missing + + // Row 6: Null + assert!(result.is_null(6)); + } + + #[test] + fn test_object_different_schemas() { + // Create object with multiple fields + let mut builder = VariantArrayBuilder::new(1); + builder + .new_object() + .with_field("id", 123i32) + .with_field("age", 25i64) + .with_field("score", 95.5f64) + .finish(); + let input = builder.build(); + + // Test with schema containing only id field + let schema1 = DataType::Struct(Fields::from(vec![Field::new("id", DataType::Int32, true)])); + let result1 = shred_variant(&input, &schema1).unwrap(); + let value_field1 = result1.value_field().unwrap(); + assert!(!value_field1.is_null(0)); // should contain {"age": 25, "score": 95.5} + + // Test with schema containing id and age fields + let schema2 = DataType::Struct(Fields::from(vec![ + Field::new("id", DataType::Int32, true), + Field::new("age", DataType::Int64, true), + ])); + let result2 = shred_variant(&input, &schema2).unwrap(); + let value_field2 = result2.value_field().unwrap(); + assert!(!value_field2.is_null(0)); // should contain {"score": 95.5} + + // Test with schema containing all fields + let schema3 = DataType::Struct(Fields::from(vec![ + Field::new("id", DataType::Int32, true), + Field::new("age", DataType::Int64, true), + Field::new("score", DataType::Float64, true), + ])); + let result3 = shred_variant(&input, &schema3).unwrap(); + let value_field3 = result3.value_field().unwrap(); + assert!(value_field3.is_null(0)); // fully shredded, no remaining fields + } + + // Specification Compliance Test (1 consolidated test) + + #[test] + fn test_spec_compliance() { + let input = create_test_variant_array(vec![ + Some(Variant::from(42i64)), + Some(Variant::from("hello")), + ]); + + let result = shred_variant(&input, &DataType::Int64).unwrap(); + + // Test field access by name (not position) + let inner_struct = result.inner(); + assert!(inner_struct.column_by_name("metadata").is_some()); + assert!(inner_struct.column_by_name("value").is_some()); + assert!(inner_struct.column_by_name("typed_value").is_some()); + + // Test metadata preservation + assert_eq!(result.metadata_field().len(), input.metadata_field().len()); + // The metadata should be the same reference (cheap clone) + // Note: BinaryViewArray doesn't have a .values() method, so we compare the arrays directly + assert_eq!(result.metadata_field().len(), input.metadata_field().len()); + + // Test output structure correctness + assert_eq!(result.len(), input.len()); + assert!(result.value_field().is_some()); + assert!(result.typed_value_field().is_some()); + + // For primitive shredding, verify that value and typed_value are never both non-null + // (This rule applies to primitives; for objects, both can be non-null for partial shredding) + let value_field = result.value_field().unwrap(); + let typed_value_field = result + .typed_value_field() + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + for i in 0..result.len() { + if !result.is_null(i) { + let value_is_null = value_field.is_null(i); + let typed_value_is_null = typed_value_field.is_null(i); + // For primitive shredding, at least one should be null + assert!( + value_is_null || typed_value_is_null, + "Row {}: both value and typed_value are non-null for primitive shredding", + i + ); + } + } + } +} diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index e87d03f88c5b..cf7251993a0f 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -351,6 +351,32 @@ impl ShreddedVariantFieldArray { pub fn inner(&self) -> &StructArray { &self.inner } + + pub(crate) fn from_parts( + value: Option, + typed_value: Option, + nulls: Option, + ) -> Self { + let mut builder = StructArrayBuilder::new(); + if let Some(value) = value.clone() { + builder = builder.with_field("value", Arc::new(value), true); + } + if let Some(typed_value) = typed_value.clone() { + builder = builder.with_field("typed_value", typed_value, true); + } + if let Some(nulls) = nulls { + builder = builder.with_nulls(nulls); + } + + // This would be a lot simpler if ShreddingState were just a pair of Option... we already + // have everything we need. + let inner = builder.build(); + let shredding_state = ShreddingState::try_new(value, typed_value).unwrap(); // valid by construction + Self { + inner, + shredding_state, + } + } } impl Array for ShreddedVariantFieldArray { diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 01af46e564f2..24a9f273633b 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -146,7 +146,7 @@ fn shredded_get_path( if target.is_null(i) { builder.append_null()?; } else { - builder.append_value(&target.value(i))?; + builder.append_value(target.value(i))?; } } builder.finish() diff --git a/parquet-variant-compute/src/variant_to_arrow.rs b/parquet-variant-compute/src/variant_to_arrow.rs index b24f2d805d3e..90733a5964c0 100644 --- a/parquet-variant-compute/src/variant_to_arrow.rs +++ b/parquet-variant-compute/src/variant_to_arrow.rs @@ -26,12 +26,10 @@ use crate::{VariantArray, VariantValueArrayBuilder}; use std::sync::Arc; -/// Builder for converting variant values into strongly typed Arrow arrays. -/// -/// Useful for variant_get kernels that need to extract specific paths from variant values, possibly -/// with casting of leaf values to specific types. -pub(crate) enum VariantToArrowRowBuilder<'a> { - // Direct builders (no path extraction) +/// Builder for converting variant values to primitive Arrow arrays. It is used by both +/// `VariantToArrowRowBuilder` (below) and `VariantToShreddedPrimitiveVariantRowBuilder` (in +/// `shred_variant.rs`). +pub(crate) enum PrimitiveVariantToArrowRowBuilder<'a> { Int8(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Int8Type>), Int16(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Int16Type>), Int32(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Int32Type>), @@ -39,15 +37,23 @@ pub(crate) enum VariantToArrowRowBuilder<'a> { Float16(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Float16Type>), Float32(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Float32Type>), Float64(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Float64Type>), +} + +/// Builder for converting variant values into strongly typed Arrow arrays. +/// +/// Useful for variant_get kernels that need to extract specific paths from variant values, possibly +/// with casting of leaf values to specific types. +pub(crate) enum VariantToArrowRowBuilder<'a> { + Primitive(PrimitiveVariantToArrowRowBuilder<'a>), BinaryVariant(VariantToBinaryVariantArrowRowBuilder), // Path extraction wrapper - contains a boxed enum for any of the above WithPath(VariantPathRowBuilder<'a>), } -impl<'a> VariantToArrowRowBuilder<'a> { +impl<'a> PrimitiveVariantToArrowRowBuilder<'a> { pub fn append_null(&mut self) -> Result<()> { - use VariantToArrowRowBuilder::*; + use PrimitiveVariantToArrowRowBuilder::*; match self { Int8(b) => b.append_null(), Int16(b) => b.append_null(), @@ -56,13 +62,11 @@ impl<'a> VariantToArrowRowBuilder<'a> { Float16(b) => b.append_null(), Float32(b) => b.append_null(), Float64(b) => b.append_null(), - BinaryVariant(b) => b.append_null(), - WithPath(path_builder) => path_builder.append_null(), } } pub fn append_value(&mut self, value: &Variant<'_, '_>) -> Result { - use VariantToArrowRowBuilder::*; + use PrimitiveVariantToArrowRowBuilder::*; match self { Int8(b) => b.append_value(value), Int16(b) => b.append_value(value), @@ -71,13 +75,11 @@ impl<'a> VariantToArrowRowBuilder<'a> { Float16(b) => b.append_value(value), Float32(b) => b.append_value(value), Float64(b) => b.append_value(value), - BinaryVariant(b) => b.append_value(value), - WithPath(path_builder) => path_builder.append_value(value), } } pub fn finish(self) -> Result { - use VariantToArrowRowBuilder::*; + use PrimitiveVariantToArrowRowBuilder::*; match self { Int8(b) => b.finish(), Int16(b) => b.finish(), @@ -86,62 +88,129 @@ impl<'a> VariantToArrowRowBuilder<'a> { Float16(b) => b.finish(), Float32(b) => b.finish(), Float64(b) => b.finish(), + } + } +} + +impl<'a> VariantToArrowRowBuilder<'a> { + pub fn append_null(&mut self) -> Result<()> { + use VariantToArrowRowBuilder::*; + match self { + Primitive(b) => b.append_null(), + BinaryVariant(b) => b.append_null(), + WithPath(path_builder) => path_builder.append_null(), + } + } + + pub fn append_value(&mut self, value: Variant<'_, '_>) -> Result { + use VariantToArrowRowBuilder::*; + match self { + Primitive(b) => b.append_value(&value), + BinaryVariant(b) => b.append_value(value), + WithPath(path_builder) => path_builder.append_value(value), + } + } + + pub fn finish(self) -> Result { + use VariantToArrowRowBuilder::*; + match self { + Primitive(b) => b.finish(), BinaryVariant(b) => b.finish(), WithPath(path_builder) => path_builder.finish(), } } } -pub(crate) fn make_variant_to_arrow_row_builder<'a>( - metadata: &BinaryViewArray, - path: VariantPath<'a>, - data_type: Option<&'a DataType>, +/// Creates a primitive row builder, returning Err if the requested data type is not primitive. +pub(crate) fn make_primitive_variant_to_arrow_row_builder<'a>( + data_type: &'a DataType, cast_options: &'a CastOptions, capacity: usize, -) -> Result> { - use VariantToArrowRowBuilder::*; +) -> Result> { + use PrimitiveVariantToArrowRowBuilder::*; - let mut builder = match data_type { - // If no data type was requested, build an unshredded VariantArray. - None => BinaryVariant(VariantToBinaryVariantArrowRowBuilder::new( - metadata.clone(), - capacity, - )), - Some(DataType::Int8) => Int8(VariantToPrimitiveArrowRowBuilder::new( + let builder = match data_type { + DataType::Int8 => Int8(VariantToPrimitiveArrowRowBuilder::new( cast_options, capacity, )), - Some(DataType::Int16) => Int16(VariantToPrimitiveArrowRowBuilder::new( + DataType::Int16 => Int16(VariantToPrimitiveArrowRowBuilder::new( cast_options, capacity, )), - Some(DataType::Int32) => Int32(VariantToPrimitiveArrowRowBuilder::new( + DataType::Int32 => Int32(VariantToPrimitiveArrowRowBuilder::new( cast_options, capacity, )), - Some(DataType::Int64) => Int64(VariantToPrimitiveArrowRowBuilder::new( + DataType::Int64 => Int64(VariantToPrimitiveArrowRowBuilder::new( cast_options, capacity, )), - Some(DataType::Float16) => Float16(VariantToPrimitiveArrowRowBuilder::new( + DataType::Float16 => Float16(VariantToPrimitiveArrowRowBuilder::new( cast_options, capacity, )), - Some(DataType::Float32) => Float32(VariantToPrimitiveArrowRowBuilder::new( + DataType::Float32 => Float32(VariantToPrimitiveArrowRowBuilder::new( cast_options, capacity, )), - Some(DataType::Float64) => Float64(VariantToPrimitiveArrowRowBuilder::new( + DataType::Float64 => Float64(VariantToPrimitiveArrowRowBuilder::new( cast_options, capacity, )), - _ => { + _ if data_type.is_primitive() => { return Err(ArrowError::NotYetImplemented(format!( - "variant_get with path={:?} and data_type={:?} not yet implemented", - path, data_type + "Primitive data_type {data_type:?} not yet implemented" + ))); + } + _ => { + return Err(ArrowError::InvalidArgumentError(format!( + "Not a primitive type: {data_type:?}" ))); } }; + Ok(builder) +} + +pub(crate) fn make_variant_to_arrow_row_builder<'a>( + metadata: &BinaryViewArray, + path: VariantPath<'a>, + data_type: Option<&'a DataType>, + cast_options: &'a CastOptions, + capacity: usize, +) -> Result> { + use VariantToArrowRowBuilder::*; + + let mut builder = match data_type { + // If no data type was requested, build an unshredded VariantArray. + None => BinaryVariant(VariantToBinaryVariantArrowRowBuilder::new( + metadata.clone(), + capacity, + )), + Some(DataType::Struct(_)) => { + // TODO: Special handling for shredded variant objects + return Err(ArrowError::NotYetImplemented( + "variant_get not yet implemented for structs".to_string(), + )); + } + Some( + DataType::List(_) + | DataType::LargeList(_) + | DataType::ListView(_) + | DataType::LargeListView(_) + | DataType::FixedSizeList(..), + ) => { + // TODO: Special handling for shredded variant arrays + return Err(ArrowError::NotYetImplemented( + "variant_get not yet implemented for lists".to_string(), + )); + } + Some(data_type) => { + let builder = + make_primitive_variant_to_arrow_row_builder(data_type, cast_options, capacity)?; + Primitive(builder) + } + }; // Wrap with path extraction if needed if !path.is_empty() { @@ -166,9 +235,9 @@ impl<'a> VariantPathRowBuilder<'a> { self.builder.append_null() } - fn append_value(&mut self, value: &Variant<'_, '_>) -> Result { + fn append_value(&mut self, value: Variant<'_, '_>) -> Result { if let Some(v) = value.get_path(&self.path) { - self.builder.append_value(&v) + self.builder.append_value(v) } else { self.builder.append_null()?; Ok(false) @@ -271,8 +340,8 @@ impl VariantToBinaryVariantArrowRowBuilder { Ok(()) } - fn append_value(&mut self, value: &Variant<'_, '_>) -> Result { - self.builder.append_value(value.clone()); + fn append_value(&mut self, value: Variant<'_, '_>) -> Result { + self.builder.append_value(value); self.nulls.append_non_null(); Ok(true) } diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index 1480d6400db1..95a30c206d59 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -3441,7 +3441,7 @@ mod tests { let mut metadata = ReadOnlyMetadataBuilder::new(metadata); let mut builder2 = ValueBuilder::new(); let state = ParentState::variant(&mut builder2, &mut metadata); - ValueBuilder::append_variant_bytes(state, variant1.clone()); + ValueBuilder::append_variant_bytes(state, variant1); let value2 = builder2.into_inner(); // The bytes should be identical, we merely copied them across. From d5349d6452e69859a0beafd2c7406ecfcda76109 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Wed, 17 Sep 2025 05:20:36 -0700 Subject: [PATCH 10/16] self review fixes --- parquet-variant-compute/src/shred_variant.rs | 114 ++++++++++++------- 1 file changed, 74 insertions(+), 40 deletions(-) diff --git a/parquet-variant-compute/src/shred_variant.rs b/parquet-variant-compute/src/shred_variant.rs index 84a8bc1f6b29..484c2fa82087 100644 --- a/parquet-variant-compute/src/shred_variant.rs +++ b/parquet-variant-compute/src/shred_variant.rs @@ -22,8 +22,7 @@ use crate::variant_to_arrow::{ make_primitive_variant_to_arrow_row_builder, PrimitiveVariantToArrowRowBuilder, }; use crate::{VariantArray, VariantValueArrayBuilder}; -use arrow::array::Array as _; -use arrow::array::{ArrayRef, BinaryViewArray, NullBufferBuilder}; +use arrow::array::{Array as _, ArrayRef, BinaryViewArray, NullBufferBuilder}; use arrow::buffer::NullBuffer; use arrow::compute::CastOptions; use arrow::datatypes::{DataType, Fields}; @@ -33,6 +32,37 @@ use parquet_variant::{ObjectBuilder, ReadOnlyMetadataBuilder, Variant, EMPTY_VAR use indexmap::IndexMap; use std::sync::Arc; +/// Shreds the input binary variant using a target shredding schema derived from the requested data type. +/// +/// For example, requesting `DataType::Int64` would produce an output variant array with the schema: +/// +/// ``` +/// { +/// metadata: BINARY, +/// value: BINARY, +/// typed_value: LONG, +/// } +/// ``` +/// +/// Similarly, requesting `DataType::Struct` with two integer fields `a` and `b` would produce an +/// output variant array with the schema: +/// +/// ``` +/// { +/// metadata: BINARY, +/// value: BINARY, +/// typed_value: { +/// a: { +/// value: BINARY, +/// typed_value: INT, +/// }, +/// b: { +/// value: BINARY, +/// typed_value: INT, +/// }, +/// } +/// } +/// ``` pub fn shred_variant(array: &VariantArray, as_type: &DataType) -> Result { if array.typed_value_field().is_some() { return Err(ArrowError::InvalidArgumentError( @@ -76,7 +106,7 @@ pub fn shred_variant(array: &VariantArray, as_type: &DataType) -> Result( data_type: &'a DataType, cast_options: &'a CastOptions, - len: usize, + capacity: usize, top_level: bool, ) -> Result> { let builder = match data_type { @@ -84,7 +114,7 @@ pub(crate) fn make_variant_to_shredded_variant_arrow_row_builder<'a>( let typed_value_builder = VariantToShreddedObjectVariantRowBuilder::try_new( fields, cast_options, - len, + capacity, top_level, )?; VariantToShreddedVariantRowBuilder::Object(typed_value_builder) @@ -101,9 +131,9 @@ pub(crate) fn make_variant_to_shredded_variant_arrow_row_builder<'a>( } _ => { let builder = - make_primitive_variant_to_arrow_row_builder(data_type, cast_options, len)?; + make_primitive_variant_to_arrow_row_builder(data_type, cast_options, capacity)?; let typed_value_builder = - VariantToShreddedPrimitiveVariantRowBuilder::new(builder, len, top_level); + VariantToShreddedPrimitiveVariantRowBuilder::new(builder, capacity, top_level); VariantToShreddedVariantRowBuilder::Primitive(typed_value_builder) } }; @@ -115,11 +145,37 @@ pub(crate) enum VariantToShreddedVariantRowBuilder<'a> { Object(VariantToShreddedObjectVariantRowBuilder<'a>), } impl<'a> VariantToShreddedVariantRowBuilder<'a> { + fn start_append_null(&mut self) { + use VariantToShreddedVariantRowBuilder::*; + + let (top_level, nulls, value_builder) = match self { + Primitive(VariantToShreddedPrimitiveVariantRowBuilder { + top_level, + nulls, + value_builder, + .. + }) + | Object(VariantToShreddedObjectVariantRowBuilder { + top_level, + nulls, + value_builder, + .. + }) => (top_level, nulls, value_builder), + }; + if *top_level { + nulls.append_null(); + value_builder.append_null(); + } else { + nulls.append_non_null(); + value_builder.append_value(Variant::Null); + } + } pub fn append_null(&mut self) -> Result<()> { use VariantToShreddedVariantRowBuilder::*; + self.start_append_null(); match self { - Primitive(b) => b.append_null(), - Object(b) => b.append_null(), + Primitive(b) => b.finish_append_null(), + Object(b) => b.finish_append_null(), } } @@ -151,24 +207,17 @@ pub(crate) struct VariantToShreddedPrimitiveVariantRowBuilder<'a> { impl<'a> VariantToShreddedPrimitiveVariantRowBuilder<'a> { pub(crate) fn new( typed_value_builder: PrimitiveVariantToArrowRowBuilder<'a>, - len: usize, + capacity: usize, top_level: bool, ) -> Self { Self { - value_builder: VariantValueArrayBuilder::new(len), + value_builder: VariantValueArrayBuilder::new(capacity), typed_value_builder, - nulls: NullBufferBuilder::new(len), + nulls: NullBufferBuilder::new(capacity), top_level, } } - fn append_null(&mut self) -> Result<()> { - if self.top_level { - self.nulls.append_null(); - self.value_builder.append_null(); - } else { - self.nulls.append_non_null(); - self.value_builder.append_value(Variant::Null); - } + fn finish_append_null(&mut self) -> Result<()> { self.typed_value_builder.append_null() } fn append_value(&mut self, value: Variant<'_, '_>) -> Result { @@ -201,35 +250,28 @@ impl<'a> VariantToShreddedObjectVariantRowBuilder<'a> { fn try_new( fields: &'a Fields, cast_options: &'a CastOptions, - len: usize, + capacity: usize, top_level: bool, ) -> Result { let typed_value_builders = fields.iter().map(|field| { let builder = make_variant_to_shredded_variant_arrow_row_builder( field.data_type(), cast_options, - len, + capacity, top_level, )?; Ok((field.name().as_str(), builder)) }); Ok(Self { - value_builder: VariantValueArrayBuilder::new(len), + value_builder: VariantValueArrayBuilder::new(capacity), typed_value_builders: typed_value_builders.collect::>()?, - typed_value_nulls: NullBufferBuilder::new(len), - nulls: NullBufferBuilder::new(len), + typed_value_nulls: NullBufferBuilder::new(capacity), + nulls: NullBufferBuilder::new(capacity), top_level, }) } - fn append_null(&mut self) -> Result<()> { - if self.top_level { - self.nulls.append_null(); - self.value_builder.append_null(); - } else { - self.nulls.append_non_null(); - self.value_builder.append_value(Variant::Null); - } + fn finish_append_null(&mut self) -> Result<()> { self.typed_value_nulls.append_null(); for (_, typed_value_builder) in &mut self.typed_value_builders { typed_value_builder.append_null()?; @@ -326,8 +368,6 @@ mod tests { builder.build() } - // Input Validation Tests (3 tests - cannot consolidate) - #[test] fn test_already_shredded_input_error() { // Create a VariantArray that already has typed_value_field @@ -366,8 +406,6 @@ mod tests { shred_variant(&input, &list_schema).expect_err("unsupported"); } - // Primitive Shredding Tests (2 consolidated tests) - #[test] fn test_primitive_shredding_comprehensive() { // Test mixed scenarios in a single array @@ -459,8 +497,6 @@ mod tests { assert!(typed_value_float64.is_null(2)); // string doesn't convert } - // Object Shredding Tests (2 consolidated tests) - #[test] fn test_object_shredding_comprehensive() { let mut builder = VariantArrayBuilder::new(7); @@ -629,8 +665,6 @@ mod tests { assert!(value_field3.is_null(0)); // fully shredded, no remaining fields } - // Specification Compliance Test (1 consolidated test) - #[test] fn test_spec_compliance() { let input = create_test_variant_array(vec![ From 77789422cc383095743b52525b17e264ce779608 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Wed, 17 Sep 2025 22:07:59 -0700 Subject: [PATCH 11/16] review feedback --- parquet-variant-compute/src/shred_variant.rs | 50 +++++++++++++------ parquet-variant-compute/src/variant_array.rs | 49 +++++++----------- parquet-variant-compute/src/variant_get.rs | 4 +- .../src/variant_to_arrow.rs | 6 +-- 4 files changed, 58 insertions(+), 51 deletions(-) diff --git a/parquet-variant-compute/src/shred_variant.rs b/parquet-variant-compute/src/shred_variant.rs index 484c2fa82087..a6f2c5a789f6 100644 --- a/parquet-variant-compute/src/shred_variant.rs +++ b/parquet-variant-compute/src/shred_variant.rs @@ -36,7 +36,7 @@ use std::sync::Arc; /// /// For example, requesting `DataType::Int64` would produce an output variant array with the schema: /// -/// ``` +/// ```text /// { /// metadata: BINARY, /// value: BINARY, @@ -47,7 +47,7 @@ use std::sync::Arc; /// Similarly, requesting `DataType::Struct` with two integer fields `a` and `b` would produce an /// output variant array with the schema: /// -/// ``` +/// ```text /// { /// metadata: BINARY, /// value: BINARY, @@ -71,13 +71,8 @@ pub fn shred_variant(array: &VariantArray, as_type: &DataType) -> Result( | DataType::ListView(_) | DataType::LargeListView(_) | DataType::FixedSizeList(..) => { - // TODO: Special handling for shredded variant arrays return Err(ArrowError::NotYetImplemented( - "shred_variant not yet implemented for lists".to_string(), + "Shredding variant array values as arrow lists".to_string(), )); } _ => { @@ -419,9 +413,9 @@ mod tests { ]); let result = shred_variant(&input, &DataType::Int64).unwrap(); - println!("result: {:?}", result); // Verify structure + let metadata_field = result.metadata_field(); let value_field = result.value_field().unwrap(); let typed_value_field = result .typed_value_field() @@ -443,6 +437,10 @@ mod tests { assert!(!result.is_null(1)); assert!(!value_field.is_null(1)); // value should contain original assert!(typed_value_field.is_null(1)); // typed_value should be null + assert_eq!( + Variant::new(metadata_field.value(1), value_field.value(1)), + Variant::from("hello") + ); // Row 2: 100 -> should shred successfully assert!(!result.is_null(2)); @@ -455,6 +453,10 @@ mod tests { // Row 4: Variant::Null -> should not shred (it's a null variant, not an integer) assert!(!result.is_null(4)); assert!(!value_field.is_null(4)); // should contain Variant::Null + assert_eq!( + Variant::new(metadata_field.value(4), value_field.value(4)), + Variant::Null + ); assert!(typed_value_field.is_null(4)); // Row 5: 3i8 -> should shred successfully (int8->int64 conversion) @@ -535,8 +537,17 @@ mod tests { // Row 6: Null builder.append_null(); + // Row 7: Object with only "wrong" fields + builder.new_object().with_field("foo", 10).finish(); + + // Row 8: Object with one "right" and one "wrong" field + builder + .new_object() + .with_field("foo", 10) + .with_field("score", 66.67f64) + .finish(); + let input = builder.build(); - println!("input: {input:?}"); // Create target schema: struct // Both types are supported for shredding @@ -547,12 +558,11 @@ mod tests { let target_schema = DataType::Struct(fields); let result = shred_variant(&input, &target_schema).unwrap(); - println!("result: {result:?}"); // Verify structure assert!(result.value_field().is_some()); assert!(result.typed_value_field().is_some()); - assert_eq!(result.len(), 7); + assert_eq!(result.len(), 9); let value_field = result.value_field().unwrap(); let typed_value_struct = result @@ -625,6 +635,16 @@ mod tests { // Row 6: Null assert!(result.is_null(6)); + + // Row 7: Object with only a "wrong" field + assert!(!value_field.is_null(7)); + assert!(score_typed_values.is_null(7)); + assert!(age_typed_values.is_null(7)); + + // Row 8: Object with one "wrong" and one "right" field + assert!(!value_field.is_null(8)); + assert!(!score_typed_values.is_null(8)); + assert!(age_typed_values.is_null(8)); } #[test] diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index cf7251993a0f..ee4bce7710a4 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -48,7 +48,7 @@ use std::sync::Arc; /// /// [Extension Type for Parquet Variant arrow]: https://github.com/apache/arrow/issues/46908 /// [document]: https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?usp=sharing -#[derive(Debug)] +#[derive(Clone, Debug)] pub struct VariantArray { /// Reference to the underlying StructArray inner: StructArray, @@ -129,7 +129,7 @@ impl VariantArray { Ok(Self { inner: inner.clone(), metadata: metadata.clone(), - shredding_state: ShreddingState::try_new(value, typed_value)?, + shredding_state: ShreddingState::new(value, typed_value), }) } @@ -151,14 +151,10 @@ impl VariantArray { builder = builder.with_nulls(nulls); } - // This would be a lot simpler if ShreddingState were just a pair of Option... we already - // have everything we need. - let inner = builder.build(); - let shredding_state = ShreddingState::try_new(value, typed_value).unwrap(); // valid by construction Self { - inner, + inner: builder.build(), metadata, - shredding_state, + shredding_state: ShreddingState::new(value, typed_value), } } @@ -325,10 +321,9 @@ impl ShreddedVariantFieldArray { let typed_value = inner_struct.column_by_name("typed_value").cloned(); // Note this clone is cheap, it just bumps the ref count - let inner = inner_struct.clone(); Ok(Self { - inner: inner.clone(), - shredding_state: ShreddingState::try_new(value, typed_value)?, + inner: inner_struct.clone(), + shredding_state: ShreddingState::new(value, typed_value), }) } @@ -368,13 +363,9 @@ impl ShreddedVariantFieldArray { builder = builder.with_nulls(nulls); } - // This would be a lot simpler if ShreddingState were just a pair of Option... we already - // have everything we need. - let inner = builder.build(); - let shredding_state = ShreddingState::try_new(value, typed_value).unwrap(); // valid by construction Self { - inner, - shredding_state, + inner: builder.build(), + shredding_state: ShreddingState::new(value, typed_value), } } } @@ -451,7 +442,7 @@ impl Array for ShreddedVariantFieldArray { /// | non-null | non-null | The value is present and is a partially shredded object | /// /// [Parquet Variant Shredding Spec]: https://github.com/apache/parquet-format/blob/master/VariantShredding.md#value-shredding -#[derive(Debug)] +#[derive(Clone, Debug)] pub enum ShreddingState { /// This variant has no typed_value field Unshredded { value: BinaryViewArray }, @@ -482,16 +473,13 @@ pub enum ShreddingState { } impl ShreddingState { - /// try to create a new `ShreddingState` from the given fields - pub fn try_new( - value: Option, - typed_value: Option, - ) -> Result { + /// Create a new `ShreddingState` from the given fields + pub fn new(value: Option, typed_value: Option) -> Self { match (value, typed_value) { - (Some(value), Some(typed_value)) => Ok(Self::PartiallyShredded { value, typed_value }), - (Some(value), None) => Ok(Self::Unshredded { value }), - (None, Some(typed_value)) => Ok(Self::Typed { typed_value }), - (None, None) => Ok(Self::AllNull), + (Some(value), Some(typed_value)) => Self::PartiallyShredded { value, typed_value }, + (Some(value), None) => Self::Unshredded { value }, + (None, Some(typed_value)) => Self::Typed { typed_value }, + (None, None) => Self::AllNull, } } @@ -795,10 +783,11 @@ mod test { #[test] fn all_null_shredding_state() { - let shredding_state = ShreddingState::try_new(None, None).unwrap(); - // Verify the shredding state is AllNull - assert!(matches!(shredding_state, ShreddingState::AllNull)); + assert!(matches!( + ShreddingState::new(None, None), + ShreddingState::AllNull + )); } #[test] diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 24a9f273633b..19c966cce45c 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -1221,7 +1221,7 @@ mod test { } Err(e) => { println!("Nested path 'a.x' error: {}", e); - if e.to_string().contains("not yet implemented") + if e.to_string().contains("Not yet implemented") || e.to_string().contains("NotYetImplemented") { println!("This is expected - nested paths are not implemented"); @@ -2392,7 +2392,7 @@ mod test { // Should fail with NotYetImplemented when the row builder tries to handle struct type assert!(result.is_err()); let error = result.unwrap_err(); - assert!(error.to_string().contains("not yet implemented")); + assert!(error.to_string().contains("Not yet implemented")); } /// Create comprehensive shredded variant with diverse null patterns and empty objects diff --git a/parquet-variant-compute/src/variant_to_arrow.rs b/parquet-variant-compute/src/variant_to_arrow.rs index 90733a5964c0..6bb349b4564b 100644 --- a/parquet-variant-compute/src/variant_to_arrow.rs +++ b/parquet-variant-compute/src/variant_to_arrow.rs @@ -188,9 +188,8 @@ pub(crate) fn make_variant_to_arrow_row_builder<'a>( capacity, )), Some(DataType::Struct(_)) => { - // TODO: Special handling for shredded variant objects return Err(ArrowError::NotYetImplemented( - "variant_get not yet implemented for structs".to_string(), + "Converting unshredded variant objects to arrow structs".to_string(), )); } Some( @@ -200,9 +199,8 @@ pub(crate) fn make_variant_to_arrow_row_builder<'a>( | DataType::LargeListView(_) | DataType::FixedSizeList(..), ) => { - // TODO: Special handling for shredded variant arrays return Err(ArrowError::NotYetImplemented( - "variant_get not yet implemented for lists".to_string(), + "Converting unshredded variant arrays to arrow lists".to_string(), )); } Some(data_type) => { From d007b8806e24c582365ed39cd06b0bd85e1cc926 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Thu, 18 Sep 2025 00:37:20 -0700 Subject: [PATCH 12/16] review feedback --- parquet-variant-compute/src/shred_variant.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/parquet-variant-compute/src/shred_variant.rs b/parquet-variant-compute/src/shred_variant.rs index a6f2c5a789f6..8227fea39b58 100644 --- a/parquet-variant-compute/src/shred_variant.rs +++ b/parquet-variant-compute/src/shred_variant.rs @@ -601,21 +601,25 @@ mod tests { // Row 0: Fully shredded - both fields shred successfully assert!(value_field.is_null(0)); // no unshredded fields + assert!(!typed_value_struct.is_null(0)); assert_eq!(score_typed_values.value(0), 95.5); // score successfully shredded assert_eq!(age_typed_values.value(0), 30); // age successfully shredded // Row 1: Partially shredded - value contains extra email field assert!(!value_field.is_null(1)); // contains {"email": "bob@example.com"} + assert!(!typed_value_struct.is_null(1)); assert_eq!(score_typed_values.value(1), 87.2); // score successfully shredded assert_eq!(age_typed_values.value(1), 25); // age successfully shredded // Row 2: Missing score field assert!(value_field.is_null(2)); // no unshredded fields + assert!(!typed_value_struct.is_null(2)); assert!(score_typed_values.is_null(2)); // score is missing assert_eq!(age_typed_values.value(2), 35); // age successfully shredded // Row 3: Type mismatches - both score and age are strings assert!(value_field.is_null(3)); // no unshredded fields (but both fields have fallback values) + assert!(!typed_value_struct.is_null(3)); assert!(score_typed_values.is_null(3)); // score failed to shred (string "ninety-five") assert!(age_typed_values.is_null(3)); // age failed to shred (string "thirty") // Both should be in their respective field's value arrays (type mismatch fallback) @@ -630,6 +634,7 @@ mod tests { // Row 5: Empty object assert!(value_field.is_null(5)); // no unshredded fields + assert!(!typed_value_struct.is_null(5)); // the struct is there, but all fields are NULL assert!(score_typed_values.is_null(5)); // score is missing assert!(age_typed_values.is_null(5)); // age is missing @@ -638,11 +643,13 @@ mod tests { // Row 7: Object with only a "wrong" field assert!(!value_field.is_null(7)); + assert!(!typed_value_struct.is_null(7)); // the struct is there, but all fields are NULL assert!(score_typed_values.is_null(7)); assert!(age_typed_values.is_null(7)); // Row 8: Object with one "wrong" and one "right" field assert!(!value_field.is_null(8)); + assert!(!typed_value_struct.is_null(8)); assert!(!score_typed_values.is_null(8)); assert!(age_typed_values.is_null(8)); } From f6001e87baff55d82e21f350c993a1ebf3d0d375 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Thu, 18 Sep 2025 08:56:32 -0700 Subject: [PATCH 13/16] remove top_level --- parquet-variant-compute/src/shred_variant.rs | 333 ++++++++++++++----- 1 file changed, 248 insertions(+), 85 deletions(-) diff --git a/parquet-variant-compute/src/shred_variant.rs b/parquet-variant-compute/src/shred_variant.rs index 27329264906f..464da8f67323 100644 --- a/parquet-variant-compute/src/shred_variant.rs +++ b/parquet-variant-compute/src/shred_variant.rs @@ -80,7 +80,6 @@ pub fn shred_variant(array: &VariantArray, as_type: &DataType) -> Result( data_type: &'a DataType, cast_options: &'a CastOptions, capacity: usize, - top_level: bool, ) -> Result> { let builder = match data_type { DataType::Struct(fields) => { @@ -110,7 +108,6 @@ pub(crate) fn make_variant_to_shredded_variant_arrow_row_builder<'a>( fields, cast_options, capacity, - top_level, )?; VariantToShreddedVariantRowBuilder::Object(typed_value_builder) } @@ -127,7 +124,7 @@ pub(crate) fn make_variant_to_shredded_variant_arrow_row_builder<'a>( let builder = make_primitive_variant_to_arrow_row_builder(data_type, cast_options, capacity)?; let typed_value_builder = - VariantToShreddedPrimitiveVariantRowBuilder::new(builder, capacity, top_level); + VariantToShreddedPrimitiveVariantRowBuilder::new(builder, capacity); VariantToShreddedVariantRowBuilder::Primitive(typed_value_builder) } }; @@ -139,37 +136,11 @@ pub(crate) enum VariantToShreddedVariantRowBuilder<'a> { Object(VariantToShreddedObjectVariantRowBuilder<'a>), } impl<'a> VariantToShreddedVariantRowBuilder<'a> { - fn start_append_null(&mut self) { - use VariantToShreddedVariantRowBuilder::*; - - let (top_level, nulls, value_builder) = match self { - Primitive(VariantToShreddedPrimitiveVariantRowBuilder { - top_level, - nulls, - value_builder, - .. - }) - | Object(VariantToShreddedObjectVariantRowBuilder { - top_level, - nulls, - value_builder, - .. - }) => (top_level, nulls, value_builder), - }; - if *top_level { - nulls.append_null(); - value_builder.append_null(); - } else { - nulls.append_non_null(); - value_builder.append_value(Variant::Null); - } - } pub fn append_null(&mut self) -> Result<()> { use VariantToShreddedVariantRowBuilder::*; - self.start_append_null(); match self { - Primitive(b) => b.finish_append_null(), - Object(b) => b.finish_append_null(), + Primitive(b) => b.append_null(), + Object(b) => b.append_null(), } } @@ -195,23 +166,22 @@ pub(crate) struct VariantToShreddedPrimitiveVariantRowBuilder<'a> { value_builder: VariantValueArrayBuilder, typed_value_builder: PrimitiveVariantToArrowRowBuilder<'a>, nulls: NullBufferBuilder, - top_level: bool, } impl<'a> VariantToShreddedPrimitiveVariantRowBuilder<'a> { pub(crate) fn new( typed_value_builder: PrimitiveVariantToArrowRowBuilder<'a>, capacity: usize, - top_level: bool, ) -> Self { Self { value_builder: VariantValueArrayBuilder::new(capacity), typed_value_builder, nulls: NullBufferBuilder::new(capacity), - top_level, } } - fn finish_append_null(&mut self) -> Result<()> { + fn append_null(&mut self) -> Result<()> { + self.nulls.append_null(); + self.value_builder.append_null(); self.typed_value_builder.append_null() } fn append_value(&mut self, value: Variant<'_, '_>) -> Result { @@ -237,7 +207,6 @@ pub(crate) struct VariantToShreddedObjectVariantRowBuilder<'a> { typed_value_builders: IndexMap<&'a str, VariantToShreddedVariantRowBuilder<'a>>, typed_value_nulls: NullBufferBuilder, nulls: NullBufferBuilder, - top_level: bool, } impl<'a> VariantToShreddedObjectVariantRowBuilder<'a> { @@ -245,14 +214,12 @@ impl<'a> VariantToShreddedObjectVariantRowBuilder<'a> { fields: &'a Fields, cast_options: &'a CastOptions, capacity: usize, - top_level: bool, ) -> Result { let typed_value_builders = fields.iter().map(|field| { let builder = make_variant_to_shredded_variant_arrow_row_builder( field.data_type(), cast_options, capacity, - top_level, )?; Ok((field.name().as_str(), builder)) }); @@ -261,11 +228,12 @@ impl<'a> VariantToShreddedObjectVariantRowBuilder<'a> { typed_value_builders: typed_value_builders.collect::>()?, typed_value_nulls: NullBufferBuilder::new(capacity), nulls: NullBufferBuilder::new(capacity), - top_level, }) } - fn finish_append_null(&mut self) -> Result<()> { + fn append_null(&mut self) -> Result<()> { + self.nulls.append_null(); + self.value_builder.append_null(); self.typed_value_nulls.append_null(); for (_, typed_value_builder) in &mut self.typed_value_builders { typed_value_builder.append_null()?; @@ -347,7 +315,7 @@ mod tests { use crate::VariantArrayBuilder; use arrow::array::{Float64Array, Int64Array}; use arrow::datatypes::{DataType, Field, Fields}; - use parquet_variant::{Variant, VariantBuilderExt}; + use parquet_variant::{Variant, VariantBuilder, VariantBuilderExt as _}; use std::sync::Arc; fn create_test_variant_array(values: Vec>>) -> VariantArray { @@ -563,8 +531,10 @@ mod tests { assert!(result.typed_value_field().is_some()); assert_eq!(result.len(), 9); - let value_field = result.value_field().unwrap(); - let typed_value_struct = result + let metadata = result.metadata_field(); + + let value = result.value_field().unwrap(); + let typed_value = result .typed_value_field() .unwrap() .as_any() @@ -572,85 +542,278 @@ mod tests { .unwrap(); // Extract score and age fields from typed_value struct - let score_field_array = typed_value_struct + let score_field = typed_value .column_by_name("score") .unwrap() .as_any() .downcast_ref::() .unwrap(); - let age_field_array = typed_value_struct + let age_field = typed_value .column_by_name("age") .unwrap() .as_any() .downcast_ref::() .unwrap(); - let score_typed_values = score_field_array + let score_value = score_field + .value_field() + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let score_typed_value = score_field .typed_value_field() .unwrap() .as_any() .downcast_ref::() .unwrap(); - let age_typed_values = age_field_array + let age_value = age_field + .value_field() + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let age_typed_value = age_field .typed_value_field() .unwrap() .as_any() .downcast_ref::() .unwrap(); + // Set up exhaustive checking of all shredded columns and their nulls/values + struct ShreddedValue<'m, 'v, T> { + value: Option>, + typed_value: Option, + } + struct ShreddedStruct<'m, 'v> { + score: ShreddedValue<'m, 'v, f64>, + age: ShreddedValue<'m, 'v, i64>, + } + fn get_value<'m, 'v>(i: usize, metadata: &'m BinaryViewArray, value: &'v BinaryViewArray) -> Variant<'m, 'v> { + Variant::new(metadata.value(i), value.value(i)) + } + let expect = |i, expected_result: Option>| { + match expected_result { + Some(ShreddedValue { value: expected_value, typed_value: expected_typed_value }) => { + assert!(result.is_valid(i)); + match expected_value { + Some(expected_value) => { + assert!(value.is_valid(i)); + assert_eq!(expected_value, get_value(i, metadata, value)); + } + None => { + assert!(value.is_null(i)); + } + } + match expected_typed_value { + Some(ShreddedStruct { score: expected_score, age: expected_age }) => { + assert!(typed_value.is_valid(i)); + assert!(score_field.is_valid(i)); // non-nullable + assert!(age_field.is_valid(i)); // non-nullable + match expected_score.value { + Some(expected_score_value) => { + assert!(score_value.is_valid(i)); + assert_eq!(expected_score_value, get_value(i, metadata, score_value)); + } + None => { + assert!(score_value.is_null(i)); + } + } + match expected_score.typed_value { + Some(expected_score) => { + assert!(score_typed_value.is_valid(i)); + assert_eq!(expected_score, score_typed_value.value(i)); + } + None => { + assert!(score_typed_value.is_null(i)); + } + } + match expected_age.value { + Some(expected_age_value) => { + assert!(age_value.is_valid(i)); + assert_eq!(expected_age_value, get_value(i, metadata, age_value)); + } + None => { + assert!(age_value.is_null(i)); + } + } + match expected_age.typed_value { + Some(expected_age) => { + assert!(age_typed_value.is_valid(i)); + assert_eq!(expected_age, age_typed_value.value(i)); + } + None => { + assert!(age_typed_value.is_null(i)); + } + } + } + None => { + assert!(typed_value.is_null(i)); + } + } + } + None => { + assert!(result.is_null(i)); + } + }; + + }; + // Row 0: Fully shredded - both fields shred successfully - assert!(value_field.is_null(0)); // no unshredded fields - assert!(!typed_value_struct.is_null(0)); - assert_eq!(score_typed_values.value(0), 95.5); // score successfully shredded - assert_eq!(age_typed_values.value(0), 30); // age successfully shredded + expect( + 0, + Some(ShreddedValue { + value: None, + typed_value: Some(ShreddedStruct { + score: ShreddedValue { + value: None, + typed_value: Some(95.5), + }, + age: ShreddedValue { + value: None, + typed_value: Some(30), + }, + }), + }), + ); // Row 1: Partially shredded - value contains extra email field - assert!(!value_field.is_null(1)); // contains {"email": "bob@example.com"} - assert!(!typed_value_struct.is_null(1)); - assert_eq!(score_typed_values.value(1), 87.2); // score successfully shredded - assert_eq!(age_typed_values.value(1), 25); // age successfully shredded + let mut builder = VariantBuilder::new(); + builder.new_object().with_field("email", "bob@example.com").finish(); + let (m, v) = builder.finish(); + let expected_value = Variant::new(&m, &v); + + expect( + 1, + Some(ShreddedValue { + value: Some(expected_value), + typed_value: Some(ShreddedStruct { + score: ShreddedValue { + value: None, + typed_value: Some(87.2), + }, + age: ShreddedValue { + value: None, + typed_value: Some(25), + }, + }), + }), + ); - // Row 2: Missing score field - assert!(value_field.is_null(2)); // no unshredded fields - assert!(!typed_value_struct.is_null(2)); - assert!(score_typed_values.is_null(2)); // score is missing - assert_eq!(age_typed_values.value(2), 35); // age successfully shredded + // Row 2: Fully shredded -- missing score field + expect( + 2, + Some(ShreddedValue { + value: None, + typed_value: Some(ShreddedStruct { + score: ShreddedValue { + value: None, + typed_value: None, + }, + age: ShreddedValue { + value: None, + typed_value: Some(35), + }, + }), + }), + ); // Row 3: Type mismatches - both score and age are strings - assert!(value_field.is_null(3)); // no unshredded fields (but both fields have fallback values) - assert!(!typed_value_struct.is_null(3)); - assert!(score_typed_values.is_null(3)); // score failed to shred (string "ninety-five") - assert!(age_typed_values.is_null(3)); // age failed to shred (string "thirty") - // Both should be in their respective field's value arrays (type mismatch fallback) - let score_value_field = score_field_array.value_field().unwrap(); - let age_value_field = age_field_array.value_field().unwrap(); - assert!(!score_value_field.is_null(3)); // contains "ninety-five" as variant - assert!(!age_value_field.is_null(3)); // contains "thirty" as variant + expect( + 3, + Some(ShreddedValue { + value: None, + typed_value: Some(ShreddedStruct { + score: ShreddedValue { + value: Some(Variant::from("ninety-five")), + typed_value: None, + }, + age: ShreddedValue { + value: Some(Variant::from("thirty")), + typed_value: None, + }, + }), + }), + ); // Row 4: Non-object - falls back to value field - assert!(!value_field.is_null(4)); // contains "not an object" - assert!(typed_value_struct.is_null(4)); // typed_value is null for non-objects + expect( + 4, + Some(ShreddedValue { + value: Some(Variant::from("not an object")), + typed_value: None, + }), + ); // Row 5: Empty object - assert!(value_field.is_null(5)); // no unshredded fields - assert!(!typed_value_struct.is_null(5)); // the struct is there, but all fields are NULL - assert!(score_typed_values.is_null(5)); // score is missing - assert!(age_typed_values.is_null(5)); // age is missing + expect( + 5, + Some(ShreddedValue { + value: None, + typed_value: Some(ShreddedStruct { + score: ShreddedValue { + value: None, + typed_value: None, + }, + age: ShreddedValue { + value: None, + typed_value: None, + }, + }), + }), + ); // Row 6: Null - assert!(result.is_null(6)); + expect( + 6, + None, + ); // Row 7: Object with only a "wrong" field - assert!(!value_field.is_null(7)); - assert!(!typed_value_struct.is_null(7)); // the struct is there, but all fields are NULL - assert!(score_typed_values.is_null(7)); - assert!(age_typed_values.is_null(7)); + let mut builder = VariantBuilder::new(); + builder.new_object().with_field("foo", 10).finish(); + let (m, v) = builder.finish(); + let expected_value = Variant::new(&m, &v); + + expect( + 7, + Some(ShreddedValue { + value: Some(expected_value), + typed_value: Some(ShreddedStruct { + score: ShreddedValue { + value: None, + typed_value: None, + }, + age: ShreddedValue { + value: None, + typed_value: None, + }, + }), + }), + ); // Row 8: Object with one "wrong" and one "right" field - assert!(!value_field.is_null(8)); - assert!(!typed_value_struct.is_null(8)); - assert!(!score_typed_values.is_null(8)); - assert!(age_typed_values.is_null(8)); + let mut builder = VariantBuilder::new(); + builder.new_object().with_field("foo", 10).finish(); + let (m, v) = builder.finish(); + let expected_value = Variant::new(&m, &v); + + expect( + 8, + Some(ShreddedValue { + value: Some(expected_value), + typed_value: Some(ShreddedStruct { + score: ShreddedValue { + value: None, + typed_value: Some(66.67), + }, + age: ShreddedValue { + value: None, + typed_value: None, + }, + }), + }), + ); } #[test] From 370dac180bdadd64134c97e8d91b7c83f5ec3c0e Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Thu, 18 Sep 2025 08:57:42 -0700 Subject: [PATCH 14/16] fmt --- parquet-variant-compute/src/shred_variant.rs | 59 +++++++++++--------- 1 file changed, 32 insertions(+), 27 deletions(-) diff --git a/parquet-variant-compute/src/shred_variant.rs b/parquet-variant-compute/src/shred_variant.rs index 464da8f67323..1b5047529316 100644 --- a/parquet-variant-compute/src/shred_variant.rs +++ b/parquet-variant-compute/src/shred_variant.rs @@ -76,11 +76,8 @@ pub fn shred_variant(array: &VariantArray, as_type: &DataType) -> Result( ) -> Result> { let builder = match data_type { DataType::Struct(fields) => { - let typed_value_builder = VariantToShreddedObjectVariantRowBuilder::try_new( - fields, - cast_options, - capacity, - )?; + let typed_value_builder = + VariantToShreddedObjectVariantRowBuilder::try_new(fields, cast_options, capacity)?; VariantToShreddedVariantRowBuilder::Object(typed_value_builder) } DataType::List(_) @@ -210,11 +204,7 @@ pub(crate) struct VariantToShreddedObjectVariantRowBuilder<'a> { } impl<'a> VariantToShreddedObjectVariantRowBuilder<'a> { - fn try_new( - fields: &'a Fields, - cast_options: &'a CastOptions, - capacity: usize, - ) -> Result { + fn try_new(fields: &'a Fields, cast_options: &'a CastOptions, capacity: usize) -> Result { let typed_value_builders = fields.iter().map(|field| { let builder = make_variant_to_shredded_variant_arrow_row_builder( field.data_type(), @@ -589,12 +579,19 @@ mod tests { score: ShreddedValue<'m, 'v, f64>, age: ShreddedValue<'m, 'v, i64>, } - fn get_value<'m, 'v>(i: usize, metadata: &'m BinaryViewArray, value: &'v BinaryViewArray) -> Variant<'m, 'v> { + fn get_value<'m, 'v>( + i: usize, + metadata: &'m BinaryViewArray, + value: &'v BinaryViewArray, + ) -> Variant<'m, 'v> { Variant::new(metadata.value(i), value.value(i)) } let expect = |i, expected_result: Option>| { match expected_result { - Some(ShreddedValue { value: expected_value, typed_value: expected_typed_value }) => { + Some(ShreddedValue { + value: expected_value, + typed_value: expected_typed_value, + }) => { assert!(result.is_valid(i)); match expected_value { Some(expected_value) => { @@ -606,14 +603,20 @@ mod tests { } } match expected_typed_value { - Some(ShreddedStruct { score: expected_score, age: expected_age }) => { + Some(ShreddedStruct { + score: expected_score, + age: expected_age, + }) => { assert!(typed_value.is_valid(i)); assert!(score_field.is_valid(i)); // non-nullable - assert!(age_field.is_valid(i)); // non-nullable + assert!(age_field.is_valid(i)); // non-nullable match expected_score.value { Some(expected_score_value) => { assert!(score_value.is_valid(i)); - assert_eq!(expected_score_value, get_value(i, metadata, score_value)); + assert_eq!( + expected_score_value, + get_value(i, metadata, score_value) + ); } None => { assert!(score_value.is_null(i)); @@ -631,7 +634,10 @@ mod tests { match expected_age.value { Some(expected_age_value) => { assert!(age_value.is_valid(i)); - assert_eq!(expected_age_value, get_value(i, metadata, age_value)); + assert_eq!( + expected_age_value, + get_value(i, metadata, age_value) + ); } None => { assert!(age_value.is_null(i)); @@ -656,7 +662,6 @@ mod tests { assert!(result.is_null(i)); } }; - }; // Row 0: Fully shredded - both fields shred successfully @@ -679,7 +684,10 @@ mod tests { // Row 1: Partially shredded - value contains extra email field let mut builder = VariantBuilder::new(); - builder.new_object().with_field("email", "bob@example.com").finish(); + builder + .new_object() + .with_field("email", "bob@example.com") + .finish(); let (m, v) = builder.finish(); let expected_value = Variant::new(&m, &v); @@ -764,10 +772,7 @@ mod tests { ); // Row 6: Null - expect( - 6, - None, - ); + expect(6, None); // Row 7: Object with only a "wrong" field let mut builder = VariantBuilder::new(); From 8af381001e1a4e05f5c3760d40ab6b1356945dac Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Thu, 18 Sep 2025 22:57:30 -0700 Subject: [PATCH 15/16] fix unit test --- parquet-variant-compute/src/shred_variant.rs | 29 +++++++++++--------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/parquet-variant-compute/src/shred_variant.rs b/parquet-variant-compute/src/shred_variant.rs index 1b5047529316..27793a5d384e 100644 --- a/parquet-variant-compute/src/shred_variant.rs +++ b/parquet-variant-compute/src/shred_variant.rs @@ -500,8 +500,8 @@ mod tests { // Row 8: Object with one "right" and one "wrong" field builder .new_object() - .with_field("foo", 10) .with_field("score", 66.67f64) + .with_field("foo", 10) .finish(); let input = builder.build(); @@ -774,16 +774,23 @@ mod tests { // Row 6: Null expect(6, None); - // Row 7: Object with only a "wrong" field - let mut builder = VariantBuilder::new(); - builder.new_object().with_field("foo", 10).finish(); - let (m, v) = builder.finish(); - let expected_value = Variant::new(&m, &v); + // Helper to correctly create a variant object using a row's existing metadata + let object_with_foo_field = |i| { + use parquet_variant::{ParentState, ValueBuilder, VariantMetadata}; + let metadata = VariantMetadata::new(metadata.value(i)); + let mut metadata_builder = ReadOnlyMetadataBuilder::new(metadata.clone()); + let mut value_builder = ValueBuilder::new(); + let state = ParentState::variant(&mut value_builder, &mut metadata_builder); + ObjectBuilder::new(state, false).with_field("foo", 10).finish(); + (metadata, value_builder.into_inner()) + }; + // Row 7: Object with only a "wrong" field + let (m, v) = object_with_foo_field(7); expect( 7, Some(ShreddedValue { - value: Some(expected_value), + value: Some(Variant::new_with_metadata(m, &v)), typed_value: Some(ShreddedStruct { score: ShreddedValue { value: None, @@ -798,15 +805,11 @@ mod tests { ); // Row 8: Object with one "wrong" and one "right" field - let mut builder = VariantBuilder::new(); - builder.new_object().with_field("foo", 10).finish(); - let (m, v) = builder.finish(); - let expected_value = Variant::new(&m, &v); - + let (m, v) = object_with_foo_field(8); expect( 8, Some(ShreddedValue { - value: Some(expected_value), + value: Some(Variant::new_with_metadata(m, &v)), typed_value: Some(ShreddedStruct { score: ShreddedValue { value: None, From cf862e4bb32ba0a3c5ed990f555e6b3aa6267d2f Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Fri, 19 Sep 2025 01:13:59 -0700 Subject: [PATCH 16/16] fmt --- parquet-variant-compute/src/shred_variant.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/parquet-variant-compute/src/shred_variant.rs b/parquet-variant-compute/src/shred_variant.rs index 27793a5d384e..9b517c034646 100644 --- a/parquet-variant-compute/src/shred_variant.rs +++ b/parquet-variant-compute/src/shred_variant.rs @@ -781,7 +781,9 @@ mod tests { let mut metadata_builder = ReadOnlyMetadataBuilder::new(metadata.clone()); let mut value_builder = ValueBuilder::new(); let state = ParentState::variant(&mut value_builder, &mut metadata_builder); - ObjectBuilder::new(state, false).with_field("foo", 10).finish(); + ObjectBuilder::new(state, false) + .with_field("foo", 10) + .finish(); (metadata, value_builder.into_inner()) };