From 1b4361caf1f71bcff11d999409e2895c167b80c0 Mon Sep 17 00:00:00 2001 From: klion26 Date: Mon, 22 Sep 2025 10:08:43 +0800 Subject: [PATCH 1/6] [Varaint] Extract test data gen logic to macro --- parquet-variant-compute/src/variant_get.rs | 328 ++++++++------------- 1 file changed, 117 insertions(+), 211 deletions(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 49f56af57327..edbe9d99462f 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -297,8 +297,8 @@ mod test { use std::sync::Arc; use arrow::array::{ - Array, ArrayRef, AsArray, BinaryViewArray, BooleanArray, Date32Array, Float32Array, - Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, StringArray, StructArray, + Array, ArrayRef, AsArray, BinaryViewArray, Date32Array, Float32Array, + Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, StringArray, StructArray }; use arrow::buffer::NullBuffer; use arrow::compute::CastOptions; @@ -417,6 +417,49 @@ mod test { }; } + macro_rules! partially_shredded_variant_array_gen { + ($func_name:ident, $typed_value_array_gen: expr) => { + fn $func_name() -> ArrayRef { + let (metadata, string_value) = { + let mut builder = parquet_variant::VariantBuilder::new(); + builder.append_value("n/a"); + builder.finish() + }; + + let nulls = NullBuffer::from(vec![ + true, // row 0 non null + false, // row 1 is null + true, // row 2 non null + true, // row 3 non null + ]); + + // metadata is the same for all rows + let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 4)); + + // See https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?disco=AAABml8WQrY + // about why row1 is an empty but non null, value. + let values = BinaryViewArray::from(vec![ + None, // row 0 is shredded, so no value + Some(b"" as &[u8]), // row 1 is null, so empty value (why?) + Some(&string_value), // copy the string value "N/A" + None, // row 3 is shredded, so no value + ]); + + let typed_value = $typed_value_array_gen(); + + let struct_array = StructArrayBuilder::new() + .with_field("metadata", Arc::new(metadata), false) + .with_field("typed_value", Arc::new(typed_value), true) + .with_field("value", Arc::new(values), true) + .with_nulls(nulls) + .build(); + ArrayRef::from( + VariantArray::try_new(&struct_array).expect("should create variant array"), + ) + } + }; + } + #[test] fn get_variant_partially_shredded_int8_as_variant() { numeric_partially_shredded_test!(i8, partially_shredded_int8_variant_array); @@ -481,6 +524,15 @@ mod test { assert_eq!(result.value(3), Variant::from("world")); } + partially_shredded_variant_array_gen!(partially_shredded_binary_view_variant_array, || { + BinaryViewArray::from(vec![ + Some(&[1u8, 2u8, 3u8][..]), // row 0 is shredded + None, // row 1 is null + None, // row 2 is a string + Some(&[4u8, 5u8, 6u8][..]), // row 3 is shredded + ]) + }); + #[test] fn get_variant_partially_shredded_date32_as_variant() { let array = partially_shredded_date32_variant_array(); @@ -775,51 +827,13 @@ mod test { /// ``` macro_rules! numeric_partially_shredded_variant_array_fn { ($func:ident, $array_type:ident, $primitive_type:ty) => { - fn $func() -> ArrayRef { - // At the time of writing, the `VariantArrayBuilder` does not support shredding. - // so we must construct the array manually. see https://github.com/apache/arrow-rs/issues/7895 - let (metadata, string_value) = { - let mut builder = parquet_variant::VariantBuilder::new(); - builder.append_value("n/a"); - builder.finish() - }; - - let nulls = NullBuffer::from(vec![ - true, // row 0 non null - false, // row 1 is null - true, // row 2 non null - true, // row 3 non null - ]); - - // metadata is the same for all rows - let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 4)); - - // See https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?disco=AAABml8WQrY - // about why row1 is an empty but non null, value. - let values = BinaryViewArray::from(vec![ - None, // row 0 is shredded, so no value - Some(b"" as &[u8]), // row 1 is null, so empty value (why?) - Some(&string_value), // copy the string value "N/A" - None, // row 3 is shredded, so no value - ]); - - let typed_value = $array_type::from(vec![ - Some(<$primitive_type>::try_from(34u8).unwrap()), // row 0 is shredded, so it has a value - None, // row 1 is null, so no value - None, // row 2 is a string, so no typed value - Some(<$primitive_type>::try_from(100u8).unwrap()), // row 3 is shredded, so it has a value - ]); - - let struct_array = StructArrayBuilder::new() - .with_field("metadata", Arc::new(metadata), false) - .with_field("typed_value", Arc::new(typed_value), true) - .with_field("value", Arc::new(values), true) - .with_nulls(nulls) - .build(); - - Arc::new(struct_array) - } - }; + partially_shredded_variant_array_gen!($func, || $array_type::from(vec![ + Some(<$primitive_type>::try_from(34u8).unwrap()), // row 0 is shredded, so it has a value + None, // row 1 is null, so no value + None, // row 2 is a string, so no typed value + Some(<$primitive_type>::try_from(100u8).unwrap()), // row 3 is shredded, so it has a value + ])); + } } numeric_partially_shredded_variant_array_fn!( @@ -853,184 +867,76 @@ mod test { f64 ); - /// Return a VariantArray that represents a partially "shredded" variant for bool - fn partially_shredded_bool_variant_array() -> ArrayRef { - let (metadata, string_value) = { - let mut builder = parquet_variant::VariantBuilder::new(); - builder.append_value("n/a"); - builder.finish() - }; - - let nulls = NullBuffer::from(vec![ - true, // row 0 non null - false, // row 1 is null - true, // row 2 non null - true, // row 3 non null - ]); - - // metadata is the same for all rows - let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 4)); - - // See https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?disco=AAABml8WQrY - // about why row1 is an empty but non null, value. - let values = BinaryViewArray::from(vec![ - None, // row 0 is shredded, so no value - Some(b"" as &[u8]), // row 1 is null, so empty value (why?) - Some(&string_value), // copy the string value "N/A" - None, // row 3 is shredded, so no value - ]); - - let typed_value = BooleanArray::from(vec![ + partially_shredded_variant_array_gen!(partially_shredded_bool_variant_array, || { + arrow::array::BooleanArray::from(vec![ Some(true), // row 0 is shredded, so it has a value None, // row 1 is null, so no value None, // row 2 is a string, so no typed value Some(false), // row 3 is shredded, so it has a value - ]); - - let struct_array = StructArrayBuilder::new() - .with_field("metadata", Arc::new(metadata), false) - .with_field("typed_value", Arc::new(typed_value), true) - .with_field("value", Arc::new(values), true) - .with_nulls(nulls) - .build(); - - Arc::new(struct_array) - } + ]) + }); - /// Return a VariantArray that represents a partially "shredded" variant for UTF8 - fn partially_shredded_utf8_variant_array() -> ArrayRef { - let (metadata, string_value) = { - let mut builder = parquet_variant::VariantBuilder::new(); - builder.append_value("n/a"); - builder.finish() - }; - - // Create the null buffer for the overall array - let nulls = NullBuffer::from(vec![ - true, // row 0 non null - false, // row 1 is null - true, // row 2 non null - true, // row 3 non null - ]); - - // metadata is the same for all rows - let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 4)); - - // See https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?disco=AAABml8WQrY - // about why row1 is an empty but non null, value. - let values = BinaryViewArray::from(vec![ - None, // row 0 is shredded, so no value - Some(b"" as &[u8]), // row 1 is null, so empty value - Some(&string_value), // copy the string value "N/A" - None, // row 3 is shredded, so no value - ]); - - let typed_value = StringArray::from(vec![ + partially_shredded_variant_array_gen!(partially_shredded_utf8_variant_array, || + StringArray::from(vec![ Some("hello"), // row 0 is shredded None, // row 1 is null None, // row 2 is a string Some("world"), // row 3 is shredded - ]); - - let struct_array = StructArrayBuilder::new() - .with_field("metadata", Arc::new(metadata), false) - .with_field("typed_value", Arc::new(typed_value), true) - .with_field("value", Arc::new(values), true) - .with_nulls(nulls) - .build(); - - Arc::new(struct_array) - } - - /// Return a VariantArray that represents a partially "shredded" variant for Date32 - fn partially_shredded_date32_variant_array() -> ArrayRef { - let (metadata, string_value) = { - let mut builder = parquet_variant::VariantBuilder::new(); - builder.append_value("n/a"); - builder.finish() - }; - - // Create the null buffer for the overall array - let nulls = NullBuffer::from(vec![ - true, // row 0 non null - false, // row 1 is null - true, // row 2 non null - true, // row 3 non null - ]); - - // metadata is the same for all rows - let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 4)); - - // See https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?disco=AAABml8WQrY - // about why row1 is an empty but non null, value. - let values = BinaryViewArray::from(vec![ - None, // row 0 is shredded, so no value - Some(b"" as &[u8]), // row 1 is null, so empty value - Some(&string_value), // copy the string value "N/A" - None, // row 3 is shredded, so no value - ]); + ]) + ); - let typed_value = Date32Array::from(vec![ + partially_shredded_variant_array_gen!(partially_shredded_date32_variant_array, || { + Date32Array::from(vec![ Some(20348), // row 0 is shredded, 2025-09-17 None, // row 1 is null None, // row 2 is a string, not a date Some(20340), // row 3 is shredded, 2025-09-09 - ]); - - let struct_array = StructArrayBuilder::new() - .with_field("metadata", Arc::new(metadata), false) - .with_field("typed_value", Arc::new(typed_value), true) - .with_field("value", Arc::new(values), true) - .with_nulls(nulls) - .build(); - - Arc::new(struct_array) - } - - /// Return a VariantArray that represents a partially "shredded" variant for BinaryView - fn partially_shredded_binary_view_variant_array() -> ArrayRef { - let (metadata, string_value) = { - let mut builder = parquet_variant::VariantBuilder::new(); - builder.append_value("n/a"); - builder.finish() - }; - - // Create the null buffer for the overall array - let nulls = NullBuffer::from(vec![ - true, // row 0 non null - false, // row 1 is null - true, // row 2 non null - true, // row 3 non null - ]); - - // metadata is the same for all rows - let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 4)); - - // See https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?disco=AAABml8WQrY - // about why row1 is an empty but non null, value. - let values = BinaryViewArray::from(vec![ - None, // row 0 is shredded, so no value - Some(b"" as &[u8]), // row 1 is null, so empty value - Some(&string_value), // copy the string value "N/A" - None, // row 3 is shredded, so no value - ]); - - let typed_value = BinaryViewArray::from(vec![ - Some(&[1u8, 2u8, 3u8][..]), // row 0 is shredded - None, // row 1 is null - None, // row 2 is a string - Some(&[4u8, 5u8, 6u8][..]), // row 3 is shredded - ]); - - let struct_array = StructArrayBuilder::new() - .with_field("metadata", Arc::new(metadata), false) - .with_field("typed_value", Arc::new(typed_value), true) - .with_field("value", Arc::new(values), true) - .with_nulls(nulls) - .build(); - - Arc::new(struct_array) - } + ]) + }); + // /// Return a VariantArray that represents a partially "shredded" variant for Date32 + // fn partially_shredded_date32_variant_array() -> ArrayRef { + // let (metadata, string_value) = { + // let mut builder = parquet_variant::VariantBuilder::new(); + // builder.append_value("n/a"); + // builder.finish() + // }; + // + // // Create the null buffer for the overall array + // let nulls = NullBuffer::from(vec![ + // true, // row 0 non null + // false, // row 1 is null + // true, // row 2 non null + // true, // row 3 non null + // ]); + // + // // metadata is the same for all rows + // let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 4)); + // + // // See https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?disco=AAABml8WQrY + // // about why row1 is an empty but non null, value. + // let values = BinaryViewArray::from(vec![ + // None, // row 0 is shredded, so no value + // Some(b"" as &[u8]), // row 1 is null, so empty value + // Some(&string_value), // copy the string value "N/A" + // None, // row 3 is shredded, so no value + // ]); + // + // let typed_value = Date32Array::from(vec![ + // Some(20348), // row 0 is shredded, 2025-09-17 + // None, // row 1 is null + // None, // row 2 is a string, not a date + // Some(20340), // row 3 is shredded, 2025-09-09 + // ]); + // + // let struct_array = StructArrayBuilder::new() + // .with_field("metadata", Arc::new(metadata), false) + // .with_field("typed_value", Arc::new(typed_value), true) + // .with_field("value", Arc::new(values), true) + // .with_nulls(nulls) + // .build(); + // + // Arc::new(struct_array) + // } /// Return a VariantArray that represents an "all null" variant /// for the following example (3 null values): From 4518a731a2dfb3b3abf485d5aac0328c801b7978 Mon Sep 17 00:00:00 2001 From: klion26 Date: Mon, 22 Sep 2025 10:08:02 +0800 Subject: [PATCH 2/6] [Variant] Support typed access for timestamp(micro&nano) --- parquet-variant-compute/src/variant_array.rs | 49 ++++- parquet-variant-compute/src/variant_get.rs | 217 ++++++++++++++++++- parquet/tests/variant_integration.rs | 19 +- 3 files changed, 266 insertions(+), 19 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index 16dbff4c341a..fa296cb83019 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -17,15 +17,17 @@ //! [`VariantArray`] implementation -use crate::type_conversion::primitive_conversion_single_value; +use crate::type_conversion::{generic_conversion_single_value, primitive_conversion_single_value}; use arrow::array::{Array, ArrayRef, AsArray, BinaryViewArray, StructArray}; use arrow::buffer::NullBuffer; use arrow::compute::cast; use arrow::datatypes::{ Date32Type, Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, + TimestampMicrosecondType, TimestampNanosecondType }; use arrow_schema::extension::ExtensionType; use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields, TimeUnit}; +use chrono::DateTime; use parquet_variant::Uuid; use parquet_variant::Variant; @@ -837,6 +839,51 @@ fn typed_value_to_variant<'a>( DataType::Float64 => { primitive_conversion_single_value!(Float64Type, typed_value, index) } + DataType::Timestamp(timeunit, tz) => { + match (timeunit, tz) { + (TimeUnit::Microsecond, Some(_)) => { + generic_conversion_single_value!( + TimestampMicrosecondType, + as_primitive, + |v| DateTime::from_timestamp_micros(v).unwrap(), + typed_value, + index + ) + } + (TimeUnit::Microsecond, None) => { + generic_conversion_single_value!( + TimestampMicrosecondType, + as_primitive, + |v| DateTime::from_timestamp_micros(v).unwrap().naive_utc(), + typed_value, + index + ) + } + (TimeUnit::Nanosecond, Some(_)) => { + generic_conversion_single_value!( + TimestampNanosecondType, + as_primitive, + DateTime::from_timestamp_nanos, + typed_value, + index + ) + } + (TimeUnit::Nanosecond, None) => { + generic_conversion_single_value!( + TimestampNanosecondType, + as_primitive, + |v| DateTime::from_timestamp_nanos(v).naive_utc(), + typed_value, + index + ) + } + // Variant timestamp only support time unit with microsecond or nanosecond precision + _ => panic!( + "Variant only support timestamp with microsecond or nanosecond precision" + ), + } + } + // todo other types here (note this is very similar to cast_to_variant.rs) // so it would be great to figure out how to share this code _ => { diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index edbe9d99462f..d66bc9920da7 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -34,7 +34,7 @@ pub(crate) enum ShreddedPathStep { /// Path step succeeded, return the new shredding state Success(ShreddingState), /// The path element is not present in the `typed_value` column and there is no `value` column, - /// so we we know it does not exist. It, and all paths under it, are all-NULL. + /// so we know it does not exist. It, and all paths under it, are all-NULL. Missing, /// The path element is not present in the `typed_value` column and must be retrieved from the `value` /// column instead. The caller should be prepared to handle any value, including the requested @@ -300,17 +300,24 @@ mod test { Array, ArrayRef, AsArray, BinaryViewArray, Date32Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, StringArray, StructArray }; + use std::sync::Arc; + + use arrow::array::{ + Array, ArrayRef, AsArray, BinaryViewArray, Date32Array, + Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, + StringArray, StructArray + }; + use super::{variant_get, GetOptions}; + use crate::json_to_variant; + use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder}; + use crate::VariantArray; use arrow::buffer::NullBuffer; use arrow::compute::CastOptions; use arrow::datatypes::DataType::{Int16, Int32, Int64}; use arrow_schema::{DataType, Field, FieldRef, Fields}; + use chrono::DateTime; use parquet_variant::{Variant, VariantPath, EMPTY_VARIANT_METADATA_BYTES}; - - use crate::json_to_variant; - use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder}; - use crate::VariantArray; - - use super::{variant_get, GetOptions}; + use std::sync::Arc; fn single_variant_get_test(input_json: &str, path: VariantPath, expected_json: &str) { // Create input array from JSON string @@ -802,6 +809,156 @@ mod test { f64 ); + macro_rules! assert_variant_get_as_variant_array_with_default_option { + ($variant_array: expr, $array_expected: expr) => {{ + let options = GetOptions::new(); + let array = $variant_array; + let result = variant_get(&array, options).unwrap(); + + // expect the result is a VariantArray + let result = VariantArray::try_new(&result).unwrap(); + + assert_eq!(result.len(), $array_expected.len()); + + for (idx, item) in $array_expected.into_iter().enumerate() { + match item { + Some(item) => assert_eq!(result.value(idx), item), + None => assert!(result.is_null(idx)), + } + } + }}; + } + + partially_shredded_variant_array_gen!( + partially_shredded_timestamp_micro_ntz_variant_array, + || { + arrow::array::TimestampMicrosecondArray::from(vec![ + Some(-456000), + None, + None, + Some(1758602096000000), + ]) + } + ); + + #[test] + fn get_variant_partial_shredded_timestamp_micro_ntz_as_variant() { + let array = partially_shredded_timestamp_micro_ntz_variant_array(); + assert_variant_get_as_variant_array_with_default_option!( + array, + vec![ + Some(Variant::from( + DateTime::from_timestamp_micros(-456000i64) + .unwrap() + .naive_utc(), + )), + None, + Some(Variant::from("n/a")), + Some(Variant::from( + DateTime::parse_from_rfc3339("2025-09-23T12:34:56+08:00") + .unwrap() + .naive_utc(), + )), + ] + ) + } + + partially_shredded_variant_array_gen!(partially_shredded_timestamp_micro_variant_array, || { + arrow::array::TimestampMicrosecondArray::from(vec![ + Some(-456000), + None, + None, + Some(1758602096000000), + ]) + .with_timezone("+00:00") + }); + + #[test] + fn get_variant_partial_shredded_timestamp_micro_as_variant() { + let array = partially_shredded_timestamp_micro_variant_array(); + assert_variant_get_as_variant_array_with_default_option!( + array, + vec![ + Some(Variant::from( + DateTime::from_timestamp_micros(-456000i64) + .unwrap() + .to_utc(), + )), + None, + Some(Variant::from("n/a")), + Some(Variant::from( + DateTime::parse_from_rfc3339("2025-09-23T12:34:56+08:00") + .unwrap() + .to_utc(), + )), + ] + ) + } + + partially_shredded_variant_array_gen!( + partially_shredded_timestamp_nano_ntz_variant_array, + || { + arrow::array::TimestampNanosecondArray::from(vec![ + Some(-4999999561), + None, + None, + Some(1758602096000000000), + ]) + } + ); + + #[test] + fn get_variant_partial_shredded_timestamp_nano_ntz_as_variant() { + let array = partially_shredded_timestamp_nano_ntz_variant_array(); + + assert_variant_get_as_variant_array_with_default_option!( + array, + vec![ + Some(Variant::from( + DateTime::from_timestamp(-5, 439).unwrap().naive_utc() + )), + None, + Some(Variant::from("n/a")), + Some(Variant::from( + DateTime::parse_from_rfc3339("2025-09-23T12:34:56+08:00") + .unwrap() + .naive_utc() + )), + ] + ) + } + + partially_shredded_variant_array_gen!(partially_shredded_timestamp_nano_variant_array, || { + arrow::array::TimestampNanosecondArray::from(vec![ + Some(-4999999561), + None, + None, + Some(1758602096000000000), + ]) + .with_timezone("+00:00") + }); + + #[test] + fn get_variant_partial_shredded_timestamp_nano_as_variant() { + let array = partially_shredded_timestamp_nano_variant_array(); + + assert_variant_get_as_variant_array_with_default_option!( + array, + vec![ + Some(Variant::from( + DateTime::from_timestamp(-5, 439).unwrap().to_utc() + )), + None, + Some(Variant::from("n/a")), + Some(Variant::from( + DateTime::parse_from_rfc3339("2025-09-23T12:34:56+08:00") + .unwrap() + .to_utc() + )), + ] + ) + } + /// Return a VariantArray that represents a normal "shredded" variant /// for the following example /// @@ -836,6 +993,52 @@ mod test { } } + macro_rules! partially_shredded_variant_array_gen { + ($func:ident, $typed_array_gen: expr) => { + fn $func() -> ArrayRef { + // At the time of writing, the `VariantArrayBuilder` does not support shredding. + // so we must construct the array manually. see https://github.com/apache/arrow-rs/issues/7895 + let (metadata, string_value) = { + let mut builder = parquet_variant::VariantBuilder::new(); + builder.append_value("n/a"); + builder.finish() + }; + + let nulls = NullBuffer::from(vec![ + true, // row 0 non null + false, // row 1 is null + true, // row 2 non null + true, // row 3 non null + ]); + + // metadata is the same for all rows + let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 4)); + + // See https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?disco=AAABml8WQrY + // about why row1 is an empty but non null, value. + let values = BinaryViewArray::from(vec![ + None, // row 0 is shredded, so no value + Some(b"" as &[u8]), // row 1 is null, so empty value (why?) + Some(&string_value), // copy the string value "N/A" + None, // row 3 is shredded, so no value + ]); + + let typed_value = $typed_array_gen(); + + let struct_array = StructArrayBuilder::new() + .with_field("metadata", Arc::new(metadata), false) + .with_field("typed_value", Arc::new(typed_value), true) + .with_field("value", Arc::new(values), true) + .with_nulls(nulls) + .build(); + + ArrayRef::from( + VariantArray::try_new(&struct_array).expect("should create variant array"), + ) + } + }; + } + numeric_partially_shredded_variant_array_fn!( partially_shredded_int8_variant_array, Int8Array, diff --git a/parquet/tests/variant_integration.rs b/parquet/tests/variant_integration.rs index 9f202f4db803..9653c82462c0 100644 --- a/parquet/tests/variant_integration.rs +++ b/parquet/tests/variant_integration.rs @@ -91,11 +91,10 @@ variant_test_case!(16); variant_test_case!(17); variant_test_case!(18); variant_test_case!(19); -// https://github.com/apache/arrow-rs/issues/8331 -variant_test_case!(20, "Unsupported typed_value type: Timestamp(µs, \"UTC\")"); -variant_test_case!(21, "Unsupported typed_value type: Timestamp(µs, \"UTC\")"); -variant_test_case!(22, "Unsupported typed_value type: Timestamp(µs)"); -variant_test_case!(23, "Unsupported typed_value type: Timestamp(µs)"); +variant_test_case!(20); +variant_test_case!(21); +variant_test_case!(22); +variant_test_case!(23); // https://github.com/apache/arrow-rs/issues/8332 variant_test_case!(24, "Unsupported typed_value type: Decimal128(9, 4)"); variant_test_case!(25, "Unsupported typed_value type: Decimal128(9, 4)"); @@ -105,13 +104,11 @@ variant_test_case!(28, "Unsupported typed_value type: Decimal128(38, 9)"); variant_test_case!(29, "Unsupported typed_value type: Decimal128(38, 9)"); variant_test_case!(30); variant_test_case!(31); -// https://github.com/apache/arrow-rs/issues/8334 variant_test_case!(32, "Unsupported typed_value type: Time64(µs)"); -// https://github.com/apache/arrow-rs/issues/8331 -variant_test_case!(33, "Unsupported typed_value type: Timestamp(ns, \"UTC\")"); -variant_test_case!(34, "Unsupported typed_value type: Timestamp(ns, \"UTC\")"); -variant_test_case!(35, "Unsupported typed_value type: Timestamp(ns)"); -variant_test_case!(36, "Unsupported typed_value type: Timestamp(ns)"); +variant_test_case!(33); +variant_test_case!(34); +variant_test_case!(35); +variant_test_case!(36); variant_test_case!(37); // https://github.com/apache/arrow-rs/issues/8336 variant_test_case!(38, "Unsupported typed_value type: Struct("); From ccb0e65f6a6c1396c2385679a1a0e352ba6f266d Mon Sep 17 00:00:00 2001 From: klion26 Date: Wed, 24 Sep 2025 22:17:29 +0800 Subject: [PATCH 3/6] address comment --- parquet-variant-compute/src/variant_array.rs | 79 +++++++++----------- 1 file changed, 35 insertions(+), 44 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index fa296cb83019..812f7123ec4f 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -839,51 +839,42 @@ fn typed_value_to_variant<'a>( DataType::Float64 => { primitive_conversion_single_value!(Float64Type, typed_value, index) } - DataType::Timestamp(timeunit, tz) => { - match (timeunit, tz) { - (TimeUnit::Microsecond, Some(_)) => { - generic_conversion_single_value!( - TimestampMicrosecondType, - as_primitive, - |v| DateTime::from_timestamp_micros(v).unwrap(), - typed_value, - index - ) - } - (TimeUnit::Microsecond, None) => { - generic_conversion_single_value!( - TimestampMicrosecondType, - as_primitive, - |v| DateTime::from_timestamp_micros(v).unwrap().naive_utc(), - typed_value, - index - ) - } - (TimeUnit::Nanosecond, Some(_)) => { - generic_conversion_single_value!( - TimestampNanosecondType, - as_primitive, - DateTime::from_timestamp_nanos, - typed_value, - index - ) - } - (TimeUnit::Nanosecond, None) => { - generic_conversion_single_value!( - TimestampNanosecondType, - as_primitive, - |v| DateTime::from_timestamp_nanos(v).naive_utc(), - typed_value, - index - ) - } - // Variant timestamp only support time unit with microsecond or nanosecond precision - _ => panic!( - "Variant only support timestamp with microsecond or nanosecond precision" - ), - } + DataType::Timestamp(TimeUnit::Microsecond, Some(_)) => { + generic_conversion_single_value!( + TimestampMicrosecondType, + as_primitive, + |v| DateTime::from_timestamp_micros(v).unwrap(), + typed_value, + index + ) + } + DataType::Timestamp(TimeUnit::Microsecond, None) => { + generic_conversion_single_value!( + TimestampMicrosecondType, + as_primitive, + |v| DateTime::from_timestamp_micros(v).unwrap().naive_utc(), + typed_value, + index + ) + } + DataType::Timestamp(TimeUnit::Nanosecond, Some(_)) => { + generic_conversion_single_value!( + TimestampNanosecondType, + as_primitive, + DateTime::from_timestamp_nanos, + typed_value, + index + ) + } + DataType::Timestamp(TimeUnit::Nanosecond, None) => { + generic_conversion_single_value!( + TimestampNanosecondType, + as_primitive, + |v| DateTime::from_timestamp_nanos(v).naive_utc(), + typed_value, + index + ) } - // todo other types here (note this is very similar to cast_to_variant.rs) // so it would be great to figure out how to share this code _ => { From d91fbb1e2ee584412b6ee2031f38cb21d4f672a4 Mon Sep 17 00:00:00 2001 From: klion26 Date: Wed, 24 Sep 2025 22:42:39 +0800 Subject: [PATCH 4/6] fix style --- parquet-variant-compute/src/variant_get.rs | 57 ++-------------------- 1 file changed, 3 insertions(+), 54 deletions(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index d66bc9920da7..df7b5d9a5775 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -296,12 +296,6 @@ impl<'a> GetOptions<'a> { mod test { use std::sync::Arc; - use arrow::array::{ - Array, ArrayRef, AsArray, BinaryViewArray, Date32Array, Float32Array, - Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, StringArray, StructArray - }; - use std::sync::Arc; - use arrow::array::{ Array, ArrayRef, AsArray, BinaryViewArray, Date32Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, @@ -317,7 +311,6 @@ mod test { use arrow_schema::{DataType, Field, FieldRef, Fields}; use chrono::DateTime; use parquet_variant::{Variant, VariantPath, EMPTY_VARIANT_METADATA_BYTES}; - use std::sync::Arc; fn single_variant_get_test(input_json: &str, path: VariantPath, expected_json: &str) { // Create input array from JSON string @@ -990,7 +983,7 @@ mod test { None, // row 2 is a string, so no typed value Some(<$primitive_type>::try_from(100u8).unwrap()), // row 3 is shredded, so it has a value ])); - } + }; } macro_rules! partially_shredded_variant_array_gen { @@ -1079,14 +1072,14 @@ mod test { ]) }); - partially_shredded_variant_array_gen!(partially_shredded_utf8_variant_array, || + partially_shredded_variant_array_gen!(partially_shredded_utf8_variant_array, || { StringArray::from(vec![ Some("hello"), // row 0 is shredded None, // row 1 is null None, // row 2 is a string Some("world"), // row 3 is shredded ]) - ); + }); partially_shredded_variant_array_gen!(partially_shredded_date32_variant_array, || { Date32Array::from(vec![ @@ -1096,50 +1089,6 @@ mod test { Some(20340), // row 3 is shredded, 2025-09-09 ]) }); - // /// Return a VariantArray that represents a partially "shredded" variant for Date32 - // fn partially_shredded_date32_variant_array() -> ArrayRef { - // let (metadata, string_value) = { - // let mut builder = parquet_variant::VariantBuilder::new(); - // builder.append_value("n/a"); - // builder.finish() - // }; - // - // // Create the null buffer for the overall array - // let nulls = NullBuffer::from(vec![ - // true, // row 0 non null - // false, // row 1 is null - // true, // row 2 non null - // true, // row 3 non null - // ]); - // - // // metadata is the same for all rows - // let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 4)); - // - // // See https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?disco=AAABml8WQrY - // // about why row1 is an empty but non null, value. - // let values = BinaryViewArray::from(vec![ - // None, // row 0 is shredded, so no value - // Some(b"" as &[u8]), // row 1 is null, so empty value - // Some(&string_value), // copy the string value "N/A" - // None, // row 3 is shredded, so no value - // ]); - // - // let typed_value = Date32Array::from(vec![ - // Some(20348), // row 0 is shredded, 2025-09-17 - // None, // row 1 is null - // None, // row 2 is a string, not a date - // Some(20340), // row 3 is shredded, 2025-09-09 - // ]); - // - // let struct_array = StructArrayBuilder::new() - // .with_field("metadata", Arc::new(metadata), false) - // .with_field("typed_value", Arc::new(typed_value), true) - // .with_field("value", Arc::new(values), true) - // .with_nulls(nulls) - // .build(); - // - // Arc::new(struct_array) - // } /// Return a VariantArray that represents an "all null" variant /// for the following example (3 null values): From bb4c3ebcc9c092809b10adcb0e98bc8098e1c12c Mon Sep 17 00:00:00 2001 From: klion26 Date: Fri, 26 Sep 2025 09:00:35 +0800 Subject: [PATCH 5/6] fix style after rebase --- parquet-variant-compute/src/variant_array.rs | 2 +- parquet-variant-compute/src/variant_get.rs | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index 812f7123ec4f..bf24eb626611 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -23,7 +23,7 @@ use arrow::buffer::NullBuffer; use arrow::compute::cast; use arrow::datatypes::{ Date32Type, Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, - TimestampMicrosecondType, TimestampNanosecondType + TimestampMicrosecondType, TimestampNanosecondType, }; use arrow_schema::extension::ExtensionType; use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields, TimeUnit}; diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index df7b5d9a5775..a923732ca41b 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -296,15 +296,14 @@ impl<'a> GetOptions<'a> { mod test { use std::sync::Arc; - use arrow::array::{ - Array, ArrayRef, AsArray, BinaryViewArray, Date32Array, - Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, - StringArray, StructArray - }; use super::{variant_get, GetOptions}; use crate::json_to_variant; use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder}; use crate::VariantArray; + use arrow::array::{ + Array, ArrayRef, AsArray, BinaryViewArray, Date32Array, Float32Array, Float64Array, + Int16Array, Int32Array, Int64Array, Int8Array, StringArray, StructArray, + }; use arrow::buffer::NullBuffer; use arrow::compute::CastOptions; use arrow::datatypes::DataType::{Int16, Int32, Int64}; From 12b97ee935c58604f33b8b6d91328520747067a2 Mon Sep 17 00:00:00 2001 From: klion26 Date: Fri, 26 Sep 2025 09:20:55 +0800 Subject: [PATCH 6/6] fix --- parquet/tests/variant_integration.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/parquet/tests/variant_integration.rs b/parquet/tests/variant_integration.rs index 9653c82462c0..a933a3faa1d4 100644 --- a/parquet/tests/variant_integration.rs +++ b/parquet/tests/variant_integration.rs @@ -104,6 +104,7 @@ variant_test_case!(28, "Unsupported typed_value type: Decimal128(38, 9)"); variant_test_case!(29, "Unsupported typed_value type: Decimal128(38, 9)"); variant_test_case!(30); variant_test_case!(31); +// https://github.com/apache/arrow-rs/issues/8334 variant_test_case!(32, "Unsupported typed_value type: Time64(µs)"); variant_test_case!(33); variant_test_case!(34);