diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index 16dbff4c341a..bf24eb626611 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -17,15 +17,17 @@ //! [`VariantArray`] implementation -use crate::type_conversion::primitive_conversion_single_value; +use crate::type_conversion::{generic_conversion_single_value, primitive_conversion_single_value}; use arrow::array::{Array, ArrayRef, AsArray, BinaryViewArray, StructArray}; use arrow::buffer::NullBuffer; use arrow::compute::cast; use arrow::datatypes::{ Date32Type, Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, + TimestampMicrosecondType, TimestampNanosecondType, }; use arrow_schema::extension::ExtensionType; use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields, TimeUnit}; +use chrono::DateTime; use parquet_variant::Uuid; use parquet_variant::Variant; @@ -837,6 +839,42 @@ fn typed_value_to_variant<'a>( DataType::Float64 => { primitive_conversion_single_value!(Float64Type, typed_value, index) } + DataType::Timestamp(TimeUnit::Microsecond, Some(_)) => { + generic_conversion_single_value!( + TimestampMicrosecondType, + as_primitive, + |v| DateTime::from_timestamp_micros(v).unwrap(), + typed_value, + index + ) + } + DataType::Timestamp(TimeUnit::Microsecond, None) => { + generic_conversion_single_value!( + TimestampMicrosecondType, + as_primitive, + |v| DateTime::from_timestamp_micros(v).unwrap().naive_utc(), + typed_value, + index + ) + } + DataType::Timestamp(TimeUnit::Nanosecond, Some(_)) => { + generic_conversion_single_value!( + TimestampNanosecondType, + as_primitive, + DateTime::from_timestamp_nanos, + typed_value, + index + ) + } + DataType::Timestamp(TimeUnit::Nanosecond, None) => { + generic_conversion_single_value!( + TimestampNanosecondType, + as_primitive, + |v| DateTime::from_timestamp_nanos(v).naive_utc(), + typed_value, + index + ) + } // todo other types here (note this is very similar to cast_to_variant.rs) // so it would be great to figure out how to share this code _ => { diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 49f56af57327..a923732ca41b 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -34,7 +34,7 @@ pub(crate) enum ShreddedPathStep { /// Path step succeeded, return the new shredding state Success(ShreddingState), /// The path element is not present in the `typed_value` column and there is no `value` column, - /// so we we know it does not exist. It, and all paths under it, are all-NULL. + /// so we know it does not exist. It, and all paths under it, are all-NULL. Missing, /// The path element is not present in the `typed_value` column and must be retrieved from the `value` /// column instead. The caller should be prepared to handle any value, including the requested @@ -296,22 +296,21 @@ impl<'a> GetOptions<'a> { mod test { use std::sync::Arc; + use super::{variant_get, GetOptions}; + use crate::json_to_variant; + use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder}; + use crate::VariantArray; use arrow::array::{ - Array, ArrayRef, AsArray, BinaryViewArray, BooleanArray, Date32Array, Float32Array, - Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, StringArray, StructArray, + Array, ArrayRef, AsArray, BinaryViewArray, Date32Array, Float32Array, Float64Array, + Int16Array, Int32Array, Int64Array, Int8Array, StringArray, StructArray, }; use arrow::buffer::NullBuffer; use arrow::compute::CastOptions; use arrow::datatypes::DataType::{Int16, Int32, Int64}; use arrow_schema::{DataType, Field, FieldRef, Fields}; + use chrono::DateTime; use parquet_variant::{Variant, VariantPath, EMPTY_VARIANT_METADATA_BYTES}; - use crate::json_to_variant; - use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder}; - use crate::VariantArray; - - use super::{variant_get, GetOptions}; - fn single_variant_get_test(input_json: &str, path: VariantPath, expected_json: &str) { // Create input array from JSON string let input_array_ref: ArrayRef = Arc::new(StringArray::from(vec![Some(input_json)])); @@ -417,6 +416,49 @@ mod test { }; } + macro_rules! partially_shredded_variant_array_gen { + ($func_name:ident, $typed_value_array_gen: expr) => { + fn $func_name() -> ArrayRef { + let (metadata, string_value) = { + let mut builder = parquet_variant::VariantBuilder::new(); + builder.append_value("n/a"); + builder.finish() + }; + + let nulls = NullBuffer::from(vec![ + true, // row 0 non null + false, // row 1 is null + true, // row 2 non null + true, // row 3 non null + ]); + + // metadata is the same for all rows + let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 4)); + + // See https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?disco=AAABml8WQrY + // about why row1 is an empty but non null, value. + let values = BinaryViewArray::from(vec![ + None, // row 0 is shredded, so no value + Some(b"" as &[u8]), // row 1 is null, so empty value (why?) + Some(&string_value), // copy the string value "N/A" + None, // row 3 is shredded, so no value + ]); + + let typed_value = $typed_value_array_gen(); + + let struct_array = StructArrayBuilder::new() + .with_field("metadata", Arc::new(metadata), false) + .with_field("typed_value", Arc::new(typed_value), true) + .with_field("value", Arc::new(values), true) + .with_nulls(nulls) + .build(); + ArrayRef::from( + VariantArray::try_new(&struct_array).expect("should create variant array"), + ) + } + }; + } + #[test] fn get_variant_partially_shredded_int8_as_variant() { numeric_partially_shredded_test!(i8, partially_shredded_int8_variant_array); @@ -481,6 +523,15 @@ mod test { assert_eq!(result.value(3), Variant::from("world")); } + partially_shredded_variant_array_gen!(partially_shredded_binary_view_variant_array, || { + BinaryViewArray::from(vec![ + Some(&[1u8, 2u8, 3u8][..]), // row 0 is shredded + None, // row 1 is null + None, // row 2 is a string + Some(&[4u8, 5u8, 6u8][..]), // row 3 is shredded + ]) + }); + #[test] fn get_variant_partially_shredded_date32_as_variant() { let array = partially_shredded_date32_variant_array(); @@ -750,6 +801,156 @@ mod test { f64 ); + macro_rules! assert_variant_get_as_variant_array_with_default_option { + ($variant_array: expr, $array_expected: expr) => {{ + let options = GetOptions::new(); + let array = $variant_array; + let result = variant_get(&array, options).unwrap(); + + // expect the result is a VariantArray + let result = VariantArray::try_new(&result).unwrap(); + + assert_eq!(result.len(), $array_expected.len()); + + for (idx, item) in $array_expected.into_iter().enumerate() { + match item { + Some(item) => assert_eq!(result.value(idx), item), + None => assert!(result.is_null(idx)), + } + } + }}; + } + + partially_shredded_variant_array_gen!( + partially_shredded_timestamp_micro_ntz_variant_array, + || { + arrow::array::TimestampMicrosecondArray::from(vec![ + Some(-456000), + None, + None, + Some(1758602096000000), + ]) + } + ); + + #[test] + fn get_variant_partial_shredded_timestamp_micro_ntz_as_variant() { + let array = partially_shredded_timestamp_micro_ntz_variant_array(); + assert_variant_get_as_variant_array_with_default_option!( + array, + vec![ + Some(Variant::from( + DateTime::from_timestamp_micros(-456000i64) + .unwrap() + .naive_utc(), + )), + None, + Some(Variant::from("n/a")), + Some(Variant::from( + DateTime::parse_from_rfc3339("2025-09-23T12:34:56+08:00") + .unwrap() + .naive_utc(), + )), + ] + ) + } + + partially_shredded_variant_array_gen!(partially_shredded_timestamp_micro_variant_array, || { + arrow::array::TimestampMicrosecondArray::from(vec![ + Some(-456000), + None, + None, + Some(1758602096000000), + ]) + .with_timezone("+00:00") + }); + + #[test] + fn get_variant_partial_shredded_timestamp_micro_as_variant() { + let array = partially_shredded_timestamp_micro_variant_array(); + assert_variant_get_as_variant_array_with_default_option!( + array, + vec![ + Some(Variant::from( + DateTime::from_timestamp_micros(-456000i64) + .unwrap() + .to_utc(), + )), + None, + Some(Variant::from("n/a")), + Some(Variant::from( + DateTime::parse_from_rfc3339("2025-09-23T12:34:56+08:00") + .unwrap() + .to_utc(), + )), + ] + ) + } + + partially_shredded_variant_array_gen!( + partially_shredded_timestamp_nano_ntz_variant_array, + || { + arrow::array::TimestampNanosecondArray::from(vec![ + Some(-4999999561), + None, + None, + Some(1758602096000000000), + ]) + } + ); + + #[test] + fn get_variant_partial_shredded_timestamp_nano_ntz_as_variant() { + let array = partially_shredded_timestamp_nano_ntz_variant_array(); + + assert_variant_get_as_variant_array_with_default_option!( + array, + vec![ + Some(Variant::from( + DateTime::from_timestamp(-5, 439).unwrap().naive_utc() + )), + None, + Some(Variant::from("n/a")), + Some(Variant::from( + DateTime::parse_from_rfc3339("2025-09-23T12:34:56+08:00") + .unwrap() + .naive_utc() + )), + ] + ) + } + + partially_shredded_variant_array_gen!(partially_shredded_timestamp_nano_variant_array, || { + arrow::array::TimestampNanosecondArray::from(vec![ + Some(-4999999561), + None, + None, + Some(1758602096000000000), + ]) + .with_timezone("+00:00") + }); + + #[test] + fn get_variant_partial_shredded_timestamp_nano_as_variant() { + let array = partially_shredded_timestamp_nano_variant_array(); + + assert_variant_get_as_variant_array_with_default_option!( + array, + vec![ + Some(Variant::from( + DateTime::from_timestamp(-5, 439).unwrap().to_utc() + )), + None, + Some(Variant::from("n/a")), + Some(Variant::from( + DateTime::parse_from_rfc3339("2025-09-23T12:34:56+08:00") + .unwrap() + .to_utc() + )), + ] + ) + } + /// Return a VariantArray that represents a normal "shredded" variant /// for the following example /// @@ -775,6 +976,17 @@ mod test { /// ``` macro_rules! numeric_partially_shredded_variant_array_fn { ($func:ident, $array_type:ident, $primitive_type:ty) => { + partially_shredded_variant_array_gen!($func, || $array_type::from(vec![ + Some(<$primitive_type>::try_from(34u8).unwrap()), // row 0 is shredded, so it has a value + None, // row 1 is null, so no value + None, // row 2 is a string, so no typed value + Some(<$primitive_type>::try_from(100u8).unwrap()), // row 3 is shredded, so it has a value + ])); + }; + } + + macro_rules! partially_shredded_variant_array_gen { + ($func:ident, $typed_array_gen: expr) => { fn $func() -> ArrayRef { // At the time of writing, the `VariantArrayBuilder` does not support shredding. // so we must construct the array manually. see https://github.com/apache/arrow-rs/issues/7895 @@ -803,12 +1015,7 @@ mod test { None, // row 3 is shredded, so no value ]); - let typed_value = $array_type::from(vec![ - Some(<$primitive_type>::try_from(34u8).unwrap()), // row 0 is shredded, so it has a value - None, // row 1 is null, so no value - None, // row 2 is a string, so no typed value - Some(<$primitive_type>::try_from(100u8).unwrap()), // row 3 is shredded, so it has a value - ]); + let typed_value = $typed_array_gen(); let struct_array = StructArrayBuilder::new() .with_field("metadata", Arc::new(metadata), false) @@ -817,7 +1024,9 @@ mod test { .with_nulls(nulls) .build(); - Arc::new(struct_array) + ArrayRef::from( + VariantArray::try_new(&struct_array).expect("should create variant array"), + ) } }; } @@ -853,184 +1062,32 @@ mod test { f64 ); - /// Return a VariantArray that represents a partially "shredded" variant for bool - fn partially_shredded_bool_variant_array() -> ArrayRef { - let (metadata, string_value) = { - let mut builder = parquet_variant::VariantBuilder::new(); - builder.append_value("n/a"); - builder.finish() - }; - - let nulls = NullBuffer::from(vec![ - true, // row 0 non null - false, // row 1 is null - true, // row 2 non null - true, // row 3 non null - ]); - - // metadata is the same for all rows - let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 4)); - - // See https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?disco=AAABml8WQrY - // about why row1 is an empty but non null, value. - let values = BinaryViewArray::from(vec![ - None, // row 0 is shredded, so no value - Some(b"" as &[u8]), // row 1 is null, so empty value (why?) - Some(&string_value), // copy the string value "N/A" - None, // row 3 is shredded, so no value - ]); - - let typed_value = BooleanArray::from(vec![ + partially_shredded_variant_array_gen!(partially_shredded_bool_variant_array, || { + arrow::array::BooleanArray::from(vec![ Some(true), // row 0 is shredded, so it has a value None, // row 1 is null, so no value None, // row 2 is a string, so no typed value Some(false), // row 3 is shredded, so it has a value - ]); - - let struct_array = StructArrayBuilder::new() - .with_field("metadata", Arc::new(metadata), false) - .with_field("typed_value", Arc::new(typed_value), true) - .with_field("value", Arc::new(values), true) - .with_nulls(nulls) - .build(); + ]) + }); - Arc::new(struct_array) - } - - /// Return a VariantArray that represents a partially "shredded" variant for UTF8 - fn partially_shredded_utf8_variant_array() -> ArrayRef { - let (metadata, string_value) = { - let mut builder = parquet_variant::VariantBuilder::new(); - builder.append_value("n/a"); - builder.finish() - }; - - // Create the null buffer for the overall array - let nulls = NullBuffer::from(vec![ - true, // row 0 non null - false, // row 1 is null - true, // row 2 non null - true, // row 3 non null - ]); - - // metadata is the same for all rows - let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 4)); - - // See https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?disco=AAABml8WQrY - // about why row1 is an empty but non null, value. - let values = BinaryViewArray::from(vec![ - None, // row 0 is shredded, so no value - Some(b"" as &[u8]), // row 1 is null, so empty value - Some(&string_value), // copy the string value "N/A" - None, // row 3 is shredded, so no value - ]); - - let typed_value = StringArray::from(vec![ + partially_shredded_variant_array_gen!(partially_shredded_utf8_variant_array, || { + StringArray::from(vec![ Some("hello"), // row 0 is shredded None, // row 1 is null None, // row 2 is a string Some("world"), // row 3 is shredded - ]); - - let struct_array = StructArrayBuilder::new() - .with_field("metadata", Arc::new(metadata), false) - .with_field("typed_value", Arc::new(typed_value), true) - .with_field("value", Arc::new(values), true) - .with_nulls(nulls) - .build(); - - Arc::new(struct_array) - } + ]) + }); - /// Return a VariantArray that represents a partially "shredded" variant for Date32 - fn partially_shredded_date32_variant_array() -> ArrayRef { - let (metadata, string_value) = { - let mut builder = parquet_variant::VariantBuilder::new(); - builder.append_value("n/a"); - builder.finish() - }; - - // Create the null buffer for the overall array - let nulls = NullBuffer::from(vec![ - true, // row 0 non null - false, // row 1 is null - true, // row 2 non null - true, // row 3 non null - ]); - - // metadata is the same for all rows - let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 4)); - - // See https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?disco=AAABml8WQrY - // about why row1 is an empty but non null, value. - let values = BinaryViewArray::from(vec![ - None, // row 0 is shredded, so no value - Some(b"" as &[u8]), // row 1 is null, so empty value - Some(&string_value), // copy the string value "N/A" - None, // row 3 is shredded, so no value - ]); - - let typed_value = Date32Array::from(vec![ + partially_shredded_variant_array_gen!(partially_shredded_date32_variant_array, || { + Date32Array::from(vec![ Some(20348), // row 0 is shredded, 2025-09-17 None, // row 1 is null None, // row 2 is a string, not a date Some(20340), // row 3 is shredded, 2025-09-09 - ]); - - let struct_array = StructArrayBuilder::new() - .with_field("metadata", Arc::new(metadata), false) - .with_field("typed_value", Arc::new(typed_value), true) - .with_field("value", Arc::new(values), true) - .with_nulls(nulls) - .build(); - - Arc::new(struct_array) - } - - /// Return a VariantArray that represents a partially "shredded" variant for BinaryView - fn partially_shredded_binary_view_variant_array() -> ArrayRef { - let (metadata, string_value) = { - let mut builder = parquet_variant::VariantBuilder::new(); - builder.append_value("n/a"); - builder.finish() - }; - - // Create the null buffer for the overall array - let nulls = NullBuffer::from(vec![ - true, // row 0 non null - false, // row 1 is null - true, // row 2 non null - true, // row 3 non null - ]); - - // metadata is the same for all rows - let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 4)); - - // See https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?disco=AAABml8WQrY - // about why row1 is an empty but non null, value. - let values = BinaryViewArray::from(vec![ - None, // row 0 is shredded, so no value - Some(b"" as &[u8]), // row 1 is null, so empty value - Some(&string_value), // copy the string value "N/A" - None, // row 3 is shredded, so no value - ]); - - let typed_value = BinaryViewArray::from(vec![ - Some(&[1u8, 2u8, 3u8][..]), // row 0 is shredded - None, // row 1 is null - None, // row 2 is a string - Some(&[4u8, 5u8, 6u8][..]), // row 3 is shredded - ]); - - let struct_array = StructArrayBuilder::new() - .with_field("metadata", Arc::new(metadata), false) - .with_field("typed_value", Arc::new(typed_value), true) - .with_field("value", Arc::new(values), true) - .with_nulls(nulls) - .build(); - - Arc::new(struct_array) - } + ]) + }); /// Return a VariantArray that represents an "all null" variant /// for the following example (3 null values): diff --git a/parquet/tests/variant_integration.rs b/parquet/tests/variant_integration.rs index 9f202f4db803..a933a3faa1d4 100644 --- a/parquet/tests/variant_integration.rs +++ b/parquet/tests/variant_integration.rs @@ -91,11 +91,10 @@ variant_test_case!(16); variant_test_case!(17); variant_test_case!(18); variant_test_case!(19); -// https://github.com/apache/arrow-rs/issues/8331 -variant_test_case!(20, "Unsupported typed_value type: Timestamp(µs, \"UTC\")"); -variant_test_case!(21, "Unsupported typed_value type: Timestamp(µs, \"UTC\")"); -variant_test_case!(22, "Unsupported typed_value type: Timestamp(µs)"); -variant_test_case!(23, "Unsupported typed_value type: Timestamp(µs)"); +variant_test_case!(20); +variant_test_case!(21); +variant_test_case!(22); +variant_test_case!(23); // https://github.com/apache/arrow-rs/issues/8332 variant_test_case!(24, "Unsupported typed_value type: Decimal128(9, 4)"); variant_test_case!(25, "Unsupported typed_value type: Decimal128(9, 4)"); @@ -107,11 +106,10 @@ variant_test_case!(30); variant_test_case!(31); // https://github.com/apache/arrow-rs/issues/8334 variant_test_case!(32, "Unsupported typed_value type: Time64(µs)"); -// https://github.com/apache/arrow-rs/issues/8331 -variant_test_case!(33, "Unsupported typed_value type: Timestamp(ns, \"UTC\")"); -variant_test_case!(34, "Unsupported typed_value type: Timestamp(ns, \"UTC\")"); -variant_test_case!(35, "Unsupported typed_value type: Timestamp(ns)"); -variant_test_case!(36, "Unsupported typed_value type: Timestamp(ns)"); +variant_test_case!(33); +variant_test_case!(34); +variant_test_case!(35); +variant_test_case!(36); variant_test_case!(37); // https://github.com/apache/arrow-rs/issues/8336 variant_test_case!(38, "Unsupported typed_value type: Struct(");