From 80bf5cd217696f191eed019963903817f2024806 Mon Sep 17 00:00:00 2001 From: SuperSerious Dev Date: Wed, 6 Aug 2025 20:36:10 -0700 Subject: [PATCH 1/2] Implement `DataType::{Binary, LargeBinary, BinaryView}` => `Variant::Binary` --- .../src/cast_to_variant.rs | 106 ++++++++++++++---- 1 file changed, 86 insertions(+), 20 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index cbd16c589c61..c85ad4251877 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -18,24 +18,26 @@ use crate::{VariantArray, VariantArrayBuilder}; use arrow::array::{Array, AsArray}; use arrow::datatypes::{ - Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, - UInt32Type, UInt64Type, UInt8Type, + BinaryType, BinaryViewType, Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, + Int64Type, Int8Type, LargeBinaryType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, }; use arrow_schema::{ArrowError, DataType}; use half::f16; use parquet_variant::Variant; -/// Convert the input array of a specific primitive type to a `VariantArray` -/// row by row -macro_rules! primitive_conversion { - ($t:ty, $input:expr, $builder:expr) => {{ - let array = $input.as_primitive::<$t>(); +/// Convert the input array to a `VariantArray` row by row, using `method` +/// to downcast the generic array to a specific array type and `cast_fn` +/// to transform each element to a type compatible with Variant +macro_rules! conversion { + ($t:ty, $method:ident, $cast_fn:expr, $input:expr, $builder:expr) => {{ + let array = $input.$method::<$t>(); for i in 0..array.len() { if array.is_null(i) { $builder.append_null(); continue; } - $builder.append_variant(Variant::from(array.value(i))); + let cast_value = $cast_fn(array.value(i)); + $builder.append_variant(Variant::from(cast_value)); } }}; } @@ -85,38 +87,47 @@ pub fn cast_to_variant(input: &dyn Array) -> Result { let input_type = input.data_type(); // todo: handle other types like Boolean, Strings, Date, Timestamp, etc. match input_type { + DataType::Binary => { + conversion!(BinaryType, as_bytes, |v| v, input, builder); + } + DataType::LargeBinary => { + conversion!(LargeBinaryType, as_bytes, |v| v, input, builder); + } + DataType::BinaryView => { + conversion!(BinaryViewType, as_byte_view, |v| v, input, builder); + } DataType::Int8 => { - primitive_conversion!(Int8Type, input, builder); + conversion!(Int8Type, as_primitive, |v| v, input, builder); } DataType::Int16 => { - primitive_conversion!(Int16Type, input, builder); + conversion!(Int16Type, as_primitive, |v| v, input, builder); } DataType::Int32 => { - primitive_conversion!(Int32Type, input, builder); + conversion!(Int32Type, as_primitive, |v| v, input, builder); } DataType::Int64 => { - primitive_conversion!(Int64Type, input, builder); + conversion!(Int64Type, as_primitive, |v| v, input, builder); } DataType::UInt8 => { - primitive_conversion!(UInt8Type, input, builder); + conversion!(UInt8Type, as_primitive, |v| v, input, builder); } DataType::UInt16 => { - primitive_conversion!(UInt16Type, input, builder); + conversion!(UInt16Type, as_primitive, |v| v, input, builder); } DataType::UInt32 => { - primitive_conversion!(UInt32Type, input, builder); + conversion!(UInt32Type, as_primitive, |v| v, input, builder); } DataType::UInt64 => { - primitive_conversion!(UInt64Type, input, builder); + conversion!(UInt64Type, as_primitive, |v| v, input, builder); } DataType::Float16 => { cast_conversion!(Float16Type, |v: f16| -> f32 { v.into() }, input, builder); } DataType::Float32 => { - primitive_conversion!(Float32Type, input, builder); + conversion!(Float32Type, as_primitive, |v| v, input, builder); } DataType::Float64 => { - primitive_conversion!(Float64Type, input, builder); + conversion!(Float64Type, as_primitive, |v| v, input, builder); } dt => { return Err(ArrowError::CastError(format!( @@ -135,12 +146,67 @@ pub fn cast_to_variant(input: &dyn Array) -> Result { mod tests { use super::*; use arrow::array::{ - ArrayRef, Float16Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, - Int8Array, UInt16Array, UInt32Array, UInt64Array, UInt8Array, + ArrayRef, Float16Array, Float32Array, Float64Array, GenericByteBuilder, + GenericByteViewBuilder, Int16Array, Int32Array, Int64Array, Int8Array, UInt16Array, + UInt32Array, UInt64Array, UInt8Array, }; use parquet_variant::{Variant, VariantDecimal16}; use std::sync::Arc; + #[test] + fn test_cast_to_variant_binary() { + // BinaryType + let mut builder = GenericByteBuilder::::new(); + builder.append_value(b"hello"); + builder.append_value(b""); + builder.append_null(); + builder.append_value(b"world"); + let binary_array = builder.finish(); + run_test( + Arc::new(binary_array), + vec![ + Some(Variant::Binary(b"hello")), + Some(Variant::Binary(b"")), + None, + Some(Variant::Binary(b"world")), + ], + ); + + // LargeBinaryType + let mut builder = GenericByteBuilder::::new(); + builder.append_value(b"hello"); + builder.append_value(b""); + builder.append_null(); + builder.append_value(b"world"); + let large_binary_array = builder.finish(); + run_test( + Arc::new(large_binary_array), + vec![ + Some(Variant::Binary(b"hello")), + Some(Variant::Binary(b"")), + None, + Some(Variant::Binary(b"world")), + ], + ); + + // BinaryViewType + let mut builder = GenericByteViewBuilder::::new(); + builder.append_value(b"hello"); + builder.append_value(b""); + builder.append_null(); + builder.append_value(b"world"); + let byte_view_array = builder.finish(); + run_test( + Arc::new(byte_view_array), + vec![ + Some(Variant::Binary(b"hello")), + Some(Variant::Binary(b"")), + None, + Some(Variant::Binary(b"world")), + ], + ); + } + #[test] fn test_cast_to_variant_int8() { run_test( From 809d7ae2c7e68368f24231672b7d564114bdd010 Mon Sep 17 00:00:00 2001 From: SuperSerious Dev Date: Wed, 6 Aug 2025 20:36:10 -0700 Subject: [PATCH 2/2] Reduce scope of macro changes --- .../src/cast_to_variant.rs | 57 ++++++++++--------- 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index c85ad4251877..446baf30384c 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -25,28 +25,27 @@ use arrow_schema::{ArrowError, DataType}; use half::f16; use parquet_variant::Variant; -/// Convert the input array to a `VariantArray` row by row, using `method` -/// to downcast the generic array to a specific array type and `cast_fn` -/// to transform each element to a type compatible with Variant -macro_rules! conversion { - ($t:ty, $method:ident, $cast_fn:expr, $input:expr, $builder:expr) => {{ - let array = $input.$method::<$t>(); +/// Convert the input array of a specific primitive type to a `VariantArray` +/// row by row +macro_rules! primitive_conversion { + ($t:ty, $input:expr, $builder:expr) => {{ + let array = $input.as_primitive::<$t>(); for i in 0..array.len() { if array.is_null(i) { $builder.append_null(); continue; } - let cast_value = $cast_fn(array.value(i)); - $builder.append_variant(Variant::from(cast_value)); + $builder.append_variant(Variant::from(array.value(i))); } }}; } -/// Convert the input array to a `VariantArray` row by row, -/// transforming each element with `cast_fn` +/// Convert the input array to a `VariantArray` row by row, using `method` +/// to downcast the generic array to a specific array type and `cast_fn` +/// to transform each element to a type compatible with Variant macro_rules! cast_conversion { - ($t:ty, $cast_fn:expr, $input:expr, $builder:expr) => {{ - let array = $input.as_primitive::<$t>(); + ($t:ty, $method:ident, $cast_fn:expr, $input:expr, $builder:expr) => {{ + let array = $input.$method::<$t>(); for i in 0..array.len() { if array.is_null(i) { $builder.append_null(); @@ -88,46 +87,52 @@ pub fn cast_to_variant(input: &dyn Array) -> Result { // todo: handle other types like Boolean, Strings, Date, Timestamp, etc. match input_type { DataType::Binary => { - conversion!(BinaryType, as_bytes, |v| v, input, builder); + cast_conversion!(BinaryType, as_bytes, |v| v, input, builder); } DataType::LargeBinary => { - conversion!(LargeBinaryType, as_bytes, |v| v, input, builder); + cast_conversion!(LargeBinaryType, as_bytes, |v| v, input, builder); } DataType::BinaryView => { - conversion!(BinaryViewType, as_byte_view, |v| v, input, builder); + cast_conversion!(BinaryViewType, as_byte_view, |v| v, input, builder); } DataType::Int8 => { - conversion!(Int8Type, as_primitive, |v| v, input, builder); + primitive_conversion!(Int8Type, input, builder); } DataType::Int16 => { - conversion!(Int16Type, as_primitive, |v| v, input, builder); + primitive_conversion!(Int16Type, input, builder); } DataType::Int32 => { - conversion!(Int32Type, as_primitive, |v| v, input, builder); + primitive_conversion!(Int32Type, input, builder); } DataType::Int64 => { - conversion!(Int64Type, as_primitive, |v| v, input, builder); + primitive_conversion!(Int64Type, input, builder); } DataType::UInt8 => { - conversion!(UInt8Type, as_primitive, |v| v, input, builder); + primitive_conversion!(UInt8Type, input, builder); } DataType::UInt16 => { - conversion!(UInt16Type, as_primitive, |v| v, input, builder); + primitive_conversion!(UInt16Type, input, builder); } DataType::UInt32 => { - conversion!(UInt32Type, as_primitive, |v| v, input, builder); + primitive_conversion!(UInt32Type, input, builder); } DataType::UInt64 => { - conversion!(UInt64Type, as_primitive, |v| v, input, builder); + primitive_conversion!(UInt64Type, input, builder); } DataType::Float16 => { - cast_conversion!(Float16Type, |v: f16| -> f32 { v.into() }, input, builder); + cast_conversion!( + Float16Type, + as_primitive, + |v: f16| -> f32 { v.into() }, + input, + builder + ); } DataType::Float32 => { - conversion!(Float32Type, as_primitive, |v| v, input, builder); + primitive_conversion!(Float32Type, input, builder); } DataType::Float64 => { - conversion!(Float64Type, as_primitive, |v| v, input, builder); + primitive_conversion!(Float64Type, input, builder); } dt => { return Err(ArrowError::CastError(format!(