From 569a8471c7bbd0867b3959dec17885ca1a1c7c78 Mon Sep 17 00:00:00 2001 From: "Konstantin.Tarasov" Date: Mon, 28 Jul 2025 10:53:53 -0400 Subject: [PATCH 1/9] [Variant] impl FromIterator for VariantPath --- parquet-variant/src/path.rs | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/parquet-variant/src/path.rs b/parquet-variant/src/path.rs index 42dbdb3abc2d..6b058aa8a3ee 100644 --- a/parquet-variant/src/path.rs +++ b/parquet-variant/src/path.rs @@ -43,10 +43,10 @@ use std::{borrow::Cow, ops::Deref}; /// // access the field "foo" and then the first element in a variant list value /// let path = VariantPath::from("foo").join(0); /// // this is the same as the previous one -/// let path2 = VariantPath::new(vec!["foo".into(), 0.into()]); +/// let path2 = VariantPath::from_iter(["foo".into(), 0.into()]); /// assert_eq!(path, path2); /// // you can also create a path from a vector of `VariantPathElement` directly -/// let path3 = VariantPath::new(vec![ +/// let path3 = VariantPath::from_iter([ /// VariantPathElement::field("foo"), /// VariantPathElement::index(0) /// ]); @@ -109,6 +109,13 @@ impl<'a> From for VariantPath<'a> { } } +/// Create from iter +impl<'a> FromIterator> for VariantPath<'a> { + fn from_iter>>(iter: T) -> Self { + VariantPath::new(Vec::from_iter(iter)) + } +} + impl<'a> Deref for VariantPath<'a> { type Target = [VariantPathElement<'a>]; From 70c6f1f397155c61fd7c6f03e9c8978317f2c43a Mon Sep 17 00:00:00 2001 From: "Konstantin.Tarasov" Date: Thu, 7 Aug 2025 14:08:02 -0400 Subject: [PATCH 2/9] Implement `DataType::Boolean` support for `cast_to_variant` --- .../src/cast_to_variant.rs | 49 ++++++++++++++++--- 1 file changed, 41 insertions(+), 8 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 446baf30384c..adced93f2740 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -41,9 +41,9 @@ macro_rules! primitive_conversion { } /// Convert the input array to a `VariantArray` row by row, using `method` -/// to downcast the generic array to a specific array type and `cast_fn` -/// to transform each element to a type compatible with Variant -macro_rules! cast_conversion { +/// requiring a generic type to downcast the generic array to a specific +/// array type and `cast_fn` to transform each element to a type compatible with Variant +macro_rules! generic_conversion { ($t:ty, $method:ident, $cast_fn:expr, $input:expr, $builder:expr) => {{ let array = $input.$method::<$t>(); for i in 0..array.len() { @@ -57,6 +57,23 @@ macro_rules! cast_conversion { }}; } +/// Convert the input array to a `VariantArray` row by row, using `method` +/// not requiring a generic type to downcast the generic array to a specific +/// array type and `cast_fn` to transform each element to a type compatible with Variant +macro_rules! non_generic_conversion { + ($method:ident, $cast_fn:expr, $input:expr, $builder:expr) => {{ + let array = $input.$method(); + for i in 0..array.len() { + if array.is_null(i) { + $builder.append_null(); + continue; + } + let cast_value = $cast_fn(array.value(i)); + $builder.append_variant(Variant::from(cast_value)); + } + }}; +} + /// Casts a typed arrow [`Array`] to a [`VariantArray`]. This is useful when you /// need to convert a specific data type /// @@ -86,14 +103,18 @@ pub fn cast_to_variant(input: &dyn Array) -> Result { let input_type = input.data_type(); // todo: handle other types like Boolean, Strings, Date, Timestamp, etc. match input_type { + DataType::Boolean => { + non_generic_conversion!(as_boolean, |v| v, input, builder); + } + DataType::Binary => { - cast_conversion!(BinaryType, as_bytes, |v| v, input, builder); + generic_conversion!(BinaryType, as_bytes, |v| v, input, builder); } DataType::LargeBinary => { - cast_conversion!(LargeBinaryType, as_bytes, |v| v, input, builder); + generic_conversion!(LargeBinaryType, as_bytes, |v| v, input, builder); } DataType::BinaryView => { - cast_conversion!(BinaryViewType, as_byte_view, |v| v, input, builder); + generic_conversion!(BinaryViewType, as_byte_view, |v| v, input, builder); } DataType::Int8 => { primitive_conversion!(Int8Type, input, builder); @@ -120,7 +141,7 @@ pub fn cast_to_variant(input: &dyn Array) -> Result { primitive_conversion!(UInt64Type, input, builder); } DataType::Float16 => { - cast_conversion!( + generic_conversion!( Float16Type, as_primitive, |v: f16| -> f32 { v.into() }, @@ -151,7 +172,7 @@ pub fn cast_to_variant(input: &dyn Array) -> Result { mod tests { use super::*; use arrow::array::{ - ArrayRef, Float16Array, Float32Array, Float64Array, GenericByteBuilder, + ArrayRef, BooleanArray, Float16Array, Float32Array, Float64Array, GenericByteBuilder, GenericByteViewBuilder, Int16Array, Int32Array, Int64Array, Int8Array, UInt16Array, UInt32Array, UInt64Array, UInt8Array, }; @@ -212,6 +233,18 @@ mod tests { ); } + #[test] + fn test_cast_to_variant_bool() { + run_test( + Arc::new(BooleanArray::from(vec![Some(true), None, Some(false)])), + vec![ + Some(Variant::BooleanTrue), + None, + Some(Variant::BooleanFalse), + ], + ); + } + #[test] fn test_cast_to_variant_int8() { run_test( From 352d52ff20eef320c3c89f0870bedd5afb735c48 Mon Sep 17 00:00:00 2001 From: "Konstantin.Tarasov" Date: Thu, 14 Aug 2025 10:38:29 -0400 Subject: [PATCH 3/9] Cargo fmt + trying to add List/LargeList support --- .../src/cast_to_variant.rs | 33 +++++++++++++++---- 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index adced93f2740..58adf3a578e7 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -41,7 +41,7 @@ macro_rules! primitive_conversion { } /// Convert the input array to a `VariantArray` row by row, using `method` -/// requiring a generic type to downcast the generic array to a specific +/// requiring a generic type to downcast the generic array to a specific /// array type and `cast_fn` to transform each element to a type compatible with Variant macro_rules! generic_conversion { ($t:ty, $method:ident, $cast_fn:expr, $input:expr, $builder:expr) => {{ @@ -58,7 +58,7 @@ macro_rules! generic_conversion { } /// Convert the input array to a `VariantArray` row by row, using `method` -/// not requiring a generic type to downcast the generic array to a specific +/// not requiring a generic type to downcast the generic array to a specific /// array type and `cast_fn` to transform each element to a type compatible with Variant macro_rules! non_generic_conversion { ($method:ident, $cast_fn:expr, $input:expr, $builder:expr) => {{ @@ -106,7 +106,6 @@ pub fn cast_to_variant(input: &dyn Array) -> Result { DataType::Boolean => { non_generic_conversion!(as_boolean, |v| v, input, builder); } - DataType::Binary => { generic_conversion!(BinaryType, as_bytes, |v| v, input, builder); } @@ -155,6 +154,12 @@ pub fn cast_to_variant(input: &dyn Array) -> Result { DataType::Float64 => { primitive_conversion!(Float64Type, input, builder); } + DataType::List(_) => { + generic_conversion!(i32, as_list, |v| v, input, builder); + } + DataType::LargeList(_) => { + generic_conversion!(i64, as_list, |v| v, input, builder); + } dt => { return Err(ArrowError::CastError(format!( "Unsupported data type for casting to Variant: {dt:?}", @@ -173,12 +178,28 @@ mod tests { use super::*; use arrow::array::{ ArrayRef, BooleanArray, Float16Array, Float32Array, Float64Array, GenericByteBuilder, - GenericByteViewBuilder, Int16Array, Int32Array, Int64Array, Int8Array, UInt16Array, - UInt32Array, UInt64Array, UInt8Array, + GenericByteViewBuilder, Int16Array, Int32Array, Int64Array, Int8Array, ListArray, + UInt16Array, UInt32Array, UInt64Array, UInt8Array, }; - use parquet_variant::{Variant, VariantDecimal16}; + use arrow_schema::Field; + use parquet_variant::{Variant, VariantDecimal16, VariantList}; use std::sync::Arc; + #[test] + fn test_cast_to_variant_list() { + run_test( + Arc::new(ListArray::from(vec![ + Some(Field::new_list_field(DataType::Int32, false)), + None, + Some(Field::new_list_field(DataType::Int32, true)), + ])), + vec![ + Some(Variant::List(Variant::from(vec!["foo", "bar", "baz"]))), + None, + Some(Variant::List(Variant::as_list(&'m self))), + ], + ); + } #[test] fn test_cast_to_variant_binary() { // BinaryType From cc269bde657587ad61ece04ab03f8f8f4a1c568a Mon Sep 17 00:00:00 2001 From: "Konstantin.Tarasov" Date: Mon, 18 Aug 2025 16:00:47 -0400 Subject: [PATCH 4/9] Trying to implement variant::list --- .../src/cast_to_variant.rs | 82 +++++++++++++++---- 1 file changed, 64 insertions(+), 18 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index a23566f6799e..fd71fe0d6428 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -17,15 +17,12 @@ use std::sync::Arc; -use crate::{VariantArray, VariantArrayBuilder}; +use crate::{variant_array, VariantArray, VariantArrayBuilder}; use arrow::array::{ - Array, AsArray, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, - TimestampSecondArray, + Array, AsArray, Datum, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray }; use arrow::datatypes::{ - i256, BinaryType, BinaryViewType, Decimal128Type, Decimal256Type, Decimal32Type, Decimal64Type, - Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, - LargeBinaryType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, + i256, BinaryType, BinaryViewType, Decimal128Type, Decimal256Type, Decimal32Type, Decimal64Type, Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, LargeBinaryType, UInt16Type, UInt32Type, UInt64Type, UInt8Type }; use arrow::temporal_conversions::{ timestamp_ms_to_datetime, timestamp_ns_to_datetime, timestamp_s_to_datetime, @@ -34,7 +31,7 @@ use arrow::temporal_conversions::{ use arrow_schema::{ArrowError, DataType, TimeUnit}; use chrono::{DateTime, NaiveDateTime, TimeZone, Utc}; use half::f16; -use parquet_variant::{Variant, VariantDecimal16, VariantDecimal4, VariantDecimal8}; +use parquet_variant::{Variant, VariantBuilder, VariantDecimal16, VariantDecimal4, VariantDecimal8, VariantList}; /// Convert the input array of a specific primitive type to a `VariantArray` /// row by row @@ -216,7 +213,6 @@ pub fn cast_to_variant(input: &dyn Array) -> Result { DataType::Boolean => { non_generic_conversion!(as_boolean, |v| v, input, builder); } - DataType::Binary => { generic_conversion!(BinaryType, as_bytes, |v| v, input, builder); } @@ -329,10 +325,42 @@ pub fn cast_to_variant(input: &dyn Array) -> Result { )); } DataType::List(_) => { - generic_conversion!(i32, as_list, |v| v, input, builder); + let list_array = input.as_list::(); + + // let iterators = list_array.iter().map(|inner_list| cast_to_variant(inner_list.as_ref())) + + for i in 0..list_array.len() { + if list_array.is_null(i) { + builder.append_null(); + continue; + } + // Building a VariantList to convert it to a Variant::List + let mut variant_builder = VariantBuilder::new(); + let mut list_builder = variant_builder.new_list(); + let inner_list = list_array.value(i).into_data(); + let variant = Variant::from(inner_list); + for j in inner_list. { // <- `std::sync::Arc` is not an iterator + list_builder.append_value(j); + } + + list_builder.finish(); + + let (metadata, value) = variant_builder.finish(); + let variant = Variant::new(&metadata, &value); + let variant_inner_list = variant.as_list(); + let variant_list = match variant_inner_list { + Some(value) => Variant::List(*value), + None => { + builder.append_null(); + continue; + } + }; + + builder.append_variant(variant_list); + } } DataType::LargeList(_) => { - generic_conversion!(i64, as_list, |v| v, input, builder); + // generic_conversion!(i64, as_list, |v| Variant::List(v), input, builder); } dt => { return Err(ArrowError::CastError(format!( @@ -352,7 +380,7 @@ mod tests { use super::*; use arrow::array::{ ArrayRef, BooleanArray, Decimal128Array, Decimal256Array, Decimal32Array, Decimal64Array, - FixedSizeBinaryBuilder, BooleanArray, Float16Array, Float32Array, Float64Array, GenericByteBuilder, + FixedSizeBinaryBuilder, Float16Array, Float32Array, Float64Array, GenericByteBuilder, GenericByteViewBuilder, Int16Array, Int32Array, Int64Array, Int8Array, IntervalYearMonthArray, NullArray, ListArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, }; @@ -454,16 +482,34 @@ mod tests { #[test] fn test_cast_to_variant_list() { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int32) + .len(8) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5])) + .build() + .unwrap(); + + // Construct a buffer for value offsets, for the nested array: + // [[0, 1, 2], [], [3, 4, 5]] + let value_offsets = Buffer::from_slice_ref([0, 3, 3, 6]); + + // Construct a list array from the above two + let list_data_type = + DataType::List(Arc::new(Field::new_list_field(DataType::Int32, false))); + let list_data = ArrayData::builder(list_data_type.clone()) + .len(3) + .add_buffer(value_offsets.clone()) + .add_child_data(value_data.clone()) + .build() + .unwrap(); + let list_array = ListArray::from(list_data); + run_test( - Arc::new(ListArray::from(vec![ - Some(Field::new_list_field(DataType::Int32, false)), - None, - Some(Field::new_list_field(DataType::Int32, true)), - ])), + Arc::new(list_array), vec![ - Some(Variant::List(Variant::from(vec!["foo", "bar", "baz"]))), + Some(), None, - Some(Variant::List(Variant::as_list(&'m self))), + Some(), ], ); } From 03930301489ec0d63f4d35350ed0f0f909c17b26 Mon Sep 17 00:00:00 2001 From: "Konstantin.Tarasov" Date: Wed, 20 Aug 2025 15:48:43 -0400 Subject: [PATCH 5/9] Logic figured out --- .../src/cast_to_variant.rs | 68 +++---------------- 1 file changed, 10 insertions(+), 58 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index fd71fe0d6428..5ed6825beebb 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -17,9 +17,9 @@ use std::sync::Arc; -use crate::{variant_array, VariantArray, VariantArrayBuilder}; +use crate::{VariantArray, VariantArrayBuilder}; use arrow::array::{ - Array, AsArray, Datum, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray + Array, AsArray, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray }; use arrow::datatypes::{ i256, BinaryType, BinaryViewType, Decimal128Type, Decimal256Type, Decimal32Type, Decimal64Type, Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, LargeBinaryType, UInt16Type, UInt32Type, UInt64Type, UInt8Type @@ -31,7 +31,7 @@ use arrow::temporal_conversions::{ use arrow_schema::{ArrowError, DataType, TimeUnit}; use chrono::{DateTime, NaiveDateTime, TimeZone, Utc}; use half::f16; -use parquet_variant::{Variant, VariantBuilder, VariantDecimal16, VariantDecimal4, VariantDecimal8, VariantList}; +use parquet_variant::{Variant, VariantDecimal16, VariantDecimal4, VariantDecimal8}; /// Convert the input array of a specific primitive type to a `VariantArray` /// row by row @@ -327,36 +327,18 @@ pub fn cast_to_variant(input: &dyn Array) -> Result { DataType::List(_) => { let list_array = input.as_list::(); - // let iterators = list_array.iter().map(|inner_list| cast_to_variant(inner_list.as_ref())) + let values_variant_array = cast_to_variant(list_array.values().as_ref())?; for i in 0..list_array.len() { if list_array.is_null(i) { builder.append_null(); continue; } - // Building a VariantList to convert it to a Variant::List - let mut variant_builder = VariantBuilder::new(); - let mut list_builder = variant_builder.new_list(); - let inner_list = list_array.value(i).into_data(); - let variant = Variant::from(inner_list); - for j in inner_list. { // <- `std::sync::Arc` is not an iterator - list_builder.append_value(j); - } - - list_builder.finish(); - let (metadata, value) = variant_builder.finish(); - let variant = Variant::new(&metadata, &value); - let variant_inner_list = variant.as_list(); - let variant_list = match variant_inner_list { - Some(value) => Variant::List(*value), - None => { - builder.append_null(); - continue; - } - }; + let value = values_variant_array.value(i); + let variant_list = value.as_list().expect("variant should be a list").clone(); - builder.append_variant(variant_list); + builder.append_variant(Variant::List(variant_list)); } } DataType::LargeList(_) => { @@ -382,13 +364,12 @@ mod tests { ArrayRef, BooleanArray, Decimal128Array, Decimal256Array, Decimal32Array, Decimal64Array, FixedSizeBinaryBuilder, Float16Array, Float32Array, Float64Array, GenericByteBuilder, GenericByteViewBuilder, Int16Array, Int32Array, Int64Array, Int8Array, - IntervalYearMonthArray, NullArray, ListArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, + IntervalYearMonthArray, NullArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, }; use arrow_schema::{ DECIMAL128_MAX_PRECISION, DECIMAL32_MAX_PRECISION, DECIMAL64_MAX_PRECISION, }; - use arrow_schema::Field; - use parquet_variant::{Variant, VariantDecimal16, VariantList}; + use parquet_variant::{Variant, VariantDecimal16}; use std::{sync::Arc, vec}; macro_rules! max_unscaled_value { @@ -482,36 +463,7 @@ mod tests { #[test] fn test_cast_to_variant_list() { - // Construct a value array - let value_data = ArrayData::builder(DataType::Int32) - .len(8) - .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5])) - .build() - .unwrap(); - - // Construct a buffer for value offsets, for the nested array: - // [[0, 1, 2], [], [3, 4, 5]] - let value_offsets = Buffer::from_slice_ref([0, 3, 3, 6]); - - // Construct a list array from the above two - let list_data_type = - DataType::List(Arc::new(Field::new_list_field(DataType::Int32, false))); - let list_data = ArrayData::builder(list_data_type.clone()) - .len(3) - .add_buffer(value_offsets.clone()) - .add_child_data(value_data.clone()) - .build() - .unwrap(); - let list_array = ListArray::from(list_data); - - run_test( - Arc::new(list_array), - vec![ - Some(), - None, - Some(), - ], - ); + } #[test] fn test_cast_to_variant_binary() { From b91df83b314dc0ae1538fe93d27961a0dea22693 Mon Sep 17 00:00:00 2001 From: "Konstantin.Tarasov" Date: Thu, 21 Aug 2025 12:05:09 -0400 Subject: [PATCH 6/9] Looking good --- .../src/cast_to_variant.rs | 132 +++++++++++++++--- 1 file changed, 112 insertions(+), 20 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 717ba6038090..65c042fc85d3 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -36,7 +36,7 @@ use arrow_schema::{ArrowError, DataType, TimeUnit}; use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, TimeZone, Utc}; use half::f16; use parquet_variant::{ - Variant, VariantBuilder, VariantDecimal16, VariantDecimal4, VariantDecimal8, + Variant, VariantBuilder, VariantDecimal16, VariantDecimal4, VariantDecimal8 }; /// Convert the input array of a specific primitive type to a `VariantArray` @@ -535,24 +535,76 @@ pub fn cast_to_variant(input: &dyn Array) -> Result { } DataType::List(_) => { let list_array = input.as_list::(); - let values_variant_array = cast_to_variant(list_array.values().as_ref())?; - + let offsets = list_array.offsets(); + for i in 0..list_array.len() { if list_array.is_null(i) { builder.append_null(); continue; } - - let value = values_variant_array.value(i); - let variant_list = value.as_list().expect("variant should be a list").clone(); - - builder.append_variant(Variant::List(variant_list)); + + let start = offsets[i] as usize; + let end = offsets[i + 1] as usize; + + // Start building the inner VariantList + let mut variant_builder = VariantBuilder::new(); + let mut list_builder = variant_builder.new_list(); + + // Add all values from the slice + for j in start..end { + list_builder.append_value(values_variant_array.value(j)); + } + + list_builder.finish(); + + let (metadata, value) = variant_builder.finish(); + let variant = Variant::new(&metadata, &value); + + // Now safely unwrap the VariantList + match variant.as_list() { + Some(inner_list) => builder.append_variant(Variant::List(inner_list.clone())), + None => builder.append_null(), // This should never happen, but safe fallback + } } } + DataType::LargeList(_) => { - // generic_conversion!(i64, as_list, |v| Variant::List(v), input, builder); + let large_list_array = input.as_list::(); + let values_variant_array = cast_to_variant(large_list_array.values().as_ref())?; + let offsets = large_list_array.offsets(); + + for i in 0..large_list_array.len() { + if large_list_array.is_null(i) { + builder.append_null(); + continue; + } + + let start = offsets[i] as usize; // What if the system is 32bit and offset is > usize::MAX? + let end = offsets[i + 1] as usize; + + // Start building the inner VariantList + let mut variant_builder = VariantBuilder::new(); + let mut list_builder = variant_builder.new_list(); + + // Add all values from the slice + for j in start..end { + list_builder.append_value(values_variant_array.value(j)); + } + + list_builder.finish(); + + let (metadata, value) = variant_builder.finish(); + let variant = Variant::new(&metadata, &value); + + // Now safely unwrap the VariantList + match variant.as_list() { + Some(inner_list) => builder.append_variant(Variant::List(inner_list.clone())), + None => builder.append_null(), // This should never happen, but safe fallback + } + } } + dt => { return Err(ArrowError::CastError(format!( "Unsupported data type for casting to Variant: {dt:?}", @@ -605,13 +657,7 @@ fn process_run_end_encoded( mod tests { use super::*; use arrow::array::{ - ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal128Array, - Decimal256Array, Decimal32Array, Decimal64Array, DictionaryArray, FixedSizeBinaryBuilder, - Float16Array, Float32Array, Float64Array, GenericByteBuilder, GenericByteViewBuilder, - Int16Array, Int32Array, Int64Array, Int8Array, IntervalYearMonthArray, LargeStringArray, - NullArray, StringArray, StringRunBuilder, StringViewArray, StructArray, - Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray, - UInt16Array, UInt32Array, UInt64Array, UInt8Array, + ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal128Array, Decimal256Array, Decimal32Array, Decimal64Array, DictionaryArray, FixedSizeBinaryBuilder, Float16Array, Float32Array, Float64Array, GenericByteBuilder, GenericByteViewBuilder, Int16Array, Int32Array, Int64Array, Int8Array, IntervalYearMonthArray, LargeListArray, LargeStringArray, ListArray, NullArray, StringArray, StringRunBuilder, StringViewArray, StructArray, Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array }; use arrow::buffer::NullBuffer; use arrow_schema::{Field, Fields}; @@ -716,10 +762,6 @@ mod tests { ); } - #[test] - fn test_cast_to_variant_list() { - - } #[test] fn test_cast_to_variant_binary() { // BinaryType @@ -2005,6 +2047,56 @@ mod tests { ); } + #[test] + fn test_cast_to_variant_list() { + // List Array + let data = vec![ + Some(vec![Some(0), Some(1), Some(2)]), + None, + ]; + let list_array = ListArray::from_iter_primitive::(data); + + // Expected value + let (metadata, value) = { + let mut builder = VariantBuilder::new(); + let mut list = builder.new_list(); + list.append_value(0); + list.append_value(1); + list.append_value(2); + list.finish(); + builder.finish() + }; + let variant = Variant::new(&metadata, &value); + + run_test(Arc::new(list_array), + vec![Some(variant), None]); + } + + #[test] + fn test_cast_to_variant_large_list() { + // Large List Array + let data = vec![ + Some(vec![Some(0), Some(1), Some(2)]), + None, + ]; + let large_list_array = LargeListArray::from_iter_primitive::(data); + + // Expected value + let (metadata, value) = { + let mut builder = VariantBuilder::new(); + let mut list = builder.new_list(); + list.append_value(0i64); + list.append_value(1i64); + list.append_value(2i64); + list.finish(); + builder.finish() + }; + let variant = Variant::new(&metadata, &value); + + run_test(Arc::new(large_list_array), + vec![Some(variant), None]); + } + /// Converts the given `Array` to a `VariantArray` and tests the conversion /// against the expected values. It also tests the handling of nulls by /// setting one element to null and verifying the output. From 3dee2c300c30497d68f9239982116bce8e7660e0 Mon Sep 17 00:00:00 2001 From: "Konstantin.Tarasov" Date: Thu, 21 Aug 2025 12:12:17 -0400 Subject: [PATCH 7/9] Update final append logic --- parquet-variant-compute/src/cast_to_variant.rs | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 65c042fc85d3..fbf48082eba4 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -560,12 +560,8 @@ pub fn cast_to_variant(input: &dyn Array) -> Result { let (metadata, value) = variant_builder.finish(); let variant = Variant::new(&metadata, &value); - - // Now safely unwrap the VariantList - match variant.as_list() { - Some(inner_list) => builder.append_variant(Variant::List(inner_list.clone())), - None => builder.append_null(), // This should never happen, but safe fallback - } + let variant_list = variant.as_list().expect("Variant should be list"); + builder.append_variant(Variant::List(variant_list.clone())) } } @@ -596,12 +592,8 @@ pub fn cast_to_variant(input: &dyn Array) -> Result { let (metadata, value) = variant_builder.finish(); let variant = Variant::new(&metadata, &value); - - // Now safely unwrap the VariantList - match variant.as_list() { - Some(inner_list) => builder.append_variant(Variant::List(inner_list.clone())), - None => builder.append_null(), // This should never happen, but safe fallback - } + let variant_list = variant.as_list().expect("Variant should be list"); + builder.append_variant(Variant::List(variant_list.clone())) } } From 60962eea32b8e75d1e167d2bc72b6c9e62722bba Mon Sep 17 00:00:00 2001 From: "Konstantin.Tarasov" Date: Thu, 21 Aug 2025 12:19:19 -0400 Subject: [PATCH 8/9] cargo fmt --- .../src/cast_to_variant.rs | 83 +++++++++---------- 1 file changed, 41 insertions(+), 42 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index fbf48082eba4..0a5c511ec6a6 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -19,7 +19,8 @@ use std::sync::Arc; use crate::{VariantArray, VariantArrayBuilder}; use arrow::array::{ - Array, AsArray, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray + Array, AsArray, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, + TimestampSecondArray, }; use arrow::datatypes::{ i256, ArrowNativeType, BinaryType, BinaryViewType, Date32Type, Date64Type, Decimal128Type, @@ -36,7 +37,7 @@ use arrow_schema::{ArrowError, DataType, TimeUnit}; use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, TimeZone, Utc}; use half::f16; use parquet_variant::{ - Variant, VariantBuilder, VariantDecimal16, VariantDecimal4, VariantDecimal8 + Variant, VariantBuilder, VariantDecimal16, VariantDecimal4, VariantDecimal8, }; /// Convert the input array of a specific primitive type to a `VariantArray` @@ -537,59 +538,59 @@ pub fn cast_to_variant(input: &dyn Array) -> Result { let list_array = input.as_list::(); let values_variant_array = cast_to_variant(list_array.values().as_ref())?; let offsets = list_array.offsets(); - + for i in 0..list_array.len() { if list_array.is_null(i) { builder.append_null(); continue; } - + let start = offsets[i] as usize; let end = offsets[i + 1] as usize; - + // Start building the inner VariantList let mut variant_builder = VariantBuilder::new(); let mut list_builder = variant_builder.new_list(); - + // Add all values from the slice for j in start..end { list_builder.append_value(values_variant_array.value(j)); } - + list_builder.finish(); - + let (metadata, value) = variant_builder.finish(); let variant = Variant::new(&metadata, &value); let variant_list = variant.as_list().expect("Variant should be list"); builder.append_variant(Variant::List(variant_list.clone())) } } - + DataType::LargeList(_) => { let large_list_array = input.as_list::(); let values_variant_array = cast_to_variant(large_list_array.values().as_ref())?; let offsets = large_list_array.offsets(); - + for i in 0..large_list_array.len() { if large_list_array.is_null(i) { builder.append_null(); continue; } - + let start = offsets[i] as usize; // What if the system is 32bit and offset is > usize::MAX? let end = offsets[i + 1] as usize; - + // Start building the inner VariantList let mut variant_builder = VariantBuilder::new(); let mut list_builder = variant_builder.new_list(); - + // Add all values from the slice for j in start..end { list_builder.append_value(values_variant_array.value(j)); } - + list_builder.finish(); - + let (metadata, value) = variant_builder.finish(); let variant = Variant::new(&metadata, &value); let variant_list = variant.as_list().expect("Variant should be list"); @@ -649,7 +650,13 @@ fn process_run_end_encoded( mod tests { use super::*; use arrow::array::{ - ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal128Array, Decimal256Array, Decimal32Array, Decimal64Array, DictionaryArray, FixedSizeBinaryBuilder, Float16Array, Float32Array, Float64Array, GenericByteBuilder, GenericByteViewBuilder, Int16Array, Int32Array, Int64Array, Int8Array, IntervalYearMonthArray, LargeListArray, LargeStringArray, ListArray, NullArray, StringArray, StringRunBuilder, StringViewArray, StructArray, Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array + ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal128Array, + Decimal256Array, Decimal32Array, Decimal64Array, DictionaryArray, FixedSizeBinaryBuilder, + Float16Array, Float32Array, Float64Array, GenericByteBuilder, GenericByteViewBuilder, + Int16Array, Int32Array, Int64Array, Int8Array, IntervalYearMonthArray, LargeListArray, + LargeStringArray, ListArray, NullArray, StringArray, StringRunBuilder, StringViewArray, + StructArray, Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, + Time64NanosecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, }; use arrow::buffer::NullBuffer; use arrow_schema::{Field, Fields}; @@ -2042,51 +2049,43 @@ mod tests { #[test] fn test_cast_to_variant_list() { // List Array - let data = vec![ - Some(vec![Some(0), Some(1), Some(2)]), - None, - ]; + let data = vec![Some(vec![Some(0), Some(1), Some(2)]), None]; let list_array = ListArray::from_iter_primitive::(data); // Expected value let (metadata, value) = { - let mut builder = VariantBuilder::new(); - let mut list = builder.new_list(); - list.append_value(0); - list.append_value(1); - list.append_value(2); - list.finish(); - builder.finish() + let mut builder = VariantBuilder::new(); + let mut list = builder.new_list(); + list.append_value(0); + list.append_value(1); + list.append_value(2); + list.finish(); + builder.finish() }; let variant = Variant::new(&metadata, &value); - run_test(Arc::new(list_array), - vec![Some(variant), None]); + run_test(Arc::new(list_array), vec![Some(variant), None]); } #[test] fn test_cast_to_variant_large_list() { // Large List Array - let data = vec![ - Some(vec![Some(0), Some(1), Some(2)]), - None, - ]; + let data = vec![Some(vec![Some(0), Some(1), Some(2)]), None]; let large_list_array = LargeListArray::from_iter_primitive::(data); // Expected value let (metadata, value) = { - let mut builder = VariantBuilder::new(); - let mut list = builder.new_list(); - list.append_value(0i64); - list.append_value(1i64); - list.append_value(2i64); - list.finish(); - builder.finish() + let mut builder = VariantBuilder::new(); + let mut list = builder.new_list(); + list.append_value(0i64); + list.append_value(1i64); + list.append_value(2i64); + list.finish(); + builder.finish() }; let variant = Variant::new(&metadata, &value); - run_test(Arc::new(large_list_array), - vec![Some(variant), None]); + run_test(Arc::new(large_list_array), vec![Some(variant), None]); } /// Converts the given `Array` to a `VariantArray` and tests the conversion From 9a5676e3a38c05e85e25f084e359c8bc8646f9f8 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Fri, 22 Aug 2025 16:40:36 -0400 Subject: [PATCH 9/9] Cast_to_variant optimization --- .../src/cast_to_variant.rs | 84 +++++++++++++++++-- 1 file changed, 78 insertions(+), 6 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 0a5c511ec6a6..3999af668e33 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -22,6 +22,7 @@ use arrow::array::{ Array, AsArray, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, }; +use arrow::buffer::{OffsetBuffer, ScalarBuffer}; use arrow::datatypes::{ i256, ArrowNativeType, BinaryType, BinaryViewType, Date32Type, Date64Type, Decimal128Type, Decimal256Type, Decimal32Type, Decimal64Type, Float16Type, Float32Type, Float64Type, Int16Type, @@ -536,17 +537,26 @@ pub fn cast_to_variant(input: &dyn Array) -> Result { } DataType::List(_) => { let list_array = input.as_list::(); - let values_variant_array = cast_to_variant(list_array.values().as_ref())?; + let values = list_array.values(); let offsets = list_array.offsets(); + let first_offset = offsets.first().expect("There should be an offset"); + let length = offsets.last().expect("There should be an offset") - first_offset; + let sliced_values = values.slice(*first_offset as usize, length as usize); + + let values_variant_array = cast_to_variant(sliced_values.as_ref())?; + let new_offsets = OffsetBuffer::new(ScalarBuffer::from_iter( + offsets.iter().map(|o| o - first_offset), + )); + for i in 0..list_array.len() { if list_array.is_null(i) { builder.append_null(); continue; } - let start = offsets[i] as usize; - let end = offsets[i + 1] as usize; + let start = new_offsets[i] as usize; + let end = new_offsets[i + 1] as usize; // Start building the inner VariantList let mut variant_builder = VariantBuilder::new(); @@ -568,17 +578,26 @@ pub fn cast_to_variant(input: &dyn Array) -> Result { DataType::LargeList(_) => { let large_list_array = input.as_list::(); - let values_variant_array = cast_to_variant(large_list_array.values().as_ref())?; + let values = large_list_array.values(); let offsets = large_list_array.offsets(); + let first_offset = offsets.first().expect("There should be an offset"); + let length = offsets.last().expect("There should be an offset") - first_offset; + let sliced_values = values.slice(*first_offset as usize, length as usize); + + let values_variant_array = cast_to_variant(sliced_values.as_ref())?; + let new_offsets = OffsetBuffer::new(ScalarBuffer::from_iter( + offsets.iter().map(|o| o - first_offset), + )); + for i in 0..large_list_array.len() { if large_list_array.is_null(i) { builder.append_null(); continue; } - let start = offsets[i] as usize; // What if the system is 32bit and offset is > usize::MAX? - let end = offsets[i + 1] as usize; + let start = new_offsets[i] as usize; // What if the system is 32bit and offset is > usize::MAX? + let end = new_offsets[i + 1] as usize; // Start building the inner VariantList let mut variant_builder = VariantBuilder::new(); @@ -2067,6 +2086,31 @@ mod tests { run_test(Arc::new(list_array), vec![Some(variant), None]); } + #[test] + fn test_cast_to_variant_sliced_list() { + // List Array + let data = vec![ + Some(vec![Some(0), Some(1), Some(2)]), + Some(vec![Some(3), Some(4), Some(5)]), + None, + ]; + let list_array = ListArray::from_iter_primitive::(data); + + // Expected value + let (metadata, value) = { + let mut builder = VariantBuilder::new(); + let mut list = builder.new_list(); + list.append_value(3); + list.append_value(4); + list.append_value(5); + list.finish(); + builder.finish() + }; + let variant = Variant::new(&metadata, &value); + + run_test(Arc::new(list_array.slice(1, 2)), vec![Some(variant), None]); + } + #[test] fn test_cast_to_variant_large_list() { // Large List Array @@ -2088,6 +2132,34 @@ mod tests { run_test(Arc::new(large_list_array), vec![Some(variant), None]); } + #[test] + fn test_cast_to_variant_sliced_large_list() { + // List Array + let data = vec![ + Some(vec![Some(0), Some(1), Some(2)]), + Some(vec![Some(3), Some(4), Some(5)]), + None, + ]; + let large_list_array = ListArray::from_iter_primitive::(data); + + // Expected value + let (metadata, value) = { + let mut builder = VariantBuilder::new(); + let mut list = builder.new_list(); + list.append_value(3i64); + list.append_value(4i64); + list.append_value(5i64); + list.finish(); + builder.finish() + }; + let variant = Variant::new(&metadata, &value); + + run_test( + Arc::new(large_list_array.slice(1, 2)), + vec![Some(variant), None], + ); + } + /// Converts the given `Array` to a `VariantArray` and tests the conversion /// against the expected values. It also tests the handling of nulls by /// setting one element to null and verifying the output.