From 3f991b34e90577bd6b01278d5cdcf6684d3b5f4d Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Thu, 7 Aug 2025 15:16:59 -0400 Subject: [PATCH 1/5] [ADD] add support for DataType::Utf8/LargeUtf8/Utf8View for cast_to_varoant --- .../src/cast_to_variant.rs | 168 +++++++++++++++++- 1 file changed, 166 insertions(+), 2 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 9c36ed19f0ab..a26f97a29e5d 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -211,7 +211,7 @@ pub fn cast_to_variant(input: &dyn Array) -> Result { let mut builder = VariantArrayBuilder::new(input.len()); let input_type = input.data_type(); - // todo: handle other types like Boolean, Strings, Date, Timestamp, etc. + // todo: handle other types like Boolean, Date, Timestamp, etc. match input_type { DataType::Boolean => { non_generic_conversion!(as_boolean, |v| v, input, builder); @@ -328,6 +328,39 @@ pub fn cast_to_variant(input: &dyn Array) -> Result { .to_string(), )); } + DataType::Utf8 => { + let array = input.as_string::(); + for i in 0..array.len() { + if array.is_null(i) { + builder.append_null(); + continue; + } + let cast_value = array.value(i); + builder.append_variant(Variant::from(cast_value)); + } + } + DataType::LargeUtf8 => { + let array = input.as_string::(); + for i in 0..array.len() { + if array.is_null(i) { + builder.append_null(); + continue; + } + let cast_value = array.value(i); + builder.append_variant(Variant::from(cast_value)); + } + } + DataType::Utf8View => { + let array = input.as_string_view(); + for i in 0..array.len() { + if array.is_null(i) { + builder.append_null(); + continue; + } + let cast_value = array.value(i); + builder.append_variant(Variant::from(cast_value)); + } + } dt => { return Err(ArrowError::CastError(format!( "Unsupported data type for casting to Variant: {dt:?}", @@ -347,7 +380,7 @@ mod tests { use arrow::array::{ ArrayRef, BooleanArray, Decimal128Array, Decimal256Array, Decimal32Array, Decimal64Array, FixedSizeBinaryBuilder, Float16Array, Float32Array, Float64Array, GenericByteBuilder, - GenericByteViewBuilder, Int16Array, Int32Array, Int64Array, Int8Array, + GenericByteViewBuilder, Int16Array, Int32Array, Int64Array, Int8Array, LargeStringBuilder, StringBuilder, StringViewBuilder, IntervalYearMonthArray, NullArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, }; use arrow_schema::{ @@ -1152,6 +1185,137 @@ mod tests { ) } + fn test_cast_to_variant_utf8() { + // Test with short strings (should become ShortString variants) + let short_strings = vec![ + Some("hello"), + Some(""), + None, + Some("world"), + Some("test"), + ]; + let mut string_builder = StringBuilder::new(); + for s in short_strings.iter() { + match s { + Some(value) => string_builder.append_value(value), + None => string_builder.append_null(), + } + } + let string_array = string_builder.finish(); + + run_test( + Arc::new(string_array), + vec![ + Some(Variant::from("hello")), + Some(Variant::from("")), + None, + Some(Variant::from("world")), + Some(Variant::from("test")), + ], + ); + + // Test with a long string (should become String variant) + let long_string = "a".repeat(100); // > 63 bytes, so will be Variant::String + let mut string_builder = StringBuilder::new(); + string_builder.append_value(&long_string); + string_builder.append_null(); + string_builder.append_value("short"); + let string_array = string_builder.finish(); + + run_test( + Arc::new(string_array), + vec![ + Some(Variant::from(long_string.as_str())), + None, + Some(Variant::from("short")), + ], + ); + } + + #[test] + fn test_cast_to_variant_large_utf8() { + // Test with short strings (should become ShortString variants) + let short_strings = vec![ + Some("hello"), + Some(""), + None, + Some("world"), + ]; + let mut string_builder = LargeStringBuilder::new(); + for s in short_strings.iter() { + match s { + Some(value) => string_builder.append_value(value), + None => string_builder.append_null(), + } + } + let string_array = string_builder.finish(); + + run_test( + Arc::new(string_array), + vec![ + Some(Variant::from("hello")), + Some(Variant::from("")), + None, + Some(Variant::from("world")), + ], + ); + + // Test with a long string (should become String variant) + let long_string = "b".repeat(100); // > 63 bytes, so will be Variant::String + let mut string_builder = LargeStringBuilder::new(); + string_builder.append_value(&long_string); + string_builder.append_null(); + string_builder.append_value("short"); + let string_array = string_builder.finish(); + + run_test( + Arc::new(string_array), + vec![ + Some(Variant::from(long_string.as_str())), + None, + Some(Variant::from("short")), + ], + ); + } + + #[test] + fn test_cast_to_variant_utf8_view() { + // Test with short strings (should become ShortString variants) + let mut builder = StringViewBuilder::new(); + builder.append_value("hello"); + builder.append_value(""); + builder.append_null(); + builder.append_value("world"); + let string_view_array = builder.finish(); + + run_test( + Arc::new(string_view_array), + vec![ + Some(Variant::from("hello")), + Some(Variant::from("")), + None, + Some(Variant::from("world")), + ], + ); + + // Test with a long string (should become String variant) + let long_string = "c".repeat(100); // > 63 bytes, so will be Variant::String + let mut builder = StringViewBuilder::new(); + builder.append_value(&long_string); + builder.append_null(); + builder.append_value("short"); + let string_view_array = builder.finish(); + + run_test( + Arc::new(string_view_array), + vec![ + Some(Variant::from(long_string.as_str())), + None, + Some(Variant::from("short")), + ], + ); + } + /// Converts the given `Array` to a `VariantArray` and tests the conversion /// against the expected values. It also tests the handling of nulls by /// setting one element to null and verifying the output. From 0acf4dfd1955fd2609d7901f27daa6370c84a4fd Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Fri, 8 Aug 2025 13:21:45 -0400 Subject: [PATCH 2/5] [FIX] clean code --- parquet-variant-compute/src/variant_array_builder.rs | 4 ++-- parquet-variant-compute/src/variant_get/output/primitive.rs | 2 +- parquet-variant-compute/src/variant_get/output/variant.rs | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/parquet-variant-compute/src/variant_array_builder.rs b/parquet-variant-compute/src/variant_array_builder.rs index 36bd6567700b..39527340d55e 100644 --- a/parquet-variant-compute/src/variant_array_builder.rs +++ b/parquet-variant-compute/src/variant_array_builder.rs @@ -217,7 +217,7 @@ pub struct VariantArrayVariantBuilder<'a> { variant_builder: VariantBuilder, } -impl<'a> VariantBuilderExt for VariantArrayVariantBuilder<'a> { +impl VariantBuilderExt for VariantArrayVariantBuilder<'_> { fn append_value<'m, 'v>(&mut self, value: impl Into>) { self.variant_builder.append_value(value); } @@ -300,7 +300,7 @@ impl<'a> VariantArrayVariantBuilder<'a> { } } -impl<'a> Drop for VariantArrayVariantBuilder<'a> { +impl Drop for VariantArrayVariantBuilder<'_> { /// If the builder was not finished, roll back any changes made to the /// underlying buffers (by truncating them) fn drop(&mut self) { diff --git a/parquet-variant-compute/src/variant_get/output/primitive.rs b/parquet-variant-compute/src/variant_get/output/primitive.rs index 517635e7913d..496d711c1044 100644 --- a/parquet-variant-compute/src/variant_get/output/primitive.rs +++ b/parquet-variant-compute/src/variant_get/output/primitive.rs @@ -68,7 +68,7 @@ impl<'a, T: ArrowPrimitiveVariant> PrimitiveOutputBuilder<'a, T> { } } -impl<'a, T: ArrowPrimitiveVariant> OutputBuilder for PrimitiveOutputBuilder<'a, T> { +impl OutputBuilder for PrimitiveOutputBuilder<'_, T> { fn partially_shredded( &self, variant_array: &VariantArray, diff --git a/parquet-variant-compute/src/variant_get/output/variant.rs b/parquet-variant-compute/src/variant_get/output/variant.rs index c20949ce6474..6f2f829b662d 100644 --- a/parquet-variant-compute/src/variant_get/output/variant.rs +++ b/parquet-variant-compute/src/variant_get/output/variant.rs @@ -35,7 +35,7 @@ impl<'a> VariantOutputBuilder<'a> { } } -impl<'a> OutputBuilder for VariantOutputBuilder<'a> { +impl OutputBuilder for VariantOutputBuilder<'_> { fn partially_shredded( &self, variant_array: &VariantArray, From 5fd5cbeafa36300a7fea087327f9a7774e24e94e Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Tue, 12 Aug 2025 15:51:25 -0400 Subject: [PATCH 3/5] [FIX] fix conflict and linting issues --- .../src/cast_to_variant.rs | 74 +++++-------------- 1 file changed, 18 insertions(+), 56 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index a26f97a29e5d..b0659ba1f0ba 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -1187,22 +1187,9 @@ mod tests { fn test_cast_to_variant_utf8() { // Test with short strings (should become ShortString variants) - let short_strings = vec![ - Some("hello"), - Some(""), - None, - Some("world"), - Some("test"), - ]; - let mut string_builder = StringBuilder::new(); - for s in short_strings.iter() { - match s { - Some(value) => string_builder.append_value(value), - None => string_builder.append_null(), - } - } - let string_array = string_builder.finish(); - + let short_strings = vec![Some("hello"), Some(""), None, Some("world"), Some("test")]; + let string_array = StringArray::from(short_strings.clone()); + run_test( Arc::new(string_array), vec![ @@ -1216,12 +1203,9 @@ mod tests { // Test with a long string (should become String variant) let long_string = "a".repeat(100); // > 63 bytes, so will be Variant::String - let mut string_builder = StringBuilder::new(); - string_builder.append_value(&long_string); - string_builder.append_null(); - string_builder.append_value("short"); - let string_array = string_builder.finish(); - + let long_strings = vec![Some(long_string.clone()), None, Some("short".to_string())]; + let string_array = StringArray::from(long_strings); + run_test( Arc::new(string_array), vec![ @@ -1235,21 +1219,9 @@ mod tests { #[test] fn test_cast_to_variant_large_utf8() { // Test with short strings (should become ShortString variants) - let short_strings = vec![ - Some("hello"), - Some(""), - None, - Some("world"), - ]; - let mut string_builder = LargeStringBuilder::new(); - for s in short_strings.iter() { - match s { - Some(value) => string_builder.append_value(value), - None => string_builder.append_null(), - } - } - let string_array = string_builder.finish(); - + let short_strings = vec![Some("hello"), Some(""), None, Some("world")]; + let string_array = LargeStringArray::from(short_strings.clone()); + run_test( Arc::new(string_array), vec![ @@ -1262,12 +1234,9 @@ mod tests { // Test with a long string (should become String variant) let long_string = "b".repeat(100); // > 63 bytes, so will be Variant::String - let mut string_builder = LargeStringBuilder::new(); - string_builder.append_value(&long_string); - string_builder.append_null(); - string_builder.append_value("short"); - let string_array = string_builder.finish(); - + let long_strings = vec![Some(long_string.clone()), None, Some("short".to_string())]; + let string_array = LargeStringArray::from(long_strings); + run_test( Arc::new(string_array), vec![ @@ -1281,13 +1250,9 @@ mod tests { #[test] fn test_cast_to_variant_utf8_view() { // Test with short strings (should become ShortString variants) - let mut builder = StringViewBuilder::new(); - builder.append_value("hello"); - builder.append_value(""); - builder.append_null(); - builder.append_value("world"); - let string_view_array = builder.finish(); - + let short_strings = vec![Some("hello"), Some(""), None, Some("world")]; + let string_view_array = StringViewArray::from(short_strings.clone()); + run_test( Arc::new(string_view_array), vec![ @@ -1300,12 +1265,9 @@ mod tests { // Test with a long string (should become String variant) let long_string = "c".repeat(100); // > 63 bytes, so will be Variant::String - let mut builder = StringViewBuilder::new(); - builder.append_value(&long_string); - builder.append_null(); - builder.append_value("short"); - let string_view_array = builder.finish(); - + let long_strings = vec![Some(long_string.clone()), None, Some("short".to_string())]; + let string_view_array = StringViewArray::from(long_strings); + run_test( Arc::new(string_view_array), vec![ From b82734dd852ccb18a799f68f55d7dbf7d263e6a5 Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Thu, 14 Aug 2025 12:42:07 -0400 Subject: [PATCH 4/5] [FIX] add suggestion of macro cast_conversion --- .../src/cast_to_variant.rs | 45 ++++++++----------- 1 file changed, 18 insertions(+), 27 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index b0659ba1f0ba..b3226c6810ab 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -178,6 +178,21 @@ macro_rules! decimal_to_variant_decimal { }; } +/// Convert string arrays using the offset size as the type parameter +macro_rules! cast_conversion_string { + ($offset_type:ty, $method:ident, $cast_fn:expr, $input:expr, $builder:expr) => {{ + let array = $input.$method::<$offset_type>(); + for i in 0..array.len() { + if array.is_null(i) { + $builder.append_null(); + continue; + } + let cast_value = $cast_fn(array.value(i)); + $builder.append_variant(Variant::from(cast_value)); + } + }}; +} + /// Casts a typed arrow [`Array`] to a [`VariantArray`]. This is useful when you /// need to convert a specific data type /// @@ -329,37 +344,13 @@ pub fn cast_to_variant(input: &dyn Array) -> Result { )); } DataType::Utf8 => { - let array = input.as_string::(); - for i in 0..array.len() { - if array.is_null(i) { - builder.append_null(); - continue; - } - let cast_value = array.value(i); - builder.append_variant(Variant::from(cast_value)); - } + cast_conversion_string!(i32, as_string, |v| v, input, builder); } DataType::LargeUtf8 => { - let array = input.as_string::(); - for i in 0..array.len() { - if array.is_null(i) { - builder.append_null(); - continue; - } - let cast_value = array.value(i); - builder.append_variant(Variant::from(cast_value)); - } + cast_conversion_string!(i64, as_string, |v| v, input, builder); } DataType::Utf8View => { - let array = input.as_string_view(); - for i in 0..array.len() { - if array.is_null(i) { - builder.append_null(); - continue; - } - let cast_value = array.value(i); - builder.append_variant(Variant::from(cast_value)); - } + cast_conversion_nongeneric!(as_string_view, |v| v, input, builder); } dt => { return Err(ArrowError::CastError(format!( From 295a90df2ce44b92ff69db1ef6444c94163dbd33 Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Thu, 14 Aug 2025 12:51:18 -0400 Subject: [PATCH 5/5] [FIX] fix linting and clippy issues --- .../src/cast_to_variant.rs | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index b3226c6810ab..343d387b241e 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -178,6 +178,21 @@ macro_rules! decimal_to_variant_decimal { }; } +/// Convert arrays that don't need generic type parameters +macro_rules! cast_conversion_nongeneric { + ($method:ident, $cast_fn:expr, $input:expr, $builder:expr) => {{ + let array = $input.$method(); + for i in 0..array.len() { + if array.is_null(i) { + $builder.append_null(); + continue; + } + let cast_value = $cast_fn(array.value(i)); + $builder.append_variant(Variant::from(cast_value)); + } + }}; +} + /// Convert string arrays using the offset size as the type parameter macro_rules! cast_conversion_string { ($offset_type:ty, $method:ident, $cast_fn:expr, $input:expr, $builder:expr) => {{ @@ -371,8 +386,9 @@ mod tests { use arrow::array::{ ArrayRef, BooleanArray, Decimal128Array, Decimal256Array, Decimal32Array, Decimal64Array, FixedSizeBinaryBuilder, Float16Array, Float32Array, Float64Array, GenericByteBuilder, - GenericByteViewBuilder, Int16Array, Int32Array, Int64Array, Int8Array, LargeStringBuilder, StringBuilder, StringViewBuilder, - IntervalYearMonthArray, NullArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, + GenericByteViewBuilder, Int16Array, Int32Array, Int64Array, Int8Array, + IntervalYearMonthArray, LargeStringArray, NullArray, StringArray, StringViewArray, + UInt16Array, UInt32Array, UInt64Array, UInt8Array, }; use arrow_schema::{ DECIMAL128_MAX_PRECISION, DECIMAL32_MAX_PRECISION, DECIMAL64_MAX_PRECISION, @@ -1176,6 +1192,7 @@ mod tests { ) } + #[test] fn test_cast_to_variant_utf8() { // Test with short strings (should become ShortString variants) let short_strings = vec![Some("hello"), Some(""), None, Some("world"), Some("test")];