diff --git a/parquet-variant-compute/src/from_json.rs b/parquet-variant-compute/src/from_json.rs
index a101bf01cfda..644bd8ad6a90 100644
--- a/parquet-variant-compute/src/from_json.rs
+++ b/parquet-variant-compute/src/from_json.rs
@@ -19,40 +19,57 @@
 //! STRUCT<metadata: BINARY, value: BINARY>
 
 use crate::{VariantArray, VariantArrayBuilder};
-use arrow::array::{Array, ArrayRef, StringArray};
+use arrow::array::{Array, ArrayRef, LargeStringArray, StringArray, StringViewArray};
 use arrow_schema::ArrowError;
 use parquet_variant_json::json_to_variant;
 
+/// Macro to convert string array to variant array
+macro_rules! string_array_to_variant {
+    ($input:expr, $array:expr, $builder:expr) => {{
+        for i in 0..$input.len() {
+            if $input.is_null(i) {
+                $builder.append_null();
+            } else {
+                let mut vb = $builder.variant_builder();
+                json_to_variant($array.value(i), &mut vb)?;
+                vb.finish()
+            }
+        }
+    }};
+}
+
 /// Parse a batch of JSON strings into a batch of Variants represented as
 /// STRUCT<metadata: BINARY, value: BINARY> where nulls are preserved. The JSON strings in the input
 /// must be valid.
+///
+/// Supports the following string array types:
+/// - [`StringArray`]
+/// - [`LargeStringArray`]
+/// - [`StringViewArray`]
 pub fn batch_json_string_to_variant(input: &ArrayRef) -> Result<VariantArray, ArrowError> {
-    let input_string_array = match input.as_any().downcast_ref::<StringArray>() {
-        Some(string_array) => Ok(string_array),
-        None => Err(ArrowError::CastError(
-            "Expected reference to StringArray as input".into(),
-        )),
-    }?;
-
-    let mut variant_array_builder = VariantArrayBuilder::new(input_string_array.len());
-    for i in 0..input.len() {
-        if input.is_null(i) {
-            // The subfields are expected to be non-nullable according to the parquet variant spec.
-            variant_array_builder.append_null();
-        } else {
-            let mut vb = variant_array_builder.variant_builder();
-            // parse JSON directly to the variant builder
-            json_to_variant(input_string_array.value(i), &mut vb)?;
-            vb.finish()
-        }
+    let mut variant_array_builder = VariantArrayBuilder::new(input.len());
+
+    // Try each string array type in sequence
+    if let Some(string_array) = input.as_any().downcast_ref::<StringArray>() {
+        string_array_to_variant!(input, string_array, variant_array_builder);
+    } else if let Some(large_string_array) = input.as_any().downcast_ref::<LargeStringArray>() {
+        string_array_to_variant!(input, large_string_array, variant_array_builder);
+    } else if let Some(string_view_array) = input.as_any().downcast_ref::<StringViewArray>() {
+        string_array_to_variant!(input, string_view_array, variant_array_builder);
+    } else {
+        return Err(ArrowError::CastError(
+            "Expected reference to StringArray, LargeStringArray, or StringViewArray as input"
+                .into(),
+        ));
     }
+
     Ok(variant_array_builder.build())
 }
 
 #[cfg(test)]
 mod test {
     use crate::batch_json_string_to_variant;
-    use arrow::array::{Array, ArrayRef, StringArray};
+    use arrow::array::{Array, ArrayRef, LargeStringArray, StringArray, StringViewArray};
     use arrow_schema::ArrowError;
     use parquet_variant::{Variant, VariantBuilder};
     use std::sync::Arc;
@@ -105,4 +122,102 @@ mod test {
         assert!(!value_array.is_null(4));
         Ok(())
     }
+
+    #[test]
+    fn test_batch_json_string_to_variant_large_string() -> Result<(), ArrowError> {
+        let input = LargeStringArray::from(vec![
+            Some("1"),
+            None,
+            Some("{\"a\": 32}"),
+            Some("null"),
+            None,
+        ]);
+        let array_ref: ArrayRef = Arc::new(input);
+        let variant_array = batch_json_string_to_variant(&array_ref).unwrap();
+
+        let metadata_array = variant_array.metadata_field();
+        let value_array = variant_array.value_field().expect("value field");
+
+        // Compare row 0
+        assert!(!variant_array.is_null(0));
+        assert_eq!(variant_array.value(0), Variant::Int8(1));
+
+        // Compare row 1
+        assert!(variant_array.is_null(1));
+
+        // Compare row 2
+        assert!(!variant_array.is_null(2));
+        {
+            let mut vb = VariantBuilder::new();
+            let mut ob = vb.new_object();
+            ob.insert("a", Variant::Int8(32));
+            ob.finish()?;
+            let (object_metadata, object_value) = vb.finish();
+            let expected = Variant::new(&object_metadata, &object_value);
+            assert_eq!(variant_array.value(2), expected);
+        }
+
+        // Compare row 3 (Note this is a variant NULL, not a null row)
+        assert!(!variant_array.is_null(3));
+        assert_eq!(variant_array.value(3), Variant::Null);
+
+        // Compare row 4
+        assert!(variant_array.is_null(4));
+
+        // Ensure that the subfields are not nullable
+        assert!(!metadata_array.is_null(1));
+        assert!(!value_array.is_null(1));
+        assert!(!metadata_array.is_null(4));
+        assert!(!value_array.is_null(4));
+        Ok(())
+    }
+
+    #[test]
+    fn test_batch_json_string_to_variant_string_view() -> Result<(), ArrowError> {
+        let input = StringViewArray::from(vec![
+            Some("1"),
+            None,
+            Some("{\"a\": 32}"),
+            Some("null"),
+            None,
+        ]);
+        let array_ref: ArrayRef = Arc::new(input);
+        let variant_array = batch_json_string_to_variant(&array_ref).unwrap();
+
+        let metadata_array = variant_array.metadata_field();
+        let value_array = variant_array.value_field().expect("value field");
+
+        // Compare row 0
+        assert!(!variant_array.is_null(0));
+        assert_eq!(variant_array.value(0), Variant::Int8(1));
+
+        // Compare row 1
+        assert!(variant_array.is_null(1));
+
+        // Compare row 2
+        assert!(!variant_array.is_null(2));
+        {
+            let mut vb = VariantBuilder::new();
+            let mut ob = vb.new_object();
+            ob.insert("a", Variant::Int8(32));
+            ob.finish()?;
+            let (object_metadata, object_value) = vb.finish();
+            let expected = Variant::new(&object_metadata, &object_value);
+            assert_eq!(variant_array.value(2), expected);
+        }
+
+        // Compare row 3 (Note this is a variant NULL, not a null row)
+        assert!(!variant_array.is_null(3));
+        assert_eq!(variant_array.value(3), Variant::Null);
+
+        // Compare row 4
+        assert!(variant_array.is_null(4));
+
+        // Ensure that the subfields are not nullable
+        assert!(!metadata_array.is_null(1));
+        assert!(!value_array.is_null(1));
+        assert!(!metadata_array.is_null(4));
+        assert!(!value_array.is_null(4));
+        Ok(())
+    }
 }