Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
155 changes: 135 additions & 20 deletions parquet-variant-compute/src/from_json.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,40 +19,57 @@
//! STRUCT<metadata: BINARY, value: BINARY>

use crate::{VariantArray, VariantArrayBuilder};
use arrow::array::{Array, ArrayRef, StringArray};
use arrow::array::{Array, ArrayRef, LargeStringArray, StringArray, StringViewArray};
use arrow_schema::ArrowError;
use parquet_variant_json::json_to_variant;

/// Appends one entry to `$builder` for every row of a string array.
///
/// * `$input`   - used for the row count (`len()`) and the per-row null check (`is_null(i)`).
/// * `$array`   - the concretely typed string array; `value(i)` supplies the JSON text of row `i`.
/// * `$builder` - receives `append_null()` for null rows, and otherwise hands out a per-row
///   builder via `variant_builder()` that the JSON is parsed into.
///
/// The expansion uses `?` on the result of `json_to_variant`, so it can only be invoked inside
/// a function whose return type can absorb that error; the first failing row returns early.
macro_rules! string_array_to_variant {
    ($input:expr, $array:expr, $builder:expr) => {{
        for row in 0..$input.len() {
            // Null rows are recorded as nulls and never reach the JSON parser.
            if $input.is_null(row) {
                $builder.append_null();
                continue;
            }

            // Parse the JSON text straight into this row's builder, then commit the row.
            let mut row_builder = $builder.variant_builder();
            json_to_variant($array.value(row), &mut row_builder)?;
            row_builder.finish();
        }
    }};
}

/// Parse a batch of JSON strings into a batch of Variants represented as
/// STRUCT<metadata: BINARY, value: BINARY> where nulls are preserved. The JSON strings in the input
/// must be valid.
///
/// Supports the following string array types:
/// - [`StringArray`]
/// - [`LargeStringArray`]
/// - [`StringViewArray`]
pub fn batch_json_string_to_variant(input: &ArrayRef) -> Result<VariantArray, ArrowError> {
let input_string_array = match input.as_any().downcast_ref::<StringArray>() {
Some(string_array) => Ok(string_array),
None => Err(ArrowError::CastError(
"Expected reference to StringArray as input".into(),
)),
}?;

let mut variant_array_builder = VariantArrayBuilder::new(input_string_array.len());
for i in 0..input.len() {
if input.is_null(i) {
// The subfields are expected to be non-nullable according to the parquet variant spec.
variant_array_builder.append_null();
} else {
let mut vb = variant_array_builder.variant_builder();
// parse JSON directly to the variant builder
json_to_variant(input_string_array.value(i), &mut vb)?;
vb.finish()
}
let mut variant_array_builder = VariantArrayBuilder::new(input.len());

// Try each string array type in sequence
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

if let Some(string_array) = input.as_any().downcast_ref::<StringArray>() {
string_array_to_variant!(input, string_array, variant_array_builder);
} else if let Some(large_string_array) = input.as_any().downcast_ref::<LargeStringArray>() {
string_array_to_variant!(input, large_string_array, variant_array_builder);
} else if let Some(string_view_array) = input.as_any().downcast_ref::<StringViewArray>() {
string_array_to_variant!(input, string_view_array, variant_array_builder);
} else {
return Err(ArrowError::CastError(
"Expected reference to StringArray, LargeStringArray, or StringViewArray as input"
.into(),
));
}

Ok(variant_array_builder.build())
}

#[cfg(test)]
mod test {
use crate::batch_json_string_to_variant;
use arrow::array::{Array, ArrayRef, StringArray};
use arrow::array::{Array, ArrayRef, LargeStringArray, StringArray, StringViewArray};
use arrow_schema::ArrowError;
use parquet_variant::{Variant, VariantBuilder};
use std::sync::Arc;
Expand Down Expand Up @@ -105,4 +122,102 @@ mod test {
assert!(!value_array.is_null(4));
Ok(())
}

/// Converts a [`LargeStringArray`] mixing JSON values and null rows, then checks every output
/// row as well as the nullness of the underlying `metadata` / `value` fields.
#[test]
fn test_batch_json_string_to_variant_large_string() -> Result<(), ArrowError> {
    let json_rows = vec![Some("1"), None, Some("{\"a\": 32}"), Some("null"), None];
    let array_ref: ArrayRef = Arc::new(LargeStringArray::from(json_rows));
    let variants = batch_json_string_to_variant(&array_ref).unwrap();

    let metadata_field = variants.metadata_field();
    let value_field = variants.value_field().expect("value field");

    // Row 0: a bare JSON number.
    assert!(!variants.is_null(0));
    assert_eq!(variants.value(0), Variant::Int8(1));

    // Row 1: null input row.
    assert!(variants.is_null(1));

    // Row 2: a JSON object, compared against an equivalent hand-built variant.
    assert!(!variants.is_null(2));
    {
        let mut expected_builder = VariantBuilder::new();
        let mut object_builder = expected_builder.new_object();
        object_builder.insert("a", Variant::Int8(32));
        object_builder.finish()?;
        let (expected_metadata, expected_value) = expected_builder.finish();
        let expected_object = Variant::new(&expected_metadata, &expected_value);
        assert_eq!(variants.value(2), expected_object);
    }

    // Row 3: JSON `null` is a variant NULL value, not a null row.
    assert!(!variants.is_null(3));
    assert_eq!(variants.value(3), Variant::Null);

    // Row 4: null input row.
    assert!(variants.is_null(4));

    // The subfields stay non-null even on the rows that are null at the top level.
    for null_row in [1, 4] {
        assert!(!metadata_field.is_null(null_row));
        assert!(!value_field.is_null(null_row));
    }
    Ok(())
}

/// Converts a [`StringViewArray`] mixing JSON values and null rows, then checks every output
/// row as well as the nullness of the underlying `metadata` / `value` fields.
#[test]
fn test_batch_json_string_to_variant_string_view() -> Result<(), ArrowError> {
    let json_rows = vec![Some("1"), None, Some("{\"a\": 32}"), Some("null"), None];
    let array_ref: ArrayRef = Arc::new(StringViewArray::from(json_rows));
    let variants = batch_json_string_to_variant(&array_ref).unwrap();

    let metadata_field = variants.metadata_field();
    let value_field = variants.value_field().expect("value field");

    // Row 0: a bare JSON number.
    assert!(!variants.is_null(0));
    assert_eq!(variants.value(0), Variant::Int8(1));

    // Row 1: null input row.
    assert!(variants.is_null(1));

    // Row 2: a JSON object, compared against an equivalent hand-built variant.
    assert!(!variants.is_null(2));
    {
        let mut expected_builder = VariantBuilder::new();
        let mut object_builder = expected_builder.new_object();
        object_builder.insert("a", Variant::Int8(32));
        object_builder.finish()?;
        let (expected_metadata, expected_value) = expected_builder.finish();
        let expected_object = Variant::new(&expected_metadata, &expected_value);
        assert_eq!(variants.value(2), expected_object);
    }

    // Row 3: JSON `null` is a variant NULL value, not a null row.
    assert!(!variants.is_null(3));
    assert_eq!(variants.value(3), Variant::Null);

    // Row 4: null input row.
    assert!(variants.is_null(4));

    // The subfields stay non-null even on the rows that are null at the top level.
    for null_row in [1, 4] {
        assert!(!metadata_field.is_null(null_row));
        assert!(!value_field.is_null(null_row));
    }
    Ok(())
}
}
Loading