diff --git a/.github/workflows/parquet.yml b/.github/workflows/parquet.yml
index 0d1a01ca5e23..09fc18e351d9 100644
--- a/.github/workflows/parquet.yml
+++ b/.github/workflows/parquet.yml
@@ -42,6 +42,9 @@ on:
       - arrow-json/**
       - arrow-avro/**
       - parquet/**
+      - parquet-variant/**
+      - parquet-variant-compute/**
+      - parquet-variant-json/**
       - .github/**
 
 jobs:
diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs
index e87d03f88c5b..4abffa65c23f 100644
--- a/parquet-variant-compute/src/variant_array.rs
+++ b/parquet-variant-compute/src/variant_array.rs
@@ -568,6 +568,16 @@ fn typed_value_to_variant(typed_value: &ArrayRef, index: usize) -> Variant<'_, '
             let value = array.value(index);
             Variant::from(value)
         }
+        DataType::BinaryView => {
+            let array = typed_value.as_binary_view();
+            let value = array.value(index);
+            Variant::from(value)
+        }
+        DataType::Utf8 => {
+            let array = typed_value.as_string::<i32>();
+            let value = array.value(index);
+            Variant::from(value)
+        }
         DataType::Int8 => {
             primitive_conversion_single_value!(Int8Type, typed_value, index)
         }
diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs
index a5819fc45937..5cd3c094e286 100644
--- a/parquet-variant-compute/src/variant_get.rs
+++ b/parquet-variant-compute/src/variant_get.rs
@@ -508,6 +508,40 @@ mod test {
         assert_eq!(result.value(3), Variant::from(&[4u8, 5u8, 6u8][..]));
     }
 
+    #[test]
+    fn get_variant_partially_shredded_utf8_as_variant() {
+        let array = partially_shredded_utf8_variant_array();
+        let options = GetOptions::new();
+        let result = variant_get(&array, options).unwrap();
+
+        // expect the result is a VariantArray
+        let result: &VariantArray = result.as_any().downcast_ref().unwrap();
+        assert_eq!(result.len(), 4);
+
+        // Expect the values are the same as the original values
+        assert_eq!(result.value(0), Variant::from("hello"));
+        assert!(!result.is_valid(1));
+        assert_eq!(result.value(2), Variant::from("n/a"));
+        assert_eq!(result.value(3), Variant::from("world"));
+    }
+
+    #[test]
+    fn get_variant_partially_shredded_binary_view_as_variant() {
+        let array = partially_shredded_binary_view_variant_array();
+        let options = GetOptions::new();
+        let result = variant_get(&array, options).unwrap();
+
+        // expect the result is a VariantArray
+        let result: &VariantArray = result.as_any().downcast_ref().unwrap();
+        assert_eq!(result.len(), 4);
+
+        // Expect the values are the same as the original values
+        assert_eq!(result.value(0), Variant::from(&[1u8, 2u8, 3u8][..]));
+        assert!(!result.is_valid(1));
+        assert_eq!(result.value(2), Variant::from("n/a"));
+        assert_eq!(result.value(3), Variant::from(&[4u8, 5u8, 6u8][..]));
+    }
+
     /// Shredding: extract a value as an Int32Array
     #[test]
     fn get_variant_shredded_int32_as_int32_safe_cast() {
@@ -1018,6 +1052,100 @@ mod test {
         )
     }
 
+    /// Return a VariantArray that represents a partially "shredded" variant for UTF8
+    fn partially_shredded_utf8_variant_array() -> ArrayRef {
+        let (metadata, string_value) = {
+            let mut builder = parquet_variant::VariantBuilder::new();
+            builder.append_value("n/a");
+            builder.finish()
+        };
+
+        // Create the null buffer for the overall array
+        let nulls = NullBuffer::from(vec![
+            true,  // row 0 non null
+            false, // row 1 is null
+            true,  // row 2 non null
+            true,  // row 3 non null
+        ]);
+
+        // metadata is the same for all rows
+        let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 4));
+
+        // See https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?disco=AAABml8WQrY
+        // about why row 1 is an empty, but non-null, value.
+        let values = BinaryViewArray::from(vec![
+            None,                // row 0 is shredded, so no value
+            Some(b"" as &[u8]),  // row 1 is null, so empty value
+            Some(&string_value), // copy the string value "n/a"
+            None,                // row 3 is shredded, so no value
+        ]);
+
+        let typed_value = StringArray::from(vec![
+            Some("hello"), // row 0 is shredded
+            None,          // row 1 is null
+            None,          // row 2 is not shredded: its string lives in `value`
+            Some("world"), // row 3 is shredded
+        ]);
+
+        let struct_array = StructArrayBuilder::new()
+            .with_field("metadata", Arc::new(metadata), true)
+            .with_field("typed_value", Arc::new(typed_value), true)
+            .with_field("value", Arc::new(values), true)
+            .with_nulls(nulls)
+            .build();
+
+        Arc::new(
+            VariantArray::try_new(Arc::new(struct_array)).expect("should create variant array"),
+        )
+    }
+
+    /// Return a VariantArray that represents a partially "shredded" variant for BinaryView
+    fn partially_shredded_binary_view_variant_array() -> ArrayRef {
+        let (metadata, string_value) = {
+            let mut builder = parquet_variant::VariantBuilder::new();
+            builder.append_value("n/a");
+            builder.finish()
+        };
+
+        // Create the null buffer for the overall array
+        let nulls = NullBuffer::from(vec![
+            true,  // row 0 non null
+            false, // row 1 is null
+            true,  // row 2 non null
+            true,  // row 3 non null
+        ]);
+
+        // metadata is the same for all rows
+        let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 4));
+
+        // See https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?disco=AAABml8WQrY
+        // about why row 1 is an empty, but non-null, value.
+        let values = BinaryViewArray::from(vec![
+            None,                // row 0 is shredded, so no value
+            Some(b"" as &[u8]),  // row 1 is null, so empty value
+            Some(&string_value), // copy the string value "n/a"
+            None,                // row 3 is shredded, so no value
+        ]);
+
+        let typed_value = BinaryViewArray::from(vec![
+            Some(&[1u8, 2u8, 3u8][..]), // row 0 is shredded
+            None,                       // row 1 is null
+            None,                       // row 2 is not shredded: its string lives in `value`
+            Some(&[4u8, 5u8, 6u8][..]), // row 3 is shredded
+        ]);
+
+        let struct_array = StructArrayBuilder::new()
+            .with_field("metadata", Arc::new(metadata), true)
+            .with_field("typed_value", Arc::new(typed_value), true)
+            .with_field("value", Arc::new(values), true)
+            .with_nulls(nulls)
+            .build();
+
+        Arc::new(
+            VariantArray::try_new(Arc::new(struct_array)).expect("should create variant array"),
+        )
+    }
+
     /// Return a VariantArray that represents an "all null" variant
     /// for the following example (3 null values):
     ///
diff --git a/parquet/tests/variant_integration.rs b/parquet/tests/variant_integration.rs
index 6a586e013ef5..97fb6b880108 100644
--- a/parquet/tests/variant_integration.rs
+++ b/parquet/tests/variant_integration.rs
@@ -119,9 +119,8 @@ variant_test_case!(26, "Unsupported typed_value type: Decimal128(18, 9)");
 variant_test_case!(27, "Unsupported typed_value type: Decimal128(18, 9)");
 variant_test_case!(28, "Unsupported typed_value type: Decimal128(38, 9)");
 variant_test_case!(29, "Unsupported typed_value type: Decimal128(38, 9)");
-// https://github.com/apache/arrow-rs/issues/8333
-variant_test_case!(30, "Unsupported typed_value type: BinaryView");
-variant_test_case!(31, "Unsupported typed_value type: Utf8");
+variant_test_case!(30);
+variant_test_case!(31);
 // https://github.com/apache/arrow-rs/issues/8334
 variant_test_case!(32, "Unsupported typed_value type: Time64(Microsecond)");
 // https://github.com/apache/arrow-rs/issues/8331