From be5cf31f9ce848d3f61c98dff807994a1e2028b7 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 13 Aug 2025 17:11:00 -0400 Subject: [PATCH 1/7] Add `variant` feature to parquet crate --- .github/workflows/parquet.yml | 4 +- Cargo.toml | 2 +- parquet/Cargo.toml | 6 + parquet/README.md | 2 + parquet/src/lib.rs | 11 ++ parquet/src/variant.rs | 111 ++++++++++++++++++ ..._integration.rs => variant_integration.rs} | 0 7 files changed, 134 insertions(+), 2 deletions(-) create mode 100644 parquet/src/variant.rs rename parquet/tests/{simple_variant_integration.rs => variant_integration.rs} (100%) diff --git a/.github/workflows/parquet.yml b/.github/workflows/parquet.yml index 8a2301acd90c..5d44afea78ff 100644 --- a/.github/workflows/parquet.yml +++ b/.github/workflows/parquet.yml @@ -119,7 +119,9 @@ jobs: run: cargo check -p parquet --no-default-features --features flate2 --features flate2-rust_backened - name: Check compilation --no-default-features --features flate2 --features flate2-zlib-rs run: cargo check -p parquet --no-default-features --features flate2 --features flate2-zlib-rs - + - name: Check compilation --no-default-features --features variant_experimental + run: cargo check -p parquet --no-default-features --features variant_experimental + # test the parquet crate builds against wasm32 in stable rust wasm32-build: diff --git a/Cargo.toml b/Cargo.toml index 9d1ad6d03b5e..f78b65482155 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -104,7 +104,7 @@ parquet = { version = "56.0.0", path = "./parquet", default-features = false } # These crates have not yet been released and thus do not use the workspace version parquet-variant = { version = "0.1.0", path = "./parquet-variant" } parquet-variant-json = { version = "0.1.0", path = "./parquet-variant-json" } -parquet-variant-compute = { version = "0.1.0", path = "./parquet-variant-json" } +parquet-variant-compute = { version = "0.1.0", path = "./parquet-variant-compute" } chrono = { version = "0.4.40", default-features 
= false, features = ["clock"] } diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 05557069aa7d..bff914bb4312 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -45,6 +45,10 @@ arrow-data = { workspace = true, optional = true } arrow-schema = { workspace = true, optional = true } arrow-select = { workspace = true, optional = true } arrow-ipc = { workspace = true, optional = true } +parquet-variant = { workspace = true, optional = true } +parquet-variant-json = { workspace = true, optional = true } +parquet-variant-compute = { workspace = true, optional = true } + object_store = { version = "0.12.0", default-features = false, optional = true } bytes = { version = "1.1", default-features = false, features = ["std"] } @@ -123,6 +127,8 @@ encryption = ["dep:ring"] # Explicitely enabling rust_backend and zlib-rs features for flate2 flate2-rust_backened = ["flate2/rust_backend"] flate2-zlib-rs = ["flate2/zlib-rs"] +# Enable parquet variant support +variant_experimental = ["parquet-variant", "parquet-variant-json", "parquet-variant-compute"] [[example]] diff --git a/parquet/README.md b/parquet/README.md index 8fc72bfbc32a..e1b33d89612c 100644 --- a/parquet/README.md +++ b/parquet/README.md @@ -64,9 +64,11 @@ The `parquet` crate provides the following features which may be enabled in your - `experimental` - Experimental APIs which may change, even between minor releases - `simdutf8` (default) - Use the [`simdutf8`] crate for SIMD-accelerated UTF-8 validation - `encryption` - support for reading / writing encrypted Parquet files +- `variant_experimental` - ⚠️ Experimental [Parquet Variant] support, which may change, even between minor releases. 
[`arrow`]: https://crates.io/crates/arrow [`simdutf8`]: https://crates.io/crates/simdutf8 +[Parquet Variant]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md ## Parquet Feature Status diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs index 07a673c295bc..1142a1c4a0d0 100644 --- a/parquet/src/lib.rs +++ b/parquet/src/lib.rs @@ -86,6 +86,14 @@ //! [`ParquetRecordBatchStreamBuilder`]: arrow::async_reader::ParquetRecordBatchStreamBuilder //! [`ParquetObjectReader`]: arrow::async_reader::ParquetObjectReader //! +//! ## Variant Logical Type (`variant_experimental` feature) +//! +//! The [`variant`] module supports reading and writing Parquet files +//! with the [Variant Binary Encoding] logical type, which can represent +//! semi-structured data such as JSON efficiently. +//! +//! [Variant Binary Encoding]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md +//! //! ## Read/Write Parquet Directly //! //! Workloads needing finer-grained control, or to avoid a dependence on arrow, @@ -179,3 +187,6 @@ pub mod record; pub mod schema; pub mod thrift; + +#[cfg(feature = "variant_experimental")] +pub mod variant; diff --git a/parquet/src/variant.rs b/parquet/src/variant.rs new file mode 100644 index 000000000000..52073474d70b --- /dev/null +++ b/parquet/src/variant.rs @@ -0,0 +1,111 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! ⚠️ Experimental Support for reading and writing [`Variant`]s to / from Parquet files ⚠️ +//! +//! This is a 🚧 Work In Progress +//! +//! Note: Requires the `variant` feature of the `parquet` crate to be enabled. +//! +//! # Features +//! * [`Variant`] represents variant value, which can be an object, list, or primitive. +//! * [`VariantBuilder`] for building `Variant` values. +//! * [`VariantArray`] for representing a column of Variant values. +//! * [`compute`] module with functions for manipulating Variants, such as +//! [`variant_get`] to extract a value by path and functions to convert +//! between `Variant` and JSON. +//! +//! [Variant Logical Type]: Variant +//! [`VariantArray`]: compute::VariantArray +//! [`variant_get`]: compute::variant_get +//! +//! # Example: Writing a Parquet file with Variant column +//! ```rust +//! # use parquet::variant::compute::{VariantArray, VariantArrayBuilder}; +//! # use parquet::variant::VariantBuilderExt; +//! # use std::sync::Arc; +//! # use arrow_array::{ArrayRef, RecordBatch}; +//! # use parquet::arrow::ArrowWriter; +//! # fn main() -> Result<(), parquet::errors::ParquetError> { +//! // Use the VariantArrayBuilder to build a VariantArray +//! let mut builder = VariantArrayBuilder::new(3); +//! // row 1: {"name": "Alice"} +//! let mut variant_builder = builder.variant_builder(); +//! variant_builder.new_object().with_field("name", "Alice").finish()?; +//! variant_builder.finish(); +//! let array = builder.build(); +//! +//! // TODO support writing VariantArray directly +//! 
// at the moment it panics when trying to downcast to a struct array +//! // let array: ArrayRef = Arc::new(array); +//! let array: ArrayRef = Arc::new(array.into_inner()); +//! +//! // create a RecordBatch with the VariantArray +//! let batch = RecordBatch::try_from_iter(vec![("data", array)])?; +//! +//! // write the RecordBatch to a Parquet file +//! let file = std::fs::File::create("variant.parquet")?; +//! let mut writer = ArrowWriter::try_new(file, batch.schema(), None)?; +//! writer.write(&batch)?; +//! writer.close()?; +//! +//! # Ok(()) +//! # } +//! ``` +//! +//! # Example: Writing JSON with a Parquet file with Variant column +//! ```rust +//! # use std::sync::Arc; +//! # use arrow_array::{ArrayRef, RecordBatch, StringArray}; +//! # use parquet::variant::compute::batch_json_string_to_variant; +//! # use parquet::variant::compute::VariantArray; +//! # use parquet::arrow::ArrowWriter; +//! # fn main() -> Result<(), parquet::errors::ParquetError> { +//! // Create an array of JSON strings, simulating a column of JSON data +//! // TODO use StringViewArray when available +//! let input_array = StringArray::from(vec![ +//! Some(r#"{"name": "Alice", "age": 30}"#), +//! Some(r#"{"name": "Bob", "age": 25, "address": {"city": "New York"}}"#), +//! None, +//! Some("{}"), +//! ]); +//! let input_array: ArrayRef = Arc::new(input_array); +//! +//! // Convert the JSON strings to a VariantArray +//! let array: VariantArray = batch_json_string_to_variant(&input_array)?; +//! +//! // TODO support writing VariantArray directly +//! // at the moment it panics when trying to downcast to a struct array +//! // let array: ArrayRef = Arc::new(array); +//! let array: ArrayRef = Arc::new(array.into_inner()); +//! +//! // create a RecordBatch with the VariantArray +//! let batch = RecordBatch::try_from_iter(vec![("data", array)])?; +//! +//! // write the RecordBatch to a Parquet file +//! let file = std::fs::File::create("variant-json.parquet")?; +//! 
let mut writer = ArrowWriter::try_new(file, batch.schema(), None)?; +//! writer.write(&batch)?; +//! writer.close()?; +//! # Ok(()) +//! # } +//! ``` +//! +//! # Example: Reading a Parquet file with Variant column +//! (TODO: add example) +pub use parquet_variant::*; +pub use parquet_variant_compute as compute; diff --git a/parquet/tests/simple_variant_integration.rs b/parquet/tests/variant_integration.rs similarity index 100% rename from parquet/tests/simple_variant_integration.rs rename to parquet/tests/variant_integration.rs From d702545225938973d08ccbe0487e36e8c4f9914a Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 14 Aug 2025 14:54:36 -0400 Subject: [PATCH 2/7] prettier --- parquet/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet/README.md b/parquet/README.md index e1b33d89612c..5e087ac6a929 100644 --- a/parquet/README.md +++ b/parquet/README.md @@ -64,11 +64,11 @@ The `parquet` crate provides the following features which may be enabled in your - `experimental` - Experimental APIs which may change, even between minor releases - `simdutf8` (default) - Use the [`simdutf8`] crate for SIMD-accelerated UTF-8 validation - `encryption` - support for reading / writing encrypted Parquet files -- `variant_experimental` - ⚠️ Experimental [Parquet Variant] support, which may change, even between minor releases. +- `variant_experimental` - ⚠️ Experimental [Parquet Variant] support, which may change, even between minor releases. 
[`arrow`]: https://crates.io/crates/arrow [`simdutf8`]: https://crates.io/crates/simdutf8 -[Parquet Variant]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md +[parquet variant]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md ## Parquet Feature Status From 1be011e7ac0d344086eb5dde2f2b81faa0f1a1cc Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 20 Aug 2025 15:05:37 -0400 Subject: [PATCH 3/7] fix tests --- parquet/src/variant.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet/src/variant.rs b/parquet/src/variant.rs index 52073474d70b..a3e0192ea348 100644 --- a/parquet/src/variant.rs +++ b/parquet/src/variant.rs @@ -71,7 +71,7 @@ //! ```rust //! # use std::sync::Arc; //! # use arrow_array::{ArrayRef, RecordBatch, StringArray}; -//! # use parquet::variant::compute::batch_json_string_to_variant; +//! # use parquet::variant::compute::json_to_variant; //! # use parquet::variant::compute::VariantArray; //! # use parquet::arrow::ArrowWriter; //! # fn main() -> Result<(), parquet::errors::ParquetError> { @@ -86,7 +86,7 @@ //! let input_array: ArrayRef = Arc::new(input_array); //! //! // Convert the JSON strings to a VariantArray -//! let array: VariantArray = batch_json_string_to_variant(&input_array)?; +//! let array: VariantArray = json_to_variant(&input_array)?; //! //! // TODO support writing VariantArray directly //! // at the moment it panics when trying to downcast to a struct array From 2c4650f255c0bfe6c88c69faf3ba648fcda5d7fa Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 23 Aug 2025 06:09:28 -0400 Subject: [PATCH 4/7] Update parquet/src/variant.rs Co-authored-by: Matthijs Brobbel --- parquet/src/variant.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/variant.rs b/parquet/src/variant.rs index a3e0192ea348..f91fd7a58ff2 100644 --- a/parquet/src/variant.rs +++ b/parquet/src/variant.rs @@ -19,7 +19,7 @@ //! //! This is a 🚧 Work In Progress //! 
-//! Note: Requires the `variant` feature of the `parquet` crate to be enabled. +//! Note: Requires the `variant_experimental` feature of the `parquet` crate to be enabled. //! //! # Features //! * [`Variant`] represents variant value, which can be an object, list, or primitive. From abd29177f7d616c93955ba9bddec4e08676392a7 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 8 Sep 2025 13:26:07 -0400 Subject: [PATCH 5/7] Add variant_experimental in "experimental" feature --- parquet/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index c540f8220ee0..a39275fb254e 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -112,7 +112,7 @@ json = ["serde_json", "base64"] # Enable internal testing APIs test_common = ["arrow/test_utils"] # Experimental, unstable functionality primarily used for testing -experimental = [] +experimental = ["variant_experimental"] # Enable async APIs async = ["futures", "tokio"] # Enable object_store integration From 26196b58e4bb0be6d955497e3b63e3fd3b89d50a Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 8 Sep 2025 13:35:06 -0400 Subject: [PATCH 6/7] fixup doc tests --- parquet/src/variant.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/parquet/src/variant.rs b/parquet/src/variant.rs index f91fd7a58ff2..e3b57d53b096 100644 --- a/parquet/src/variant.rs +++ b/parquet/src/variant.rs @@ -45,7 +45,7 @@ //! let mut builder = VariantArrayBuilder::new(3); //! // row 1: {"name": "Alice"} //! let mut variant_builder = builder.variant_builder(); -//! variant_builder.new_object().with_field("name", "Alice").finish()?; +//! variant_builder.new_object().with_field("name", "Alice").finish(); //! variant_builder.finish(); //! let array = builder.build(); //! @@ -63,6 +63,7 @@ //! writer.write(&batch)?; //! writer.close()?; //! +//! # std::fs::remove_file("variant.parquet")?; //! # Ok(()) //! # } //! ``` @@ -101,6 +102,7 @@ //! 
let mut writer = ArrowWriter::try_new(file, batch.schema(), None)?; //! writer.write(&batch)?; //! writer.close()?; +//! # std::fs::remove_file("variant-json.parquet")?; //! # Ok(()) //! # } //! ``` From 9b9844fefc0ddd818246967b743ef7ae47bcace3 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 8 Sep 2025 13:45:23 -0400 Subject: [PATCH 7/7] add doc link --- parquet/src/variant.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/parquet/src/variant.rs b/parquet/src/variant.rs index e3b57d53b096..a837a877df76 100644 --- a/parquet/src/variant.rs +++ b/parquet/src/variant.rs @@ -51,6 +51,7 @@ //! //! // TODO support writing VariantArray directly //! // at the moment it panics when trying to downcast to a struct array +//! // https://github.com/apache/arrow-rs/issues/8296 //! // let array: ArrayRef = Arc::new(array); //! let array: ArrayRef = Arc::new(array.into_inner()); //! @@ -91,6 +92,7 @@ //! //! // TODO support writing VariantArray directly //! // at the moment it panics when trying to downcast to a struct array +//! // https://github.com/apache/arrow-rs/issues/8296 //! // let array: ArrayRef = Arc::new(array); //! let array: ArrayRef = Arc::new(array.into_inner()); //!