From 87b438de715b3123bbd23a5405c64c51ec549165 Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Wed, 25 Jun 2025 14:23:41 -0700 Subject: [PATCH 01/35] add json_to_variant --- parquet-variant/Cargo.toml | 5 +- .../examples/variant_from_json_examples.rs | 105 ++++++ parquet-variant/src/decoder.rs | 5 + parquet-variant/src/from_json.rs | 98 +++++ parquet-variant/src/lib.rs | 4 + parquet-variant/src/variant.rs | 53 +++ parquet-variant/src/variant_buffer_manager.rs | 26 ++ parquet-variant/tests/test_json_to_variant.rs | 356 ++++++++++++++++++ 8 files changed, 651 insertions(+), 1 deletion(-) create mode 100644 parquet-variant/examples/variant_from_json_examples.rs create mode 100644 parquet-variant/src/from_json.rs create mode 100644 parquet-variant/src/variant_buffer_manager.rs create mode 100644 parquet-variant/tests/test_json_to_variant.rs diff --git a/parquet-variant/Cargo.toml b/parquet-variant/Cargo.toml index 51cec81b2ab6..189da1cfc4c3 100644 --- a/parquet-variant/Cargo.toml +++ b/parquet-variant/Cargo.toml @@ -35,7 +35,10 @@ rust-version = "1.83" [dependencies] arrow-schema = { workspace = true } chrono = { workspace = true } -serde_json = "1.0" +# "arbitrary_precision" allows us to manually parse numbers. "preserve_order" does not automatically +# sort object keys +serde_json = { version = "1.0", features = ["arbitrary_precision", "preserve_order"] } +rust_decimal = "1.37.2" base64 = "0.21" [lib] diff --git a/parquet-variant/examples/variant_from_json_examples.rs b/parquet-variant/examples/variant_from_json_examples.rs new file mode 100644 index 000000000000..43d9bf5281dc --- /dev/null +++ b/parquet-variant/examples/variant_from_json_examples.rs @@ -0,0 +1,105 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Example showing how to convert Variant values to JSON + +use arrow_schema::ArrowError; +use parquet_variant::{ + json_to_variant, variant_to_json, variant_to_json_string, variant_to_json_value, + VariantBufferManager, +}; + +/// The caller must provide an object implementing the `VariantBufferManager` trait to the library. +/// This allows the library to write the constructed variant to buffers provided by the caller. +/// This way, the caller has direct control over the output buffers. +pub struct SampleVariantBufferManager { + pub value_buffer: Box<[u8]>, + pub metadata_buffer: Box<[u8]>, +} + +impl VariantBufferManager for SampleVariantBufferManager { + #[inline(always)] + fn borrow_value_buffer(&mut self) -> &mut [u8] { + &mut self.value_buffer + } + + fn ensure_value_buffer_size(&mut self, size: usize) -> Result<(), ArrowError> { + let cur_len = self.value_buffer.len(); + if size > cur_len { + // Reallocate larger buffer + let mut new_buffer = vec![0u8; size].into_boxed_slice(); + new_buffer[..cur_len].copy_from_slice(&self.value_buffer); + self.value_buffer = new_buffer; + } + Ok(()) + } + + #[inline(always)] + fn borrow_metadata_buffer(&mut self) -> &mut [u8] { + &mut self.metadata_buffer + } + + fn ensure_metadata_buffer_size(&mut self, size: usize) -> Result<(), ArrowError> { + let cur_len = self.metadata_buffer.len(); + if size > cur_len { + // Reallocate larger buffer + let mut new_buffer = vec![0u8; size].into_boxed_slice(); + new_buffer[..cur_len].copy_from_slice(&self.metadata_buffer); + self.metadata_buffer = new_buffer; + } + Ok(()) + } +} + +fn main() -> Result<(), Box> { + let mut variant_buffer_manager = SampleVariantBufferManager { + value_buffer: vec![0u8; 1].into_boxed_slice(), + metadata_buffer: vec![0u8; 1].into_boxed_slice(), + }; + + let person_string = "{\"name\":\"Alice\", \"age\":30, ".to_string() + + "\"email\":\"alice@example.com\", \"is_active\": true, \"score\": 95.7," + + "\"additional_info\": null}"; + let mut value_size = 0usize; + let mut metadata_size = 0usize; + json_to_variant( + &person_string, + &mut variant_buffer_manager, + &mut value_size, + &mut metadata_size, + )?; + + let variant = parquet_variant::Variant::try_new( + &variant_buffer_manager.metadata_buffer, + &variant_buffer_manager.value_buffer, + )?; + + let json_string = variant_to_json_string(&variant)?; + let json_value = variant_to_json_value(&variant)?; + let pretty_json = serde_json::to_string_pretty(&json_value)?; + println!("{}", pretty_json); + + let mut buffer = Vec::new(); + variant_to_json(&mut buffer, &variant)?; + let buffer_result = String::from_utf8(buffer)?; + + // Verify all methods produce the same result + assert_eq!(json_string, buffer_result); + assert_eq!(json_string, serde_json::to_string(&json_value)?); + + Ok(()) +} diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs index cb8336b5b88d..e1e1eafba660 100644 --- a/parquet-variant/src/decoder.rs +++ b/parquet-variant/src/decoder.rs @@ -26,6 +26,11 @@ use std::num::TryFromIntError; // Makes the code a bit more readable pub(crate) const VARIANT_VALUE_HEADER_BYTES: usize = 1; +pub(crate) const MAX_UNSCALED_DECIMAL_4: i32 = 999999999; +pub(crate) const MAX_PRECISION_DECIMAL_4: u8 = 9; +pub(crate) const MAX_UNSCALED_DECIMAL_8: i64 = 999999999999999999i64; +pub(crate) const MAX_PRECISION_DECIMAL_8: u8 = 18; + #[derive(Debug, Clone, Copy, PartialEq)] pub enum VariantBasicType { Primitive = 0, diff --git a/parquet-variant/src/from_json.rs b/parquet-variant/src/from_json.rs new file mode 100644 index 000000000000..d70eba517c22 --- /dev/null +++ b/parquet-variant/src/from_json.rs @@ -0,0 +1,98 @@ +use crate::variant_buffer_manager::VariantBufferManager; +use crate::{ListBuilder, ObjectBuilder, Variant, VariantBuilder}; +use arrow_schema::ArrowError; +use serde_json::{Map, Value}; + +/// Eventually, internal writes should also be performed using VariantBufferManager instead of +/// ValueBuffer and MetadataBuffer so the caller has control of the memory +pub fn json_to_variant( + json: &str, + variant_buffer_manager: &mut T, + value_size: &mut usize, + metadata_size: &mut usize, +) -> Result<(), ArrowError> { + let mut builder = VariantBuilder::new(); + let json: Value = serde_json::from_str(json) + .map_err(|e| ArrowError::InvalidArgumentError(format!("JSON format error: {}", e)))?; + + build_json(&json, &mut builder)?; + let (metadata, value) = builder.finish(); + *value_size = value.len(); + *metadata_size = metadata.len(); + + // Write to caller's buffers - Remove this when the library internally writes to the caller's + // buffers anyway + variant_buffer_manager.ensure_value_buffer_size(*value_size)?; + variant_buffer_manager.ensure_metadata_buffer_size(*metadata_size)?; + let caller_value_buffer = variant_buffer_manager.borrow_value_buffer(); + caller_value_buffer[..*value_size].copy_from_slice(value.as_slice()); + let caller_metadata_buffer = variant_buffer_manager.borrow_metadata_buffer(); + caller_metadata_buffer[..*metadata_size].copy_from_slice(metadata.as_slice()); + Ok(()) +} + +fn build_object(obj: &Map, builder: &mut ObjectBuilder) -> Result<(), ArrowError> { + for (key, value) in obj.iter() { + match value { + Value::Null => builder.append_value(key, Variant::Null), + Value::Bool(b) => builder.append_value(key, *b), + Value::Number(n) => { + let v: Variant = n.try_into()?; + builder.append_value(key, v) + } + Value::String(s) => builder.append_value(key, s.as_str()), + Value::Array(_) | Value::Object(_) => { + todo!("Nesting within objects unsupported right now."); + } + } + } + Ok(()) +} + +fn build_list(arr: &Vec, builder: &mut ListBuilder) -> Result<(), ArrowError> { + for val in arr { + match val { + Value::Null => builder.append_value(Variant::Null), + Value::Bool(b) => builder.append_value(*b), + Value::Number(n) => { + let v: Variant = n.try_into()?; + builder.append_value(v) + } + Value::String(s) => builder.append_value(s.as_str()), + Value::Array(arr) => { + let mut list_builder = builder.new_list(); + build_list(arr, &mut list_builder)?; + list_builder.finish() + } + Value::Object(obj) => { + let mut obj_builder = builder.new_object(); + build_object(obj, &mut obj_builder)?; + obj_builder.finish(); + } + } + } + Ok(()) +} + +fn build_json(json: &Value, builder: &mut VariantBuilder) -> Result<(), ArrowError> { + match json { + Value::Null => builder.append_value(Variant::Null), + Value::Bool(b) => builder.append_value(*b), + Value::Number(n) => { + let v: Variant = n.try_into()?; + builder.append_value(v) + } + Value::String(s) => builder.append_value(s.as_str()), + Value::Array(arr) => { + let mut list_builder = builder.new_list(); + build_list(arr, &mut list_builder)?; + list_builder.finish(); + } + Value::Object(obj) => { + let mut obj_builder = builder.new_object(); + build_object(obj, &mut obj_builder)?; + obj_builder.finish(); + } + }; + Ok(()) +} diff --git a/parquet-variant/src/lib.rs b/parquet-variant/src/lib.rs index 8ce3008655d4..977a19522b73 100644 --- a/parquet-variant/src/lib.rs +++ b/parquet-variant/src/lib.rs @@ -33,10 +33,14 @@ mod decoder; mod variant; // TODO: dead code removal mod builder; +mod from_json; mod to_json; #[allow(dead_code)] mod utils; +mod variant_buffer_manager; pub use builder::*; +pub use from_json::json_to_variant; pub use to_json::{variant_to_json, variant_to_json_string, variant_to_json_value}; pub use variant::*; +pub use variant_buffer_manager::VariantBufferManager; diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index d1a34018a158..ae5a862dee83 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -30,6 +30,8 @@ use chrono::{DateTime, NaiveDate, NaiveDateTime, Utc}; mod list; mod metadata; mod object; +use rust_decimal::prelude::*; +use serde_json::Number; const MAX_SHORT_STRING_BYTES: usize = 0x3F; @@ -1031,6 +1033,57 @@ impl From for Variant<'_, '_> { } } +impl TryFrom<&Number> for Variant<'_, '_> { + type Error = ArrowError; + + fn try_from(n: &Number) -> Result { + if n.is_i64() { + // Find minimum Integer width to fit + let i = n.as_i64().unwrap(); + if i as i8 as i64 == i { + Ok((i as i8).into()) + } else if i as i16 as i64 == i { + Ok((i as i16).into()) + } else if i as i32 as i64 == i { + Ok((i as i32).into()) + } else { + Ok(i.into()) + } + } else { + // Try decimal + // TODO: Replace with custom decimal parsing as the rust_decimal library only supports + // a max unscaled value of 2^96. + match Decimal::from_str_exact(n.as_str()) { + Ok(dec) => { + let unscaled: i128 = dec.mantissa(); + let scale = dec.scale() as u8; + if unscaled.abs() <= decoder::MAX_UNSCALED_DECIMAL_4 as i128 + && scale <= decoder::MAX_PRECISION_DECIMAL_4 + { + (unscaled as i32, scale).try_into() + } else if unscaled.abs() <= decoder::MAX_UNSCALED_DECIMAL_8 as i128 + && scale <= decoder::MAX_PRECISION_DECIMAL_8 + { + (unscaled as i64, scale).try_into() + } else { + (unscaled, scale).try_into() + } + } + Err(_) => { + // Try double + match n.as_f64() { + Some(f) => return Ok(f.into()), + None => Err(ArrowError::InvalidArgumentError(format!( + "Failed to parse {} as number", + n.as_str() + ))), + }? + } + } + } + } +} + impl From for Variant<'_, '_> { fn from(value: f32) -> Self { Variant::Float(value) diff --git a/parquet-variant/src/variant_buffer_manager.rs b/parquet-variant/src/variant_buffer_manager.rs new file mode 100644 index 000000000000..e3084957cc65 --- /dev/null +++ b/parquet-variant/src/variant_buffer_manager.rs @@ -0,0 +1,26 @@ +use arrow_schema::ArrowError; + +pub trait VariantBufferManager { + /// Returns the slice where value needs to be written to. This method may be called several + /// times during the construction of a new `value` field in a variant. The implementation must + /// make sure that on every call, all the data written to the value buffer so far are preserved. + fn borrow_value_buffer(&mut self) -> &mut [u8]; + + /// Returns the slice where the variant metadata needs to be written to. This method may be + /// called several times during the construction of a new `metadata` field in a variant. The + /// implementation must make sure that on every call, all the data written to the metadata + /// buffer so far are preserved. + fn borrow_metadata_buffer(&mut self) -> &mut [u8]; + + /// Ensures that the next call to `borrow_value_buffer` returns a slice having at least `size` + /// bytes. Also ensures that the value bytes written so far are persisted - this means that + /// if `borrow_value_buffer` is to return a new buffer from the next call onwards, the new + /// buffer must have the contents of the old value buffer. + fn ensure_value_buffer_size(&mut self, size: usize) -> Result<(), ArrowError>; + + /// Ensures that the next call to `borrow_metadata_buffer` returns a slice having at least + /// `size` bytes. Also ensures that the value metadata written so far are persisted - this means + /// that if `borrow_metadata_buffer` is to return a new buffer from the next call onwards, the + /// new buffer must have the contents of the old metadata buffer. + fn ensure_metadata_buffer_size(&mut self, size: usize) -> Result<(), ArrowError>; +} diff --git a/parquet-variant/tests/test_json_to_variant.rs b/parquet-variant/tests/test_json_to_variant.rs new file mode 100644 index 000000000000..da476a85c634 --- /dev/null +++ b/parquet-variant/tests/test_json_to_variant.rs @@ -0,0 +1,356 @@ +use arrow_schema::ArrowError; +use parquet_variant::{json_to_variant, VariantBufferManager}; + +pub struct SampleVariantBufferManager { + pub value_buffer: Box<[u8]>, + pub metadata_buffer: Box<[u8]>, +} + +impl VariantBufferManager for SampleVariantBufferManager { + fn borrow_value_buffer(&mut self) -> &mut [u8] { + &mut self.value_buffer + } + + fn ensure_value_buffer_size(&mut self, size: usize) -> Result<(), ArrowError> { + let cur_len = self.value_buffer.len(); + if size > cur_len { + // Reallocate larger buffer + let mut new_buffer = vec![0u8; size].into_boxed_slice(); + new_buffer[..cur_len].copy_from_slice(&self.value_buffer); + self.value_buffer = new_buffer; + } + Ok(()) + } + + fn borrow_metadata_buffer(&mut self) -> &mut [u8] { + &mut self.metadata_buffer + } + + fn ensure_metadata_buffer_size(&mut self, size: usize) -> Result<(), ArrowError> { + let cur_len = self.metadata_buffer.len(); + if size > cur_len { + // Reallocate larger buffer + let mut new_buffer = vec![0u8; size].into_boxed_slice(); + new_buffer[..cur_len].copy_from_slice(&self.metadata_buffer); + self.metadata_buffer = new_buffer; + } + Ok(()) + } +} + +#[test] +fn test_json_to_variant() -> Result<(), ArrowError> { + fn compare_results( + json: &str, + expected_value: &[u8], + expected_metadata: &[u8], + ) -> Result<(), ArrowError> { + let json = json; + let mut value_size: usize = 0; + let mut metadata_size: usize = 0; + + let mut variant_buffer_manager = SampleVariantBufferManager { + value_buffer: vec![0u8; 1].into_boxed_slice(), + metadata_buffer: vec![0u8; 1].into_boxed_slice(), + }; + json_to_variant( + json, + &mut variant_buffer_manager, + &mut value_size, + &mut metadata_size, + )?; + let computed_value_slize: &[u8] = &*variant_buffer_manager.value_buffer; + let computed_metadata_slize: &[u8] = &*variant_buffer_manager.metadata_buffer; + assert_eq!(&computed_value_slize[..value_size], expected_value); + assert_eq!(&computed_metadata_slize[..metadata_size], expected_metadata); + Ok(()) + } + + let empty_metadata: &[u8] = &[1u8, 0u8, 0u8]; + // Null + compare_results("null", &[0u8], empty_metadata)?; + // Bool + compare_results("true", &[4u8], empty_metadata)?; + compare_results("false", &[8u8], empty_metadata)?; + // Integers + compare_results(" 127 ", &[12u8, 127u8], empty_metadata)?; + compare_results(" -128 ", &[12u8, 128u8], empty_metadata)?; + compare_results(" 27134 ", &[16u8, 254u8, 105u8], empty_metadata)?; + compare_results( + " -32767431 ", + &[20u8, 57u8, 2u8, 12u8, 254u8], + empty_metadata, + )?; + compare_results( + "92842754201389", + &[24u8, 45u8, 87u8, 98u8, 163u8, 112u8, 84u8, 0u8, 0u8], + empty_metadata, + )?; + // Decimals + // Decimal 4 + compare_results("1.23", &[32u8, 2u8, 123u8, 0u8, 0u8, 0u8], empty_metadata)?; + compare_results( + "99999999.9", + &[32u8, 1u8, 0xffu8, 0xc9u8, 0x9au8, 0x3bu8], + empty_metadata, + )?; + compare_results( + "-99999999.9", + &[32u8, 1u8, 1u8, 0x36u8, 0x65u8, 0xc4u8], + empty_metadata, + )?; + compare_results( + "0.999999999", + &[32u8, 9u8, 0xffu8, 0xc9u8, 0x9au8, 0x3bu8], + empty_metadata, + )?; + compare_results("0.000000001", &[32u8, 9u8, 1u8, 0, 0, 0], empty_metadata)?; + compare_results( + "-0.999999999", + &[32u8, 9u8, 1u8, 0x36u8, 0x65u8, 0xc4u8], + empty_metadata, + )?; + compare_results( + "-0.000000001", + &[32u8, 9u8, 0xffu8, 0xffu8, 0xffu8, 0xffu8], + empty_metadata, + )?; + // Decimal 8 + compare_results( + "999999999.0", + &[36u8, 1u8, 0xf6u8, 0xe3u8, 0x0bu8, 0x54u8, 0x02u8, 0, 0, 0], + empty_metadata, + )?; + compare_results( + "-999999999.0", + &[ + 36u8, 1u8, 0x0au8, 0x1cu8, 0xf4u8, 0xabu8, 0xfdu8, 0xffu8, 0xffu8, 0xffu8, + ], + empty_metadata, + )?; + compare_results( + "0.999999999999999999", + &[ + 36u8, 18u8, 0xffu8, 0xffu8, 0x63u8, 0xa7u8, 0xb3u8, 0xb6u8, 0xe0u8, 0x0du8, + ], + empty_metadata, + )?; + compare_results( + "-9999999999999999.99", + &[ + 36u8, 2u8, 0x01u8, 0x00u8, 0x9cu8, 0x58u8, 0x4cu8, 0x49u8, 0x1fu8, 0xf2u8, + ], + empty_metadata, + )?; + // Decimal 16 + compare_results( + "9999999999999999999", // integer larger than i64 + &[ + 40u8, 0u8, 0xffu8, 0xffu8, 0xe7u8, 0x89u8, 4u8, 0x23u8, 0xc7u8, 0x8au8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, + ], + empty_metadata, + )?; + compare_results( + "0.9999999999999999999", + &[ + 40u8, 19u8, 0xffu8, 0xffu8, 0xe7u8, 0x89u8, 4u8, 0x23u8, 0xc7u8, 0x8au8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, + ], + empty_metadata, + )?; + compare_results( + "79228162514264337593543950335", // 2 ^ 96 - 1 + &[ + 40u8, 0u8, 0xffu8, 0xffu8, 0xffu8, 0xffu8, 0xffu8, 0xffu8, 0xffu8, 0xffu8, 0xffu8, + 0xffu8, 0xffu8, 0xffu8, 0u8, 0u8, 0u8, 0u8, + ], + empty_metadata, + )?; + compare_results( + "7.9228162514264337593543950335", // using scale higher than this falls into double + // since the max scale is 28. + &[ + 40u8, 28u8, 0xffu8, 0xffu8, 0xffu8, 0xffu8, 0xffu8, 0xffu8, 0xffu8, 0xffu8, 0xffu8, + 0xffu8, 0xffu8, 0xffu8, 0u8, 0u8, 0u8, 0u8, + ], + empty_metadata, + )?; + // Double + { + let mut arr = [28u8; 9]; + arr[1..].copy_from_slice(&0.79228162514264337593543950335f64.to_le_bytes()); + compare_results("0.79228162514264337593543950335", &arr, empty_metadata)?; + } + compare_results( + "15e-1", + &[28u8, 0, 0, 0, 0, 0, 0, 0xf8, 0x3fu8], + empty_metadata, + )?; + compare_results( + "-15e-1", + &[28u8, 0, 0, 0, 0, 0, 0, 0xf8, 0xBfu8], + empty_metadata, + )?; + + // short strings + // random short string + compare_results( + "\"harsh\"", + &[21u8, 104u8, 97u8, 114u8, 115u8, 104u8], + empty_metadata, + )?; + // longest short string + let mut expected = [97u8; 64]; + expected[0] = 253u8; + compare_results( + &format!( + "\"{}\"", + std::iter::repeat('a').take(63).collect::() + ), + &expected, + empty_metadata, + )?; + // long strings + let mut expected = [97u8; 69]; + expected[..5].copy_from_slice(&[64u8, 64u8, 0, 0, 0]); + compare_results( + &format!( + "\"{}\"", + std::iter::repeat('a').take(64).collect::() + ), + &expected, + empty_metadata, + )?; + let mut expected = [98u8; 100005]; + expected[0] = 64u8; + expected[1..5].copy_from_slice(&(100000 as u32).to_le_bytes()); + compare_results( + &format!( + "\"{}\"", + std::iter::repeat('b').take(100000).collect::() + ), + &expected, + empty_metadata, + )?; + + // arrays + // u8 offset + compare_results( + "[127, 128, -32767431]", + &[ + 3u8, 3u8, 0u8, 2u8, 5u8, 10u8, 12u8, 127u8, 16u8, 128u8, 0u8, 20u8, 57u8, 2u8, 12u8, + 254u8, + ], + empty_metadata, + )?; + compare_results( + "[[\"a\", null, true, 4], 128, false]", + &[ + 3u8, 3u8, 0u8, 13u8, 16u8, 17u8, 3u8, 4u8, 0u8, 2u8, 3u8, 4u8, 6u8, 5u8, 97u8, 0u8, + 4u8, 12u8, 4u8, 16u8, 128u8, 0u8, 8u8, + ], + empty_metadata, + )?; + compare_results( + "[{\"age\": 32}, 128, false]", + &[ + 3u8, 3u8, 0u8, 7u8, 10u8, 11u8, 2u8, 1u8, 0u8, 0u8, 2u8, 12u8, 32u8, 16u8, 128u8, 0u8, + 8u8, + ], + &[1u8, 1u8, 0u8, 3u8, 97u8, 103u8, 101u8], + )?; + // u16 offset - 128 i8's + 1 "true" = 257 bytes + compare_results( + &format!( + "[{} true]", + std::iter::repeat("1, ").take(128).collect::() + ), + &[ + 7u8, 129u8, 0, 0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, + 0, 24, 0, 26, 0, 28, 0, 30, 0, 32, 0, 34, 0, 36, 0, 38, 0, 40, 0, 42, 0, 44, 0, 46, 0, + 48, 0, 50, 0, 52, 0, 54, 0, 56, 0, 58, 0, 60, 0, 62, 0, 64, 0, 66, 0, 68, 0, 70, 0, 72, + 0, 74, 0, 76, 0, 78, 0, 80, 0, 82, 0, 84, 0, 86, 0, 88, 0, 90, 0, 92, 0, 94, 0, 96, 0, + 98, 0, 100, 0, 102, 0, 104, 0, 106, 0, 108, 0, 110, 0, 112, 0, 114, 0, 116, 0, 118, 0, + 120, 0, 122, 0, 124, 0, 126, 0, 128, 0, 130, 0, 132, 0, 134, 0, 136, 0, 138, 0, 140, 0, + 142, 0, 144, 0, 146, 0, 148, 0, 150, 0, 152, 0, 154, 0, 156, 0, 158, 0, 160, 0, 162, 0, + 164, 0, 166, 0, 168, 0, 170, 0, 172, 0, 174, 0, 176, 0, 178, 0, 180, 0, 182, 0, 184, 0, + 186, 0, 188, 0, 190, 0, 192, 0, 194, 0, 196, 0, 198, 0, 200, 0, 202, 0, 204, 0, 206, 0, + 208, 0, 210, 0, 212, 0, 214, 0, 216, 0, 218, 0, 220, 0, 222, 0, 224, 0, 226, 0, 228, 0, + 230, 0, 232, 0, 234, 0, 236, 0, 238, 0, 240, 0, 242, 0, 244, 0, 246, 0, 248, 0, 250, 0, + 252, 0, 254, 0, 0, 1, 1, 1, // Final offset is 257 + 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, + 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, + 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, + 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, + 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, + 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, + 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, + 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, + 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, + 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, + 12, 1, 12, 1, 12, 1, 4u8, + ], + empty_metadata, + )?; + // verify u24, and large_size + { + let null_array: [u8; 513] = std::array::from_fn(|i| match i { + 0 => 3u8, + 1 => 255u8, + j => { + if j <= 257 { + (j - 2) as u8 + } else { + 0u8 + } + } + }); + // 256 elements => large size + // each element is an array of 256 nulls => u24 offset + let mut whole_array: [u8; 5 + 3 * 257 + 256 * 513] = std::array::from_fn(|i| match i { + 0 => 0x1Bu8, + 1 => 0u8, + 2 => 1u8, + 3 => 0u8, + 4 => 0u8, + _ => 0, + }); + for i in 0..257 { + let cur_idx = 5 + i * 3 as usize; + let cur_offset: usize = i * 513; + whole_array[cur_idx..cur_idx + 3].copy_from_slice(&cur_offset.to_le_bytes()[..3]); + if i != 256 { + let abs_offset = 5 + 3 * 257 + cur_offset; + whole_array[abs_offset..abs_offset + 513].copy_from_slice(&null_array); + } + } + let intermediate = format!("[{}]", vec!["null"; 255].join(", ")); + let json = format!("[{}]", vec![intermediate; 256].join(", ")); + compare_results(json.as_str(), &whole_array, empty_metadata)?; + } + + // objects + compare_results( + "{\"b\": 2, \"a\": 1, \"a\": 3}", + &[2u8, 2u8, 1u8, 0u8, 2u8, 0u8, 4u8, 12u8, 2u8, 12u8, 3u8], + &[1, 2, 0, 1, 2, 98u8, 97u8], + )?; + // TODO: verify different offset_size, id_size, is_large values, nesting and more object + // tests in general + // compare_results( + // "{\"numbers\": [4, -3e0, 1.001], \"null\": null, \"booleans\": [true, false]}", + // &[ + // 2u8, 3u8, 2u8, 1u8, 0u8, 24u8, 23u8, 0u8, 31u8, 3u8, 3u8, 0u8, 2u8, 11u8, 17u8, + // 12u8, 4u8, 28u8, 0, 0, 0, 0, 0, 0, 0x08, 0xc0, 32u8, 3, 0xe9, 0x03, 0, 0, 0, 3u8, + // 2u8, 0u8, 1u8, 2u8, 4u8, 8u8, + // ], + // &[ + // 1u8, 3u8, 0u8, 7u8, 11u8, 19u8, 0x6eu8, 0x75u8, 0x6du8, 0x62u8, 0x65u8, 0x72u8, + // 0x73u8, 0x6eu8, 0x75u8, 0x6cu8, 0x6cu8, 0x62u8, 0x6fu8, 0x6fu8, 0x6cu8, 0x65u8, + // 0x61u8, 0x6eu8, 0x73u8, + // ], + // )?; + + Ok(()) +} From a946ac69ecc78cf5e6799e6c1e21f69c28bd49e0 Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Wed, 25 Jun 2025 14:50:48 -0700 Subject: [PATCH 02/35] comment fix --- .../examples/variant_from_json_examples.rs | 25 +++++++---------- parquet-variant/src/from_json.rs | 27 +++++++++---------- parquet-variant/src/variant_buffer_manager.rs | 20 +++++++------- parquet-variant/tests/test_json_to_variant.rs | 13 +++------ 4 files changed, 35 insertions(+), 50 deletions(-) diff --git a/parquet-variant/examples/variant_from_json_examples.rs b/parquet-variant/examples/variant_from_json_examples.rs index 43d9bf5281dc..07d81338c23b 100644 --- a/parquet-variant/examples/variant_from_json_examples.rs +++ b/parquet-variant/examples/variant_from_json_examples.rs @@ -23,15 +23,12 @@ use parquet_variant::{ VariantBufferManager, }; -/// The caller must provide an object implementing the `VariantBufferManager` trait to the library. -/// This allows the library to write the constructed variant to buffers provided by the caller. -/// This way, the caller has direct control over the output buffers. -pub struct SampleVariantBufferManager { +pub struct SampleBoxBasedVariantBufferManager { pub value_buffer: Box<[u8]>, pub metadata_buffer: Box<[u8]>, } -impl VariantBufferManager for SampleVariantBufferManager { +impl VariantBufferManager for SampleBoxBasedVariantBufferManager { #[inline(always)] fn borrow_value_buffer(&mut self) -> &mut [u8] { &mut self.value_buffer @@ -66,7 +63,10 @@ impl VariantBufferManager for SampleVariantBufferManager { } fn main() -> Result<(), Box> { - let mut variant_buffer_manager = SampleVariantBufferManager { + // The caller must provide an object implementing the `VariantBufferManager` trait to the library. + // This allows the library to write the constructed variant to buffers provided by the caller. + // This way, the caller has direct control over the output buffers. + let mut variant_buffer_manager = SampleBoxBasedVariantBufferManager { value_buffer: vec![0u8; 1].into_boxed_slice(), metadata_buffer: vec![0u8; 1].into_boxed_slice(), }; @@ -74,18 +74,11 @@ fn main() -> Result<(), Box> { let person_string = "{\"name\":\"Alice\", \"age\":30, ".to_string() + "\"email\":\"alice@example.com\", \"is_active\": true, \"score\": 95.7," + "\"additional_info\": null}"; - let mut value_size = 0usize; - let mut metadata_size = 0usize; - json_to_variant( - &person_string, - &mut variant_buffer_manager, - &mut value_size, - &mut metadata_size, - )?; + let (metadata_size, value_size) = json_to_variant(&person_string, &mut variant_buffer_manager)?; let variant = parquet_variant::Variant::try_new( - &variant_buffer_manager.metadata_buffer, - &variant_buffer_manager.value_buffer, + &variant_buffer_manager.metadata_buffer[..metadata_size], + &variant_buffer_manager.value_buffer[..value_size], )?; let json_string = variant_to_json_string(&variant)?; diff --git a/parquet-variant/src/from_json.rs b/parquet-variant/src/from_json.rs index d70eba517c22..4ef9c14d393e 100644 --- a/parquet-variant/src/from_json.rs +++ b/parquet-variant/src/from_json.rs @@ -4,31 +4,30 @@ use arrow_schema::ArrowError; use serde_json::{Map, Value}; /// Eventually, internal writes should also be performed using VariantBufferManager instead of -/// ValueBuffer and MetadataBuffer so the caller has control of the memory -pub fn json_to_variant( +/// ValueBuffer and MetadataBuffer so the caller has control of the memory. +/// Returns a pair +pub fn json_to_variant<'a, T: VariantBufferManager>( json: &str, - variant_buffer_manager: &mut T, - value_size: &mut usize, - metadata_size: &mut usize, -) -> Result<(), ArrowError> { + variant_buffer_manager: &'a mut T, +) -> Result<(usize, usize), ArrowError> { let mut builder = VariantBuilder::new(); let json: Value = serde_json::from_str(json) .map_err(|e| ArrowError::InvalidArgumentError(format!("JSON format error: {}", e)))?; build_json(&json, &mut builder)?; let (metadata, value) = builder.finish(); - *value_size = value.len(); - *metadata_size = metadata.len(); + let value_size = value.len(); + let metadata_size = metadata.len(); // Write to caller's buffers - Remove this when the library internally writes to the caller's // buffers anyway - variant_buffer_manager.ensure_value_buffer_size(*value_size)?; - variant_buffer_manager.ensure_metadata_buffer_size(*metadata_size)?; - let caller_value_buffer = variant_buffer_manager.borrow_value_buffer(); - caller_value_buffer[..*value_size].copy_from_slice(value.as_slice()); + variant_buffer_manager.ensure_metadata_buffer_size(metadata_size)?; + variant_buffer_manager.ensure_value_buffer_size(value_size)?; let caller_metadata_buffer = variant_buffer_manager.borrow_metadata_buffer(); - caller_metadata_buffer[..*metadata_size].copy_from_slice(metadata.as_slice()); - Ok(()) + caller_metadata_buffer[..metadata_size].copy_from_slice(metadata.as_slice()); + let caller_value_buffer = variant_buffer_manager.borrow_value_buffer(); + caller_value_buffer[..value_size].copy_from_slice(value.as_slice()); + Ok((metadata_size, value_size)) } fn build_object(obj: &Map, builder: &mut ObjectBuilder) -> Result<(), ArrowError> { diff --git a/parquet-variant/src/variant_buffer_manager.rs b/parquet-variant/src/variant_buffer_manager.rs index e3084957cc65..6b7b4e6a2a33 100644 --- a/parquet-variant/src/variant_buffer_manager.rs +++ b/parquet-variant/src/variant_buffer_manager.rs @@ -1,26 +1,26 @@ use arrow_schema::ArrowError; pub trait VariantBufferManager { - /// Returns the slice where value needs to be written to. This method may be called several - /// times during the construction of a new `value` field in a variant. The implementation must - /// make sure that on every call, all the data written to the value buffer so far are preserved. - fn borrow_value_buffer(&mut self) -> &mut [u8]; - /// Returns the slice where the variant metadata needs to be written to. This method may be /// called several times during the construction of a new `metadata` field in a variant. The /// implementation must make sure that on every call, all the data written to the metadata /// buffer so far are preserved. fn borrow_metadata_buffer(&mut self) -> &mut [u8]; - /// Ensures that the next call to `borrow_value_buffer` returns a slice having at least `size` - /// bytes. Also ensures that the value bytes written so far are persisted - this means that - /// if `borrow_value_buffer` is to return a new buffer from the next call onwards, the new - /// buffer must have the contents of the old value buffer. - fn ensure_value_buffer_size(&mut self, size: usize) -> Result<(), ArrowError>; + /// Returns the slice where value needs to be written to. This method may be called several + /// times during the construction of a new `value` field in a variant. The implementation must + /// make sure that on every call, all the data written to the value buffer so far are preserved. + fn borrow_value_buffer(&mut self) -> &mut [u8]; /// Ensures that the next call to `borrow_metadata_buffer` returns a slice having at least /// `size` bytes. Also ensures that the value metadata written so far are persisted - this means /// that if `borrow_metadata_buffer` is to return a new buffer from the next call onwards, the /// new buffer must have the contents of the old metadata buffer. fn ensure_metadata_buffer_size(&mut self, size: usize) -> Result<(), ArrowError>; + + /// Ensures that the next call to `borrow_value_buffer` returns a slice having at least `size` + /// bytes. Also ensures that the value bytes written so far are persisted - this means that + /// if `borrow_value_buffer` is to return a new buffer from the next call onwards, the new + /// buffer must have the contents of the old value buffer. + fn ensure_value_buffer_size(&mut self, size: usize) -> Result<(), ArrowError>; } diff --git a/parquet-variant/tests/test_json_to_variant.rs b/parquet-variant/tests/test_json_to_variant.rs index da476a85c634..6cb84149100a 100644 --- a/parquet-variant/tests/test_json_to_variant.rs +++ b/parquet-variant/tests/test_json_to_variant.rs @@ -46,23 +46,16 @@ fn test_json_to_variant() -> Result<(), ArrowError> { expected_metadata: &[u8], ) -> Result<(), ArrowError> { let json = json; - let mut value_size: usize = 0; - let mut metadata_size: usize = 0; let mut variant_buffer_manager = SampleVariantBufferManager { value_buffer: vec![0u8; 1].into_boxed_slice(), metadata_buffer: vec![0u8; 1].into_boxed_slice(), }; - json_to_variant( - json, - &mut variant_buffer_manager, - &mut value_size, - &mut metadata_size, - )?; - let computed_value_slize: &[u8] = &*variant_buffer_manager.value_buffer; + let (metadata_size, value_size) = json_to_variant(json, &mut variant_buffer_manager)?; let computed_metadata_slize: &[u8] = &*variant_buffer_manager.metadata_buffer; - assert_eq!(&computed_value_slize[..value_size], expected_value); + let computed_value_slize: &[u8] = &*variant_buffer_manager.value_buffer; assert_eq!(&computed_metadata_slize[..metadata_size], expected_metadata); + assert_eq!(&computed_value_slize[..value_size], expected_value); Ok(()) } From bca3b812e19bdc12b46f53311ceffbfd0b6e63e6 Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Wed, 25 Jun 2025 15:12:22 -0700 Subject: [PATCH 03/35] Added another sample buffer managger --- .../examples/variant_from_json_examples.rs | 79 +++++---------- parquet-variant/src/lib.rs | 4 +- parquet-variant/src/variant_buffer_manager.rs | 98 +++++++++++++++++++ 3 files changed, 127 insertions(+), 54 deletions(-) diff --git a/parquet-variant/examples/variant_from_json_examples.rs b/parquet-variant/examples/variant_from_json_examples.rs index 07d81338c23b..24b489405706 100644 --- a/parquet-variant/examples/variant_from_json_examples.rs +++ b/parquet-variant/examples/variant_from_json_examples.rs @@ -17,68 +17,22 @@ //! Example showing how to convert Variant values to JSON -use arrow_schema::ArrowError; use parquet_variant::{ json_to_variant, variant_to_json, variant_to_json_string, variant_to_json_value, - VariantBufferManager, + SampleBoxBasedVariantBufferManager, SampleVecBasedVariantBufferManager, VariantBufferManager, }; -pub struct SampleBoxBasedVariantBufferManager { - pub value_buffer: Box<[u8]>, - pub metadata_buffer: Box<[u8]>, -} - -impl VariantBufferManager for SampleBoxBasedVariantBufferManager { - #[inline(always)] - fn borrow_value_buffer(&mut self) -> &mut [u8] { - &mut self.value_buffer - } - - fn ensure_value_buffer_size(&mut self, size: usize) -> Result<(), ArrowError> { - let cur_len = self.value_buffer.len(); - if size > cur_len { - // Reallocate larger buffer - let mut new_buffer = vec![0u8; size].into_boxed_slice(); - new_buffer[..cur_len].copy_from_slice(&self.value_buffer); - self.value_buffer = new_buffer; - } - Ok(()) - } - - #[inline(always)] - fn borrow_metadata_buffer(&mut self) -> &mut [u8] { - &mut self.metadata_buffer - } - - fn ensure_metadata_buffer_size(&mut self, size: usize) -> Result<(), ArrowError> { - let cur_len = self.metadata_buffer.len(); - if size > cur_len { - // Reallocate larger buffer - let mut new_buffer = vec![0u8; size].into_boxed_slice(); - new_buffer[..cur_len].copy_from_slice(&self.metadata_buffer); - self.metadata_buffer = new_buffer; - } - Ok(()) - } -} - -fn main() -> Result<(), Box> { - // The caller must provide an object implementing the `VariantBufferManager` trait to the library. - // This allows the library to write the constructed variant to buffers provided by the caller. - // This way, the caller has direct control over the output buffers. - let mut variant_buffer_manager = SampleBoxBasedVariantBufferManager { - value_buffer: vec![0u8; 1].into_boxed_slice(), - metadata_buffer: vec![0u8; 1].into_boxed_slice(), - }; - +fn from_json_example( + variant_buffer_manager: &mut T, +) -> Result<(), Box> { let person_string = "{\"name\":\"Alice\", \"age\":30, ".to_string() + "\"email\":\"alice@example.com\", \"is_active\": true, \"score\": 95.7," + "\"additional_info\": null}"; - let (metadata_size, value_size) = json_to_variant(&person_string, &mut variant_buffer_manager)?; + let (metadata_size, value_size) = json_to_variant(&person_string, variant_buffer_manager)?; let variant = parquet_variant::Variant::try_new( - &variant_buffer_manager.metadata_buffer[..metadata_size], - &variant_buffer_manager.value_buffer[..value_size], + &variant_buffer_manager.get_immutable_metadata_buffer()[..metadata_size], + &variant_buffer_manager.get_immutable_value_buffer()[..value_size], )?; let json_string = variant_to_json_string(&variant)?; @@ -96,3 +50,22 @@ fn main() -> Result<(), Box> { Ok(()) } + +fn main() -> Result<(), Box> { + // The caller must provide an object implementing the `VariantBufferManager` trait to the library. + // This allows the library to write the constructed variant to buffers provided by the caller. + // This way, the caller has direct control over the output buffers. + let mut box_based_buffer_manager = SampleBoxBasedVariantBufferManager { + value_buffer: vec![0u8; 1].into_boxed_slice(), + metadata_buffer: vec![0u8; 1].into_boxed_slice(), + }; + + let mut vec_based_buffer_manager = SampleVecBasedVariantBufferManager { + value_buffer: vec![0u8; 1], + metadata_buffer: vec![0u8; 1], + }; + + from_json_example(&mut box_based_buffer_manager)?; + from_json_example(&mut vec_based_buffer_manager)?; + Ok(()) +} diff --git a/parquet-variant/src/lib.rs b/parquet-variant/src/lib.rs index 977a19522b73..4b732bbc2dca 100644 --- a/parquet-variant/src/lib.rs +++ b/parquet-variant/src/lib.rs @@ -43,4 +43,6 @@ pub use builder::*; pub use from_json::json_to_variant; pub use to_json::{variant_to_json, variant_to_json_string, variant_to_json_value}; pub use variant::*; -pub use variant_buffer_manager::VariantBufferManager; +pub use variant_buffer_manager::{ + SampleBoxBasedVariantBufferManager, SampleVecBasedVariantBufferManager, VariantBufferManager, +}; diff --git a/parquet-variant/src/variant_buffer_manager.rs b/parquet-variant/src/variant_buffer_manager.rs index 6b7b4e6a2a33..1de79383fb87 100644 --- a/parquet-variant/src/variant_buffer_manager.rs +++ b/parquet-variant/src/variant_buffer_manager.rs @@ -23,4 +23,102 @@ pub trait VariantBufferManager { /// if `borrow_value_buffer` is to return a new buffer from the next call onwards, the new /// buffer must have the contents of the old value buffer. fn ensure_value_buffer_size(&mut self, size: usize) -> Result<(), ArrowError>; + + fn get_immutable_metadata_buffer(&self) -> &[u8] { + unimplemented!("Optional method not implemented in VariantBufferManager") + } + + fn get_immutable_value_buffer(&self) -> &[u8] { + unimplemented!("Optional method not implemented in VariantBufferManager") + } +} + +pub struct SampleBoxBasedVariantBufferManager { + pub value_buffer: Box<[u8]>, + pub metadata_buffer: Box<[u8]>, +} + +impl VariantBufferManager for SampleBoxBasedVariantBufferManager { + #[inline(always)] + fn borrow_value_buffer(&mut self) -> &mut [u8] { + &mut self.value_buffer + } + + fn ensure_value_buffer_size(&mut self, size: usize) -> Result<(), ArrowError> { + let cur_len = self.value_buffer.len(); + if size > cur_len { + // Reallocate larger buffer + let mut new_buffer = vec![0u8; size].into_boxed_slice(); + new_buffer[..cur_len].copy_from_slice(&self.value_buffer); + self.value_buffer = new_buffer; + } + Ok(()) + } + + #[inline(always)] + fn borrow_metadata_buffer(&mut self) -> &mut [u8] { + &mut self.metadata_buffer + } + + fn ensure_metadata_buffer_size(&mut self, size: usize) -> Result<(), ArrowError> { + let cur_len = self.metadata_buffer.len(); + if size > cur_len { + // Reallocate larger buffer + let mut new_buffer = vec![0u8; size].into_boxed_slice(); + new_buffer[..cur_len].copy_from_slice(&self.metadata_buffer); + self.metadata_buffer = new_buffer; + } + Ok(()) + } + + fn get_immutable_metadata_buffer(&self) -> &[u8] { + &self.metadata_buffer + } + + fn get_immutable_value_buffer(&self) -> &[u8] { + &self.value_buffer + } +} + +pub struct SampleVecBasedVariantBufferManager { + pub value_buffer: Vec, + pub metadata_buffer: Vec, +} + +impl VariantBufferManager for SampleVecBasedVariantBufferManager { + #[inline(always)] + fn borrow_value_buffer(&mut self) -> &mut [u8] { + self.value_buffer.as_mut_slice() + } + + fn ensure_value_buffer_size(&mut self, size: usize) -> Result<(), ArrowError> { + let cur_len = self.value_buffer.len(); + if size > cur_len { + // Reallocate larger buffer + self.value_buffer.resize(size, 0); + } + Ok(()) + } + + #[inline(always)] + fn borrow_metadata_buffer(&mut self) -> &mut [u8] { + self.metadata_buffer.as_mut_slice() + } + + fn ensure_metadata_buffer_size(&mut self, size: usize) -> Result<(), ArrowError> { + let cur_len = self.metadata_buffer.len(); + if size > cur_len { + // Reallocate larger buffer + self.metadata_buffer.resize(size, 0); + } + Ok(()) + } + + fn get_immutable_metadata_buffer(&self) -> &[u8] { + self.metadata_buffer.as_slice() + } + + fn get_immutable_value_buffer(&self) -> &[u8] { + self.value_buffer.as_slice() + } } From 339e880151bf2abdd3a1658a2a2b02fbc8a83f84 Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Wed, 25 Jun 2025 15:16:43 -0700 Subject: [PATCH 04/35] minor refactoring --- parquet-variant/src/from_json.rs | 66 ++++++++++++++++---------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/parquet-variant/src/from_json.rs b/parquet-variant/src/from_json.rs index 4ef9c14d393e..c1a200ed5c00 100644 --- a/parquet-variant/src/from_json.rs +++ b/parquet-variant/src/from_json.rs @@ -30,21 +30,26 @@ pub fn json_to_variant<'a, T: VariantBufferManager>( Ok((metadata_size, value_size)) } -fn build_object(obj: &Map, builder: &mut ObjectBuilder) -> Result<(), ArrowError> { - for (key, value) in obj.iter() { - match value { - Value::Null => builder.append_value(key, Variant::Null), - Value::Bool(b) => builder.append_value(key, *b), - Value::Number(n) => { - let v: Variant = n.try_into()?; - builder.append_value(key, v) - } - Value::String(s) => builder.append_value(key, s.as_str()), - Value::Array(_) | Value::Object(_) => { - todo!("Nesting within objects unsupported right now."); - } +fn build_json(json: &Value, builder: &mut VariantBuilder) -> Result<(), ArrowError> { + match json { + Value::Null => builder.append_value(Variant::Null), + Value::Bool(b) => builder.append_value(*b), + Value::Number(n) => { + let v: Variant = n.try_into()?; + builder.append_value(v) } - } + Value::String(s) => builder.append_value(s.as_str()), + Value::Array(arr) => { + let mut list_builder = builder.new_list(); + build_list(arr, &mut list_builder)?; + list_builder.finish(); + } + Value::Object(obj) => { + let mut obj_builder = builder.new_object(); + build_object(obj, &mut obj_builder)?; + obj_builder.finish(); + } + }; Ok(()) } @@ -73,25 +78,20 @@ fn build_list(arr: &Vec, builder: &mut ListBuilder) -> Result<(), ArrowEr Ok(()) } -fn build_json(json: &Value, builder: &mut VariantBuilder) -> Result<(), ArrowError> { - match json { - Value::Null => builder.append_value(Variant::Null), - Value::Bool(b) => builder.append_value(*b), - Value::Number(n) => { - let v: Variant = n.try_into()?; - builder.append_value(v) - } - Value::String(s) => builder.append_value(s.as_str()), - Value::Array(arr) => { - let mut list_builder = builder.new_list(); - build_list(arr, &mut list_builder)?; - list_builder.finish(); - } - Value::Object(obj) => { - let mut obj_builder = builder.new_object(); - build_object(obj, &mut obj_builder)?; - obj_builder.finish(); +fn build_object(obj: &Map, builder: &mut ObjectBuilder) -> Result<(), ArrowError> { + for (key, value) in obj.iter() { + match value { + Value::Null => builder.append_value(key, Variant::Null), + Value::Bool(b) => builder.append_value(key, *b), + Value::Number(n) => { + let v: Variant = n.try_into()?; + builder.append_value(key, v) + } + Value::String(s) => builder.append_value(key, s.as_str()), + Value::Array(_) | Value::Object(_) => { + todo!("Nesting within objects unsupported right now."); + } } - }; + } Ok(()) } From 3c18fdfcd7ca9a27cd9d644237b6f463d84ff146 Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Wed, 25 Jun 2025 16:26:53 -0700 Subject: [PATCH 05/35] incorporated new changes --- parquet-variant/src/from_json.rs | 25 ++++++++++----- parquet-variant/tests/test_json_to_variant.rs | 31 ++++++++++--------- 2 files changed, 35 insertions(+), 21 deletions(-) diff --git a/parquet-variant/src/from_json.rs b/parquet-variant/src/from_json.rs index c1a200ed5c00..04f787b123b5 100644 --- a/parquet-variant/src/from_json.rs +++ b/parquet-variant/src/from_json.rs @@ -23,6 +23,7 @@ pub fn json_to_variant<'a, T: VariantBufferManager>( // buffers anyway variant_buffer_manager.ensure_metadata_buffer_size(metadata_size)?; variant_buffer_manager.ensure_value_buffer_size(value_size)?; + let caller_metadata_buffer = variant_buffer_manager.borrow_metadata_buffer(); caller_metadata_buffer[..metadata_size].copy_from_slice(metadata.as_slice()); let caller_value_buffer = variant_buffer_manager.borrow_value_buffer(); @@ -78,18 +79,28 @@ fn build_list(arr: &Vec, builder: &mut ListBuilder) -> Result<(), ArrowEr Ok(()) } -fn build_object(obj: &Map, builder: &mut ObjectBuilder) -> Result<(), ArrowError> { +fn build_object<'a, 'b>( + obj: &'b Map, + builder: &mut ObjectBuilder<'a, 'b>, +) -> Result<(), ArrowError> { for (key, value) in obj.iter() { match value { - Value::Null => builder.append_value(key, Variant::Null), - Value::Bool(b) => builder.append_value(key, *b), + Value::Null => builder.insert(key, Variant::Null), + Value::Bool(b) => builder.insert(key, *b), Value::Number(n) => { let v: Variant = n.try_into()?; - builder.append_value(key, v) + builder.insert(key, v) + } + Value::String(s) => builder.insert(key, s.as_str()), + Value::Array(arr) => { + let mut list_builder = builder.new_list(key); + build_list(arr, &mut list_builder)?; + list_builder.finish() } - Value::String(s) => builder.append_value(key, s.as_str()), - Value::Array(_) | Value::Object(_) => { - todo!("Nesting within objects unsupported right now."); + Value::Object(obj) => { + let mut obj_builder = builder.new_object(key); + build_object(obj, &mut obj_builder)?; + obj_builder.finish(); } } } diff --git a/parquet-variant/tests/test_json_to_variant.rs b/parquet-variant/tests/test_json_to_variant.rs index 6cb84149100a..a33716fee4a2 100644 --- a/parquet-variant/tests/test_json_to_variant.rs +++ b/parquet-variant/tests/test_json_to_variant.rs @@ -323,27 +323,30 @@ fn test_json_to_variant() -> Result<(), ArrowError> { compare_results(json.as_str(), &whole_array, empty_metadata)?; } - // objects + // // objects compare_results( "{\"b\": 2, \"a\": 1, \"a\": 3}", &[2u8, 2u8, 1u8, 0u8, 2u8, 0u8, 4u8, 12u8, 2u8, 12u8, 3u8], &[1, 2, 0, 1, 2, 98u8, 97u8], )?; + // TODO: verify different offset_size, id_size, is_large values, nesting and more object // tests in general - // compare_results( - // "{\"numbers\": [4, -3e0, 1.001], \"null\": null, \"booleans\": [true, false]}", - // &[ - // 2u8, 3u8, 2u8, 1u8, 0u8, 24u8, 23u8, 0u8, 31u8, 3u8, 3u8, 0u8, 2u8, 11u8, 17u8, - // 12u8, 4u8, 28u8, 0, 0, 0, 0, 0, 0, 0x08, 0xc0, 32u8, 3, 0xe9, 0x03, 0, 0, 0, 3u8, - // 2u8, 0u8, 1u8, 2u8, 4u8, 8u8, - // ], - // &[ - // 1u8, 3u8, 0u8, 7u8, 11u8, 19u8, 0x6eu8, 0x75u8, 0x6du8, 0x62u8, 0x65u8, 0x72u8, - // 0x73u8, 0x6eu8, 0x75u8, 0x6cu8, 0x6cu8, 0x62u8, 0x6fu8, 0x6fu8, 0x6cu8, 0x65u8, - // 0x61u8, 0x6eu8, 0x73u8, - // ], - // )?; + // NOTE: objects and lists are treated as `pending` and are added to the dictionary after scalar + // values. Therefore "null" has ID 0, "numbers" as ID 1 and "booleans" has ID 2. + compare_results( + "{\"numbers\": [4, -3e0, 1.001], \"null\": null, \"booleans\": [true, false]}", + &[ + 2u8, 3u8, 2u8, 0u8, 1u8, 24u8, 23u8, 0u8, 31u8, 3u8, 3u8, 0u8, 2u8, 11u8, 17u8, 12u8, + 4u8, 28u8, 0, 0, 0, 0, 0, 0, 0x08, 0xc0, 32u8, 3, 0xe9, 0x03, 0, 0, 0, 3u8, 2u8, 0u8, + 1u8, 2u8, 4u8, 8u8, + ], + &[ + 1u8, 3u8, 0u8, 4u8, 11u8, 19u8, 0x6eu8, 0x75u8, 0x6cu8, 0x6cu8, 0x6eu8, 0x75u8, 0x6du8, + 0x62u8, 0x65u8, 0x72u8, 0x73u8, 0x62u8, 0x6fu8, 0x6fu8, 0x6cu8, 0x65u8, 0x61u8, 0x6eu8, + 0x73u8, + ], + )?; Ok(()) } From 67a83fe40115a6bcb281d70c24739102246a938c Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Wed, 25 Jun 2025 18:09:07 -0700 Subject: [PATCH 06/35] minor changes --- parquet-variant/src/from_json.rs | 16 +++---- parquet-variant/src/variant.rs | 3 +- parquet-variant/tests/test_json_to_variant.rs | 45 ++----------------- 3 files changed, 10 insertions(+), 54 deletions(-) diff --git a/parquet-variant/src/from_json.rs b/parquet-variant/src/from_json.rs index 04f787b123b5..5b48e53b5742 100644 --- a/parquet-variant/src/from_json.rs +++ b/parquet-variant/src/from_json.rs @@ -6,9 +6,9 @@ use serde_json::{Map, Value}; /// Eventually, internal writes should also be performed using VariantBufferManager instead of /// ValueBuffer and MetadataBuffer so the caller has control of the memory. /// Returns a pair -pub fn json_to_variant<'a, T: VariantBufferManager>( +pub fn json_to_variant( json: &str, - variant_buffer_manager: &'a mut T, + variant_buffer_manager: &mut T, ) -> Result<(usize, usize), ArrowError> { let mut builder = VariantBuilder::new(); let json: Value = serde_json::from_str(json) @@ -54,15 +54,12 @@ fn build_json(json: &Value, builder: &mut VariantBuilder) -> Result<(), ArrowErr Ok(()) } -fn build_list(arr: &Vec, builder: &mut ListBuilder) -> Result<(), ArrowError> { +fn build_list(arr: &[Value], builder: &mut ListBuilder) -> Result<(), ArrowError> { for val in arr { match val { Value::Null => builder.append_value(Variant::Null), Value::Bool(b) => builder.append_value(*b), - Value::Number(n) => { - let v: Variant = n.try_into()?; - builder.append_value(v) - } + Value::Number(n) => builder.append_value(Variant::try_from(n)?), Value::String(s) => builder.append_value(s.as_str()), Value::Array(arr) => { let mut list_builder = builder.new_list(); @@ -87,10 +84,7 @@ fn build_object<'a, 'b>( match value { Value::Null => builder.insert(key, Variant::Null), Value::Bool(b) => builder.insert(key, *b), - Value::Number(n) => { - let v: Variant = n.try_into()?; - builder.insert(key, v) - } + Value::Number(n) => builder.insert(key, Variant::try_from(n)?), Value::String(s) => builder.insert(key, s.as_str()), Value::Array(arr) => { let mut list_builder = builder.new_list(key); diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 2a067fc9c652..2e5c85a9986d 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -947,9 +947,8 @@ impl TryFrom<&Number> for Variant<'_, '_> { type Error = ArrowError; fn try_from(n: &Number) -> Result { - if n.is_i64() { + if let Some(i) = n.as_i64() { // Find minimum Integer width to fit - let i = n.as_i64().unwrap(); if i as i8 as i64 == i { Ok((i as i8).into()) } else if i as i16 as i64 == i { diff --git a/parquet-variant/tests/test_json_to_variant.rs b/parquet-variant/tests/test_json_to_variant.rs index a33716fee4a2..224171f5bd61 100644 --- a/parquet-variant/tests/test_json_to_variant.rs +++ b/parquet-variant/tests/test_json_to_variant.rs @@ -1,42 +1,5 @@ use arrow_schema::ArrowError; -use parquet_variant::{json_to_variant, VariantBufferManager}; - -pub struct SampleVariantBufferManager { - pub value_buffer: Box<[u8]>, - pub metadata_buffer: Box<[u8]>, -} - -impl VariantBufferManager for SampleVariantBufferManager { - fn borrow_value_buffer(&mut self) -> &mut [u8] { - &mut self.value_buffer - } - - fn ensure_value_buffer_size(&mut self, size: usize) -> Result<(), ArrowError> { - let cur_len = self.value_buffer.len(); - if size > cur_len { - // Reallocate larger buffer - let mut new_buffer = vec![0u8; size].into_boxed_slice(); - new_buffer[..cur_len].copy_from_slice(&self.value_buffer); - self.value_buffer = new_buffer; - } - Ok(()) - } - - fn borrow_metadata_buffer(&mut self) -> &mut [u8] { - &mut self.metadata_buffer - } - - fn ensure_metadata_buffer_size(&mut self, size: usize) -> Result<(), ArrowError> { - let cur_len = self.metadata_buffer.len(); - if size > cur_len { - // Reallocate larger buffer - let mut new_buffer = vec![0u8; size].into_boxed_slice(); - new_buffer[..cur_len].copy_from_slice(&self.metadata_buffer); - self.metadata_buffer = new_buffer; - } - Ok(()) - } -} +use parquet_variant::{json_to_variant, SampleVecBasedVariantBufferManager}; #[test] fn test_json_to_variant() -> Result<(), ArrowError> { @@ -47,9 +10,9 @@ fn test_json_to_variant() -> Result<(), ArrowError> { ) -> Result<(), ArrowError> { let json = json; - let mut variant_buffer_manager = SampleVariantBufferManager { - value_buffer: vec![0u8; 1].into_boxed_slice(), - metadata_buffer: vec![0u8; 1].into_boxed_slice(), + let mut variant_buffer_manager = SampleVecBasedVariantBufferManager { + value_buffer: vec![0u8; 1], + metadata_buffer: vec![0u8; 1], }; let (metadata_size, value_size) = json_to_variant(json, &mut variant_buffer_manager)?; let computed_metadata_slize: &[u8] = &*variant_buffer_manager.metadata_buffer; From dede88dcccc7a37db9ab3fb2e325ebe2dbdcb964 Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Wed, 25 Jun 2025 18:10:29 -0700 Subject: [PATCH 07/35] test fix based on recent commit --- parquet-variant/tests/test_json_to_variant.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/parquet-variant/tests/test_json_to_variant.rs b/parquet-variant/tests/test_json_to_variant.rs index 224171f5bd61..120e6928563e 100644 --- a/parquet-variant/tests/test_json_to_variant.rs +++ b/parquet-variant/tests/test_json_to_variant.rs @@ -300,14 +300,14 @@ fn test_json_to_variant() -> Result<(), ArrowError> { compare_results( "{\"numbers\": [4, -3e0, 1.001], \"null\": null, \"booleans\": [true, false]}", &[ - 2u8, 3u8, 2u8, 0u8, 1u8, 24u8, 23u8, 0u8, 31u8, 3u8, 3u8, 0u8, 2u8, 11u8, 17u8, 12u8, + 2u8, 3u8, 2u8, 1u8, 0u8, 24u8, 23u8, 0u8, 31u8, 3u8, 3u8, 0u8, 2u8, 11u8, 17u8, 12u8, 4u8, 28u8, 0, 0, 0, 0, 0, 0, 0x08, 0xc0, 32u8, 3, 0xe9, 0x03, 0, 0, 0, 3u8, 2u8, 0u8, 1u8, 2u8, 4u8, 8u8, ], &[ - 1u8, 3u8, 0u8, 4u8, 11u8, 19u8, 0x6eu8, 0x75u8, 0x6cu8, 0x6cu8, 0x6eu8, 0x75u8, 0x6du8, - 0x62u8, 0x65u8, 0x72u8, 0x73u8, 0x62u8, 0x6fu8, 0x6fu8, 0x6cu8, 0x65u8, 0x61u8, 0x6eu8, - 0x73u8, + 1u8, 3u8, 0u8, 7u8, 11u8, 19u8, 0x6eu8, 0x75u8, 0x6du8, 0x62u8, 0x65u8, 0x72u8, + 0x73u8, 0x6eu8, 0x75u8, 0x6cu8, 0x6cu8, 0x62u8, 0x6fu8, 0x6fu8, 0x6cu8, 0x65u8, + 0x61u8, 0x6eu8, 0x73u8, ], )?; From fa3befcf115313f36a58dd7cbf14fdb87cc3e863 Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Wed, 25 Jun 2025 18:13:56 -0700 Subject: [PATCH 08/35] constant fix --- parquet-variant/src/variant.rs | 8 ++++---- parquet-variant/src/variant/decimal.rs | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 2e5c85a9986d..56d90720c5c3 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -966,12 +966,12 @@ impl TryFrom<&Number> for Variant<'_, '_> { Ok(dec) => { let unscaled: i128 = dec.mantissa(); let scale = dec.scale() as u8; - if unscaled.abs() <= decoder::MAX_UNSCALED_DECIMAL_4 as i128 - && scale <= decoder::MAX_PRECISION_DECIMAL_4 + if unscaled.abs() <= VariantDecimal4::MAX_UNSCALED_VALUE as i128 + && scale <= VariantDecimal4::MAX_PRECISION as u8 { (unscaled as i32, scale).try_into() - } else if unscaled.abs() <= decoder::MAX_UNSCALED_DECIMAL_8 as i128 - && scale <= decoder::MAX_PRECISION_DECIMAL_8 + } else if unscaled.abs() <= VariantDecimal8::MAX_UNSCALED_VALUE as i128 + && scale <= VariantDecimal8::MAX_PRECISION as u8 { (unscaled as i64, scale).try_into() } else { diff --git a/parquet-variant/src/variant/decimal.rs b/parquet-variant/src/variant/decimal.rs index f03b1e1e388d..fed1f03c3a58 100644 --- a/parquet-variant/src/variant/decimal.rs +++ b/parquet-variant/src/variant/decimal.rs @@ -31,8 +31,8 @@ pub struct VariantDecimal4 { } impl VariantDecimal4 { - const MAX_PRECISION: u32 = 9; - const MAX_UNSCALED_VALUE: u32 = 10_u32.pow(Self::MAX_PRECISION) - 1; + pub(crate) const MAX_PRECISION: u32 = 9; + pub(crate) const MAX_UNSCALED_VALUE: u32 = 10_u32.pow(Self::MAX_PRECISION) - 1; pub fn try_new(integer: i32, scale: u8) -> Result { // Validate that scale doesn't exceed precision @@ -73,8 +73,8 @@ pub struct VariantDecimal8 { } impl VariantDecimal8 { - const MAX_PRECISION: u32 = 18; - const MAX_UNSCALED_VALUE: u64 = 10_u64.pow(Self::MAX_PRECISION) - 1; + pub(crate) const MAX_PRECISION: u32 = 18; + pub(crate) const MAX_UNSCALED_VALUE: u64 = 10_u64.pow(Self::MAX_PRECISION) - 1; pub fn try_new(integer: i64, scale: u8) -> Result { // Validate that scale doesn't exceed precision From 38bac591393eace134ecbc1de89d059932be373a Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Wed, 25 Jun 2025 18:14:19 -0700 Subject: [PATCH 09/35] fix --- parquet-variant/src/decoder.rs | 5 ----- 1 file changed, 5 deletions(-) diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs index e1e1eafba660..cb8336b5b88d 100644 --- a/parquet-variant/src/decoder.rs +++ b/parquet-variant/src/decoder.rs @@ -26,11 +26,6 @@ use std::num::TryFromIntError; // Makes the code a bit more readable pub(crate) const VARIANT_VALUE_HEADER_BYTES: usize = 1; -pub(crate) const MAX_UNSCALED_DECIMAL_4: i32 = 999999999; -pub(crate) const MAX_PRECISION_DECIMAL_4: u8 = 9; -pub(crate) const MAX_UNSCALED_DECIMAL_8: i64 = 999999999999999999i64; -pub(crate) const MAX_PRECISION_DECIMAL_8: u8 = 18; - #[derive(Debug, Clone, Copy, PartialEq)] pub enum VariantBasicType { Primitive = 0, From cd530eee81c1e38deecf80e88434afd17d206d59 Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Wed, 25 Jun 2025 23:47:50 -0700 Subject: [PATCH 10/35] addressed comments --- .../examples/variant_from_json_examples.rs | 39 ++++-------- parquet-variant/src/from_json.rs | 4 +- parquet-variant/src/lib.rs | 4 +- parquet-variant/src/variant_buffer_manager.rs | 63 ------------------- parquet-variant/tests/test_json_to_variant.rs | 6 +- 5 files changed, 19 insertions(+), 97 deletions(-) diff --git a/parquet-variant/examples/variant_from_json_examples.rs b/parquet-variant/examples/variant_from_json_examples.rs index 24b489405706..0ccfd68365d0 100644 --- a/parquet-variant/examples/variant_from_json_examples.rs +++ b/parquet-variant/examples/variant_from_json_examples.rs @@ -19,20 +19,26 @@ use parquet_variant::{ json_to_variant, variant_to_json, variant_to_json_string, variant_to_json_value, - SampleBoxBasedVariantBufferManager, SampleVecBasedVariantBufferManager, VariantBufferManager, + SampleVecBasedVariantBufferManager, }; -fn from_json_example( - variant_buffer_manager: &mut T, -) -> Result<(), Box> { +fn main() -> Result<(), Box> { + // The caller must provide an object implementing the `VariantBufferManager` trait to the library. + // This allows the library to write the constructed variant to buffers provided by the caller. + // This way, the caller has direct control over the output buffers. + let mut variant_buffer_manager = SampleVecBasedVariantBufferManager { + value_buffer: vec![0u8; 1], + metadata_buffer: vec![0u8; 1], + }; + let person_string = "{\"name\":\"Alice\", \"age\":30, ".to_string() + "\"email\":\"alice@example.com\", \"is_active\": true, \"score\": 95.7," + "\"additional_info\": null}"; - let (metadata_size, value_size) = json_to_variant(&person_string, variant_buffer_manager)?; + let (metadata_size, value_size) = json_to_variant(&person_string, &mut variant_buffer_manager)?; let variant = parquet_variant::Variant::try_new( - &variant_buffer_manager.get_immutable_metadata_buffer()[..metadata_size], - &variant_buffer_manager.get_immutable_value_buffer()[..value_size], + &variant_buffer_manager.metadata_buffer[..metadata_size], + &variant_buffer_manager.value_buffer[..value_size], )?; let json_string = variant_to_json_string(&variant)?; @@ -50,22 +56,3 @@ fn from_json_example( Ok(()) } - -fn main() -> Result<(), Box> { - // The caller must provide an object implementing the `VariantBufferManager` trait to the library. - // This allows the library to write the constructed variant to buffers provided by the caller. - // This way, the caller has direct control over the output buffers. - let mut box_based_buffer_manager = SampleBoxBasedVariantBufferManager { - value_buffer: vec![0u8; 1].into_boxed_slice(), - metadata_buffer: vec![0u8; 1].into_boxed_slice(), - }; - - let mut vec_based_buffer_manager = SampleVecBasedVariantBufferManager { - value_buffer: vec![0u8; 1], - metadata_buffer: vec![0u8; 1], - }; - - from_json_example(&mut box_based_buffer_manager)?; - from_json_example(&mut vec_based_buffer_manager)?; - Ok(()) -} diff --git a/parquet-variant/src/from_json.rs b/parquet-variant/src/from_json.rs index 5b48e53b5742..86d3ef3747a2 100644 --- a/parquet-variant/src/from_json.rs +++ b/parquet-variant/src/from_json.rs @@ -6,9 +6,9 @@ use serde_json::{Map, Value}; /// Eventually, internal writes should also be performed using VariantBufferManager instead of /// ValueBuffer and MetadataBuffer so the caller has control of the memory. /// Returns a pair -pub fn json_to_variant( +pub fn json_to_variant( json: &str, - variant_buffer_manager: &mut T, + variant_buffer_manager: &mut impl VariantBufferManager, ) -> Result<(usize, usize), ArrowError> { let mut builder = VariantBuilder::new(); let json: Value = serde_json::from_str(json) diff --git a/parquet-variant/src/lib.rs b/parquet-variant/src/lib.rs index 4b732bbc2dca..65a8bdeffcaf 100644 --- a/parquet-variant/src/lib.rs +++ b/parquet-variant/src/lib.rs @@ -43,6 +43,4 @@ pub use builder::*; pub use from_json::json_to_variant; pub use to_json::{variant_to_json, variant_to_json_string, variant_to_json_value}; pub use variant::*; -pub use variant_buffer_manager::{ - SampleBoxBasedVariantBufferManager, SampleVecBasedVariantBufferManager, VariantBufferManager, -}; +pub use variant_buffer_manager::{SampleVecBasedVariantBufferManager, VariantBufferManager}; diff --git a/parquet-variant/src/variant_buffer_manager.rs b/parquet-variant/src/variant_buffer_manager.rs index 1de79383fb87..892db0fd853f 100644 --- a/parquet-variant/src/variant_buffer_manager.rs +++ b/parquet-variant/src/variant_buffer_manager.rs @@ -23,61 +23,6 @@ pub trait VariantBufferManager { /// if `borrow_value_buffer` is to return a new buffer from the next call onwards, the new /// buffer must have the contents of the old value buffer. fn ensure_value_buffer_size(&mut self, size: usize) -> Result<(), ArrowError>; - - fn get_immutable_metadata_buffer(&self) -> &[u8] { - unimplemented!("Optional method not implemented in VariantBufferManager") - } - - fn get_immutable_value_buffer(&self) -> &[u8] { - unimplemented!("Optional method not implemented in VariantBufferManager") - } -} - -pub struct SampleBoxBasedVariantBufferManager { - pub value_buffer: Box<[u8]>, - pub metadata_buffer: Box<[u8]>, -} - -impl VariantBufferManager for SampleBoxBasedVariantBufferManager { - #[inline(always)] - fn borrow_value_buffer(&mut self) -> &mut [u8] { - &mut self.value_buffer - } - - fn ensure_value_buffer_size(&mut self, size: usize) -> Result<(), ArrowError> { - let cur_len = self.value_buffer.len(); - if size > cur_len { - // Reallocate larger buffer - let mut new_buffer = vec![0u8; size].into_boxed_slice(); - new_buffer[..cur_len].copy_from_slice(&self.value_buffer); - self.value_buffer = new_buffer; - } - Ok(()) - } - - #[inline(always)] - fn borrow_metadata_buffer(&mut self) -> &mut [u8] { - &mut self.metadata_buffer - } - - fn ensure_metadata_buffer_size(&mut self, size: usize) -> Result<(), ArrowError> { - let cur_len = self.metadata_buffer.len(); - if size > cur_len { - // Reallocate larger buffer - let mut new_buffer = vec![0u8; size].into_boxed_slice(); - new_buffer[..cur_len].copy_from_slice(&self.metadata_buffer); - self.metadata_buffer = new_buffer; - } - Ok(()) - } - - fn get_immutable_metadata_buffer(&self) -> &[u8] { - &self.metadata_buffer - } - - fn get_immutable_value_buffer(&self) -> &[u8] { - &self.value_buffer - } } pub struct SampleVecBasedVariantBufferManager { @@ -113,12 +58,4 @@ impl VariantBufferManager for SampleVecBasedVariantBufferManager { } Ok(()) } - - fn get_immutable_metadata_buffer(&self) -> &[u8] { - self.metadata_buffer.as_slice() - } - - fn get_immutable_value_buffer(&self) -> &[u8] { - self.value_buffer.as_slice() - } } diff --git a/parquet-variant/tests/test_json_to_variant.rs b/parquet-variant/tests/test_json_to_variant.rs index 120e6928563e..92fb27c63ecd 100644 --- a/parquet-variant/tests/test_json_to_variant.rs +++ b/parquet-variant/tests/test_json_to_variant.rs @@ -305,9 +305,9 @@ fn test_json_to_variant() -> Result<(), ArrowError> { 1u8, 2u8, 4u8, 8u8, ], &[ - 1u8, 3u8, 0u8, 7u8, 11u8, 19u8, 0x6eu8, 0x75u8, 0x6du8, 0x62u8, 0x65u8, 0x72u8, - 0x73u8, 0x6eu8, 0x75u8, 0x6cu8, 0x6cu8, 0x62u8, 0x6fu8, 0x6fu8, 0x6cu8, 0x65u8, - 0x61u8, 0x6eu8, 0x73u8, + 1u8, 3u8, 0u8, 7u8, 11u8, 19u8, 0x6eu8, 0x75u8, 0x6du8, 0x62u8, 0x65u8, 0x72u8, 0x73u8, + 0x6eu8, 0x75u8, 0x6cu8, 0x6cu8, 0x62u8, 0x6fu8, 0x6fu8, 0x6cu8, 0x65u8, 0x61u8, 0x6eu8, + 0x73u8, ], )?; From 57b3eb0dbf0c7b147615014f1ef98fd420680744 Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Wed, 25 Jun 2025 23:49:15 -0700 Subject: [PATCH 11/35] fix --- parquet-variant/src/from_json.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/parquet-variant/src/from_json.rs b/parquet-variant/src/from_json.rs index 86d3ef3747a2..1be879fc1291 100644 --- a/parquet-variant/src/from_json.rs +++ b/parquet-variant/src/from_json.rs @@ -35,10 +35,7 @@ fn build_json(json: &Value, builder: &mut VariantBuilder) -> Result<(), ArrowErr match json { Value::Null => builder.append_value(Variant::Null), Value::Bool(b) => builder.append_value(*b), - Value::Number(n) => { - let v: Variant = n.try_into()?; - builder.append_value(v) - } + Value::Number(n) => builder.append_value(Variant::try_from(n)?), Value::String(s) => builder.append_value(s.as_str()), Value::Array(arr) => { let mut list_builder = builder.new_list(); From 71b7d6f7993ee6e1f8d958f6a50cfd9f1d26ed47 Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Thu, 26 Jun 2025 00:07:54 -0700 Subject: [PATCH 12/35] fixed VariantBufferManager --- parquet-variant/src/from_json.rs | 9 ++- parquet-variant/src/variant_buffer_manager.rs | 60 +++++++++---------- 2 files changed, 32 insertions(+), 37 deletions(-) diff --git a/parquet-variant/src/from_json.rs b/parquet-variant/src/from_json.rs index 1be879fc1291..bf68ba1e6415 100644 --- a/parquet-variant/src/from_json.rs +++ b/parquet-variant/src/from_json.rs @@ -21,12 +21,11 @@ pub fn json_to_variant( // Write to caller's buffers - Remove this when the library internally writes to the caller's // buffers anyway - variant_buffer_manager.ensure_metadata_buffer_size(metadata_size)?; - variant_buffer_manager.ensure_value_buffer_size(value_size)?; - - let caller_metadata_buffer = variant_buffer_manager.borrow_metadata_buffer(); + let caller_metadata_buffer = + variant_buffer_manager.ensure_size_and_borrow_metadata_buffer(metadata_size)?; caller_metadata_buffer[..metadata_size].copy_from_slice(metadata.as_slice()); - let caller_value_buffer = variant_buffer_manager.borrow_value_buffer(); + let caller_value_buffer = + variant_buffer_manager.ensure_size_and_borrow_value_buffer(value_size)?; caller_value_buffer[..value_size].copy_from_slice(value.as_slice()); Ok((metadata_size, value_size)) } diff --git a/parquet-variant/src/variant_buffer_manager.rs b/parquet-variant/src/variant_buffer_manager.rs index 892db0fd853f..d7d7925d161c 100644 --- a/parquet-variant/src/variant_buffer_manager.rs +++ b/parquet-variant/src/variant_buffer_manager.rs @@ -5,24 +5,22 @@ pub trait VariantBufferManager { /// called several times during the construction of a new `metadata` field in a variant. The /// implementation must make sure that on every call, all the data written to the metadata /// buffer so far are preserved. - fn borrow_metadata_buffer(&mut self) -> &mut [u8]; + /// The implementation must also make sure that the length of the slice being returned is at + /// least `size` bytes. The implementation may throw an error if it is unable to fulfill its + /// requirements. + fn ensure_size_and_borrow_metadata_buffer( + &mut self, + size: usize, + ) -> Result<&mut [u8], ArrowError>; /// Returns the slice where value needs to be written to. This method may be called several /// times during the construction of a new `value` field in a variant. The implementation must /// make sure that on every call, all the data written to the value buffer so far are preserved. - fn borrow_value_buffer(&mut self) -> &mut [u8]; - - /// Ensures that the next call to `borrow_metadata_buffer` returns a slice having at least - /// `size` bytes. Also ensures that the value metadata written so far are persisted - this means - /// that if `borrow_metadata_buffer` is to return a new buffer from the next call onwards, the - /// new buffer must have the contents of the old metadata buffer. - fn ensure_metadata_buffer_size(&mut self, size: usize) -> Result<(), ArrowError>; - - /// Ensures that the next call to `borrow_value_buffer` returns a slice having at least `size` - /// bytes. Also ensures that the value bytes written so far are persisted - this means that - /// if `borrow_value_buffer` is to return a new buffer from the next call onwards, the new - /// buffer must have the contents of the old value buffer. - fn ensure_value_buffer_size(&mut self, size: usize) -> Result<(), ArrowError>; + /// The implementation must also make sure that the length of the slice being returned is at + /// least `size` bytes. The implementation may throw an error if it is unable to fulfill its + /// requirements. + fn ensure_size_and_borrow_value_buffer(&mut self, size: usize) + -> Result<&mut [u8], ArrowError>; } pub struct SampleVecBasedVariantBufferManager { @@ -31,31 +29,29 @@ pub struct SampleVecBasedVariantBufferManager { } impl VariantBufferManager for SampleVecBasedVariantBufferManager { - #[inline(always)] - fn borrow_value_buffer(&mut self) -> &mut [u8] { - self.value_buffer.as_mut_slice() - } - - fn ensure_value_buffer_size(&mut self, size: usize) -> Result<(), ArrowError> { - let cur_len = self.value_buffer.len(); + fn ensure_size_and_borrow_metadata_buffer( + &mut self, + size: usize, + ) -> Result<&mut [u8], ArrowError> { + let cur_len = self.metadata_buffer.len(); if size > cur_len { // Reallocate larger buffer - self.value_buffer.resize(size, 0); + let new_len = size.next_power_of_two(); + self.metadata_buffer.resize(new_len, 0); } - Ok(()) - } - - #[inline(always)] - fn borrow_metadata_buffer(&mut self) -> &mut [u8] { - self.metadata_buffer.as_mut_slice() + Ok(&mut self.metadata_buffer) } - fn ensure_metadata_buffer_size(&mut self, size: usize) -> Result<(), ArrowError> { - let cur_len = self.metadata_buffer.len(); + fn ensure_size_and_borrow_value_buffer( + &mut self, + size: usize, + ) -> Result<&mut [u8], ArrowError> { + let cur_len = self.value_buffer.len(); if size > cur_len { // Reallocate larger buffer - self.metadata_buffer.resize(size, 0); + let new_len = size.next_power_of_two(); + self.value_buffer.resize(new_len, 0); } - Ok(()) + Ok(&mut self.value_buffer) } } From 031c916c109feaab1004526a7b6327a72df9aaad Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Thu, 26 Jun 2025 00:52:12 -0700 Subject: [PATCH 13/35] deduped a bit of code --- parquet-variant/src/builder.rs | 36 +++++++++++++++++++++++++++++++ parquet-variant/src/from_json.rs | 37 ++++++++++++-------------------- 2 files changed, 50 insertions(+), 23 deletions(-) diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index 73197e612483..361fc3a5afb9 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -721,6 +721,42 @@ impl<'a, 'b> ObjectBuilder<'a, 'b> { } } +pub(crate) trait AppendVariantHelper { + fn append_value_helper<'m, 'd, T: Into>>(&mut self, value: T); + + fn new_list_helper(&mut self) -> ListBuilder; + + fn new_object_helper(&mut self) -> ObjectBuilder; +} + +impl AppendVariantHelper for ListBuilder<'_> { + fn append_value_helper<'m, 'd, T: Into>>(&mut self, value: T) { + self.append_value(value); + } + + fn new_list_helper(&mut self) -> ListBuilder { + self.new_list() + } + + fn new_object_helper(&mut self) -> ObjectBuilder { + self.new_object() + } +} + +impl AppendVariantHelper for VariantBuilder { + fn append_value_helper<'m, 'd, T: Into>>(&mut self, value: T) { + self.append_value(value); + } + + fn new_list_helper(&mut self) -> ListBuilder { + self.new_list() + } + + fn new_object_helper(&mut self) -> ObjectBuilder { + self.new_object() + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/parquet-variant/src/from_json.rs b/parquet-variant/src/from_json.rs index bf68ba1e6415..2814177d6af1 100644 --- a/parquet-variant/src/from_json.rs +++ b/parquet-variant/src/from_json.rs @@ -1,5 +1,5 @@ use crate::variant_buffer_manager::VariantBufferManager; -use crate::{ListBuilder, ObjectBuilder, Variant, VariantBuilder}; +use crate::{AppendVariantHelper, ListBuilder, ObjectBuilder, Variant, VariantBuilder}; use arrow_schema::ArrowError; use serde_json::{Map, Value}; @@ -31,18 +31,23 @@ pub fn json_to_variant( } fn build_json(json: &Value, builder: &mut VariantBuilder) -> Result<(), ArrowError> { + append_json(json, builder)?; + Ok(()) +} + +fn append_json(json: &Value, builder: &mut impl AppendVariantHelper) -> Result<(), ArrowError> { match json { - Value::Null => builder.append_value(Variant::Null), - Value::Bool(b) => builder.append_value(*b), - Value::Number(n) => builder.append_value(Variant::try_from(n)?), - Value::String(s) => builder.append_value(s.as_str()), + Value::Null => builder.append_value_helper(Variant::Null), + Value::Bool(b) => builder.append_value_helper(*b), + Value::Number(n) => builder.append_value_helper(Variant::try_from(n)?), + Value::String(s) => builder.append_value_helper(s.as_str()), Value::Array(arr) => { - let mut list_builder = builder.new_list(); + let mut list_builder = builder.new_list_helper(); build_list(arr, &mut list_builder)?; list_builder.finish(); } Value::Object(obj) => { - let mut obj_builder = builder.new_object(); + let mut obj_builder = builder.new_object_helper(); build_object(obj, &mut obj_builder)?; obj_builder.finish(); } @@ -52,22 +57,7 @@ fn build_json(json: &Value, builder: &mut VariantBuilder) -> Result<(), ArrowErr fn build_list(arr: &[Value], builder: &mut ListBuilder) -> Result<(), ArrowError> { for val in arr { - match val { - Value::Null => builder.append_value(Variant::Null), - Value::Bool(b) => builder.append_value(*b), - Value::Number(n) => builder.append_value(Variant::try_from(n)?), - Value::String(s) => builder.append_value(s.as_str()), - Value::Array(arr) => { - let mut list_builder = builder.new_list(); - build_list(arr, &mut list_builder)?; - list_builder.finish() - } - Value::Object(obj) => { - let mut obj_builder = builder.new_object(); - build_object(obj, &mut obj_builder)?; - obj_builder.finish(); - } - } + append_json(val, builder)?; } Ok(()) } @@ -76,6 +66,7 @@ fn build_object<'a, 'b>( obj: &'b Map, builder: &mut ObjectBuilder<'a, 'b>, ) -> Result<(), ArrowError> { + for (key, value) in obj.iter() { match value { Value::Null => builder.insert(key, Variant::Null), From d4fc876b91b44d5abac25c99c891493eccb21d3c Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Thu, 26 Jun 2025 01:01:34 -0700 Subject: [PATCH 14/35] deduplicated code --- parquet-variant/src/from_json.rs | 38 ++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/parquet-variant/src/from_json.rs b/parquet-variant/src/from_json.rs index 2814177d6af1..2b6e1e6ef48d 100644 --- a/parquet-variant/src/from_json.rs +++ b/parquet-variant/src/from_json.rs @@ -66,24 +66,28 @@ fn build_object<'a, 'b>( obj: &'b Map, builder: &mut ObjectBuilder<'a, 'b>, ) -> Result<(), ArrowError> { - for (key, value) in obj.iter() { - match value { - Value::Null => builder.insert(key, Variant::Null), - Value::Bool(b) => builder.insert(key, *b), - Value::Number(n) => builder.insert(key, Variant::try_from(n)?), - Value::String(s) => builder.insert(key, s.as_str()), - Value::Array(arr) => { - let mut list_builder = builder.new_list(key); - build_list(arr, &mut list_builder)?; - list_builder.finish() - } - Value::Object(obj) => { - let mut obj_builder = builder.new_object(key); - build_object(obj, &mut obj_builder)?; - obj_builder.finish(); - } - } + let mut field_builder = ObjectFieldBuilder { key, builder }; + append_json(value, &mut field_builder)?; } Ok(()) } + +struct ObjectFieldBuilder<'a, 'b, 'c> { + key: &'a str, + builder: &'b mut ObjectBuilder<'c, 'a>, +} + +impl AppendVariantHelper for ObjectFieldBuilder<'_, '_, '_> { + fn append_value_helper<'m, 'd, T: Into>>(&mut self, value: T) { + self.builder.insert(self.key, value); + } + + fn new_list_helper(&mut self) -> ListBuilder { + self.builder.new_list(self.key) + } + + fn new_object_helper(&mut self) -> ObjectBuilder { + self.builder.new_object(self.key) + } +} From c41af4e89b46b7c512b199f0b605395c1bd09227 Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Thu, 26 Jun 2025 01:10:09 -0700 Subject: [PATCH 15/35] moved serde code out of variant.rs --- parquet-variant/src/from_json.rs | 54 ++++++++++++++++++++++++++++++-- parquet-variant/src/variant.rs | 52 ------------------------------ 2 files changed, 52 insertions(+), 54 deletions(-) diff --git a/parquet-variant/src/from_json.rs b/parquet-variant/src/from_json.rs index 2b6e1e6ef48d..0cf91d9816f4 100644 --- a/parquet-variant/src/from_json.rs +++ b/parquet-variant/src/from_json.rs @@ -1,7 +1,9 @@ +pub use crate::variant::{VariantDecimal4, VariantDecimal8}; use crate::variant_buffer_manager::VariantBufferManager; use crate::{AppendVariantHelper, ListBuilder, ObjectBuilder, Variant, VariantBuilder}; use arrow_schema::ArrowError; -use serde_json::{Map, Value}; +use rust_decimal::prelude::*; +use serde_json::{Map, Number, Value}; /// Eventually, internal writes should also be performed using VariantBufferManager instead of /// ValueBuffer and MetadataBuffer so the caller has control of the memory. @@ -35,11 +37,59 @@ fn build_json(json: &Value, builder: &mut VariantBuilder) -> Result<(), ArrowErr Ok(()) } +fn variant_from_number<'a, 'b>(n: &Number) -> Result, ArrowError> { + if let Some(i) = n.as_i64() { + // Find minimum Integer width to fit + if i as i8 as i64 == i { + Ok((i as i8).into()) + } else if i as i16 as i64 == i { + Ok((i as i16).into()) + } else if i as i32 as i64 == i { + Ok((i as i32).into()) + } else { + Ok(i.into()) + } + } else { + // Try decimal + // TODO: Replace with custom decimal parsing as the rust_decimal library only supports + // a max unscaled value of 2^96. + match Decimal::from_str_exact(n.as_str()) { + Ok(dec) => { + let unscaled: i128 = dec.mantissa(); + let scale = dec.scale() as u8; + if unscaled.abs() <= VariantDecimal4::MAX_UNSCALED_VALUE as i128 + && scale <= VariantDecimal4::MAX_PRECISION as u8 + { + (unscaled as i32, scale).try_into() + } else if unscaled.abs() <= VariantDecimal8::MAX_UNSCALED_VALUE as i128 + && scale <= VariantDecimal8::MAX_PRECISION as u8 + { + (unscaled as i64, scale).try_into() + } else { + (unscaled, scale).try_into() + } + } + Err(_) => { + // Try double + match n.as_f64() { + Some(f) => return Ok(f.into()), + None => Err(ArrowError::InvalidArgumentError(format!( + "Failed to parse {} as number", + n.as_str() + ))), + }? + } + } + } +} + fn append_json(json: &Value, builder: &mut impl AppendVariantHelper) -> Result<(), ArrowError> { match json { Value::Null => builder.append_value_helper(Variant::Null), Value::Bool(b) => builder.append_value_helper(*b), - Value::Number(n) => builder.append_value_helper(Variant::try_from(n)?), + Value::Number(n) => { + builder.append_value_helper(variant_from_number(n)?); + } Value::String(s) => builder.append_value_helper(s.as_str()), Value::Array(arr) => { let mut list_builder = builder.new_list_helper(); diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 56d90720c5c3..4ca23aee0fa1 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -32,8 +32,6 @@ mod decimal; mod list; mod metadata; mod object; -use rust_decimal::prelude::*; -use serde_json::Number; const MAX_SHORT_STRING_BYTES: usize = 0x3F; @@ -943,56 +941,6 @@ impl From for Variant<'_, '_> { } } -impl TryFrom<&Number> for Variant<'_, '_> { - type Error = ArrowError; - - fn try_from(n: &Number) -> Result { - if let Some(i) = n.as_i64() { - // Find minimum Integer width to fit - if i as i8 as i64 == i { - Ok((i as i8).into()) - } else if i as i16 as i64 == i { - Ok((i as i16).into()) - } else if i as i32 as i64 == i { - Ok((i as i32).into()) - } else { - Ok(i.into()) - } - } else { - // Try decimal - // TODO: Replace with custom decimal parsing as the rust_decimal library only supports - // a max unscaled value of 2^96. - match Decimal::from_str_exact(n.as_str()) { - Ok(dec) => { - let unscaled: i128 = dec.mantissa(); - let scale = dec.scale() as u8; - if unscaled.abs() <= VariantDecimal4::MAX_UNSCALED_VALUE as i128 - && scale <= VariantDecimal4::MAX_PRECISION as u8 - { - (unscaled as i32, scale).try_into() - } else if unscaled.abs() <= VariantDecimal8::MAX_UNSCALED_VALUE as i128 - && scale <= VariantDecimal8::MAX_PRECISION as u8 - { - (unscaled as i64, scale).try_into() - } else { - (unscaled, scale).try_into() - } - } - Err(_) => { - // Try double - match n.as_f64() { - Some(f) => return Ok(f.into()), - None => Err(ArrowError::InvalidArgumentError(format!( - "Failed to parse {} as number", - n.as_str() - ))), - }? - } - } - } - } -} - impl From for Variant<'_, '_> { fn from(value: f32) -> Self { Variant::Float(value) From ecaf55785d77fa7a88c52a3f91f9d68572efd9e4 Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Thu, 26 Jun 2025 12:11:37 -0700 Subject: [PATCH 16/35] incorporated Ryan's latest comments --- parquet-variant/src/builder.rs | 20 ++++++------ parquet-variant/src/from_json.rs | 32 +++++++++---------- parquet-variant/tests/test_json_to_variant.rs | 2 -- 3 files changed, 26 insertions(+), 28 deletions(-) diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index 361fc3a5afb9..5e7e61e86bfc 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -721,38 +721,40 @@ impl<'a, 'b> ObjectBuilder<'a, 'b> { } } +/// Trait that abstracts functionality from Variant fconstruction implementations, namely +/// `VariantBuilder`, `ListBuilder` and `ObjectFieldBuilder` to minimize code duplication. pub(crate) trait AppendVariantHelper { - fn append_value_helper<'m, 'd, T: Into>>(&mut self, value: T); + fn append_value<'m, 'd, T: Into>>(&mut self, value: T); - fn new_list_helper(&mut self) -> ListBuilder; + fn new_list(&mut self) -> ListBuilder; - fn new_object_helper(&mut self) -> ObjectBuilder; + fn new_object(&mut self) -> ObjectBuilder; } impl AppendVariantHelper for ListBuilder<'_> { - fn append_value_helper<'m, 'd, T: Into>>(&mut self, value: T) { + fn append_value<'m, 'd, T: Into>>(&mut self, value: T) { self.append_value(value); } - fn new_list_helper(&mut self) -> ListBuilder { + fn new_list(&mut self) -> ListBuilder { self.new_list() } - fn new_object_helper(&mut self) -> ObjectBuilder { + fn new_object(&mut self) -> ObjectBuilder { self.new_object() } } impl AppendVariantHelper for VariantBuilder { - fn append_value_helper<'m, 'd, T: Into>>(&mut self, value: T) { + fn append_value<'m, 'd, T: Into>>(&mut self, value: T) { self.append_value(value); } - fn new_list_helper(&mut self) -> ListBuilder { + fn new_list(&mut self) -> ListBuilder { self.new_list() } - fn new_object_helper(&mut self) -> ObjectBuilder { + fn new_object(&mut self) -> ObjectBuilder { self.new_object() } } diff --git a/parquet-variant/src/from_json.rs b/parquet-variant/src/from_json.rs index 0cf91d9816f4..8ad37b3561eb 100644 --- a/parquet-variant/src/from_json.rs +++ b/parquet-variant/src/from_json.rs @@ -85,19 +85,19 @@ fn variant_from_number<'a, 'b>(n: &Number) -> Result, ArrowError fn append_json(json: &Value, builder: &mut impl AppendVariantHelper) -> Result<(), ArrowError> { match json { - Value::Null => builder.append_value_helper(Variant::Null), - Value::Bool(b) => builder.append_value_helper(*b), + Value::Null => builder.append_value(Variant::Null), + Value::Bool(b) => builder.append_value(*b), Value::Number(n) => { - builder.append_value_helper(variant_from_number(n)?); + builder.append_value(variant_from_number(n)?); } - Value::String(s) => builder.append_value_helper(s.as_str()), + Value::String(s) => builder.append_value(s.as_str()), Value::Array(arr) => { - let mut list_builder = builder.new_list_helper(); + let mut list_builder = builder.new_list(); build_list(arr, &mut list_builder)?; list_builder.finish(); } Value::Object(obj) => { - let mut obj_builder = builder.new_object_helper(); + let mut obj_builder = builder.new_object(); build_object(obj, &mut obj_builder)?; obj_builder.finish(); } @@ -106,21 +106,19 @@ fn append_json(json: &Value, builder: &mut impl AppendVariantHelper) -> Result<( } fn build_list(arr: &[Value], builder: &mut ListBuilder) -> Result<(), ArrowError> { - for val in arr { - append_json(val, builder)?; - } - Ok(()) + arr.iter().try_fold((), |_, val| { + append_json(val, builder) + }) } fn build_object<'a, 'b>( obj: &'b Map, builder: &mut ObjectBuilder<'a, 'b>, ) -> Result<(), ArrowError> { - for (key, value) in obj.iter() { + obj.iter().try_fold((), |_, (key, value)| { let mut field_builder = ObjectFieldBuilder { key, builder }; - append_json(value, &mut field_builder)?; - } - Ok(()) + append_json(value, &mut field_builder) + }) } struct ObjectFieldBuilder<'a, 'b, 'c> { @@ -129,15 +127,15 @@ struct ObjectFieldBuilder<'a, 'b, 'c> { } impl AppendVariantHelper for ObjectFieldBuilder<'_, '_, '_> { - fn append_value_helper<'m, 'd, T: Into>>(&mut self, value: T) { + fn append_value<'m, 'd, T: Into>>(&mut self, value: T) { self.builder.insert(self.key, value); } - fn new_list_helper(&mut self) -> ListBuilder { + fn new_list(&mut self) -> ListBuilder { self.builder.new_list(self.key) } - fn new_object_helper(&mut self) -> ObjectBuilder { + fn new_object(&mut self) -> ObjectBuilder { self.builder.new_object(self.key) } } diff --git a/parquet-variant/tests/test_json_to_variant.rs b/parquet-variant/tests/test_json_to_variant.rs index 92fb27c63ecd..0dae799d8af9 100644 --- a/parquet-variant/tests/test_json_to_variant.rs +++ b/parquet-variant/tests/test_json_to_variant.rs @@ -295,8 +295,6 @@ fn test_json_to_variant() -> Result<(), ArrowError> { // TODO: verify different offset_size, id_size, is_large values, nesting and more object // tests in general - // NOTE: objects and lists are treated as `pending` and are added to the dictionary after scalar - // values. Therefore "null" has ID 0, "numbers" as ID 1 and "booleans" has ID 2. compare_results( "{\"numbers\": [4, -3e0, 1.001], \"null\": null, \"booleans\": [true, false]}", &[ From 4abc5987f9c19f1134dab90f23765974e84fd7dc Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Thu, 26 Jun 2025 13:59:31 -0700 Subject: [PATCH 17/35] add more object tests --- parquet-variant/src/from_json.rs | 4 +- parquet-variant/tests/test_json_to_variant.rs | 72 +++++++++++++++++-- 2 files changed, 68 insertions(+), 8 deletions(-) diff --git a/parquet-variant/src/from_json.rs b/parquet-variant/src/from_json.rs index 8ad37b3561eb..21a736410fc3 100644 --- a/parquet-variant/src/from_json.rs +++ b/parquet-variant/src/from_json.rs @@ -106,9 +106,7 @@ fn append_json(json: &Value, builder: &mut impl AppendVariantHelper) -> Result<( } fn build_list(arr: &[Value], builder: &mut ListBuilder) -> Result<(), ArrowError> { - arr.iter().try_fold((), |_, val| { - append_json(val, builder) - }) + arr.iter().try_fold((), |_, val| append_json(val, builder)) } fn build_object<'a, 'b>( diff --git a/parquet-variant/tests/test_json_to_variant.rs b/parquet-variant/tests/test_json_to_variant.rs index 0dae799d8af9..711efbf974c2 100644 --- a/parquet-variant/tests/test_json_to_variant.rs +++ b/parquet-variant/tests/test_json_to_variant.rs @@ -1,5 +1,7 @@ use arrow_schema::ArrowError; -use parquet_variant::{json_to_variant, SampleVecBasedVariantBufferManager}; +use parquet_variant::{ + json_to_variant, variant_to_json_string, SampleVecBasedVariantBufferManager, +}; #[test] fn test_json_to_variant() -> Result<(), ArrowError> { @@ -8,8 +10,6 @@ fn test_json_to_variant() -> Result<(), ArrowError> { expected_value: &[u8], expected_metadata: &[u8], ) -> Result<(), ArrowError> { - let json = json; - let mut variant_buffer_manager = SampleVecBasedVariantBufferManager { value_buffer: vec![0u8; 1], metadata_buffer: vec![0u8; 1], @@ -293,8 +293,6 @@ fn test_json_to_variant() -> Result<(), ArrowError> { &[1, 2, 0, 1, 2, 98u8, 97u8], )?; - // TODO: verify different offset_size, id_size, is_large values, nesting and more object - // tests in general compare_results( "{\"numbers\": [4, -3e0, 1.001], \"null\": null, \"booleans\": [true, false]}", &[ @@ -309,5 +307,69 @@ fn test_json_to_variant() -> Result<(), ArrowError> { ], )?; + { + // 256 elements (keys: 000-255) - each element is an object of 256 elements (240-495) - each + // element a list of numbers from 0-127 + let keys: Vec = (0..=255).map(|n| format!("\"{:03}\"", n)).collect(); + let innermost_list: String = format!( + "[{}]", + (0..=127) + .map(|n| format!("{}", n)) + .collect::>() + .join(",") + ); + let inner_keys: Vec = (240..=495).map(|n| format!("\"{}\"", n)).collect(); + let inner_object = format!( + "{{{}:{}}}", + inner_keys.join(format!(":{},", innermost_list).as_str()), + innermost_list + ); + let json = format!( + "{{{}:{}}}", + keys.join(format!(":{},", inner_object).as_str()), + inner_object + ); + let mut variant_buffer_manager = SampleVecBasedVariantBufferManager { + value_buffer: vec![0u8; 1], + metadata_buffer: vec![0u8; 1], + }; + let (metadata_size, value_size) = json_to_variant(&json, &mut variant_buffer_manager)?; + let computed_metadata_slize: &[u8] = + &variant_buffer_manager.metadata_buffer[..metadata_size]; + let computed_value_slize: &[u8] = &variant_buffer_manager.value_buffer[..value_size]; + let v = parquet_variant::Variant::try_new(computed_metadata_slize, computed_value_slize)?; + let output_string = variant_to_json_string(&v)?; + assert_eq!(output_string, json); + // Verify metadata size = 1 + 2 + 2 * 497 + 3 * 496 + assert_eq!(metadata_size, 2485); + // Verify value size. + // Size of innermost_list: 1 + 1 + 258 + 256 = 516 + // Size of inner object: 1 + 4 + 256 + 257 * 3 + 256 * 516 = 133128 + // Size of json: 1 + 4 + 512 + 1028 + 256 * 133128 = 34082313 + assert_eq!(value_size, 34082313); + } + { + let json = "{\"爱\":\"अ\",\"a\":1}"; + let mut variant_buffer_manager = SampleVecBasedVariantBufferManager { + value_buffer: vec![0u8; 1], + metadata_buffer: vec![0u8; 1], + }; + let (metadata_size, value_size) = json_to_variant(&json, &mut variant_buffer_manager)?; + let computed_metadata_slize: &[u8] = + &variant_buffer_manager.metadata_buffer[..metadata_size]; + let computed_value_slize: &[u8] = &variant_buffer_manager.value_buffer[..value_size]; + let v = parquet_variant::Variant::try_new(computed_metadata_slize, computed_value_slize)?; + let output_string = variant_to_json_string(&v)?; + assert_eq!(output_string, "{\"a\":1,\"爱\":\"अ\"}"); + assert_eq!( + computed_value_slize, + &[2u8, 2u8, 1u8, 0u8, 4u8, 0u8, 6u8, 13u8, 0xe0u8, 0xa4u8, 0x85u8, 12u8, 1u8] + ); + assert_eq!( + computed_metadata_slize, + &[1u8, 2u8, 0u8, 3u8, 4u8, 0xe7u8, 0x88u8, 0xb1u8, 97u8] + ); + } + Ok(()) } From 0842ef8c57f51c185256089bd12cd510f5e1b699 Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Thu, 26 Jun 2025 16:10:26 -0700 Subject: [PATCH 18/35] fix --- .../examples/variant_from_json_examples.rs | 10 +- parquet-variant/src/from_json.rs | 95 +++++++++++++++---- parquet-variant/src/variant_buffer_manager.rs | 20 ++++ parquet-variant/tests/test_json_to_variant.rs | 19 ++++ 4 files changed, 121 insertions(+), 23 deletions(-) diff --git a/parquet-variant/examples/variant_from_json_examples.rs b/parquet-variant/examples/variant_from_json_examples.rs index 0ccfd68365d0..73ce702f3345 100644 --- a/parquet-variant/examples/variant_from_json_examples.rs +++ b/parquet-variant/examples/variant_from_json_examples.rs @@ -41,7 +41,7 @@ fn main() -> Result<(), Box> { &variant_buffer_manager.value_buffer[..value_size], )?; - let json_string = variant_to_json_string(&variant)?; + let json_result = variant_to_json_string(&variant)?; let json_value = variant_to_json_value(&variant)?; let pretty_json = serde_json::to_string_pretty(&json_value)?; println!("{}", pretty_json); @@ -49,10 +49,10 @@ fn main() -> Result<(), Box> { let mut buffer = Vec::new(); variant_to_json(&mut buffer, &variant)?; let buffer_result = String::from_utf8(buffer)?; - - // Verify all methods produce the same result - assert_eq!(json_string, buffer_result); - assert_eq!(json_string, serde_json::to_string(&json_value)?); + assert_eq!(json_result, "{\"additional_info\":null,\"age\":30,".to_string() + + "\"email\":\"alice@example.com\",\"is_active\":true,\"name\":\"Alice\",\"score\":95.7}"); + assert_eq!(json_result, buffer_result); + assert_eq!(json_result, serde_json::to_string(&json_value)?); Ok(()) } diff --git a/parquet-variant/src/from_json.rs b/parquet-variant/src/from_json.rs index 21a736410fc3..2231ba183a52 100644 --- a/parquet-variant/src/from_json.rs +++ b/parquet-variant/src/from_json.rs @@ -1,13 +1,78 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Module for parsing JSON strings as Variant + pub use crate::variant::{VariantDecimal4, VariantDecimal8}; use crate::variant_buffer_manager::VariantBufferManager; use crate::{AppendVariantHelper, ListBuilder, ObjectBuilder, Variant, VariantBuilder}; use arrow_schema::ArrowError; use rust_decimal::prelude::*; -use serde_json::{Map, Number, Value}; +use serde_json::{Number, Value}; +/// Converts a JSON string to Variant and writes the corresponding `value` and `metadata` values +/// to buffers provided by `variant_buffer_manager`. +/// +/// # Arguments +/// * `json` - The JSON string to parse as Variant. +/// * `variant_buffer_manager` - Object implementing the `VariantBufferManager` trait for the caller +/// to provide buffers to write the Variant output to. +/// +/// # Returns +/// +/// * `Ok(metadata_size, value_size)` denoting the sizes of the resulting value and metadata values +/// if successful +/// * `Err` with error details if the conversion fails +/// +/// ```rust +/// # use parquet_variant::{ +/// json_to_variant, variant_to_json, variant_to_json_string, variant_to_json_value, +/// SampleVecBasedVariantBufferManager, +/// }; +/// +/// let mut variant_buffer_manager = SampleVecBasedVariantBufferManager { +/// value_buffer: vec![0u8; 1], +/// metadata_buffer: vec![0u8; 1], +/// }; +/// let person_string = "{\"name\":\"Alice\", \"age\":30, ".to_string() +/// + "\"email\":\"alice@example.com\", \"is_active\": true, \"score\": 95.7," +/// + "\"additional_info\": null}"; +/// let (metadata_size, value_size) = json_to_variant(&person_string, &mut variant_buffer_manager)?; +/// +/// let variant = parquet_variant::Variant::try_new( +/// &variant_buffer_manager.metadata_buffer[..metadata_size], +/// &variant_buffer_manager.value_buffer[..value_size], +/// )?; +/// +/// let json_result = variant_to_json_string(&variant)?; +/// let json_value = variant_to_json_value(&variant)?; +/// +/// let mut buffer = Vec::new(); +/// variant_to_json(&mut buffer, &variant)?; +/// let buffer_result = String::from_utf8(buffer)?; +/// assert_eq!(json_result, "{\"additional_info\":null,\"age\":30,".to_string() + +/// "\"email\":\"alice@example.com\",\"is_active\":true,\"name\":\"Alice\",\"score\":95.7}"); +/// assert_eq!(json_result, buffer_result); +/// assert_eq!(json_result, serde_json::to_string(&json_value)?); +/// # Ok::<(), Box>(()) +/// ``` +/// /// Eventually, internal writes should also be performed using VariantBufferManager instead of /// ValueBuffer and MetadataBuffer so the caller has control of the memory. -/// Returns a pair pub fn json_to_variant( json: &str, variant_buffer_manager: &mut impl VariantBufferManager, @@ -93,32 +158,26 @@ fn append_json(json: &Value, builder: &mut impl AppendVariantHelper) -> Result<( Value::String(s) => builder.append_value(s.as_str()), Value::Array(arr) => { let mut list_builder = builder.new_list(); - build_list(arr, &mut list_builder)?; + for val in arr { + append_json(val, &mut list_builder)?; + } list_builder.finish(); } Value::Object(obj) => { let mut obj_builder = builder.new_object(); - build_object(obj, &mut obj_builder)?; + for (key, value) in obj.iter() { + let mut field_builder = ObjectFieldBuilder { + key, + builder: &mut obj_builder, + }; + append_json(value, &mut field_builder)?; + } obj_builder.finish(); } }; Ok(()) } -fn build_list(arr: &[Value], builder: &mut ListBuilder) -> Result<(), ArrowError> { - arr.iter().try_fold((), |_, val| append_json(val, builder)) -} - -fn build_object<'a, 'b>( - obj: &'b Map, - builder: &mut ObjectBuilder<'a, 'b>, -) -> Result<(), ArrowError> { - obj.iter().try_fold((), |_, (key, value)| { - let mut field_builder = ObjectFieldBuilder { key, builder }; - append_json(value, &mut field_builder) - }) -} - struct ObjectFieldBuilder<'a, 'b, 'c> { key: &'a str, builder: &'b mut ObjectBuilder<'c, 'a>, diff --git a/parquet-variant/src/variant_buffer_manager.rs b/parquet-variant/src/variant_buffer_manager.rs index d7d7925d161c..b4f6eb78a8d9 100644 --- a/parquet-variant/src/variant_buffer_manager.rs +++ b/parquet-variant/src/variant_buffer_manager.rs @@ -1,3 +1,23 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! End-to-end check: (almost) every sample from apache/parquet-testing/variant +//! can be parsed into our `Variant`. + use arrow_schema::ArrowError; pub trait VariantBufferManager { diff --git a/parquet-variant/tests/test_json_to_variant.rs b/parquet-variant/tests/test_json_to_variant.rs index 711efbf974c2..2e0e87a1161e 100644 --- a/parquet-variant/tests/test_json_to_variant.rs +++ b/parquet-variant/tests/test_json_to_variant.rs @@ -1,3 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Manually tests if parsing JSON strings to Variants returns the expected results. + use arrow_schema::ArrowError; use parquet_variant::{ json_to_variant, variant_to_json_string, SampleVecBasedVariantBufferManager, From 94531af9c8258241f75ce6f5a32e7a0cb660d59a Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Thu, 26 Jun 2025 16:20:43 -0700 Subject: [PATCH 19/35] doc fix --- parquet-variant/src/variant_buffer_manager.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet-variant/src/variant_buffer_manager.rs b/parquet-variant/src/variant_buffer_manager.rs index b4f6eb78a8d9..2f353aa1c70e 100644 --- a/parquet-variant/src/variant_buffer_manager.rs +++ b/parquet-variant/src/variant_buffer_manager.rs @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. -//! End-to-end check: (almost) every sample from apache/parquet-testing/variant -//! can be parsed into our `Variant`. +//! `VariantBufferManager` allows callers to have full control over the outputs that the Variant +//! library writes to when constructing Variants. use arrow_schema::ArrowError; From e2788f5eda1358f3a6fdfcf83800c25a84bea5ce Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Fri, 27 Jun 2025 15:12:18 -0700 Subject: [PATCH 20/35] Removed dependency on VariantBufferManager and leave that to later --- .../examples/variant_from_json_examples.rs | 21 ++--- parquet-variant/src/from_json.rs | 55 ++++--------- parquet-variant/src/lib.rs | 2 - parquet-variant/src/variant_buffer_manager.rs | 77 ------------------- parquet-variant/tests/test_json_to_variant.rs | 50 +++++------- 5 files changed, 42 insertions(+), 163 deletions(-) delete mode 100644 parquet-variant/src/variant_buffer_manager.rs diff --git a/parquet-variant/examples/variant_from_json_examples.rs b/parquet-variant/examples/variant_from_json_examples.rs index 73ce702f3345..81dbc4629763 100644 --- a/parquet-variant/examples/variant_from_json_examples.rs +++ b/parquet-variant/examples/variant_from_json_examples.rs @@ -18,27 +18,22 @@ //! Example showing how to convert Variant values to JSON use parquet_variant::{ - json_to_variant, variant_to_json, variant_to_json_string, variant_to_json_value, - SampleVecBasedVariantBufferManager, + json_to_variant, variant_to_json, variant_to_json_string, variant_to_json_value, VariantBuilder, }; fn main() -> Result<(), Box> { - // The caller must provide an object implementing the `VariantBufferManager` trait to the library. - // This allows the library to write the constructed variant to buffers provided by the caller. - // This way, the caller has direct control over the output buffers. - let mut variant_buffer_manager = SampleVecBasedVariantBufferManager { - value_buffer: vec![0u8; 1], - metadata_buffer: vec![0u8; 1], - }; - let person_string = "{\"name\":\"Alice\", \"age\":30, ".to_string() + "\"email\":\"alice@example.com\", \"is_active\": true, \"score\": 95.7," + "\"additional_info\": null}"; - let (metadata_size, value_size) = json_to_variant(&person_string, &mut variant_buffer_manager)?; + + let mut variant_builder = VariantBuilder::new(); + json_to_variant(&person_string, &mut variant_builder)?; + + let (metadata, value) = variant_builder.finish(); let variant = parquet_variant::Variant::try_new( - &variant_buffer_manager.metadata_buffer[..metadata_size], - &variant_buffer_manager.value_buffer[..value_size], + &metadata, + &value, )?; let json_result = variant_to_json_string(&variant)?; diff --git a/parquet-variant/src/from_json.rs b/parquet-variant/src/from_json.rs index 2231ba183a52..5963390c838f 100644 --- a/parquet-variant/src/from_json.rs +++ b/parquet-variant/src/from_json.rs @@ -18,45 +18,38 @@ //! Module for parsing JSON strings as Variant pub use crate::variant::{VariantDecimal4, VariantDecimal8}; -use crate::variant_buffer_manager::VariantBufferManager; use crate::{AppendVariantHelper, ListBuilder, ObjectBuilder, Variant, VariantBuilder}; use arrow_schema::ArrowError; use rust_decimal::prelude::*; use serde_json::{Number, Value}; -/// Converts a JSON string to Variant and writes the corresponding `value` and `metadata` values -/// to buffers provided by `variant_buffer_manager`. +/// Converts a JSON string to Variant using `variant_builder`. The resulting `value` and `metadata` +/// buffers can be extracted using `builder.finish()` /// /// # Arguments /// * `json` - The JSON string to parse as Variant. -/// * `variant_buffer_manager` - Object implementing the `VariantBufferManager` trait for the caller -/// to provide buffers to write the Variant output to. +/// * `variant_builder` - Object of type `VariantBuilder` used to build the vatiant from the JSON +/// string /// /// # Returns /// -/// * `Ok(metadata_size, value_size)` denoting the sizes of the resulting value and metadata values -/// if successful +/// * `Ok(())` if successful /// * `Err` with error details if the conversion fails /// /// ```rust /// # use parquet_variant::{ -/// json_to_variant, variant_to_json, variant_to_json_string, variant_to_json_value, -/// SampleVecBasedVariantBufferManager, +/// json_to_variant, variant_to_json, variant_to_json_string, variant_to_json_value, VariantBuilder /// }; /// -/// let mut variant_buffer_manager = SampleVecBasedVariantBufferManager { -/// value_buffer: vec![0u8; 1], -/// metadata_buffer: vec![0u8; 1], -/// }; +/// let mut variant_builder = VariantBuilder::new(); /// let person_string = "{\"name\":\"Alice\", \"age\":30, ".to_string() /// + "\"email\":\"alice@example.com\", \"is_active\": true, \"score\": 95.7," /// + "\"additional_info\": null}"; -/// let (metadata_size, value_size) = json_to_variant(&person_string, &mut variant_buffer_manager)?; +/// json_to_variant(&person_string, &mut variant_builder)?; +/// +/// let (metadata, value) = variant_builder.finish(); /// -/// let variant = parquet_variant::Variant::try_new( -/// &variant_buffer_manager.metadata_buffer[..metadata_size], -/// &variant_buffer_manager.value_buffer[..value_size], -/// )?; +/// let variant = parquet_variant::Variant::try_new(&metadata, &value)?; /// /// let json_result = variant_to_json_string(&variant)?; /// let json_value = variant_to_json_value(&variant)?; @@ -70,31 +63,15 @@ use serde_json::{Number, Value}; /// assert_eq!(json_result, serde_json::to_string(&json_value)?); /// # Ok::<(), Box>(()) /// ``` -/// -/// Eventually, internal writes should also be performed using VariantBufferManager instead of -/// ValueBuffer and MetadataBuffer so the caller has control of the memory. -pub fn json_to_variant( +pub fn json_to_variant<'a>( json: &str, - variant_buffer_manager: &mut impl VariantBufferManager, -) -> Result<(usize, usize), ArrowError> { - let mut builder = VariantBuilder::new(); + builder: &'a mut VariantBuilder, +) -> Result<(), ArrowError> { let json: Value = serde_json::from_str(json) .map_err(|e| ArrowError::InvalidArgumentError(format!("JSON format error: {}", e)))?; - build_json(&json, &mut builder)?; - let (metadata, value) = builder.finish(); - let value_size = value.len(); - let metadata_size = metadata.len(); - - // Write to caller's buffers - Remove this when the library internally writes to the caller's - // buffers anyway - let caller_metadata_buffer = - variant_buffer_manager.ensure_size_and_borrow_metadata_buffer(metadata_size)?; - caller_metadata_buffer[..metadata_size].copy_from_slice(metadata.as_slice()); - let caller_value_buffer = - variant_buffer_manager.ensure_size_and_borrow_value_buffer(value_size)?; - caller_value_buffer[..value_size].copy_from_slice(value.as_slice()); - Ok((metadata_size, value_size)) + build_json(&json, builder)?; + Ok(()) } fn build_json(json: &Value, builder: &mut VariantBuilder) -> Result<(), ArrowError> { diff --git a/parquet-variant/src/lib.rs b/parquet-variant/src/lib.rs index 65a8bdeffcaf..7dbfff52b1b5 100644 --- a/parquet-variant/src/lib.rs +++ b/parquet-variant/src/lib.rs @@ -37,10 +37,8 @@ mod from_json; mod to_json; #[allow(dead_code)] mod utils; -mod variant_buffer_manager; pub use builder::*; pub use from_json::json_to_variant; pub use to_json::{variant_to_json, variant_to_json_string, variant_to_json_value}; pub use variant::*; -pub use variant_buffer_manager::{SampleVecBasedVariantBufferManager, VariantBufferManager}; diff --git a/parquet-variant/src/variant_buffer_manager.rs b/parquet-variant/src/variant_buffer_manager.rs deleted file mode 100644 index 2f353aa1c70e..000000000000 --- a/parquet-variant/src/variant_buffer_manager.rs +++ /dev/null @@ -1,77 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! `VariantBufferManager` allows callers to have full control over the outputs that the Variant -//! library writes to when constructing Variants. - -use arrow_schema::ArrowError; - -pub trait VariantBufferManager { - /// Returns the slice where the variant metadata needs to be written to. This method may be - /// called several times during the construction of a new `metadata` field in a variant. The - /// implementation must make sure that on every call, all the data written to the metadata - /// buffer so far are preserved. - /// The implementation must also make sure that the length of the slice being returned is at - /// least `size` bytes. The implementation may throw an error if it is unable to fulfill its - /// requirements. - fn ensure_size_and_borrow_metadata_buffer( - &mut self, - size: usize, - ) -> Result<&mut [u8], ArrowError>; - - /// Returns the slice where value needs to be written to. This method may be called several - /// times during the construction of a new `value` field in a variant. The implementation must - /// make sure that on every call, all the data written to the value buffer so far are preserved. - /// The implementation must also make sure that the length of the slice being returned is at - /// least `size` bytes. The implementation may throw an error if it is unable to fulfill its - /// requirements. - fn ensure_size_and_borrow_value_buffer(&mut self, size: usize) - -> Result<&mut [u8], ArrowError>; -} - -pub struct SampleVecBasedVariantBufferManager { - pub value_buffer: Vec, - pub metadata_buffer: Vec, -} - -impl VariantBufferManager for SampleVecBasedVariantBufferManager { - fn ensure_size_and_borrow_metadata_buffer( - &mut self, - size: usize, - ) -> Result<&mut [u8], ArrowError> { - let cur_len = self.metadata_buffer.len(); - if size > cur_len { - // Reallocate larger buffer - let new_len = size.next_power_of_two(); - self.metadata_buffer.resize(new_len, 0); - } - Ok(&mut self.metadata_buffer) - } - - fn ensure_size_and_borrow_value_buffer( - &mut self, - size: usize, - ) -> Result<&mut [u8], ArrowError> { - let cur_len = self.value_buffer.len(); - if size > cur_len { - // Reallocate larger buffer - let new_len = size.next_power_of_two(); - self.value_buffer.resize(new_len, 0); - } - Ok(&mut self.value_buffer) - } -} diff --git a/parquet-variant/tests/test_json_to_variant.rs b/parquet-variant/tests/test_json_to_variant.rs index 2e0e87a1161e..17eb0590e3dd 100644 --- a/parquet-variant/tests/test_json_to_variant.rs +++ b/parquet-variant/tests/test_json_to_variant.rs @@ -19,7 +19,7 @@ use arrow_schema::ArrowError; use parquet_variant::{ - json_to_variant, variant_to_json_string, SampleVecBasedVariantBufferManager, + json_to_variant, variant_to_json_string, VariantBuilder, }; #[test] @@ -29,15 +29,11 @@ fn test_json_to_variant() -> Result<(), ArrowError> { expected_value: &[u8], expected_metadata: &[u8], ) -> Result<(), ArrowError> { - let mut variant_buffer_manager = SampleVecBasedVariantBufferManager { - value_buffer: vec![0u8; 1], - metadata_buffer: vec![0u8; 1], - }; - let (metadata_size, value_size) = json_to_variant(json, &mut variant_buffer_manager)?; - let computed_metadata_slize: &[u8] = &*variant_buffer_manager.metadata_buffer; - let computed_value_slize: &[u8] = &*variant_buffer_manager.value_buffer; - assert_eq!(&computed_metadata_slize[..metadata_size], expected_metadata); - assert_eq!(&computed_value_slize[..value_size], expected_value); + let mut variant_builder = VariantBuilder::new(); + json_to_variant(json, &mut variant_builder)?; + let (metadata, value) = variant_builder.finish(); + assert_eq!(&metadata, expected_metadata); + assert_eq!(&value, expected_value); Ok(()) } @@ -348,44 +344,34 @@ fn test_json_to_variant() -> Result<(), ArrowError> { keys.join(format!(":{},", inner_object).as_str()), inner_object ); - let mut variant_buffer_manager = SampleVecBasedVariantBufferManager { - value_buffer: vec![0u8; 1], - metadata_buffer: vec![0u8; 1], - }; - let (metadata_size, value_size) = json_to_variant(&json, &mut variant_buffer_manager)?; - let computed_metadata_slize: &[u8] = - &variant_buffer_manager.metadata_buffer[..metadata_size]; - let computed_value_slize: &[u8] = &variant_buffer_manager.value_buffer[..value_size]; - let v = parquet_variant::Variant::try_new(computed_metadata_slize, computed_value_slize)?; + let mut variant_builder = VariantBuilder::new(); + json_to_variant(&json, &mut variant_builder)?; + let (metadata, value) = variant_builder.finish(); + let v = parquet_variant::Variant::try_new(&metadata, &value)?; let output_string = variant_to_json_string(&v)?; assert_eq!(output_string, json); // Verify metadata size = 1 + 2 + 2 * 497 + 3 * 496 - assert_eq!(metadata_size, 2485); + assert_eq!(metadata.len(), 2485); // Verify value size. // Size of innermost_list: 1 + 1 + 258 + 256 = 516 // Size of inner object: 1 + 4 + 256 + 257 * 3 + 256 * 516 = 133128 // Size of json: 1 + 4 + 512 + 1028 + 256 * 133128 = 34082313 - assert_eq!(value_size, 34082313); + assert_eq!(value.len(), 34082313); } { let json = "{\"爱\":\"अ\",\"a\":1}"; - let mut variant_buffer_manager = SampleVecBasedVariantBufferManager { - value_buffer: vec![0u8; 1], - metadata_buffer: vec![0u8; 1], - }; - let (metadata_size, value_size) = json_to_variant(&json, &mut variant_buffer_manager)?; - let computed_metadata_slize: &[u8] = - &variant_buffer_manager.metadata_buffer[..metadata_size]; - let computed_value_slize: &[u8] = &variant_buffer_manager.value_buffer[..value_size]; - let v = parquet_variant::Variant::try_new(computed_metadata_slize, computed_value_slize)?; + let mut variant_builder = VariantBuilder::new(); + json_to_variant(&json, &mut variant_builder)?; + let (metadata, value) = variant_builder.finish(); + let v = parquet_variant::Variant::try_new(&metadata, &value)?; let output_string = variant_to_json_string(&v)?; assert_eq!(output_string, "{\"a\":1,\"爱\":\"अ\"}"); assert_eq!( - computed_value_slize, + value, &[2u8, 2u8, 1u8, 0u8, 4u8, 0u8, 6u8, 13u8, 0xe0u8, 0xa4u8, 0x85u8, 12u8, 1u8] ); assert_eq!( - computed_metadata_slize, + metadata, &[1u8, 2u8, 0u8, 3u8, 4u8, 0xe7u8, 0x88u8, 0xb1u8, 97u8] ); } From 0455685d523ff78e4ef52a8f4ff4c18cef56732b Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Fri, 27 Jun 2025 15:19:46 -0700 Subject: [PATCH 21/35] fix --- parquet-variant/src/from_json.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/parquet-variant/src/from_json.rs b/parquet-variant/src/from_json.rs index 5963390c838f..e62f0fa0830f 100644 --- a/parquet-variant/src/from_json.rs +++ b/parquet-variant/src/from_json.rs @@ -29,7 +29,7 @@ use serde_json::{Number, Value}; /// # Arguments /// * `json` - The JSON string to parse as Variant. /// * `variant_builder` - Object of type `VariantBuilder` used to build the vatiant from the JSON -/// string +/// string /// /// # Returns /// @@ -63,9 +63,9 @@ use serde_json::{Number, Value}; /// assert_eq!(json_result, serde_json::to_string(&json_value)?); /// # Ok::<(), Box>(()) /// ``` -pub fn json_to_variant<'a>( +pub fn json_to_variant( json: &str, - builder: &'a mut VariantBuilder, + builder: &mut VariantBuilder, ) -> Result<(), ArrowError> { let json: Value = serde_json::from_str(json) .map_err(|e| ArrowError::InvalidArgumentError(format!("JSON format error: {}", e)))?; From cc0b66ebd7129fd35cbf1c1d8bd98c4b7e7dcc81 Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Mon, 30 Jun 2025 12:19:54 -0700 Subject: [PATCH 22/35] fix --- parquet-variant/src/from_json.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet-variant/src/from_json.rs b/parquet-variant/src/from_json.rs index e62f0fa0830f..e5742bfe3e83 100644 --- a/parquet-variant/src/from_json.rs +++ b/parquet-variant/src/from_json.rs @@ -68,7 +68,7 @@ pub fn json_to_variant( builder: &mut VariantBuilder, ) -> Result<(), ArrowError> { let json: Value = serde_json::from_str(json) - .map_err(|e| ArrowError::InvalidArgumentError(format!("JSON format error: {}", e)))?; + .map_err(|e| ArrowError::InvalidArgumentError(format!("JSON format error: {e}")))?; build_json(&json, builder)?; Ok(()) From d2a751691d864ba68f88a8843b3546a9916301e9 Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Mon, 30 Jun 2025 15:56:13 -0700 Subject: [PATCH 23/35] resolved test comment --- .../examples/variant_from_json_examples.rs | 5 +- parquet-variant/src/from_json.rs | 5 +- parquet-variant/tests/test_json_to_variant.rs | 621 ++++++++++-------- 3 files changed, 356 insertions(+), 275 deletions(-) diff --git a/parquet-variant/examples/variant_from_json_examples.rs b/parquet-variant/examples/variant_from_json_examples.rs index 81dbc4629763..c2d35576caa5 100644 --- a/parquet-variant/examples/variant_from_json_examples.rs +++ b/parquet-variant/examples/variant_from_json_examples.rs @@ -31,10 +31,7 @@ fn main() -> Result<(), Box> { let (metadata, value) = variant_builder.finish(); - let variant = parquet_variant::Variant::try_new( - &metadata, - &value, - )?; + let variant = parquet_variant::Variant::try_new(&metadata, &value)?; let json_result = variant_to_json_string(&variant)?; let json_value = variant_to_json_value(&variant)?; diff --git a/parquet-variant/src/from_json.rs b/parquet-variant/src/from_json.rs index e5742bfe3e83..a8902d6b5be0 100644 --- a/parquet-variant/src/from_json.rs +++ b/parquet-variant/src/from_json.rs @@ -63,10 +63,7 @@ use serde_json::{Number, Value}; /// assert_eq!(json_result, serde_json::to_string(&json_value)?); /// # Ok::<(), Box>(()) /// ``` -pub fn json_to_variant( - json: &str, - builder: &mut VariantBuilder, -) -> Result<(), ArrowError> { +pub fn json_to_variant(json: &str, builder: &mut VariantBuilder) -> Result<(), ArrowError> { let json: Value = serde_json::from_str(json) .map_err(|e| ArrowError::InvalidArgumentError(format!("JSON format error: {e}")))?; diff --git a/parquet-variant/tests/test_json_to_variant.rs b/parquet-variant/tests/test_json_to_variant.rs index 17eb0590e3dd..e570efada5c5 100644 --- a/parquet-variant/tests/test_json_to_variant.rs +++ b/parquet-variant/tests/test_json_to_variant.rs @@ -19,313 +19,356 @@ use arrow_schema::ArrowError; use parquet_variant::{ - json_to_variant, variant_to_json_string, VariantBuilder, + json_to_variant, variant_to_json_string, ShortString, Variant, VariantBuilder, + VariantDecimal16, VariantDecimal4, VariantDecimal8, }; #[test] fn test_json_to_variant() -> Result<(), ArrowError> { - fn compare_results( - json: &str, - expected_value: &[u8], - expected_metadata: &[u8], - ) -> Result<(), ArrowError> { - let mut variant_builder = VariantBuilder::new(); - json_to_variant(json, &mut variant_builder)?; - let (metadata, value) = variant_builder.finish(); - assert_eq!(&metadata, expected_metadata); - assert_eq!(&value, expected_value); - Ok(()) + struct JsonToVariantTest<'a> { + json: &'a str, + expected: Variant<'a, 'a>, + } + + impl<'a> JsonToVariantTest<'a> { + fn run(self) -> Result<(), ArrowError> { + let mut variant_builder = VariantBuilder::new(); + json_to_variant(self.json, &mut variant_builder)?; + let (metadata, value) = variant_builder.finish(); + let variant = Variant::try_new(&metadata, &value)?; + assert_eq!(variant, self.expected); + Ok(()) + } } - let empty_metadata: &[u8] = &[1u8, 0u8, 0u8]; // Null - compare_results("null", &[0u8], empty_metadata)?; + JsonToVariantTest { + json: "null", + expected: Variant::Null, + } + .run()?; // Bool - compare_results("true", &[4u8], empty_metadata)?; - compare_results("false", &[8u8], empty_metadata)?; + JsonToVariantTest { + json: "true", + expected: Variant::BooleanTrue, + } + .run()?; + JsonToVariantTest { + json: "false", + expected: Variant::BooleanFalse, + } + .run()?; // Integers - compare_results(" 127 ", &[12u8, 127u8], empty_metadata)?; - compare_results(" -128 ", &[12u8, 128u8], empty_metadata)?; - compare_results(" 27134 ", &[16u8, 254u8, 105u8], empty_metadata)?; - compare_results( - " -32767431 ", - &[20u8, 57u8, 2u8, 12u8, 254u8], - empty_metadata, - )?; - compare_results( - "92842754201389", - &[24u8, 45u8, 87u8, 98u8, 163u8, 112u8, 84u8, 0u8, 0u8], - empty_metadata, - )?; + JsonToVariantTest { + json: " 127 ", + expected: Variant::Int8(127), + } + .run()?; + JsonToVariantTest { + json: " -128 ", + expected: Variant::Int8(-128), + } + .run()?; + JsonToVariantTest { + json: " 27134 ", + expected: Variant::Int16(27134), + } + .run()?; + JsonToVariantTest { + json: " -32767431 ", + expected: Variant::Int32(-32767431), + } + .run()?; + JsonToVariantTest { + json: "92842754201389", + expected: Variant::Int64(92842754201389), + } + .run()?; // Decimals // Decimal 4 - compare_results("1.23", &[32u8, 2u8, 123u8, 0u8, 0u8, 0u8], empty_metadata)?; - compare_results( - "99999999.9", - &[32u8, 1u8, 0xffu8, 0xc9u8, 0x9au8, 0x3bu8], - empty_metadata, - )?; - compare_results( - "-99999999.9", - &[32u8, 1u8, 1u8, 0x36u8, 0x65u8, 0xc4u8], - empty_metadata, - )?; - compare_results( - "0.999999999", - &[32u8, 9u8, 0xffu8, 0xc9u8, 0x9au8, 0x3bu8], - empty_metadata, - )?; - compare_results("0.000000001", &[32u8, 9u8, 1u8, 0, 0, 0], empty_metadata)?; - compare_results( - "-0.999999999", - &[32u8, 9u8, 1u8, 0x36u8, 0x65u8, 0xc4u8], - empty_metadata, - )?; - compare_results( - "-0.000000001", - &[32u8, 9u8, 0xffu8, 0xffu8, 0xffu8, 0xffu8], - empty_metadata, - )?; + JsonToVariantTest { + json: "1.23", + expected: Variant::from(VariantDecimal4::try_new(123, 2)?), + } + .run()?; + JsonToVariantTest { + json: "99999999.9", + expected: Variant::from(VariantDecimal4::try_new(999999999, 1)?), + } + .run()?; + JsonToVariantTest { + json: "-99999999.9", + expected: Variant::from(VariantDecimal4::try_new(-999999999, 1)?), + } + .run()?; + JsonToVariantTest { + json: "0.999999999", + expected: Variant::from(VariantDecimal4::try_new(999999999, 9)?), + } + .run()?; + JsonToVariantTest { + json: "0.000000001", + expected: Variant::from(VariantDecimal4::try_new(1, 9)?), + } + .run()?; + JsonToVariantTest { + json: "-0.999999999", + expected: Variant::from(VariantDecimal4::try_new(-999999999, 9)?), + } + .run()?; // Decimal 8 - compare_results( - "999999999.0", - &[36u8, 1u8, 0xf6u8, 0xe3u8, 0x0bu8, 0x54u8, 0x02u8, 0, 0, 0], - empty_metadata, - )?; - compare_results( - "-999999999.0", - &[ - 36u8, 1u8, 0x0au8, 0x1cu8, 0xf4u8, 0xabu8, 0xfdu8, 0xffu8, 0xffu8, 0xffu8, - ], - empty_metadata, - )?; - compare_results( - "0.999999999999999999", - &[ - 36u8, 18u8, 0xffu8, 0xffu8, 0x63u8, 0xa7u8, 0xb3u8, 0xb6u8, 0xe0u8, 0x0du8, - ], - empty_metadata, - )?; - compare_results( - "-9999999999999999.99", - &[ - 36u8, 2u8, 0x01u8, 0x00u8, 0x9cu8, 0x58u8, 0x4cu8, 0x49u8, 0x1fu8, 0xf2u8, - ], - empty_metadata, - )?; + JsonToVariantTest { + json: "999999999.0", + expected: Variant::from(VariantDecimal8::try_new(9999999990, 1)?), + } + .run()?; + JsonToVariantTest { + json: "-999999999.0", + expected: Variant::from(VariantDecimal8::try_new(-9999999990, 1)?), + } + .run()?; + JsonToVariantTest { + json: "0.999999999999999999", + expected: Variant::from(VariantDecimal8::try_new(999999999999999999, 18)?), + } + .run()?; + JsonToVariantTest { + json: "9999999999999999.99", + expected: Variant::from(VariantDecimal8::try_new(999999999999999999, 2)?), + } + .run()?; + JsonToVariantTest { + json: "-9999999999999999.99", + expected: Variant::from(VariantDecimal8::try_new(-999999999999999999, 2)?), + } + .run()?; // Decimal 16 - compare_results( - "9999999999999999999", // integer larger than i64 - &[ - 40u8, 0u8, 0xffu8, 0xffu8, 0xe7u8, 0x89u8, 4u8, 0x23u8, 0xc7u8, 0x8au8, 0u8, 0u8, 0u8, - 0u8, 0u8, 0u8, 0u8, 0u8, - ], - empty_metadata, - )?; - compare_results( - "0.9999999999999999999", - &[ - 40u8, 19u8, 0xffu8, 0xffu8, 0xe7u8, 0x89u8, 4u8, 0x23u8, 0xc7u8, 0x8au8, 0u8, 0u8, 0u8, - 0u8, 0u8, 0u8, 0u8, 0u8, - ], - empty_metadata, - )?; - compare_results( - "79228162514264337593543950335", // 2 ^ 96 - 1 - &[ - 40u8, 0u8, 0xffu8, 0xffu8, 0xffu8, 0xffu8, 0xffu8, 0xffu8, 0xffu8, 0xffu8, 0xffu8, - 0xffu8, 0xffu8, 0xffu8, 0u8, 0u8, 0u8, 0u8, - ], - empty_metadata, - )?; - compare_results( - "7.9228162514264337593543950335", // using scale higher than this falls into double + JsonToVariantTest { + json: "9999999999999999999", // integer larger than i64 + expected: Variant::from(VariantDecimal16::try_new(9999999999999999999, 0)?), + } + .run()?; + JsonToVariantTest { + json: "0.9999999999999999999", + expected: Variant::from(VariantDecimal16::try_new(9999999999999999999, 19)?), + } + .run()?; + JsonToVariantTest { + json: "79228162514264337593543950335", // 2 ^ 96 - 1 + expected: Variant::from(VariantDecimal16::try_new(79228162514264337593543950335, 0)?), + } + .run()?; + JsonToVariantTest { + json: "7.9228162514264337593543950335", // using scale higher than this falls into double // since the max scale is 28. - &[ - 40u8, 28u8, 0xffu8, 0xffu8, 0xffu8, 0xffu8, 0xffu8, 0xffu8, 0xffu8, 0xffu8, 0xffu8, - 0xffu8, 0xffu8, 0xffu8, 0u8, 0u8, 0u8, 0u8, - ], - empty_metadata, - )?; + expected: Variant::from(VariantDecimal16::try_new( + 79228162514264337593543950335, + 28, + )?), + } + .run()?; // Double - { - let mut arr = [28u8; 9]; - arr[1..].copy_from_slice(&0.79228162514264337593543950335f64.to_le_bytes()); - compare_results("0.79228162514264337593543950335", &arr, empty_metadata)?; - } - compare_results( - "15e-1", - &[28u8, 0, 0, 0, 0, 0, 0, 0xf8, 0x3fu8], - empty_metadata, - )?; - compare_results( - "-15e-1", - &[28u8, 0, 0, 0, 0, 0, 0, 0xf8, 0xBfu8], - empty_metadata, - )?; - + JsonToVariantTest { + json: "0.79228162514264337593543950335", + expected: Variant::Double(0.79228162514264337593543950335f64), + } + .run()?; + JsonToVariantTest { + json: "15e-1", + expected: Variant::Double(15e-1f64), + } + .run()?; + JsonToVariantTest { + json: "-15e-1", + expected: Variant::Double(-15e-1f64), + } + .run()?; // short strings // random short string - compare_results( - "\"harsh\"", - &[21u8, 104u8, 97u8, 114u8, 115u8, 104u8], - empty_metadata, - )?; + JsonToVariantTest { + json: "\"harsh\"", + expected: Variant::ShortString(ShortString::try_new("harsh")?), + } + .run()?; // longest short string - let mut expected = [97u8; 64]; - expected[0] = 253u8; - compare_results( - &format!( + JsonToVariantTest { + json: &format!( "\"{}\"", std::iter::repeat('a').take(63).collect::() ), - &expected, - empty_metadata, - )?; + expected: Variant::ShortString(ShortString::try_new(&format!( + "{}", + std::iter::repeat('a').take(63).collect::() + ))?), + } + .run()?; // long strings - let mut expected = [97u8; 69]; - expected[..5].copy_from_slice(&[64u8, 64u8, 0, 0, 0]); - compare_results( - &format!( + JsonToVariantTest { + json: &format!( "\"{}\"", std::iter::repeat('a').take(64).collect::() ), - &expected, - empty_metadata, - )?; - let mut expected = [98u8; 100005]; - expected[0] = 64u8; - expected[1..5].copy_from_slice(&(100000 as u32).to_le_bytes()); - compare_results( - &format!( + expected: Variant::String(&format!( + "{}", + std::iter::repeat('a').take(64).collect::() + )), + } + .run()?; + JsonToVariantTest { + json: &format!( "\"{}\"", std::iter::repeat('b').take(100000).collect::() ), - &expected, - empty_metadata, - )?; + expected: Variant::String(&format!( + "{}", + std::iter::repeat('b').take(100000).collect::() + )), + } + .run()?; // arrays // u8 offset - compare_results( - "[127, 128, -32767431]", - &[ - 3u8, 3u8, 0u8, 2u8, 5u8, 10u8, 12u8, 127u8, 16u8, 128u8, 0u8, 20u8, 57u8, 2u8, 12u8, - 254u8, - ], - empty_metadata, - )?; - compare_results( - "[[\"a\", null, true, 4], 128, false]", - &[ - 3u8, 3u8, 0u8, 13u8, 16u8, 17u8, 3u8, 4u8, 0u8, 2u8, 3u8, 4u8, 6u8, 5u8, 97u8, 0u8, - 4u8, 12u8, 4u8, 16u8, 128u8, 0u8, 8u8, - ], - empty_metadata, - )?; - compare_results( - "[{\"age\": 32}, 128, false]", - &[ - 3u8, 3u8, 0u8, 7u8, 10u8, 11u8, 2u8, 1u8, 0u8, 0u8, 2u8, 12u8, 32u8, 16u8, 128u8, 0u8, - 8u8, - ], - &[1u8, 1u8, 0u8, 3u8, 97u8, 103u8, 101u8], - )?; + { + let mut variant_builder = VariantBuilder::new(); + let mut list_builder = variant_builder.new_list(); + list_builder.append_value(Variant::Int8(127)); + list_builder.append_value(Variant::Int16(128)); + list_builder.append_value(Variant::Int32(-32767431)); + list_builder.finish(); + let (metadata, value) = variant_builder.finish(); + let variant = Variant::try_new(&metadata, &value)?; + + JsonToVariantTest { + json: "[127, 128, -32767431]", + expected: variant, + } + .run()?; + } + { + let mut variant_builder = VariantBuilder::new(); + let mut list_builder = variant_builder.new_list(); + let mut object_builder_inner = list_builder.new_object(); + object_builder_inner.insert("age", Variant::Int8(32)); + object_builder_inner.finish(); + list_builder.append_value(Variant::Int16(128)); + list_builder.append_value(Variant::BooleanFalse); + list_builder.finish(); + let (metadata, value) = variant_builder.finish(); + let variant = Variant::try_new(&metadata, &value)?; + + JsonToVariantTest { + json: "[{\"age\": 32}, 128, false]", + expected: variant, + } + .run()?; + } // u16 offset - 128 i8's + 1 "true" = 257 bytes - compare_results( - &format!( - "[{} true]", - std::iter::repeat("1, ").take(128).collect::() - ), - &[ - 7u8, 129u8, 0, 0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, - 0, 24, 0, 26, 0, 28, 0, 30, 0, 32, 0, 34, 0, 36, 0, 38, 0, 40, 0, 42, 0, 44, 0, 46, 0, - 48, 0, 50, 0, 52, 0, 54, 0, 56, 0, 58, 0, 60, 0, 62, 0, 64, 0, 66, 0, 68, 0, 70, 0, 72, - 0, 74, 0, 76, 0, 78, 0, 80, 0, 82, 0, 84, 0, 86, 0, 88, 0, 90, 0, 92, 0, 94, 0, 96, 0, - 98, 0, 100, 0, 102, 0, 104, 0, 106, 0, 108, 0, 110, 0, 112, 0, 114, 0, 116, 0, 118, 0, - 120, 0, 122, 0, 124, 0, 126, 0, 128, 0, 130, 0, 132, 0, 134, 0, 136, 0, 138, 0, 140, 0, - 142, 0, 144, 0, 146, 0, 148, 0, 150, 0, 152, 0, 154, 0, 156, 0, 158, 0, 160, 0, 162, 0, - 164, 0, 166, 0, 168, 0, 170, 0, 172, 0, 174, 0, 176, 0, 178, 0, 180, 0, 182, 0, 184, 0, - 186, 0, 188, 0, 190, 0, 192, 0, 194, 0, 196, 0, 198, 0, 200, 0, 202, 0, 204, 0, 206, 0, - 208, 0, 210, 0, 212, 0, 214, 0, 216, 0, 218, 0, 220, 0, 222, 0, 224, 0, 226, 0, 228, 0, - 230, 0, 232, 0, 234, 0, 236, 0, 238, 0, 240, 0, 242, 0, 244, 0, 246, 0, 248, 0, 250, 0, - 252, 0, 254, 0, 0, 1, 1, 1, // Final offset is 257 - 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, - 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, - 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, - 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, - 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, - 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, - 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, - 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, - 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, - 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, - 12, 1, 12, 1, 12, 1, 4u8, - ], - empty_metadata, - )?; + { + let mut variant_builder = VariantBuilder::new(); + let mut list_builder = variant_builder.new_list(); + for _ in 0..128 { + list_builder.append_value(Variant::Int8(1)); + } + list_builder.append_value(Variant::BooleanTrue); + list_builder.finish(); + let (metadata, value) = variant_builder.finish(); + let variant = Variant::try_new(&metadata, &value)?; + + JsonToVariantTest { + json: &format!( + "[{} true]", + std::iter::repeat("1, ").take(128).collect::() + ), + expected: variant, + } + .run()?; + } // verify u24, and large_size { - let null_array: [u8; 513] = std::array::from_fn(|i| match i { - 0 => 3u8, - 1 => 255u8, - j => { - if j <= 257 { - (j - 2) as u8 - } else { - 0u8 - } - } - }); - // 256 elements => large size - // each element is an array of 256 nulls => u24 offset - let mut whole_array: [u8; 5 + 3 * 257 + 256 * 513] = std::array::from_fn(|i| match i { - 0 => 0x1Bu8, - 1 => 0u8, - 2 => 1u8, - 3 => 0u8, - 4 => 0u8, - _ => 0, - }); - for i in 0..257 { - let cur_idx = 5 + i * 3 as usize; - let cur_offset: usize = i * 513; - whole_array[cur_idx..cur_idx + 3].copy_from_slice(&cur_offset.to_le_bytes()[..3]); - if i != 256 { - let abs_offset = 5 + 3 * 257 + cur_offset; - whole_array[abs_offset..abs_offset + 513].copy_from_slice(&null_array); + let mut variant_builder = VariantBuilder::new(); + let mut list_builder = variant_builder.new_list(); + for _ in 0..256 { + let mut list_builder_inner = list_builder.new_list(); + for _ in 0..255 { + list_builder_inner.append_value(Variant::Null); } + list_builder_inner.finish(); } + list_builder.finish(); + let (metadata, value) = variant_builder.finish(); + let variant = Variant::try_new(&metadata, &value)?; let intermediate = format!("[{}]", vec!["null"; 255].join(", ")); let json = format!("[{}]", vec![intermediate; 256].join(", ")); - compare_results(json.as_str(), &whole_array, empty_metadata)?; + JsonToVariantTest { + json: json.as_str(), + expected: variant, + } + .run()?; } - // // objects - compare_results( - "{\"b\": 2, \"a\": 1, \"a\": 3}", - &[2u8, 2u8, 1u8, 0u8, 2u8, 0u8, 4u8, 12u8, 2u8, 12u8, 3u8], - &[1, 2, 0, 1, 2, 98u8, 97u8], - )?; - - compare_results( - "{\"numbers\": [4, -3e0, 1.001], \"null\": null, \"booleans\": [true, false]}", - &[ - 2u8, 3u8, 2u8, 1u8, 0u8, 24u8, 23u8, 0u8, 31u8, 3u8, 3u8, 0u8, 2u8, 11u8, 17u8, 12u8, - 4u8, 28u8, 0, 0, 0, 0, 0, 0, 0x08, 0xc0, 32u8, 3, 0xe9, 0x03, 0, 0, 0, 3u8, 2u8, 0u8, - 1u8, 2u8, 4u8, 8u8, - ], - &[ - 1u8, 3u8, 0u8, 7u8, 11u8, 19u8, 0x6eu8, 0x75u8, 0x6du8, 0x62u8, 0x65u8, 0x72u8, 0x73u8, - 0x6eu8, 0x75u8, 0x6cu8, 0x6cu8, 0x62u8, 0x6fu8, 0x6fu8, 0x6cu8, 0x65u8, 0x61u8, 0x6eu8, - 0x73u8, - ], - )?; - + // objects + { + let mut variant_builder = VariantBuilder::new(); + let mut object_builder = variant_builder.new_object(); + object_builder.insert("b", Variant::Int8(2)); + object_builder.insert("a", Variant::Int8(3)); + object_builder.finish(); + let (metadata, value) = variant_builder.finish(); + let variant = Variant::try_new(&metadata, &value)?; + JsonToVariantTest { + json: "{\"b\": 2, \"a\": 1, \"a\": 3}", + expected: variant, + } + .run()?; + } + { + let mut variant_builder = VariantBuilder::new(); + let mut object_builder = variant_builder.new_object(); + let mut inner_list_builder = object_builder.new_list("numbers"); + inner_list_builder.append_value(Variant::Int8(4)); + inner_list_builder.append_value(Variant::Double(-3e0)); + inner_list_builder.append_value(Variant::Decimal4(VariantDecimal4::try_new(1001, 3)?)); + inner_list_builder.finish(); + object_builder.insert("null", Variant::Null); + let mut inner_list_builder = object_builder.new_list("booleans"); + inner_list_builder.append_value(Variant::BooleanTrue); + inner_list_builder.append_value(Variant::BooleanFalse); + inner_list_builder.finish(); + object_builder.finish(); + let (metadata, value) = variant_builder.finish(); + let variant = Variant::try_new(&metadata, &value)?; + JsonToVariantTest { + json: "{\"numbers\": [4, -3e0, 1.001], \"null\": null, \"booleans\": [true, false]}", + expected: variant, + } + .run()?; + } + { + let mut variant_builder = VariantBuilder::new(); + let mut object_builder = variant_builder.new_object(); + let mut inner_list_builder = object_builder.new_list("numbers"); + inner_list_builder.append_value(Variant::Int8(4)); + inner_list_builder.append_value(Variant::Double(-3e0)); + inner_list_builder.append_value(Variant::Decimal4(VariantDecimal4::try_new(1001, 3)?)); + inner_list_builder.finish(); + object_builder.insert("null", Variant::Null); + let mut inner_list_builder = object_builder.new_list("booleans"); + inner_list_builder.append_value(Variant::BooleanTrue); + inner_list_builder.append_value(Variant::BooleanFalse); + inner_list_builder.finish(); + object_builder.finish(); + let (metadata, value) = variant_builder.finish(); + let variant = Variant::try_new(&metadata, &value)?; + JsonToVariantTest { + json: "{\"numbers\": [4, -3e0, 1.001], \"null\": null, \"booleans\": [true, false]}", + expected: variant, + } + .run()?; + } { // 256 elements (keys: 000-255) - each element is an object of 256 elements (240-495) - each // element a list of numbers from 0-127 - let keys: Vec = (0..=255).map(|n| format!("\"{:03}\"", n)).collect(); + let keys: Vec = (0..=255).map(|n| format!("{:03}", n)).collect(); let innermost_list: String = format!( "[{}]", (0..=127) @@ -333,17 +376,25 @@ fn test_json_to_variant() -> Result<(), ArrowError> { .collect::>() .join(",") ); - let inner_keys: Vec = (240..=495).map(|n| format!("\"{}\"", n)).collect(); + let inner_keys: Vec = (240..=495).map(|n| format!("{}", n)).collect(); let inner_object = format!( "{{{}:{}}}", - inner_keys.join(format!(":{},", innermost_list).as_str()), + inner_keys + .iter() + .map(|k| format!("\"{}\"", k)) + .collect::>() + .join(format!(":{},", innermost_list).as_str()), innermost_list ); let json = format!( "{{{}:{}}}", - keys.join(format!(":{},", inner_object).as_str()), + keys.iter() + .map(|k| format!("\"{}\"", k)) + .collect::>() + .join(format!(":{},", inner_object).as_str()), inner_object ); + // Manually verify raw JSON value size let mut variant_builder = VariantBuilder::new(); json_to_variant(&json, &mut variant_builder)?; let (metadata, value) = variant_builder.finish(); @@ -357,6 +408,29 @@ fn test_json_to_variant() -> Result<(), ArrowError> { // Size of inner object: 1 + 4 + 256 + 257 * 3 + 256 * 516 = 133128 // Size of json: 1 + 4 + 512 + 1028 + 256 * 133128 = 34082313 assert_eq!(value.len(), 34082313); + + let mut variant_builder = VariantBuilder::new(); + let mut object_builder = variant_builder.new_object(); + keys.iter().for_each(|key| { + let mut inner_object_builder = object_builder.new_object(key); + inner_keys.iter().for_each(|inner_key| { + let mut list_builder = inner_object_builder.new_list(&inner_key); + for i in 0..=127 { + list_builder.append_value(Variant::Int8(i)); + } + list_builder.finish(); + }); + inner_object_builder.finish(); + }); + object_builder.finish(); + let (metadata, value) = variant_builder.finish(); + let variant = Variant::try_new(&metadata, &value)?; + + JsonToVariantTest { + json: &json, + expected: variant, + } + .run()?; } { let json = "{\"爱\":\"अ\",\"a\":1}"; @@ -366,6 +440,14 @@ fn test_json_to_variant() -> Result<(), ArrowError> { let v = parquet_variant::Variant::try_new(&metadata, &value)?; let output_string = variant_to_json_string(&v)?; assert_eq!(output_string, "{\"a\":1,\"爱\":\"अ\"}"); + let mut variant_builder = VariantBuilder::new(); + let mut object_builder = variant_builder.new_object(); + object_builder.insert("爱", Variant::ShortString(ShortString::try_new("अ")?)); + object_builder.insert("a", Variant::Int8(1)); + object_builder.finish(); + let (metadata, value) = variant_builder.finish(); + let variant = Variant::try_new(&metadata, &value)?; + assert_eq!( value, &[2u8, 2u8, 1u8, 0u8, 4u8, 0u8, 6u8, 13u8, 0xe0u8, 0xa4u8, 0x85u8, 12u8, 1u8] @@ -374,6 +456,11 @@ fn test_json_to_variant() -> Result<(), ArrowError> { metadata, &[1u8, 2u8, 0u8, 3u8, 4u8, 0xe7u8, 0x88u8, 0xb1u8, 97u8] ); + JsonToVariantTest { + json: &json, + expected: variant, + } + .run()?; } Ok(()) From a29b5c3aba59940a6f8980d0779c01ea972cf061 Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Mon, 30 Jun 2025 16:00:37 -0700 Subject: [PATCH 24/35] fixed more comments --- parquet-variant/src/from_json.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet-variant/src/from_json.rs b/parquet-variant/src/from_json.rs index a8902d6b5be0..740ae7cd5db6 100644 --- a/parquet-variant/src/from_json.rs +++ b/parquet-variant/src/from_json.rs @@ -23,7 +23,7 @@ use arrow_schema::ArrowError; use rust_decimal::prelude::*; use serde_json::{Number, Value}; -/// Converts a JSON string to Variant using `variant_builder`. The resulting `value` and `metadata` +/// Converts a JSON string to Variant using [`VariantBuilder`]. The resulting `value` and `metadata` /// buffers can be extracted using `builder.finish()` /// /// # Arguments From 7f23cf52f7cdb180236a4540f6c52d31499930c5 Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Mon, 30 Jun 2025 17:02:01 -0700 Subject: [PATCH 25/35] fix decimal to string --- parquet-variant/src/to_json.rs | 77 +++++++++----------------- parquet-variant/src/variant.rs | 2 +- parquet-variant/src/variant/decimal.rs | 36 ++++++++++++ 3 files changed, 64 insertions(+), 51 deletions(-) diff --git a/parquet-variant/src/to_json.rs b/parquet-variant/src/to_json.rs index 07ce7b83d1eb..7338caabd4f9 100644 --- a/parquet-variant/src/to_json.rs +++ b/parquet-variant/src/to_json.rs @@ -18,10 +18,11 @@ //! Module for converting Variant data to JSON format use arrow_schema::ArrowError; use base64::{engine::general_purpose, Engine as _}; -use serde_json::Value; +use rust_decimal::prelude::*; +use serde_json::{Number, Value}; use std::io::Write; -use crate::variant::{Variant, VariantList, VariantObject}; +use crate::{variant::{Variant, VariantList, VariantObject}, VariantDecimal}; // Format string constants to avoid duplication and reduce errors const DATE_FORMAT: &str = "%Y-%m-%d"; @@ -245,6 +246,27 @@ pub fn variant_to_json_string(variant: &Variant) -> Result { .map_err(|e| ArrowError::InvalidArgumentError(format!("UTF-8 conversion error: {e}"))) } +fn variant_decimal_to_json_value(decimal: &impl VariantDecimal) -> Result { + let integer = decimal.integer() as i128; + let scale = decimal.scale() as u32; + if scale == 0 { + return Ok(Value::from(integer)); + } + let divisor = 10_i128.pow(scale); + if integer % divisor != 0 { + // try decimal + if let Ok(dec) = Decimal::try_from_i128_with_scale(integer, scale) { + if let Ok(value) = serde_json::from_str::(&dec.to_string()) { + return Ok(serde_json::Value::Number(value)); + } + } + // fall back to floating point + return Ok(Value::from(integer as f64 / divisor as f64)); + } + // integer perfect division + Ok(Value::from(integer / divisor)) +} + /// Convert [`Variant`] to [`serde_json::Value`] /// /// This function converts a Variant to a [`serde_json::Value`], which is useful @@ -287,58 +309,13 @@ pub fn variant_to_json_value(variant: &Variant) -> Result { .map(Value::Number) .ok_or_else(|| ArrowError::InvalidArgumentError("Invalid double value".to_string())), Variant::Decimal4(decimal4) => { - let scale = decimal4.scale(); - let integer = decimal4.integer(); - - let integer = if scale == 0 { - integer - } else { - let divisor = 10_i32.pow(scale as u32); - if integer % divisor != 0 { - // fall back to floating point - return Ok(Value::from(integer as f64 / divisor as f64)); - } - integer / divisor - }; - Ok(Value::from(integer)) + variant_decimal_to_json_value(decimal4) } Variant::Decimal8(decimal8) => { - let scale = decimal8.scale(); - let integer = decimal8.integer(); - - let integer = if scale == 0 { - integer - } else { - let divisor = 10_i64.pow(scale as u32); - if integer % divisor != 0 { - // fall back to floating point - return Ok(Value::from(integer as f64 / divisor as f64)); - } - integer / divisor - }; - Ok(Value::from(integer)) + variant_decimal_to_json_value(decimal8) } Variant::Decimal16(decimal16) => { - let scale = decimal16.scale(); - let integer = decimal16.integer(); - - let integer = if scale == 0 { - integer - } else { - let divisor = 10_i128.pow(scale as u32); - if integer % divisor != 0 { - // fall back to floating point - return Ok(Value::from(integer as f64 / divisor as f64)); - } - integer / divisor - }; - // i128 has higher precision than any 64-bit type. Try a lossless narrowing cast to - // i64 or u64 first, falling back to a lossy narrowing cast to f64 if necessary. - let value = i64::try_from(integer) - .map(Value::from) - .or_else(|_| u64::try_from(integer).map(Value::from)) - .unwrap_or_else(|_| Value::from(integer as f64)); - Ok(value) + variant_decimal_to_json_value(decimal16) } Variant::Date(date) => Ok(Value::String(format_date_string(date))), Variant::TimestampMicros(ts) => Ok(Value::String(ts.to_rfc3339())), diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 3dcb08053a6b..04de5c3730ca 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -16,7 +16,7 @@ use std::ops::Deref; // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. -pub use self::decimal::{VariantDecimal16, VariantDecimal4, VariantDecimal8}; +pub use self::decimal::{VariantDecimal, VariantDecimal16, VariantDecimal4, VariantDecimal8}; pub use self::list::VariantList; pub use self::metadata::VariantMetadata; pub use self::object::VariantObject; diff --git a/parquet-variant/src/variant/decimal.rs b/parquet-variant/src/variant/decimal.rs index d80fd07d3b67..c5ccd327d1db 100644 --- a/parquet-variant/src/variant/decimal.rs +++ b/parquet-variant/src/variant/decimal.rs @@ -40,6 +40,12 @@ macro_rules! format_decimal { }}; } +// Common trait to classify VariantDecimalXXX types +pub trait VariantDecimal { + fn integer(&self) -> i128; + fn scale(&self) -> u32; +} + /// Represents a 4-byte decimal value in the Variant format. /// /// This struct stores a decimal number using a 32-bit signed integer for the coefficient @@ -101,6 +107,16 @@ impl VariantDecimal4 { } } +impl VariantDecimal for VariantDecimal4 { + fn integer(&self) -> i128 { + self.integer() as i128 + } + + fn scale(&self) -> u32 { + self.scale() as u32 + } +} + impl fmt::Display for VariantDecimal4 { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { format_decimal!(f, self.integer, self.scale, i32) @@ -169,6 +185,16 @@ impl VariantDecimal8 { } } +impl VariantDecimal for VariantDecimal8 { + fn integer(&self) -> i128 { + self.integer() as i128 + } + + fn scale(&self) -> u32 { + self.scale() as u32 + } +} + impl fmt::Display for VariantDecimal8 { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { format_decimal!(f, self.integer, self.scale, i64) @@ -237,6 +263,16 @@ impl VariantDecimal16 { } } +impl VariantDecimal for VariantDecimal16 { + fn integer(&self) -> i128 { + self.integer() as i128 + } + + fn scale(&self) -> u32 { + self.scale() as u32 + } +} + impl fmt::Display for VariantDecimal16 { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { format_decimal!(f, self.integer, self.scale, i128) From 50f4b2564fc7c7948c8c72f81c60496a018eb99e Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Mon, 30 Jun 2025 17:29:04 -0700 Subject: [PATCH 26/35] fmt and clippy --- parquet-variant/src/to_json.rs | 21 +++++++++------------ parquet-variant/src/variant/decimal.rs | 2 +- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/parquet-variant/src/to_json.rs b/parquet-variant/src/to_json.rs index 7338caabd4f9..616699664d91 100644 --- a/parquet-variant/src/to_json.rs +++ b/parquet-variant/src/to_json.rs @@ -22,7 +22,10 @@ use rust_decimal::prelude::*; use serde_json::{Number, Value}; use std::io::Write; -use crate::{variant::{Variant, VariantList, VariantObject}, VariantDecimal}; +use crate::{ + variant::{Variant, VariantList, VariantObject}, + VariantDecimal, +}; // Format string constants to avoid duplication and reduce errors const DATE_FORMAT: &str = "%Y-%m-%d"; @@ -247,8 +250,8 @@ pub fn variant_to_json_string(variant: &Variant) -> Result { } fn variant_decimal_to_json_value(decimal: &impl VariantDecimal) -> Result { - let integer = decimal.integer() as i128; - let scale = decimal.scale() as u32; + let integer = decimal.integer(); + let scale = decimal.scale(); if scale == 0 { return Ok(Value::from(integer)); } @@ -308,15 +311,9 @@ pub fn variant_to_json_value(variant: &Variant) -> Result { Variant::Double(f) => serde_json::Number::from_f64(*f) .map(Value::Number) .ok_or_else(|| ArrowError::InvalidArgumentError("Invalid double value".to_string())), - Variant::Decimal4(decimal4) => { - variant_decimal_to_json_value(decimal4) - } - Variant::Decimal8(decimal8) => { - variant_decimal_to_json_value(decimal8) - } - Variant::Decimal16(decimal16) => { - variant_decimal_to_json_value(decimal16) - } + Variant::Decimal4(decimal4) => variant_decimal_to_json_value(decimal4), + Variant::Decimal8(decimal8) => variant_decimal_to_json_value(decimal8), + Variant::Decimal16(decimal16) => variant_decimal_to_json_value(decimal16), Variant::Date(date) => Ok(Value::String(format_date_string(date))), Variant::TimestampMicros(ts) => Ok(Value::String(ts.to_rfc3339())), Variant::TimestampNtzMicros(ts) => Ok(Value::String(format_timestamp_ntz_string(ts))), diff --git a/parquet-variant/src/variant/decimal.rs b/parquet-variant/src/variant/decimal.rs index c5ccd327d1db..39c3311b03ec 100644 --- a/parquet-variant/src/variant/decimal.rs +++ b/parquet-variant/src/variant/decimal.rs @@ -265,7 +265,7 @@ impl VariantDecimal16 { impl VariantDecimal for VariantDecimal16 { fn integer(&self) -> i128 { - self.integer() as i128 + self.integer() } fn scale(&self) -> u32 { From 560e430ef0d96dd85f9a6802051d848f8ef15fe7 Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Mon, 30 Jun 2025 18:33:55 -0700 Subject: [PATCH 27/35] fix --- parquet-variant/src/builder.rs | 6 +++--- parquet-variant/src/from_json.rs | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index 443b4d159894..a1b16d36f395 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -682,7 +682,7 @@ impl<'a, 'b> ObjectBuilder<'a, 'b> { /// Trait that abstracts functionality from Variant fconstruction implementations, namely /// `VariantBuilder`, `ListBuilder` and `ObjectFieldBuilder` to minimize code duplication. -pub(crate) trait AppendVariantHelper { +pub(crate) trait VariantBuilderExt { fn append_value<'m, 'd, T: Into>>(&mut self, value: T); fn new_list(&mut self) -> ListBuilder; @@ -690,7 +690,7 @@ pub(crate) trait AppendVariantHelper { fn new_object(&mut self) -> ObjectBuilder; } -impl AppendVariantHelper for ListBuilder<'_> { +impl VariantBuilderExt for ListBuilder<'_> { fn append_value<'m, 'd, T: Into>>(&mut self, value: T) { self.append_value(value); } @@ -704,7 +704,7 @@ impl AppendVariantHelper for ListBuilder<'_> { } } -impl AppendVariantHelper for VariantBuilder { +impl VariantBuilderExt for VariantBuilder { fn append_value<'m, 'd, T: Into>>(&mut self, value: T) { self.append_value(value); } diff --git a/parquet-variant/src/from_json.rs b/parquet-variant/src/from_json.rs index 740ae7cd5db6..e07fd8b1c9af 100644 --- a/parquet-variant/src/from_json.rs +++ b/parquet-variant/src/from_json.rs @@ -18,7 +18,7 @@ //! Module for parsing JSON strings as Variant pub use crate::variant::{VariantDecimal4, VariantDecimal8}; -use crate::{AppendVariantHelper, ListBuilder, ObjectBuilder, Variant, VariantBuilder}; +use crate::{VariantBuilderExt, ListBuilder, ObjectBuilder, Variant, VariantBuilder}; use arrow_schema::ArrowError; use rust_decimal::prelude::*; use serde_json::{Number, Value}; @@ -122,7 +122,7 @@ fn variant_from_number<'a, 'b>(n: &Number) -> Result, ArrowError } } -fn append_json(json: &Value, builder: &mut impl AppendVariantHelper) -> Result<(), ArrowError> { +fn append_json(json: &Value, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { match json { Value::Null => builder.append_value(Variant::Null), Value::Bool(b) => builder.append_value(*b), @@ -157,7 +157,7 @@ struct ObjectFieldBuilder<'a, 'b, 'c> { builder: &'b mut ObjectBuilder<'c, 'a>, } -impl AppendVariantHelper for ObjectFieldBuilder<'_, '_, '_> { +impl VariantBuilderExt for ObjectFieldBuilder<'_, '_, '_> { fn append_value<'m, 'd, T: Into>>(&mut self, value: T) { self.builder.insert(self.key, value); } From 07d5688c8aa7aeae93d5648846b5d94cbeaaaa20 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 1 Jul 2025 13:36:50 -0400 Subject: [PATCH 28/35] clippy --- parquet-variant/tests/test_json_to_variant.rs | 71 +++++++------------ 1 file changed, 25 insertions(+), 46 deletions(-) diff --git a/parquet-variant/tests/test_json_to_variant.rs b/parquet-variant/tests/test_json_to_variant.rs index e570efada5c5..5c302c29218b 100644 --- a/parquet-variant/tests/test_json_to_variant.rs +++ b/parquet-variant/tests/test_json_to_variant.rs @@ -170,7 +170,7 @@ fn test_json_to_variant() -> Result<(), ArrowError> { // Double JsonToVariantTest { json: "0.79228162514264337593543950335", - expected: Variant::Double(0.79228162514264337593543950335f64), + expected: Variant::Double(0.792_281_625_142_643_3_f64), } .run()?; JsonToVariantTest { @@ -192,37 +192,19 @@ fn test_json_to_variant() -> Result<(), ArrowError> { .run()?; // longest short string JsonToVariantTest { - json: &format!( - "\"{}\"", - std::iter::repeat('a').take(63).collect::() - ), - expected: Variant::ShortString(ShortString::try_new(&format!( - "{}", - std::iter::repeat('a').take(63).collect::() - ))?), + json: &format!("\"{}\"", "a".repeat(63)), + expected: Variant::ShortString(ShortString::try_new(&"a".repeat(63))?), } .run()?; // long strings JsonToVariantTest { - json: &format!( - "\"{}\"", - std::iter::repeat('a').take(64).collect::() - ), - expected: Variant::String(&format!( - "{}", - std::iter::repeat('a').take(64).collect::() - )), + json: &format!("\"{}\"", "a".repeat(64)), + expected: Variant::String(&"a".repeat(64)), } .run()?; JsonToVariantTest { - json: &format!( - "\"{}\"", - std::iter::repeat('b').take(100000).collect::() - ), - expected: Variant::String(&format!( - "{}", - std::iter::repeat('b').take(100000).collect::() - )), + json: &format!("\"{}\"", "b".repeat(100000)), + expected: Variant::String(&"b".repeat(100000)), } .run()?; @@ -249,7 +231,7 @@ fn test_json_to_variant() -> Result<(), ArrowError> { let mut list_builder = variant_builder.new_list(); let mut object_builder_inner = list_builder.new_object(); object_builder_inner.insert("age", Variant::Int8(32)); - object_builder_inner.finish(); + object_builder_inner.finish().unwrap(); list_builder.append_value(Variant::Int16(128)); list_builder.append_value(Variant::BooleanFalse); list_builder.finish(); @@ -275,10 +257,7 @@ fn test_json_to_variant() -> Result<(), ArrowError> { let variant = Variant::try_new(&metadata, &value)?; JsonToVariantTest { - json: &format!( - "[{} true]", - std::iter::repeat("1, ").take(128).collect::() - ), + json: &format!("[{} true]", "1, ".repeat(128)), expected: variant, } .run()?; @@ -312,7 +291,7 @@ fn test_json_to_variant() -> Result<(), ArrowError> { let mut object_builder = variant_builder.new_object(); object_builder.insert("b", Variant::Int8(2)); object_builder.insert("a", Variant::Int8(3)); - object_builder.finish(); + object_builder.finish().unwrap(); let (metadata, value) = variant_builder.finish(); let variant = Variant::try_new(&metadata, &value)?; JsonToVariantTest { @@ -334,7 +313,7 @@ fn test_json_to_variant() -> Result<(), ArrowError> { inner_list_builder.append_value(Variant::BooleanTrue); inner_list_builder.append_value(Variant::BooleanFalse); inner_list_builder.finish(); - object_builder.finish(); + object_builder.finish().unwrap(); let (metadata, value) = variant_builder.finish(); let variant = Variant::try_new(&metadata, &value)?; JsonToVariantTest { @@ -356,7 +335,7 @@ fn test_json_to_variant() -> Result<(), ArrowError> { inner_list_builder.append_value(Variant::BooleanTrue); inner_list_builder.append_value(Variant::BooleanFalse); inner_list_builder.finish(); - object_builder.finish(); + object_builder.finish().unwrap(); let (metadata, value) = variant_builder.finish(); let variant = Variant::try_new(&metadata, &value)?; JsonToVariantTest { @@ -368,30 +347,30 @@ fn test_json_to_variant() -> Result<(), ArrowError> { { // 256 elements (keys: 000-255) - each element is an object of 256 elements (240-495) - each // element a list of numbers from 0-127 - let keys: Vec = (0..=255).map(|n| format!("{:03}", n)).collect(); + let keys: Vec = (0..=255).map(|n| format!("{n:03}")).collect(); let innermost_list: String = format!( "[{}]", (0..=127) - .map(|n| format!("{}", n)) + .map(|n| format!("{n}")) .collect::>() .join(",") ); - let inner_keys: Vec = (240..=495).map(|n| format!("{}", n)).collect(); + let inner_keys: Vec = (240..=495).map(|n| format!("{n}")).collect(); let inner_object = format!( "{{{}:{}}}", inner_keys .iter() - .map(|k| format!("\"{}\"", k)) + .map(|k| format!("\"{k}\"")) .collect::>() - .join(format!(":{},", innermost_list).as_str()), + .join(format!(":{innermost_list},").as_str()), innermost_list ); let json = format!( "{{{}:{}}}", keys.iter() - .map(|k| format!("\"{}\"", k)) + .map(|k| format!("\"{k}\"")) .collect::>() - .join(format!(":{},", inner_object).as_str()), + .join(format!(":{inner_object},").as_str()), inner_object ); // Manually verify raw JSON value size @@ -414,15 +393,15 @@ fn test_json_to_variant() -> Result<(), ArrowError> { keys.iter().for_each(|key| { let mut inner_object_builder = object_builder.new_object(key); inner_keys.iter().for_each(|inner_key| { - let mut list_builder = inner_object_builder.new_list(&inner_key); + let mut list_builder = inner_object_builder.new_list(inner_key); for i in 0..=127 { list_builder.append_value(Variant::Int8(i)); } list_builder.finish(); }); - inner_object_builder.finish(); + inner_object_builder.finish().unwrap(); }); - object_builder.finish(); + object_builder.finish().unwrap(); let (metadata, value) = variant_builder.finish(); let variant = Variant::try_new(&metadata, &value)?; @@ -435,7 +414,7 @@ fn test_json_to_variant() -> Result<(), ArrowError> { { let json = "{\"爱\":\"अ\",\"a\":1}"; let mut variant_builder = VariantBuilder::new(); - json_to_variant(&json, &mut variant_builder)?; + json_to_variant(json, &mut variant_builder)?; let (metadata, value) = variant_builder.finish(); let v = parquet_variant::Variant::try_new(&metadata, &value)?; let output_string = variant_to_json_string(&v)?; @@ -444,7 +423,7 @@ fn test_json_to_variant() -> Result<(), ArrowError> { let mut object_builder = variant_builder.new_object(); object_builder.insert("爱", Variant::ShortString(ShortString::try_new("अ")?)); object_builder.insert("a", Variant::Int8(1)); - object_builder.finish(); + object_builder.finish().unwrap(); let (metadata, value) = variant_builder.finish(); let variant = Variant::try_new(&metadata, &value)?; @@ -457,7 +436,7 @@ fn test_json_to_variant() -> Result<(), ArrowError> { &[1u8, 2u8, 0u8, 3u8, 4u8, 0xe7u8, 0x88u8, 0xb1u8, 97u8] ); JsonToVariantTest { - json: &json, + json, expected: variant, } .run()?; From af937acf82c8827aad16b53389ca41c41a54df5d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 1 Jul 2025 13:47:04 -0400 Subject: [PATCH 29/35] Split tests into separate functions --- parquet-variant/tests/test_json_to_variant.rs | 637 ++++++++++-------- 1 file changed, 364 insertions(+), 273 deletions(-) diff --git a/parquet-variant/tests/test_json_to_variant.rs b/parquet-variant/tests/test_json_to_variant.rs index 5c302c29218b..9204135d9dc9 100644 --- a/parquet-variant/tests/test_json_to_variant.rs +++ b/parquet-variant/tests/test_json_to_variant.rs @@ -23,141 +23,222 @@ use parquet_variant::{ VariantDecimal16, VariantDecimal4, VariantDecimal8, }; -#[test] -fn test_json_to_variant() -> Result<(), ArrowError> { - struct JsonToVariantTest<'a> { - json: &'a str, - expected: Variant<'a, 'a>, - } +struct JsonToVariantTest<'a> { + json: &'a str, + expected: Variant<'a, 'a>, +} - impl<'a> JsonToVariantTest<'a> { - fn run(self) -> Result<(), ArrowError> { - let mut variant_builder = VariantBuilder::new(); - json_to_variant(self.json, &mut variant_builder)?; - let (metadata, value) = variant_builder.finish(); - let variant = Variant::try_new(&metadata, &value)?; - assert_eq!(variant, self.expected); - Ok(()) - } +impl<'a> JsonToVariantTest<'a> { + fn run(self) -> Result<(), ArrowError> { + let mut variant_builder = VariantBuilder::new(); + json_to_variant(self.json, &mut variant_builder)?; + let (metadata, value) = variant_builder.finish(); + let variant = Variant::try_new(&metadata, &value)?; + assert_eq!(variant, self.expected); + Ok(()) } +} - // Null +#[test] +fn test_json_to_variant_null() -> Result<(), ArrowError> { JsonToVariantTest { json: "null", expected: Variant::Null, } - .run()?; - // Bool + .run() +} + +#[test] +fn test_json_to_variant_boolean_true() -> Result<(), ArrowError> { JsonToVariantTest { json: "true", expected: Variant::BooleanTrue, } - .run()?; + .run() +} + +#[test] +fn test_json_to_variant_boolean_false() -> Result<(), ArrowError> { JsonToVariantTest { json: "false", expected: Variant::BooleanFalse, } - .run()?; - // Integers + .run() +} + +#[test] +fn test_json_to_variant_int8_positive() -> Result<(), ArrowError> { JsonToVariantTest { json: " 127 ", expected: Variant::Int8(127), } - .run()?; + .run() +} + +#[test] +fn test_json_to_variant_int8_negative() -> Result<(), ArrowError> { JsonToVariantTest { json: " -128 ", expected: Variant::Int8(-128), } - .run()?; + .run() +} + +#[test] +fn test_json_to_variant_int16() -> Result<(), ArrowError> { JsonToVariantTest { json: " 27134 ", expected: Variant::Int16(27134), } - .run()?; + .run() +} + +#[test] +fn test_json_to_variant_int32() -> Result<(), ArrowError> { JsonToVariantTest { json: " -32767431 ", expected: Variant::Int32(-32767431), } - .run()?; + .run() +} + +#[test] +fn test_json_to_variant_int64() -> Result<(), ArrowError> { JsonToVariantTest { json: "92842754201389", expected: Variant::Int64(92842754201389), } - .run()?; - // Decimals - // Decimal 4 + .run() +} + +#[test] +fn test_json_to_variant_decimal4_basic() -> Result<(), ArrowError> { JsonToVariantTest { json: "1.23", expected: Variant::from(VariantDecimal4::try_new(123, 2)?), } - .run()?; + .run() +} + +#[test] +fn test_json_to_variant_decimal4_large_positive() -> Result<(), ArrowError> { JsonToVariantTest { json: "99999999.9", expected: Variant::from(VariantDecimal4::try_new(999999999, 1)?), } - .run()?; + .run() +} + +#[test] +fn test_json_to_variant_decimal4_large_negative() -> Result<(), ArrowError> { JsonToVariantTest { json: "-99999999.9", expected: Variant::from(VariantDecimal4::try_new(-999999999, 1)?), } - .run()?; + .run() +} + +#[test] +fn test_json_to_variant_decimal4_small_positive() -> Result<(), ArrowError> { JsonToVariantTest { json: "0.999999999", expected: Variant::from(VariantDecimal4::try_new(999999999, 9)?), } - .run()?; + .run() +} + +#[test] +fn test_json_to_variant_decimal4_tiny_positive() -> Result<(), ArrowError> { JsonToVariantTest { json: "0.000000001", expected: Variant::from(VariantDecimal4::try_new(1, 9)?), } - .run()?; + .run() +} + +#[test] +fn test_json_to_variant_decimal4_small_negative() -> Result<(), ArrowError> { JsonToVariantTest { json: "-0.999999999", expected: Variant::from(VariantDecimal4::try_new(-999999999, 9)?), } - .run()?; - // Decimal 8 + .run() +} + +#[test] +fn test_json_to_variant_decimal8_positive() -> Result<(), ArrowError> { JsonToVariantTest { json: "999999999.0", expected: Variant::from(VariantDecimal8::try_new(9999999990, 1)?), } - .run()?; + .run() +} + +#[test] +fn test_json_to_variant_decimal8_negative() -> Result<(), ArrowError> { JsonToVariantTest { json: "-999999999.0", expected: Variant::from(VariantDecimal8::try_new(-9999999990, 1)?), } - .run()?; + .run() +} + +#[test] +fn test_json_to_variant_decimal8_high_precision() -> Result<(), ArrowError> { JsonToVariantTest { json: "0.999999999999999999", expected: Variant::from(VariantDecimal8::try_new(999999999999999999, 18)?), } - .run()?; + .run() +} + +#[test] +fn test_json_to_variant_decimal8_large_with_scale() -> Result<(), ArrowError> { JsonToVariantTest { json: "9999999999999999.99", expected: Variant::from(VariantDecimal8::try_new(999999999999999999, 2)?), } - .run()?; + .run() +} + +#[test] +fn test_json_to_variant_decimal8_large_negative_with_scale() -> Result<(), ArrowError> { JsonToVariantTest { json: "-9999999999999999.99", expected: Variant::from(VariantDecimal8::try_new(-999999999999999999, 2)?), } - .run()?; - // Decimal 16 + .run() +} + +#[test] +fn test_json_to_variant_decimal16_large_integer() -> Result<(), ArrowError> { JsonToVariantTest { json: "9999999999999999999", // integer larger than i64 expected: Variant::from(VariantDecimal16::try_new(9999999999999999999, 0)?), } - .run()?; + .run() +} + +#[test] +fn test_json_to_variant_decimal16_high_precision() -> Result<(), ArrowError> { JsonToVariantTest { json: "0.9999999999999999999", expected: Variant::from(VariantDecimal16::try_new(9999999999999999999, 19)?), } - .run()?; + .run() +} + +#[test] +fn test_json_to_variant_decimal16_max_value() -> Result<(), ArrowError> { JsonToVariantTest { json: "79228162514264337593543950335", // 2 ^ 96 - 1 expected: Variant::from(VariantDecimal16::try_new(79228162514264337593543950335, 0)?), } - .run()?; + .run() +} + +#[test] +fn test_json_to_variant_decimal16_max_scale() -> Result<(), ArrowError> { JsonToVariantTest { json: "7.9228162514264337593543950335", // using scale higher than this falls into double // since the max scale is 28. @@ -166,281 +247,291 @@ fn test_json_to_variant() -> Result<(), ArrowError> { 28, )?), } - .run()?; - // Double + .run() +} + +#[test] +fn test_json_to_variant_double_precision() -> Result<(), ArrowError> { JsonToVariantTest { json: "0.79228162514264337593543950335", expected: Variant::Double(0.792_281_625_142_643_3_f64), } - .run()?; + .run() +} + +#[test] +fn test_json_to_variant_double_scientific_positive() -> Result<(), ArrowError> { JsonToVariantTest { json: "15e-1", expected: Variant::Double(15e-1f64), } - .run()?; + .run() +} + +#[test] +fn test_json_to_variant_double_scientific_negative() -> Result<(), ArrowError> { JsonToVariantTest { json: "-15e-1", expected: Variant::Double(-15e-1f64), } - .run()?; - // short strings - // random short string + .run() +} + +#[test] +fn test_json_to_variant_short_string() -> Result<(), ArrowError> { JsonToVariantTest { json: "\"harsh\"", expected: Variant::ShortString(ShortString::try_new("harsh")?), } - .run()?; - // longest short string + .run() +} + +#[test] +fn test_json_to_variant_short_string_max_length() -> Result<(), ArrowError> { JsonToVariantTest { json: &format!("\"{}\"", "a".repeat(63)), expected: Variant::ShortString(ShortString::try_new(&"a".repeat(63))?), } - .run()?; - // long strings + .run() +} + +#[test] +fn test_json_to_variant_long_string() -> Result<(), ArrowError> { JsonToVariantTest { json: &format!("\"{}\"", "a".repeat(64)), expected: Variant::String(&"a".repeat(64)), } - .run()?; + .run() +} + +#[test] +fn test_json_to_variant_very_long_string() -> Result<(), ArrowError> { JsonToVariantTest { json: &format!("\"{}\"", "b".repeat(100000)), expected: Variant::String(&"b".repeat(100000)), } - .run()?; + .run() +} - // arrays - // u8 offset - { - let mut variant_builder = VariantBuilder::new(); - let mut list_builder = variant_builder.new_list(); - list_builder.append_value(Variant::Int8(127)); - list_builder.append_value(Variant::Int16(128)); - list_builder.append_value(Variant::Int32(-32767431)); - list_builder.finish(); - let (metadata, value) = variant_builder.finish(); - let variant = Variant::try_new(&metadata, &value)?; +#[test] +fn test_json_to_variant_array_simple() -> Result<(), ArrowError> { + let mut variant_builder = VariantBuilder::new(); + let mut list_builder = variant_builder.new_list(); + list_builder.append_value(Variant::Int8(127)); + list_builder.append_value(Variant::Int16(128)); + list_builder.append_value(Variant::Int32(-32767431)); + list_builder.finish(); + let (metadata, value) = variant_builder.finish(); + let variant = Variant::try_new(&metadata, &value)?; - JsonToVariantTest { - json: "[127, 128, -32767431]", - expected: variant, - } - .run()?; + JsonToVariantTest { + json: "[127, 128, -32767431]", + expected: variant, } - { - let mut variant_builder = VariantBuilder::new(); - let mut list_builder = variant_builder.new_list(); - let mut object_builder_inner = list_builder.new_object(); - object_builder_inner.insert("age", Variant::Int8(32)); - object_builder_inner.finish().unwrap(); - list_builder.append_value(Variant::Int16(128)); - list_builder.append_value(Variant::BooleanFalse); - list_builder.finish(); - let (metadata, value) = variant_builder.finish(); - let variant = Variant::try_new(&metadata, &value)?; + .run() +} - JsonToVariantTest { - json: "[{\"age\": 32}, 128, false]", - expected: variant, - } - .run()?; +#[test] +fn test_json_to_variant_array_with_object() -> Result<(), ArrowError> { + let mut variant_builder = VariantBuilder::new(); + let mut list_builder = variant_builder.new_list(); + let mut object_builder_inner = list_builder.new_object(); + object_builder_inner.insert("age", Variant::Int8(32)); + object_builder_inner.finish().unwrap(); + list_builder.append_value(Variant::Int16(128)); + list_builder.append_value(Variant::BooleanFalse); + list_builder.finish(); + let (metadata, value) = variant_builder.finish(); + let variant = Variant::try_new(&metadata, &value)?; + + JsonToVariantTest { + json: "[{\"age\": 32}, 128, false]", + expected: variant, } + .run() +} + +#[test] +fn test_json_to_variant_array_large_u16_offset() -> Result<(), ArrowError> { // u16 offset - 128 i8's + 1 "true" = 257 bytes - { - let mut variant_builder = VariantBuilder::new(); - let mut list_builder = variant_builder.new_list(); - for _ in 0..128 { - list_builder.append_value(Variant::Int8(1)); - } - list_builder.append_value(Variant::BooleanTrue); - list_builder.finish(); - let (metadata, value) = variant_builder.finish(); - let variant = Variant::try_new(&metadata, &value)?; + let mut variant_builder = VariantBuilder::new(); + let mut list_builder = variant_builder.new_list(); + for _ in 0..128 { + list_builder.append_value(Variant::Int8(1)); + } + list_builder.append_value(Variant::BooleanTrue); + list_builder.finish(); + let (metadata, value) = variant_builder.finish(); + let variant = Variant::try_new(&metadata, &value)?; - JsonToVariantTest { - json: &format!("[{} true]", "1, ".repeat(128)), - expected: variant, - } - .run()?; + JsonToVariantTest { + json: &format!("[{} true]", "1, ".repeat(128)), + expected: variant, } + .run() +} + +#[test] +fn test_json_to_variant_array_nested_large() -> Result<(), ArrowError> { // verify u24, and large_size - { - let mut variant_builder = VariantBuilder::new(); - let mut list_builder = variant_builder.new_list(); - for _ in 0..256 { - let mut list_builder_inner = list_builder.new_list(); - for _ in 0..255 { - list_builder_inner.append_value(Variant::Null); - } - list_builder_inner.finish(); + let mut variant_builder = VariantBuilder::new(); + let mut list_builder = variant_builder.new_list(); + for _ in 0..256 { + let mut list_builder_inner = list_builder.new_list(); + for _ in 0..255 { + list_builder_inner.append_value(Variant::Null); } - list_builder.finish(); - let (metadata, value) = variant_builder.finish(); - let variant = Variant::try_new(&metadata, &value)?; - let intermediate = format!("[{}]", vec!["null"; 255].join(", ")); - let json = format!("[{}]", vec![intermediate; 256].join(", ")); - JsonToVariantTest { - json: json.as_str(), - expected: variant, - } - .run()?; + list_builder_inner.finish(); } + list_builder.finish(); + let (metadata, value) = variant_builder.finish(); + let variant = Variant::try_new(&metadata, &value)?; + let intermediate = format!("[{}]", vec!["null"; 255].join(", ")); + let json = format!("[{}]", vec![intermediate; 256].join(", ")); + JsonToVariantTest { + json: json.as_str(), + expected: variant, + } + .run() +} - // objects - { - let mut variant_builder = VariantBuilder::new(); - let mut object_builder = variant_builder.new_object(); - object_builder.insert("b", Variant::Int8(2)); - object_builder.insert("a", Variant::Int8(3)); - object_builder.finish().unwrap(); - let (metadata, value) = variant_builder.finish(); - let variant = Variant::try_new(&metadata, &value)?; - JsonToVariantTest { - json: "{\"b\": 2, \"a\": 1, \"a\": 3}", - expected: variant, - } - .run()?; +#[test] +fn test_json_to_variant_object_simple() -> Result<(), ArrowError> { + let mut variant_builder = VariantBuilder::new(); + let mut object_builder = variant_builder.new_object(); + object_builder.insert("b", Variant::Int8(2)); + object_builder.insert("a", Variant::Int8(3)); + object_builder.finish().unwrap(); + let (metadata, value) = variant_builder.finish(); + let variant = Variant::try_new(&metadata, &value)?; + JsonToVariantTest { + json: "{\"b\": 2, \"a\": 1, \"a\": 3}", + expected: variant, } - { - let mut variant_builder = VariantBuilder::new(); - let mut object_builder = variant_builder.new_object(); - let mut inner_list_builder = object_builder.new_list("numbers"); - inner_list_builder.append_value(Variant::Int8(4)); - inner_list_builder.append_value(Variant::Double(-3e0)); - inner_list_builder.append_value(Variant::Decimal4(VariantDecimal4::try_new(1001, 3)?)); - inner_list_builder.finish(); - object_builder.insert("null", Variant::Null); - let mut inner_list_builder = object_builder.new_list("booleans"); - inner_list_builder.append_value(Variant::BooleanTrue); - inner_list_builder.append_value(Variant::BooleanFalse); - inner_list_builder.finish(); - object_builder.finish().unwrap(); - let (metadata, value) = variant_builder.finish(); - let variant = Variant::try_new(&metadata, &value)?; - JsonToVariantTest { - json: "{\"numbers\": [4, -3e0, 1.001], \"null\": null, \"booleans\": [true, false]}", - expected: variant, - } - .run()?; + .run() +} + +#[test] +fn test_json_to_variant_object_complex() -> Result<(), ArrowError> { + let mut variant_builder = VariantBuilder::new(); + let mut object_builder = variant_builder.new_object(); + let mut inner_list_builder = object_builder.new_list("numbers"); + inner_list_builder.append_value(Variant::Int8(4)); + inner_list_builder.append_value(Variant::Double(-3e0)); + inner_list_builder.append_value(Variant::Decimal4(VariantDecimal4::try_new(1001, 3)?)); + inner_list_builder.finish(); + object_builder.insert("null", Variant::Null); + let mut inner_list_builder = object_builder.new_list("booleans"); + inner_list_builder.append_value(Variant::BooleanTrue); + inner_list_builder.append_value(Variant::BooleanFalse); + inner_list_builder.finish(); + object_builder.finish().unwrap(); + let (metadata, value) = variant_builder.finish(); + let variant = Variant::try_new(&metadata, &value)?; + JsonToVariantTest { + json: "{\"numbers\": [4, -3e0, 1.001], \"null\": null, \"booleans\": [true, false]}", + expected: variant, } - { - let mut variant_builder = VariantBuilder::new(); - let mut object_builder = variant_builder.new_object(); - let mut inner_list_builder = object_builder.new_list("numbers"); - inner_list_builder.append_value(Variant::Int8(4)); - inner_list_builder.append_value(Variant::Double(-3e0)); - inner_list_builder.append_value(Variant::Decimal4(VariantDecimal4::try_new(1001, 3)?)); - inner_list_builder.finish(); - object_builder.insert("null", Variant::Null); - let mut inner_list_builder = object_builder.new_list("booleans"); - inner_list_builder.append_value(Variant::BooleanTrue); - inner_list_builder.append_value(Variant::BooleanFalse); - inner_list_builder.finish(); - object_builder.finish().unwrap(); - let (metadata, value) = variant_builder.finish(); - let variant = Variant::try_new(&metadata, &value)?; - JsonToVariantTest { - json: "{\"numbers\": [4, -3e0, 1.001], \"null\": null, \"booleans\": [true, false]}", - expected: variant, - } - .run()?; - } - { - // 256 elements (keys: 000-255) - each element is an object of 256 elements (240-495) - each - // element a list of numbers from 0-127 - let keys: Vec = (0..=255).map(|n| format!("{n:03}")).collect(); - let innermost_list: String = format!( - "[{}]", - (0..=127) - .map(|n| format!("{n}")) - .collect::>() - .join(",") - ); - let inner_keys: Vec = (240..=495).map(|n| format!("{n}")).collect(); - let inner_object = format!( - "{{{}:{}}}", - inner_keys - .iter() - .map(|k| format!("\"{k}\"")) - .collect::>() - .join(format!(":{innermost_list},").as_str()), - innermost_list - ); - let json = format!( - "{{{}:{}}}", - keys.iter() - .map(|k| format!("\"{k}\"")) - .collect::>() - .join(format!(":{inner_object},").as_str()), - inner_object - ); - // Manually verify raw JSON value size - let mut variant_builder = VariantBuilder::new(); - json_to_variant(&json, &mut variant_builder)?; - let (metadata, value) = variant_builder.finish(); - let v = parquet_variant::Variant::try_new(&metadata, &value)?; - let output_string = variant_to_json_string(&v)?; - assert_eq!(output_string, json); - // Verify metadata size = 1 + 2 + 2 * 497 + 3 * 496 - assert_eq!(metadata.len(), 2485); - // Verify value size. - // Size of innermost_list: 1 + 1 + 258 + 256 = 516 - // Size of inner object: 1 + 4 + 256 + 257 * 3 + 256 * 516 = 133128 - // Size of json: 1 + 4 + 512 + 1028 + 256 * 133128 = 34082313 - assert_eq!(value.len(), 34082313); + .run() +} - let mut variant_builder = VariantBuilder::new(); - let mut object_builder = variant_builder.new_object(); - keys.iter().for_each(|key| { - let mut inner_object_builder = object_builder.new_object(key); - inner_keys.iter().for_each(|inner_key| { - let mut list_builder = inner_object_builder.new_list(inner_key); - for i in 0..=127 { - list_builder.append_value(Variant::Int8(i)); - } - list_builder.finish(); - }); - inner_object_builder.finish().unwrap(); +#[test] +fn test_json_to_variant_object_very_large() -> Result<(), ArrowError> { + // 256 elements (keys: 000-255) - each element is an object of 256 elements (240-495) - each + // element a list of numbers from 0-127 + let keys: Vec = (0..=255).map(|n| format!("{n:03}")).collect(); + let innermost_list: String = format!( + "[{}]", + (0..=127) + .map(|n| format!("{n}")) + .collect::>() + .join(",") + ); + let inner_keys: Vec = (240..=495).map(|n| format!("{n}")).collect(); + let inner_object = format!( + "{{{}:{}}}", + inner_keys + .iter() + .map(|k| format!("\"{k}\"")) + .collect::>() + .join(format!(":{innermost_list},").as_str()), + innermost_list + ); + let json = format!( + "{{{}:{}}}", + keys.iter() + .map(|k| format!("\"{k}\"")) + .collect::>() + .join(format!(":{inner_object},").as_str()), + inner_object + ); + // Manually verify raw JSON value size + let mut variant_builder = VariantBuilder::new(); + json_to_variant(&json, &mut variant_builder)?; + let (metadata, value) = variant_builder.finish(); + let v = parquet_variant::Variant::try_new(&metadata, &value)?; + let output_string = variant_to_json_string(&v)?; + assert_eq!(output_string, json); + // Verify metadata size = 1 + 2 + 2 * 497 + 3 * 496 + assert_eq!(metadata.len(), 2485); + // Verify value size. + // Size of innermost_list: 1 + 1 + 258 + 256 = 516 + // Size of inner object: 1 + 4 + 256 + 257 * 3 + 256 * 516 = 133128 + // Size of json: 1 + 4 + 512 + 1028 + 256 * 133128 = 34082313 + assert_eq!(value.len(), 34082313); + + let mut variant_builder = VariantBuilder::new(); + let mut object_builder = variant_builder.new_object(); + keys.iter().for_each(|key| { + let mut inner_object_builder = object_builder.new_object(key); + inner_keys.iter().for_each(|inner_key| { + let mut list_builder = inner_object_builder.new_list(inner_key); + for i in 0..=127 { + list_builder.append_value(Variant::Int8(i)); + } + list_builder.finish(); }); - object_builder.finish().unwrap(); - let (metadata, value) = variant_builder.finish(); - let variant = Variant::try_new(&metadata, &value)?; + inner_object_builder.finish().unwrap(); + }); + object_builder.finish().unwrap(); + let (metadata, value) = variant_builder.finish(); + let variant = Variant::try_new(&metadata, &value)?; - JsonToVariantTest { - json: &json, - expected: variant, - } - .run()?; + JsonToVariantTest { + json: &json, + expected: variant, } - { - let json = "{\"爱\":\"अ\",\"a\":1}"; - let mut variant_builder = VariantBuilder::new(); - json_to_variant(json, &mut variant_builder)?; - let (metadata, value) = variant_builder.finish(); - let v = parquet_variant::Variant::try_new(&metadata, &value)?; - let output_string = variant_to_json_string(&v)?; - assert_eq!(output_string, "{\"a\":1,\"爱\":\"अ\"}"); - let mut variant_builder = VariantBuilder::new(); - let mut object_builder = variant_builder.new_object(); - object_builder.insert("爱", Variant::ShortString(ShortString::try_new("अ")?)); - object_builder.insert("a", Variant::Int8(1)); - object_builder.finish().unwrap(); - let (metadata, value) = variant_builder.finish(); - let variant = Variant::try_new(&metadata, &value)?; + .run() +} - assert_eq!( - value, - &[2u8, 2u8, 1u8, 0u8, 4u8, 0u8, 6u8, 13u8, 0xe0u8, 0xa4u8, 0x85u8, 12u8, 1u8] - ); - assert_eq!( - metadata, - &[1u8, 2u8, 0u8, 3u8, 4u8, 0xe7u8, 0x88u8, 0xb1u8, 97u8] - ); - JsonToVariantTest { - json, - expected: variant, - } - .run()?; - } +#[test] +fn test_json_to_variant_unicode() -> Result<(), ArrowError> { + let json = "{\"爱\":\"अ\",\"a\":1}"; + let mut variant_builder = VariantBuilder::new(); + json_to_variant(json, &mut variant_builder)?; + let (metadata, value) = variant_builder.finish(); + let v = parquet_variant::Variant::try_new(&metadata, &value)?; + let output_string = variant_to_json_string(&v)?; + assert_eq!(output_string, "{\"a\":1,\"爱\":\"अ\"}"); + let mut variant_builder = VariantBuilder::new(); + let mut object_builder = variant_builder.new_object(); + object_builder.insert("爱", Variant::ShortString(ShortString::try_new("अ")?)); + object_builder.insert("a", Variant::Int8(1)); + object_builder.finish().unwrap(); + let (metadata, value) = variant_builder.finish(); + let variant = Variant::try_new(&metadata, &value)?; - Ok(()) + assert_eq!( + value, + &[2u8, 2u8, 1u8, 0u8, 4u8, 0u8, 6u8, 13u8, 0xe0u8, 0xa4u8, 0x85u8, 12u8, 1u8] + ); + assert_eq!( + metadata, + &[1u8, 2u8, 0u8, 3u8, 4u8, 0xe7u8, 0x88u8, 0xb1u8, 97u8] + ); + JsonToVariantTest { + json, + expected: variant, + } + .run() } From 388f188676e2960736044fc1fe88ca25e4b4fdff Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Wed, 2 Jul 2025 13:33:07 -0700 Subject: [PATCH 30/35] partially removed decimal dependency --- parquet-variant/Cargo.toml | 2 +- parquet-variant/src/from_json.rs | 39 +++--------- parquet-variant/src/to_json.rs | 86 +++++++++++++++++--------- parquet-variant/src/variant.rs | 2 +- parquet-variant/src/variant/decimal.rs | 36 ----------- 5 files changed, 67 insertions(+), 98 deletions(-) diff --git a/parquet-variant/Cargo.toml b/parquet-variant/Cargo.toml index 270e77367361..f1b609f1ffbf 100644 --- a/parquet-variant/Cargo.toml +++ b/parquet-variant/Cargo.toml @@ -37,7 +37,7 @@ arrow-schema = { workspace = true } chrono = { workspace = true } # "arbitrary_precision" allows us to manually parse numbers. "preserve_order" does not automatically # sort object keys -serde_json = { version = "1.0", features = ["arbitrary_precision", "preserve_order"] } +serde_json = { version = "1.0" } rust_decimal = "1.37.2" base64 = "0.22" diff --git a/parquet-variant/src/from_json.rs b/parquet-variant/src/from_json.rs index e07fd8b1c9af..5dde0f981f41 100644 --- a/parquet-variant/src/from_json.rs +++ b/parquet-variant/src/from_json.rs @@ -89,36 +89,15 @@ fn variant_from_number<'a, 'b>(n: &Number) -> Result, ArrowError Ok(i.into()) } } else { - // Try decimal - // TODO: Replace with custom decimal parsing as the rust_decimal library only supports - // a max unscaled value of 2^96. - match Decimal::from_str_exact(n.as_str()) { - Ok(dec) => { - let unscaled: i128 = dec.mantissa(); - let scale = dec.scale() as u8; - if unscaled.abs() <= VariantDecimal4::MAX_UNSCALED_VALUE as i128 - && scale <= VariantDecimal4::MAX_PRECISION as u8 - { - (unscaled as i32, scale).try_into() - } else if unscaled.abs() <= VariantDecimal8::MAX_UNSCALED_VALUE as i128 - && scale <= VariantDecimal8::MAX_PRECISION as u8 - { - (unscaled as i64, scale).try_into() - } else { - (unscaled, scale).try_into() - } - } - Err(_) => { - // Try double - match n.as_f64() { - Some(f) => return Ok(f.into()), - None => Err(ArrowError::InvalidArgumentError(format!( - "Failed to parse {} as number", - n.as_str() - ))), - }? - } - } + // Todo: Try decimal once we implement custom JSON parsing where we have access to strings + // Try double - currently json_to_variant does not produce decimal + match n.as_f64() { + Some(f) => return Ok(f.into()), + None => Err(ArrowError::InvalidArgumentError(format!( + "Failed to parse {} as number", + n.to_string() + ))), + }? } } diff --git a/parquet-variant/src/to_json.rs b/parquet-variant/src/to_json.rs index 616699664d91..07ce7b83d1eb 100644 --- a/parquet-variant/src/to_json.rs +++ b/parquet-variant/src/to_json.rs @@ -18,14 +18,10 @@ //! Module for converting Variant data to JSON format use arrow_schema::ArrowError; use base64::{engine::general_purpose, Engine as _}; -use rust_decimal::prelude::*; -use serde_json::{Number, Value}; +use serde_json::Value; use std::io::Write; -use crate::{ - variant::{Variant, VariantList, VariantObject}, - VariantDecimal, -}; +use crate::variant::{Variant, VariantList, VariantObject}; // Format string constants to avoid duplication and reduce errors const DATE_FORMAT: &str = "%Y-%m-%d"; @@ -249,27 +245,6 @@ pub fn variant_to_json_string(variant: &Variant) -> Result { .map_err(|e| ArrowError::InvalidArgumentError(format!("UTF-8 conversion error: {e}"))) } -fn variant_decimal_to_json_value(decimal: &impl VariantDecimal) -> Result { - let integer = decimal.integer(); - let scale = decimal.scale(); - if scale == 0 { - return Ok(Value::from(integer)); - } - let divisor = 10_i128.pow(scale); - if integer % divisor != 0 { - // try decimal - if let Ok(dec) = Decimal::try_from_i128_with_scale(integer, scale) { - if let Ok(value) = serde_json::from_str::(&dec.to_string()) { - return Ok(serde_json::Value::Number(value)); - } - } - // fall back to floating point - return Ok(Value::from(integer as f64 / divisor as f64)); - } - // integer perfect division - Ok(Value::from(integer / divisor)) -} - /// Convert [`Variant`] to [`serde_json::Value`] /// /// This function converts a Variant to a [`serde_json::Value`], which is useful @@ -311,9 +286,60 @@ pub fn variant_to_json_value(variant: &Variant) -> Result { Variant::Double(f) => serde_json::Number::from_f64(*f) .map(Value::Number) .ok_or_else(|| ArrowError::InvalidArgumentError("Invalid double value".to_string())), - Variant::Decimal4(decimal4) => variant_decimal_to_json_value(decimal4), - Variant::Decimal8(decimal8) => variant_decimal_to_json_value(decimal8), - Variant::Decimal16(decimal16) => variant_decimal_to_json_value(decimal16), + Variant::Decimal4(decimal4) => { + let scale = decimal4.scale(); + let integer = decimal4.integer(); + + let integer = if scale == 0 { + integer + } else { + let divisor = 10_i32.pow(scale as u32); + if integer % divisor != 0 { + // fall back to floating point + return Ok(Value::from(integer as f64 / divisor as f64)); + } + integer / divisor + }; + Ok(Value::from(integer)) + } + Variant::Decimal8(decimal8) => { + let scale = decimal8.scale(); + let integer = decimal8.integer(); + + let integer = if scale == 0 { + integer + } else { + let divisor = 10_i64.pow(scale as u32); + if integer % divisor != 0 { + // fall back to floating point + return Ok(Value::from(integer as f64 / divisor as f64)); + } + integer / divisor + }; + Ok(Value::from(integer)) + } + Variant::Decimal16(decimal16) => { + let scale = decimal16.scale(); + let integer = decimal16.integer(); + + let integer = if scale == 0 { + integer + } else { + let divisor = 10_i128.pow(scale as u32); + if integer % divisor != 0 { + // fall back to floating point + return Ok(Value::from(integer as f64 / divisor as f64)); + } + integer / divisor + }; + // i128 has higher precision than any 64-bit type. Try a lossless narrowing cast to + // i64 or u64 first, falling back to a lossy narrowing cast to f64 if necessary. + let value = i64::try_from(integer) + .map(Value::from) + .or_else(|_| u64::try_from(integer).map(Value::from)) + .unwrap_or_else(|_| Value::from(integer as f64)); + Ok(value) + } Variant::Date(date) => Ok(Value::String(format_date_string(date))), Variant::TimestampMicros(ts) => Ok(Value::String(ts.to_rfc3339())), Variant::TimestampNtzMicros(ts) => Ok(Value::String(format_timestamp_ntz_string(ts))), diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 04de5c3730ca..3dcb08053a6b 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -16,7 +16,7 @@ use std::ops::Deref; // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. -pub use self::decimal::{VariantDecimal, VariantDecimal16, VariantDecimal4, VariantDecimal8}; +pub use self::decimal::{VariantDecimal16, VariantDecimal4, VariantDecimal8}; pub use self::list::VariantList; pub use self::metadata::VariantMetadata; pub use self::object::VariantObject; diff --git a/parquet-variant/src/variant/decimal.rs b/parquet-variant/src/variant/decimal.rs index 39c3311b03ec..d80fd07d3b67 100644 --- a/parquet-variant/src/variant/decimal.rs +++ b/parquet-variant/src/variant/decimal.rs @@ -40,12 +40,6 @@ macro_rules! format_decimal { }}; } -// Common trait to classify VariantDecimalXXX types -pub trait VariantDecimal { - fn integer(&self) -> i128; - fn scale(&self) -> u32; -} - /// Represents a 4-byte decimal value in the Variant format. /// /// This struct stores a decimal number using a 32-bit signed integer for the coefficient @@ -107,16 +101,6 @@ impl VariantDecimal4 { } } -impl VariantDecimal for VariantDecimal4 { - fn integer(&self) -> i128 { - self.integer() as i128 - } - - fn scale(&self) -> u32 { - self.scale() as u32 - } -} - impl fmt::Display for VariantDecimal4 { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { format_decimal!(f, self.integer, self.scale, i32) @@ -185,16 +169,6 @@ impl VariantDecimal8 { } } -impl VariantDecimal for VariantDecimal8 { - fn integer(&self) -> i128 { - self.integer() as i128 - } - - fn scale(&self) -> u32 { - self.scale() as u32 - } -} - impl fmt::Display for VariantDecimal8 { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { format_decimal!(f, self.integer, self.scale, i64) @@ -263,16 +237,6 @@ impl VariantDecimal16 { } } -impl VariantDecimal for VariantDecimal16 { - fn integer(&self) -> i128 { - self.integer() - } - - fn scale(&self) -> u32 { - self.scale() as u32 - } -} - impl fmt::Display for VariantDecimal16 { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { format_decimal!(f, self.integer, self.scale, i128) From 3b42d911a1295d6206a30126251b1002af8d180f Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Wed, 2 Jul 2025 14:28:18 -0700 Subject: [PATCH 31/35] removed decimal type from variant json parsing --- parquet-variant/src/from_json.rs | 2 - parquet-variant/tests/test_json_to_variant.rs | 39 +++++++++++++------ 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/parquet-variant/src/from_json.rs b/parquet-variant/src/from_json.rs index 546cb07ebcb1..0ec5e5466526 100644 --- a/parquet-variant/src/from_json.rs +++ b/parquet-variant/src/from_json.rs @@ -17,10 +17,8 @@ //! Module for parsing JSON strings as Variant -pub use crate::variant::{VariantDecimal4, VariantDecimal8}; use crate::{ListBuilder, ObjectBuilder, Variant, VariantBuilder, VariantBuilderExt}; use arrow_schema::ArrowError; -use rust_decimal::prelude::*; use serde_json::{Number, Value}; /// Converts a JSON string to Variant using [`VariantBuilder`]. The resulting `value` and `metadata` diff --git a/parquet-variant/tests/test_json_to_variant.rs b/parquet-variant/tests/test_json_to_variant.rs index 9204135d9dc9..fd6056d02d9c 100644 --- a/parquet-variant/tests/test_json_to_variant.rs +++ b/parquet-variant/tests/test_json_to_variant.rs @@ -111,6 +111,7 @@ fn test_json_to_variant_int64() -> Result<(), ArrowError> { .run() } +#[ignore] #[test] fn test_json_to_variant_decimal4_basic() -> Result<(), ArrowError> { JsonToVariantTest { @@ -120,6 +121,7 @@ fn test_json_to_variant_decimal4_basic() -> Result<(), ArrowError> { .run() } +#[ignore] #[test] fn test_json_to_variant_decimal4_large_positive() -> Result<(), ArrowError> { JsonToVariantTest { @@ -129,6 +131,7 @@ fn test_json_to_variant_decimal4_large_positive() -> Result<(), ArrowError> { .run() } +#[ignore] #[test] fn test_json_to_variant_decimal4_large_negative() -> Result<(), ArrowError> { JsonToVariantTest { @@ -138,6 +141,7 @@ fn test_json_to_variant_decimal4_large_negative() -> Result<(), ArrowError> { .run() } +#[ignore] #[test] fn test_json_to_variant_decimal4_small_positive() -> Result<(), ArrowError> { JsonToVariantTest { @@ -147,6 +151,7 @@ fn test_json_to_variant_decimal4_small_positive() -> Result<(), ArrowError> { .run() } +#[ignore] #[test] fn test_json_to_variant_decimal4_tiny_positive() -> Result<(), ArrowError> { JsonToVariantTest { @@ -156,6 +161,7 @@ fn test_json_to_variant_decimal4_tiny_positive() -> Result<(), ArrowError> { .run() } +#[ignore] #[test] fn test_json_to_variant_decimal4_small_negative() -> Result<(), ArrowError> { JsonToVariantTest { @@ -165,6 +171,7 @@ fn test_json_to_variant_decimal4_small_negative() -> Result<(), ArrowError> { .run() } +#[ignore] #[test] fn test_json_to_variant_decimal8_positive() -> Result<(), ArrowError> { JsonToVariantTest { @@ -174,6 +181,7 @@ fn test_json_to_variant_decimal8_positive() -> Result<(), ArrowError> { .run() } +#[ignore] #[test] fn test_json_to_variant_decimal8_negative() -> Result<(), ArrowError> { JsonToVariantTest { @@ -183,6 +191,7 @@ fn test_json_to_variant_decimal8_negative() -> Result<(), ArrowError> { .run() } +#[ignore] #[test] fn test_json_to_variant_decimal8_high_precision() -> Result<(), ArrowError> { JsonToVariantTest { @@ -192,6 +201,7 @@ fn test_json_to_variant_decimal8_high_precision() -> Result<(), ArrowError> { .run() } +#[ignore] #[test] fn test_json_to_variant_decimal8_large_with_scale() -> Result<(), ArrowError> { JsonToVariantTest { @@ -201,6 +211,7 @@ fn test_json_to_variant_decimal8_large_with_scale() -> Result<(), ArrowError> { .run() } +#[ignore] #[test] fn test_json_to_variant_decimal8_large_negative_with_scale() -> Result<(), ArrowError> { JsonToVariantTest { @@ -210,6 +221,7 @@ fn test_json_to_variant_decimal8_large_negative_with_scale() -> Result<(), Arrow .run() } +#[ignore] #[test] fn test_json_to_variant_decimal16_large_integer() -> Result<(), ArrowError> { JsonToVariantTest { @@ -219,6 +231,7 @@ fn test_json_to_variant_decimal16_large_integer() -> Result<(), ArrowError> { .run() } +#[ignore] #[test] fn test_json_to_variant_decimal16_high_precision() -> Result<(), ArrowError> { JsonToVariantTest { @@ -228,6 +241,7 @@ fn test_json_to_variant_decimal16_high_precision() -> Result<(), ArrowError> { .run() } +#[ignore] #[test] fn test_json_to_variant_decimal16_max_value() -> Result<(), ArrowError> { JsonToVariantTest { @@ -237,6 +251,7 @@ fn test_json_to_variant_decimal16_max_value() -> Result<(), ArrowError> { .run() } +#[ignore] #[test] fn test_json_to_variant_decimal16_max_scale() -> Result<(), ArrowError> { JsonToVariantTest { @@ -254,7 +269,7 @@ fn test_json_to_variant_decimal16_max_scale() -> Result<(), ArrowError> { fn test_json_to_variant_double_precision() -> Result<(), ArrowError> { JsonToVariantTest { json: "0.79228162514264337593543950335", - expected: Variant::Double(0.792_281_625_142_643_3_f64), + expected: Variant::Double(0.792_281_625_142_643_4_f64), } .run() } @@ -399,8 +414,8 @@ fn test_json_to_variant_array_nested_large() -> Result<(), ArrowError> { fn test_json_to_variant_object_simple() -> Result<(), ArrowError> { let mut variant_builder = VariantBuilder::new(); let mut object_builder = variant_builder.new_object(); - object_builder.insert("b", Variant::Int8(2)); object_builder.insert("a", Variant::Int8(3)); + object_builder.insert("b", Variant::Int8(2)); object_builder.finish().unwrap(); let (metadata, value) = variant_builder.finish(); let variant = Variant::try_new(&metadata, &value)?; @@ -415,21 +430,21 @@ fn test_json_to_variant_object_simple() -> Result<(), ArrowError> { fn test_json_to_variant_object_complex() -> Result<(), ArrowError> { let mut variant_builder = VariantBuilder::new(); let mut object_builder = variant_builder.new_object(); - let mut inner_list_builder = object_builder.new_list("numbers"); - inner_list_builder.append_value(Variant::Int8(4)); - inner_list_builder.append_value(Variant::Double(-3e0)); - inner_list_builder.append_value(Variant::Decimal4(VariantDecimal4::try_new(1001, 3)?)); - inner_list_builder.finish(); - object_builder.insert("null", Variant::Null); let mut inner_list_builder = object_builder.new_list("booleans"); inner_list_builder.append_value(Variant::BooleanTrue); inner_list_builder.append_value(Variant::BooleanFalse); inner_list_builder.finish(); + object_builder.insert("null", Variant::Null); + let mut inner_list_builder = object_builder.new_list("numbers"); + inner_list_builder.append_value(Variant::Int8(4)); + inner_list_builder.append_value(Variant::Double(-3e0)); + inner_list_builder.append_value(Variant::Double(1001e-3)); + inner_list_builder.finish(); object_builder.finish().unwrap(); let (metadata, value) = variant_builder.finish(); let variant = Variant::try_new(&metadata, &value)?; JsonToVariantTest { - json: "{\"numbers\": [4, -3e0, 1.001], \"null\": null, \"booleans\": [true, false]}", + json: "{\"numbers\": [4, -3e0, 1001e-3], \"null\": null, \"booleans\": [true, false]}", expected: variant, } .run() @@ -515,19 +530,19 @@ fn test_json_to_variant_unicode() -> Result<(), ArrowError> { assert_eq!(output_string, "{\"a\":1,\"爱\":\"अ\"}"); let mut variant_builder = VariantBuilder::new(); let mut object_builder = variant_builder.new_object(); - object_builder.insert("爱", Variant::ShortString(ShortString::try_new("अ")?)); object_builder.insert("a", Variant::Int8(1)); + object_builder.insert("爱", Variant::ShortString(ShortString::try_new("अ")?)); object_builder.finish().unwrap(); let (metadata, value) = variant_builder.finish(); let variant = Variant::try_new(&metadata, &value)?; assert_eq!( value, - &[2u8, 2u8, 1u8, 0u8, 4u8, 0u8, 6u8, 13u8, 0xe0u8, 0xa4u8, 0x85u8, 12u8, 1u8] + &[2u8, 2u8, 0u8, 1u8, 0u8, 2u8, 6u8, 12u8, 1u8, 13u8, 0xe0u8, 0xa4u8, 0x85u8] ); assert_eq!( metadata, - &[1u8, 2u8, 0u8, 3u8, 4u8, 0xe7u8, 0x88u8, 0xb1u8, 97u8] + &[1u8, 2u8, 0u8, 1u8, 4u8, 97u8, 0xe7u8, 0x88u8, 0xb1u8] ); JsonToVariantTest { json, From 43d6ea53c2c79ce07c2b77f7fb28592473eea3e3 Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Wed, 2 Jul 2025 14:29:28 -0700 Subject: [PATCH 32/35] comment fix --- parquet-variant/Cargo.toml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/parquet-variant/Cargo.toml b/parquet-variant/Cargo.toml index e27c9c712d72..18ec0b4dc18a 100644 --- a/parquet-variant/Cargo.toml +++ b/parquet-variant/Cargo.toml @@ -35,9 +35,7 @@ rust-version = "1.83" [dependencies] arrow-schema = { workspace = true } chrono = { workspace = true } -# "arbitrary_precision" allows us to manually parse numbers. "preserve_order" does not automatically -# sort object keys -serde_json = { version = "1.0" } +serde_json = "1.0" rust_decimal = "1.37.2" base64 = "0.22" indexmap = "2.10.0" From e9deda95c0360cfb58f62011a70a2fad2260da91 Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Wed, 2 Jul 2025 18:14:57 -0700 Subject: [PATCH 33/35] refined lifetimes --- parquet-variant/src/builder.rs | 12 ++++++------ parquet-variant/src/from_json.rs | 19 +++++++++++-------- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index 021d38f9f327..fe3090f70b8f 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -758,16 +758,16 @@ impl<'a, 'b> ObjectBuilder<'a, 'b> { /// Trait that abstracts functionality from Variant fconstruction implementations, namely /// `VariantBuilder`, `ListBuilder` and `ObjectFieldBuilder` to minimize code duplication. -pub(crate) trait VariantBuilderExt { - fn append_value<'m, 'd, T: Into>>(&mut self, value: T); +pub(crate) trait VariantBuilderExt<'m, 'v> { + fn append_value(&mut self, value: impl Into>); fn new_list(&mut self) -> ListBuilder; fn new_object(&mut self) -> ObjectBuilder; } -impl VariantBuilderExt for ListBuilder<'_> { - fn append_value<'m, 'd, T: Into>>(&mut self, value: T) { +impl<'m, 'v> VariantBuilderExt<'m, 'v> for ListBuilder<'_> { + fn append_value(&mut self, value: impl Into>) { self.append_value(value); } @@ -780,8 +780,8 @@ impl VariantBuilderExt for ListBuilder<'_> { } } -impl VariantBuilderExt for VariantBuilder { - fn append_value<'m, 'd, T: Into>>(&mut self, value: T) { +impl<'m, 'v> VariantBuilderExt<'m, 'v> for VariantBuilder { + fn append_value(&mut self, value: impl Into>) { self.append_value(value); } diff --git a/parquet-variant/src/from_json.rs b/parquet-variant/src/from_json.rs index 0ec5e5466526..3528c315c9f7 100644 --- a/parquet-variant/src/from_json.rs +++ b/parquet-variant/src/from_json.rs @@ -74,7 +74,7 @@ fn build_json(json: &Value, builder: &mut VariantBuilder) -> Result<(), ArrowErr Ok(()) } -fn variant_from_number<'a, 'b>(n: &Number) -> Result, ArrowError> { +fn variant_from_number<'m, 'v>(n: &Number) -> Result, ArrowError> { if let Some(i) = n.as_i64() { // Find minimum Integer width to fit if i as i8 as i64 == i { @@ -93,13 +93,16 @@ fn variant_from_number<'a, 'b>(n: &Number) -> Result, ArrowError Some(f) => return Ok(f.into()), None => Err(ArrowError::InvalidArgumentError(format!( "Failed to parse {} as number", - n.to_string() + n ))), }? } } -fn append_json(json: &Value, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { +fn append_json<'m, 'v>( + json: &'v Value, + builder: &mut impl VariantBuilderExt<'m, 'v>, +) -> Result<(), ArrowError> { match json { Value::Null => builder.append_value(Variant::Null), Value::Bool(b) => builder.append_value(*b), @@ -129,13 +132,13 @@ fn append_json(json: &Value, builder: &mut impl VariantBuilderExt) -> Result<(), Ok(()) } -struct ObjectFieldBuilder<'a, 'b, 'c> { - key: &'a str, - builder: &'b mut ObjectBuilder<'c, 'a>, +struct ObjectFieldBuilder<'s, 'o, 'v> { + key: &'s str, + builder: &'o mut ObjectBuilder<'v, 's>, } -impl VariantBuilderExt for ObjectFieldBuilder<'_, '_, '_> { - fn append_value<'m, 'd, T: Into>>(&mut self, value: T) { +impl<'m, 'v> VariantBuilderExt<'m, 'v> for ObjectFieldBuilder<'_, '_, '_> { + fn append_value(&mut self, value: impl Into>) { self.builder.insert(self.key, value); } From eb11890d0e7a77f3d30c05a5f3f785d4354d63f1 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 3 Jul 2025 14:04:37 -0400 Subject: [PATCH 34/35] Fix clippy, remove unused dependency --- parquet-variant/Cargo.toml | 1 - parquet-variant/src/from_json.rs | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/parquet-variant/Cargo.toml b/parquet-variant/Cargo.toml index 18ec0b4dc18a..708b614cf4b7 100644 --- a/parquet-variant/Cargo.toml +++ b/parquet-variant/Cargo.toml @@ -36,7 +36,6 @@ rust-version = "1.83" arrow-schema = { workspace = true } chrono = { workspace = true } serde_json = "1.0" -rust_decimal = "1.37.2" base64 = "0.22" indexmap = "2.10.0" diff --git a/parquet-variant/src/from_json.rs b/parquet-variant/src/from_json.rs index 3528c315c9f7..c9e1ce75037b 100644 --- a/parquet-variant/src/from_json.rs +++ b/parquet-variant/src/from_json.rs @@ -92,8 +92,7 @@ fn variant_from_number<'m, 'v>(n: &Number) -> Result, ArrowError match n.as_f64() { Some(f) => return Ok(f.into()), None => Err(ArrowError::InvalidArgumentError(format!( - "Failed to parse {} as number", - n + "Failed to parse {n} as number", ))), }? } From ea5b5738786be03dde04deab7f5393e7e47e7bc4 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 3 Jul 2025 14:10:26 -0400 Subject: [PATCH 35/35] Update parquet-variant/src/from_json.rs Co-authored-by: Ryan Johnson --- parquet-variant/src/from_json.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet-variant/src/from_json.rs b/parquet-variant/src/from_json.rs index c9e1ce75037b..00d205f38584 100644 --- a/parquet-variant/src/from_json.rs +++ b/parquet-variant/src/from_json.rs @@ -125,7 +125,7 @@ fn append_json<'m, 'v>( }; append_json(value, &mut field_builder)?; } - obj_builder.finish().unwrap(); + obj_builder.finish()?; } }; Ok(())