From f278b9d84e8bdebe6c2fc5b17d28380e15d02f18 Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Mon, 9 Jun 2025 20:29:58 -0400 Subject: [PATCH 01/32] [CHANGE][ISSUE#7426] Add support to convert variant to JSON --- parquet-variant/Cargo.toml | 1 + parquet-variant/examples/variant_to_json.rs | 108 ++++ parquet-variant/src/encoder.rs | 20 + parquet-variant/src/encoder/json.rs | 306 +++++++++++ parquet-variant/src/lib.rs | 2 + parquet-variant/src/variant.rs | 557 +++++++++++++++++++- parquet-variant/tests/integration_test.rs | 136 +++++ 7 files changed, 1129 insertions(+), 1 deletion(-) create mode 100644 parquet-variant/examples/variant_to_json.rs create mode 100644 parquet-variant/src/encoder.rs create mode 100644 parquet-variant/src/encoder/json.rs create mode 100644 parquet-variant/tests/integration_test.rs diff --git a/parquet-variant/Cargo.toml b/parquet-variant/Cargo.toml index 0065121726ac..30545f8fa21a 100644 --- a/parquet-variant/Cargo.toml +++ b/parquet-variant/Cargo.toml @@ -35,5 +35,6 @@ rust-version = "1.83" [dependencies] arrow-schema = { workspace = true } chrono = { workspace = true } +serde_json = "1.0" [lib] diff --git a/parquet-variant/examples/variant_to_json.rs b/parquet-variant/examples/variant_to_json.rs new file mode 100644 index 000000000000..a3c339c85101 --- /dev/null +++ b/parquet-variant/examples/variant_to_json.rs @@ -0,0 +1,108 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Example showing how to convert Variant values to JSON + +use parquet_variant::{ + variant_to_json, variant_to_json_string, variant_to_json_value, Variant, +}; + +fn main() -> Result<(), Box> { + let variants = vec![ + ("Null", Variant::Null), + ("Boolean True", Variant::BooleanTrue), + ("Boolean False", Variant::BooleanFalse), + ("Integer 42", Variant::Int8(42)), + ("Negative Integer", Variant::Int8(-123)), + ("String", Variant::String("Hello, World!")), + ("Short String", Variant::ShortString("Hi!")), + ]; + + for (name, variant) in variants { + let json_string = variant_to_json_string(&variant)?; + println!(" {} -> {}", name, json_string); + } + + // Example 2: String escaping + println!("\nπŸ”€ 2. String Escaping:"); + + let special_string = Variant::String("Line 1\nLine 2\tTabbed\r\nWith \"quotes\" and \\backslashes"); + let escaped_json = variant_to_json_string(&special_string)?; + println!(" Original: Line 1\\nLine 2\\tTabbed\\r\\nWith \"quotes\" and \\\\backslashes"); + println!(" JSON: {}", escaped_json); + + // Example 3: Unicode support + println!("\n🌍 3. Unicode Support:"); + + let unicode_variants = vec![ + Variant::String("Hello δΈ–η•Œ 🌍"), + Variant::String("Emoji: πŸ’»"), + Variant::String("Math: Ξ± + Ξ² = Ξ³"), + ]; + + for variant in unicode_variants { + let json_string = variant_to_json_string(&variant)?; + println!(" {}", json_string); + } + + let test_variant = Variant::String("Buffer test"); + + // Write to Vec + let mut vec_buffer = Vec::new(); + variant_to_json(&mut vec_buffer, &test_variant)?; + println!(" Vec buffer: {}", String::from_utf8(vec_buffer)?); + + // Write to String (through write! macro) + let mut string_buffer = String::new(); + use std::fmt::Write; + write!(string_buffer, "Prefix: ")?; + + // Convert to bytes temporarily to use the Write trait + let mut temp_buffer = Vec::new(); + variant_to_json(&mut temp_buffer, &test_variant)?; + string_buffer.push_str(&String::from_utf8(temp_buffer)?); + println!(" String buffer: {}", string_buffer); + + let variants_for_value = vec![ + Variant::Null, + Variant::BooleanTrue, + Variant::Int8(100), + Variant::String("Value conversion"), + ]; + + for variant in variants_for_value { + let json_value = variant_to_json_value(&variant)?; + println!(" {:?} -> {:?}", variant, json_value); + } + + let start = std::time::Instant::now(); + for i in 0..1000 { + let variant = Variant::Int8((i % 128) as i8); + let _json = variant_to_json_string(&variant)?; + } + let duration = start.elapsed(); + println!(" Converted 1000 variants in {:?}", duration); + + // This would demonstrate error handling if we had invalid variants + // For now, all our examples work, so we'll just show the pattern + match variant_to_json_string(&Variant::String("Valid string")) { + Ok(json) => println!(" Success: {}", json), + Err(e) => println!(" Error: {}", e), + } + + Ok(()) +} \ No newline at end of file diff --git a/parquet-variant/src/encoder.rs b/parquet-variant/src/encoder.rs new file mode 100644 index 000000000000..d4ef77e9388b --- /dev/null +++ b/parquet-variant/src/encoder.rs @@ -0,0 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Encoder module for converting Variant values to other formats + +pub mod json; \ No newline at end of file diff --git a/parquet-variant/src/encoder/json.rs b/parquet-variant/src/encoder/json.rs new file mode 100644 index 000000000000..c41f394e1413 --- /dev/null +++ b/parquet-variant/src/encoder/json.rs @@ -0,0 +1,306 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Module for converting Variant data to JSON format + +use arrow_schema::ArrowError; +use serde_json::Value; +use std::io::Write; + +use crate::variant::{Variant, VariantArray, VariantObject}; + +/// Converts a Variant to JSON and writes it to the provided buffer +/// +/// # Arguments +/// +/// * `json_buffer` - Writer to output JSON to +/// * `variant` - The Variant value to convert +/// +/// # Returns +/// +/// * `Ok(())` if successful +/// * `Err` with error details if conversion fails +/// +/// # Example +/// +/// ```rust +/// use parquet_variant::{Variant, variant_to_json}; +/// +/// let variant = Variant::Int8(42); +/// let mut buffer = Vec::new(); +/// variant_to_json(&mut buffer, &variant)?; +/// assert_eq!(String::from_utf8(buffer)?, "42"); +/// ``` +pub fn variant_to_json( + json_buffer: &mut W, + variant: &Variant, +) -> Result<(), ArrowError> { + match variant { + Variant::Null => { + write!(json_buffer, "null")?; + } + Variant::BooleanTrue => { + write!(json_buffer, "true")?; + } + Variant::BooleanFalse => { + write!(json_buffer, "false")?; + } + Variant::Int8(i) => { + write!(json_buffer, "{}", i)?; + } + Variant::String(s) | Variant::ShortString(s) => { + // Use serde_json to properly escape the string + let json_str = serde_json::to_string(s) + .map_err(|e| ArrowError::InvalidArgumentError(format!("JSON encoding error: {}", e)))?; + write!(json_buffer, "{}", json_str)?; + } + Variant::Object(obj) => { + convert_object_to_json(json_buffer, obj)?; + } + Variant::Array(arr) => { + convert_array_to_json(json_buffer, arr)?; + } + } + Ok(()) +} + +/// Convert object fields to JSON +fn convert_object_to_json( + buffer: &mut W, + obj: &VariantObject, +) -> Result<(), ArrowError> { + write!(buffer, "{{")?; + + // Get all fields from the object + let fields = obj.fields()?; + let mut first = true; + + for (key, value) in fields { + if !first { + write!(buffer, ",")?; + } + first = false; + + // Write the key (properly escaped) + let json_key = serde_json::to_string(key) + .map_err(|e| ArrowError::InvalidArgumentError(format!("JSON key encoding error: {}", e)))?; + write!(buffer, "{}:", json_key)?; + + // Recursively convert the value + variant_to_json(buffer, &value)?; + } + + write!(buffer, "}}")?; + Ok(()) +} + +/// Convert array elements to JSON +fn convert_array_to_json( + buffer: &mut W, + arr: &VariantArray, +) -> Result<(), ArrowError> { + write!(buffer, "[")?; + + let len = arr.len(); + for i in 0..len { + if i > 0 { + write!(buffer, ",")?; + } + + let element = arr.get(i)?; + variant_to_json(buffer, &element)?; + } + + write!(buffer, "]")?; + Ok(()) +} + +/// Convert Variant to JSON string +/// +/// # Arguments +/// +/// * `variant` - The Variant value to convert +/// +/// # Returns +/// +/// * `Ok(String)` containing the JSON representation +/// * `Err` with error details if conversion fails +/// +/// # Example +/// +/// ```rust +/// use parquet_variant::{Variant, variant_to_json_string}; +/// +/// let variant = Variant::String("hello"); +/// let json = variant_to_json_string(&variant)?; +/// assert_eq!(json, "\"hello\""); +/// ``` +pub fn variant_to_json_string(variant: &Variant) -> Result { + let mut buffer = Vec::new(); + variant_to_json(&mut buffer, variant)?; + String::from_utf8(buffer) + .map_err(|e| ArrowError::InvalidArgumentError(format!("UTF-8 conversion error: {}", e))) +} + +/// Convert Variant to serde_json::Value +/// +/// # Arguments +/// +/// * `variant` - The Variant value to convert +/// +/// # Returns +/// +/// * `Ok(Value)` containing the JSON value +/// * `Err` with error details if conversion fails +/// +/// # Example +/// +/// ```rust +/// use parquet_variant::{Variant, variant_to_json_value}; +/// use serde_json::Value; +/// +/// let variant = Variant::Int8(42); +/// let json_value = variant_to_json_value(&variant)?; +/// assert_eq!(json_value, Value::Number(42.into())); +/// ``` +pub fn variant_to_json_value(variant: &Variant) -> Result { + match variant { + Variant::Null => Ok(Value::Null), + Variant::BooleanTrue => Ok(Value::Bool(true)), + Variant::BooleanFalse => Ok(Value::Bool(false)), + Variant::Int8(i) => Ok(Value::Number((*i).into())), + Variant::String(s) | Variant::ShortString(s) => Ok(Value::String(s.to_string())), + Variant::Object(obj) => { + let mut map = serde_json::Map::new(); + let fields = obj.fields()?; + + for (key, value) in fields { + let json_value = variant_to_json_value(&value)?; + map.insert(key.to_string(), json_value); + } + + Ok(Value::Object(map)) + } + Variant::Array(arr) => { + let mut vec = Vec::new(); + let len = arr.len(); + + for i in 0..len { + let element = arr.get(i)?; + let json_value = variant_to_json_value(&element)?; + vec.push(json_value); + } + + Ok(Value::Array(vec)) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::Variant; + + #[test] + fn test_null_to_json() -> Result<(), ArrowError> { + let variant = Variant::Null; + let json = variant_to_json_string(&variant)?; + assert_eq!(json, "null"); + + let json_value = variant_to_json_value(&variant)?; + assert_eq!(json_value, Value::Null); + Ok(()) + } + + #[test] + fn test_boolean_to_json() -> Result<(), ArrowError> { + let variant_true = Variant::BooleanTrue; + let json_true = variant_to_json_string(&variant_true)?; + assert_eq!(json_true, "true"); + + let variant_false = Variant::BooleanFalse; + let json_false = variant_to_json_string(&variant_false)?; + assert_eq!(json_false, "false"); + + let json_value_true = variant_to_json_value(&variant_true)?; + assert_eq!(json_value_true, Value::Bool(true)); + + let json_value_false = variant_to_json_value(&variant_false)?; + assert_eq!(json_value_false, Value::Bool(false)); + Ok(()) + } + + #[test] + fn test_int8_to_json() -> Result<(), ArrowError> { + let variant = Variant::Int8(42); + let json = variant_to_json_string(&variant)?; + assert_eq!(json, "42"); + + let json_value = variant_to_json_value(&variant)?; + assert_eq!(json_value, Value::Number(42.into())); + Ok(()) + } + + #[test] + fn test_string_to_json() -> Result<(), ArrowError> { + let variant = Variant::String("hello world"); + let json = variant_to_json_string(&variant)?; + assert_eq!(json, "\"hello world\""); + + let json_value = variant_to_json_value(&variant)?; + assert_eq!(json_value, Value::String("hello world".to_string())); + Ok(()) + } + + #[test] + fn test_short_string_to_json() -> Result<(), ArrowError> { + let variant = Variant::ShortString("short"); + let json = variant_to_json_string(&variant)?; + assert_eq!(json, "\"short\""); + + let json_value = variant_to_json_value(&variant)?; + assert_eq!(json_value, Value::String("short".to_string())); + Ok(()) + } + + #[test] + fn test_string_escaping() -> Result<(), ArrowError> { + let variant = Variant::String("hello\nworld\t\"quoted\""); + let json = variant_to_json_string(&variant)?; + assert_eq!(json, "\"hello\\nworld\\t\\\"quoted\\\"\""); + + let json_value = variant_to_json_value(&variant)?; + assert_eq!(json_value, Value::String("hello\nworld\t\"quoted\"".to_string())); + Ok(()) + } + + // TODO: Add tests for objects and arrays once the implementation is complete + // These will be added in the next steps when we implement the missing methods + // in VariantObject and VariantArray + + #[test] + fn test_json_buffer_writing() -> Result<(), ArrowError> { + let variant = Variant::Int8(123); + let mut buffer = Vec::new(); + variant_to_json(&mut buffer, &variant)?; + + let result = String::from_utf8(buffer) + .map_err(|e| ArrowError::InvalidArgumentError(e.to_string()))?; + assert_eq!(result, "123"); + Ok(()) + } +} \ No newline at end of file diff --git a/parquet-variant/src/lib.rs b/parquet-variant/src/lib.rs index 00a8a69aff99..fd5947aeec71 100644 --- a/parquet-variant/src/lib.rs +++ b/parquet-variant/src/lib.rs @@ -30,6 +30,7 @@ // TODO: dead code removal #[allow(dead_code)] mod decoder; +mod encoder; mod variant; // TODO: dead code removal mod builder; @@ -38,3 +39,4 @@ mod utils; pub use builder::*; pub use variant::*; +pub use encoder::json::{variant_to_json, variant_to_json_string, variant_to_json_value}; diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index da3fbd36fc2c..0b3e07598a5b 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -33,7 +33,562 @@ mod object; const MAX_SHORT_STRING_BYTES: usize = 0x3F; -/// A Variant [`ShortString`] + /// Return one unsigned little-endian value from `bytes`. + /// + /// * `bytes` – the Variant-metadata buffer. + /// * `byte_offset` – number of bytes to skip **before** reading the first + /// value (usually `1` to move past the header byte). + /// * `offset_index` – 0-based index **after** the skip + /// (`0` is the first value, `1` the next, …). + /// + /// Each value is `self as usize` bytes wide (1, 2, 3 or 4). + /// Three-byte values are zero-extended to 32 bits before the final + /// fallible cast to `usize`. + fn unpack_usize( + &self, + bytes: &[u8], + byte_offset: usize, // how many bytes to skip + offset_index: usize, // which offset in an array of offsets + ) -> Result { + use OffsetSizeBytes::*; + let offset = byte_offset + (*self as usize) * offset_index; + let result = match self { + One => u8::from_le_bytes(array_from_slice(bytes, offset)?).into(), + Two => u16::from_le_bytes(array_from_slice(bytes, offset)?).into(), + Three => { + // Let's grab the three byte le-chunk first + let b3_chunks: [u8; 3] = array_from_slice(bytes, offset)?; + // Let's pad it and construct a padded u32 from it. + let mut buf = [0u8; 4]; + buf[..3].copy_from_slice(&b3_chunks); + u32::from_le_bytes(buf) + .try_into() + .map_err(|e: TryFromIntError| ArrowError::InvalidArgumentError(e.to_string()))? + } + Four => u32::from_le_bytes(array_from_slice(bytes, offset)?) + .try_into() + .map_err(|e: TryFromIntError| ArrowError::InvalidArgumentError(e.to_string()))?, + }; + Ok(result) + } +} + +#[derive(Clone, Debug, Copy, PartialEq)] +pub(crate) struct VariantMetadataHeader { + version: u8, + is_sorted: bool, + /// Note: This is `offset_size_minus_one` + 1 + offset_size: OffsetSizeBytes, +} + +// According to the spec this is currently always = 1, and so we store this const for validation +// purposes and to make that visible. +const CORRECT_VERSION_VALUE: u8 = 1; + +impl VariantMetadataHeader { + /// Tries to construct the variant metadata header, which has the form + /// 7 6 5 4 3 0 + /// +-------+---+---+---------------+ + /// header | | | | version | + /// +-------+---+---+---------------+ + /// ^ ^ + /// | +-- sorted_strings + /// +-- offset_size_minus_one + /// The version is a 4-bit value that must always contain the value 1. + /// - sorted_strings is a 1-bit value indicating whether dictionary strings are sorted and unique. + /// - offset_size_minus_one is a 2-bit value providing the number of bytes per dictionary size and offset field. + /// - The actual number of bytes, offset_size, is offset_size_minus_one + 1 + pub(crate) fn try_new(bytes: &[u8]) -> Result { + let header = first_byte_from_slice(bytes)?; + + let version = header & 0x0F; // First four bits + if version != CORRECT_VERSION_VALUE { + let err_msg = format!( + "The version bytes in the header is not {CORRECT_VERSION_VALUE}, got {:b}", + version + ); + return Err(ArrowError::InvalidArgumentError(err_msg)); + } + let is_sorted = (header & 0x10) != 0; // Fifth bit + let offset_size_minus_one = header >> 6; // Last two bits + Ok(Self { + version, + is_sorted, + offset_size: OffsetSizeBytes::try_new(offset_size_minus_one)?, + }) + } +} + +#[derive(Clone, Copy, Debug, PartialEq)] +/// Encodes the Variant Metadata, see the Variant spec file for more information +pub struct VariantMetadata<'m> { + bytes: &'m [u8], + header: VariantMetadataHeader, + dict_size: usize, + dictionary_key_start_byte: usize, +} + +impl<'m> VariantMetadata<'m> { + /// View the raw bytes (needed by very low-level decoders) + #[inline] + pub const fn as_bytes(&self) -> &'m [u8] { + self.bytes + } + + pub fn try_new(bytes: &'m [u8]) -> Result { + let header = VariantMetadataHeader::try_new(bytes)?; + // Offset 1, index 0 because first element after header is dictionary size + let dict_size = header.offset_size.unpack_usize(bytes, 1, 0)?; + + // Check that we have the correct metadata length according to dictionary_size, or return + // error early. + // Minimum number of bytes the metadata buffer must contain: + // 1 byte header + // + offset_size-byte `dictionary_size` field + // + (dict_size + 1) offset entries, each `offset_size` bytes. (Table size, essentially) + // 1 + offset_size + (dict_size + 1) * offset_size + // = (dict_size + 2) * offset_size + 1 + let offset_size = header.offset_size as usize; // Cheap to copy + + let dictionary_key_start_byte = dict_size + .checked_add(2) + .and_then(|n| n.checked_mul(offset_size)) + .and_then(|n| n.checked_add(1)) + .ok_or_else(|| ArrowError::InvalidArgumentError("metadata length overflow".into()))?; + + if bytes.len() < dictionary_key_start_byte { + return Err(ArrowError::InvalidArgumentError( + "Metadata shorter than dictionary_size implies".to_string(), + )); + } + + // Check that all offsets are monotonically increasing + let mut offsets = (0..=dict_size).map(|i| header.offset_size.unpack_usize(bytes, 1, i + 1)); + let Some(Ok(mut end @ 0)) = offsets.next() else { + return Err(ArrowError::InvalidArgumentError( + "First offset is non-zero".to_string(), + )); + }; + + for offset in offsets { + let offset = offset?; + if end >= offset { + return Err(ArrowError::InvalidArgumentError( + "Offsets are not monotonically increasing".to_string(), + )); + } + end = offset; + } + + // Verify the buffer covers the whole dictionary-string section + if end > bytes.len() - dictionary_key_start_byte { + // `prev` holds the last offset seen still + return Err(ArrowError::InvalidArgumentError( + "Last offset does not equal dictionary length".to_string(), + )); + } + + Ok(Self { + bytes, + header, + dict_size, + dictionary_key_start_byte, + }) + } + + /// Whether the dictionary keys are sorted and unique + pub fn is_sorted(&self) -> bool { + self.header.is_sorted + } + + /// Get the dictionary size + pub fn dictionary_size(&self) -> usize { + self.dict_size + } + pub fn version(&self) -> u8 { + self.header.version + } + + /// Helper method to get the offset start and end range for a key by index. + fn get_offsets_for_key_by(&self, index: usize) -> Result, ArrowError> { + if index >= self.dict_size { + return Err(ArrowError::InvalidArgumentError(format!( + "Index {} out of bounds for dictionary of length {}", + index, self.dict_size + ))); + } + + // Skipping the header byte (setting byte_offset = 1) and the dictionary_size (setting offset_index +1) + let unpack = |i| self.header.offset_size.unpack_usize(self.bytes, 1, i + 1); + Ok(unpack(index)?..unpack(index + 1)?) + } + + /// Get a single offset by index + pub fn get_offset_by(&self, index: usize) -> Result { + if index >= self.dict_size { + return Err(ArrowError::InvalidArgumentError(format!( + "Index {} out of bounds for dictionary of length {}", + index, self.dict_size + ))); + } + + // Skipping the header byte (setting byte_offset = 1) and the dictionary_size (setting offset_index +1) + let unpack = |i| self.header.offset_size.unpack_usize(self.bytes, 1, i + 1); + unpack(index) + } + + /// Get the key-name by index + pub fn get_field_by(&self, index: usize) -> Result<&'m str, ArrowError> { + let offset_range = self.get_offsets_for_key_by(index)?; + self.get_field_by_offset(offset_range) + } + + /// Gets the field using an offset (Range) - helper method to keep consistent API. + pub(crate) fn get_field_by_offset(&self, offset: Range) -> Result<&'m str, ArrowError> { + let dictionary_keys_bytes = + slice_from_slice(self.bytes, self.dictionary_key_start_byte..self.bytes.len())?; + let result = string_from_slice(dictionary_keys_bytes, offset)?; + + Ok(result) + } + + #[allow(unused)] + pub(crate) fn header(&self) -> VariantMetadataHeader { + self.header + } + + /// Get the offsets as an iterator + pub fn offsets(&self) -> impl Iterator, ArrowError>> + 'm { + let offset_size = self.header.offset_size; // `Copy` + let bytes = self.bytes; + + (0..self.dict_size).map(move |i| { + // This wont be out of bounds as long as dict_size and offsets have been validated + // during construction via `try_new`, as it calls unpack_usize for the + // indices `1..dict_size+1` already. + let start = offset_size.unpack_usize(bytes, 1, i + 1); + let end = offset_size.unpack_usize(bytes, 1, i + 2); + + match (start, end) { + (Ok(s), Ok(e)) => Ok(s..e), + (Err(e), _) | (_, Err(e)) => Err(e), + } + }) + } + + /// Get all key-names as an Iterator of strings + pub fn fields( + &'m self, + ) -> Result>, ArrowError> { + let iterator = self + .offsets() + .map(move |offset_range| self.get_field_by_offset(offset_range?)); + Ok(iterator) + } +} + +#[derive(Clone, Debug, PartialEq)] +pub(crate) struct VariantObjectHeader { + field_offset_size: OffsetSizeBytes, + field_id_size: OffsetSizeBytes, + num_elements: usize, + field_ids_start_byte: usize, + field_offsets_start_byte: usize, + values_start_byte: usize, +} + +impl VariantObjectHeader { + pub(crate) fn try_new(value: &[u8]) -> Result { + // Parse the header byte to get object parameters + let header = first_byte_from_slice(value)?; + let value_header = header >> 2; + + let field_offset_size_minus_one = value_header & 0x03; // Last 2 bits + let field_id_size_minus_one = (value_header >> 2) & 0x03; // Next 2 bits + let is_large = value_header & 0x10; // 5th bit + + let field_offset_size = OffsetSizeBytes::try_new(field_offset_size_minus_one)?; + let field_id_size = OffsetSizeBytes::try_new(field_id_size_minus_one)?; + + // Determine num_elements size based on is_large flag + let num_elements_size = if is_large != 0 { + OffsetSizeBytes::Four + } else { + OffsetSizeBytes::One + }; + + // Parse num_elements + let num_elements = num_elements_size.unpack_usize(value, 1, 0)?; + + // Calculate byte offsets for different sections + let field_ids_start_byte = 1 + num_elements_size as usize; + let field_offsets_start_byte = field_ids_start_byte + num_elements * field_id_size as usize; + let values_start_byte = + field_offsets_start_byte + (num_elements + 1) * field_offset_size as usize; + + // Verify that the last field offset array entry is inside the value slice + let last_field_offset_byte = + field_offsets_start_byte + (num_elements + 1) * field_offset_size as usize; + if last_field_offset_byte > value.len() { + return Err(ArrowError::InvalidArgumentError(format!( + "Last field offset array entry at offset {} with length {} is outside the value slice of length {}", + last_field_offset_byte, + field_offset_size as usize, + value.len() + ))); + } + + // Verify that the value of the last field offset array entry fits inside the value slice + let last_field_offset = + field_offset_size.unpack_usize(value, field_offsets_start_byte, num_elements)?; + if values_start_byte + last_field_offset > value.len() { + return Err(ArrowError::InvalidArgumentError(format!( + "Last field offset value {} at offset {} is outside the value slice of length {}", + last_field_offset, + values_start_byte, + value.len() + ))); + } + Ok(Self { + field_offset_size, + field_id_size, + num_elements, + field_ids_start_byte, + field_offsets_start_byte, + values_start_byte, + }) + } + + /// Returns the number of key-value pairs in this object + pub(crate) fn num_elements(&self) -> usize { + self.num_elements + } +} + +#[derive(Clone, Debug, PartialEq)] +pub struct VariantObject<'m, 'v> { + pub metadata: VariantMetadata<'m>, + pub value: &'v [u8], + header: VariantObjectHeader, +} + +impl<'m, 'v> VariantObject<'m, 'v> { + pub fn try_new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Result { + Ok(Self { + metadata, + value, + header: VariantObjectHeader::try_new(value)?, + }) + } + + /// Returns the number of key-value pairs in this object + pub fn len(&self) -> usize { + self.header.num_elements() + } + + /// Returns true if the object contains no key-value pairs + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + pub fn fields(&self) -> Result)>, ArrowError> { + let field_list = self.parse_field_list()?; + Ok(field_list.into_iter()) + } + + pub fn values(&self) -> Result>, ArrowError> { + let len = self.len(); + let mut values = Vec::new(); + + for i in 0..len { + let element = self.get(i)?; + values.push(element); + } + + Ok(values.into_iter()) + } + + pub fn field(&self, name: &str) -> Result>, ArrowError> { + // Binary search through the field IDs of this object to find the requested field name. + // + // NOTE: This does not require a sorted metadata dictionary, because the variant spec + // requires object field ids to be lexically sorted by their corresponding string values, + // and probing the dictionary for a field id is always O(1) work. + let (field_ids, field_offsets) = self.parse_field_arrays()?; + let search_result = try_binary_search_by(&field_ids, &name, |&field_id| { + self.metadata.get_field_by(field_id) + })?; + + let Ok(index) = search_result else { + return Ok(None); + }; + let start_offset = field_offsets[index]; + let end_offset = field_offsets[index + 1]; + let value_bytes = slice_from_slice( + self.value, + self.header.values_start_byte + start_offset + ..self.header.values_start_byte + end_offset, + )?; + let variant = Variant::try_new_with_metadata(self.metadata, value_bytes)?; + Ok(Some(variant)) + } + + /// Parse field IDs and field offsets arrays using the cached header + fn parse_field_arrays(&self) -> Result<(Vec, Vec), ArrowError> { + // Parse field IDs + let field_ids = (0..self.header.num_elements) + .map(|i| { + self.header.field_id_size.unpack_usize( + self.value, + self.header.field_ids_start_byte, + i, + ) + }) + .collect::, _>>()?; + debug_assert_eq!(field_ids.len(), self.header.num_elements); + + // Parse field offsets (num_elements + 1 entries) + let field_offsets = (0..=self.header.num_elements) + .map(|i| { + self.header.field_offset_size.unpack_usize( + self.value, + self.header.field_offsets_start_byte, + i, + ) + }) + .collect::, _>>()?; + debug_assert_eq!(field_offsets.len(), self.header.num_elements + 1); + + Ok((field_ids, field_offsets)) + } + + /// Parse all fields into a vector for iteration + fn parse_field_list(&self) -> Result)>, ArrowError> { + let (field_ids, field_offsets) = self.parse_field_arrays()?; + + let mut fields = Vec::with_capacity(self.header.num_elements); + + for i in 0..self.header.num_elements { + let field_id = field_ids[i]; + let field_name = self.metadata.get_field_by(field_id)?; + + let start_offset = field_offsets[i]; + let value_bytes = + slice_from_slice(self.value, self.header.values_start_byte + start_offset..)?; + let variant = Variant::try_new_with_metadata(self.metadata, value_bytes)?; + + fields.push((field_name, variant)); + } + + Ok(fields) + } +} + +#[derive(Clone, Debug, PartialEq)] +pub(crate) struct VariantListHeader { + offset_size: OffsetSizeBytes, + is_large: bool, + num_elements: usize, + first_offset_byte: usize, + first_value_byte: usize, +} + +impl VariantListHeader { + pub(crate) fn try_new(value: &[u8]) -> Result { + // The 6 first bits to the left are the value_header and the 2 bits + // to the right are the basic type, so we shift to get only the value_header + let value_header = first_byte_from_slice(value)? >> 2; + let is_large = (value_header & 0x04) != 0; // 3rd bit from the right + let field_offset_size_minus_one = value_header & 0x03; // Last two bits + let offset_size = OffsetSizeBytes::try_new(field_offset_size_minus_one)?; + + // The size of the num_elements entry in the array value_data is 4 bytes if + // is_large is true, otherwise 1 byte. + let num_elements_size = match is_large { + true => OffsetSizeBytes::Four, + false => OffsetSizeBytes::One, + }; + + // Skip the header byte to read the num_elements + let num_elements = num_elements_size.unpack_usize(value, 1, 0)?; + let first_offset_byte = 1 + num_elements_size as usize; + + let overflow = + || ArrowError::InvalidArgumentError("Variant value_byte_length overflow".into()); + + // 1. num_elements + 1 + let n_offsets = num_elements.checked_add(1).ok_or_else(overflow)?; + + // 2. (num_elements + 1) * offset_size + let value_bytes = n_offsets + .checked_mul(offset_size as usize) + .ok_or_else(overflow)?; + + // 3. first_offset_byte + ... + let first_value_byte = first_offset_byte + .checked_add(value_bytes) + .ok_or_else(overflow)?; + + // Verify that the last offset array entry is inside the value slice + let last_offset_byte = first_offset_byte + n_offsets * offset_size as usize; + if last_offset_byte > value.len() { + return Err(ArrowError::InvalidArgumentError(format!( + "Last offset array entry at offset {} with length {} is outside the value slice of length {}", + last_offset_byte, + offset_size as usize, + value.len() + ))); + } + + // Verify that the value of the last offset array entry fits inside the value slice + let last_offset = offset_size.unpack_usize(value, first_offset_byte, num_elements)?; + if first_value_byte + last_offset > value.len() { + return Err(ArrowError::InvalidArgumentError(format!( + "Last offset value {} at offset {} is outside the value slice of length {}", + last_offset, + first_value_byte, + value.len() + ))); + } + + Ok(Self { + offset_size, + is_large, + num_elements, + first_offset_byte, + first_value_byte, + }) + } + + /// Returns the number of elements in this list + pub(crate) fn num_elements(&self) -> usize { + self.num_elements + } + + /// Returns the offset size in bytes + #[allow(unused)] + pub(crate) fn offset_size(&self) -> usize { + self.offset_size as _ + } + + /// Returns whether this is a large list + #[allow(unused)] + pub(crate) fn is_large(&self) -> bool { + self.is_large + } + + /// Returns the byte offset where the offset array starts + pub(crate) fn first_offset_byte(&self) -> usize { + self.first_offset_byte + } + + /// Returns the byte offset where the values start + pub(crate) fn first_value_byte(&self) -> usize { + self.first_value_byte + } +} + +/// Represents a variant array. /// /// This implementation is a zero cost wrapper over `&str` that ensures /// the length of the underlying string is a valid Variant short string (63 bytes or less) diff --git a/parquet-variant/tests/integration_test.rs b/parquet-variant/tests/integration_test.rs new file mode 100644 index 000000000000..cfade8d5b9b0 --- /dev/null +++ b/parquet-variant/tests/integration_test.rs @@ -0,0 +1,136 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Integration tests for JSON conversion functionality + +use parquet_variant::{variant_to_json_string, variant_to_json_value, Variant}; +use serde_json::Value; + +#[test] +fn test_primitive_values_to_json() { + // Test null + let null_variant = Variant::Null; + assert_eq!(variant_to_json_string(&null_variant).unwrap(), "null"); + assert_eq!(variant_to_json_value(&null_variant).unwrap(), Value::Null); + + // Test boolean + let bool_true = Variant::BooleanTrue; + let bool_false = Variant::BooleanFalse; + assert_eq!(variant_to_json_string(&bool_true).unwrap(), "true"); + assert_eq!(variant_to_json_string(&bool_false).unwrap(), "false"); + assert_eq!(variant_to_json_value(&bool_true).unwrap(), Value::Bool(true)); + assert_eq!(variant_to_json_value(&bool_false).unwrap(), Value::Bool(false)); + + // Test integers + let int_variant = Variant::Int8(42); + assert_eq!(variant_to_json_string(&int_variant).unwrap(), "42"); + assert_eq!(variant_to_json_value(&int_variant).unwrap(), Value::Number(42.into())); + + let negative_int = Variant::Int8(-123); + assert_eq!(variant_to_json_string(&negative_int).unwrap(), "-123"); + assert_eq!(variant_to_json_value(&negative_int).unwrap(), Value::Number((-123).into())); + + // Test strings + let string_variant = Variant::String("hello world"); + assert_eq!(variant_to_json_string(&string_variant).unwrap(), "\"hello world\""); + assert_eq!(variant_to_json_value(&string_variant).unwrap(), Value::String("hello world".to_string())); + + let short_string = Variant::ShortString("test"); + assert_eq!(variant_to_json_string(&short_string).unwrap(), "\"test\""); + assert_eq!(variant_to_json_value(&short_string).unwrap(), Value::String("test".to_string())); +} + +#[test] +fn test_string_escaping_edge_cases() { + // Test various escape sequences + let escaped_string = Variant::String("line1\nline2\ttab\"quote\"\\backslash"); + let expected_json = "\"line1\\nline2\\ttab\\\"quote\\\"\\\\backslash\""; + assert_eq!(variant_to_json_string(&escaped_string).unwrap(), expected_json); + + // Test Unicode characters + let unicode_string = Variant::String("Hello δΈ–η•Œ 🌍"); + let json_result = variant_to_json_string(&unicode_string).unwrap(); + assert!(json_result.contains("Hello δΈ–η•Œ 🌍")); + assert!(json_result.starts_with('"') && json_result.ends_with('"')); +} + +#[test] +fn test_json_roundtrip_compatibility() { + // Test that our JSON output can be parsed back by serde_json + let test_cases = vec![ + Variant::Null, + Variant::BooleanTrue, + Variant::BooleanFalse, + Variant::Int8(0), + Variant::Int8(127), + Variant::Int8(-128), + Variant::String(""), + Variant::String("simple string"), + Variant::String("string with\nnewlines\tand\ttabs"), + Variant::ShortString("short"), + ]; + + for variant in test_cases { + let json_string = variant_to_json_string(&variant).unwrap(); + + // Ensure the JSON can be parsed back + let parsed: Value = serde_json::from_str(&json_string).unwrap(); + + // Ensure our direct Value conversion matches + let direct_value = variant_to_json_value(&variant).unwrap(); + assert_eq!(parsed, direct_value, "Mismatch for variant: {:?}", variant); + } +} + +#[test] +fn test_buffer_writing() { + use parquet_variant::variant_to_json; + use std::io::Write; + + let variant = Variant::String("test buffer writing"); + + // Test writing to a Vec + let mut buffer = Vec::new(); + variant_to_json(&mut buffer, &variant).unwrap(); + let result = String::from_utf8(buffer).unwrap(); + assert_eq!(result, "\"test buffer writing\""); + + // Test writing to a custom writer + struct CustomWriter { + data: Vec, + } + + impl Write for CustomWriter { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + self.data.extend_from_slice(buf); + Ok(buf.len()) + } + + fn flush(&mut self) -> std::io::Result<()> { + Ok(()) + } + } + + let mut custom_writer = CustomWriter { data: Vec::new() }; + variant_to_json(&mut custom_writer, &variant).unwrap(); + let result = String::from_utf8(custom_writer.data).unwrap(); + assert_eq!(result, "\"test buffer writing\""); +} + +// Note: Tests for arrays and objects would require actual Variant data structures +// to be created, which would need the builder API to be implemented first. +// These tests demonstrate the primitive functionality that's currently working. \ No newline at end of file From 4f005f3d40ada0804886dd34ac87fcc575ac0f6d Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Thu, 12 Jun 2025 17:15:53 -0400 Subject: [PATCH 02/32] [FIX] fix doc tests and refactor code in variant_to_json --- parquet-variant/examples/variant_to_json.rs | 3 -- parquet-variant/src/encoder/json.rs | 35 +++++++++++++++------ 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/parquet-variant/examples/variant_to_json.rs b/parquet-variant/examples/variant_to_json.rs index a3c339c85101..fce42b5b4d47 100644 --- a/parquet-variant/examples/variant_to_json.rs +++ b/parquet-variant/examples/variant_to_json.rs @@ -44,9 +44,6 @@ fn main() -> Result<(), Box> { let escaped_json = variant_to_json_string(&special_string)?; println!(" Original: Line 1\\nLine 2\\tTabbed\\r\\nWith \"quotes\" and \\\\backslashes"); println!(" JSON: {}", escaped_json); - - // Example 3: Unicode support - println!("\n🌍 3. Unicode Support:"); let unicode_variants = vec![ Variant::String("Hello δΈ–η•Œ 🌍"), diff --git a/parquet-variant/src/encoder/json.rs b/parquet-variant/src/encoder/json.rs index c41f394e1413..05591f7c77c7 100644 --- a/parquet-variant/src/encoder/json.rs +++ b/parquet-variant/src/encoder/json.rs @@ -39,11 +39,16 @@ use crate::variant::{Variant, VariantArray, VariantObject}; /// /// ```rust /// use parquet_variant::{Variant, variant_to_json}; +/// use arrow_schema::ArrowError; /// -/// let variant = Variant::Int8(42); -/// let mut buffer = Vec::new(); -/// variant_to_json(&mut buffer, &variant)?; -/// assert_eq!(String::from_utf8(buffer)?, "42"); +/// fn example() -> Result<(), ArrowError> { +/// let variant = Variant::Int8(42); +/// let mut buffer = Vec::new(); +/// variant_to_json(&mut buffer, &variant)?; +/// assert_eq!(String::from_utf8(buffer).unwrap(), "42"); +/// Ok(()) +/// } +/// example().unwrap(); /// ``` pub fn variant_to_json( json_buffer: &mut W, @@ -144,10 +149,15 @@ fn convert_array_to_json( /// /// ```rust /// use parquet_variant::{Variant, variant_to_json_string}; +/// use arrow_schema::ArrowError; /// -/// let variant = Variant::String("hello"); -/// let json = variant_to_json_string(&variant)?; -/// assert_eq!(json, "\"hello\""); +/// fn example() -> Result<(), ArrowError> { +/// let variant = Variant::String("hello"); +/// let json = variant_to_json_string(&variant)?; +/// assert_eq!(json, "\"hello\""); +/// Ok(()) +/// } +/// example().unwrap(); /// ``` pub fn variant_to_json_string(variant: &Variant) -> Result { let mut buffer = Vec::new(); @@ -172,10 +182,15 @@ pub fn variant_to_json_string(variant: &Variant) -> Result { /// ```rust /// use parquet_variant::{Variant, variant_to_json_value}; /// use serde_json::Value; +/// use arrow_schema::ArrowError; /// -/// let variant = Variant::Int8(42); -/// let json_value = variant_to_json_value(&variant)?; -/// assert_eq!(json_value, Value::Number(42.into())); +/// fn example() -> Result<(), ArrowError> { +/// let variant = Variant::Int8(42); +/// let json_value = variant_to_json_value(&variant)?; +/// assert_eq!(json_value, Value::Number(42.into())); +/// Ok(()) +/// } +/// example().unwrap(); /// ``` pub fn variant_to_json_value(variant: &Variant) -> Result { match variant { From 5c249f8836117b4b081823cde519409d50f95fc3 Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Sun, 15 Jun 2025 13:44:52 -0400 Subject: [PATCH 03/32] [FIX] fix issues with clippy errors and rename files --- .../{variant_to_json.rs => variant_to_json_examples.rs} | 0 parquet-variant/src/encoder.rs | 2 +- .../src/encoder/{json.rs => variant_to_json.rs} | 0 parquet-variant/src/lib.rs | 2 +- parquet-variant/src/variant.rs | 8 ++++++++ 5 files changed, 10 insertions(+), 2 deletions(-) rename parquet-variant/examples/{variant_to_json.rs => variant_to_json_examples.rs} (100%) rename parquet-variant/src/encoder/{json.rs => variant_to_json.rs} (100%) diff --git a/parquet-variant/examples/variant_to_json.rs b/parquet-variant/examples/variant_to_json_examples.rs similarity index 100% rename from parquet-variant/examples/variant_to_json.rs rename to parquet-variant/examples/variant_to_json_examples.rs diff --git a/parquet-variant/src/encoder.rs b/parquet-variant/src/encoder.rs index d4ef77e9388b..f671e0c2ee9b 100644 --- a/parquet-variant/src/encoder.rs +++ b/parquet-variant/src/encoder.rs @@ -17,4 +17,4 @@ //! Encoder module for converting Variant values to other formats -pub mod json; \ No newline at end of file +pub mod variant_to_json; \ No newline at end of file diff --git a/parquet-variant/src/encoder/json.rs b/parquet-variant/src/encoder/variant_to_json.rs similarity index 100% rename from parquet-variant/src/encoder/json.rs rename to parquet-variant/src/encoder/variant_to_json.rs diff --git a/parquet-variant/src/lib.rs b/parquet-variant/src/lib.rs index fd5947aeec71..7bbbb96ef8ce 100644 --- a/parquet-variant/src/lib.rs +++ b/parquet-variant/src/lib.rs @@ -39,4 +39,4 @@ mod utils; pub use builder::*; pub use variant::*; -pub use encoder::json::{variant_to_json, variant_to_json_string, variant_to_json_value}; +pub use encoder::variant_to_json::{variant_to_json, variant_to_json_string, variant_to_json_value}; diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 0b3e07598a5b..4320e10fa78d 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -1564,6 +1564,14 @@ impl From for Variant<'_, '_> { Variant::Decimal4(value) } } +impl From for Variant<'_, '_> { + fn from(value: bool) -> Self { + match value { + true => Variant::BooleanTrue, + false => Variant::BooleanFalse, + } + } +} impl From for Variant<'_, '_> { fn from(value: VariantDecimal8) -> Self { From 97e93099a2d28d8331c239d51a087ae3b3d692da Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Sun, 15 Jun 2025 17:17:48 -0400 Subject: [PATCH 04/32] [FIX/ADD] fix and changes to accomodate new variant types --- parquet-variant/Cargo.toml | 1 + .../src/encoder/variant_to_json.rs | 88 +++++++++++++++++++ parquet-variant/src/variant.rs | 10 --- 3 files changed, 89 insertions(+), 10 deletions(-) diff --git a/parquet-variant/Cargo.toml b/parquet-variant/Cargo.toml index 30545f8fa21a..51cec81b2ab6 100644 --- a/parquet-variant/Cargo.toml +++ b/parquet-variant/Cargo.toml @@ -36,5 +36,6 @@ rust-version = "1.83" arrow-schema = { workspace = true } chrono = { workspace = true } serde_json = "1.0" +base64 = "0.21" [lib] diff --git a/parquet-variant/src/encoder/variant_to_json.rs b/parquet-variant/src/encoder/variant_to_json.rs index 05591f7c77c7..748124ba2df0 100644 --- a/parquet-variant/src/encoder/variant_to_json.rs +++ b/parquet-variant/src/encoder/variant_to_json.rs @@ -18,6 +18,7 @@ //! Module for converting Variant data to JSON format use arrow_schema::ArrowError; +use base64::{Engine as _, engine::general_purpose}; use serde_json::Value; use std::io::Write; @@ -67,6 +68,55 @@ pub fn variant_to_json( Variant::Int8(i) => { write!(json_buffer, "{}", i)?; } + Variant::Int16(i) => { + write!(json_buffer, "{}", i)?; + } + Variant::Int32(i) => { + write!(json_buffer, "{}", i)?; + } + Variant::Int64(i) => { + write!(json_buffer, "{}", i)?; + } + Variant::Float(f) => { + write!(json_buffer, "{}", f)?; + } + Variant::Double(f) => { + write!(json_buffer, "{}", f)?; + } + Variant::Decimal4 { integer, scale } => { + // Convert decimal to string representation + let divisor = 10_i32.pow(*scale as u32); + let decimal_value = *integer as f64 / divisor as f64; + write!(json_buffer, "{}", decimal_value)?; + } + Variant::Decimal8 { integer, scale } => { + // Convert decimal to string representation + let divisor = 10_i64.pow(*scale as u32); + let decimal_value = *integer as f64 / divisor as f64; + write!(json_buffer, "{}", decimal_value)?; + } + Variant::Decimal16 { integer, scale } => { + // Convert decimal to string representation + let divisor = 10_i128.pow(*scale as u32); + let decimal_value = *integer as f64 / divisor as f64; + write!(json_buffer, "{}", decimal_value)?; + } + Variant::Date(date) => { + write!(json_buffer, "\"{}\"", date.format("%Y-%m-%d"))?; + } + Variant::TimestampMicros(ts) => { + write!(json_buffer, "\"{}\"", ts.to_rfc3339())?; + } + Variant::TimestampNtzMicros(ts) => { + write!(json_buffer, "\"{}\"", ts.format("%Y-%m-%dT%H:%M:%S%.6f"))?; + } + Variant::Binary(bytes) => { + // Encode binary as base64 string + let base64_str = general_purpose::STANDARD.encode(bytes); + let json_str = serde_json::to_string(&base64_str) + .map_err(|e| ArrowError::InvalidArgumentError(format!("JSON encoding error: {}", e)))?; + write!(json_buffer, "{}", json_str)?; + } Variant::String(s) | Variant::ShortString(s) => { // Use serde_json to properly escape the string let json_str = serde_json::to_string(s) @@ -198,6 +248,44 @@ pub fn variant_to_json_value(variant: &Variant) -> Result { Variant::BooleanTrue => Ok(Value::Bool(true)), Variant::BooleanFalse => Ok(Value::Bool(false)), Variant::Int8(i) => Ok(Value::Number((*i).into())), + Variant::Int16(i) => Ok(Value::Number((*i).into())), + Variant::Int32(i) => Ok(Value::Number((*i).into())), + Variant::Int64(i) => Ok(Value::Number((*i).into())), + Variant::Float(f) => { + serde_json::Number::from_f64(*f as f64) + .map(Value::Number) + .ok_or_else(|| ArrowError::InvalidArgumentError("Invalid float value".to_string())) + } + Variant::Double(f) => { + serde_json::Number::from_f64(*f) + .map(Value::Number) + .ok_or_else(|| ArrowError::InvalidArgumentError("Invalid double value".to_string())) + } + Variant::Decimal4 { integer, scale } => { + let divisor = 10_i32.pow(*scale as u32); + let decimal_value = *integer as f64 / divisor as f64; + serde_json::Number::from_f64(decimal_value) + .map(Value::Number) + .ok_or_else(|| ArrowError::InvalidArgumentError("Invalid decimal value".to_string())) + } + Variant::Decimal8 { integer, scale } => { + let divisor = 10_i64.pow(*scale as u32); + let decimal_value = *integer as f64 / divisor as f64; + serde_json::Number::from_f64(decimal_value) + .map(Value::Number) + .ok_or_else(|| ArrowError::InvalidArgumentError("Invalid decimal value".to_string())) + } + Variant::Decimal16 { integer, scale } => { + let divisor = 10_i128.pow(*scale as u32); + let decimal_value = *integer as f64 / divisor as f64; + serde_json::Number::from_f64(decimal_value) + .map(Value::Number) + .ok_or_else(|| ArrowError::InvalidArgumentError("Invalid decimal value".to_string())) + } + Variant::Date(date) => Ok(Value::String(date.format("%Y-%m-%d").to_string())), + Variant::TimestampMicros(ts) => Ok(Value::String(ts.to_rfc3339())), + Variant::TimestampNtzMicros(ts) => Ok(Value::String(ts.format("%Y-%m-%dT%H:%M:%S%.6f").to_string())), + Variant::Binary(bytes) => Ok(Value::String(general_purpose::STANDARD.encode(bytes))), Variant::String(s) | Variant::ShortString(s) => Ok(Value::String(s.to_string())), Variant::Object(obj) => { let mut map = serde_json::Map::new(); diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 4320e10fa78d..af1991201c69 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -1597,16 +1597,6 @@ impl From for Variant<'_, '_> { } } -impl From for Variant<'_, '_> { - fn from(value: bool) -> Self { - if value { - Variant::BooleanTrue - } else { - Variant::BooleanFalse - } - } -} - impl From for Variant<'_, '_> { fn from(value: NaiveDate) -> Self { Variant::Date(value) From c4e48686a55b571c8186fc0a237c980a49dac6fa Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Sun, 15 Jun 2025 17:27:53 -0400 Subject: [PATCH 05/32] [TEST] add tests for new variant types to json functionality --- .../src/encoder/variant_to_json.rs | 251 +++++++++++++++++- parquet-variant/tests/integration_test.rs | 179 ++++++++++++- 2 files changed, 420 insertions(+), 10 deletions(-) diff --git a/parquet-variant/src/encoder/variant_to_json.rs b/parquet-variant/src/encoder/variant_to_json.rs index 748124ba2df0..2e582236d825 100644 --- a/parquet-variant/src/encoder/variant_to_json.rs +++ b/parquet-variant/src/encoder/variant_to_json.rs @@ -317,6 +317,7 @@ pub fn variant_to_json_value(variant: &Variant) -> Result { mod tests { use super::*; use crate::Variant; + use chrono::{DateTime, NaiveDate, Utc}; #[test] fn test_null_to_json() -> Result<(), ArrowError> { @@ -358,6 +359,219 @@ mod tests { Ok(()) } + #[test] + fn test_int16_to_json() -> Result<(), ArrowError> { + let variant = Variant::Int16(32767); + let json = variant_to_json_string(&variant)?; + assert_eq!(json, "32767"); + + let json_value = variant_to_json_value(&variant)?; + assert_eq!(json_value, Value::Number(32767.into())); + + // Test negative value + let negative_variant = Variant::Int16(-32768); + let negative_json = variant_to_json_string(&negative_variant)?; + assert_eq!(negative_json, "-32768"); + Ok(()) + } + + #[test] + fn test_int32_to_json() -> Result<(), ArrowError> { + let variant = Variant::Int32(2147483647); + let json = variant_to_json_string(&variant)?; + assert_eq!(json, "2147483647"); + + let json_value = variant_to_json_value(&variant)?; + assert_eq!(json_value, Value::Number(2147483647.into())); + + // Test negative value + let negative_variant = Variant::Int32(-2147483648); + let negative_json = variant_to_json_string(&negative_variant)?; + assert_eq!(negative_json, "-2147483648"); + Ok(()) + } + + #[test] + fn test_int64_to_json() -> Result<(), ArrowError> { + let variant = Variant::Int64(9223372036854775807); + let json = variant_to_json_string(&variant)?; + assert_eq!(json, "9223372036854775807"); + + let json_value = variant_to_json_value(&variant)?; + assert_eq!(json_value, Value::Number(9223372036854775807i64.into())); + + // Test negative value + let negative_variant = Variant::Int64(-9223372036854775808); + let negative_json = variant_to_json_string(&negative_variant)?; + assert_eq!(negative_json, "-9223372036854775808"); + Ok(()) + } + + #[test] + fn test_float_to_json() -> Result<(), ArrowError> { + let variant = Variant::Float(3.14159); + let json = variant_to_json_string(&variant)?; + assert!(json.starts_with("3.14159")); + + let json_value = variant_to_json_value(&variant)?; + assert!(matches!(json_value, Value::Number(_))); + + // Test zero + let zero_variant = Variant::Float(0.0); + let zero_json = variant_to_json_string(&zero_variant)?; + assert_eq!(zero_json, "0"); + + // Test negative + let negative_variant = Variant::Float(-1.5); + let negative_json = variant_to_json_string(&negative_variant)?; + assert_eq!(negative_json, "-1.5"); + Ok(()) + } + + #[test] + fn test_double_to_json() -> Result<(), ArrowError> { + let variant = Variant::Double(2.718281828459045); + let json = variant_to_json_string(&variant)?; + assert!(json.starts_with("2.718281828459045")); + + let json_value = variant_to_json_value(&variant)?; + assert!(matches!(json_value, Value::Number(_))); + + // Test zero + let zero_variant = Variant::Double(0.0); + let zero_json = variant_to_json_string(&zero_variant)?; + assert_eq!(zero_json, "0"); + + // Test negative + let negative_variant = Variant::Double(-2.5); + let negative_json = variant_to_json_string(&negative_variant)?; + assert_eq!(negative_json, "-2.5"); + Ok(()) + } + + #[test] + fn test_decimal4_to_json() -> Result<(), ArrowError> { + let variant = Variant::Decimal4 { integer: 12345, scale: 2 }; + let json = variant_to_json_string(&variant)?; + assert_eq!(json, "123.45"); + + let json_value = variant_to_json_value(&variant)?; + assert!(matches!(json_value, Value::Number(_))); + + // Test zero scale + let no_scale_variant = Variant::Decimal4 { integer: 42, scale: 0 }; + let no_scale_json = variant_to_json_string(&no_scale_variant)?; + assert_eq!(no_scale_json, "42"); + + // Test negative + let negative_variant = Variant::Decimal4 { integer: -12345, scale: 3 }; + let negative_json = variant_to_json_string(&negative_variant)?; + assert_eq!(negative_json, "-12.345"); + Ok(()) + } + + #[test] + fn test_decimal8_to_json() -> Result<(), ArrowError> { + let variant = Variant::Decimal8 { integer: 1234567890, scale: 3 }; + let json = variant_to_json_string(&variant)?; + assert_eq!(json, "1234567.89"); + + let json_value = variant_to_json_value(&variant)?; + assert!(matches!(json_value, Value::Number(_))); + + // Test large scale + let large_scale_variant = Variant::Decimal8 { integer: 123456789, scale: 6 }; + let large_scale_json = variant_to_json_string(&large_scale_variant)?; + assert_eq!(large_scale_json, "123.456789"); + Ok(()) + } + + #[test] + fn test_decimal16_to_json() -> Result<(), ArrowError> { + let variant = Variant::Decimal16 { integer: 123456789012345, scale: 4 }; + let json = variant_to_json_string(&variant)?; + assert_eq!(json, "12345678901.2345"); + + let json_value = variant_to_json_value(&variant)?; + assert!(matches!(json_value, Value::Number(_))); + + // Test very large number + let large_variant = Variant::Decimal16 { integer: 999999999999999999, scale: 2 }; + let large_json = variant_to_json_string(&large_variant)?; + // Due to f64 precision limits, very large numbers may lose precision + assert!(large_json.starts_with("9999999999999999") || large_json.starts_with("10000000000000000")); + Ok(()) + } + + #[test] + fn test_date_to_json() -> Result<(), ArrowError> { + let date = NaiveDate::from_ymd_opt(2023, 12, 25).unwrap(); + let variant = Variant::Date(date); + let json = variant_to_json_string(&variant)?; + assert_eq!(json, "\"2023-12-25\""); + + let json_value = variant_to_json_value(&variant)?; + assert_eq!(json_value, Value::String("2023-12-25".to_string())); + + // Test leap year date + let leap_date = NaiveDate::from_ymd_opt(2024, 2, 29).unwrap(); + let leap_variant = Variant::Date(leap_date); + let leap_json = variant_to_json_string(&leap_variant)?; + assert_eq!(leap_json, "\"2024-02-29\""); + Ok(()) + } + + #[test] + fn test_timestamp_micros_to_json() -> Result<(), ArrowError> { + let timestamp = DateTime::parse_from_rfc3339("2023-12-25T10:30:45Z").unwrap().with_timezone(&Utc); + let variant = Variant::TimestampMicros(timestamp); + let json = variant_to_json_string(&variant)?; + assert!(json.contains("2023-12-25T10:30:45")); + assert!(json.starts_with('"') && json.ends_with('"')); + + let json_value = variant_to_json_value(&variant)?; + assert!(matches!(json_value, Value::String(_))); + Ok(()) + } + + #[test] + fn test_timestamp_ntz_micros_to_json() -> Result<(), ArrowError> { + let naive_timestamp = DateTime::from_timestamp(1703505045, 123456).unwrap().naive_utc(); + let variant = Variant::TimestampNtzMicros(naive_timestamp); + let json = variant_to_json_string(&variant)?; + assert!(json.contains("2023-12-25")); + assert!(json.starts_with('"') && json.ends_with('"')); + + let json_value = variant_to_json_value(&variant)?; + assert!(matches!(json_value, Value::String(_))); + Ok(()) + } + + #[test] + fn test_binary_to_json() -> Result<(), ArrowError> { + let binary_data = b"Hello, World!"; + let variant = Variant::Binary(binary_data); + let json = variant_to_json_string(&variant)?; + + // Should be base64 encoded and quoted + assert!(json.starts_with('"') && json.ends_with('"')); + assert!(json.len() > 2); // Should have content + + let json_value = variant_to_json_value(&variant)?; + assert!(matches!(json_value, Value::String(_))); + + // Test empty binary + let empty_variant = Variant::Binary(b""); + let empty_json = variant_to_json_string(&empty_variant)?; + assert_eq!(empty_json, "\"\""); + + // Test binary with special bytes + let special_variant = Variant::Binary(&[0, 255, 128, 64]); + let special_json = variant_to_json_string(&special_variant)?; + assert!(special_json.starts_with('"') && special_json.ends_with('"')); + Ok(()) + } + #[test] fn test_string_to_json() -> Result<(), ArrowError> { let variant = Variant::String("hello world"); @@ -391,10 +605,6 @@ mod tests { Ok(()) } - // TODO: Add tests for objects and arrays once the implementation is complete - // These will be added in the next steps when we implement the missing methods - // in VariantObject and VariantArray - #[test] fn test_json_buffer_writing() -> Result<(), ArrowError> { let variant = Variant::Int8(123); @@ -406,4 +616,37 @@ mod tests { assert_eq!(result, "123"); Ok(()) } + + #[test] + fn test_comprehensive_type_coverage() -> Result<(), ArrowError> { + // Test all supported types to ensure no compilation errors + let test_variants = vec![ + Variant::Null, + Variant::BooleanTrue, + Variant::BooleanFalse, + Variant::Int8(1), + Variant::Int16(2), + Variant::Int32(3), + Variant::Int64(4), + Variant::Float(5.0), + Variant::Double(6.0), + Variant::Decimal4 { integer: 7, scale: 0 }, + Variant::Decimal8 { integer: 8, scale: 0 }, + Variant::Decimal16 { integer: 9, scale: 0 }, + Variant::Date(NaiveDate::from_ymd_opt(2023, 1, 1).unwrap()), + Variant::TimestampMicros(DateTime::parse_from_rfc3339("2023-01-01T00:00:00Z").unwrap().with_timezone(&Utc)), + Variant::TimestampNtzMicros(DateTime::from_timestamp(0, 0).unwrap().naive_utc()), + Variant::Binary(b"test"), + Variant::String("test"), + Variant::ShortString("test"), + ]; + + for variant in test_variants { + // Ensure all types can be converted without panicking + let _json_string = variant_to_json_string(&variant)?; + let _json_value = variant_to_json_value(&variant)?; + } + + Ok(()) + } } \ No newline at end of file diff --git a/parquet-variant/tests/integration_test.rs b/parquet-variant/tests/integration_test.rs index cfade8d5b9b0..d60be07d4586 100644 --- a/parquet-variant/tests/integration_test.rs +++ b/parquet-variant/tests/integration_test.rs @@ -19,6 +19,7 @@ use parquet_variant::{variant_to_json_string, variant_to_json_value, Variant}; use serde_json::Value; +use chrono::{DateTime, NaiveDate, Utc}; #[test] fn test_primitive_values_to_json() { @@ -36,13 +37,13 @@ fn test_primitive_values_to_json() { assert_eq!(variant_to_json_value(&bool_false).unwrap(), Value::Bool(false)); // Test integers - let int_variant = Variant::Int8(42); - assert_eq!(variant_to_json_string(&int_variant).unwrap(), "42"); - assert_eq!(variant_to_json_value(&int_variant).unwrap(), Value::Number(42.into())); + let int8_variant = Variant::Int8(42); + assert_eq!(variant_to_json_string(&int8_variant).unwrap(), "42"); + assert_eq!(variant_to_json_value(&int8_variant).unwrap(), Value::Number(42.into())); - let negative_int = Variant::Int8(-123); - assert_eq!(variant_to_json_string(&negative_int).unwrap(), "-123"); - assert_eq!(variant_to_json_value(&negative_int).unwrap(), Value::Number((-123).into())); + let negative_int8 = Variant::Int8(-123); + assert_eq!(variant_to_json_string(&negative_int8).unwrap(), "-123"); + assert_eq!(variant_to_json_value(&negative_int8).unwrap(), Value::Number((-123).into())); // Test strings let string_variant = Variant::String("hello world"); @@ -54,6 +55,172 @@ fn test_primitive_values_to_json() { assert_eq!(variant_to_json_value(&short_string).unwrap(), Value::String("test".to_string())); } +#[test] +fn test_integer_types_to_json() { + // Test Int16 + let int16_variant = Variant::Int16(32767); + assert_eq!(variant_to_json_string(&int16_variant).unwrap(), "32767"); + assert_eq!(variant_to_json_value(&int16_variant).unwrap(), Value::Number(32767.into())); + + let negative_int16 = Variant::Int16(-32768); + assert_eq!(variant_to_json_string(&negative_int16).unwrap(), "-32768"); + assert_eq!(variant_to_json_value(&negative_int16).unwrap(), Value::Number((-32768).into())); + + // Test Int32 + let int32_variant = Variant::Int32(2147483647); + assert_eq!(variant_to_json_string(&int32_variant).unwrap(), "2147483647"); + assert_eq!(variant_to_json_value(&int32_variant).unwrap(), Value::Number(2147483647.into())); + + let negative_int32 = Variant::Int32(-2147483648); + assert_eq!(variant_to_json_string(&negative_int32).unwrap(), "-2147483648"); + assert_eq!(variant_to_json_value(&negative_int32).unwrap(), Value::Number((-2147483648).into())); + + // Test Int64 + let int64_variant = Variant::Int64(9223372036854775807); + assert_eq!(variant_to_json_string(&int64_variant).unwrap(), "9223372036854775807"); + assert_eq!(variant_to_json_value(&int64_variant).unwrap(), Value::Number(9223372036854775807i64.into())); + + let negative_int64 = Variant::Int64(-9223372036854775808); + assert_eq!(variant_to_json_string(&negative_int64).unwrap(), "-9223372036854775808"); + assert_eq!(variant_to_json_value(&negative_int64).unwrap(), Value::Number((-9223372036854775808i64).into())); +} + +#[test] +fn test_floating_point_types_to_json() { + // Test Float (f32) + let float_variant = Variant::Float(3.14159); + let float_json = variant_to_json_string(&float_variant).unwrap(); + assert!(float_json.starts_with("3.14159")); + + let float_value = variant_to_json_value(&float_variant).unwrap(); + assert!(matches!(float_value, Value::Number(_))); + + // Test Double (f64) + let double_variant = Variant::Double(2.718281828459045); + let double_json = variant_to_json_string(&double_variant).unwrap(); + assert!(double_json.starts_with("2.718281828459045")); + + let double_value = variant_to_json_value(&double_variant).unwrap(); + assert!(matches!(double_value, Value::Number(_))); + + // Test special float values + let zero_float = Variant::Float(0.0); + assert_eq!(variant_to_json_string(&zero_float).unwrap(), "0"); + + let negative_float = Variant::Float(-1.5); + assert_eq!(variant_to_json_string(&negative_float).unwrap(), "-1.5"); +} + +#[test] +fn test_decimal_types_to_json() { + // Test Decimal4 (i32 with scale) + let decimal4_variant = Variant::Decimal4 { integer: 12345, scale: 2 }; + assert_eq!(variant_to_json_string(&decimal4_variant).unwrap(), "123.45"); + + let decimal4_value = variant_to_json_value(&decimal4_variant).unwrap(); + assert!(matches!(decimal4_value, Value::Number(_))); + + // Test Decimal8 (i64 with scale) + let decimal8_variant = Variant::Decimal8 { integer: 1234567890, scale: 3 }; + assert_eq!(variant_to_json_string(&decimal8_variant).unwrap(), "1234567.89"); + + let decimal8_value = variant_to_json_value(&decimal8_variant).unwrap(); + assert!(matches!(decimal8_value, Value::Number(_))); + + // Test Decimal16 (i128 with scale) + let decimal16_variant = Variant::Decimal16 { integer: 123456789012345, scale: 4 }; + assert_eq!(variant_to_json_string(&decimal16_variant).unwrap(), "12345678901.2345"); + + let decimal16_value = variant_to_json_value(&decimal16_variant).unwrap(); + assert!(matches!(decimal16_value, Value::Number(_))); + + // Test zero scale decimal + let no_scale_decimal = Variant::Decimal4 { integer: 42, scale: 0 }; + assert_eq!(variant_to_json_string(&no_scale_decimal).unwrap(), "42"); +} + +#[test] +fn test_date_and_timestamp_types_to_json() { + // Test Date + let date = NaiveDate::from_ymd_opt(2023, 12, 25).unwrap(); + let date_variant = Variant::Date(date); + assert_eq!(variant_to_json_string(&date_variant).unwrap(), "\"2023-12-25\""); + assert_eq!(variant_to_json_value(&date_variant).unwrap(), Value::String("2023-12-25".to_string())); + + // Test TimestampMicros (UTC) + let timestamp_utc = DateTime::parse_from_rfc3339("2023-12-25T10:30:45Z").unwrap().with_timezone(&Utc); + let timestamp_variant = Variant::TimestampMicros(timestamp_utc); + let timestamp_json = variant_to_json_string(×tamp_variant).unwrap(); + assert!(timestamp_json.contains("2023-12-25T10:30:45")); + assert!(timestamp_json.starts_with('"') && timestamp_json.ends_with('"')); + + // Test TimestampNtzMicros (naive datetime) + let naive_timestamp = DateTime::from_timestamp(1703505045, 123456).unwrap().naive_utc(); + let naive_timestamp_variant = Variant::TimestampNtzMicros(naive_timestamp); + let naive_json = variant_to_json_string(&naive_timestamp_variant).unwrap(); + assert!(naive_json.contains("2023-12-25")); + assert!(naive_json.starts_with('"') && naive_json.ends_with('"')); +} + +#[test] +fn test_binary_type_to_json() { + // Test Binary data (encoded as base64) + let binary_data = b"Hello, World!"; + let binary_variant = Variant::Binary(binary_data); + + let binary_json = variant_to_json_string(&binary_variant).unwrap(); + // Should be base64 encoded and quoted + assert!(binary_json.starts_with('"') && binary_json.ends_with('"')); + + let binary_value = variant_to_json_value(&binary_variant).unwrap(); + assert!(matches!(binary_value, Value::String(_))); + + // Test empty binary + let empty_binary = Variant::Binary(b""); + let empty_json = variant_to_json_string(&empty_binary).unwrap(); + assert_eq!(empty_json, "\"\""); // Empty base64 string + + // Test binary with special bytes + let special_binary = Variant::Binary(&[0, 255, 128, 64]); + let special_json = variant_to_json_string(&special_binary).unwrap(); + assert!(special_json.starts_with('"') && special_json.ends_with('"')); + assert!(special_json.len() > 2); // Should have content between quotes +} + +#[test] +fn test_comprehensive_roundtrip_compatibility() { + // Test that our JSON output can be parsed back by serde_json for all types + let test_cases = vec![ + Variant::Null, + Variant::BooleanTrue, + Variant::BooleanFalse, + Variant::Int8(42), + Variant::Int16(1000), + Variant::Int32(100000), + Variant::Int64(10000000000), + Variant::Float(3.5), // Use a value that can be represented exactly in f32 + Variant::Double(2.718281828), + Variant::Decimal4 { integer: 12345, scale: 2 }, + Variant::Decimal8 { integer: 1234567890, scale: 3 }, + Variant::Decimal16 { integer: 123456789012345, scale: 4 }, + Variant::Date(NaiveDate::from_ymd_opt(2023, 1, 1).unwrap()), + Variant::String("test string"), + Variant::ShortString("short"), + Variant::Binary(b"binary data"), + ]; + + for variant in test_cases { + let json_string = variant_to_json_string(&variant).unwrap(); + + // Ensure the JSON can be parsed back + let parsed: Value = serde_json::from_str(&json_string).unwrap(); + + // Ensure our direct Value conversion matches + let direct_value = variant_to_json_value(&variant).unwrap(); + assert_eq!(parsed, direct_value, "Mismatch for variant: {:?}", variant); + } +} + #[test] fn test_string_escaping_edge_cases() { // Test various escape sequences From f930f3b69f7f637d2405553889730bc9f29e7137 Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Thu, 19 Jun 2025 01:42:39 -0400 Subject: [PATCH 06/32] [FIX] fix merge conflict issues --- parquet-variant/src/encoder/variant_to_json.rs | 8 ++++---- parquet-variant/src/variant.rs | 10 ++-------- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/parquet-variant/src/encoder/variant_to_json.rs b/parquet-variant/src/encoder/variant_to_json.rs index 2e582236d825..eb70619ccae6 100644 --- a/parquet-variant/src/encoder/variant_to_json.rs +++ b/parquet-variant/src/encoder/variant_to_json.rs @@ -22,7 +22,7 @@ use base64::{Engine as _, engine::general_purpose}; use serde_json::Value; use std::io::Write; -use crate::variant::{Variant, VariantArray, VariantObject}; +use crate::variant::{Variant, VariantList, VariantObject}; /// Converts a Variant to JSON and writes it to the provided buffer /// @@ -126,7 +126,7 @@ pub fn variant_to_json( Variant::Object(obj) => { convert_object_to_json(json_buffer, obj)?; } - Variant::Array(arr) => { + Variant::List(arr) => { convert_array_to_json(json_buffer, arr)?; } } @@ -166,7 +166,7 @@ fn convert_object_to_json( /// Convert array elements to JSON fn convert_array_to_json( buffer: &mut W, - arr: &VariantArray, + arr: &VariantList, ) -> Result<(), ArrowError> { write!(buffer, "[")?; @@ -298,7 +298,7 @@ pub fn variant_to_json_value(variant: &Variant) -> Result { Ok(Value::Object(map)) } - Variant::Array(arr) => { + Variant::List(arr) => { let mut vec = Vec::new(); let len = arr.len(); diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index af1991201c69..1b5a0011e562 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -397,14 +397,8 @@ impl<'m, 'v> VariantObject<'m, 'v> { } pub fn values(&self) -> Result>, ArrowError> { - let len = self.len(); - let mut values = Vec::new(); - - for i in 0..len { - let element = self.get(i)?; - values.push(element); - } - + let fields = self.parse_field_list()?; + let values: Vec<_> = fields.into_iter().map(|(_, variant)| variant).collect(); Ok(values.into_iter()) } From 345d9c56b082bade5aa482ee24472a34defa70ea Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Thu, 19 Jun 2025 01:57:07 -0400 Subject: [PATCH 07/32] [TEST] Add tests for objects and lists to json --- .../src/encoder/variant_to_json.rs | 290 ++++++++++++++++++ 1 file changed, 290 insertions(+) diff --git a/parquet-variant/src/encoder/variant_to_json.rs b/parquet-variant/src/encoder/variant_to_json.rs index eb70619ccae6..64fd7f530404 100644 --- a/parquet-variant/src/encoder/variant_to_json.rs +++ b/parquet-variant/src/encoder/variant_to_json.rs @@ -649,4 +649,294 @@ mod tests { Ok(()) } + + #[test] + fn test_simple_object_to_json() -> Result<(), ArrowError> { + use crate::builder::VariantBuilder; + + // Create a simple object with various field types + let mut builder = VariantBuilder::new(); + + { + let mut obj = builder.new_object(); + obj.append_value("name", "Alice"); + obj.append_value("age", 30i32); + obj.append_value("active", true); + obj.append_value("score", 95.5f64); + obj.finish(); + } + + let (metadata, value) = builder.finish(); + let variant = Variant::try_new(&metadata, &value)?; + let json = variant_to_json_string(&variant)?; + + // Parse the JSON to verify structure - handle JSON parsing errors manually + let parsed: Value = serde_json::from_str(&json).map_err(|e| ArrowError::ParseError(format!("JSON parse error: {}", e)))?; + if let Value::Object(obj) = parsed { + assert_eq!(obj.get("name"), Some(&Value::String("Alice".to_string()))); + assert_eq!(obj.get("age"), Some(&Value::Number(30.into()))); + assert_eq!(obj.get("active"), Some(&Value::Bool(true))); + assert!(matches!(obj.get("score"), Some(Value::Number(_)))); + assert_eq!(obj.len(), 4); + } else { + panic!("Expected JSON object"); + } + + // Test variant_to_json_value as well + let json_value = variant_to_json_value(&variant)?; + assert!(matches!(json_value, Value::Object(_))); + + Ok(()) + } + + #[test] + fn test_empty_object_to_json() -> Result<(), ArrowError> { + use crate::builder::VariantBuilder; + + let mut builder = VariantBuilder::new(); + + { + let obj = builder.new_object(); + obj.finish(); + } + + let (metadata, value) = builder.finish(); + let variant = Variant::try_new(&metadata, &value)?; + let json = variant_to_json_string(&variant)?; + assert_eq!(json, "{}"); + + let json_value = variant_to_json_value(&variant)?; + assert_eq!(json_value, Value::Object(serde_json::Map::new())); + + Ok(()) + } + + #[test] + fn test_object_with_special_characters_to_json() -> Result<(), ArrowError> { + use crate::builder::VariantBuilder; + + let mut builder = VariantBuilder::new(); + + { + let mut obj = builder.new_object(); + obj.append_value("message", "Hello \"World\"\nWith\tTabs"); + obj.append_value("path", "C:\\Users\\Alice\\Documents"); + obj.append_value("unicode", "πŸ˜€ Smiley"); + obj.finish(); + } + + let (metadata, value) = builder.finish(); + let variant = Variant::try_new(&metadata, &value)?; + let json = variant_to_json_string(&variant)?; + + // Verify that special characters are properly escaped + assert!(json.contains("Hello \\\"World\\\"\\nWith\\tTabs")); + assert!(json.contains("C:\\\\Users\\\\Alice\\\\Documents")); + assert!(json.contains("πŸ˜€ Smiley")); + + // Verify that the JSON can be parsed back + let parsed: Value = serde_json::from_str(&json).map_err(|e| ArrowError::ParseError(format!("JSON parse error: {}", e)))?; + assert!(matches!(parsed, Value::Object(_))); + + Ok(()) + } + + #[test] + fn test_simple_list_to_json() -> Result<(), ArrowError> { + use crate::builder::VariantBuilder; + + let mut builder = VariantBuilder::new(); + + { + let mut list = builder.new_list(); + list.append_value(1i32); + list.append_value(2i32); + list.append_value(3i32); + list.append_value(4i32); + list.append_value(5i32); + list.finish(); + } + + let (metadata, value) = builder.finish(); + let variant = Variant::try_new(&metadata, &value)?; + let json = variant_to_json_string(&variant)?; + assert_eq!(json, "[1,2,3,4,5]"); + + let json_value = variant_to_json_value(&variant)?; + if let Value::Array(arr) = json_value { + assert_eq!(arr.len(), 5); + assert_eq!(arr[0], Value::Number(1.into())); + assert_eq!(arr[4], Value::Number(5.into())); + } else { + panic!("Expected JSON array"); + } + + Ok(()) + } + + #[test] + fn test_empty_list_to_json() -> Result<(), ArrowError> { + use crate::builder::VariantBuilder; + + let mut builder = VariantBuilder::new(); + + { + let list = builder.new_list(); + list.finish(); + } + + let (metadata, value) = builder.finish(); + let variant = Variant::try_new(&metadata, &value)?; + let json = variant_to_json_string(&variant)?; + assert_eq!(json, "[]"); + + let json_value = variant_to_json_value(&variant)?; + assert_eq!(json_value, Value::Array(vec![])); + + Ok(()) + } + + #[test] + fn test_mixed_type_list_to_json() -> Result<(), ArrowError> { + use crate::builder::VariantBuilder; + + let mut builder = VariantBuilder::new(); + + { + let mut list = builder.new_list(); + list.append_value("hello"); + list.append_value(42i32); + list.append_value(true); + list.append_value(()); // null + list.append_value(3.14f64); + list.finish(); + } + + let (metadata, value) = builder.finish(); + let variant = Variant::try_new(&metadata, &value)?; + let json = variant_to_json_string(&variant)?; + + let parsed: Value = serde_json::from_str(&json).map_err(|e| ArrowError::ParseError(format!("JSON parse error: {}", e)))?; + if let Value::Array(arr) = parsed { + assert_eq!(arr.len(), 5); + assert_eq!(arr[0], Value::String("hello".to_string())); + assert_eq!(arr[1], Value::Number(42.into())); + assert_eq!(arr[2], Value::Bool(true)); + assert_eq!(arr[3], Value::Null); + assert!(matches!(arr[4], Value::Number(_))); + } else { + panic!("Expected JSON array"); + } + + Ok(()) + } + + #[test] + fn test_object_field_ordering_in_json() -> Result<(), ArrowError> { + use crate::builder::VariantBuilder; + + let mut builder = VariantBuilder::new(); + + { + let mut obj = builder.new_object(); + // Add fields in non-alphabetical order + obj.append_value("zebra", "last"); + obj.append_value("alpha", "first"); + obj.append_value("beta", "second"); + obj.finish(); + } + + let (metadata, value) = builder.finish(); + let variant = Variant::try_new(&metadata, &value)?; + let json = variant_to_json_string(&variant)?; + + // Parse and verify all fields are present + let parsed: Value = serde_json::from_str(&json).map_err(|e| ArrowError::ParseError(format!("JSON parse error: {}", e)))?; + if let Value::Object(obj) = parsed { + assert_eq!(obj.len(), 3); + assert_eq!(obj.get("alpha"), Some(&Value::String("first".to_string()))); + assert_eq!(obj.get("beta"), Some(&Value::String("second".to_string()))); + assert_eq!(obj.get("zebra"), Some(&Value::String("last".to_string()))); + } else { + panic!("Expected JSON object"); + } + + Ok(()) + } + + #[test] + fn test_list_with_various_primitive_types_to_json() -> Result<(), ArrowError> { + use crate::builder::VariantBuilder; + + let mut builder = VariantBuilder::new(); + + { + let mut list = builder.new_list(); + list.append_value("string_value"); + list.append_value(42i32); + list.append_value(true); + list.append_value(3.14f64); + list.append_value(false); + list.append_value(()); // null + list.append_value(100i64); + list.finish(); + } + + let (metadata, value) = builder.finish(); + let variant = Variant::try_new(&metadata, &value)?; + let json = variant_to_json_string(&variant)?; + + let parsed: Value = serde_json::from_str(&json).map_err(|e| ArrowError::ParseError(format!("JSON parse error: {}", e)))?; + if let Value::Array(arr) = parsed { + assert_eq!(arr.len(), 7); + assert_eq!(arr[0], Value::String("string_value".to_string())); + assert_eq!(arr[1], Value::Number(42.into())); + assert_eq!(arr[2], Value::Bool(true)); + assert!(matches!(arr[3], Value::Number(_))); // float + assert_eq!(arr[4], Value::Bool(false)); + assert_eq!(arr[5], Value::Null); + assert_eq!(arr[6], Value::Number(100.into())); + } else { + panic!("Expected JSON array"); + } + + Ok(()) + } + + #[test] + fn test_object_with_various_primitive_types_to_json() -> Result<(), ArrowError> { + use crate::builder::VariantBuilder; + + let mut builder = VariantBuilder::new(); + + { + let mut obj = builder.new_object(); + obj.append_value("string_field", "test_string"); + obj.append_value("int_field", 123i32); + obj.append_value("bool_field", true); + obj.append_value("float_field", 2.71f64); + obj.append_value("null_field", ()); + obj.append_value("long_field", 999i64); + obj.finish(); + } + + let (metadata, value) = builder.finish(); + let variant = Variant::try_new(&metadata, &value)?; + let json = variant_to_json_string(&variant)?; + + let parsed: Value = serde_json::from_str(&json).map_err(|e| ArrowError::ParseError(format!("JSON parse error: {}", e)))?; + if let Value::Object(obj) = parsed { + assert_eq!(obj.len(), 6); + assert_eq!(obj.get("string_field"), Some(&Value::String("test_string".to_string()))); + assert_eq!(obj.get("int_field"), Some(&Value::Number(123.into()))); + assert_eq!(obj.get("bool_field"), Some(&Value::Bool(true))); + assert!(matches!(obj.get("float_field"), Some(Value::Number(_)))); + assert_eq!(obj.get("null_field"), Some(&Value::Null)); + assert_eq!(obj.get("long_field"), Some(&Value::Number(999.into()))); + } else { + panic!("Expected JSON object"); + } + + Ok(()) + } } \ No newline at end of file From 47d809b3d4c3326741baf9a1f1f9a574335ccb7f Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Thu, 19 Jun 2025 01:58:13 -0400 Subject: [PATCH 08/32] [FIX] fix cargo fmt tests --- .../examples/variant_to_json_examples.rs | 27 +- parquet-variant/src/encoder.rs | 2 +- .../src/encoder/variant_to_json.rs | 337 ++++++++++-------- parquet-variant/src/lib.rs | 4 +- parquet-variant/src/variant.rs | 2 +- parquet-variant/tests/integration_test.rs | 191 +++++++--- 6 files changed, 353 insertions(+), 210 deletions(-) diff --git a/parquet-variant/examples/variant_to_json_examples.rs b/parquet-variant/examples/variant_to_json_examples.rs index fce42b5b4d47..951eb7b47585 100644 --- a/parquet-variant/examples/variant_to_json_examples.rs +++ b/parquet-variant/examples/variant_to_json_examples.rs @@ -17,9 +17,7 @@ //! Example showing how to convert Variant values to JSON -use parquet_variant::{ - variant_to_json, variant_to_json_string, variant_to_json_value, Variant, -}; +use parquet_variant::{variant_to_json, variant_to_json_string, variant_to_json_value, Variant}; fn main() -> Result<(), Box> { let variants = vec![ @@ -39,12 +37,13 @@ fn main() -> Result<(), Box> { // Example 2: String escaping println!("\nπŸ”€ 2. String Escaping:"); - - let special_string = Variant::String("Line 1\nLine 2\tTabbed\r\nWith \"quotes\" and \\backslashes"); + + let special_string = + Variant::String("Line 1\nLine 2\tTabbed\r\nWith \"quotes\" and \\backslashes"); let escaped_json = variant_to_json_string(&special_string)?; println!(" Original: Line 1\\nLine 2\\tTabbed\\r\\nWith \"quotes\" and \\\\backslashes"); println!(" JSON: {}", escaped_json); - + let unicode_variants = vec![ Variant::String("Hello δΈ–η•Œ 🌍"), Variant::String("Emoji: πŸ’»"), @@ -55,25 +54,25 @@ fn main() -> Result<(), Box> { let json_string = variant_to_json_string(&variant)?; println!(" {}", json_string); } - + let test_variant = Variant::String("Buffer test"); - + // Write to Vec let mut vec_buffer = Vec::new(); variant_to_json(&mut vec_buffer, &test_variant)?; println!(" Vec buffer: {}", String::from_utf8(vec_buffer)?); - + // Write to String (through write! macro) let mut string_buffer = String::new(); use std::fmt::Write; write!(string_buffer, "Prefix: ")?; - + // Convert to bytes temporarily to use the Write trait let mut temp_buffer = Vec::new(); variant_to_json(&mut temp_buffer, &test_variant)?; string_buffer.push_str(&String::from_utf8(temp_buffer)?); println!(" String buffer: {}", string_buffer); - + let variants_for_value = vec![ Variant::Null, Variant::BooleanTrue, @@ -85,7 +84,7 @@ fn main() -> Result<(), Box> { let json_value = variant_to_json_value(&variant)?; println!(" {:?} -> {:?}", variant, json_value); } - + let start = std::time::Instant::now(); for i in 0..1000 { let variant = Variant::Int8((i % 128) as i8); @@ -93,7 +92,7 @@ fn main() -> Result<(), Box> { } let duration = start.elapsed(); println!(" Converted 1000 variants in {:?}", duration); - + // This would demonstrate error handling if we had invalid variants // For now, all our examples work, so we'll just show the pattern match variant_to_json_string(&Variant::String("Valid string")) { @@ -102,4 +101,4 @@ fn main() -> Result<(), Box> { } Ok(()) -} \ No newline at end of file +} diff --git a/parquet-variant/src/encoder.rs b/parquet-variant/src/encoder.rs index f671e0c2ee9b..646c8ba889e1 100644 --- a/parquet-variant/src/encoder.rs +++ b/parquet-variant/src/encoder.rs @@ -17,4 +17,4 @@ //! Encoder module for converting Variant values to other formats -pub mod variant_to_json; \ No newline at end of file +pub mod variant_to_json; diff --git a/parquet-variant/src/encoder/variant_to_json.rs b/parquet-variant/src/encoder/variant_to_json.rs index 64fd7f530404..117d15655496 100644 --- a/parquet-variant/src/encoder/variant_to_json.rs +++ b/parquet-variant/src/encoder/variant_to_json.rs @@ -18,7 +18,7 @@ //! Module for converting Variant data to JSON format use arrow_schema::ArrowError; -use base64::{Engine as _, engine::general_purpose}; +use base64::{engine::general_purpose, Engine as _}; use serde_json::Value; use std::io::Write; @@ -41,7 +41,7 @@ use crate::variant::{Variant, VariantList, VariantObject}; /// ```rust /// use parquet_variant::{Variant, variant_to_json}; /// use arrow_schema::ArrowError; -/// +/// /// fn example() -> Result<(), ArrowError> { /// let variant = Variant::Int8(42); /// let mut buffer = Vec::new(); @@ -51,10 +51,7 @@ use crate::variant::{Variant, VariantList, VariantObject}; /// } /// example().unwrap(); /// ``` -pub fn variant_to_json( - json_buffer: &mut W, - variant: &Variant, -) -> Result<(), ArrowError> { +pub fn variant_to_json(json_buffer: &mut W, variant: &Variant) -> Result<(), ArrowError> { match variant { Variant::Null => { write!(json_buffer, "null")?; @@ -113,14 +110,16 @@ pub fn variant_to_json( Variant::Binary(bytes) => { // Encode binary as base64 string let base64_str = general_purpose::STANDARD.encode(bytes); - let json_str = serde_json::to_string(&base64_str) - .map_err(|e| ArrowError::InvalidArgumentError(format!("JSON encoding error: {}", e)))?; + let json_str = serde_json::to_string(&base64_str).map_err(|e| { + ArrowError::InvalidArgumentError(format!("JSON encoding error: {}", e)) + })?; write!(json_buffer, "{}", json_str)?; } Variant::String(s) | Variant::ShortString(s) => { // Use serde_json to properly escape the string - let json_str = serde_json::to_string(s) - .map_err(|e| ArrowError::InvalidArgumentError(format!("JSON encoding error: {}", e)))?; + let json_str = serde_json::to_string(s).map_err(|e| { + ArrowError::InvalidArgumentError(format!("JSON encoding error: {}", e)) + })?; write!(json_buffer, "{}", json_str)?; } Variant::Object(obj) => { @@ -134,52 +133,47 @@ pub fn variant_to_json( } /// Convert object fields to JSON -fn convert_object_to_json( - buffer: &mut W, - obj: &VariantObject, -) -> Result<(), ArrowError> { +fn convert_object_to_json(buffer: &mut W, obj: &VariantObject) -> Result<(), ArrowError> { write!(buffer, "{{")?; - + // Get all fields from the object let fields = obj.fields()?; let mut first = true; - + for (key, value) in fields { if !first { write!(buffer, ",")?; } first = false; - + // Write the key (properly escaped) - let json_key = serde_json::to_string(key) - .map_err(|e| ArrowError::InvalidArgumentError(format!("JSON key encoding error: {}", e)))?; + let json_key = serde_json::to_string(key).map_err(|e| { + ArrowError::InvalidArgumentError(format!("JSON key encoding error: {}", e)) + })?; write!(buffer, "{}:", json_key)?; - + // Recursively convert the value variant_to_json(buffer, &value)?; } - + write!(buffer, "}}")?; Ok(()) } /// Convert array elements to JSON -fn convert_array_to_json( - buffer: &mut W, - arr: &VariantList, -) -> Result<(), ArrowError> { +fn convert_array_to_json(buffer: &mut W, arr: &VariantList) -> Result<(), ArrowError> { write!(buffer, "[")?; - + let len = arr.len(); for i in 0..len { if i > 0 { write!(buffer, ",")?; } - + let element = arr.get(i)?; variant_to_json(buffer, &element)?; } - + write!(buffer, "]")?; Ok(()) } @@ -200,7 +194,7 @@ fn convert_array_to_json( /// ```rust /// use parquet_variant::{Variant, variant_to_json_string}; /// use arrow_schema::ArrowError; -/// +/// /// fn example() -> Result<(), ArrowError> { /// let variant = Variant::String("hello"); /// let json = variant_to_json_string(&variant)?; @@ -233,7 +227,7 @@ pub fn variant_to_json_string(variant: &Variant) -> Result { /// use parquet_variant::{Variant, variant_to_json_value}; /// use serde_json::Value; /// use arrow_schema::ArrowError; -/// +/// /// fn example() -> Result<(), ArrowError> { /// let variant = Variant::Int8(42); /// let json_value = variant_to_json_value(&variant)?; @@ -251,63 +245,67 @@ pub fn variant_to_json_value(variant: &Variant) -> Result { Variant::Int16(i) => Ok(Value::Number((*i).into())), Variant::Int32(i) => Ok(Value::Number((*i).into())), Variant::Int64(i) => Ok(Value::Number((*i).into())), - Variant::Float(f) => { - serde_json::Number::from_f64(*f as f64) - .map(Value::Number) - .ok_or_else(|| ArrowError::InvalidArgumentError("Invalid float value".to_string())) - } - Variant::Double(f) => { - serde_json::Number::from_f64(*f) - .map(Value::Number) - .ok_or_else(|| ArrowError::InvalidArgumentError("Invalid double value".to_string())) - } + Variant::Float(f) => serde_json::Number::from_f64(*f as f64) + .map(Value::Number) + .ok_or_else(|| ArrowError::InvalidArgumentError("Invalid float value".to_string())), + Variant::Double(f) => serde_json::Number::from_f64(*f) + .map(Value::Number) + .ok_or_else(|| ArrowError::InvalidArgumentError("Invalid double value".to_string())), Variant::Decimal4 { integer, scale } => { let divisor = 10_i32.pow(*scale as u32); let decimal_value = *integer as f64 / divisor as f64; serde_json::Number::from_f64(decimal_value) .map(Value::Number) - .ok_or_else(|| ArrowError::InvalidArgumentError("Invalid decimal value".to_string())) + .ok_or_else(|| { + ArrowError::InvalidArgumentError("Invalid decimal value".to_string()) + }) } Variant::Decimal8 { integer, scale } => { let divisor = 10_i64.pow(*scale as u32); let decimal_value = *integer as f64 / divisor as f64; serde_json::Number::from_f64(decimal_value) .map(Value::Number) - .ok_or_else(|| ArrowError::InvalidArgumentError("Invalid decimal value".to_string())) + .ok_or_else(|| { + ArrowError::InvalidArgumentError("Invalid decimal value".to_string()) + }) } Variant::Decimal16 { integer, scale } => { let divisor = 10_i128.pow(*scale as u32); let decimal_value = *integer as f64 / divisor as f64; serde_json::Number::from_f64(decimal_value) .map(Value::Number) - .ok_or_else(|| ArrowError::InvalidArgumentError("Invalid decimal value".to_string())) + .ok_or_else(|| { + ArrowError::InvalidArgumentError("Invalid decimal value".to_string()) + }) } Variant::Date(date) => Ok(Value::String(date.format("%Y-%m-%d").to_string())), Variant::TimestampMicros(ts) => Ok(Value::String(ts.to_rfc3339())), - Variant::TimestampNtzMicros(ts) => Ok(Value::String(ts.format("%Y-%m-%dT%H:%M:%S%.6f").to_string())), + Variant::TimestampNtzMicros(ts) => Ok(Value::String( + ts.format("%Y-%m-%dT%H:%M:%S%.6f").to_string(), + )), Variant::Binary(bytes) => Ok(Value::String(general_purpose::STANDARD.encode(bytes))), Variant::String(s) | Variant::ShortString(s) => Ok(Value::String(s.to_string())), Variant::Object(obj) => { let mut map = serde_json::Map::new(); let fields = obj.fields()?; - + for (key, value) in fields { let json_value = variant_to_json_value(&value)?; map.insert(key.to_string(), json_value); } - + Ok(Value::Object(map)) } Variant::List(arr) => { let mut vec = Vec::new(); let len = arr.len(); - + for i in 0..len { let element = arr.get(i)?; let json_value = variant_to_json_value(&element)?; vec.push(json_value); } - + Ok(Value::Array(vec)) } } @@ -324,7 +322,7 @@ mod tests { let variant = Variant::Null; let json = variant_to_json_string(&variant)?; assert_eq!(json, "null"); - + let json_value = variant_to_json_value(&variant)?; assert_eq!(json_value, Value::Null); Ok(()) @@ -335,14 +333,14 @@ mod tests { let variant_true = Variant::BooleanTrue; let json_true = variant_to_json_string(&variant_true)?; assert_eq!(json_true, "true"); - + let variant_false = Variant::BooleanFalse; let json_false = variant_to_json_string(&variant_false)?; assert_eq!(json_false, "false"); - + let json_value_true = variant_to_json_value(&variant_true)?; assert_eq!(json_value_true, Value::Bool(true)); - + let json_value_false = variant_to_json_value(&variant_false)?; assert_eq!(json_value_false, Value::Bool(false)); Ok(()) @@ -353,7 +351,7 @@ mod tests { let variant = Variant::Int8(42); let json = variant_to_json_string(&variant)?; assert_eq!(json, "42"); - + let json_value = variant_to_json_value(&variant)?; assert_eq!(json_value, Value::Number(42.into())); Ok(()) @@ -364,7 +362,7 @@ mod tests { let variant = Variant::Int16(32767); let json = variant_to_json_string(&variant)?; assert_eq!(json, "32767"); - + let json_value = variant_to_json_value(&variant)?; assert_eq!(json_value, Value::Number(32767.into())); @@ -380,7 +378,7 @@ mod tests { let variant = Variant::Int32(2147483647); let json = variant_to_json_string(&variant)?; assert_eq!(json, "2147483647"); - + let json_value = variant_to_json_value(&variant)?; assert_eq!(json_value, Value::Number(2147483647.into())); @@ -396,7 +394,7 @@ mod tests { let variant = Variant::Int64(9223372036854775807); let json = variant_to_json_string(&variant)?; assert_eq!(json, "9223372036854775807"); - + let json_value = variant_to_json_value(&variant)?; assert_eq!(json_value, Value::Number(9223372036854775807i64.into())); @@ -412,7 +410,7 @@ mod tests { let variant = Variant::Float(3.14159); let json = variant_to_json_string(&variant)?; assert!(json.starts_with("3.14159")); - + let json_value = variant_to_json_value(&variant)?; assert!(matches!(json_value, Value::Number(_))); @@ -433,7 +431,7 @@ mod tests { let variant = Variant::Double(2.718281828459045); let json = variant_to_json_string(&variant)?; assert!(json.starts_with("2.718281828459045")); - + let json_value = variant_to_json_value(&variant)?; assert!(matches!(json_value, Value::Number(_))); @@ -451,20 +449,29 @@ mod tests { #[test] fn test_decimal4_to_json() -> Result<(), ArrowError> { - let variant = Variant::Decimal4 { integer: 12345, scale: 2 }; + let variant = Variant::Decimal4 { + integer: 12345, + scale: 2, + }; let json = variant_to_json_string(&variant)?; assert_eq!(json, "123.45"); - + let json_value = variant_to_json_value(&variant)?; assert!(matches!(json_value, Value::Number(_))); // Test zero scale - let no_scale_variant = Variant::Decimal4 { integer: 42, scale: 0 }; + let no_scale_variant = Variant::Decimal4 { + integer: 42, + scale: 0, + }; let no_scale_json = variant_to_json_string(&no_scale_variant)?; assert_eq!(no_scale_json, "42"); // Test negative - let negative_variant = Variant::Decimal4 { integer: -12345, scale: 3 }; + let negative_variant = Variant::Decimal4 { + integer: -12345, + scale: 3, + }; let negative_json = variant_to_json_string(&negative_variant)?; assert_eq!(negative_json, "-12.345"); Ok(()) @@ -472,15 +479,21 @@ mod tests { #[test] fn test_decimal8_to_json() -> Result<(), ArrowError> { - let variant = Variant::Decimal8 { integer: 1234567890, scale: 3 }; + let variant = Variant::Decimal8 { + integer: 1234567890, + scale: 3, + }; let json = variant_to_json_string(&variant)?; assert_eq!(json, "1234567.89"); - + let json_value = variant_to_json_value(&variant)?; assert!(matches!(json_value, Value::Number(_))); // Test large scale - let large_scale_variant = Variant::Decimal8 { integer: 123456789, scale: 6 }; + let large_scale_variant = Variant::Decimal8 { + integer: 123456789, + scale: 6, + }; let large_scale_json = variant_to_json_string(&large_scale_variant)?; assert_eq!(large_scale_json, "123.456789"); Ok(()) @@ -488,18 +501,27 @@ mod tests { #[test] fn test_decimal16_to_json() -> Result<(), ArrowError> { - let variant = Variant::Decimal16 { integer: 123456789012345, scale: 4 }; + let variant = Variant::Decimal16 { + integer: 123456789012345, + scale: 4, + }; let json = variant_to_json_string(&variant)?; assert_eq!(json, "12345678901.2345"); - + let json_value = variant_to_json_value(&variant)?; assert!(matches!(json_value, Value::Number(_))); // Test very large number - let large_variant = Variant::Decimal16 { integer: 999999999999999999, scale: 2 }; + let large_variant = Variant::Decimal16 { + integer: 999999999999999999, + scale: 2, + }; let large_json = variant_to_json_string(&large_variant)?; // Due to f64 precision limits, very large numbers may lose precision - assert!(large_json.starts_with("9999999999999999") || large_json.starts_with("10000000000000000")); + assert!( + large_json.starts_with("9999999999999999") + || large_json.starts_with("10000000000000000") + ); Ok(()) } @@ -509,7 +531,7 @@ mod tests { let variant = Variant::Date(date); let json = variant_to_json_string(&variant)?; assert_eq!(json, "\"2023-12-25\""); - + let json_value = variant_to_json_value(&variant)?; assert_eq!(json_value, Value::String("2023-12-25".to_string())); @@ -523,12 +545,14 @@ mod tests { #[test] fn test_timestamp_micros_to_json() -> Result<(), ArrowError> { - let timestamp = DateTime::parse_from_rfc3339("2023-12-25T10:30:45Z").unwrap().with_timezone(&Utc); + let timestamp = DateTime::parse_from_rfc3339("2023-12-25T10:30:45Z") + .unwrap() + .with_timezone(&Utc); let variant = Variant::TimestampMicros(timestamp); let json = variant_to_json_string(&variant)?; assert!(json.contains("2023-12-25T10:30:45")); assert!(json.starts_with('"') && json.ends_with('"')); - + let json_value = variant_to_json_value(&variant)?; assert!(matches!(json_value, Value::String(_))); Ok(()) @@ -536,12 +560,14 @@ mod tests { #[test] fn test_timestamp_ntz_micros_to_json() -> Result<(), ArrowError> { - let naive_timestamp = DateTime::from_timestamp(1703505045, 123456).unwrap().naive_utc(); + let naive_timestamp = DateTime::from_timestamp(1703505045, 123456) + .unwrap() + .naive_utc(); let variant = Variant::TimestampNtzMicros(naive_timestamp); let json = variant_to_json_string(&variant)?; assert!(json.contains("2023-12-25")); assert!(json.starts_with('"') && json.ends_with('"')); - + let json_value = variant_to_json_value(&variant)?; assert!(matches!(json_value, Value::String(_))); Ok(()) @@ -552,11 +578,11 @@ mod tests { let binary_data = b"Hello, World!"; let variant = Variant::Binary(binary_data); let json = variant_to_json_string(&variant)?; - + // Should be base64 encoded and quoted assert!(json.starts_with('"') && json.ends_with('"')); assert!(json.len() > 2); // Should have content - + let json_value = variant_to_json_value(&variant)?; assert!(matches!(json_value, Value::String(_))); @@ -577,7 +603,7 @@ mod tests { let variant = Variant::String("hello world"); let json = variant_to_json_string(&variant)?; assert_eq!(json, "\"hello world\""); - + let json_value = variant_to_json_value(&variant)?; assert_eq!(json_value, Value::String("hello world".to_string())); Ok(()) @@ -588,7 +614,7 @@ mod tests { let variant = Variant::ShortString("short"); let json = variant_to_json_string(&variant)?; assert_eq!(json, "\"short\""); - + let json_value = variant_to_json_value(&variant)?; assert_eq!(json_value, Value::String("short".to_string())); Ok(()) @@ -599,9 +625,12 @@ mod tests { let variant = Variant::String("hello\nworld\t\"quoted\""); let json = variant_to_json_string(&variant)?; assert_eq!(json, "\"hello\\nworld\\t\\\"quoted\\\"\""); - + let json_value = variant_to_json_value(&variant)?; - assert_eq!(json_value, Value::String("hello\nworld\t\"quoted\"".to_string())); + assert_eq!( + json_value, + Value::String("hello\nworld\t\"quoted\"".to_string()) + ); Ok(()) } @@ -610,7 +639,7 @@ mod tests { let variant = Variant::Int8(123); let mut buffer = Vec::new(); variant_to_json(&mut buffer, &variant)?; - + let result = String::from_utf8(buffer) .map_err(|e| ArrowError::InvalidArgumentError(e.to_string()))?; assert_eq!(result, "123"); @@ -630,11 +659,24 @@ mod tests { Variant::Int64(4), Variant::Float(5.0), Variant::Double(6.0), - Variant::Decimal4 { integer: 7, scale: 0 }, - Variant::Decimal8 { integer: 8, scale: 0 }, - Variant::Decimal16 { integer: 9, scale: 0 }, + Variant::Decimal4 { + integer: 7, + scale: 0, + }, + Variant::Decimal8 { + integer: 8, + scale: 0, + }, + Variant::Decimal16 { + integer: 9, + scale: 0, + }, Variant::Date(NaiveDate::from_ymd_opt(2023, 1, 1).unwrap()), - Variant::TimestampMicros(DateTime::parse_from_rfc3339("2023-01-01T00:00:00Z").unwrap().with_timezone(&Utc)), + Variant::TimestampMicros( + DateTime::parse_from_rfc3339("2023-01-01T00:00:00Z") + .unwrap() + .with_timezone(&Utc), + ), Variant::TimestampNtzMicros(DateTime::from_timestamp(0, 0).unwrap().naive_utc()), Variant::Binary(b"test"), Variant::String("test"), @@ -646,17 +688,17 @@ mod tests { let _json_string = variant_to_json_string(&variant)?; let _json_value = variant_to_json_value(&variant)?; } - + Ok(()) } #[test] fn test_simple_object_to_json() -> Result<(), ArrowError> { use crate::builder::VariantBuilder; - + // Create a simple object with various field types let mut builder = VariantBuilder::new(); - + { let mut obj = builder.new_object(); obj.append_value("name", "Alice"); @@ -665,13 +707,14 @@ mod tests { obj.append_value("score", 95.5f64); obj.finish(); } - + let (metadata, value) = builder.finish(); let variant = Variant::try_new(&metadata, &value)?; let json = variant_to_json_string(&variant)?; - + // Parse the JSON to verify structure - handle JSON parsing errors manually - let parsed: Value = serde_json::from_str(&json).map_err(|e| ArrowError::ParseError(format!("JSON parse error: {}", e)))?; + let parsed: Value = serde_json::from_str(&json) + .map_err(|e| ArrowError::ParseError(format!("JSON parse error: {}", e)))?; if let Value::Object(obj) = parsed { assert_eq!(obj.get("name"), Some(&Value::String("Alice".to_string()))); assert_eq!(obj.get("age"), Some(&Value::Number(30.into()))); @@ -681,42 +724,42 @@ mod tests { } else { panic!("Expected JSON object"); } - + // Test variant_to_json_value as well let json_value = variant_to_json_value(&variant)?; assert!(matches!(json_value, Value::Object(_))); - + Ok(()) } #[test] fn test_empty_object_to_json() -> Result<(), ArrowError> { use crate::builder::VariantBuilder; - + let mut builder = VariantBuilder::new(); - + { let obj = builder.new_object(); obj.finish(); } - + let (metadata, value) = builder.finish(); let variant = Variant::try_new(&metadata, &value)?; let json = variant_to_json_string(&variant)?; assert_eq!(json, "{}"); - + let json_value = variant_to_json_value(&variant)?; assert_eq!(json_value, Value::Object(serde_json::Map::new())); - + Ok(()) } #[test] fn test_object_with_special_characters_to_json() -> Result<(), ArrowError> { use crate::builder::VariantBuilder; - + let mut builder = VariantBuilder::new(); - + { let mut obj = builder.new_object(); obj.append_value("message", "Hello \"World\"\nWith\tTabs"); @@ -724,29 +767,30 @@ mod tests { obj.append_value("unicode", "πŸ˜€ Smiley"); obj.finish(); } - + let (metadata, value) = builder.finish(); let variant = Variant::try_new(&metadata, &value)?; let json = variant_to_json_string(&variant)?; - + // Verify that special characters are properly escaped assert!(json.contains("Hello \\\"World\\\"\\nWith\\tTabs")); assert!(json.contains("C:\\\\Users\\\\Alice\\\\Documents")); assert!(json.contains("πŸ˜€ Smiley")); - + // Verify that the JSON can be parsed back - let parsed: Value = serde_json::from_str(&json).map_err(|e| ArrowError::ParseError(format!("JSON parse error: {}", e)))?; + let parsed: Value = serde_json::from_str(&json) + .map_err(|e| ArrowError::ParseError(format!("JSON parse error: {}", e)))?; assert!(matches!(parsed, Value::Object(_))); - + Ok(()) } #[test] fn test_simple_list_to_json() -> Result<(), ArrowError> { use crate::builder::VariantBuilder; - + let mut builder = VariantBuilder::new(); - + { let mut list = builder.new_list(); list.append_value(1i32); @@ -756,12 +800,12 @@ mod tests { list.append_value(5i32); list.finish(); } - + let (metadata, value) = builder.finish(); let variant = Variant::try_new(&metadata, &value)?; let json = variant_to_json_string(&variant)?; assert_eq!(json, "[1,2,3,4,5]"); - + let json_value = variant_to_json_value(&variant)?; if let Value::Array(arr) = json_value { assert_eq!(arr.len(), 5); @@ -770,53 +814,54 @@ mod tests { } else { panic!("Expected JSON array"); } - + Ok(()) } #[test] fn test_empty_list_to_json() -> Result<(), ArrowError> { use crate::builder::VariantBuilder; - + let mut builder = VariantBuilder::new(); - + { let list = builder.new_list(); list.finish(); } - + let (metadata, value) = builder.finish(); let variant = Variant::try_new(&metadata, &value)?; let json = variant_to_json_string(&variant)?; assert_eq!(json, "[]"); - + let json_value = variant_to_json_value(&variant)?; assert_eq!(json_value, Value::Array(vec![])); - + Ok(()) } #[test] fn test_mixed_type_list_to_json() -> Result<(), ArrowError> { use crate::builder::VariantBuilder; - + let mut builder = VariantBuilder::new(); - + { let mut list = builder.new_list(); list.append_value("hello"); list.append_value(42i32); list.append_value(true); - list.append_value(()); // null + list.append_value(()); // null list.append_value(3.14f64); list.finish(); } - + let (metadata, value) = builder.finish(); let variant = Variant::try_new(&metadata, &value)?; let json = variant_to_json_string(&variant)?; - - let parsed: Value = serde_json::from_str(&json).map_err(|e| ArrowError::ParseError(format!("JSON parse error: {}", e)))?; + + let parsed: Value = serde_json::from_str(&json) + .map_err(|e| ArrowError::ParseError(format!("JSON parse error: {}", e)))?; if let Value::Array(arr) = parsed { assert_eq!(arr.len(), 5); assert_eq!(arr[0], Value::String("hello".to_string())); @@ -827,16 +872,16 @@ mod tests { } else { panic!("Expected JSON array"); } - + Ok(()) } #[test] fn test_object_field_ordering_in_json() -> Result<(), ArrowError> { use crate::builder::VariantBuilder; - + let mut builder = VariantBuilder::new(); - + { let mut obj = builder.new_object(); // Add fields in non-alphabetical order @@ -845,13 +890,14 @@ mod tests { obj.append_value("beta", "second"); obj.finish(); } - + let (metadata, value) = builder.finish(); let variant = Variant::try_new(&metadata, &value)?; let json = variant_to_json_string(&variant)?; - + // Parse and verify all fields are present - let parsed: Value = serde_json::from_str(&json).map_err(|e| ArrowError::ParseError(format!("JSON parse error: {}", e)))?; + let parsed: Value = serde_json::from_str(&json) + .map_err(|e| ArrowError::ParseError(format!("JSON parse error: {}", e)))?; if let Value::Object(obj) = parsed { assert_eq!(obj.len(), 3); assert_eq!(obj.get("alpha"), Some(&Value::String("first".to_string()))); @@ -860,16 +906,16 @@ mod tests { } else { panic!("Expected JSON object"); } - + Ok(()) } #[test] fn test_list_with_various_primitive_types_to_json() -> Result<(), ArrowError> { use crate::builder::VariantBuilder; - + let mut builder = VariantBuilder::new(); - + { let mut list = builder.new_list(); list.append_value("string_value"); @@ -877,16 +923,17 @@ mod tests { list.append_value(true); list.append_value(3.14f64); list.append_value(false); - list.append_value(()); // null + list.append_value(()); // null list.append_value(100i64); list.finish(); } - + let (metadata, value) = builder.finish(); let variant = Variant::try_new(&metadata, &value)?; let json = variant_to_json_string(&variant)?; - - let parsed: Value = serde_json::from_str(&json).map_err(|e| ArrowError::ParseError(format!("JSON parse error: {}", e)))?; + + let parsed: Value = serde_json::from_str(&json) + .map_err(|e| ArrowError::ParseError(format!("JSON parse error: {}", e)))?; if let Value::Array(arr) = parsed { assert_eq!(arr.len(), 7); assert_eq!(arr[0], Value::String("string_value".to_string())); @@ -899,16 +946,16 @@ mod tests { } else { panic!("Expected JSON array"); } - + Ok(()) } #[test] fn test_object_with_various_primitive_types_to_json() -> Result<(), ArrowError> { use crate::builder::VariantBuilder; - + let mut builder = VariantBuilder::new(); - + { let mut obj = builder.new_object(); obj.append_value("string_field", "test_string"); @@ -919,15 +966,19 @@ mod tests { obj.append_value("long_field", 999i64); obj.finish(); } - + let (metadata, value) = builder.finish(); let variant = Variant::try_new(&metadata, &value)?; let json = variant_to_json_string(&variant)?; - - let parsed: Value = serde_json::from_str(&json).map_err(|e| ArrowError::ParseError(format!("JSON parse error: {}", e)))?; + + let parsed: Value = serde_json::from_str(&json) + .map_err(|e| ArrowError::ParseError(format!("JSON parse error: {}", e)))?; if let Value::Object(obj) = parsed { assert_eq!(obj.len(), 6); - assert_eq!(obj.get("string_field"), Some(&Value::String("test_string".to_string()))); + assert_eq!( + obj.get("string_field"), + Some(&Value::String("test_string".to_string())) + ); assert_eq!(obj.get("int_field"), Some(&Value::Number(123.into()))); assert_eq!(obj.get("bool_field"), Some(&Value::Bool(true))); assert!(matches!(obj.get("float_field"), Some(Value::Number(_)))); @@ -936,7 +987,7 @@ mod tests { } else { panic!("Expected JSON object"); } - + Ok(()) } -} \ No newline at end of file +} diff --git a/parquet-variant/src/lib.rs b/parquet-variant/src/lib.rs index 7bbbb96ef8ce..da9afa89c1ce 100644 --- a/parquet-variant/src/lib.rs +++ b/parquet-variant/src/lib.rs @@ -38,5 +38,7 @@ mod builder; mod utils; pub use builder::*; +pub use encoder::variant_to_json::{ + variant_to_json, variant_to_json_string, variant_to_json_value, +}; pub use variant::*; -pub use encoder::variant_to_json::{variant_to_json, variant_to_json_string, variant_to_json_value}; diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 1b5a0011e562..f7e021238aa4 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -395,7 +395,7 @@ impl<'m, 'v> VariantObject<'m, 'v> { let field_list = self.parse_field_list()?; Ok(field_list.into_iter()) } - + pub fn values(&self) -> Result>, ArrowError> { let fields = self.parse_field_list()?; let values: Vec<_> = fields.into_iter().map(|(_, variant)| variant).collect(); diff --git a/parquet-variant/tests/integration_test.rs b/parquet-variant/tests/integration_test.rs index d60be07d4586..0da1981a8984 100644 --- a/parquet-variant/tests/integration_test.rs +++ b/parquet-variant/tests/integration_test.rs @@ -17,9 +17,9 @@ //! Integration tests for JSON conversion functionality +use chrono::{DateTime, NaiveDate, Utc}; use parquet_variant::{variant_to_json_string, variant_to_json_value, Variant}; use serde_json::Value; -use chrono::{DateTime, NaiveDate, Utc}; #[test] fn test_primitive_values_to_json() { @@ -33,26 +33,47 @@ fn test_primitive_values_to_json() { let bool_false = Variant::BooleanFalse; assert_eq!(variant_to_json_string(&bool_true).unwrap(), "true"); assert_eq!(variant_to_json_string(&bool_false).unwrap(), "false"); - assert_eq!(variant_to_json_value(&bool_true).unwrap(), Value::Bool(true)); - assert_eq!(variant_to_json_value(&bool_false).unwrap(), Value::Bool(false)); + assert_eq!( + variant_to_json_value(&bool_true).unwrap(), + Value::Bool(true) + ); + assert_eq!( + variant_to_json_value(&bool_false).unwrap(), + Value::Bool(false) + ); // Test integers let int8_variant = Variant::Int8(42); assert_eq!(variant_to_json_string(&int8_variant).unwrap(), "42"); - assert_eq!(variant_to_json_value(&int8_variant).unwrap(), Value::Number(42.into())); + assert_eq!( + variant_to_json_value(&int8_variant).unwrap(), + Value::Number(42.into()) + ); let negative_int8 = Variant::Int8(-123); assert_eq!(variant_to_json_string(&negative_int8).unwrap(), "-123"); - assert_eq!(variant_to_json_value(&negative_int8).unwrap(), Value::Number((-123).into())); + assert_eq!( + variant_to_json_value(&negative_int8).unwrap(), + Value::Number((-123).into()) + ); // Test strings let string_variant = Variant::String("hello world"); - assert_eq!(variant_to_json_string(&string_variant).unwrap(), "\"hello world\""); - assert_eq!(variant_to_json_value(&string_variant).unwrap(), Value::String("hello world".to_string())); + assert_eq!( + variant_to_json_string(&string_variant).unwrap(), + "\"hello world\"" + ); + assert_eq!( + variant_to_json_value(&string_variant).unwrap(), + Value::String("hello world".to_string()) + ); let short_string = Variant::ShortString("test"); assert_eq!(variant_to_json_string(&short_string).unwrap(), "\"test\""); - assert_eq!(variant_to_json_value(&short_string).unwrap(), Value::String("test".to_string())); + assert_eq!( + variant_to_json_value(&short_string).unwrap(), + Value::String("test".to_string()) + ); } #[test] @@ -60,29 +81,59 @@ fn test_integer_types_to_json() { // Test Int16 let int16_variant = Variant::Int16(32767); assert_eq!(variant_to_json_string(&int16_variant).unwrap(), "32767"); - assert_eq!(variant_to_json_value(&int16_variant).unwrap(), Value::Number(32767.into())); + assert_eq!( + variant_to_json_value(&int16_variant).unwrap(), + Value::Number(32767.into()) + ); let negative_int16 = Variant::Int16(-32768); assert_eq!(variant_to_json_string(&negative_int16).unwrap(), "-32768"); - assert_eq!(variant_to_json_value(&negative_int16).unwrap(), Value::Number((-32768).into())); + assert_eq!( + variant_to_json_value(&negative_int16).unwrap(), + Value::Number((-32768).into()) + ); // Test Int32 let int32_variant = Variant::Int32(2147483647); - assert_eq!(variant_to_json_string(&int32_variant).unwrap(), "2147483647"); - assert_eq!(variant_to_json_value(&int32_variant).unwrap(), Value::Number(2147483647.into())); + assert_eq!( + variant_to_json_string(&int32_variant).unwrap(), + "2147483647" + ); + assert_eq!( + variant_to_json_value(&int32_variant).unwrap(), + Value::Number(2147483647.into()) + ); let negative_int32 = Variant::Int32(-2147483648); - assert_eq!(variant_to_json_string(&negative_int32).unwrap(), "-2147483648"); - assert_eq!(variant_to_json_value(&negative_int32).unwrap(), Value::Number((-2147483648).into())); + assert_eq!( + variant_to_json_string(&negative_int32).unwrap(), + "-2147483648" + ); + assert_eq!( + variant_to_json_value(&negative_int32).unwrap(), + Value::Number((-2147483648).into()) + ); // Test Int64 let int64_variant = Variant::Int64(9223372036854775807); - assert_eq!(variant_to_json_string(&int64_variant).unwrap(), "9223372036854775807"); - assert_eq!(variant_to_json_value(&int64_variant).unwrap(), Value::Number(9223372036854775807i64.into())); + assert_eq!( + variant_to_json_string(&int64_variant).unwrap(), + "9223372036854775807" + ); + assert_eq!( + variant_to_json_value(&int64_variant).unwrap(), + Value::Number(9223372036854775807i64.into()) + ); let negative_int64 = Variant::Int64(-9223372036854775808); - assert_eq!(variant_to_json_string(&negative_int64).unwrap(), "-9223372036854775808"); - assert_eq!(variant_to_json_value(&negative_int64).unwrap(), Value::Number((-9223372036854775808i64).into())); + assert_eq!( + variant_to_json_string(&negative_int64).unwrap(), + "-9223372036854775808" + ); + assert_eq!( + variant_to_json_value(&negative_int64).unwrap(), + Value::Number((-9223372036854775808i64).into()) + ); } #[test] @@ -91,7 +142,7 @@ fn test_floating_point_types_to_json() { let float_variant = Variant::Float(3.14159); let float_json = variant_to_json_string(&float_variant).unwrap(); assert!(float_json.starts_with("3.14159")); - + let float_value = variant_to_json_value(&float_variant).unwrap(); assert!(matches!(float_value, Value::Number(_))); @@ -99,7 +150,7 @@ fn test_floating_point_types_to_json() { let double_variant = Variant::Double(2.718281828459045); let double_json = variant_to_json_string(&double_variant).unwrap(); assert!(double_json.starts_with("2.718281828459045")); - + let double_value = variant_to_json_value(&double_variant).unwrap(); assert!(matches!(double_value, Value::Number(_))); @@ -114,28 +165,46 @@ fn test_floating_point_types_to_json() { #[test] fn test_decimal_types_to_json() { // Test Decimal4 (i32 with scale) - let decimal4_variant = Variant::Decimal4 { integer: 12345, scale: 2 }; + let decimal4_variant = Variant::Decimal4 { + integer: 12345, + scale: 2, + }; assert_eq!(variant_to_json_string(&decimal4_variant).unwrap(), "123.45"); - + let decimal4_value = variant_to_json_value(&decimal4_variant).unwrap(); assert!(matches!(decimal4_value, Value::Number(_))); // Test Decimal8 (i64 with scale) - let decimal8_variant = Variant::Decimal8 { integer: 1234567890, scale: 3 }; - assert_eq!(variant_to_json_string(&decimal8_variant).unwrap(), "1234567.89"); - + let decimal8_variant = Variant::Decimal8 { + integer: 1234567890, + scale: 3, + }; + assert_eq!( + variant_to_json_string(&decimal8_variant).unwrap(), + "1234567.89" + ); + let decimal8_value = variant_to_json_value(&decimal8_variant).unwrap(); assert!(matches!(decimal8_value, Value::Number(_))); // Test Decimal16 (i128 with scale) - let decimal16_variant = Variant::Decimal16 { integer: 123456789012345, scale: 4 }; - assert_eq!(variant_to_json_string(&decimal16_variant).unwrap(), "12345678901.2345"); - + let decimal16_variant = Variant::Decimal16 { + integer: 123456789012345, + scale: 4, + }; + assert_eq!( + variant_to_json_string(&decimal16_variant).unwrap(), + "12345678901.2345" + ); + let decimal16_value = variant_to_json_value(&decimal16_variant).unwrap(); assert!(matches!(decimal16_value, Value::Number(_))); // Test zero scale decimal - let no_scale_decimal = Variant::Decimal4 { integer: 42, scale: 0 }; + let no_scale_decimal = Variant::Decimal4 { + integer: 42, + scale: 0, + }; assert_eq!(variant_to_json_string(&no_scale_decimal).unwrap(), "42"); } @@ -144,18 +213,28 @@ fn test_date_and_timestamp_types_to_json() { // Test Date let date = NaiveDate::from_ymd_opt(2023, 12, 25).unwrap(); let date_variant = Variant::Date(date); - assert_eq!(variant_to_json_string(&date_variant).unwrap(), "\"2023-12-25\""); - assert_eq!(variant_to_json_value(&date_variant).unwrap(), Value::String("2023-12-25".to_string())); + assert_eq!( + variant_to_json_string(&date_variant).unwrap(), + "\"2023-12-25\"" + ); + assert_eq!( + variant_to_json_value(&date_variant).unwrap(), + Value::String("2023-12-25".to_string()) + ); // Test TimestampMicros (UTC) - let timestamp_utc = DateTime::parse_from_rfc3339("2023-12-25T10:30:45Z").unwrap().with_timezone(&Utc); + let timestamp_utc = DateTime::parse_from_rfc3339("2023-12-25T10:30:45Z") + .unwrap() + .with_timezone(&Utc); let timestamp_variant = Variant::TimestampMicros(timestamp_utc); let timestamp_json = variant_to_json_string(×tamp_variant).unwrap(); assert!(timestamp_json.contains("2023-12-25T10:30:45")); assert!(timestamp_json.starts_with('"') && timestamp_json.ends_with('"')); // Test TimestampNtzMicros (naive datetime) - let naive_timestamp = DateTime::from_timestamp(1703505045, 123456).unwrap().naive_utc(); + let naive_timestamp = DateTime::from_timestamp(1703505045, 123456) + .unwrap() + .naive_utc(); let naive_timestamp_variant = Variant::TimestampNtzMicros(naive_timestamp); let naive_json = variant_to_json_string(&naive_timestamp_variant).unwrap(); assert!(naive_json.contains("2023-12-25")); @@ -167,14 +246,14 @@ fn test_binary_type_to_json() { // Test Binary data (encoded as base64) let binary_data = b"Hello, World!"; let binary_variant = Variant::Binary(binary_data); - + let binary_json = variant_to_json_string(&binary_variant).unwrap(); // Should be base64 encoded and quoted assert!(binary_json.starts_with('"') && binary_json.ends_with('"')); - + let binary_value = variant_to_json_value(&binary_variant).unwrap(); assert!(matches!(binary_value, Value::String(_))); - + // Test empty binary let empty_binary = Variant::Binary(b""); let empty_json = variant_to_json_string(&empty_binary).unwrap(); @@ -200,9 +279,18 @@ fn test_comprehensive_roundtrip_compatibility() { Variant::Int64(10000000000), Variant::Float(3.5), // Use a value that can be represented exactly in f32 Variant::Double(2.718281828), - Variant::Decimal4 { integer: 12345, scale: 2 }, - Variant::Decimal8 { integer: 1234567890, scale: 3 }, - Variant::Decimal16 { integer: 123456789012345, scale: 4 }, + Variant::Decimal4 { + integer: 12345, + scale: 2, + }, + Variant::Decimal8 { + integer: 1234567890, + scale: 3, + }, + Variant::Decimal16 { + integer: 123456789012345, + scale: 4, + }, Variant::Date(NaiveDate::from_ymd_opt(2023, 1, 1).unwrap()), Variant::String("test string"), Variant::ShortString("short"), @@ -211,10 +299,10 @@ fn test_comprehensive_roundtrip_compatibility() { for variant in test_cases { let json_string = variant_to_json_string(&variant).unwrap(); - + // Ensure the JSON can be parsed back let parsed: Value = serde_json::from_str(&json_string).unwrap(); - + // Ensure our direct Value conversion matches let direct_value = variant_to_json_value(&variant).unwrap(); assert_eq!(parsed, direct_value, "Mismatch for variant: {:?}", variant); @@ -226,7 +314,10 @@ fn test_string_escaping_edge_cases() { // Test various escape sequences let escaped_string = Variant::String("line1\nline2\ttab\"quote\"\\backslash"); let expected_json = "\"line1\\nline2\\ttab\\\"quote\\\"\\\\backslash\""; - assert_eq!(variant_to_json_string(&escaped_string).unwrap(), expected_json); + assert_eq!( + variant_to_json_string(&escaped_string).unwrap(), + expected_json + ); // Test Unicode characters let unicode_string = Variant::String("Hello δΈ–η•Œ 🌍"); @@ -253,10 +344,10 @@ fn test_json_roundtrip_compatibility() { for variant in test_cases { let json_string = variant_to_json_string(&variant).unwrap(); - + // Ensure the JSON can be parsed back let parsed: Value = serde_json::from_str(&json_string).unwrap(); - + // Ensure our direct Value conversion matches let direct_value = variant_to_json_value(&variant).unwrap(); assert_eq!(parsed, direct_value, "Mismatch for variant: {:?}", variant); @@ -269,29 +360,29 @@ fn test_buffer_writing() { use std::io::Write; let variant = Variant::String("test buffer writing"); - + // Test writing to a Vec let mut buffer = Vec::new(); variant_to_json(&mut buffer, &variant).unwrap(); let result = String::from_utf8(buffer).unwrap(); assert_eq!(result, "\"test buffer writing\""); - + // Test writing to a custom writer struct CustomWriter { data: Vec, } - + impl Write for CustomWriter { fn write(&mut self, buf: &[u8]) -> std::io::Result { self.data.extend_from_slice(buf); Ok(buf.len()) } - + fn flush(&mut self) -> std::io::Result<()> { Ok(()) } } - + let mut custom_writer = CustomWriter { data: Vec::new() }; variant_to_json(&mut custom_writer, &variant).unwrap(); let result = String::from_utf8(custom_writer.data).unwrap(); @@ -300,4 +391,4 @@ fn test_buffer_writing() { // Note: Tests for arrays and objects would require actual Variant data structures // to be created, which would need the builder API to be implemented first. -// These tests demonstrate the primitive functionality that's currently working. \ No newline at end of file +// These tests demonstrate the primitive functionality that's currently working. From b8f708adb43f5497ae3f662492345909933b9dd0 Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Thu, 19 Jun 2025 02:02:01 -0400 Subject: [PATCH 09/32] [FIX] fix clippy warnings --- parquet-variant/src/encoder/variant_to_json.rs | 8 ++++---- parquet-variant/tests/integration_test.rs | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/parquet-variant/src/encoder/variant_to_json.rs b/parquet-variant/src/encoder/variant_to_json.rs index 117d15655496..7b9c2cdf579b 100644 --- a/parquet-variant/src/encoder/variant_to_json.rs +++ b/parquet-variant/src/encoder/variant_to_json.rs @@ -407,7 +407,7 @@ mod tests { #[test] fn test_float_to_json() -> Result<(), ArrowError> { - let variant = Variant::Float(3.14159); + let variant = Variant::Float(std::f32::consts::PI); let json = variant_to_json_string(&variant)?; assert!(json.starts_with("3.14159")); @@ -428,7 +428,7 @@ mod tests { #[test] fn test_double_to_json() -> Result<(), ArrowError> { - let variant = Variant::Double(2.718281828459045); + let variant = Variant::Double(std::f64::consts::E); let json = variant_to_json_string(&variant)?; assert!(json.starts_with("2.718281828459045")); @@ -852,7 +852,7 @@ mod tests { list.append_value(42i32); list.append_value(true); list.append_value(()); // null - list.append_value(3.14f64); + list.append_value(std::f64::consts::PI); list.finish(); } @@ -921,7 +921,7 @@ mod tests { list.append_value("string_value"); list.append_value(42i32); list.append_value(true); - list.append_value(3.14f64); + list.append_value(std::f64::consts::PI); list.append_value(false); list.append_value(()); // null list.append_value(100i64); diff --git a/parquet-variant/tests/integration_test.rs b/parquet-variant/tests/integration_test.rs index 0da1981a8984..714f8a57f18f 100644 --- a/parquet-variant/tests/integration_test.rs +++ b/parquet-variant/tests/integration_test.rs @@ -139,7 +139,7 @@ fn test_integer_types_to_json() { #[test] fn test_floating_point_types_to_json() { // Test Float (f32) - let float_variant = Variant::Float(3.14159); + let float_variant = Variant::Float(std::f32::consts::PI); let float_json = variant_to_json_string(&float_variant).unwrap(); assert!(float_json.starts_with("3.14159")); @@ -147,7 +147,7 @@ fn test_floating_point_types_to_json() { assert!(matches!(float_value, Value::Number(_))); // Test Double (f64) - let double_variant = Variant::Double(2.718281828459045); + let double_variant = Variant::Double(std::f64::consts::E); let double_json = variant_to_json_string(&double_variant).unwrap(); assert!(double_json.starts_with("2.718281828459045")); @@ -278,7 +278,7 @@ fn test_comprehensive_roundtrip_compatibility() { Variant::Int32(100000), Variant::Int64(10000000000), Variant::Float(3.5), // Use a value that can be represented exactly in f32 - Variant::Double(2.718281828), + Variant::Double(std::f64::consts::E), Variant::Decimal4 { integer: 12345, scale: 2, From 1ac766dc228df796114ed665403c5efcd4034ec9 Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Sat, 21 Jun 2025 18:58:58 -0400 Subject: [PATCH 10/32] [RENAME] rename file to to_json --- parquet-variant/src/encoder.rs | 20 ------------------- parquet-variant/src/lib.rs | 4 ++-- .../variant_to_json.rs => to_json.rs} | 0 3 files changed, 2 insertions(+), 22 deletions(-) delete mode 100644 parquet-variant/src/encoder.rs rename parquet-variant/src/{encoder/variant_to_json.rs => to_json.rs} (100%) diff --git a/parquet-variant/src/encoder.rs b/parquet-variant/src/encoder.rs deleted file mode 100644 index 646c8ba889e1..000000000000 --- a/parquet-variant/src/encoder.rs +++ /dev/null @@ -1,20 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Encoder module for converting Variant values to other formats - -pub mod variant_to_json; diff --git a/parquet-variant/src/lib.rs b/parquet-variant/src/lib.rs index da9afa89c1ce..cdb59689a838 100644 --- a/parquet-variant/src/lib.rs +++ b/parquet-variant/src/lib.rs @@ -30,15 +30,15 @@ // TODO: dead code removal #[allow(dead_code)] mod decoder; -mod encoder; mod variant; // TODO: dead code removal mod builder; #[allow(dead_code)] mod utils; +mod to_json; pub use builder::*; -pub use encoder::variant_to_json::{ +pub use to_json::{ variant_to_json, variant_to_json_string, variant_to_json_value, }; pub use variant::*; diff --git a/parquet-variant/src/encoder/variant_to_json.rs b/parquet-variant/src/to_json.rs similarity index 100% rename from parquet-variant/src/encoder/variant_to_json.rs rename to parquet-variant/src/to_json.rs From 5f7cca73b1f8c10521b9ba977d85d47bbde3947c Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Sat, 21 Jun 2025 19:02:24 -0400 Subject: [PATCH 11/32] [FIX] use impl in convert_variant_to_json function --- parquet-variant/src/to_json.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/parquet-variant/src/to_json.rs b/parquet-variant/src/to_json.rs index 7b9c2cdf579b..a4e8f0bd2d0a 100644 --- a/parquet-variant/src/to_json.rs +++ b/parquet-variant/src/to_json.rs @@ -51,7 +51,7 @@ use crate::variant::{Variant, VariantList, VariantObject}; /// } /// example().unwrap(); /// ``` -pub fn variant_to_json(json_buffer: &mut W, variant: &Variant) -> Result<(), ArrowError> { +pub fn variant_to_json(json_buffer: &mut impl Write, variant: &Variant) -> Result<(), ArrowError> { match variant { Variant::Null => { write!(json_buffer, "null")?; @@ -133,7 +133,7 @@ pub fn variant_to_json(json_buffer: &mut W, variant: &Variant) -> Resu } /// Convert object fields to JSON -fn convert_object_to_json(buffer: &mut W, obj: &VariantObject) -> Result<(), ArrowError> { +fn convert_object_to_json(buffer: &mut impl Write, obj: &VariantObject) -> Result<(), ArrowError> { write!(buffer, "{{")?; // Get all fields from the object @@ -161,7 +161,7 @@ fn convert_object_to_json(buffer: &mut W, obj: &VariantObject) -> Resu } /// Convert array elements to JSON -fn convert_array_to_json(buffer: &mut W, arr: &VariantList) -> Result<(), ArrowError> { +fn convert_array_to_json(buffer: &mut impl Write, arr: &VariantList) -> Result<(), ArrowError> { write!(buffer, "[")?; let len = arr.len(); From e2502a6107bf25f99b539db367c368054527a087 Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Sat, 21 Jun 2025 19:14:04 -0400 Subject: [PATCH 12/32] [FIX] try removing f64 to convert decimal to string --- parquet-variant/src/to_json.rs | 75 ++++++++++++++++++++++++---------- 1 file changed, 54 insertions(+), 21 deletions(-) diff --git a/parquet-variant/src/to_json.rs b/parquet-variant/src/to_json.rs index a4e8f0bd2d0a..cb8deef95d06 100644 --- a/parquet-variant/src/to_json.rs +++ b/parquet-variant/src/to_json.rs @@ -81,22 +81,55 @@ pub fn variant_to_json(json_buffer: &mut impl Write, variant: &Variant) -> Resul write!(json_buffer, "{}", f)?; } Variant::Decimal4 { integer, scale } => { - // Convert decimal to string representation - let divisor = 10_i32.pow(*scale as u32); - let decimal_value = *integer as f64 / divisor as f64; - write!(json_buffer, "{}", decimal_value)?; + // Convert decimal to string representation using integer arithmetic + if *scale == 0 { + write!(json_buffer, "{}", integer)?; + } else { + let divisor = 10_i32.pow(*scale as u32); + let quotient = integer / divisor; + let remainder = (integer % divisor).abs(); + let formatted_remainder = format!("{:0width$}", remainder, width = *scale as usize); + let trimmed_remainder = formatted_remainder.trim_end_matches('0'); + if trimmed_remainder.is_empty() { + write!(json_buffer, "{}", quotient)?; + } else { + write!(json_buffer, "{}.{}", quotient, trimmed_remainder)?; + } + } } Variant::Decimal8 { integer, scale } => { - // Convert decimal to string representation - let divisor = 10_i64.pow(*scale as u32); - let decimal_value = *integer as f64 / divisor as f64; - write!(json_buffer, "{}", decimal_value)?; + // Convert decimal to string representation using integer arithmetic + if *scale == 0 { + write!(json_buffer, "{}", integer)?; + } else { + let divisor = 10_i64.pow(*scale as u32); + let quotient = integer / divisor; + let remainder = (integer % divisor).abs(); + let formatted_remainder = format!("{:0width$}", remainder, width = *scale as usize); + let trimmed_remainder = formatted_remainder.trim_end_matches('0'); + if trimmed_remainder.is_empty() { + write!(json_buffer, "{}", quotient)?; + } else { + write!(json_buffer, "{}.{}", quotient, trimmed_remainder)?; + } + } } Variant::Decimal16 { integer, scale } => { - // Convert decimal to string representation - let divisor = 10_i128.pow(*scale as u32); - let decimal_value = *integer as f64 / divisor as f64; - write!(json_buffer, "{}", decimal_value)?; + // Convert decimal to string representation using integer arithmetic + if *scale == 0 { + write!(json_buffer, "{}", integer)?; + } else { + let divisor = 10_i128.pow(*scale as u32); + let quotient = integer / divisor; + let remainder = (integer % divisor).abs(); + let formatted_remainder = format!("{:0width$}", remainder, width = *scale as usize); + let trimmed_remainder = formatted_remainder.trim_end_matches('0'); + if trimmed_remainder.is_empty() { + write!(json_buffer, "{}", quotient)?; + } else { + write!(json_buffer, "{}.{}", quotient, trimmed_remainder)?; + } + } } Variant::Date(date) => { write!(json_buffer, "\"{}\"", date.format("%Y-%m-%d"))?; @@ -252,30 +285,30 @@ pub fn variant_to_json_value(variant: &Variant) -> Result { .map(Value::Number) .ok_or_else(|| ArrowError::InvalidArgumentError("Invalid double value".to_string())), Variant::Decimal4 { integer, scale } => { - let divisor = 10_i32.pow(*scale as u32); - let decimal_value = *integer as f64 / divisor as f64; + let divisor = 10_i32.pow(*scale as u32); + let decimal_value = *integer as f64 / divisor as f64; serde_json::Number::from_f64(decimal_value) .map(Value::Number) .ok_or_else(|| { - ArrowError::InvalidArgumentError("Invalid decimal value".to_string()) +ArrowError::InvalidArgumentError("Invalid decimal value".to_string()) }) } Variant::Decimal8 { integer, scale } => { - let divisor = 10_i64.pow(*scale as u32); - let decimal_value = *integer as f64 / divisor as f64; + let divisor = 10_i64.pow(*scale as u32); + let decimal_value = *integer as f64 / divisor as f64; serde_json::Number::from_f64(decimal_value) .map(Value::Number) .ok_or_else(|| { - ArrowError::InvalidArgumentError("Invalid decimal value".to_string()) +ArrowError::InvalidArgumentError("Invalid decimal value".to_string()) }) } Variant::Decimal16 { integer, scale } => { - let divisor = 10_i128.pow(*scale as u32); - let decimal_value = *integer as f64 / divisor as f64; + let divisor = 10_i128.pow(*scale as u32); + let decimal_value = *integer as f64 / divisor as f64; serde_json::Number::from_f64(decimal_value) .map(Value::Number) .ok_or_else(|| { - ArrowError::InvalidArgumentError("Invalid decimal value".to_string()) +ArrowError::InvalidArgumentError("Invalid decimal value".to_string()) }) } Variant::Date(date) => Ok(Value::String(date.format("%Y-%m-%d").to_string())), From d947dce767347e2db86b25ff5686e06beebf44c4 Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Sat, 21 Jun 2025 19:18:28 -0400 Subject: [PATCH 13/32] [FIX] make date time formatting more modular --- parquet-variant/src/to_json.rs | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/parquet-variant/src/to_json.rs b/parquet-variant/src/to_json.rs index cb8deef95d06..c2aebf83495a 100644 --- a/parquet-variant/src/to_json.rs +++ b/parquet-variant/src/to_json.rs @@ -24,6 +24,23 @@ use std::io::Write; use crate::variant::{Variant, VariantList, VariantObject}; +// Format string constants to avoid duplication and reduce errors +const DATE_FORMAT: &str = "%Y-%m-%d"; +const TIMESTAMP_NTZ_FORMAT: &str = "%Y-%m-%dT%H:%M:%S%.6f"; + +// Helper functions for consistent formatting +fn format_date_string(date: &chrono::NaiveDate) -> String { + date.format(DATE_FORMAT).to_string() +} + +fn format_timestamp_ntz_string(ts: &chrono::NaiveDateTime) -> String { + ts.format(TIMESTAMP_NTZ_FORMAT).to_string() +} + +fn format_binary_base64(bytes: &[u8]) -> String { + general_purpose::STANDARD.encode(bytes) +} + /// Converts a Variant to JSON and writes it to the provided buffer /// /// # Arguments @@ -132,17 +149,17 @@ pub fn variant_to_json(json_buffer: &mut impl Write, variant: &Variant) -> Resul } } Variant::Date(date) => { - write!(json_buffer, "\"{}\"", date.format("%Y-%m-%d"))?; + write!(json_buffer, "\"{}\"", format_date_string(date))?; } Variant::TimestampMicros(ts) => { write!(json_buffer, "\"{}\"", ts.to_rfc3339())?; } Variant::TimestampNtzMicros(ts) => { - write!(json_buffer, "\"{}\"", ts.format("%Y-%m-%dT%H:%M:%S%.6f"))?; + write!(json_buffer, "\"{}\"", format_timestamp_ntz_string(ts))?; } Variant::Binary(bytes) => { // Encode binary as base64 string - let base64_str = general_purpose::STANDARD.encode(bytes); + let base64_str = format_binary_base64(bytes); let json_str = serde_json::to_string(&base64_str).map_err(|e| { ArrowError::InvalidArgumentError(format!("JSON encoding error: {}", e)) })?; @@ -311,12 +328,10 @@ ArrowError::InvalidArgumentError("Invalid decimal value".to_string()) ArrowError::InvalidArgumentError("Invalid decimal value".to_string()) }) } - Variant::Date(date) => Ok(Value::String(date.format("%Y-%m-%d").to_string())), + Variant::Date(date) => Ok(Value::String(format_date_string(date))), Variant::TimestampMicros(ts) => Ok(Value::String(ts.to_rfc3339())), - Variant::TimestampNtzMicros(ts) => Ok(Value::String( - ts.format("%Y-%m-%dT%H:%M:%S%.6f").to_string(), - )), - Variant::Binary(bytes) => Ok(Value::String(general_purpose::STANDARD.encode(bytes))), + Variant::TimestampNtzMicros(ts) => Ok(Value::String(format_timestamp_ntz_string(ts))), + Variant::Binary(bytes) => Ok(Value::String(format_binary_base64(bytes))), Variant::String(s) | Variant::ShortString(s) => Ok(Value::String(s.to_string())), Variant::Object(obj) => { let mut map = serde_json::Map::new(); From c96c2d43e99efeadef298c512bec3c527ad62e10 Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Sat, 21 Jun 2025 19:43:09 -0400 Subject: [PATCH 14/32] [FIX] fix merge conflicts --- .../examples/variant_to_json_examples.rs | 2 +- parquet-variant/src/to_json.rs | 25 +- parquet-variant/src/variant.rs | 548 ------------------ parquet-variant/tests/integration_test.rs | 6 +- 4 files changed, 21 insertions(+), 560 deletions(-) diff --git a/parquet-variant/examples/variant_to_json_examples.rs b/parquet-variant/examples/variant_to_json_examples.rs index 951eb7b47585..fceda22175f3 100644 --- a/parquet-variant/examples/variant_to_json_examples.rs +++ b/parquet-variant/examples/variant_to_json_examples.rs @@ -27,7 +27,7 @@ fn main() -> Result<(), Box> { ("Integer 42", Variant::Int8(42)), ("Negative Integer", Variant::Int8(-123)), ("String", Variant::String("Hello, World!")), - ("Short String", Variant::ShortString("Hi!")), + ("Short String", Variant::ShortString(parquet_variant::ShortString::try_new("Hi!").unwrap())), ]; for (name, variant) in variants { diff --git a/parquet-variant/src/to_json.rs b/parquet-variant/src/to_json.rs index c2aebf83495a..95d9fab689a8 100644 --- a/parquet-variant/src/to_json.rs +++ b/parquet-variant/src/to_json.rs @@ -165,13 +165,20 @@ pub fn variant_to_json(json_buffer: &mut impl Write, variant: &Variant) -> Resul })?; write!(json_buffer, "{}", json_str)?; } - Variant::String(s) | Variant::ShortString(s) => { + Variant::String(s) => { // Use serde_json to properly escape the string let json_str = serde_json::to_string(s).map_err(|e| { ArrowError::InvalidArgumentError(format!("JSON encoding error: {}", e)) })?; write!(json_buffer, "{}", json_str)?; } + Variant::ShortString(s) => { + // Use serde_json to properly escape the string + let json_str = serde_json::to_string(s.as_str()).map_err(|e| { + ArrowError::InvalidArgumentError(format!("JSON encoding error: {}", e)) + })?; + write!(json_buffer, "{}", json_str)?; + } Variant::Object(obj) => { convert_object_to_json(json_buffer, obj)?; } @@ -187,10 +194,9 @@ fn convert_object_to_json(buffer: &mut impl Write, obj: &VariantObject) -> Resul write!(buffer, "{{")?; // Get all fields from the object - let fields = obj.fields()?; let mut first = true; - for (key, value) in fields { + for (key, value) in obj.iter() { if !first { write!(buffer, ",")?; } @@ -332,12 +338,12 @@ ArrowError::InvalidArgumentError("Invalid decimal value".to_string()) Variant::TimestampMicros(ts) => Ok(Value::String(ts.to_rfc3339())), Variant::TimestampNtzMicros(ts) => Ok(Value::String(format_timestamp_ntz_string(ts))), Variant::Binary(bytes) => Ok(Value::String(format_binary_base64(bytes))), - Variant::String(s) | Variant::ShortString(s) => Ok(Value::String(s.to_string())), + Variant::String(s) => Ok(Value::String(s.to_string())), + Variant::ShortString(s) => Ok(Value::String(s.to_string())), Variant::Object(obj) => { let mut map = serde_json::Map::new(); - let fields = obj.fields()?; - for (key, value) in fields { + for (key, value) in obj.iter() { let json_value = variant_to_json_value(&value)?; map.insert(key.to_string(), json_value); } @@ -659,7 +665,9 @@ mod tests { #[test] fn test_short_string_to_json() -> Result<(), ArrowError> { - let variant = Variant::ShortString("short"); + use crate::variant::ShortString; + let short_string = ShortString::try_new("short")?; + let variant = Variant::ShortString(short_string); let json = variant_to_json_string(&variant)?; assert_eq!(json, "\"short\""); @@ -696,6 +704,7 @@ mod tests { #[test] fn test_comprehensive_type_coverage() -> Result<(), ArrowError> { + use crate::variant::ShortString; // Test all supported types to ensure no compilation errors let test_variants = vec![ Variant::Null, @@ -728,7 +737,7 @@ mod tests { Variant::TimestampNtzMicros(DateTime::from_timestamp(0, 0).unwrap().naive_utc()), Variant::Binary(b"test"), Variant::String("test"), - Variant::ShortString("test"), + Variant::ShortString(ShortString::try_new("test")?), ]; for variant in test_variants { diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index f7e021238aa4..27d867d71a05 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -33,554 +33,6 @@ mod object; const MAX_SHORT_STRING_BYTES: usize = 0x3F; - /// Return one unsigned little-endian value from `bytes`. - /// - /// * `bytes` – the Variant-metadata buffer. - /// * `byte_offset` – number of bytes to skip **before** reading the first - /// value (usually `1` to move past the header byte). - /// * `offset_index` – 0-based index **after** the skip - /// (`0` is the first value, `1` the next, …). - /// - /// Each value is `self as usize` bytes wide (1, 2, 3 or 4). - /// Three-byte values are zero-extended to 32 bits before the final - /// fallible cast to `usize`. - fn unpack_usize( - &self, - bytes: &[u8], - byte_offset: usize, // how many bytes to skip - offset_index: usize, // which offset in an array of offsets - ) -> Result { - use OffsetSizeBytes::*; - let offset = byte_offset + (*self as usize) * offset_index; - let result = match self { - One => u8::from_le_bytes(array_from_slice(bytes, offset)?).into(), - Two => u16::from_le_bytes(array_from_slice(bytes, offset)?).into(), - Three => { - // Let's grab the three byte le-chunk first - let b3_chunks: [u8; 3] = array_from_slice(bytes, offset)?; - // Let's pad it and construct a padded u32 from it. - let mut buf = [0u8; 4]; - buf[..3].copy_from_slice(&b3_chunks); - u32::from_le_bytes(buf) - .try_into() - .map_err(|e: TryFromIntError| ArrowError::InvalidArgumentError(e.to_string()))? - } - Four => u32::from_le_bytes(array_from_slice(bytes, offset)?) - .try_into() - .map_err(|e: TryFromIntError| ArrowError::InvalidArgumentError(e.to_string()))?, - }; - Ok(result) - } -} - -#[derive(Clone, Debug, Copy, PartialEq)] -pub(crate) struct VariantMetadataHeader { - version: u8, - is_sorted: bool, - /// Note: This is `offset_size_minus_one` + 1 - offset_size: OffsetSizeBytes, -} - -// According to the spec this is currently always = 1, and so we store this const for validation -// purposes and to make that visible. -const CORRECT_VERSION_VALUE: u8 = 1; - -impl VariantMetadataHeader { - /// Tries to construct the variant metadata header, which has the form - /// 7 6 5 4 3 0 - /// +-------+---+---+---------------+ - /// header | | | | version | - /// +-------+---+---+---------------+ - /// ^ ^ - /// | +-- sorted_strings - /// +-- offset_size_minus_one - /// The version is a 4-bit value that must always contain the value 1. - /// - sorted_strings is a 1-bit value indicating whether dictionary strings are sorted and unique. - /// - offset_size_minus_one is a 2-bit value providing the number of bytes per dictionary size and offset field. - /// - The actual number of bytes, offset_size, is offset_size_minus_one + 1 - pub(crate) fn try_new(bytes: &[u8]) -> Result { - let header = first_byte_from_slice(bytes)?; - - let version = header & 0x0F; // First four bits - if version != CORRECT_VERSION_VALUE { - let err_msg = format!( - "The version bytes in the header is not {CORRECT_VERSION_VALUE}, got {:b}", - version - ); - return Err(ArrowError::InvalidArgumentError(err_msg)); - } - let is_sorted = (header & 0x10) != 0; // Fifth bit - let offset_size_minus_one = header >> 6; // Last two bits - Ok(Self { - version, - is_sorted, - offset_size: OffsetSizeBytes::try_new(offset_size_minus_one)?, - }) - } -} - -#[derive(Clone, Copy, Debug, PartialEq)] -/// Encodes the Variant Metadata, see the Variant spec file for more information -pub struct VariantMetadata<'m> { - bytes: &'m [u8], - header: VariantMetadataHeader, - dict_size: usize, - dictionary_key_start_byte: usize, -} - -impl<'m> VariantMetadata<'m> { - /// View the raw bytes (needed by very low-level decoders) - #[inline] - pub const fn as_bytes(&self) -> &'m [u8] { - self.bytes - } - - pub fn try_new(bytes: &'m [u8]) -> Result { - let header = VariantMetadataHeader::try_new(bytes)?; - // Offset 1, index 0 because first element after header is dictionary size - let dict_size = header.offset_size.unpack_usize(bytes, 1, 0)?; - - // Check that we have the correct metadata length according to dictionary_size, or return - // error early. - // Minimum number of bytes the metadata buffer must contain: - // 1 byte header - // + offset_size-byte `dictionary_size` field - // + (dict_size + 1) offset entries, each `offset_size` bytes. (Table size, essentially) - // 1 + offset_size + (dict_size + 1) * offset_size - // = (dict_size + 2) * offset_size + 1 - let offset_size = header.offset_size as usize; // Cheap to copy - - let dictionary_key_start_byte = dict_size - .checked_add(2) - .and_then(|n| n.checked_mul(offset_size)) - .and_then(|n| n.checked_add(1)) - .ok_or_else(|| ArrowError::InvalidArgumentError("metadata length overflow".into()))?; - - if bytes.len() < dictionary_key_start_byte { - return Err(ArrowError::InvalidArgumentError( - "Metadata shorter than dictionary_size implies".to_string(), - )); - } - - // Check that all offsets are monotonically increasing - let mut offsets = (0..=dict_size).map(|i| header.offset_size.unpack_usize(bytes, 1, i + 1)); - let Some(Ok(mut end @ 0)) = offsets.next() else { - return Err(ArrowError::InvalidArgumentError( - "First offset is non-zero".to_string(), - )); - }; - - for offset in offsets { - let offset = offset?; - if end >= offset { - return Err(ArrowError::InvalidArgumentError( - "Offsets are not monotonically increasing".to_string(), - )); - } - end = offset; - } - - // Verify the buffer covers the whole dictionary-string section - if end > bytes.len() - dictionary_key_start_byte { - // `prev` holds the last offset seen still - return Err(ArrowError::InvalidArgumentError( - "Last offset does not equal dictionary length".to_string(), - )); - } - - Ok(Self { - bytes, - header, - dict_size, - dictionary_key_start_byte, - }) - } - - /// Whether the dictionary keys are sorted and unique - pub fn is_sorted(&self) -> bool { - self.header.is_sorted - } - - /// Get the dictionary size - pub fn dictionary_size(&self) -> usize { - self.dict_size - } - pub fn version(&self) -> u8 { - self.header.version - } - - /// Helper method to get the offset start and end range for a key by index. - fn get_offsets_for_key_by(&self, index: usize) -> Result, ArrowError> { - if index >= self.dict_size { - return Err(ArrowError::InvalidArgumentError(format!( - "Index {} out of bounds for dictionary of length {}", - index, self.dict_size - ))); - } - - // Skipping the header byte (setting byte_offset = 1) and the dictionary_size (setting offset_index +1) - let unpack = |i| self.header.offset_size.unpack_usize(self.bytes, 1, i + 1); - Ok(unpack(index)?..unpack(index + 1)?) - } - - /// Get a single offset by index - pub fn get_offset_by(&self, index: usize) -> Result { - if index >= self.dict_size { - return Err(ArrowError::InvalidArgumentError(format!( - "Index {} out of bounds for dictionary of length {}", - index, self.dict_size - ))); - } - - // Skipping the header byte (setting byte_offset = 1) and the dictionary_size (setting offset_index +1) - let unpack = |i| self.header.offset_size.unpack_usize(self.bytes, 1, i + 1); - unpack(index) - } - - /// Get the key-name by index - pub fn get_field_by(&self, index: usize) -> Result<&'m str, ArrowError> { - let offset_range = self.get_offsets_for_key_by(index)?; - self.get_field_by_offset(offset_range) - } - - /// Gets the field using an offset (Range) - helper method to keep consistent API. - pub(crate) fn get_field_by_offset(&self, offset: Range) -> Result<&'m str, ArrowError> { - let dictionary_keys_bytes = - slice_from_slice(self.bytes, self.dictionary_key_start_byte..self.bytes.len())?; - let result = string_from_slice(dictionary_keys_bytes, offset)?; - - Ok(result) - } - - #[allow(unused)] - pub(crate) fn header(&self) -> VariantMetadataHeader { - self.header - } - - /// Get the offsets as an iterator - pub fn offsets(&self) -> impl Iterator, ArrowError>> + 'm { - let offset_size = self.header.offset_size; // `Copy` - let bytes = self.bytes; - - (0..self.dict_size).map(move |i| { - // This wont be out of bounds as long as dict_size and offsets have been validated - // during construction via `try_new`, as it calls unpack_usize for the - // indices `1..dict_size+1` already. - let start = offset_size.unpack_usize(bytes, 1, i + 1); - let end = offset_size.unpack_usize(bytes, 1, i + 2); - - match (start, end) { - (Ok(s), Ok(e)) => Ok(s..e), - (Err(e), _) | (_, Err(e)) => Err(e), - } - }) - } - - /// Get all key-names as an Iterator of strings - pub fn fields( - &'m self, - ) -> Result>, ArrowError> { - let iterator = self - .offsets() - .map(move |offset_range| self.get_field_by_offset(offset_range?)); - Ok(iterator) - } -} - -#[derive(Clone, Debug, PartialEq)] -pub(crate) struct VariantObjectHeader { - field_offset_size: OffsetSizeBytes, - field_id_size: OffsetSizeBytes, - num_elements: usize, - field_ids_start_byte: usize, - field_offsets_start_byte: usize, - values_start_byte: usize, -} - -impl VariantObjectHeader { - pub(crate) fn try_new(value: &[u8]) -> Result { - // Parse the header byte to get object parameters - let header = first_byte_from_slice(value)?; - let value_header = header >> 2; - - let field_offset_size_minus_one = value_header & 0x03; // Last 2 bits - let field_id_size_minus_one = (value_header >> 2) & 0x03; // Next 2 bits - let is_large = value_header & 0x10; // 5th bit - - let field_offset_size = OffsetSizeBytes::try_new(field_offset_size_minus_one)?; - let field_id_size = OffsetSizeBytes::try_new(field_id_size_minus_one)?; - - // Determine num_elements size based on is_large flag - let num_elements_size = if is_large != 0 { - OffsetSizeBytes::Four - } else { - OffsetSizeBytes::One - }; - - // Parse num_elements - let num_elements = num_elements_size.unpack_usize(value, 1, 0)?; - - // Calculate byte offsets for different sections - let field_ids_start_byte = 1 + num_elements_size as usize; - let field_offsets_start_byte = field_ids_start_byte + num_elements * field_id_size as usize; - let values_start_byte = - field_offsets_start_byte + (num_elements + 1) * field_offset_size as usize; - - // Verify that the last field offset array entry is inside the value slice - let last_field_offset_byte = - field_offsets_start_byte + (num_elements + 1) * field_offset_size as usize; - if last_field_offset_byte > value.len() { - return Err(ArrowError::InvalidArgumentError(format!( - "Last field offset array entry at offset {} with length {} is outside the value slice of length {}", - last_field_offset_byte, - field_offset_size as usize, - value.len() - ))); - } - - // Verify that the value of the last field offset array entry fits inside the value slice - let last_field_offset = - field_offset_size.unpack_usize(value, field_offsets_start_byte, num_elements)?; - if values_start_byte + last_field_offset > value.len() { - return Err(ArrowError::InvalidArgumentError(format!( - "Last field offset value {} at offset {} is outside the value slice of length {}", - last_field_offset, - values_start_byte, - value.len() - ))); - } - Ok(Self { - field_offset_size, - field_id_size, - num_elements, - field_ids_start_byte, - field_offsets_start_byte, - values_start_byte, - }) - } - - /// Returns the number of key-value pairs in this object - pub(crate) fn num_elements(&self) -> usize { - self.num_elements - } -} - -#[derive(Clone, Debug, PartialEq)] -pub struct VariantObject<'m, 'v> { - pub metadata: VariantMetadata<'m>, - pub value: &'v [u8], - header: VariantObjectHeader, -} - -impl<'m, 'v> VariantObject<'m, 'v> { - pub fn try_new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Result { - Ok(Self { - metadata, - value, - header: VariantObjectHeader::try_new(value)?, - }) - } - - /// Returns the number of key-value pairs in this object - pub fn len(&self) -> usize { - self.header.num_elements() - } - - /// Returns true if the object contains no key-value pairs - pub fn is_empty(&self) -> bool { - self.len() == 0 - } - - pub fn fields(&self) -> Result)>, ArrowError> { - let field_list = self.parse_field_list()?; - Ok(field_list.into_iter()) - } - - pub fn values(&self) -> Result>, ArrowError> { - let fields = self.parse_field_list()?; - let values: Vec<_> = fields.into_iter().map(|(_, variant)| variant).collect(); - Ok(values.into_iter()) - } - - pub fn field(&self, name: &str) -> Result>, ArrowError> { - // Binary search through the field IDs of this object to find the requested field name. - // - // NOTE: This does not require a sorted metadata dictionary, because the variant spec - // requires object field ids to be lexically sorted by their corresponding string values, - // and probing the dictionary for a field id is always O(1) work. - let (field_ids, field_offsets) = self.parse_field_arrays()?; - let search_result = try_binary_search_by(&field_ids, &name, |&field_id| { - self.metadata.get_field_by(field_id) - })?; - - let Ok(index) = search_result else { - return Ok(None); - }; - let start_offset = field_offsets[index]; - let end_offset = field_offsets[index + 1]; - let value_bytes = slice_from_slice( - self.value, - self.header.values_start_byte + start_offset - ..self.header.values_start_byte + end_offset, - )?; - let variant = Variant::try_new_with_metadata(self.metadata, value_bytes)?; - Ok(Some(variant)) - } - - /// Parse field IDs and field offsets arrays using the cached header - fn parse_field_arrays(&self) -> Result<(Vec, Vec), ArrowError> { - // Parse field IDs - let field_ids = (0..self.header.num_elements) - .map(|i| { - self.header.field_id_size.unpack_usize( - self.value, - self.header.field_ids_start_byte, - i, - ) - }) - .collect::, _>>()?; - debug_assert_eq!(field_ids.len(), self.header.num_elements); - - // Parse field offsets (num_elements + 1 entries) - let field_offsets = (0..=self.header.num_elements) - .map(|i| { - self.header.field_offset_size.unpack_usize( - self.value, - self.header.field_offsets_start_byte, - i, - ) - }) - .collect::, _>>()?; - debug_assert_eq!(field_offsets.len(), self.header.num_elements + 1); - - Ok((field_ids, field_offsets)) - } - - /// Parse all fields into a vector for iteration - fn parse_field_list(&self) -> Result)>, ArrowError> { - let (field_ids, field_offsets) = self.parse_field_arrays()?; - - let mut fields = Vec::with_capacity(self.header.num_elements); - - for i in 0..self.header.num_elements { - let field_id = field_ids[i]; - let field_name = self.metadata.get_field_by(field_id)?; - - let start_offset = field_offsets[i]; - let value_bytes = - slice_from_slice(self.value, self.header.values_start_byte + start_offset..)?; - let variant = Variant::try_new_with_metadata(self.metadata, value_bytes)?; - - fields.push((field_name, variant)); - } - - Ok(fields) - } -} - -#[derive(Clone, Debug, PartialEq)] -pub(crate) struct VariantListHeader { - offset_size: OffsetSizeBytes, - is_large: bool, - num_elements: usize, - first_offset_byte: usize, - first_value_byte: usize, -} - -impl VariantListHeader { - pub(crate) fn try_new(value: &[u8]) -> Result { - // The 6 first bits to the left are the value_header and the 2 bits - // to the right are the basic type, so we shift to get only the value_header - let value_header = first_byte_from_slice(value)? >> 2; - let is_large = (value_header & 0x04) != 0; // 3rd bit from the right - let field_offset_size_minus_one = value_header & 0x03; // Last two bits - let offset_size = OffsetSizeBytes::try_new(field_offset_size_minus_one)?; - - // The size of the num_elements entry in the array value_data is 4 bytes if - // is_large is true, otherwise 1 byte. - let num_elements_size = match is_large { - true => OffsetSizeBytes::Four, - false => OffsetSizeBytes::One, - }; - - // Skip the header byte to read the num_elements - let num_elements = num_elements_size.unpack_usize(value, 1, 0)?; - let first_offset_byte = 1 + num_elements_size as usize; - - let overflow = - || ArrowError::InvalidArgumentError("Variant value_byte_length overflow".into()); - - // 1. num_elements + 1 - let n_offsets = num_elements.checked_add(1).ok_or_else(overflow)?; - - // 2. (num_elements + 1) * offset_size - let value_bytes = n_offsets - .checked_mul(offset_size as usize) - .ok_or_else(overflow)?; - - // 3. first_offset_byte + ... - let first_value_byte = first_offset_byte - .checked_add(value_bytes) - .ok_or_else(overflow)?; - - // Verify that the last offset array entry is inside the value slice - let last_offset_byte = first_offset_byte + n_offsets * offset_size as usize; - if last_offset_byte > value.len() { - return Err(ArrowError::InvalidArgumentError(format!( - "Last offset array entry at offset {} with length {} is outside the value slice of length {}", - last_offset_byte, - offset_size as usize, - value.len() - ))); - } - - // Verify that the value of the last offset array entry fits inside the value slice - let last_offset = offset_size.unpack_usize(value, first_offset_byte, num_elements)?; - if first_value_byte + last_offset > value.len() { - return Err(ArrowError::InvalidArgumentError(format!( - "Last offset value {} at offset {} is outside the value slice of length {}", - last_offset, - first_value_byte, - value.len() - ))); - } - - Ok(Self { - offset_size, - is_large, - num_elements, - first_offset_byte, - first_value_byte, - }) - } - - /// Returns the number of elements in this list - pub(crate) fn num_elements(&self) -> usize { - self.num_elements - } - - /// Returns the offset size in bytes - #[allow(unused)] - pub(crate) fn offset_size(&self) -> usize { - self.offset_size as _ - } - - /// Returns whether this is a large list - #[allow(unused)] - pub(crate) fn is_large(&self) -> bool { - self.is_large - } - - /// Returns the byte offset where the offset array starts - pub(crate) fn first_offset_byte(&self) -> usize { - self.first_offset_byte - } - - /// Returns the byte offset where the values start - pub(crate) fn first_value_byte(&self) -> usize { - self.first_value_byte - } -} /// Represents a variant array. /// diff --git a/parquet-variant/tests/integration_test.rs b/parquet-variant/tests/integration_test.rs index 714f8a57f18f..4741c130f630 100644 --- a/parquet-variant/tests/integration_test.rs +++ b/parquet-variant/tests/integration_test.rs @@ -68,7 +68,7 @@ fn test_primitive_values_to_json() { Value::String("hello world".to_string()) ); - let short_string = Variant::ShortString("test"); + let short_string = Variant::ShortString(parquet_variant::ShortString::try_new("test").unwrap()); assert_eq!(variant_to_json_string(&short_string).unwrap(), "\"test\""); assert_eq!( variant_to_json_value(&short_string).unwrap(), @@ -293,7 +293,7 @@ fn test_comprehensive_roundtrip_compatibility() { }, Variant::Date(NaiveDate::from_ymd_opt(2023, 1, 1).unwrap()), Variant::String("test string"), - Variant::ShortString("short"), + Variant::ShortString(parquet_variant::ShortString::try_new("short").unwrap()), Variant::Binary(b"binary data"), ]; @@ -339,7 +339,7 @@ fn test_json_roundtrip_compatibility() { Variant::String(""), Variant::String("simple string"), Variant::String("string with\nnewlines\tand\ttabs"), - Variant::ShortString("short"), + Variant::ShortString(parquet_variant::ShortString::try_new("short").unwrap()), ]; for variant in test_cases { From 743d0e92d66bdc9cc16d0b5d6a66ca8aca218672 Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Mon, 23 Jun 2025 22:30:30 -0400 Subject: [PATCH 15/32] [FIX] make intergration_tests.rs more modular --- parquet-variant/tests/integration_test.rs | 219 +++++++++++++--------- 1 file changed, 128 insertions(+), 91 deletions(-) diff --git a/parquet-variant/tests/integration_test.rs b/parquet-variant/tests/integration_test.rs index 4741c130f630..581a33b1d3f1 100644 --- a/parquet-variant/tests/integration_test.rs +++ b/parquet-variant/tests/integration_test.rs @@ -268,45 +268,47 @@ fn test_binary_type_to_json() { #[test] fn test_comprehensive_roundtrip_compatibility() { - // Test that our JSON output can be parsed back by serde_json for all types - let test_cases = vec![ - Variant::Null, - Variant::BooleanTrue, - Variant::BooleanFalse, - Variant::Int8(42), - Variant::Int16(1000), - Variant::Int32(100000), - Variant::Int64(10000000000), - Variant::Float(3.5), // Use a value that can be represented exactly in f32 - Variant::Double(std::f64::consts::E), - Variant::Decimal4 { - integer: 12345, - scale: 2, - }, - Variant::Decimal8 { - integer: 1234567890, - scale: 3, - }, - Variant::Decimal16 { - integer: 123456789012345, - scale: 4, - }, - Variant::Date(NaiveDate::from_ymd_opt(2023, 1, 1).unwrap()), - Variant::String("test string"), - Variant::ShortString(parquet_variant::ShortString::try_new("short").unwrap()), - Variant::Binary(b"binary data"), - ]; - - for variant in test_cases { - let json_string = variant_to_json_string(&variant).unwrap(); - - // Ensure the JSON can be parsed back - let parsed: Value = serde_json::from_str(&json_string).unwrap(); - - // Ensure our direct Value conversion matches - let direct_value = variant_to_json_value(&variant).unwrap(); - assert_eq!(parsed, direct_value, "Mismatch for variant: {:?}", variant); - } + Test { + variant: Variant::Float(3.5), + json: "3.5", + value: serde_json::Number::from_f64(3.5).map(Value::Number).unwrap(), + }.run(); + + Test { + variant: Variant::Double(2.718281828459045), + json: "2.718281828459045", + value: serde_json::Number::from_f64(2.718281828459045).map(Value::Number).unwrap(), + }.run(); + + Test { + variant: Variant::Decimal4 { integer: 12345, scale: 2 }, + json: "123.45", + value: serde_json::Number::from_f64(123.45).map(Value::Number).unwrap(), + }.run(); + + Test { + variant: Variant::Decimal8 { integer: 1234567890, scale: 3 }, + json: "1234567.89", + value: serde_json::Number::from_f64(1234567.89).map(Value::Number).unwrap(), + }.run(); + + Test { + variant: Variant::Decimal16 { integer: 123456789012345, scale: 4 }, + json: "12345678901.2345", + value: serde_json::Number::from_f64(12345678901.2345).map(Value::Number).unwrap(), + }.run(); + + Test { + variant: Variant::Date(NaiveDate::from_ymd_opt(2023, 1, 1).unwrap()), + json: "\"2023-01-01\"", + value: Value::String("2023-01-01".to_string()), + }.run(); + + Test { + variant: Variant::Binary(b"binary data"), + json: "\"YmluYXJ5IGRhdGE=\"", // base64 encoded "binary data" + value: Value::String("YmluYXJ5IGRhdGE=".to_string()), + }.run(); } #[test] @@ -326,69 +328,104 @@ fn test_string_escaping_edge_cases() { assert!(json_result.starts_with('"') && json_result.ends_with('"')); } -#[test] -fn test_json_roundtrip_compatibility() { - // Test that our JSON output can be parsed back by serde_json - let test_cases = vec![ - Variant::Null, - Variant::BooleanTrue, - Variant::BooleanFalse, - Variant::Int8(0), - Variant::Int8(127), - Variant::Int8(-128), - Variant::String(""), - Variant::String("simple string"), - Variant::String("string with\nnewlines\tand\ttabs"), - Variant::ShortString(parquet_variant::ShortString::try_new("short").unwrap()), - ]; - - for variant in test_cases { - let json_string = variant_to_json_string(&variant).unwrap(); - - // Ensure the JSON can be parsed back - let parsed: Value = serde_json::from_str(&json_string).unwrap(); - - // Ensure our direct Value conversion matches - let direct_value = variant_to_json_value(&variant).unwrap(); - assert_eq!(parsed, direct_value, "Mismatch for variant: {:?}", variant); - } -} - #[test] fn test_buffer_writing() { use parquet_variant::variant_to_json; - use std::io::Write; - let variant = Variant::String("test buffer writing"); - // Test writing to a Vec let mut buffer = Vec::new(); variant_to_json(&mut buffer, &variant).unwrap(); let result = String::from_utf8(buffer).unwrap(); assert_eq!(result, "\"test buffer writing\""); + let mut buffer = vec![]; + variant_to_json(&mut buffer, &variant).unwrap(); + let result = String::from_utf8(buffer).unwrap(); + assert_eq!(result, "\"test buffer writing\""); +} - // Test writing to a custom writer - struct CustomWriter { - data: Vec, - } - - impl Write for CustomWriter { - fn write(&mut self, buf: &[u8]) -> std::io::Result { - self.data.extend_from_slice(buf); - Ok(buf.len()) - } +struct Test { + variant: Variant<'static, 'static>, + json: &'static str, + value: Value, +} - fn flush(&mut self) -> std::io::Result<()> { - Ok(()) - } +impl Test { + fn run(self) { + let json_string = variant_to_json_string(&self.variant).unwrap(); + assert_eq!(json_string, self.json, "JSON string mismatch for variant: {:?}", self.variant); + + let json_value = variant_to_json_value(&self.variant).unwrap(); + assert_eq!(json_value, self.value, "JSON value mismatch for variant: {:?}", self.variant); + let parsed: Value = serde_json::from_str(&json_string).unwrap(); + assert_eq!(parsed, self.value, "Parsed JSON mismatch for variant: {:?}", self.variant); } - - let mut custom_writer = CustomWriter { data: Vec::new() }; - variant_to_json(&mut custom_writer, &variant).unwrap(); - let result = String::from_utf8(custom_writer.data).unwrap(); - assert_eq!(result, "\"test buffer writing\""); } -// Note: Tests for arrays and objects would require actual Variant data structures -// to be created, which would need the builder API to be implemented first. -// These tests demonstrate the primitive functionality that's currently working. +#[test] +fn test_json_roundtrip_compatibility() { + Test { + variant: Variant::Null, + json: "null", + value: Value::Null, + }.run(); + + Test { + variant: Variant::BooleanTrue, + json: "true", + value: Value::Bool(true), + }.run(); + + Test { + variant: Variant::BooleanFalse, + json: "false", + value: Value::Bool(false), + }.run(); + + Test { + variant: Variant::Int8(42), + json: "42", + value: Value::Number(42.into()), + }.run(); + + Test { + variant: Variant::Int8(-128), + json: "-128", + value: Value::Number((-128).into()), + }.run(); + + Test { + variant: Variant::Int16(1000), + json: "1000", + value: Value::Number(1000.into()), + }.run(); + + Test { + variant: Variant::Int32(100000), + json: "100000", + value: Value::Number(100000.into()), + }.run(); + + Test { + variant: Variant::Int64(10000000000), + json: "10000000000", + value: Value::Number(10000000000i64.into()), + }.run(); + + Test { + variant: Variant::String("simple string"), + json: "\"simple string\"", + value: Value::String("simple string".to_string()), + }.run(); + + Test { + variant: Variant::String(""), + json: "\"\"", + value: Value::String("".to_string()), + }.run(); + + Test { + variant: Variant::ShortString(parquet_variant::ShortString::try_new("short").unwrap()), + json: "\"short\"", + value: Value::String("short".to_string()), + }.run(); +} From b4559970b3f38d05c01b43c2820d5661d3a05590 Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Mon, 23 Jun 2025 22:40:31 -0400 Subject: [PATCH 16/32] [FIX] modify to_json to hide lines when rendering as documentation & modify the examples to a single function --- .../examples/variant_to_json_examples.rs | 134 +++++++----------- parquet-variant/src/to_json.rs | 93 +++++++----- 2 files changed, 114 insertions(+), 113 deletions(-) diff --git a/parquet-variant/examples/variant_to_json_examples.rs b/parquet-variant/examples/variant_to_json_examples.rs index fceda22175f3..4249485b8eb6 100644 --- a/parquet-variant/examples/variant_to_json_examples.rs +++ b/parquet-variant/examples/variant_to_json_examples.rs @@ -16,89 +16,63 @@ // under the License. //! Example showing how to convert Variant values to JSON +//! +//! This example demonstrates building a complex Variant +//! and converting it to JSON using the three main conversion functions. -use parquet_variant::{variant_to_json, variant_to_json_string, variant_to_json_value, Variant}; +use parquet_variant::{variant_to_json, variant_to_json_string, variant_to_json_value, VariantBuilder}; fn main() -> Result<(), Box> { - let variants = vec![ - ("Null", Variant::Null), - ("Boolean True", Variant::BooleanTrue), - ("Boolean False", Variant::BooleanFalse), - ("Integer 42", Variant::Int8(42)), - ("Negative Integer", Variant::Int8(-123)), - ("String", Variant::String("Hello, World!")), - ("Short String", Variant::ShortString(parquet_variant::ShortString::try_new("Hi!").unwrap())), - ]; - - for (name, variant) in variants { - let json_string = variant_to_json_string(&variant)?; - println!(" {} -> {}", name, json_string); - } - - // Example 2: String escaping - println!("\nπŸ”€ 2. String Escaping:"); - - let special_string = - Variant::String("Line 1\nLine 2\tTabbed\r\nWith \"quotes\" and \\backslashes"); - let escaped_json = variant_to_json_string(&special_string)?; - println!(" Original: Line 1\\nLine 2\\tTabbed\\r\\nWith \"quotes\" and \\\\backslashes"); - println!(" JSON: {}", escaped_json); - - let unicode_variants = vec![ - Variant::String("Hello δΈ–η•Œ 🌍"), - Variant::String("Emoji: πŸ’»"), - Variant::String("Math: Ξ± + Ξ² = Ξ³"), - ]; - - for variant in unicode_variants { - let json_string = variant_to_json_string(&variant)?; - println!(" {}", json_string); - } - - let test_variant = Variant::String("Buffer test"); - - // Write to Vec - let mut vec_buffer = Vec::new(); - variant_to_json(&mut vec_buffer, &test_variant)?; - println!(" Vec buffer: {}", String::from_utf8(vec_buffer)?); - - // Write to String (through write! macro) - let mut string_buffer = String::new(); - use std::fmt::Write; - write!(string_buffer, "Prefix: ")?; - - // Convert to bytes temporarily to use the Write trait - let mut temp_buffer = Vec::new(); - variant_to_json(&mut temp_buffer, &test_variant)?; - string_buffer.push_str(&String::from_utf8(temp_buffer)?); - println!(" String buffer: {}", string_buffer); - - let variants_for_value = vec![ - Variant::Null, - Variant::BooleanTrue, - Variant::Int8(100), - Variant::String("Value conversion"), - ]; - - for variant in variants_for_value { - let json_value = variant_to_json_value(&variant)?; - println!(" {:?} -> {:?}", variant, json_value); + let mut builder = VariantBuilder::new(); + + { + let mut person = builder.new_object(); + person.append_value("name", "Alice"); + person.append_value("age", 30i32); + person.append_value("email", "alice@example.com"); + person.append_value("is_active", true); + person.append_value("score", 95.7f64); + person.append_value("department", "Engineering"); + person.finish(); } - - let start = std::time::Instant::now(); - for i in 0..1000 { - let variant = Variant::Int8((i % 128) as i8); - let _json = variant_to_json_string(&variant)?; - } - let duration = start.elapsed(); - println!(" Converted 1000 variants in {:?}", duration); - - // This would demonstrate error handling if we had invalid variants - // For now, all our examples work, so we'll just show the pattern - match variant_to_json_string(&Variant::String("Valid string")) { - Ok(json) => println!(" Success: {}", json), - Err(e) => println!(" Error: {}", e), - } - + + let (metadata, value) = builder.finish(); + let variant = parquet_variant::Variant::try_new(&metadata, &value)?; + + // Method 1: Convert to JSON string (most common) + let json_string = variant_to_json_string(&variant)?; + // Method 2: Convert to serde_json::Value (for programmatic use) + println!("πŸ”§ Using variant_to_json_value() -> serde_json::Value:"); + let json_value = variant_to_json_value(&variant)?; + let pretty_json = serde_json::to_string_pretty(&json_value)?; + println!("{}\n", pretty_json); + + // Method 3: Write directly to buffer (for performance) + println!("⚑ Using variant_to_json() -> Write to buffer:"); + let mut buffer = Vec::new(); + variant_to_json(&mut buffer, &variant)?; + let buffer_result = String::from_utf8(buffer)?; + println!("{}\n", buffer_result); + + // Verify all methods produce the same result + assert_eq!(json_string, buffer_result); + assert_eq!(json_string, serde_json::to_string(&json_value)?); + + // Simple string + let simple_string = parquet_variant::Variant::String("Hello, JSON!"); + println!("String: {}", variant_to_json_string(&simple_string)?); + + // Simple number + let simple_number = parquet_variant::Variant::Int32(42); + println!("Number: {}", variant_to_json_string(&simple_number)?); + + // Simple boolean + let simple_bool = parquet_variant::Variant::BooleanTrue; + println!("Boolean: {}", variant_to_json_string(&simple_bool)?); + + // Simple null + let simple_null = parquet_variant::Variant::Null; + println!("Null: {}", variant_to_json_string(&simple_null)?); + Ok(()) } diff --git a/parquet-variant/src/to_json.rs b/parquet-variant/src/to_json.rs index 95d9fab689a8..97f468c45ff6 100644 --- a/parquet-variant/src/to_json.rs +++ b/parquet-variant/src/to_json.rs @@ -41,7 +41,10 @@ fn format_binary_base64(bytes: &[u8]) -> String { general_purpose::STANDARD.encode(bytes) } -/// Converts a Variant to JSON and writes it to the provided buffer +/// Converts a Variant to JSON and writes it to the provided `Write` +/// +/// This function writes JSON directly to any type that implements [`Write`](std::io::Write), +/// making it efficient for streaming or when you want to control the output destination. /// /// # Arguments /// @@ -53,20 +56,26 @@ fn format_binary_base64(bytes: &[u8]) -> String { /// * `Ok(())` if successful /// * `Err` with error details if conversion fails /// -/// # Example +/// # Examples /// /// ```rust -/// use parquet_variant::{Variant, variant_to_json}; -/// use arrow_schema::ArrowError; +/// # use parquet_variant::{Variant, variant_to_json}; +/// # use arrow_schema::ArrowError; +/// let variant = Variant::Int32(42); +/// let mut buffer = Vec::new(); +/// variant_to_json(&mut buffer, &variant)?; +/// assert_eq!(String::from_utf8(buffer).unwrap(), "42"); +/// # Ok::<(), ArrowError>(()) +/// ``` /// -/// fn example() -> Result<(), ArrowError> { -/// let variant = Variant::Int8(42); -/// let mut buffer = Vec::new(); -/// variant_to_json(&mut buffer, &variant)?; -/// assert_eq!(String::from_utf8(buffer).unwrap(), "42"); -/// Ok(()) -/// } -/// example().unwrap(); +/// ```rust +/// # use parquet_variant::{Variant, variant_to_json}; +/// # use arrow_schema::ArrowError; +/// let variant = Variant::String("Hello, World!"); +/// let mut buffer = Vec::new(); +/// variant_to_json(&mut buffer, &variant)?; +/// assert_eq!(String::from_utf8(buffer).unwrap(), "\"Hello, World!\""); +/// # Ok::<(), ArrowError>(()) /// ``` pub fn variant_to_json(json_buffer: &mut impl Write, variant: &Variant) -> Result<(), ArrowError> { match variant { @@ -236,6 +245,9 @@ fn convert_array_to_json(buffer: &mut impl Write, arr: &VariantList) -> Result<( /// Convert Variant to JSON string /// +/// This is a convenience function that converts a Variant to a JSON string. +/// It's the simplest way to get a JSON representation when you just need a String result. +/// /// # Arguments /// /// * `variant` - The Variant value to convert @@ -245,19 +257,24 @@ fn convert_array_to_json(buffer: &mut impl Write, arr: &VariantList) -> Result<( /// * `Ok(String)` containing the JSON representation /// * `Err` with error details if conversion fails /// -/// # Example +/// # Examples /// /// ```rust -/// use parquet_variant::{Variant, variant_to_json_string}; -/// use arrow_schema::ArrowError; +/// # use parquet_variant::{Variant, variant_to_json_string}; +/// # use arrow_schema::ArrowError; +/// let variant = Variant::Int32(42); +/// let json = variant_to_json_string(&variant)?; +/// assert_eq!(json, "42"); +/// # Ok::<(), ArrowError>(()) +/// ``` /// -/// fn example() -> Result<(), ArrowError> { -/// let variant = Variant::String("hello"); -/// let json = variant_to_json_string(&variant)?; -/// assert_eq!(json, "\"hello\""); -/// Ok(()) -/// } -/// example().unwrap(); +/// ```rust +/// # use parquet_variant::{Variant, variant_to_json_string}; +/// # use arrow_schema::ArrowError; +/// let variant = Variant::String("Hello, World!"); +/// let json = variant_to_json_string(&variant)?; +/// assert_eq!(json, "\"Hello, World!\""); +/// # Ok::<(), ArrowError>(()) /// ``` pub fn variant_to_json_string(variant: &Variant) -> Result { let mut buffer = Vec::new(); @@ -268,6 +285,10 @@ pub fn variant_to_json_string(variant: &Variant) -> Result { /// Convert Variant to serde_json::Value /// +/// This function converts a Variant to a [`serde_json::Value`], which is useful +/// when you need to work with the JSON data programmatically or integrate with +/// other serde-based JSON processing. +/// /// # Arguments /// /// * `variant` - The Variant value to convert @@ -277,20 +298,26 @@ pub fn variant_to_json_string(variant: &Variant) -> Result { /// * `Ok(Value)` containing the JSON value /// * `Err` with error details if conversion fails /// -/// # Example +/// # Examples /// /// ```rust -/// use parquet_variant::{Variant, variant_to_json_value}; -/// use serde_json::Value; -/// use arrow_schema::ArrowError; +/// # use parquet_variant::{Variant, variant_to_json_value}; +/// # use serde_json::Value; +/// # use arrow_schema::ArrowError; +/// let variant = Variant::Int32(42); +/// let json_value = variant_to_json_value(&variant)?; +/// assert_eq!(json_value, Value::Number(42.into())); +/// # Ok::<(), ArrowError>(()) +/// ``` /// -/// fn example() -> Result<(), ArrowError> { -/// let variant = Variant::Int8(42); -/// let json_value = variant_to_json_value(&variant)?; -/// assert_eq!(json_value, Value::Number(42.into())); -/// Ok(()) -/// } -/// example().unwrap(); +/// ```rust +/// # use parquet_variant::{Variant, variant_to_json_value}; +/// # use serde_json::Value; +/// # use arrow_schema::ArrowError; +/// let variant = Variant::String("hello"); +/// let json_value = variant_to_json_value(&variant)?; +/// assert_eq!(json_value, Value::String("hello".to_string())); +/// # Ok::<(), ArrowError>(()) /// ``` pub fn variant_to_json_value(variant: &Variant) -> Result { match variant { From 1853fe5b7308e6a0fc7bff53fcd3161257f98a58 Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Mon, 23 Jun 2025 22:40:49 -0400 Subject: [PATCH 17/32] [FIX] modify the examples to a single function --- .../examples/variant_to_json_examples.rs | 27 +------------------ 1 file changed, 1 insertion(+), 26 deletions(-) diff --git a/parquet-variant/examples/variant_to_json_examples.rs b/parquet-variant/examples/variant_to_json_examples.rs index 4249485b8eb6..936b1475d601 100644 --- a/parquet-variant/examples/variant_to_json_examples.rs +++ b/parquet-variant/examples/variant_to_json_examples.rs @@ -16,9 +16,6 @@ // under the License. //! Example showing how to convert Variant values to JSON -//! -//! This example demonstrates building a complex Variant -//! and converting it to JSON using the three main conversion functions. use parquet_variant::{variant_to_json, variant_to_json_string, variant_to_json_value, VariantBuilder}; @@ -39,40 +36,18 @@ fn main() -> Result<(), Box> { let (metadata, value) = builder.finish(); let variant = parquet_variant::Variant::try_new(&metadata, &value)?; - // Method 1: Convert to JSON string (most common) let json_string = variant_to_json_string(&variant)?; - // Method 2: Convert to serde_json::Value (for programmatic use) - println!("πŸ”§ Using variant_to_json_value() -> serde_json::Value:"); let json_value = variant_to_json_value(&variant)?; let pretty_json = serde_json::to_string_pretty(&json_value)?; - println!("{}\n", pretty_json); + println!("{}", pretty_json); - // Method 3: Write directly to buffer (for performance) - println!("⚑ Using variant_to_json() -> Write to buffer:"); let mut buffer = Vec::new(); variant_to_json(&mut buffer, &variant)?; let buffer_result = String::from_utf8(buffer)?; - println!("{}\n", buffer_result); // Verify all methods produce the same result assert_eq!(json_string, buffer_result); assert_eq!(json_string, serde_json::to_string(&json_value)?); - // Simple string - let simple_string = parquet_variant::Variant::String("Hello, JSON!"); - println!("String: {}", variant_to_json_string(&simple_string)?); - - // Simple number - let simple_number = parquet_variant::Variant::Int32(42); - println!("Number: {}", variant_to_json_string(&simple_number)?); - - // Simple boolean - let simple_bool = parquet_variant::Variant::BooleanTrue; - println!("Boolean: {}", variant_to_json_string(&simple_bool)?); - - // Simple null - let simple_null = parquet_variant::Variant::Null; - println!("Null: {}", variant_to_json_string(&simple_null)?); - Ok(()) } From b78b1103028488a0a800ef11434884fb1470744c Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Mon, 23 Jun 2025 22:46:30 -0400 Subject: [PATCH 18/32] [FIX] add an example for the new variant builder --- parquet-variant/src/to_json.rs | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/parquet-variant/src/to_json.rs b/parquet-variant/src/to_json.rs index 97f468c45ff6..518475e13003 100644 --- a/parquet-variant/src/to_json.rs +++ b/parquet-variant/src/to_json.rs @@ -246,6 +246,7 @@ fn convert_array_to_json(buffer: &mut impl Write, arr: &VariantList) -> Result<( /// Convert Variant to JSON string /// /// This is a convenience function that converts a Variant to a JSON string. +/// This is the same as calling variant_to_json with a Vec /// It's the simplest way to get a JSON representation when you just need a String result. /// /// # Arguments @@ -276,6 +277,35 @@ fn convert_array_to_json(buffer: &mut impl Write, arr: &VariantList) -> Result<( /// assert_eq!(json, "\"Hello, World!\""); /// # Ok::<(), ArrowError>(()) /// ``` +/// +/// # Example: Create a [`Variant::Object`] and convert to JSON +/// +/// This example shows how to create an object with two fields and convert it to JSON: +/// ```json +/// { +/// "first_name": "Jiaying", +/// "last_name": "Li" +/// } +/// ``` +/// +/// ```rust +/// # use parquet_variant::{Variant, VariantBuilder, variant_to_json_string}; +/// # use arrow_schema::ArrowError; +/// let mut builder = VariantBuilder::new(); +/// // Create an object builder that will write fields to the object +/// let mut object_builder = builder.new_object(); +/// object_builder.append_value("first_name", "Jiaying"); +/// object_builder.append_value("last_name", "Li"); +/// object_builder.finish(); +/// // Finish the builder to get the metadata and value +/// let (metadata, value) = builder.finish(); +/// // Create the Variant and convert to JSON +/// let variant = Variant::try_new(&metadata, &value)?; +/// let json = variant_to_json_string(&variant)?; +/// assert!(json.contains("\"first_name\":\"Jiaying\"")); +/// assert!(json.contains("\"last_name\":\"Li\"")); +/// # Ok::<(), ArrowError>(()) +/// ``` pub fn variant_to_json_string(variant: &Variant) -> Result { let mut buffer = Vec::new(); variant_to_json(&mut buffer, variant)?; From e7a103d6b372f713b3d4a13aa64d2bb21659433b Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Mon, 23 Jun 2025 22:59:25 -0400 Subject: [PATCH 19/32] [FIX] remove integration_tests and consolidate tests --- parquet-variant/src/to_json.rs | 466 ++++++++++++---------- parquet-variant/tests/integration_test.rs | 431 -------------------- 2 files changed, 259 insertions(+), 638 deletions(-) delete mode 100644 parquet-variant/tests/integration_test.rs diff --git a/parquet-variant/src/to_json.rs b/parquet-variant/src/to_json.rs index 518475e13003..68dcb1e4b3dd 100644 --- a/parquet-variant/src/to_json.rs +++ b/parquet-variant/src/to_json.rs @@ -429,184 +429,23 @@ mod tests { use chrono::{DateTime, NaiveDate, Utc}; #[test] - fn test_null_to_json() -> Result<(), ArrowError> { - let variant = Variant::Null; - let json = variant_to_json_string(&variant)?; - assert_eq!(json, "null"); - - let json_value = variant_to_json_value(&variant)?; - assert_eq!(json_value, Value::Null); - Ok(()) - } - - #[test] - fn test_boolean_to_json() -> Result<(), ArrowError> { - let variant_true = Variant::BooleanTrue; - let json_true = variant_to_json_string(&variant_true)?; - assert_eq!(json_true, "true"); - - let variant_false = Variant::BooleanFalse; - let json_false = variant_to_json_string(&variant_false)?; - assert_eq!(json_false, "false"); - - let json_value_true = variant_to_json_value(&variant_true)?; - assert_eq!(json_value_true, Value::Bool(true)); - - let json_value_false = variant_to_json_value(&variant_false)?; - assert_eq!(json_value_false, Value::Bool(false)); - Ok(()) - } - - #[test] - fn test_int8_to_json() -> Result<(), ArrowError> { - let variant = Variant::Int8(42); - let json = variant_to_json_string(&variant)?; - assert_eq!(json, "42"); - - let json_value = variant_to_json_value(&variant)?; - assert_eq!(json_value, Value::Number(42.into())); - Ok(()) - } - - #[test] - fn test_int16_to_json() -> Result<(), ArrowError> { - let variant = Variant::Int16(32767); - let json = variant_to_json_string(&variant)?; - assert_eq!(json, "32767"); - - let json_value = variant_to_json_value(&variant)?; - assert_eq!(json_value, Value::Number(32767.into())); - - // Test negative value - let negative_variant = Variant::Int16(-32768); - let negative_json = variant_to_json_string(&negative_variant)?; - assert_eq!(negative_json, "-32768"); - Ok(()) - } - - #[test] - fn test_int32_to_json() -> Result<(), ArrowError> { - let variant = Variant::Int32(2147483647); - let json = variant_to_json_string(&variant)?; - assert_eq!(json, "2147483647"); - - let json_value = variant_to_json_value(&variant)?; - assert_eq!(json_value, Value::Number(2147483647.into())); - - // Test negative value - let negative_variant = Variant::Int32(-2147483648); - let negative_json = variant_to_json_string(&negative_variant)?; - assert_eq!(negative_json, "-2147483648"); - Ok(()) - } - - #[test] - fn test_int64_to_json() -> Result<(), ArrowError> { - let variant = Variant::Int64(9223372036854775807); - let json = variant_to_json_string(&variant)?; - assert_eq!(json, "9223372036854775807"); - - let json_value = variant_to_json_value(&variant)?; - assert_eq!(json_value, Value::Number(9223372036854775807i64.into())); - - // Test negative value - let negative_variant = Variant::Int64(-9223372036854775808); - let negative_json = variant_to_json_string(&negative_variant)?; - assert_eq!(negative_json, "-9223372036854775808"); - Ok(()) - } - - #[test] - fn test_float_to_json() -> Result<(), ArrowError> { - let variant = Variant::Float(std::f32::consts::PI); - let json = variant_to_json_string(&variant)?; - assert!(json.starts_with("3.14159")); - - let json_value = variant_to_json_value(&variant)?; - assert!(matches!(json_value, Value::Number(_))); - - // Test zero - let zero_variant = Variant::Float(0.0); - let zero_json = variant_to_json_string(&zero_variant)?; - assert_eq!(zero_json, "0"); - - // Test negative - let negative_variant = Variant::Float(-1.5); - let negative_json = variant_to_json_string(&negative_variant)?; - assert_eq!(negative_json, "-1.5"); - Ok(()) - } - - #[test] - fn test_double_to_json() -> Result<(), ArrowError> { - let variant = Variant::Double(std::f64::consts::E); - let json = variant_to_json_string(&variant)?; - assert!(json.starts_with("2.718281828459045")); - - let json_value = variant_to_json_value(&variant)?; - assert!(matches!(json_value, Value::Number(_))); - - // Test zero - let zero_variant = Variant::Double(0.0); - let zero_json = variant_to_json_string(&zero_variant)?; - assert_eq!(zero_json, "0"); - - // Test negative - let negative_variant = Variant::Double(-2.5); - let negative_json = variant_to_json_string(&negative_variant)?; - assert_eq!(negative_json, "-2.5"); - Ok(()) - } - - #[test] - fn test_decimal4_to_json() -> Result<(), ArrowError> { - let variant = Variant::Decimal4 { - integer: 12345, - scale: 2, - }; - let json = variant_to_json_string(&variant)?; - assert_eq!(json, "123.45"); - - let json_value = variant_to_json_value(&variant)?; - assert!(matches!(json_value, Value::Number(_))); - - // Test zero scale - let no_scale_variant = Variant::Decimal4 { - integer: 42, - scale: 0, - }; - let no_scale_json = variant_to_json_string(&no_scale_variant)?; - assert_eq!(no_scale_json, "42"); - - // Test negative + fn test_decimal_edge_cases() -> Result<(), ArrowError> { + // Test negative decimal let negative_variant = Variant::Decimal4 { integer: -12345, scale: 3, }; let negative_json = variant_to_json_string(&negative_variant)?; assert_eq!(negative_json, "-12.345"); - Ok(()) - } - - #[test] - fn test_decimal8_to_json() -> Result<(), ArrowError> { - let variant = Variant::Decimal8 { - integer: 1234567890, - scale: 3, - }; - let json = variant_to_json_string(&variant)?; - assert_eq!(json, "1234567.89"); - - let json_value = variant_to_json_value(&variant)?; - assert!(matches!(json_value, Value::Number(_))); - - // Test large scale + + // Test large scale decimal let large_scale_variant = Variant::Decimal8 { integer: 123456789, scale: 6, }; let large_scale_json = variant_to_json_string(&large_scale_variant)?; assert_eq!(large_scale_json, "123.456789"); + Ok(()) } @@ -759,50 +598,263 @@ mod tests { Ok(()) } + /// Reusable test structure for JSON conversion testing + struct JsonTest { + variant: Variant<'static, 'static>, + expected_json: &'static str, + expected_value: Value, + } + + impl JsonTest { + fn run(self) { + let json_string = variant_to_json_string(&self.variant) + .expect("variant_to_json_string should succeed"); + assert_eq!(json_string, self.expected_json, + "JSON string mismatch for variant: {:?}", self.variant); + + let json_value = variant_to_json_value(&self.variant) + .expect("variant_to_json_value should succeed"); + + // For floating point numbers, we need special comparison due to JSON number representation + match (&json_value, &self.expected_value) { + (Value::Number(actual), Value::Number(expected)) => { + let actual_f64 = actual.as_f64().unwrap_or(0.0); + let expected_f64 = expected.as_f64().unwrap_or(0.0); + assert!((actual_f64 - expected_f64).abs() < f64::EPSILON, + "JSON value mismatch for variant: {:?}, got {}, expected {}", + self.variant, actual_f64, expected_f64); + } + _ => { + assert_eq!(json_value, self.expected_value, + "JSON value mismatch for variant: {:?}", self.variant); + } + } + + // Verify roundtrip: JSON string should parse to same value + let parsed: Value = serde_json::from_str(&json_string) + .expect("Generated JSON should be valid"); + // Same floating point handling for roundtrip + match (&parsed, &self.expected_value) { + (Value::Number(actual), Value::Number(expected)) => { + let actual_f64 = actual.as_f64().unwrap_or(0.0); + let expected_f64 = expected.as_f64().unwrap_or(0.0); + assert!((actual_f64 - expected_f64).abs() < f64::EPSILON, + "Parsed JSON mismatch for variant: {:?}, got {}, expected {}", + self.variant, actual_f64, expected_f64); + } + _ => { + assert_eq!(parsed, self.expected_value, + "Parsed JSON mismatch for variant: {:?}", self.variant); + } + } + } + } + #[test] - fn test_comprehensive_type_coverage() -> Result<(), ArrowError> { + fn test_primitive_json_conversion() { use crate::variant::ShortString; - // Test all supported types to ensure no compilation errors - let test_variants = vec![ - Variant::Null, - Variant::BooleanTrue, - Variant::BooleanFalse, - Variant::Int8(1), - Variant::Int16(2), - Variant::Int32(3), - Variant::Int64(4), - Variant::Float(5.0), - Variant::Double(6.0), - Variant::Decimal4 { - integer: 7, - scale: 0, - }, - Variant::Decimal8 { - integer: 8, - scale: 0, - }, - Variant::Decimal16 { - integer: 9, - scale: 0, - }, - Variant::Date(NaiveDate::from_ymd_opt(2023, 1, 1).unwrap()), - Variant::TimestampMicros( - DateTime::parse_from_rfc3339("2023-01-01T00:00:00Z") - .unwrap() - .with_timezone(&Utc), - ), - Variant::TimestampNtzMicros(DateTime::from_timestamp(0, 0).unwrap().naive_utc()), - Variant::Binary(b"test"), - Variant::String("test"), - Variant::ShortString(ShortString::try_new("test")?), - ]; - - for variant in test_variants { - // Ensure all types can be converted without panicking - let _json_string = variant_to_json_string(&variant)?; - let _json_value = variant_to_json_value(&variant)?; - } + + // Null + JsonTest { + variant: Variant::Null, + expected_json: "null", + expected_value: Value::Null, + }.run(); + + // Booleans + JsonTest { + variant: Variant::BooleanTrue, + expected_json: "true", + expected_value: Value::Bool(true), + }.run(); + + JsonTest { + variant: Variant::BooleanFalse, + expected_json: "false", + expected_value: Value::Bool(false), + }.run(); + + // Integers - positive and negative edge cases + JsonTest { + variant: Variant::Int8(42), + expected_json: "42", + expected_value: Value::Number(42.into()), + }.run(); + + JsonTest { + variant: Variant::Int8(-128), + expected_json: "-128", + expected_value: Value::Number((-128).into()), + }.run(); + + JsonTest { + variant: Variant::Int16(32767), + expected_json: "32767", + expected_value: Value::Number(32767.into()), + }.run(); + + JsonTest { + variant: Variant::Int16(-32768), + expected_json: "-32768", + expected_value: Value::Number((-32768).into()), + }.run(); + + JsonTest { + variant: Variant::Int32(2147483647), + expected_json: "2147483647", + expected_value: Value::Number(2147483647.into()), + }.run(); + + JsonTest { + variant: Variant::Int32(-2147483648), + expected_json: "-2147483648", + expected_value: Value::Number((-2147483648).into()), + }.run(); + + JsonTest { + variant: Variant::Int64(9223372036854775807), + expected_json: "9223372036854775807", + expected_value: Value::Number(9223372036854775807i64.into()), + }.run(); + + JsonTest { + variant: Variant::Int64(-9223372036854775808), + expected_json: "-9223372036854775808", + expected_value: Value::Number((-9223372036854775808i64).into()), + }.run(); + + // Floats + JsonTest { + variant: Variant::Float(3.5), + expected_json: "3.5", + expected_value: serde_json::Number::from_f64(3.5).map(Value::Number).unwrap(), + }.run(); + + JsonTest { + variant: Variant::Float(0.0), + expected_json: "0", + expected_value: Value::Number(0.into()), // Use integer 0 to match JSON parsing + }.run(); + + JsonTest { + variant: Variant::Float(-1.5), + expected_json: "-1.5", + expected_value: serde_json::Number::from_f64(-1.5).map(Value::Number).unwrap(), + }.run(); + + JsonTest { + variant: Variant::Double(2.718281828459045), + expected_json: "2.718281828459045", + expected_value: serde_json::Number::from_f64(2.718281828459045).map(Value::Number).unwrap(), + }.run(); + + // Decimals + JsonTest { + variant: Variant::Decimal4 { integer: 12345, scale: 2 }, + expected_json: "123.45", + expected_value: serde_json::Number::from_f64(123.45).map(Value::Number).unwrap(), + }.run(); + + JsonTest { + variant: Variant::Decimal4 { integer: 42, scale: 0 }, + expected_json: "42", + expected_value: serde_json::Number::from_f64(42.0).map(Value::Number).unwrap(), + }.run(); + + JsonTest { + variant: Variant::Decimal8 { integer: 1234567890, scale: 3 }, + expected_json: "1234567.89", + expected_value: serde_json::Number::from_f64(1234567.89).map(Value::Number).unwrap(), + }.run(); + + JsonTest { + variant: Variant::Decimal16 { integer: 123456789012345, scale: 4 }, + expected_json: "12345678901.2345", + expected_value: serde_json::Number::from_f64(12345678901.2345).map(Value::Number).unwrap(), + }.run(); + + // Strings + JsonTest { + variant: Variant::String("hello world"), + expected_json: "\"hello world\"", + expected_value: Value::String("hello world".to_string()), + }.run(); + + JsonTest { + variant: Variant::String(""), + expected_json: "\"\"", + expected_value: Value::String("".to_string()), + }.run(); + + JsonTest { + variant: Variant::ShortString(ShortString::try_new("test").unwrap()), + expected_json: "\"test\"", + expected_value: Value::String("test".to_string()), + }.run(); + + // Date and timestamps + JsonTest { + variant: Variant::Date(NaiveDate::from_ymd_opt(2023, 12, 25).unwrap()), + expected_json: "\"2023-12-25\"", + expected_value: Value::String("2023-12-25".to_string()), + }.run(); + + // Binary data (base64 encoded) + JsonTest { + variant: Variant::Binary(b"test"), + expected_json: "\"dGVzdA==\"", // base64 encoded "test" + expected_value: Value::String("dGVzdA==".to_string()), + }.run(); + + JsonTest { + variant: Variant::Binary(b""), + expected_json: "\"\"", // empty base64 + expected_value: Value::String("".to_string()), + }.run(); + + JsonTest { + variant: Variant::Binary(b"binary data"), + expected_json: "\"YmluYXJ5IGRhdGE=\"", // base64 encoded "binary data" + expected_value: Value::String("YmluYXJ5IGRhdGE=".to_string()), + }.run(); + } + #[test] + fn test_string_escaping_comprehensive() { + // Test comprehensive string escaping scenarios + JsonTest { + variant: Variant::String("line1\nline2\ttab\"quote\"\\backslash"), + expected_json: "\"line1\\nline2\\ttab\\\"quote\\\"\\\\backslash\"", + expected_value: Value::String("line1\nline2\ttab\"quote\"\\backslash".to_string()), + }.run(); + + JsonTest { + variant: Variant::String("Hello δΈ–η•Œ 🌍"), + expected_json: "\"Hello δΈ–η•Œ 🌍\"", + expected_value: Value::String("Hello δΈ–η•Œ 🌍".to_string()), + }.run(); + } + + #[test] + fn test_buffer_writing_variants() -> Result<(), ArrowError> { + use crate::variant_to_json; + + let variant = Variant::String("test buffer writing"); + + // Test writing to a Vec + let mut buffer = Vec::new(); + variant_to_json(&mut buffer, &variant)?; + let result = String::from_utf8(buffer) + .map_err(|e| ArrowError::InvalidArgumentError(e.to_string()))?; + assert_eq!(result, "\"test buffer writing\""); + + // Test writing to vec![] + let mut buffer = vec![]; + variant_to_json(&mut buffer, &variant)?; + let result = String::from_utf8(buffer) + .map_err(|e| ArrowError::InvalidArgumentError(e.to_string()))?; + assert_eq!(result, "\"test buffer writing\""); + Ok(()) } diff --git a/parquet-variant/tests/integration_test.rs b/parquet-variant/tests/integration_test.rs deleted file mode 100644 index 581a33b1d3f1..000000000000 --- a/parquet-variant/tests/integration_test.rs +++ /dev/null @@ -1,431 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Integration tests for JSON conversion functionality - -use chrono::{DateTime, NaiveDate, Utc}; -use parquet_variant::{variant_to_json_string, variant_to_json_value, Variant}; -use serde_json::Value; - -#[test] -fn test_primitive_values_to_json() { - // Test null - let null_variant = Variant::Null; - assert_eq!(variant_to_json_string(&null_variant).unwrap(), "null"); - assert_eq!(variant_to_json_value(&null_variant).unwrap(), Value::Null); - - // Test boolean - let bool_true = Variant::BooleanTrue; - let bool_false = Variant::BooleanFalse; - assert_eq!(variant_to_json_string(&bool_true).unwrap(), "true"); - assert_eq!(variant_to_json_string(&bool_false).unwrap(), "false"); - assert_eq!( - variant_to_json_value(&bool_true).unwrap(), - Value::Bool(true) - ); - assert_eq!( - variant_to_json_value(&bool_false).unwrap(), - Value::Bool(false) - ); - - // Test integers - let int8_variant = Variant::Int8(42); - assert_eq!(variant_to_json_string(&int8_variant).unwrap(), "42"); - assert_eq!( - variant_to_json_value(&int8_variant).unwrap(), - Value::Number(42.into()) - ); - - let negative_int8 = Variant::Int8(-123); - assert_eq!(variant_to_json_string(&negative_int8).unwrap(), "-123"); - assert_eq!( - variant_to_json_value(&negative_int8).unwrap(), - Value::Number((-123).into()) - ); - - // Test strings - let string_variant = Variant::String("hello world"); - assert_eq!( - variant_to_json_string(&string_variant).unwrap(), - "\"hello world\"" - ); - assert_eq!( - variant_to_json_value(&string_variant).unwrap(), - Value::String("hello world".to_string()) - ); - - let short_string = Variant::ShortString(parquet_variant::ShortString::try_new("test").unwrap()); - assert_eq!(variant_to_json_string(&short_string).unwrap(), "\"test\""); - assert_eq!( - variant_to_json_value(&short_string).unwrap(), - Value::String("test".to_string()) - ); -} - -#[test] -fn test_integer_types_to_json() { - // Test Int16 - let int16_variant = Variant::Int16(32767); - assert_eq!(variant_to_json_string(&int16_variant).unwrap(), "32767"); - assert_eq!( - variant_to_json_value(&int16_variant).unwrap(), - Value::Number(32767.into()) - ); - - let negative_int16 = Variant::Int16(-32768); - assert_eq!(variant_to_json_string(&negative_int16).unwrap(), "-32768"); - assert_eq!( - variant_to_json_value(&negative_int16).unwrap(), - Value::Number((-32768).into()) - ); - - // Test Int32 - let int32_variant = Variant::Int32(2147483647); - assert_eq!( - variant_to_json_string(&int32_variant).unwrap(), - "2147483647" - ); - assert_eq!( - variant_to_json_value(&int32_variant).unwrap(), - Value::Number(2147483647.into()) - ); - - let negative_int32 = Variant::Int32(-2147483648); - assert_eq!( - variant_to_json_string(&negative_int32).unwrap(), - "-2147483648" - ); - assert_eq!( - variant_to_json_value(&negative_int32).unwrap(), - Value::Number((-2147483648).into()) - ); - - // Test Int64 - let int64_variant = Variant::Int64(9223372036854775807); - assert_eq!( - variant_to_json_string(&int64_variant).unwrap(), - "9223372036854775807" - ); - assert_eq!( - variant_to_json_value(&int64_variant).unwrap(), - Value::Number(9223372036854775807i64.into()) - ); - - let negative_int64 = Variant::Int64(-9223372036854775808); - assert_eq!( - variant_to_json_string(&negative_int64).unwrap(), - "-9223372036854775808" - ); - assert_eq!( - variant_to_json_value(&negative_int64).unwrap(), - Value::Number((-9223372036854775808i64).into()) - ); -} - -#[test] -fn test_floating_point_types_to_json() { - // Test Float (f32) - let float_variant = Variant::Float(std::f32::consts::PI); - let float_json = variant_to_json_string(&float_variant).unwrap(); - assert!(float_json.starts_with("3.14159")); - - let float_value = variant_to_json_value(&float_variant).unwrap(); - assert!(matches!(float_value, Value::Number(_))); - - // Test Double (f64) - let double_variant = Variant::Double(std::f64::consts::E); - let double_json = variant_to_json_string(&double_variant).unwrap(); - assert!(double_json.starts_with("2.718281828459045")); - - let double_value = variant_to_json_value(&double_variant).unwrap(); - assert!(matches!(double_value, Value::Number(_))); - - // Test special float values - let zero_float = Variant::Float(0.0); - assert_eq!(variant_to_json_string(&zero_float).unwrap(), "0"); - - let negative_float = Variant::Float(-1.5); - assert_eq!(variant_to_json_string(&negative_float).unwrap(), "-1.5"); -} - -#[test] -fn test_decimal_types_to_json() { - // Test Decimal4 (i32 with scale) - let decimal4_variant = Variant::Decimal4 { - integer: 12345, - scale: 2, - }; - assert_eq!(variant_to_json_string(&decimal4_variant).unwrap(), "123.45"); - - let decimal4_value = variant_to_json_value(&decimal4_variant).unwrap(); - assert!(matches!(decimal4_value, Value::Number(_))); - - // Test Decimal8 (i64 with scale) - let decimal8_variant = Variant::Decimal8 { - integer: 1234567890, - scale: 3, - }; - assert_eq!( - variant_to_json_string(&decimal8_variant).unwrap(), - "1234567.89" - ); - - let decimal8_value = variant_to_json_value(&decimal8_variant).unwrap(); - assert!(matches!(decimal8_value, Value::Number(_))); - - // Test Decimal16 (i128 with scale) - let decimal16_variant = Variant::Decimal16 { - integer: 123456789012345, - scale: 4, - }; - assert_eq!( - variant_to_json_string(&decimal16_variant).unwrap(), - "12345678901.2345" - ); - - let decimal16_value = variant_to_json_value(&decimal16_variant).unwrap(); - assert!(matches!(decimal16_value, Value::Number(_))); - - // Test zero scale decimal - let no_scale_decimal = Variant::Decimal4 { - integer: 42, - scale: 0, - }; - assert_eq!(variant_to_json_string(&no_scale_decimal).unwrap(), "42"); -} - -#[test] -fn test_date_and_timestamp_types_to_json() { - // Test Date - let date = NaiveDate::from_ymd_opt(2023, 12, 25).unwrap(); - let date_variant = Variant::Date(date); - assert_eq!( - variant_to_json_string(&date_variant).unwrap(), - "\"2023-12-25\"" - ); - assert_eq!( - variant_to_json_value(&date_variant).unwrap(), - Value::String("2023-12-25".to_string()) - ); - - // Test TimestampMicros (UTC) - let timestamp_utc = DateTime::parse_from_rfc3339("2023-12-25T10:30:45Z") - .unwrap() - .with_timezone(&Utc); - let timestamp_variant = Variant::TimestampMicros(timestamp_utc); - let timestamp_json = variant_to_json_string(×tamp_variant).unwrap(); - assert!(timestamp_json.contains("2023-12-25T10:30:45")); - assert!(timestamp_json.starts_with('"') && timestamp_json.ends_with('"')); - - // Test TimestampNtzMicros (naive datetime) - let naive_timestamp = DateTime::from_timestamp(1703505045, 123456) - .unwrap() - .naive_utc(); - let naive_timestamp_variant = Variant::TimestampNtzMicros(naive_timestamp); - let naive_json = variant_to_json_string(&naive_timestamp_variant).unwrap(); - assert!(naive_json.contains("2023-12-25")); - assert!(naive_json.starts_with('"') && naive_json.ends_with('"')); -} - -#[test] -fn test_binary_type_to_json() { - // Test Binary data (encoded as base64) - let binary_data = b"Hello, World!"; - let binary_variant = Variant::Binary(binary_data); - - let binary_json = variant_to_json_string(&binary_variant).unwrap(); - // Should be base64 encoded and quoted - assert!(binary_json.starts_with('"') && binary_json.ends_with('"')); - - let binary_value = variant_to_json_value(&binary_variant).unwrap(); - assert!(matches!(binary_value, Value::String(_))); - - // Test empty binary - let empty_binary = Variant::Binary(b""); - let empty_json = variant_to_json_string(&empty_binary).unwrap(); - assert_eq!(empty_json, "\"\""); // Empty base64 string - - // Test binary with special bytes - let special_binary = Variant::Binary(&[0, 255, 128, 64]); - let special_json = variant_to_json_string(&special_binary).unwrap(); - assert!(special_json.starts_with('"') && special_json.ends_with('"')); - assert!(special_json.len() > 2); // Should have content between quotes -} - -#[test] -fn test_comprehensive_roundtrip_compatibility() { - Test { - variant: Variant::Float(3.5), - json: "3.5", - value: serde_json::Number::from_f64(3.5).map(Value::Number).unwrap(), - }.run(); - - Test { - variant: Variant::Double(2.718281828459045), - json: "2.718281828459045", - value: serde_json::Number::from_f64(2.718281828459045).map(Value::Number).unwrap(), - }.run(); - - Test { - variant: Variant::Decimal4 { integer: 12345, scale: 2 }, - json: "123.45", - value: serde_json::Number::from_f64(123.45).map(Value::Number).unwrap(), - }.run(); - - Test { - variant: Variant::Decimal8 { integer: 1234567890, scale: 3 }, - json: "1234567.89", - value: serde_json::Number::from_f64(1234567.89).map(Value::Number).unwrap(), - }.run(); - - Test { - variant: Variant::Decimal16 { integer: 123456789012345, scale: 4 }, - json: "12345678901.2345", - value: serde_json::Number::from_f64(12345678901.2345).map(Value::Number).unwrap(), - }.run(); - - Test { - variant: Variant::Date(NaiveDate::from_ymd_opt(2023, 1, 1).unwrap()), - json: "\"2023-01-01\"", - value: Value::String("2023-01-01".to_string()), - }.run(); - - Test { - variant: Variant::Binary(b"binary data"), - json: "\"YmluYXJ5IGRhdGE=\"", // base64 encoded "binary data" - value: Value::String("YmluYXJ5IGRhdGE=".to_string()), - }.run(); -} - -#[test] -fn test_string_escaping_edge_cases() { - // Test various escape sequences - let escaped_string = Variant::String("line1\nline2\ttab\"quote\"\\backslash"); - let expected_json = "\"line1\\nline2\\ttab\\\"quote\\\"\\\\backslash\""; - assert_eq!( - variant_to_json_string(&escaped_string).unwrap(), - expected_json - ); - - // Test Unicode characters - let unicode_string = Variant::String("Hello δΈ–η•Œ 🌍"); - let json_result = variant_to_json_string(&unicode_string).unwrap(); - assert!(json_result.contains("Hello δΈ–η•Œ 🌍")); - assert!(json_result.starts_with('"') && json_result.ends_with('"')); -} - -#[test] -fn test_buffer_writing() { - use parquet_variant::variant_to_json; - let variant = Variant::String("test buffer writing"); - // Test writing to a Vec - let mut buffer = Vec::new(); - variant_to_json(&mut buffer, &variant).unwrap(); - let result = String::from_utf8(buffer).unwrap(); - assert_eq!(result, "\"test buffer writing\""); - let mut buffer = vec![]; - variant_to_json(&mut buffer, &variant).unwrap(); - let result = String::from_utf8(buffer).unwrap(); - assert_eq!(result, "\"test buffer writing\""); -} - -struct Test { - variant: Variant<'static, 'static>, - json: &'static str, - value: Value, -} - -impl Test { - fn run(self) { - let json_string = variant_to_json_string(&self.variant).unwrap(); - assert_eq!(json_string, self.json, "JSON string mismatch for variant: {:?}", self.variant); - - let json_value = variant_to_json_value(&self.variant).unwrap(); - assert_eq!(json_value, self.value, "JSON value mismatch for variant: {:?}", self.variant); - let parsed: Value = serde_json::from_str(&json_string).unwrap(); - assert_eq!(parsed, self.value, "Parsed JSON mismatch for variant: {:?}", self.variant); - } -} - -#[test] -fn test_json_roundtrip_compatibility() { - Test { - variant: Variant::Null, - json: "null", - value: Value::Null, - }.run(); - - Test { - variant: Variant::BooleanTrue, - json: "true", - value: Value::Bool(true), - }.run(); - - Test { - variant: Variant::BooleanFalse, - json: "false", - value: Value::Bool(false), - }.run(); - - Test { - variant: Variant::Int8(42), - json: "42", - value: Value::Number(42.into()), - }.run(); - - Test { - variant: Variant::Int8(-128), - json: "-128", - value: Value::Number((-128).into()), - }.run(); - - Test { - variant: Variant::Int16(1000), - json: "1000", - value: Value::Number(1000.into()), - }.run(); - - Test { - variant: Variant::Int32(100000), - json: "100000", - value: Value::Number(100000.into()), - }.run(); - - Test { - variant: Variant::Int64(10000000000), - json: "10000000000", - value: Value::Number(10000000000i64.into()), - }.run(); - - Test { - variant: Variant::String("simple string"), - json: "\"simple string\"", - value: Value::String("simple string".to_string()), - }.run(); - - Test { - variant: Variant::String(""), - json: "\"\"", - value: Value::String("".to_string()), - }.run(); - - Test { - variant: Variant::ShortString(parquet_variant::ShortString::try_new("short").unwrap()), - json: "\"short\"", - value: Value::String("short".to_string()), - }.run(); -} From a7e19d82ea9cd92f13b54f4aaa49f89d5b2a9193 Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Mon, 23 Jun 2025 23:02:30 -0400 Subject: [PATCH 20/32] [FIX] remove dependency on f64 variant_to_json_value conversion --- parquet-variant/src/to_json.rs | 116 +++++++++++++++++++++++++++------ 1 file changed, 95 insertions(+), 21 deletions(-) diff --git a/parquet-variant/src/to_json.rs b/parquet-variant/src/to_json.rs index 68dcb1e4b3dd..58f39390e65a 100644 --- a/parquet-variant/src/to_json.rs +++ b/parquet-variant/src/to_json.rs @@ -365,31 +365,73 @@ pub fn variant_to_json_value(variant: &Variant) -> Result { .map(Value::Number) .ok_or_else(|| ArrowError::InvalidArgumentError("Invalid double value".to_string())), Variant::Decimal4 { integer, scale } => { - let divisor = 10_i32.pow(*scale as u32); - let decimal_value = *integer as f64 / divisor as f64; - serde_json::Number::from_f64(decimal_value) - .map(Value::Number) - .ok_or_else(|| { -ArrowError::InvalidArgumentError("Invalid decimal value".to_string()) - }) + // Use integer arithmetic to avoid f64 precision loss + if *scale == 0 { + Ok(Value::Number((*integer).into())) + } else { + let divisor = 10_i32.pow(*scale as u32); + let quotient = integer / divisor; + let remainder = (integer % divisor).abs(); + let formatted_remainder = format!("{:0width$}", remainder, width = *scale as usize); + let trimmed_remainder = formatted_remainder.trim_end_matches('0'); + + let decimal_str = if trimmed_remainder.is_empty() { + quotient.to_string() + } else { + format!("{}.{}", quotient, trimmed_remainder) + }; + + // Parse as serde_json::Number to preserve precision + decimal_str.parse::() + .map(Value::Number) + .map_err(|e| ArrowError::InvalidArgumentError(format!("Invalid decimal string: {}", e))) + } } Variant::Decimal8 { integer, scale } => { - let divisor = 10_i64.pow(*scale as u32); - let decimal_value = *integer as f64 / divisor as f64; - serde_json::Number::from_f64(decimal_value) - .map(Value::Number) - .ok_or_else(|| { -ArrowError::InvalidArgumentError("Invalid decimal value".to_string()) - }) + // Use integer arithmetic to avoid f64 precision loss + if *scale == 0 { + Ok(Value::Number((*integer).into())) + } else { + let divisor = 10_i64.pow(*scale as u32); + let quotient = integer / divisor; + let remainder = (integer % divisor).abs(); + let formatted_remainder = format!("{:0width$}", remainder, width = *scale as usize); + let trimmed_remainder = formatted_remainder.trim_end_matches('0'); + + let decimal_str = if trimmed_remainder.is_empty() { + quotient.to_string() + } else { + format!("{}.{}", quotient, trimmed_remainder) + }; + + // Parse as serde_json::Number to preserve precision + decimal_str.parse::() + .map(Value::Number) + .map_err(|e| ArrowError::InvalidArgumentError(format!("Invalid decimal string: {}", e))) + } } Variant::Decimal16 { integer, scale } => { - let divisor = 10_i128.pow(*scale as u32); - let decimal_value = *integer as f64 / divisor as f64; - serde_json::Number::from_f64(decimal_value) - .map(Value::Number) - .ok_or_else(|| { -ArrowError::InvalidArgumentError("Invalid decimal value".to_string()) - }) + // Use integer arithmetic to avoid f64 precision loss + if *scale == 0 { + Ok(Value::Number((*integer as i64).into())) // Convert to i64 for JSON compatibility + } else { + let divisor = 10_i128.pow(*scale as u32); + let quotient = integer / divisor; + let remainder = (integer % divisor).abs(); + let formatted_remainder = format!("{:0width$}", remainder, width = *scale as usize); + let trimmed_remainder = formatted_remainder.trim_end_matches('0'); + + let decimal_str = if trimmed_remainder.is_empty() { + quotient.to_string() + } else { + format!("{}.{}", quotient, trimmed_remainder) + }; + + // Parse as serde_json::Number to preserve precision + decimal_str.parse::() + .map(Value::Number) + .map_err(|e| ArrowError::InvalidArgumentError(format!("Invalid decimal string: {}", e))) + } } Variant::Date(date) => Ok(Value::String(format_date_string(date))), Variant::TimestampMicros(ts) => Ok(Value::String(ts.to_rfc3339())), @@ -1156,4 +1198,36 @@ mod tests { Ok(()) } + + #[test] + fn test_high_precision_decimal_no_loss() -> Result<(), ArrowError> { + // Test case that would lose precision with f64 conversion + // This is a 63-bit precision decimal8 value that f64 cannot represent exactly + let high_precision_decimal8 = Variant::Decimal8 { + integer: 9007199254740993, // 2^53 + 1, exceeds f64 precision + scale: 6, + }; + + let json_string = variant_to_json_string(&high_precision_decimal8)?; + let json_value = variant_to_json_value(&high_precision_decimal8)?; + + // Expected result: 9007199254.740993 (exact representation) + assert_eq!(json_string, "9007199254.740993"); + + // Verify that both functions produce consistent results + let parsed: Value = serde_json::from_str(&json_string) + .map_err(|e| ArrowError::ParseError(format!("JSON parse error: {}", e)))?; + assert_eq!(parsed, json_value); + + // Test another case with trailing zeros that should be trimmed + let decimal_with_zeros = Variant::Decimal8 { + integer: 1234567890000, // Should result in 1234567.89 (trailing zeros trimmed) + scale: 6, + }; + + let json_string_zeros = variant_to_json_string(&decimal_with_zeros)?; + assert_eq!(json_string_zeros, "1234567.89"); + + Ok(()) + } } From a4314e2bf6176c462d3524bebe2e8f5577745bfb Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Mon, 23 Jun 2025 23:05:50 -0400 Subject: [PATCH 21/32] [FIX] use the let-else pattern to reduce indentation --- parquet-variant/src/to_json.rs | 102 ++++++++++++++++----------------- 1 file changed, 48 insertions(+), 54 deletions(-) diff --git a/parquet-variant/src/to_json.rs b/parquet-variant/src/to_json.rs index 58f39390e65a..fabf6e402ba1 100644 --- a/parquet-variant/src/to_json.rs +++ b/parquet-variant/src/to_json.rs @@ -923,15 +923,14 @@ mod tests { // Parse the JSON to verify structure - handle JSON parsing errors manually let parsed: Value = serde_json::from_str(&json) .map_err(|e| ArrowError::ParseError(format!("JSON parse error: {}", e)))?; - if let Value::Object(obj) = parsed { - assert_eq!(obj.get("name"), Some(&Value::String("Alice".to_string()))); - assert_eq!(obj.get("age"), Some(&Value::Number(30.into()))); - assert_eq!(obj.get("active"), Some(&Value::Bool(true))); - assert!(matches!(obj.get("score"), Some(Value::Number(_)))); - assert_eq!(obj.len(), 4); - } else { + let Value::Object(obj) = parsed else { panic!("Expected JSON object"); - } + }; + assert_eq!(obj.get("name"), Some(&Value::String("Alice".to_string()))); + assert_eq!(obj.get("age"), Some(&Value::Number(30.into()))); + assert_eq!(obj.get("active"), Some(&Value::Bool(true))); + assert!(matches!(obj.get("score"), Some(Value::Number(_)))); + assert_eq!(obj.len(), 4); // Test variant_to_json_value as well let json_value = variant_to_json_value(&variant)?; @@ -1015,13 +1014,12 @@ mod tests { assert_eq!(json, "[1,2,3,4,5]"); let json_value = variant_to_json_value(&variant)?; - if let Value::Array(arr) = json_value { - assert_eq!(arr.len(), 5); - assert_eq!(arr[0], Value::Number(1.into())); - assert_eq!(arr[4], Value::Number(5.into())); - } else { + let Value::Array(arr) = json_value else { panic!("Expected JSON array"); - } + }; + assert_eq!(arr.len(), 5); + assert_eq!(arr[0], Value::Number(1.into())); + assert_eq!(arr[4], Value::Number(5.into())); Ok(()) } @@ -1070,16 +1068,15 @@ mod tests { let parsed: Value = serde_json::from_str(&json) .map_err(|e| ArrowError::ParseError(format!("JSON parse error: {}", e)))?; - if let Value::Array(arr) = parsed { - assert_eq!(arr.len(), 5); - assert_eq!(arr[0], Value::String("hello".to_string())); - assert_eq!(arr[1], Value::Number(42.into())); - assert_eq!(arr[2], Value::Bool(true)); - assert_eq!(arr[3], Value::Null); - assert!(matches!(arr[4], Value::Number(_))); - } else { + let Value::Array(arr) = parsed else { panic!("Expected JSON array"); - } + }; + assert_eq!(arr.len(), 5); + assert_eq!(arr[0], Value::String("hello".to_string())); + assert_eq!(arr[1], Value::Number(42.into())); + assert_eq!(arr[2], Value::Bool(true)); + assert_eq!(arr[3], Value::Null); + assert!(matches!(arr[4], Value::Number(_))); Ok(()) } @@ -1106,14 +1103,13 @@ mod tests { // Parse and verify all fields are present let parsed: Value = serde_json::from_str(&json) .map_err(|e| ArrowError::ParseError(format!("JSON parse error: {}", e)))?; - if let Value::Object(obj) = parsed { - assert_eq!(obj.len(), 3); - assert_eq!(obj.get("alpha"), Some(&Value::String("first".to_string()))); - assert_eq!(obj.get("beta"), Some(&Value::String("second".to_string()))); - assert_eq!(obj.get("zebra"), Some(&Value::String("last".to_string()))); - } else { + let Value::Object(obj) = parsed else { panic!("Expected JSON object"); - } + }; + assert_eq!(obj.len(), 3); + assert_eq!(obj.get("alpha"), Some(&Value::String("first".to_string()))); + assert_eq!(obj.get("beta"), Some(&Value::String("second".to_string()))); + assert_eq!(obj.get("zebra"), Some(&Value::String("last".to_string()))); Ok(()) } @@ -1142,18 +1138,17 @@ mod tests { let parsed: Value = serde_json::from_str(&json) .map_err(|e| ArrowError::ParseError(format!("JSON parse error: {}", e)))?; - if let Value::Array(arr) = parsed { - assert_eq!(arr.len(), 7); - assert_eq!(arr[0], Value::String("string_value".to_string())); - assert_eq!(arr[1], Value::Number(42.into())); - assert_eq!(arr[2], Value::Bool(true)); - assert!(matches!(arr[3], Value::Number(_))); // float - assert_eq!(arr[4], Value::Bool(false)); - assert_eq!(arr[5], Value::Null); - assert_eq!(arr[6], Value::Number(100.into())); - } else { + let Value::Array(arr) = parsed else { panic!("Expected JSON array"); - } + }; + assert_eq!(arr.len(), 7); + assert_eq!(arr[0], Value::String("string_value".to_string())); + assert_eq!(arr[1], Value::Number(42.into())); + assert_eq!(arr[2], Value::Bool(true)); + assert!(matches!(arr[3], Value::Number(_))); // float + assert_eq!(arr[4], Value::Bool(false)); + assert_eq!(arr[5], Value::Null); + assert_eq!(arr[6], Value::Number(100.into())); Ok(()) } @@ -1181,20 +1176,19 @@ mod tests { let parsed: Value = serde_json::from_str(&json) .map_err(|e| ArrowError::ParseError(format!("JSON parse error: {}", e)))?; - if let Value::Object(obj) = parsed { - assert_eq!(obj.len(), 6); - assert_eq!( - obj.get("string_field"), - Some(&Value::String("test_string".to_string())) - ); - assert_eq!(obj.get("int_field"), Some(&Value::Number(123.into()))); - assert_eq!(obj.get("bool_field"), Some(&Value::Bool(true))); - assert!(matches!(obj.get("float_field"), Some(Value::Number(_)))); - assert_eq!(obj.get("null_field"), Some(&Value::Null)); - assert_eq!(obj.get("long_field"), Some(&Value::Number(999.into()))); - } else { + let Value::Object(obj) = parsed else { panic!("Expected JSON object"); - } + }; + assert_eq!(obj.len(), 6); + assert_eq!( + obj.get("string_field"), + Some(&Value::String("test_string".to_string())) + ); + assert_eq!(obj.get("int_field"), Some(&Value::Number(123.into()))); + assert_eq!(obj.get("bool_field"), Some(&Value::Bool(true))); + assert!(matches!(obj.get("float_field"), Some(Value::Number(_)))); + assert_eq!(obj.get("null_field"), Some(&Value::Null)); + assert_eq!(obj.get("long_field"), Some(&Value::Number(999.into()))); Ok(()) } From c8c810a67ff036c30598f419cd4a0c9fb9549afd Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Mon, 23 Jun 2025 23:07:09 -0400 Subject: [PATCH 22/32] [FIX] fix the formatting issues --- .../examples/variant_to_json_examples.rs | 16 +- parquet-variant/src/lib.rs | 6 +- parquet-variant/src/to_json.rs | 306 +++++++++++------- parquet-variant/src/variant.rs | 1 - 4 files changed, 204 insertions(+), 125 deletions(-) diff --git a/parquet-variant/examples/variant_to_json_examples.rs b/parquet-variant/examples/variant_to_json_examples.rs index 936b1475d601..787a19cb2bef 100644 --- a/parquet-variant/examples/variant_to_json_examples.rs +++ b/parquet-variant/examples/variant_to_json_examples.rs @@ -17,11 +17,13 @@ //! Example showing how to convert Variant values to JSON -use parquet_variant::{variant_to_json, variant_to_json_string, variant_to_json_value, VariantBuilder}; +use parquet_variant::{ + variant_to_json, variant_to_json_string, variant_to_json_value, VariantBuilder, +}; fn main() -> Result<(), Box> { let mut builder = VariantBuilder::new(); - + { let mut person = builder.new_object(); person.append_value("name", "Alice"); @@ -32,22 +34,22 @@ fn main() -> Result<(), Box> { person.append_value("department", "Engineering"); person.finish(); } - + let (metadata, value) = builder.finish(); let variant = parquet_variant::Variant::try_new(&metadata, &value)?; - + let json_string = variant_to_json_string(&variant)?; let json_value = variant_to_json_value(&variant)?; let pretty_json = serde_json::to_string_pretty(&json_value)?; println!("{}", pretty_json); - + let mut buffer = Vec::new(); variant_to_json(&mut buffer, &variant)?; let buffer_result = String::from_utf8(buffer)?; - + // Verify all methods produce the same result assert_eq!(json_string, buffer_result); assert_eq!(json_string, serde_json::to_string(&json_value)?); - + Ok(()) } diff --git a/parquet-variant/src/lib.rs b/parquet-variant/src/lib.rs index cdb59689a838..8ce3008655d4 100644 --- a/parquet-variant/src/lib.rs +++ b/parquet-variant/src/lib.rs @@ -33,12 +33,10 @@ mod decoder; mod variant; // TODO: dead code removal mod builder; +mod to_json; #[allow(dead_code)] mod utils; -mod to_json; pub use builder::*; -pub use to_json::{ - variant_to_json, variant_to_json_string, variant_to_json_value, -}; +pub use to_json::{variant_to_json, variant_to_json_string, variant_to_json_value}; pub use variant::*; diff --git a/parquet-variant/src/to_json.rs b/parquet-variant/src/to_json.rs index fabf6e402ba1..bc964dec0fe3 100644 --- a/parquet-variant/src/to_json.rs +++ b/parquet-variant/src/to_json.rs @@ -279,7 +279,7 @@ fn convert_array_to_json(buffer: &mut impl Write, arr: &VariantList) -> Result<( /// ``` /// /// # Example: Create a [`Variant::Object`] and convert to JSON -/// +/// /// This example shows how to create an object with two fields and convert it to JSON: /// ```json /// { @@ -287,7 +287,7 @@ fn convert_array_to_json(buffer: &mut impl Write, arr: &VariantList) -> Result<( /// "last_name": "Li" /// } /// ``` -/// +/// /// ```rust /// # use parquet_variant::{Variant, VariantBuilder, variant_to_json_string}; /// # use arrow_schema::ArrowError; @@ -374,17 +374,20 @@ pub fn variant_to_json_value(variant: &Variant) -> Result { let remainder = (integer % divisor).abs(); let formatted_remainder = format!("{:0width$}", remainder, width = *scale as usize); let trimmed_remainder = formatted_remainder.trim_end_matches('0'); - + let decimal_str = if trimmed_remainder.is_empty() { quotient.to_string() } else { format!("{}.{}", quotient, trimmed_remainder) }; - + // Parse as serde_json::Number to preserve precision - decimal_str.parse::() + decimal_str + .parse::() .map(Value::Number) - .map_err(|e| ArrowError::InvalidArgumentError(format!("Invalid decimal string: {}", e))) + .map_err(|e| { + ArrowError::InvalidArgumentError(format!("Invalid decimal string: {}", e)) + }) } } Variant::Decimal8 { integer, scale } => { @@ -397,17 +400,20 @@ pub fn variant_to_json_value(variant: &Variant) -> Result { let remainder = (integer % divisor).abs(); let formatted_remainder = format!("{:0width$}", remainder, width = *scale as usize); let trimmed_remainder = formatted_remainder.trim_end_matches('0'); - + let decimal_str = if trimmed_remainder.is_empty() { quotient.to_string() } else { format!("{}.{}", quotient, trimmed_remainder) }; - + // Parse as serde_json::Number to preserve precision - decimal_str.parse::() + decimal_str + .parse::() .map(Value::Number) - .map_err(|e| ArrowError::InvalidArgumentError(format!("Invalid decimal string: {}", e))) + .map_err(|e| { + ArrowError::InvalidArgumentError(format!("Invalid decimal string: {}", e)) + }) } } Variant::Decimal16 { integer, scale } => { @@ -420,17 +426,20 @@ pub fn variant_to_json_value(variant: &Variant) -> Result { let remainder = (integer % divisor).abs(); let formatted_remainder = format!("{:0width$}", remainder, width = *scale as usize); let trimmed_remainder = formatted_remainder.trim_end_matches('0'); - + let decimal_str = if trimmed_remainder.is_empty() { quotient.to_string() } else { format!("{}.{}", quotient, trimmed_remainder) }; - + // Parse as serde_json::Number to preserve precision - decimal_str.parse::() + decimal_str + .parse::() .map(Value::Number) - .map_err(|e| ArrowError::InvalidArgumentError(format!("Invalid decimal string: {}", e))) + .map_err(|e| { + ArrowError::InvalidArgumentError(format!("Invalid decimal string: {}", e)) + }) } } Variant::Date(date) => Ok(Value::String(format_date_string(date))), @@ -479,15 +488,15 @@ mod tests { }; let negative_json = variant_to_json_string(&negative_variant)?; assert_eq!(negative_json, "-12.345"); - - // Test large scale decimal + + // Test large scale decimal let large_scale_variant = Variant::Decimal8 { integer: 123456789, scale: 6, }; let large_scale_json = variant_to_json_string(&large_scale_variant)?; assert_eq!(large_scale_json, "123.456789"); - + Ok(()) } @@ -651,42 +660,59 @@ mod tests { fn run(self) { let json_string = variant_to_json_string(&self.variant) .expect("variant_to_json_string should succeed"); - assert_eq!(json_string, self.expected_json, - "JSON string mismatch for variant: {:?}", self.variant); - - let json_value = variant_to_json_value(&self.variant) - .expect("variant_to_json_value should succeed"); - + assert_eq!( + json_string, self.expected_json, + "JSON string mismatch for variant: {:?}", + self.variant + ); + + let json_value = + variant_to_json_value(&self.variant).expect("variant_to_json_value should succeed"); + // For floating point numbers, we need special comparison due to JSON number representation match (&json_value, &self.expected_value) { (Value::Number(actual), Value::Number(expected)) => { let actual_f64 = actual.as_f64().unwrap_or(0.0); let expected_f64 = expected.as_f64().unwrap_or(0.0); - assert!((actual_f64 - expected_f64).abs() < f64::EPSILON, - "JSON value mismatch for variant: {:?}, got {}, expected {}", - self.variant, actual_f64, expected_f64); + assert!( + (actual_f64 - expected_f64).abs() < f64::EPSILON, + "JSON value mismatch for variant: {:?}, got {}, expected {}", + self.variant, + actual_f64, + expected_f64 + ); } _ => { - assert_eq!(json_value, self.expected_value, - "JSON value mismatch for variant: {:?}", self.variant); + assert_eq!( + json_value, self.expected_value, + "JSON value mismatch for variant: {:?}", + self.variant + ); } } - + // Verify roundtrip: JSON string should parse to same value - let parsed: Value = serde_json::from_str(&json_string) - .expect("Generated JSON should be valid"); + let parsed: Value = + serde_json::from_str(&json_string).expect("Generated JSON should be valid"); // Same floating point handling for roundtrip match (&parsed, &self.expected_value) { (Value::Number(actual), Value::Number(expected)) => { let actual_f64 = actual.as_f64().unwrap_or(0.0); let expected_f64 = expected.as_f64().unwrap_or(0.0); - assert!((actual_f64 - expected_f64).abs() < f64::EPSILON, - "Parsed JSON mismatch for variant: {:?}, got {}, expected {}", - self.variant, actual_f64, expected_f64); + assert!( + (actual_f64 - expected_f64).abs() < f64::EPSILON, + "Parsed JSON mismatch for variant: {:?}, got {}, expected {}", + self.variant, + actual_f64, + expected_f64 + ); } _ => { - assert_eq!(parsed, self.expected_value, - "Parsed JSON mismatch for variant: {:?}", self.variant); + assert_eq!( + parsed, self.expected_value, + "Parsed JSON mismatch for variant: {:?}", + self.variant + ); } } } @@ -695,170 +721,222 @@ mod tests { #[test] fn test_primitive_json_conversion() { use crate::variant::ShortString; - + // Null JsonTest { variant: Variant::Null, expected_json: "null", expected_value: Value::Null, - }.run(); - + } + .run(); + // Booleans JsonTest { variant: Variant::BooleanTrue, - expected_json: "true", + expected_json: "true", expected_value: Value::Bool(true), - }.run(); - + } + .run(); + JsonTest { variant: Variant::BooleanFalse, expected_json: "false", expected_value: Value::Bool(false), - }.run(); - + } + .run(); + // Integers - positive and negative edge cases JsonTest { variant: Variant::Int8(42), expected_json: "42", expected_value: Value::Number(42.into()), - }.run(); - + } + .run(); + JsonTest { variant: Variant::Int8(-128), expected_json: "-128", expected_value: Value::Number((-128).into()), - }.run(); - + } + .run(); + JsonTest { variant: Variant::Int16(32767), expected_json: "32767", expected_value: Value::Number(32767.into()), - }.run(); - + } + .run(); + JsonTest { variant: Variant::Int16(-32768), expected_json: "-32768", expected_value: Value::Number((-32768).into()), - }.run(); - + } + .run(); + JsonTest { variant: Variant::Int32(2147483647), expected_json: "2147483647", expected_value: Value::Number(2147483647.into()), - }.run(); - + } + .run(); + JsonTest { variant: Variant::Int32(-2147483648), expected_json: "-2147483648", expected_value: Value::Number((-2147483648).into()), - }.run(); - + } + .run(); + JsonTest { variant: Variant::Int64(9223372036854775807), expected_json: "9223372036854775807", expected_value: Value::Number(9223372036854775807i64.into()), - }.run(); - + } + .run(); + JsonTest { variant: Variant::Int64(-9223372036854775808), expected_json: "-9223372036854775808", expected_value: Value::Number((-9223372036854775808i64).into()), - }.run(); - + } + .run(); + // Floats JsonTest { variant: Variant::Float(3.5), expected_json: "3.5", - expected_value: serde_json::Number::from_f64(3.5).map(Value::Number).unwrap(), - }.run(); - + expected_value: serde_json::Number::from_f64(3.5) + .map(Value::Number) + .unwrap(), + } + .run(); + JsonTest { variant: Variant::Float(0.0), expected_json: "0", expected_value: Value::Number(0.into()), // Use integer 0 to match JSON parsing - }.run(); - + } + .run(); + JsonTest { variant: Variant::Float(-1.5), expected_json: "-1.5", - expected_value: serde_json::Number::from_f64(-1.5).map(Value::Number).unwrap(), - }.run(); - + expected_value: serde_json::Number::from_f64(-1.5) + .map(Value::Number) + .unwrap(), + } + .run(); + JsonTest { variant: Variant::Double(2.718281828459045), expected_json: "2.718281828459045", - expected_value: serde_json::Number::from_f64(2.718281828459045).map(Value::Number).unwrap(), - }.run(); - + expected_value: serde_json::Number::from_f64(2.718281828459045) + .map(Value::Number) + .unwrap(), + } + .run(); + // Decimals JsonTest { - variant: Variant::Decimal4 { integer: 12345, scale: 2 }, + variant: Variant::Decimal4 { + integer: 12345, + scale: 2, + }, expected_json: "123.45", - expected_value: serde_json::Number::from_f64(123.45).map(Value::Number).unwrap(), - }.run(); - + expected_value: serde_json::Number::from_f64(123.45) + .map(Value::Number) + .unwrap(), + } + .run(); + JsonTest { - variant: Variant::Decimal4 { integer: 42, scale: 0 }, + variant: Variant::Decimal4 { + integer: 42, + scale: 0, + }, expected_json: "42", - expected_value: serde_json::Number::from_f64(42.0).map(Value::Number).unwrap(), - }.run(); - + expected_value: serde_json::Number::from_f64(42.0) + .map(Value::Number) + .unwrap(), + } + .run(); + JsonTest { - variant: Variant::Decimal8 { integer: 1234567890, scale: 3 }, + variant: Variant::Decimal8 { + integer: 1234567890, + scale: 3, + }, expected_json: "1234567.89", - expected_value: serde_json::Number::from_f64(1234567.89).map(Value::Number).unwrap(), - }.run(); - + expected_value: serde_json::Number::from_f64(1234567.89) + .map(Value::Number) + .unwrap(), + } + .run(); + JsonTest { - variant: Variant::Decimal16 { integer: 123456789012345, scale: 4 }, + variant: Variant::Decimal16 { + integer: 123456789012345, + scale: 4, + }, expected_json: "12345678901.2345", - expected_value: serde_json::Number::from_f64(12345678901.2345).map(Value::Number).unwrap(), - }.run(); - + expected_value: serde_json::Number::from_f64(12345678901.2345) + .map(Value::Number) + .unwrap(), + } + .run(); + // Strings JsonTest { variant: Variant::String("hello world"), expected_json: "\"hello world\"", expected_value: Value::String("hello world".to_string()), - }.run(); - + } + .run(); + JsonTest { variant: Variant::String(""), expected_json: "\"\"", expected_value: Value::String("".to_string()), - }.run(); - + } + .run(); + JsonTest { variant: Variant::ShortString(ShortString::try_new("test").unwrap()), expected_json: "\"test\"", expected_value: Value::String("test".to_string()), - }.run(); - + } + .run(); + // Date and timestamps JsonTest { variant: Variant::Date(NaiveDate::from_ymd_opt(2023, 12, 25).unwrap()), expected_json: "\"2023-12-25\"", expected_value: Value::String("2023-12-25".to_string()), - }.run(); - + } + .run(); + // Binary data (base64 encoded) JsonTest { variant: Variant::Binary(b"test"), expected_json: "\"dGVzdA==\"", // base64 encoded "test" expected_value: Value::String("dGVzdA==".to_string()), - }.run(); - + } + .run(); + JsonTest { variant: Variant::Binary(b""), expected_json: "\"\"", // empty base64 expected_value: Value::String("".to_string()), - }.run(); - + } + .run(); + JsonTest { variant: Variant::Binary(b"binary data"), expected_json: "\"YmluYXJ5IGRhdGE=\"", // base64 encoded "binary data" expected_value: Value::String("YmluYXJ5IGRhdGE=".to_string()), - }.run(); + } + .run(); } #[test] @@ -868,35 +946,37 @@ mod tests { variant: Variant::String("line1\nline2\ttab\"quote\"\\backslash"), expected_json: "\"line1\\nline2\\ttab\\\"quote\\\"\\\\backslash\"", expected_value: Value::String("line1\nline2\ttab\"quote\"\\backslash".to_string()), - }.run(); - + } + .run(); + JsonTest { variant: Variant::String("Hello δΈ–η•Œ 🌍"), expected_json: "\"Hello δΈ–η•Œ 🌍\"", expected_value: Value::String("Hello δΈ–η•Œ 🌍".to_string()), - }.run(); + } + .run(); } #[test] fn test_buffer_writing_variants() -> Result<(), ArrowError> { use crate::variant_to_json; - + let variant = Variant::String("test buffer writing"); - + // Test writing to a Vec let mut buffer = Vec::new(); variant_to_json(&mut buffer, &variant)?; let result = String::from_utf8(buffer) .map_err(|e| ArrowError::InvalidArgumentError(e.to_string()))?; assert_eq!(result, "\"test buffer writing\""); - + // Test writing to vec![] let mut buffer = vec![]; variant_to_json(&mut buffer, &variant)?; let result = String::from_utf8(buffer) .map_err(|e| ArrowError::InvalidArgumentError(e.to_string()))?; assert_eq!(result, "\"test buffer writing\""); - + Ok(()) } @@ -1201,27 +1281,27 @@ mod tests { integer: 9007199254740993, // 2^53 + 1, exceeds f64 precision scale: 6, }; - + let json_string = variant_to_json_string(&high_precision_decimal8)?; let json_value = variant_to_json_value(&high_precision_decimal8)?; - + // Expected result: 9007199254.740993 (exact representation) assert_eq!(json_string, "9007199254.740993"); - + // Verify that both functions produce consistent results let parsed: Value = serde_json::from_str(&json_string) .map_err(|e| ArrowError::ParseError(format!("JSON parse error: {}", e)))?; assert_eq!(parsed, json_value); - + // Test another case with trailing zeros that should be trimmed let decimal_with_zeros = Variant::Decimal8 { integer: 1234567890000, // Should result in 1234567.89 (trailing zeros trimmed) scale: 6, }; - + let json_string_zeros = variant_to_json_string(&decimal_with_zeros)?; assert_eq!(json_string_zeros, "1234567.89"); - + Ok(()) } } diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 27d867d71a05..923faf8431c5 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -33,7 +33,6 @@ mod object; const MAX_SHORT_STRING_BYTES: usize = 0x3F; - /// Represents a variant array. /// /// This implementation is a zero cost wrapper over `&str` that ensures From 3ac00ca1a42e25f971265f806ef2f519140f568b Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Tue, 24 Jun 2025 08:09:51 -0400 Subject: [PATCH 23/32] [FIX] fix all clippy and doc tests errors --- parquet-variant/src/to_json.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/parquet-variant/src/to_json.rs b/parquet-variant/src/to_json.rs index bc964dec0fe3..0cdcb8b49e63 100644 --- a/parquet-variant/src/to_json.rs +++ b/parquet-variant/src/to_json.rs @@ -43,7 +43,7 @@ fn format_binary_base64(bytes: &[u8]) -> String { /// Converts a Variant to JSON and writes it to the provided `Write` /// -/// This function writes JSON directly to any type that implements [`Write`](std::io::Write), +/// This function writes JSON directly to any type that implements [`Write`], /// making it efficient for streaming or when you want to control the output destination. /// /// # Arguments @@ -829,9 +829,9 @@ mod tests { .run(); JsonTest { - variant: Variant::Double(2.718281828459045), + variant: Variant::Double(std::f64::consts::E), expected_json: "2.718281828459045", - expected_value: serde_json::Number::from_f64(2.718281828459045) + expected_value: serde_json::Number::from_f64(std::f64::consts::E) .map(Value::Number) .unwrap(), } From b49c26facb62bdc0a59ef67a18706b928dd071bf Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 24 Jun 2025 14:41:43 -0400 Subject: [PATCH 24/32] Fix logical conflict with #7738 --- parquet-variant/src/to_json.rs | 69 ++++++++++++---------------------- 1 file changed, 23 insertions(+), 46 deletions(-) diff --git a/parquet-variant/src/to_json.rs b/parquet-variant/src/to_json.rs index 0cdcb8b49e63..82a677206a13 100644 --- a/parquet-variant/src/to_json.rs +++ b/parquet-variant/src/to_json.rs @@ -23,6 +23,7 @@ use serde_json::Value; use std::io::Write; use crate::variant::{Variant, VariantList, VariantObject}; +use crate::{VariantDecimal16, VariantDecimal4, VariantDecimal8}; // Format string constants to avoid duplication and reduce errors const DATE_FORMAT: &str = "%Y-%m-%d"; @@ -106,7 +107,7 @@ pub fn variant_to_json(json_buffer: &mut impl Write, variant: &Variant) -> Resul Variant::Double(f) => { write!(json_buffer, "{}", f)?; } - Variant::Decimal4 { integer, scale } => { + Variant::Decimal4(VariantDecimal4 { integer, scale }) => { // Convert decimal to string representation using integer arithmetic if *scale == 0 { write!(json_buffer, "{}", integer)?; @@ -123,7 +124,7 @@ pub fn variant_to_json(json_buffer: &mut impl Write, variant: &Variant) -> Resul } } } - Variant::Decimal8 { integer, scale } => { + Variant::Decimal8(VariantDecimal8 { integer, scale }) => { // Convert decimal to string representation using integer arithmetic if *scale == 0 { write!(json_buffer, "{}", integer)?; @@ -140,7 +141,7 @@ pub fn variant_to_json(json_buffer: &mut impl Write, variant: &Variant) -> Resul } } } - Variant::Decimal16 { integer, scale } => { + Variant::Decimal16(VariantDecimal16 { integer, scale }) => { // Convert decimal to string representation using integer arithmetic if *scale == 0 { write!(json_buffer, "{}", integer)?; @@ -364,7 +365,7 @@ pub fn variant_to_json_value(variant: &Variant) -> Result { Variant::Double(f) => serde_json::Number::from_f64(*f) .map(Value::Number) .ok_or_else(|| ArrowError::InvalidArgumentError("Invalid double value".to_string())), - Variant::Decimal4 { integer, scale } => { + Variant::Decimal4(VariantDecimal4 { integer, scale }) => { // Use integer arithmetic to avoid f64 precision loss if *scale == 0 { Ok(Value::Number((*integer).into())) @@ -390,7 +391,7 @@ pub fn variant_to_json_value(variant: &Variant) -> Result { }) } } - Variant::Decimal8 { integer, scale } => { + Variant::Decimal8(VariantDecimal8 { integer, scale }) => { // Use integer arithmetic to avoid f64 precision loss if *scale == 0 { Ok(Value::Number((*integer).into())) @@ -416,7 +417,7 @@ pub fn variant_to_json_value(variant: &Variant) -> Result { }) } } - Variant::Decimal16 { integer, scale } => { + Variant::Decimal16(VariantDecimal16 { integer, scale }) => { // Use integer arithmetic to avoid f64 precision loss if *scale == 0 { Ok(Value::Number((*integer as i64).into())) // Convert to i64 for JSON compatibility @@ -482,18 +483,12 @@ mod tests { #[test] fn test_decimal_edge_cases() -> Result<(), ArrowError> { // Test negative decimal - let negative_variant = Variant::Decimal4 { - integer: -12345, - scale: 3, - }; + let negative_variant = Variant::from(VariantDecimal4::try_new(-12345, 3)?); let negative_json = variant_to_json_string(&negative_variant)?; assert_eq!(negative_json, "-12.345"); // Test large scale decimal - let large_scale_variant = Variant::Decimal8 { - integer: 123456789, - scale: 6, - }; + let large_scale_variant = Variant::from(VariantDecimal8::try_new(123456789, 6)?); let large_scale_json = variant_to_json_string(&large_scale_variant)?; assert_eq!(large_scale_json, "123.456789"); @@ -502,10 +497,7 @@ mod tests { #[test] fn test_decimal16_to_json() -> Result<(), ArrowError> { - let variant = Variant::Decimal16 { - integer: 123456789012345, - scale: 4, - }; + let variant = Variant::from(VariantDecimal16::try_new(123456789012345, 4)?); let json = variant_to_json_string(&variant)?; assert_eq!(json, "12345678901.2345"); @@ -513,10 +505,7 @@ mod tests { assert!(matches!(json_value, Value::Number(_))); // Test very large number - let large_variant = Variant::Decimal16 { - integer: 999999999999999999, - scale: 2, - }; + let large_variant = Variant::from(VariantDecimal16::try_new(999999999999999999, 2)?); let large_json = variant_to_json_string(&large_variant)?; // Due to f64 precision limits, very large numbers may lose precision assert!( @@ -839,10 +828,7 @@ mod tests { // Decimals JsonTest { - variant: Variant::Decimal4 { - integer: 12345, - scale: 2, - }, + variant: Variant::from(VariantDecimal4::try_new(12345, 2).unwrap()), expected_json: "123.45", expected_value: serde_json::Number::from_f64(123.45) .map(Value::Number) @@ -851,10 +837,7 @@ mod tests { .run(); JsonTest { - variant: Variant::Decimal4 { - integer: 42, - scale: 0, - }, + variant: Variant::from(VariantDecimal4::try_new(42, 0).unwrap()), expected_json: "42", expected_value: serde_json::Number::from_f64(42.0) .map(Value::Number) @@ -863,10 +846,7 @@ mod tests { .run(); JsonTest { - variant: Variant::Decimal8 { - integer: 1234567890, - scale: 3, - }, + variant: Variant::from(VariantDecimal8::try_new(1234567890, 3).unwrap()), expected_json: "1234567.89", expected_value: serde_json::Number::from_f64(1234567.89) .map(Value::Number) @@ -875,10 +855,7 @@ mod tests { .run(); JsonTest { - variant: Variant::Decimal16 { - integer: 123456789012345, - scale: 4, - }, + variant: Variant::from(VariantDecimal16::try_new(123456789012345, 4).unwrap()), expected_json: "12345678901.2345", expected_value: serde_json::Number::from_f64(12345678901.2345) .map(Value::Number) @@ -1277,10 +1254,10 @@ mod tests { fn test_high_precision_decimal_no_loss() -> Result<(), ArrowError> { // Test case that would lose precision with f64 conversion // This is a 63-bit precision decimal8 value that f64 cannot represent exactly - let high_precision_decimal8 = Variant::Decimal8 { - integer: 9007199254740993, // 2^53 + 1, exceeds f64 precision - scale: 6, - }; + let high_precision_decimal8 = Variant::from(VariantDecimal8::try_new( + 9007199254740993, // 2^53 + 1, exceeds f64 precision + 6, + )?); let json_string = variant_to_json_string(&high_precision_decimal8)?; let json_value = variant_to_json_value(&high_precision_decimal8)?; @@ -1294,10 +1271,10 @@ mod tests { assert_eq!(parsed, json_value); // Test another case with trailing zeros that should be trimmed - let decimal_with_zeros = Variant::Decimal8 { - integer: 1234567890000, // Should result in 1234567.89 (trailing zeros trimmed) - scale: 6, - }; + let decimal_with_zeros = Variant::from(VariantDecimal8::try_new( + 1234567890000, // Should result in 1234567.89 (trailing zeros trimmed) + 6, + )?); let json_string_zeros = variant_to_json_string(&decimal_with_zeros)?; assert_eq!(json_string_zeros, "1234567.89"); From a47ccfc55cbb9ebd9672b091b9b0bb0701c49f72 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 24 Jun 2025 14:45:28 -0400 Subject: [PATCH 25/32] Less explicit panics --- parquet-variant/src/to_json.rs | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/parquet-variant/src/to_json.rs b/parquet-variant/src/to_json.rs index 82a677206a13..80759b80a5c8 100644 --- a/parquet-variant/src/to_json.rs +++ b/parquet-variant/src/to_json.rs @@ -980,9 +980,7 @@ mod tests { // Parse the JSON to verify structure - handle JSON parsing errors manually let parsed: Value = serde_json::from_str(&json) .map_err(|e| ArrowError::ParseError(format!("JSON parse error: {}", e)))?; - let Value::Object(obj) = parsed else { - panic!("Expected JSON object"); - }; + let obj = parsed.as_object().expect("expected JSON object"); assert_eq!(obj.get("name"), Some(&Value::String("Alice".to_string()))); assert_eq!(obj.get("age"), Some(&Value::Number(30.into()))); assert_eq!(obj.get("active"), Some(&Value::Bool(true))); @@ -1071,9 +1069,7 @@ mod tests { assert_eq!(json, "[1,2,3,4,5]"); let json_value = variant_to_json_value(&variant)?; - let Value::Array(arr) = json_value else { - panic!("Expected JSON array"); - }; + let arr = json_value.as_array().expect("expected JSON array"); assert_eq!(arr.len(), 5); assert_eq!(arr[0], Value::Number(1.into())); assert_eq!(arr[4], Value::Number(5.into())); @@ -1125,9 +1121,7 @@ mod tests { let parsed: Value = serde_json::from_str(&json) .map_err(|e| ArrowError::ParseError(format!("JSON parse error: {}", e)))?; - let Value::Array(arr) = parsed else { - panic!("Expected JSON array"); - }; + let arr = parsed.as_array().expect("expected JSON array"); assert_eq!(arr.len(), 5); assert_eq!(arr[0], Value::String("hello".to_string())); assert_eq!(arr[1], Value::Number(42.into())); @@ -1160,9 +1154,7 @@ mod tests { // Parse and verify all fields are present let parsed: Value = serde_json::from_str(&json) .map_err(|e| ArrowError::ParseError(format!("JSON parse error: {}", e)))?; - let Value::Object(obj) = parsed else { - panic!("Expected JSON object"); - }; + let obj = parsed.as_object().expect("expected JSON object"); assert_eq!(obj.len(), 3); assert_eq!(obj.get("alpha"), Some(&Value::String("first".to_string()))); assert_eq!(obj.get("beta"), Some(&Value::String("second".to_string()))); @@ -1195,9 +1187,7 @@ mod tests { let parsed: Value = serde_json::from_str(&json) .map_err(|e| ArrowError::ParseError(format!("JSON parse error: {}", e)))?; - let Value::Array(arr) = parsed else { - panic!("Expected JSON array"); - }; + let arr = parsed.as_array().expect("expected JSON array"); assert_eq!(arr.len(), 7); assert_eq!(arr[0], Value::String("string_value".to_string())); assert_eq!(arr[1], Value::Number(42.into())); @@ -1233,9 +1223,7 @@ mod tests { let parsed: Value = serde_json::from_str(&json) .map_err(|e| ArrowError::ParseError(format!("JSON parse error: {}", e)))?; - let Value::Object(obj) = parsed else { - panic!("Expected JSON object"); - }; + let obj = parsed.as_object().expect("expected JSON object"); assert_eq!(obj.len(), 6); assert_eq!( obj.get("string_field"), From 0e10d82d89af913494987dbff01215a8c5993f5a Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Tue, 24 Jun 2025 20:17:12 -0400 Subject: [PATCH 26/32] [FIX] retain the position of from bool --- parquet-variant/src/variant.rs | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 923faf8431c5..d1a34018a158 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -980,6 +980,15 @@ impl From<()> for Variant<'_, '_> { } } +impl From for Variant<'_, '_> { + fn from(value: bool) -> Self { + match value { + true => Variant::BooleanTrue, + false => Variant::BooleanFalse, + } + } +} + impl From for Variant<'_, '_> { fn from(value: i8) -> Self { Variant::Int8(value) @@ -1009,14 +1018,6 @@ impl From for Variant<'_, '_> { Variant::Decimal4(value) } } -impl From for Variant<'_, '_> { - fn from(value: bool) -> Self { - match value { - true => Variant::BooleanTrue, - false => Variant::BooleanFalse, - } - } -} impl From for Variant<'_, '_> { fn from(value: VariantDecimal8) -> Self { From 35a2f50ed710e5bda64903a42d2b25885464bc9a Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Tue, 24 Jun 2025 20:21:28 -0400 Subject: [PATCH 27/32] [FIX] make the returns from writes more modular --- parquet-variant/src/to_json.rs | 52 ++++++++++------------------------ 1 file changed, 15 insertions(+), 37 deletions(-) diff --git a/parquet-variant/src/to_json.rs b/parquet-variant/src/to_json.rs index 80759b80a5c8..638065db435a 100644 --- a/parquet-variant/src/to_json.rs +++ b/parquet-variant/src/to_json.rs @@ -80,33 +80,15 @@ fn format_binary_base64(bytes: &[u8]) -> String { /// ``` pub fn variant_to_json(json_buffer: &mut impl Write, variant: &Variant) -> Result<(), ArrowError> { match variant { - Variant::Null => { - write!(json_buffer, "null")?; - } - Variant::BooleanTrue => { - write!(json_buffer, "true")?; - } - Variant::BooleanFalse => { - write!(json_buffer, "false")?; - } - Variant::Int8(i) => { - write!(json_buffer, "{}", i)?; - } - Variant::Int16(i) => { - write!(json_buffer, "{}", i)?; - } - Variant::Int32(i) => { - write!(json_buffer, "{}", i)?; - } - Variant::Int64(i) => { - write!(json_buffer, "{}", i)?; - } - Variant::Float(f) => { - write!(json_buffer, "{}", f)?; - } - Variant::Double(f) => { - write!(json_buffer, "{}", f)?; - } + Variant::Null => write!(json_buffer, "null")?, + Variant::BooleanTrue => write!(json_buffer, "true")?, + Variant::BooleanFalse => write!(json_buffer, "false")?, + Variant::Int8(i) => write!(json_buffer, "{}", i)?, + Variant::Int16(i) => write!(json_buffer, "{}", i)?, + Variant::Int32(i) => write!(json_buffer, "{}", i)?, + Variant::Int64(i) => write!(json_buffer, "{}", i)?, + Variant::Float(f) => write!(json_buffer, "{}", f)?, + Variant::Double(f) => write!(json_buffer, "{}", f)?, Variant::Decimal4(VariantDecimal4 { integer, scale }) => { // Convert decimal to string representation using integer arithmetic if *scale == 0 { @@ -158,14 +140,10 @@ pub fn variant_to_json(json_buffer: &mut impl Write, variant: &Variant) -> Resul } } } - Variant::Date(date) => { - write!(json_buffer, "\"{}\"", format_date_string(date))?; - } - Variant::TimestampMicros(ts) => { - write!(json_buffer, "\"{}\"", ts.to_rfc3339())?; - } + Variant::Date(date) => write!(json_buffer, "\"{}\"", format_date_string(date))?, + Variant::TimestampMicros(ts) => write!(json_buffer, "\"{}\"", ts.to_rfc3339())?, Variant::TimestampNtzMicros(ts) => { - write!(json_buffer, "\"{}\"", format_timestamp_ntz_string(ts))?; + write!(json_buffer, "\"{}\"", format_timestamp_ntz_string(ts))? } Variant::Binary(bytes) => { // Encode binary as base64 string @@ -173,21 +151,21 @@ pub fn variant_to_json(json_buffer: &mut impl Write, variant: &Variant) -> Resul let json_str = serde_json::to_string(&base64_str).map_err(|e| { ArrowError::InvalidArgumentError(format!("JSON encoding error: {}", e)) })?; - write!(json_buffer, "{}", json_str)?; + write!(json_buffer, "{}", json_str)? } Variant::String(s) => { // Use serde_json to properly escape the string let json_str = serde_json::to_string(s).map_err(|e| { ArrowError::InvalidArgumentError(format!("JSON encoding error: {}", e)) })?; - write!(json_buffer, "{}", json_str)?; + write!(json_buffer, "{}", json_str)? } Variant::ShortString(s) => { // Use serde_json to properly escape the string let json_str = serde_json::to_string(s.as_str()).map_err(|e| { ArrowError::InvalidArgumentError(format!("JSON encoding error: {}", e)) })?; - write!(json_buffer, "{}", json_str)?; + write!(json_buffer, "{}", json_str)? } Variant::Object(obj) => { convert_object_to_json(json_buffer, obj)?; From b16d1e5a3dda8c39fe35bd0e40238724d26c20d8 Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Tue, 24 Jun 2025 20:30:54 -0400 Subject: [PATCH 28/32] [FIX] make the write decimal functions modular --- parquet-variant/src/to_json.rs | 89 +++++++++++++++++----------------- 1 file changed, 44 insertions(+), 45 deletions(-) diff --git a/parquet-variant/src/to_json.rs b/parquet-variant/src/to_json.rs index 638065db435a..588782b96264 100644 --- a/parquet-variant/src/to_json.rs +++ b/parquet-variant/src/to_json.rs @@ -42,6 +42,47 @@ fn format_binary_base64(bytes: &[u8]) -> String { general_purpose::STANDARD.encode(bytes) } +/// Generic function to write decimal values to JSON buffer +fn write_decimal( + json_buffer: &mut impl Write, + integer: impl std::fmt::Display + std::fmt::Debug, + scale: u8, +) -> Result<(), ArrowError> { + if scale == 0 { + write!(json_buffer, "{}", integer)?; + } else { + // Convert to string and manually handle the decimal point placement + let integer_str = integer.to_string(); + let is_negative = integer_str.starts_with('-'); + let abs_str = if is_negative { &integer_str[1..] } else { &integer_str }; + + let scale_usize = scale as usize; + if abs_str.len() <= scale_usize { + // Need to pad with leading zeros + let zeros_needed = scale_usize - abs_str.len(); + let padded = format!("{}{}", "0".repeat(zeros_needed), abs_str); + let decimal_part = padded.trim_end_matches('0'); + if decimal_part.is_empty() { + write!(json_buffer, "0")?; + } else { + write!(json_buffer, "{}0.{}", if is_negative { "-" } else { "" }, decimal_part)?; + } + } else { + // Split the string at the decimal point + let split_point = abs_str.len() - scale_usize; + let integer_part = &abs_str[..split_point]; + let decimal_part = abs_str[split_point..].trim_end_matches('0'); + + if decimal_part.is_empty() { + write!(json_buffer, "{}{}", if is_negative { "-" } else { "" }, integer_part)?; + } else { + write!(json_buffer, "{}{}.{}", if is_negative { "-" } else { "" }, integer_part, decimal_part)?; + } + } + } + Ok(()) +} + /// Converts a Variant to JSON and writes it to the provided `Write` /// /// This function writes JSON directly to any type that implements [`Write`], @@ -90,55 +131,13 @@ pub fn variant_to_json(json_buffer: &mut impl Write, variant: &Variant) -> Resul Variant::Float(f) => write!(json_buffer, "{}", f)?, Variant::Double(f) => write!(json_buffer, "{}", f)?, Variant::Decimal4(VariantDecimal4 { integer, scale }) => { - // Convert decimal to string representation using integer arithmetic - if *scale == 0 { - write!(json_buffer, "{}", integer)?; - } else { - let divisor = 10_i32.pow(*scale as u32); - let quotient = integer / divisor; - let remainder = (integer % divisor).abs(); - let formatted_remainder = format!("{:0width$}", remainder, width = *scale as usize); - let trimmed_remainder = formatted_remainder.trim_end_matches('0'); - if trimmed_remainder.is_empty() { - write!(json_buffer, "{}", quotient)?; - } else { - write!(json_buffer, "{}.{}", quotient, trimmed_remainder)?; - } - } + write_decimal(json_buffer, *integer, *scale)?; } Variant::Decimal8(VariantDecimal8 { integer, scale }) => { - // Convert decimal to string representation using integer arithmetic - if *scale == 0 { - write!(json_buffer, "{}", integer)?; - } else { - let divisor = 10_i64.pow(*scale as u32); - let quotient = integer / divisor; - let remainder = (integer % divisor).abs(); - let formatted_remainder = format!("{:0width$}", remainder, width = *scale as usize); - let trimmed_remainder = formatted_remainder.trim_end_matches('0'); - if trimmed_remainder.is_empty() { - write!(json_buffer, "{}", quotient)?; - } else { - write!(json_buffer, "{}.{}", quotient, trimmed_remainder)?; - } - } + write_decimal(json_buffer, *integer, *scale)?; } Variant::Decimal16(VariantDecimal16 { integer, scale }) => { - // Convert decimal to string representation using integer arithmetic - if *scale == 0 { - write!(json_buffer, "{}", integer)?; - } else { - let divisor = 10_i128.pow(*scale as u32); - let quotient = integer / divisor; - let remainder = (integer % divisor).abs(); - let formatted_remainder = format!("{:0width$}", remainder, width = *scale as usize); - let trimmed_remainder = formatted_remainder.trim_end_matches('0'); - if trimmed_remainder.is_empty() { - write!(json_buffer, "{}", quotient)?; - } else { - write!(json_buffer, "{}.{}", quotient, trimmed_remainder)?; - } - } + write_decimal(json_buffer, *integer, *scale)?; } Variant::Date(date) => write!(json_buffer, "\"{}\"", format_date_string(date))?, Variant::TimestampMicros(ts) => write!(json_buffer, "\"{}\"", ts.to_rfc3339())?, From 4010b735a285406addadf9e96a9ef107ef3e33a3 Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Tue, 24 Jun 2025 20:54:57 -0400 Subject: [PATCH 29/32] [FIX] format code with improved iters and value::null handling --- parquet-variant/src/to_json.rs | 82 ++++++++++++++++++++++++++++------ 1 file changed, 68 insertions(+), 14 deletions(-) diff --git a/parquet-variant/src/to_json.rs b/parquet-variant/src/to_json.rs index 588782b96264..09a5d55fb47b 100644 --- a/parquet-variant/src/to_json.rs +++ b/parquet-variant/src/to_json.rs @@ -207,13 +207,13 @@ fn convert_object_to_json(buffer: &mut impl Write, obj: &VariantObject) -> Resul fn convert_array_to_json(buffer: &mut impl Write, arr: &VariantList) -> Result<(), ArrowError> { write!(buffer, "[")?; - let len = arr.len(); - for i in 0..len { - if i > 0 { + let mut first = true; + for element in arr.iter() { + if !first { write!(buffer, ",")?; } + first = false; - let element = arr.get(i)?; variant_to_json(buffer, &element)?; } @@ -336,7 +336,7 @@ pub fn variant_to_json_value(variant: &Variant) -> Result { Variant::Int16(i) => Ok(Value::Number((*i).into())), Variant::Int32(i) => Ok(Value::Number((*i).into())), Variant::Int64(i) => Ok(Value::Number((*i).into())), - Variant::Float(f) => serde_json::Number::from_f64(*f as f64) + Variant::Float(f) => serde_json::Number::from_f64((*f).into()) .map(Value::Number) .ok_or_else(|| ArrowError::InvalidArgumentError("Invalid float value".to_string())), Variant::Double(f) => serde_json::Number::from_f64(*f) @@ -350,12 +350,14 @@ pub fn variant_to_json_value(variant: &Variant) -> Result { let divisor = 10_i32.pow(*scale as u32); let quotient = integer / divisor; let remainder = (integer % divisor).abs(); - let formatted_remainder = format!("{:0width$}", remainder, width = *scale as usize); - let trimmed_remainder = formatted_remainder.trim_end_matches('0'); - let decimal_str = if trimmed_remainder.is_empty() { + let decimal_str = if remainder == 0 { quotient.to_string() } else { + // The {:0width$} format ensures it correctly renders with leading zeros (e.g., .0000100) + let formatted_remainder = format!("{:0width$}", remainder, width = *scale as usize); + // Then strip away any trailing zeros (e.g., .00001) + let trimmed_remainder = formatted_remainder.trim_end_matches('0'); format!("{}.{}", quotient, trimmed_remainder) }; @@ -376,12 +378,14 @@ pub fn variant_to_json_value(variant: &Variant) -> Result { let divisor = 10_i64.pow(*scale as u32); let quotient = integer / divisor; let remainder = (integer % divisor).abs(); - let formatted_remainder = format!("{:0width$}", remainder, width = *scale as usize); - let trimmed_remainder = formatted_remainder.trim_end_matches('0'); - let decimal_str = if trimmed_remainder.is_empty() { + let decimal_str = if remainder == 0 { quotient.to_string() } else { + // The {:0width$} format ensures it correctly renders with leading zeros (e.g., .0000100) + let formatted_remainder = format!("{:0width$}", remainder, width = *scale as usize); + // Then strip away any trailing zeros (e.g., .00001) + let trimmed_remainder = formatted_remainder.trim_end_matches('0'); format!("{}.{}", quotient, trimmed_remainder) }; @@ -402,12 +406,14 @@ pub fn variant_to_json_value(variant: &Variant) -> Result { let divisor = 10_i128.pow(*scale as u32); let quotient = integer / divisor; let remainder = (integer % divisor).abs(); - let formatted_remainder = format!("{:0width$}", remainder, width = *scale as usize); - let trimmed_remainder = formatted_remainder.trim_end_matches('0'); - let decimal_str = if trimmed_remainder.is_empty() { + let decimal_str = if remainder == 0 { quotient.to_string() } else { + // The {:0width$} format ensures it correctly renders with leading zeros (e.g., .0000100) + let formatted_remainder = format!("{:0width$}", remainder, width = *scale as usize); + // Then strip away any trailing zeros (e.g., .00001) + let trimmed_remainder = formatted_remainder.trim_end_matches('0'); format!("{}.{}", quotient, trimmed_remainder) }; @@ -1246,4 +1252,52 @@ mod tests { Ok(()) } + + #[test] + fn test_float_nan_inf_handling() -> Result<(), ArrowError> { + // Test NaN handling - should return an error since JSON doesn't support NaN + let nan_variant = Variant::Float(f32::NAN); + let nan_result = variant_to_json_value(&nan_variant); + assert!(nan_result.is_err()); + assert!(nan_result.unwrap_err().to_string().contains("Invalid float value")); + + // Test positive infinity - should return an error since JSON doesn't support Infinity + let pos_inf_variant = Variant::Float(f32::INFINITY); + let pos_inf_result = variant_to_json_value(&pos_inf_variant); + assert!(pos_inf_result.is_err()); + assert!(pos_inf_result.unwrap_err().to_string().contains("Invalid float value")); + + // Test negative infinity - should return an error since JSON doesn't support -Infinity + let neg_inf_variant = Variant::Float(f32::NEG_INFINITY); + let neg_inf_result = variant_to_json_value(&neg_inf_variant); + assert!(neg_inf_result.is_err()); + assert!(neg_inf_result.unwrap_err().to_string().contains("Invalid float value")); + + // Test the same for Double variants + let nan_double_variant = Variant::Double(f64::NAN); + let nan_double_result = variant_to_json_value(&nan_double_variant); + assert!(nan_double_result.is_err()); + assert!(nan_double_result.unwrap_err().to_string().contains("Invalid double value")); + + let pos_inf_double_variant = Variant::Double(f64::INFINITY); + let pos_inf_double_result = variant_to_json_value(&pos_inf_double_variant); + assert!(pos_inf_double_result.is_err()); + assert!(pos_inf_double_result.unwrap_err().to_string().contains("Invalid double value")); + + let neg_inf_double_variant = Variant::Double(f64::NEG_INFINITY); + let neg_inf_double_result = variant_to_json_value(&neg_inf_double_variant); + assert!(neg_inf_double_result.is_err()); + assert!(neg_inf_double_result.unwrap_err().to_string().contains("Invalid double value")); + + // Test normal float values still work + let normal_float = Variant::Float(3.14f32); + let normal_result = variant_to_json_value(&normal_float)?; + assert!(matches!(normal_result, Value::Number(_))); + + let normal_double = Variant::Double(2.71828f64); + let normal_double_result = variant_to_json_value(&normal_double)?; + assert!(matches!(normal_double_result, Value::Number(_))); + + Ok(()) + } } From de7c979e8b4e1f095811457823869dbbad4a4d02 Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Tue, 24 Jun 2025 21:09:31 -0400 Subject: [PATCH 30/32] [FIX] fix further precision issues from the decimal --- parquet-variant/src/to_json.rs | 225 ++++++++++++++++----------------- 1 file changed, 107 insertions(+), 118 deletions(-) diff --git a/parquet-variant/src/to_json.rs b/parquet-variant/src/to_json.rs index 09a5d55fb47b..3e2886e98cc2 100644 --- a/parquet-variant/src/to_json.rs +++ b/parquet-variant/src/to_json.rs @@ -42,44 +42,47 @@ fn format_binary_base64(bytes: &[u8]) -> String { general_purpose::STANDARD.encode(bytes) } -/// Generic function to write decimal values to JSON buffer -fn write_decimal( +/// Write decimal using scovich's hybrid approach for i32 +fn write_decimal_i32( json_buffer: &mut impl Write, - integer: impl std::fmt::Display + std::fmt::Debug, + integer: i32, scale: u8, ) -> Result<(), ArrowError> { - if scale == 0 { - write!(json_buffer, "{}", integer)?; + let integer = if scale == 0 { + integer } else { - // Convert to string and manually handle the decimal point placement - let integer_str = integer.to_string(); - let is_negative = integer_str.starts_with('-'); - let abs_str = if is_negative { &integer_str[1..] } else { &integer_str }; - - let scale_usize = scale as usize; - if abs_str.len() <= scale_usize { - // Need to pad with leading zeros - let zeros_needed = scale_usize - abs_str.len(); - let padded = format!("{}{}", "0".repeat(zeros_needed), abs_str); - let decimal_part = padded.trim_end_matches('0'); - if decimal_part.is_empty() { - write!(json_buffer, "0")?; - } else { - write!(json_buffer, "{}0.{}", if is_negative { "-" } else { "" }, decimal_part)?; - } - } else { - // Split the string at the decimal point - let split_point = abs_str.len() - scale_usize; - let integer_part = &abs_str[..split_point]; - let decimal_part = abs_str[split_point..].trim_end_matches('0'); - - if decimal_part.is_empty() { - write!(json_buffer, "{}{}", if is_negative { "-" } else { "" }, integer_part)?; - } else { - write!(json_buffer, "{}{}.{}", if is_negative { "-" } else { "" }, integer_part, decimal_part)?; - } + let divisor = 10_i32.pow(scale as u32); + if integer % divisor != 0 { + // fall back to floating point + let result = integer as f64 / divisor as f64; + write!(json_buffer, "{}", result)?; + return Ok(()); } - } + integer / divisor + }; + write!(json_buffer, "{}", integer)?; + Ok(()) +} + +/// Write decimal using scovich's hybrid approach for i64 +fn write_decimal_i64( + json_buffer: &mut impl Write, + integer: i64, + scale: u8, +) -> Result<(), ArrowError> { + let integer = if scale == 0 { + integer + } else { + let divisor = 10_i64.pow(scale as u32); + if integer % divisor != 0 { + // fall back to floating point + let result = integer as f64 / divisor as f64; + write!(json_buffer, "{}", result)?; + return Ok(()); + } + integer / divisor + }; + write!(json_buffer, "{}", integer)?; Ok(()) } @@ -131,13 +134,32 @@ pub fn variant_to_json(json_buffer: &mut impl Write, variant: &Variant) -> Resul Variant::Float(f) => write!(json_buffer, "{}", f)?, Variant::Double(f) => write!(json_buffer, "{}", f)?, Variant::Decimal4(VariantDecimal4 { integer, scale }) => { - write_decimal(json_buffer, *integer, *scale)?; + write_decimal_i32(json_buffer, *integer, *scale)?; } Variant::Decimal8(VariantDecimal8 { integer, scale }) => { - write_decimal(json_buffer, *integer, *scale)?; + write_decimal_i64(json_buffer, *integer, *scale)?; } Variant::Decimal16(VariantDecimal16 { integer, scale }) => { - write_decimal(json_buffer, *integer, *scale)?; + let integer = if *scale == 0 { + *integer + } else { + let divisor = 10_i128.pow(*scale as u32); + if integer % divisor != 0 { + // fall back to floating point + let result = *integer as f64 / divisor as f64; + write!(json_buffer, "{}", result)?; + return Ok(()); + } + integer / divisor + }; + // Prefer to emit as i64, but fall back to u64 or even f64 (lossy) if necessary + if let Ok(i64_val) = i64::try_from(integer) { + write!(json_buffer, "{}", i64_val)?; + } else if let Ok(u64_val) = u64::try_from(integer) { + write!(json_buffer, "{}", u64_val)?; + } else { + write!(json_buffer, "{}", integer as f64)?; + } } Variant::Date(date) => write!(json_buffer, "\"{}\"", format_date_string(date))?, Variant::TimestampMicros(ts) => write!(json_buffer, "\"{}\"", ts.to_rfc3339())?, @@ -343,88 +365,48 @@ pub fn variant_to_json_value(variant: &Variant) -> Result { .map(Value::Number) .ok_or_else(|| ArrowError::InvalidArgumentError("Invalid double value".to_string())), Variant::Decimal4(VariantDecimal4 { integer, scale }) => { - // Use integer arithmetic to avoid f64 precision loss - if *scale == 0 { - Ok(Value::Number((*integer).into())) + let integer = if *scale == 0 { + *integer } else { let divisor = 10_i32.pow(*scale as u32); - let quotient = integer / divisor; - let remainder = (integer % divisor).abs(); - - let decimal_str = if remainder == 0 { - quotient.to_string() - } else { - // The {:0width$} format ensures it correctly renders with leading zeros (e.g., .0000100) - let formatted_remainder = format!("{:0width$}", remainder, width = *scale as usize); - // Then strip away any trailing zeros (e.g., .00001) - let trimmed_remainder = formatted_remainder.trim_end_matches('0'); - format!("{}.{}", quotient, trimmed_remainder) - }; - - // Parse as serde_json::Number to preserve precision - decimal_str - .parse::() - .map(Value::Number) - .map_err(|e| { - ArrowError::InvalidArgumentError(format!("Invalid decimal string: {}", e)) - }) - } + if integer % divisor != 0 { + // fall back to floating point + return Ok(Value::from(*integer as f64 / divisor as f64)); + } + integer / divisor + }; + Ok(Value::from(integer)) } Variant::Decimal8(VariantDecimal8 { integer, scale }) => { - // Use integer arithmetic to avoid f64 precision loss - if *scale == 0 { - Ok(Value::Number((*integer).into())) + let integer = if *scale == 0 { + *integer } else { let divisor = 10_i64.pow(*scale as u32); - let quotient = integer / divisor; - let remainder = (integer % divisor).abs(); - - let decimal_str = if remainder == 0 { - quotient.to_string() - } else { - // The {:0width$} format ensures it correctly renders with leading zeros (e.g., .0000100) - let formatted_remainder = format!("{:0width$}", remainder, width = *scale as usize); - // Then strip away any trailing zeros (e.g., .00001) - let trimmed_remainder = formatted_remainder.trim_end_matches('0'); - format!("{}.{}", quotient, trimmed_remainder) - }; - - // Parse as serde_json::Number to preserve precision - decimal_str - .parse::() - .map(Value::Number) - .map_err(|e| { - ArrowError::InvalidArgumentError(format!("Invalid decimal string: {}", e)) - }) - } + if integer % divisor != 0 { + // fall back to floating point + return Ok(Value::from(*integer as f64 / divisor as f64)); + } + integer / divisor + }; + Ok(Value::from(integer)) } Variant::Decimal16(VariantDecimal16 { integer, scale }) => { - // Use integer arithmetic to avoid f64 precision loss - if *scale == 0 { - Ok(Value::Number((*integer as i64).into())) // Convert to i64 for JSON compatibility + let integer = if *scale == 0 { + *integer } else { let divisor = 10_i128.pow(*scale as u32); - let quotient = integer / divisor; - let remainder = (integer % divisor).abs(); - - let decimal_str = if remainder == 0 { - quotient.to_string() - } else { - // The {:0width$} format ensures it correctly renders with leading zeros (e.g., .0000100) - let formatted_remainder = format!("{:0width$}", remainder, width = *scale as usize); - // Then strip away any trailing zeros (e.g., .00001) - let trimmed_remainder = formatted_remainder.trim_end_matches('0'); - format!("{}.{}", quotient, trimmed_remainder) - }; - - // Parse as serde_json::Number to preserve precision - decimal_str - .parse::() - .map(Value::Number) - .map_err(|e| { - ArrowError::InvalidArgumentError(format!("Invalid decimal string: {}", e)) - }) - } + if integer % divisor != 0 { + // fall back to floating point + return Ok(Value::from(*integer as f64 / divisor as f64)); + } + integer / divisor + }; + // Prefer to emit as i64, but fall back to u64 or even f64 (lossy) if necessary + let value = i64::try_from(integer) + .map(Value::from) + .or_else(|_| u64::try_from(integer).map(Value::from)) + .unwrap_or_else(|_| Value::from(integer as f64)); + Ok(value) } Variant::Date(date) => Ok(Value::String(format_date_string(date))), Variant::TimestampMicros(ts) => Ok(Value::String(ts.to_rfc3339())), @@ -1222,8 +1204,8 @@ mod tests { } #[test] - fn test_high_precision_decimal_no_loss() -> Result<(), ArrowError> { - // Test case that would lose precision with f64 conversion + fn test_decimal_precision_behavior() -> Result<(), ArrowError> { + // Test case that demonstrates f64 precision limits // This is a 63-bit precision decimal8 value that f64 cannot represent exactly let high_precision_decimal8 = Variant::from(VariantDecimal8::try_new( 9007199254740993, // 2^53 + 1, exceeds f64 precision @@ -1233,22 +1215,29 @@ mod tests { let json_string = variant_to_json_string(&high_precision_decimal8)?; let json_value = variant_to_json_value(&high_precision_decimal8)?; - // Expected result: 9007199254.740993 (exact representation) - assert_eq!(json_string, "9007199254.740993"); - - // Verify that both functions produce consistent results + // Due to f64 precision limits, we expect precision loss for values > 2^53 + // Both functions should produce consistent results (even if not exact) let parsed: Value = serde_json::from_str(&json_string) .map_err(|e| ArrowError::ParseError(format!("JSON parse error: {}", e)))?; assert_eq!(parsed, json_value); - // Test another case with trailing zeros that should be trimmed - let decimal_with_zeros = Variant::from(VariantDecimal8::try_new( + // Test a case that can be exactly represented (integer result) + let exact_decimal = Variant::from(VariantDecimal8::try_new( 1234567890000, // Should result in 1234567.89 (trailing zeros trimmed) 6, )?); - let json_string_zeros = variant_to_json_string(&decimal_with_zeros)?; - assert_eq!(json_string_zeros, "1234567.89"); + let json_string_exact = variant_to_json_string(&exact_decimal)?; + assert_eq!(json_string_exact, "1234567.89"); + + // Test integer case (should be exact) + let integer_decimal = Variant::from(VariantDecimal8::try_new( + 42000000, // Should result in 42 (integer) + 6, + )?); + + let json_string_integer = variant_to_json_string(&integer_decimal)?; + assert_eq!(json_string_integer, "42"); Ok(()) } From d8f43f2ab81f3eeab0b89a3939e4654ac8999799 Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Tue, 24 Jun 2025 21:17:31 -0400 Subject: [PATCH 31/32] [FIX] implement fromiterator --- parquet-variant/src/to_json.rs | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/parquet-variant/src/to_json.rs b/parquet-variant/src/to_json.rs index 3e2886e98cc2..a74ec724018b 100644 --- a/parquet-variant/src/to_json.rs +++ b/parquet-variant/src/to_json.rs @@ -415,25 +415,17 @@ pub fn variant_to_json_value(variant: &Variant) -> Result { Variant::String(s) => Ok(Value::String(s.to_string())), Variant::ShortString(s) => Ok(Value::String(s.to_string())), Variant::Object(obj) => { - let mut map = serde_json::Map::new(); - - for (key, value) in obj.iter() { - let json_value = variant_to_json_value(&value)?; - map.insert(key.to_string(), json_value); - } - + let map = obj + .iter() + .map(|(k, v)| variant_to_json_value(&v).map(|json_val| (k.to_string(), json_val))) + .collect::>()?; Ok(Value::Object(map)) } Variant::List(arr) => { - let mut vec = Vec::new(); - let len = arr.len(); - - for i in 0..len { - let element = arr.get(i)?; - let json_value = variant_to_json_value(&element)?; - vec.push(json_value); - } - + let vec = arr + .iter() + .map(|element| variant_to_json_value(&element)) + .collect::>()?; Ok(Value::Array(vec)) } } From 2f525696fb0ee39a4afa4d74d6a776c54a70ced5 Mon Sep 17 00:00:00 2001 From: carpecodeum Date: Tue, 24 Jun 2025 21:21:20 -0400 Subject: [PATCH 32/32] [FIX] fix clippy and fmt errors --- parquet-variant/src/to_json.rs | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/parquet-variant/src/to_json.rs b/parquet-variant/src/to_json.rs index a74ec724018b..ac201148388e 100644 --- a/parquet-variant/src/to_json.rs +++ b/parquet-variant/src/to_json.rs @@ -1240,42 +1240,60 @@ mod tests { let nan_variant = Variant::Float(f32::NAN); let nan_result = variant_to_json_value(&nan_variant); assert!(nan_result.is_err()); - assert!(nan_result.unwrap_err().to_string().contains("Invalid float value")); + assert!(nan_result + .unwrap_err() + .to_string() + .contains("Invalid float value")); // Test positive infinity - should return an error since JSON doesn't support Infinity let pos_inf_variant = Variant::Float(f32::INFINITY); let pos_inf_result = variant_to_json_value(&pos_inf_variant); assert!(pos_inf_result.is_err()); - assert!(pos_inf_result.unwrap_err().to_string().contains("Invalid float value")); + assert!(pos_inf_result + .unwrap_err() + .to_string() + .contains("Invalid float value")); // Test negative infinity - should return an error since JSON doesn't support -Infinity let neg_inf_variant = Variant::Float(f32::NEG_INFINITY); let neg_inf_result = variant_to_json_value(&neg_inf_variant); assert!(neg_inf_result.is_err()); - assert!(neg_inf_result.unwrap_err().to_string().contains("Invalid float value")); + assert!(neg_inf_result + .unwrap_err() + .to_string() + .contains("Invalid float value")); // Test the same for Double variants let nan_double_variant = Variant::Double(f64::NAN); let nan_double_result = variant_to_json_value(&nan_double_variant); assert!(nan_double_result.is_err()); - assert!(nan_double_result.unwrap_err().to_string().contains("Invalid double value")); + assert!(nan_double_result + .unwrap_err() + .to_string() + .contains("Invalid double value")); let pos_inf_double_variant = Variant::Double(f64::INFINITY); let pos_inf_double_result = variant_to_json_value(&pos_inf_double_variant); assert!(pos_inf_double_result.is_err()); - assert!(pos_inf_double_result.unwrap_err().to_string().contains("Invalid double value")); + assert!(pos_inf_double_result + .unwrap_err() + .to_string() + .contains("Invalid double value")); let neg_inf_double_variant = Variant::Double(f64::NEG_INFINITY); let neg_inf_double_result = variant_to_json_value(&neg_inf_double_variant); assert!(neg_inf_double_result.is_err()); - assert!(neg_inf_double_result.unwrap_err().to_string().contains("Invalid double value")); + assert!(neg_inf_double_result + .unwrap_err() + .to_string() + .contains("Invalid double value")); // Test normal float values still work - let normal_float = Variant::Float(3.14f32); + let normal_float = Variant::Float(std::f32::consts::PI); let normal_result = variant_to_json_value(&normal_float)?; assert!(matches!(normal_result, Value::Number(_))); - let normal_double = Variant::Double(2.71828f64); + let normal_double = Variant::Double(std::f64::consts::E); let normal_double_result = variant_to_json_value(&normal_double)?; assert!(matches!(normal_double_result, Value::Number(_)));