From 2a8bdbc20a01f869ab84784e482addd79138cf1f Mon Sep 17 00:00:00 2001 From: PinkCrow007 <1053603622@qq.com> Date: Thu, 12 Jun 2025 08:45:50 -0700 Subject: [PATCH 01/15] init --- parquet-variant/src/builder.rs | 409 +++++++++++++++++++++++ parquet-variant/src/decoder.rs | 4 +- parquet-variant/src/lib.rs | 2 + parquet-variant/tests/variant_interop.rs | 101 +++++- 4 files changed, 508 insertions(+), 8 deletions(-) create mode 100644 parquet-variant/src/builder.rs diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs new file mode 100644 index 000000000000..eaafa3f8d4b0 --- /dev/null +++ b/parquet-variant/src/builder.rs @@ -0,0 +1,409 @@ +use std::collections::HashMap; +use crate::decoder::{VariantBasicType, VariantPrimitiveType}; + +const BASIC_TYPE_BITS: u8 = 2; +const MAX_SHORT_STRING_SIZE: usize = 0x3F; + +pub fn primitive_header(primitive_type: VariantPrimitiveType) -> u8 { + (primitive_type as u8) << 2 | VariantBasicType::Primitive as u8 +} + +pub fn short_string_header(len: usize) -> u8 { + (len as u8) << 2 | VariantBasicType::ShortString as u8 +} + +pub fn array_header(large: bool, offset_size: u8) -> u8 { + let large_bit = if large { 1 } else { 0 }; + (large_bit << (BASIC_TYPE_BITS + 2)) | + ((offset_size - 1) << BASIC_TYPE_BITS) | + VariantBasicType::Array as u8 +} + +pub fn object_header(large: bool, id_size: u8, offset_size: u8) -> u8 { + let large_bit = if large { 1 } else { 0 }; + (large_bit << (BASIC_TYPE_BITS + 4)) | + ((id_size - 1) << (BASIC_TYPE_BITS + 2)) | + ((offset_size - 1) << BASIC_TYPE_BITS) | + VariantBasicType::Object as u8 +} + +fn int_size(v: usize) -> u8 { + match v { + 0..=0xFF => 1, + 0x100..=0xFFFF => 2, + 0x10000..=0xFFFFFF => 3, + _ => 4, + } +} + +fn write_offset(buf: &mut [u8], value: usize, nbytes: u8) { + for i in 0..nbytes { + buf[i as usize] = ((value >> (i * 8)) & 0xFF) as u8; + } +} + +pub struct VariantBuilder { + buffer: Vec, + dict: HashMap, + dict_keys: Vec, +} + +impl VariantBuilder { + pub fn new() -> Self { + Self { + buffer: Vec::new(), + dict: HashMap::new(), + dict_keys: Vec::new(), + } + } + + pub fn append_null(&mut self) { + self.buffer.push(primitive_header(VariantPrimitiveType::Null)); + } + + pub fn append_bool(&mut self, value: bool) { + let primitive_type = if value { + VariantPrimitiveType::BooleanTrue + } else { + VariantPrimitiveType::BooleanFalse + }; + self.buffer.push(primitive_header(primitive_type)); + } + + pub fn append_int8(&mut self, value: i8) { + self.buffer.push(primitive_header(VariantPrimitiveType::Int8)); + self.buffer.push(value as u8); + } + + pub fn append_string(&mut self, value: &str) { + if value.len() <= MAX_SHORT_STRING_SIZE { + self.buffer.push(short_string_header(value.len())); + self.buffer.extend_from_slice(value.as_bytes()); + } else { + self.buffer.push(primitive_header(VariantPrimitiveType::String)); + self.buffer.extend_from_slice(&(value.len() as u32).to_le_bytes()); + self.buffer.extend_from_slice(value.as_bytes()); + } + } + + pub fn add_key(&mut self, key: &str) -> u32 { + if let Some(&id) = self.dict.get(key) { + return id; + } + let id = self.dict_keys.len() as u32; + self.dict.insert(key.to_string(), id); + self.dict_keys.push(key.to_string()); + id + } + + pub fn offset(&self) -> usize { + self.buffer.len() + } + + pub fn begin_array(&mut self) -> ArrayBuilder { + ArrayBuilder::new(self) + } + + pub fn begin_object(&mut self) -> ObjectBuilder { + ObjectBuilder::new(self) + } + + pub fn build(self) -> Vec { + self.buffer + } + + pub fn finish(self) -> (Vec, Vec) { + // Create metadata buffer with proper header + let mut metadata = Vec::new(); + + // Write metadata header: version=1, not sorted, offset_size=1 (offset_size_minus_one=0) + // Header format: bits 7-6: offset_size_minus_one (00), bit 5: sorted (0), bits 4-0: version (1) + let header = 0x01; // version = 1, sorted = false, offset_size_minus_one = 0 + metadata.push(header); + + // Write dictionary size (4 bytes little endian) + metadata.extend_from_slice(&(self.dict_keys.len() as u32).to_le_bytes()); + + // Write dictionary offsets (for empty dict, just write [0]) + if self.dict_keys.is_empty() { + metadata.push(0); // offset 0 for empty dictionary + } else { + // Write offsets for each dictionary entry plus end offset + let mut current_offset = 0u32; + metadata.push(current_offset as u8); // start offset + + for key in &self.dict_keys { + current_offset += key.len() as u32; + metadata.push(current_offset as u8); + } + + // Write the dictionary string data + for key in &self.dict_keys { + metadata.extend_from_slice(key.as_bytes()); + } + } + + (metadata, self.buffer) + } +} + +impl Default for VariantBuilder { + fn default() -> Self { + Self::new() + } +} + +pub struct ArrayBuilder<'a> { + parent: &'a mut VariantBuilder, + start_pos: usize, + offsets: Vec, +} + +impl<'a> ArrayBuilder<'a> { + fn new(parent: &'a mut VariantBuilder) -> Self { + let start_pos = parent.offset(); + Self { + parent, + start_pos, + offsets: vec![0], + } + } + + pub fn append_element(&mut self, f: F) + where F: FnOnce(&mut VariantBuilder) + { + f(self.parent); + let element_end = self.parent.offset() - self.start_pos; + self.offsets.push(element_end); + } + + pub fn finish(self) { + let data_size = self.parent.offset() - self.start_pos; + let num_elements = self.offsets.len() - 1; + let is_large = num_elements > u8::MAX as usize; + let size_bytes = if is_large { 4 } else { 1 }; + let offset_size = int_size(data_size); + let header_size = 1 + size_bytes + (num_elements + 1) * offset_size as usize; + + let current_len = self.parent.buffer.len(); + self.parent.buffer.resize(current_len + header_size, 0); + + let src_start = self.start_pos; + let src_end = current_len; + let dst_start = self.start_pos + header_size; + + self.parent.buffer.copy_within(src_start..src_end, dst_start); + + let mut pos = self.start_pos; + self.parent.buffer[pos] = array_header(is_large, offset_size); + pos += 1; + + if is_large { + self.parent.buffer[pos..pos + 4].copy_from_slice(&(num_elements as u32).to_le_bytes()); + pos += 4; + } else { + self.parent.buffer[pos] = num_elements as u8; + pos += 1; + } + + for offset in &self.offsets { + write_offset(&mut self.parent.buffer[pos..pos + offset_size as usize], *offset, offset_size); + pos += offset_size as usize; + } + } +} + +pub struct ObjectBuilder<'a> { + parent: &'a mut VariantBuilder, + start_pos: usize, + fields: Vec<(u32, usize)>, +} + +impl<'a> ObjectBuilder<'a> { + fn new(parent: &'a mut VariantBuilder) -> Self { + let start_pos = parent.offset(); + Self { + parent, + start_pos, + fields: Vec::new(), + } + } + + pub fn append_field(&mut self, key: &str, f: F) + where F: FnOnce(&mut VariantBuilder) + { + let id = self.parent.add_key(key); + let field_start = self.parent.offset() - self.start_pos; + f(self.parent); + self.fields.push((id, field_start)); + } + + pub fn finish(mut self) { + self.fields.sort_by_key(|&(id, _)| id); + + let data_size = self.parent.offset() - self.start_pos; + let num_fields = self.fields.len(); + let is_large = num_fields > u8::MAX as usize; + let size_bytes = if is_large { 4 } else { 1 }; + + let max_id = self.fields.iter().map(|&(id, _)| id).max().unwrap_or(0); + let id_size = int_size(max_id as usize); + let offset_size = int_size(data_size); + + let header_size = 1 + size_bytes + num_fields * id_size as usize + (num_fields + 1) * offset_size as usize; + + let current_len = self.parent.buffer.len(); + self.parent.buffer.resize(current_len + header_size, 0); + + let src_start = self.start_pos; + let src_end = current_len; + let dst_start = self.start_pos + header_size; + + self.parent.buffer.copy_within(src_start..src_end, dst_start); + + let mut pos = self.start_pos; + self.parent.buffer[pos] = object_header(is_large, id_size, offset_size); + pos += 1; + + if is_large { + self.parent.buffer[pos..pos + 4].copy_from_slice(&(num_fields as u32).to_le_bytes()); + pos += 4; + } else { + self.parent.buffer[pos] = num_fields as u8; + pos += 1; + } + + for &(id, _) in &self.fields { + write_offset(&mut self.parent.buffer[pos..pos + id_size as usize], id as usize, id_size); + pos += id_size as usize; + } + + for &(_, offset) in &self.fields { + write_offset(&mut self.parent.buffer[pos..pos + offset_size as usize], offset, offset_size); + pos += offset_size as usize; + } + write_offset(&mut self.parent.buffer[pos..pos + offset_size as usize], data_size, offset_size); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_simple_usage() { + let mut builder = VariantBuilder::new(); + + builder.append_null(); + builder.append_bool(true); + builder.append_int8(42); + builder.append_string("hello"); + + let result = builder.build(); + assert!(!result.is_empty()); + } + + #[test] + fn test_array() { + let mut builder = VariantBuilder::new(); + + { + let mut array = builder.begin_array(); + array.append_element(|b| b.append_int8(1)); + array.append_element(|b| b.append_int8(2)); + array.append_element(|b| b.append_string("test")); + array.finish(); + } + + let result = builder.build(); + assert!(!result.is_empty()); + } + + #[test] + fn test_object() { + let mut builder = VariantBuilder::new(); + + { + let mut obj = builder.begin_object(); + obj.append_field("name", |b| b.append_string("John")); + obj.append_field("age", |b| b.append_int8(30)); + obj.finish(); + } + + let result = builder.build(); + assert!(!result.is_empty()); + } + + #[test] + fn test_compatibility() { + use crate::decoder::{decode_int8, decode_short_string, get_basic_type, get_primitive_type}; + + let mut builder = VariantBuilder::new(); + builder.append_int8(42); + let result = builder.build(); + + let header = result[0]; + assert_eq!(get_basic_type(header).unwrap(), VariantBasicType::Primitive); + assert_eq!(get_primitive_type(header).unwrap(), VariantPrimitiveType::Int8); + assert_eq!(decode_int8(&result).unwrap(), 42); + + let mut builder = VariantBuilder::new(); + builder.append_string("Hello"); + let result = builder.build(); + + let header = result[0]; + assert_eq!(get_basic_type(header).unwrap(), VariantBasicType::ShortString); + assert_eq!(decode_short_string(&result).unwrap(), "Hello"); + } + + #[test] + fn test_object_structure() { + let mut builder = VariantBuilder::new(); + + { + let mut obj = builder.begin_object(); + obj.append_field("a", |b| b.append_int8(1)); + obj.append_field("b", |b| b.append_int8(2)); + obj.finish(); + } + + let result = builder.build(); + + // Print the byte structure for debugging + println!("Object bytes: {:?}", result); + + // Basic sanity check - should have more than just the header + assert!(result.len() > 10, "Object should have substantial size"); + + // Verify it can be parsed by the decoder + use crate::decoder::{get_basic_type, VariantBasicType}; + let header = result[0]; + assert_eq!(get_basic_type(header).unwrap(), VariantBasicType::Object); + } + + #[test] + fn test_object_offset_correctness() { + // Test with known field sizes to verify offset calculation + let mut builder = VariantBuilder::new(); + + { + let mut obj = builder.begin_object(); + // Field "x": int8 = 2 bytes (1 header + 1 value) + obj.append_field("x", |b| b.append_int8(42)); + // Field "y": string "hi" = 3 bytes (1 header + 2 chars) + obj.append_field("y", |b| b.append_string("hi")); + obj.finish(); + } + + let result = builder.build(); + println!("Object with known sizes: {:?}", result); + + // Verify the structure makes sense + assert!(result.len() > 5, "Should have reasonable size"); + + // Test that it doesn't crash when parsed + use crate::decoder::{get_basic_type, VariantBasicType}; + let header = result[0]; + assert_eq!(get_basic_type(header).unwrap(), VariantBasicType::Object); + } +} diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs index a3d2f87062ea..43515bf0a2f8 100644 --- a/parquet-variant/src/decoder.rs +++ b/parquet-variant/src/decoder.rs @@ -19,7 +19,7 @@ use std::array::TryFromSliceError; use crate::utils::{array_from_slice, first_byte_from_slice, string_from_slice}; -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone, Copy, PartialEq)] pub enum VariantBasicType { Primitive = 0, ShortString = 1, @@ -27,7 +27,7 @@ pub enum VariantBasicType { Array = 3, } -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone, Copy, PartialEq)] pub enum VariantPrimitiveType { Null = 0, BooleanTrue = 1, diff --git a/parquet-variant/src/lib.rs b/parquet-variant/src/lib.rs index 557271823bc5..7e220650d48d 100644 --- a/parquet-variant/src/lib.rs +++ b/parquet-variant/src/lib.rs @@ -34,5 +34,7 @@ mod variant; // TODO: dead code removal #[allow(dead_code)] mod utils; +mod builder; pub use variant::*; +pub use builder::*; diff --git a/parquet-variant/tests/variant_interop.rs b/parquet-variant/tests/variant_interop.rs index 617a10d63d12..6e6f88794271 100644 --- a/parquet-variant/tests/variant_interop.rs +++ b/parquet-variant/tests/variant_interop.rs @@ -24,7 +24,7 @@ use std::fs; use std::path::{Path, PathBuf}; use arrow_schema::ArrowError; -use parquet_variant::{Variant, VariantMetadata}; +use parquet_variant::{Variant, VariantMetadata, VariantBuilder}; fn cases_dir() -> PathBuf { Path::new(env!("CARGO_MANIFEST_DIR")) @@ -74,10 +74,47 @@ fn get_non_primitive_cases() -> Vec<&'static str> { fn variant_primitive() -> Result<(), ArrowError> { let cases = get_primitive_cases(); for (case, want) in cases { + // Test decoding reference data let (metadata_bytes, value) = load_case(case)?; let metadata = VariantMetadata::try_new(&metadata_bytes)?; let got = Variant::try_new(&metadata, &value)?; - assert_eq!(got, want); + assert_eq!(got, want, "Failed to decode case: {}", case); + + // Test that our builder can create equivalent data + let mut builder = VariantBuilder::new(); + + match want { + Variant::Null => { + builder.append_null(); + } + Variant::BooleanFalse => { + builder.append_bool(false); + } + Variant::BooleanTrue => { + builder.append_bool(true); + } + Variant::Int8(val) => { + builder.append_int8(val); + } + Variant::String(s) => { + builder.append_string(s); + } + Variant::ShortString(s) => { + builder.append_string(s); + } + _ => { + // Skip unsupported types for now + continue; + } + } + + let (built_metadata, built_value) = builder.finish(); + + // Decode what we built and verify it matches + let built_variant_metadata = VariantMetadata::try_new(&built_metadata)?; + let built_variant = Variant::try_new(&built_variant_metadata, &built_value)?; + + assert_eq!(built_variant, want, "Builder output doesn't match expected for case: {}", case); } Ok(()) } @@ -86,12 +123,13 @@ fn variant_primitive() -> Result<(), ArrowError> { fn variant_non_primitive() -> Result<(), ArrowError> { let cases = get_non_primitive_cases(); for case in cases { + // Test decoding reference data let (metadata, value) = load_case(case)?; let metadata = VariantMetadata::try_new(&metadata)?; let variant = Variant::try_new(&metadata, &value)?; match case { "object_primitive" => { - assert!(matches!(variant, Variant::Object(_))); + assert!(matches!(variant, Variant::Object(_)), "Expected object variant for case: {}", case); assert_eq!(metadata.dictionary_size(), 7); let dict_val = metadata.get_field_by(0)?; assert_eq!(dict_val, "int_field"); @@ -99,14 +137,65 @@ fn variant_non_primitive() -> Result<(), ArrowError> { "array_primitive" => match variant { Variant::Array(arr) => { let v = arr.get(0)?; - assert!(matches!(v, Variant::Int8(2))); + assert!(matches!(v, Variant::Int8(2)), "Expected first element to be Int8(2) for case: {}", case); let v = arr.get(1)?; - assert!(matches!(v, Variant::Int8(1))); + assert!(matches!(v, Variant::Int8(1)), "Expected second element to be Int8(1) for case: {}", case); } - _ => panic!("expected an array"), + _ => panic!("Expected an array variant for case: {}", case), }, _ => unreachable!(), } + + // Test that our builder can create equivalent data structures + let mut builder = VariantBuilder::new(); + + match case { + "object_primitive" => { + // Build an object similar to what we expect from the test data + let mut obj = builder.begin_object(); + obj.append_field("int_field", |b| b.append_int8(42)); + obj.finish(); + } + "array_primitive" => { + // Build an array similar to what we expect from the test data + let mut arr = builder.begin_array(); + arr.append_element(|b| b.append_int8(2)); + arr.append_element(|b| b.append_int8(1)); + arr.finish(); + } + _ => unreachable!(), + } + + let (built_metadata, built_value) = builder.finish(); + + // Decode what we built + let built_variant_metadata = VariantMetadata::try_new(&built_metadata)?; + let built_variant = Variant::try_new(&built_variant_metadata, &built_value)?; + + // Verify basic structure matches + match case { + "object_primitive" => { + assert!(matches!(built_variant, Variant::Object(_)), "Expected object variant for case: {}", case); + // Verify the structure is similar to the reference + if let Variant::Object(_obj) = built_variant { + // We can't compare exact metadata because field ordering might differ + // but we can verify the dictionary contains our field + assert!(built_variant_metadata.dictionary_size() > 0, "Built object should have dictionary entries"); + } + } + "array_primitive" => { + if let Variant::Array(arr) = built_variant { + // Test individual elements since len() is not implemented + let v0 = arr.get(0)?; + let v1 = arr.get(1)?; + assert!(matches!(v0, Variant::Int8(2)), "Expected first element to be Int8(2) for case: {}", case); + assert!(matches!(v1, Variant::Int8(1)), "Expected second element to be Int8(1) for case: {}", case); + } else { + panic!("Expected an array variant for case: {}", case); + } + } + _ => unreachable!(), + } } Ok(()) } From a03baa0b81a57c8ea945eed87f41a9810de6fe27 Mon Sep 17 00:00:00 2001 From: PinkCrow007 <1053603622@qq.com> Date: Thu, 12 Jun 2025 20:02:37 -0700 Subject: [PATCH 02/15] correct non-primitive --- parquet-variant/src/builder.rs | 51 ++++++++++++------------ parquet-variant/tests/variant_interop.rs | 3 +- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index eaafa3f8d4b0..e5b860633df1 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -113,35 +113,36 @@ impl VariantBuilder { } pub fn finish(self) -> (Vec, Vec) { - // Create metadata buffer with proper header - let mut metadata = Vec::new(); + let nkeys = self.dict_keys.len(); - // Write metadata header: version=1, not sorted, offset_size=1 (offset_size_minus_one=0) - // Header format: bits 7-6: offset_size_minus_one (00), bit 5: sorted (0), bits 4-0: version (1) - let header = 0x01; // version = 1, sorted = false, offset_size_minus_one = 0 - metadata.push(header); + // Calculate total size needed for metadata + let total_dict_size: usize = self.dict_keys.iter().map(|k| k.len()).sum(); + let offset_size = 1; // We use 1 byte offsets for now + let offset_start = 1 + offset_size; // Skip header and dict size + let string_start = offset_start + (nkeys + 1) * offset_size; + let metadata_size = string_start + total_dict_size; - // Write dictionary size (4 bytes little endian) - metadata.extend_from_slice(&(self.dict_keys.len() as u32).to_le_bytes()); + // Allocate the entire buffer + let mut metadata = vec![0u8; metadata_size]; - // Write dictionary offsets (for empty dict, just write [0]) - if self.dict_keys.is_empty() { - metadata.push(0); // offset 0 for empty dictionary - } else { - // Write offsets for each dictionary entry plus end offset - let mut current_offset = 0u32; - metadata.push(current_offset as u8); // start offset - - for key in &self.dict_keys { - current_offset += key.len() as u32; - metadata.push(current_offset as u8); - } - - // Write the dictionary string data - for key in &self.dict_keys { - metadata.extend_from_slice(key.as_bytes()); - } + // Write header: version=1, not sorted, offset_size=1 (offset_size_minus_one=0) + metadata[0] = 0x01; + + // Write dictionary size + metadata[1] = nkeys as u8; + + // Write offsets and string data + let mut cur_offset = 0; + for (i, key) in self.dict_keys.iter().enumerate() { + // Write offset + metadata[offset_start + i] = cur_offset as u8; + // Write string data + let start = string_start + cur_offset; + metadata[start..start + key.len()].copy_from_slice(key.as_bytes()); + cur_offset += key.len(); } + // Write final offset + metadata[offset_start + nkeys] = cur_offset as u8; (metadata, self.buffer) } diff --git a/parquet-variant/tests/variant_interop.rs b/parquet-variant/tests/variant_interop.rs index 6e6f88794271..ec036260be2a 100644 --- a/parquet-variant/tests/variant_interop.rs +++ b/parquet-variant/tests/variant_interop.rs @@ -78,9 +78,8 @@ fn variant_primitive() -> Result<(), ArrowError> { let (metadata_bytes, value) = load_case(case)?; let metadata = VariantMetadata::try_new(&metadata_bytes)?; let got = Variant::try_new(&metadata, &value)?; - assert_eq!(got, want, "Failed to decode case: {}", case); + assert_eq!(got, want); - // Test that our builder can create equivalent data let mut builder = VariantBuilder::new(); match want { From c4ffce2ae679714ed43c5f87939b83b021d9a791 Mon Sep 17 00:00:00 2001 From: PinkCrow007 <1053603622@qq.com> Date: Thu, 12 Jun 2025 20:24:16 -0700 Subject: [PATCH 03/15] unify append --- parquet-variant/src/builder.rs | 30 ++++++ parquet-variant/tests/variant_interop.rs | 116 +++++++++++------------ 2 files changed, 83 insertions(+), 63 deletions(-) diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index e5b860633df1..486f0cc098f2 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -1,5 +1,6 @@ use std::collections::HashMap; use crate::decoder::{VariantBasicType, VariantPrimitiveType}; +use crate::{Variant, VariantArray, VariantObject}; const BASIC_TYPE_BITS: u8 = 2; const MAX_SHORT_STRING_SIZE: usize = 0x3F; @@ -146,6 +147,35 @@ impl VariantBuilder { (metadata, self.buffer) } + + pub fn append<'m, 'v>(&mut self, value: &Variant<'m, 'v>) { + match value { + Variant::Null => self.append_null(), + Variant::BooleanFalse => self.append_bool(false), + Variant::BooleanTrue => self.append_bool(true), + Variant::Int8(val) => self.append_int8(*val), + Variant::String(s) => self.append_string(s), + Variant::ShortString(s) => self.append_string(s), + Variant::Array(arr) => { + let mut array_builder = self.begin_array(); + for i in 0..arr.len() { + if let Ok(v) = arr.get(i) { + array_builder.append_element(|b| b.append(&v)); + } + } + array_builder.finish(); + } + Variant::Object(obj) => { + let mut object_builder = self.begin_object(); + if let Ok(fields) = obj.fields() { + for (key, value) in fields { + object_builder.append_field(key, |b| b.append(&value)); + } + } + object_builder.finish(); + } + } + } } impl Default for VariantBuilder { diff --git a/parquet-variant/tests/variant_interop.rs b/parquet-variant/tests/variant_interop.rs index ec036260be2a..44e05dc3cfd1 100644 --- a/parquet-variant/tests/variant_interop.rs +++ b/parquet-variant/tests/variant_interop.rs @@ -74,45 +74,36 @@ fn get_non_primitive_cases() -> Vec<&'static str> { fn variant_primitive() -> Result<(), ArrowError> { let cases = get_primitive_cases(); for (case, want) in cases { - // Test decoding reference data let (metadata_bytes, value) = load_case(case)?; let metadata = VariantMetadata::try_new(&metadata_bytes)?; let got = Variant::try_new(&metadata, &value)?; assert_eq!(got, want); + } + Ok(()) +} - let mut builder = VariantBuilder::new(); - - match want { - Variant::Null => { - builder.append_null(); - } - Variant::BooleanFalse => { - builder.append_bool(false); - } - Variant::BooleanTrue => { - builder.append_bool(true); - } - Variant::Int8(val) => { - builder.append_int8(val); - } - Variant::String(s) => { - builder.append_string(s); - } - Variant::ShortString(s) => { - builder.append_string(s); - } - _ => { - // Skip unsupported types for now - continue; - } - } +#[test] +fn variant_primitive_builder() -> Result<(), ArrowError> { + let builder_cases: [(&str, fn(&mut VariantBuilder)); 6] = [ + ("primitive_boolean_false", |b: &mut VariantBuilder| b.append(&Variant::BooleanFalse)), + ("primitive_boolean_true", |b: &mut VariantBuilder| b.append(&Variant::BooleanTrue)), + ("primitive_int8", |b: &mut VariantBuilder| b.append(&Variant::Int8(42))), + ("primitive_null", |b: &mut VariantBuilder| b.append(&Variant::Null)), + ("short_string", |b: &mut VariantBuilder| b.append(&Variant::ShortString("Less than 64 bytes (❤\u{fe0f} with utf8)"))), + ("primitive_string", |b: &mut VariantBuilder| b.append(&Variant::String("This string is longer than 64 bytes and therefore does not fit in a short_string and it also includes several non ascii characters such as 🐢, 💖, ♥\u{fe0f}, 🎣 and 🤦!!"))), + ]; + for (case, build_fn) in builder_cases { + let mut builder = VariantBuilder::new(); + build_fn(&mut builder); let (built_metadata, built_value) = builder.finish(); - - // Decode what we built and verify it matches let built_variant_metadata = VariantMetadata::try_new(&built_metadata)?; let built_variant = Variant::try_new(&built_variant_metadata, &built_value)?; - + + // Load the reference data to compare against + let (metadata_bytes, value) = load_case(case)?; + let metadata = VariantMetadata::try_new(&metadata_bytes)?; + let want = Variant::try_new(&metadata, &value)?; assert_eq!(built_variant, want, "Builder output doesn't match expected for case: {}", case); } Ok(()) @@ -122,13 +113,12 @@ fn variant_primitive() -> Result<(), ArrowError> { fn variant_non_primitive() -> Result<(), ArrowError> { let cases = get_non_primitive_cases(); for case in cases { - // Test decoding reference data let (metadata, value) = load_case(case)?; let metadata = VariantMetadata::try_new(&metadata)?; let variant = Variant::try_new(&metadata, &value)?; match case { "object_primitive" => { - assert!(matches!(variant, Variant::Object(_)), "Expected object variant for case: {}", case); + assert!(matches!(variant, Variant::Object(_))); assert_eq!(metadata.dictionary_size(), 7); let dict_val = metadata.get_field_by(0)?; assert_eq!(dict_val, "int_field"); @@ -136,55 +126,55 @@ fn variant_non_primitive() -> Result<(), ArrowError> { "array_primitive" => match variant { Variant::Array(arr) => { let v = arr.get(0)?; - assert!(matches!(v, Variant::Int8(2)), "Expected first element to be Int8(2) for case: {}", case); + assert!(matches!(v, Variant::Int8(2))); let v = arr.get(1)?; - assert!(matches!(v, Variant::Int8(1)), "Expected second element to be Int8(1) for case: {}", case); + assert!(matches!(v, Variant::Int8(1))); } - _ => panic!("Expected an array variant for case: {}", case), + _ => panic!("expected an array"), }, _ => unreachable!(), } + } + Ok(()) +} - // Test that our builder can create equivalent data structures - let mut builder = VariantBuilder::new(); - - match case { - "object_primitive" => { - // Build an object similar to what we expect from the test data - let mut obj = builder.begin_object(); - obj.append_field("int_field", |b| b.append_int8(42)); - obj.finish(); - } - "array_primitive" => { - // Build an array similar to what we expect from the test data - let mut arr = builder.begin_array(); - arr.append_element(|b| b.append_int8(2)); - arr.append_element(|b| b.append_int8(1)); - arr.finish(); - } - _ => unreachable!(), - } +#[test] +fn variant_non_primitive_builder() -> Result<(), ArrowError> { + let builder_cases: [(&str, fn(&mut VariantBuilder)); 2] = [ + ("object_primitive", |b: &mut VariantBuilder| { + let mut obj = b.begin_object(); + obj.append_field("int_field", |b| b.append(&Variant::Int8(42))); + obj.finish(); + }), + ("array_primitive", |b: &mut VariantBuilder| { + let mut arr = b.begin_array(); + arr.append_element(|b| b.append(&Variant::Int8(2))); + arr.append_element(|b| b.append(&Variant::Int8(1))); + arr.finish(); + }), + ]; + for (case, build_fn) in builder_cases { + let mut builder = VariantBuilder::new(); + build_fn(&mut builder); let (built_metadata, built_value) = builder.finish(); - - // Decode what we built let built_variant_metadata = VariantMetadata::try_new(&built_metadata)?; let built_variant = Variant::try_new(&built_variant_metadata, &built_value)?; - - // Verify basic structure matches + + // Load the reference data to compare against + let (metadata, value) = load_case(case)?; + let metadata = VariantMetadata::try_new(&metadata)?; + let want = Variant::try_new(&metadata, &value)?; + match case { "object_primitive" => { assert!(matches!(built_variant, Variant::Object(_)), "Expected object variant for case: {}", case); - // Verify the structure is similar to the reference if let Variant::Object(_obj) = built_variant { - // We can't compare exact metadata because field ordering might differ - // but we can verify the dictionary contains our field assert!(built_variant_metadata.dictionary_size() > 0, "Built object should have dictionary entries"); } } "array_primitive" => { if let Variant::Array(arr) = built_variant { - // Test individual elements since len() is not implemented let v0 = arr.get(0)?; let v1 = arr.get(1)?; assert!(matches!(v0, Variant::Int8(2)), "Expected first element to be Int8(2) for case: {}", case); @@ -197,4 +187,4 @@ fn variant_non_primitive() -> Result<(), ArrowError> { } } Ok(()) -} +} \ No newline at end of file From fecca4edead0b222ce3a1403d42dcf05400f5f72 Mon Sep 17 00:00:00 2001 From: PinkCrow007 <1053603622@qq.com> Date: Thu, 12 Jun 2025 23:04:28 -0700 Subject: [PATCH 04/15] refine --- parquet-variant/src/builder.rs | 372 +++++++++++------------ parquet-variant/src/lib.rs | 4 +- parquet-variant/src/variant.rs | 8 +- parquet-variant/tests/variant_interop.rs | 95 +++--- 4 files changed, 223 insertions(+), 256 deletions(-) diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index 486f0cc098f2..eafa38d32010 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -1,6 +1,6 @@ -use std::collections::HashMap; use crate::decoder::{VariantBasicType, VariantPrimitiveType}; -use crate::{Variant, VariantArray, VariantObject}; +use crate::Variant; +use std::collections::HashMap; const BASIC_TYPE_BITS: u8 = 2; const MAX_SHORT_STRING_SIZE: usize = 0x3F; @@ -15,17 +15,17 @@ pub fn short_string_header(len: usize) -> u8 { pub fn array_header(large: bool, offset_size: u8) -> u8 { let large_bit = if large { 1 } else { 0 }; - (large_bit << (BASIC_TYPE_BITS + 2)) | - ((offset_size - 1) << BASIC_TYPE_BITS) | - VariantBasicType::Array as u8 + (large_bit << (BASIC_TYPE_BITS + 2)) + | ((offset_size - 1) << BASIC_TYPE_BITS) + | VariantBasicType::Array as u8 } pub fn object_header(large: bool, id_size: u8, offset_size: u8) -> u8 { let large_bit = if large { 1 } else { 0 }; - (large_bit << (BASIC_TYPE_BITS + 4)) | - ((id_size - 1) << (BASIC_TYPE_BITS + 2)) | - ((offset_size - 1) << BASIC_TYPE_BITS) | - VariantBasicType::Object as u8 + (large_bit << (BASIC_TYPE_BITS + 4)) + | ((id_size - 1) << (BASIC_TYPE_BITS + 2)) + | ((offset_size - 1) << BASIC_TYPE_BITS) + | VariantBasicType::Object as u8 } fn int_size(v: usize) -> u8 { @@ -37,12 +37,25 @@ fn int_size(v: usize) -> u8 { } } +/// Write little-endian integer to buffer fn write_offset(buf: &mut [u8], value: usize, nbytes: u8) { for i in 0..nbytes { buf[i as usize] = ((value >> (i * 8)) & 0xFF) as u8; } } +/// Helper to make room for header by moving data +fn make_room_for_header(buffer: &mut Vec, start_pos: usize, header_size: usize) { + let current_len = buffer.len(); + buffer.resize(current_len + header_size, 0); + + let src_start = start_pos; + let src_end = current_len; + let dst_start = start_pos + header_size; + + buffer.copy_within(src_start..src_end, dst_start); +} + pub struct VariantBuilder { buffer: Vec, dict: HashMap, @@ -58,11 +71,12 @@ impl VariantBuilder { } } - pub fn append_null(&mut self) { - self.buffer.push(primitive_header(VariantPrimitiveType::Null)); + fn append_null(&mut self) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Null)); } - pub fn append_bool(&mut self, value: bool) { + fn append_bool(&mut self, value: bool) { let primitive_type = if value { VariantPrimitiveType::BooleanTrue } else { @@ -71,108 +85,108 @@ impl VariantBuilder { self.buffer.push(primitive_header(primitive_type)); } - pub fn append_int8(&mut self, value: i8) { - self.buffer.push(primitive_header(VariantPrimitiveType::Int8)); + fn append_int8(&mut self, value: i8) { + self.buffer + .push(primitive_header(VariantPrimitiveType::Int8)); self.buffer.push(value as u8); } - pub fn append_string(&mut self, value: &str) { + fn append_string(&mut self, value: &str) { if value.len() <= MAX_SHORT_STRING_SIZE { self.buffer.push(short_string_header(value.len())); self.buffer.extend_from_slice(value.as_bytes()); } else { - self.buffer.push(primitive_header(VariantPrimitiveType::String)); - self.buffer.extend_from_slice(&(value.len() as u32).to_le_bytes()); + self.buffer + .push(primitive_header(VariantPrimitiveType::String)); + self.buffer + .extend_from_slice(&(value.len() as u32).to_le_bytes()); self.buffer.extend_from_slice(value.as_bytes()); } } - pub fn add_key(&mut self, key: &str) -> u32 { - if let Some(&id) = self.dict.get(key) { - return id; + /// Add key to dictionary, return its ID + fn add_key(&mut self, key: &str) -> u32 { + use std::collections::hash_map::Entry; + match self.dict.entry(key.to_string()) { + Entry::Occupied(entry) => *entry.get(), + Entry::Vacant(entry) => { + let id = self.dict_keys.len() as u32; + entry.insert(id); + self.dict_keys.push(key.to_string()); + id + } } - let id = self.dict_keys.len() as u32; - self.dict.insert(key.to_string(), id); - self.dict_keys.push(key.to_string()); - id } - pub fn offset(&self) -> usize { + fn offset(&self) -> usize { self.buffer.len() } - pub fn begin_array(&mut self) -> ArrayBuilder { + pub fn new_array(&mut self) -> ArrayBuilder { ArrayBuilder::new(self) } - pub fn begin_object(&mut self) -> ObjectBuilder { + pub fn new_object(&mut self) -> ObjectBuilder { ObjectBuilder::new(self) } - pub fn build(self) -> Vec { - self.buffer - } - pub fn finish(self) -> (Vec, Vec) { let nkeys = self.dict_keys.len(); - - // Calculate total size needed for metadata + + // Calculate metadata size let total_dict_size: usize = self.dict_keys.iter().map(|k| k.len()).sum(); - let offset_size = 1; // We use 1 byte offsets for now - let offset_start = 1 + offset_size; // Skip header and dict size - let string_start = offset_start + (nkeys + 1) * offset_size; + + // Determine appropriate offset size based on the larger of dict size or total string size + let max_offset = std::cmp::max(total_dict_size, nkeys); + let offset_size = int_size(max_offset); + + let offset_start = 1 + offset_size as usize; + let string_start = offset_start + (nkeys + 1) * offset_size as usize; let metadata_size = string_start + total_dict_size; - - // Allocate the entire buffer - let mut metadata = vec![0u8; metadata_size]; - - // Write header: version=1, not sorted, offset_size=1 (offset_size_minus_one=0) - metadata[0] = 0x01; - + + // Pre-allocate exact size to avoid reallocations + let mut metadata = Vec::with_capacity(metadata_size); + metadata.resize(metadata_size, 0); + + // Write header: version=1, not sorted, with calculated offset_size + metadata[0] = 0x01 | ((offset_size - 1) << 6); + // Write dictionary size - metadata[1] = nkeys as u8; - + write_offset(&mut metadata[1..], nkeys, offset_size); + // Write offsets and string data let mut cur_offset = 0; for (i, key) in self.dict_keys.iter().enumerate() { - // Write offset - metadata[offset_start + i] = cur_offset as u8; - // Write string data + write_offset( + &mut metadata[offset_start + i * offset_size as usize..], + cur_offset, + offset_size, + ); let start = string_start + cur_offset; metadata[start..start + key.len()].copy_from_slice(key.as_bytes()); cur_offset += key.len(); } // Write final offset - metadata[offset_start + nkeys] = cur_offset as u8; - + write_offset( + &mut metadata[offset_start + nkeys * offset_size as usize..], + cur_offset, + offset_size, + ); + (metadata, self.buffer) } - pub fn append<'m, 'v>(&mut self, value: &Variant<'m, 'v>) { - match value { + pub fn append_value>>(&mut self, value: T) { + let variant = value.into(); + match variant { Variant::Null => self.append_null(), - Variant::BooleanFalse => self.append_bool(false), Variant::BooleanTrue => self.append_bool(true), - Variant::Int8(val) => self.append_int8(*val), - Variant::String(s) => self.append_string(s), - Variant::ShortString(s) => self.append_string(s), - Variant::Array(arr) => { - let mut array_builder = self.begin_array(); - for i in 0..arr.len() { - if let Ok(v) = arr.get(i) { - array_builder.append_element(|b| b.append(&v)); - } - } - array_builder.finish(); - } - Variant::Object(obj) => { - let mut object_builder = self.begin_object(); - if let Ok(fields) = obj.fields() { - for (key, value) in fields { - object_builder.append_field(key, |b| b.append(&value)); - } - } - object_builder.finish(); + Variant::BooleanFalse => self.append_bool(false), + Variant::Int8(v) => self.append_int8(v), + Variant::String(s) | Variant::ShortString(s) => self.append_string(s), + // TODO: Add types for the rest of primitives + Variant::Object(_) | Variant::Array(_) => { + unreachable!("Object and Array variants cannot be created through Into") } } } @@ -200,10 +214,8 @@ impl<'a> ArrayBuilder<'a> { } } - pub fn append_element(&mut self, f: F) - where F: FnOnce(&mut VariantBuilder) - { - f(self.parent); + pub fn append_value>>(&mut self, value: T) { + self.parent.append_value(value); let element_end = self.parent.offset() - self.start_pos; self.offsets.push(element_end); } @@ -216,15 +228,9 @@ impl<'a> ArrayBuilder<'a> { let offset_size = int_size(data_size); let header_size = 1 + size_bytes + (num_elements + 1) * offset_size as usize; - let current_len = self.parent.buffer.len(); - self.parent.buffer.resize(current_len + header_size, 0); - - let src_start = self.start_pos; - let src_end = current_len; - let dst_start = self.start_pos + header_size; - - self.parent.buffer.copy_within(src_start..src_end, dst_start); + make_room_for_header(&mut self.parent.buffer, self.start_pos, header_size); + // Write header let mut pos = self.start_pos; self.parent.buffer[pos] = array_header(is_large, offset_size); pos += 1; @@ -237,8 +243,13 @@ impl<'a> ArrayBuilder<'a> { pos += 1; } + // Write offsets for offset in &self.offsets { - write_offset(&mut self.parent.buffer[pos..pos + offset_size as usize], *offset, offset_size); + write_offset( + &mut self.parent.buffer[pos..pos + offset_size as usize], + *offset, + offset_size, + ); pos += offset_size as usize; } } @@ -247,7 +258,7 @@ impl<'a> ArrayBuilder<'a> { pub struct ObjectBuilder<'a> { parent: &'a mut VariantBuilder, start_pos: usize, - fields: Vec<(u32, usize)>, + fields: Vec<(u32, usize)>, // (field_id, offset) } impl<'a> ObjectBuilder<'a> { @@ -260,38 +271,40 @@ impl<'a> ObjectBuilder<'a> { } } - pub fn append_field(&mut self, key: &str, f: F) - where F: FnOnce(&mut VariantBuilder) - { + /// Add a field with key and value to the object + pub fn append_value>>(&mut self, key: &str, value: T) { let id = self.parent.add_key(key); let field_start = self.parent.offset() - self.start_pos; - f(self.parent); + self.parent.append_value(value); self.fields.push((id, field_start)); } + /// Finalize object with sorted fields pub fn finish(mut self) { - self.fields.sort_by_key(|&(id, _)| id); + // Sort fields by key name + self.fields.sort_by(|a, b| { + let key_a = &self.parent.dict_keys[a.0 as usize]; + let key_b = &self.parent.dict_keys[b.0 as usize]; + key_a.cmp(key_b) + }); let data_size = self.parent.offset() - self.start_pos; let num_fields = self.fields.len(); let is_large = num_fields > u8::MAX as usize; let size_bytes = if is_large { 4 } else { 1 }; - + let max_id = self.fields.iter().map(|&(id, _)| id).max().unwrap_or(0); let id_size = int_size(max_id as usize); let offset_size = int_size(data_size); - - let header_size = 1 + size_bytes + num_fields * id_size as usize + (num_fields + 1) * offset_size as usize; - - let current_len = self.parent.buffer.len(); - self.parent.buffer.resize(current_len + header_size, 0); - - let src_start = self.start_pos; - let src_end = current_len; - let dst_start = self.start_pos + header_size; - - self.parent.buffer.copy_within(src_start..src_end, dst_start); + let header_size = 1 + + size_bytes + + num_fields * id_size as usize + + (num_fields + 1) * offset_size as usize; + + make_room_for_header(&mut self.parent.buffer, self.start_pos, header_size); + + // Write header let mut pos = self.start_pos; self.parent.buffer[pos] = object_header(is_large, id_size, offset_size); pos += 1; @@ -304,16 +317,30 @@ impl<'a> ObjectBuilder<'a> { pos += 1; } + // Write field IDs (sorted order) for &(id, _) in &self.fields { - write_offset(&mut self.parent.buffer[pos..pos + id_size as usize], id as usize, id_size); + write_offset( + &mut self.parent.buffer[pos..pos + id_size as usize], + id as usize, + id_size, + ); pos += id_size as usize; } + // Write field offsets for &(_, offset) in &self.fields { - write_offset(&mut self.parent.buffer[pos..pos + offset_size as usize], offset, offset_size); + write_offset( + &mut self.parent.buffer[pos..pos + offset_size as usize], + offset, + offset_size, + ); pos += offset_size as usize; } - write_offset(&mut self.parent.buffer[pos..pos + offset_size as usize], data_size, offset_size); + write_offset( + &mut self.parent.buffer[pos..pos + offset_size as usize], + data_size, + offset_size, + ); } } @@ -324,117 +351,74 @@ mod tests { #[test] fn test_simple_usage() { let mut builder = VariantBuilder::new(); - - builder.append_null(); - builder.append_bool(true); - builder.append_int8(42); - builder.append_string("hello"); - - let result = builder.build(); - assert!(!result.is_empty()); + + builder.append_value(()); + builder.append_value(true); + builder.append_value(42i8); + builder.append_value("hello"); + + let (metadata, value) = builder.finish(); + assert!(!metadata.is_empty()); + assert!(!value.is_empty()); } #[test] fn test_array() { let mut builder = VariantBuilder::new(); - + { - let mut array = builder.begin_array(); - array.append_element(|b| b.append_int8(1)); - array.append_element(|b| b.append_int8(2)); - array.append_element(|b| b.append_string("test")); + let mut array = builder.new_array(); + array.append_value(1i8); + array.append_value(2i8); + array.append_value("test"); array.finish(); } - - let result = builder.build(); - assert!(!result.is_empty()); + + let (metadata, value) = builder.finish(); + assert!(!metadata.is_empty()); + assert!(!value.is_empty()); } #[test] fn test_object() { let mut builder = VariantBuilder::new(); - + { - let mut obj = builder.begin_object(); - obj.append_field("name", |b| b.append_string("John")); - obj.append_field("age", |b| b.append_int8(30)); + let mut obj = builder.new_object(); + obj.append_value("name", "John"); + obj.append_value("age", 42i8); obj.finish(); } - - let result = builder.build(); - assert!(!result.is_empty()); - } - #[test] - fn test_compatibility() { - use crate::decoder::{decode_int8, decode_short_string, get_basic_type, get_primitive_type}; - - let mut builder = VariantBuilder::new(); - builder.append_int8(42); - let result = builder.build(); - - let header = result[0]; - assert_eq!(get_basic_type(header).unwrap(), VariantBasicType::Primitive); - assert_eq!(get_primitive_type(header).unwrap(), VariantPrimitiveType::Int8); - assert_eq!(decode_int8(&result).unwrap(), 42); - - let mut builder = VariantBuilder::new(); - builder.append_string("Hello"); - let result = builder.build(); - - let header = result[0]; - assert_eq!(get_basic_type(header).unwrap(), VariantBasicType::ShortString); - assert_eq!(decode_short_string(&result).unwrap(), "Hello"); + let (metadata, value) = builder.finish(); + assert!(!metadata.is_empty()); + assert!(!value.is_empty()); } #[test] - fn test_object_structure() { + fn test_object_field_ordering() { let mut builder = VariantBuilder::new(); - - { - let mut obj = builder.begin_object(); - obj.append_field("a", |b| b.append_int8(1)); - obj.append_field("b", |b| b.append_int8(2)); - obj.finish(); - } - - let result = builder.build(); - - // Print the byte structure for debugging - println!("Object bytes: {:?}", result); - - // Basic sanity check - should have more than just the header - assert!(result.len() > 10, "Object should have substantial size"); - - // Verify it can be parsed by the decoder - use crate::decoder::{get_basic_type, VariantBasicType}; - let header = result[0]; - assert_eq!(get_basic_type(header).unwrap(), VariantBasicType::Object); - } - #[test] - fn test_object_offset_correctness() { - // Test with known field sizes to verify offset calculation - let mut builder = VariantBuilder::new(); - { - let mut obj = builder.begin_object(); - // Field "x": int8 = 2 bytes (1 header + 1 value) - obj.append_field("x", |b| b.append_int8(42)); - // Field "y": string "hi" = 3 bytes (1 header + 2 chars) - obj.append_field("y", |b| b.append_string("hi")); + let mut obj = builder.new_object(); + obj.append_value("zebra", "stripes"); // ID = 0 + obj.append_value("apple", "red"); // ID = 1 + obj.append_value("banana", "yellow"); // ID = 2 obj.finish(); } - - let result = builder.build(); - println!("Object with known sizes: {:?}", result); - - // Verify the structure makes sense - assert!(result.len() > 5, "Should have reasonable size"); - - // Test that it doesn't crash when parsed - use crate::decoder::{get_basic_type, VariantBasicType}; - let header = result[0]; - assert_eq!(get_basic_type(header).unwrap(), VariantBasicType::Object); + + let (_, value) = builder.finish(); + + let header = value[0]; + assert_eq!(header & 0x03, VariantBasicType::Object as u8); + + let field_count = value[1] as usize; + assert_eq!(field_count, 3); + + // Get field IDs from the object header + let field_ids: Vec = value[2..5].to_vec(); + + // apple(1), banana(2), zebra(0) + assert_eq!(field_ids, vec![1, 2, 0]); } } diff --git a/parquet-variant/src/lib.rs b/parquet-variant/src/lib.rs index 7e220650d48d..00a8a69aff99 100644 --- a/parquet-variant/src/lib.rs +++ b/parquet-variant/src/lib.rs @@ -32,9 +32,9 @@ mod decoder; mod variant; // TODO: dead code removal +mod builder; #[allow(dead_code)] mod utils; -mod builder; -pub use variant::*; pub use builder::*; +pub use variant::*; diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index b7b1932580b1..712e8e4ca597 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -502,6 +502,12 @@ impl<'m, 'v> From<&'v str> for Variant<'m, 'v> { } } +impl<'m, 'v> From<()> for Variant<'m, 'v> { + fn from(_: ()) -> Self { + Variant::Null + } +} + #[cfg(test)] mod tests { use super::*; @@ -649,7 +655,7 @@ mod tests { } /// Too short buffer test (missing one required offset). - /// Should error with “metadata shorter than dictionary_size implies”. + /// Should error with "metadata shorter than dictionary_size implies". #[test] fn try_new_missing_last_value() { let bytes = &[ diff --git a/parquet-variant/tests/variant_interop.rs b/parquet-variant/tests/variant_interop.rs index 44e05dc3cfd1..615e86983b04 100644 --- a/parquet-variant/tests/variant_interop.rs +++ b/parquet-variant/tests/variant_interop.rs @@ -24,7 +24,7 @@ use std::fs; use std::path::{Path, PathBuf}; use arrow_schema::ArrowError; -use parquet_variant::{Variant, VariantMetadata, VariantBuilder}; +use parquet_variant::{Variant, VariantBuilder, VariantMetadata}; fn cases_dir() -> PathBuf { Path::new(env!("CARGO_MANIFEST_DIR")) @@ -85,26 +85,34 @@ fn variant_primitive() -> Result<(), ArrowError> { #[test] fn variant_primitive_builder() -> Result<(), ArrowError> { let builder_cases: [(&str, fn(&mut VariantBuilder)); 6] = [ - ("primitive_boolean_false", |b: &mut VariantBuilder| b.append(&Variant::BooleanFalse)), - ("primitive_boolean_true", |b: &mut VariantBuilder| b.append(&Variant::BooleanTrue)), - ("primitive_int8", |b: &mut VariantBuilder| b.append(&Variant::Int8(42))), - ("primitive_null", |b: &mut VariantBuilder| b.append(&Variant::Null)), - ("short_string", |b: &mut VariantBuilder| b.append(&Variant::ShortString("Less than 64 bytes (❤\u{fe0f} with utf8)"))), - ("primitive_string", |b: &mut VariantBuilder| b.append(&Variant::String("This string is longer than 64 bytes and therefore does not fit in a short_string and it also includes several non ascii characters such as 🐢, 💖, ♥\u{fe0f}, 🎣 and 🤦!!"))), + ("primitive_boolean_false", |b: &mut VariantBuilder| { + b.append_value(false) + }), + ("primitive_boolean_true", |b: &mut VariantBuilder| { + b.append_value(true) + }), + ("primitive_int8", |b: &mut VariantBuilder| { + b.append_value(42i8) + }), + ("primitive_null", |b: &mut VariantBuilder| { + b.append_value(()) + }), + ("short_string", |b: &mut VariantBuilder| { + b.append_value("Less than 64 bytes (❤\u{fe0f} with utf8)") + }), + ("primitive_string", |b: &mut VariantBuilder| { + b.append_value("This string is longer than 64 bytes and therefore does not fit in a short_string and it also includes several non ascii characters such as 🐢, 💖, ♥\u{fe0f}, 🎣 and 🤦!!") + }), ]; for (case, build_fn) in builder_cases { let mut builder = VariantBuilder::new(); build_fn(&mut builder); let (built_metadata, built_value) = builder.finish(); - let built_variant_metadata = VariantMetadata::try_new(&built_metadata)?; - let built_variant = Variant::try_new(&built_variant_metadata, &built_value)?; + let (expected_metadata, expected_value) = load_case(case)?; - // Load the reference data to compare against - let (metadata_bytes, value) = load_case(case)?; - let metadata = VariantMetadata::try_new(&metadata_bytes)?; - let want = Variant::try_new(&metadata, &value)?; - assert_eq!(built_variant, want, "Builder output doesn't match expected for case: {}", case); + assert_eq!(built_metadata, expected_metadata); + assert_eq!(built_value, expected_value); } Ok(()) } @@ -139,52 +147,21 @@ fn variant_non_primitive() -> Result<(), ArrowError> { } #[test] -fn variant_non_primitive_builder() -> Result<(), ArrowError> { - let builder_cases: [(&str, fn(&mut VariantBuilder)); 2] = [ - ("object_primitive", |b: &mut VariantBuilder| { - let mut obj = b.begin_object(); - obj.append_field("int_field", |b| b.append(&Variant::Int8(42))); - obj.finish(); - }), - ("array_primitive", |b: &mut VariantBuilder| { - let mut arr = b.begin_array(); - arr.append_element(|b| b.append(&Variant::Int8(2))); - arr.append_element(|b| b.append(&Variant::Int8(1))); - arr.finish(); - }), - ]; +fn variant_array_builder() -> Result<(), ArrowError> { + let mut builder = VariantBuilder::new(); - for (case, build_fn) in builder_cases { - let mut builder = VariantBuilder::new(); - build_fn(&mut builder); - let (built_metadata, built_value) = builder.finish(); - let built_variant_metadata = VariantMetadata::try_new(&built_metadata)?; - let built_variant = Variant::try_new(&built_variant_metadata, &built_value)?; + let mut arr = builder.new_array(); + arr.append_value(2); + arr.append_value(1); + arr.append_value(5); + arr.append_value(9); + arr.finish(); - // Load the reference data to compare against - let (metadata, value) = load_case(case)?; - let metadata = VariantMetadata::try_new(&metadata)?; - let want = Variant::try_new(&metadata, &value)?; + let (built_metadata, built_value) = builder.finish(); + let (expected_metadata, expected_value) = load_case("array_primitive")?; + + assert_eq!(built_metadata, expected_metadata); + assert_eq!(built_value, expected_value); - match case { - "object_primitive" => { - assert!(matches!(built_variant, Variant::Object(_)), "Expected object variant for case: {}", case); - if let Variant::Object(_obj) = built_variant { - assert!(built_variant_metadata.dictionary_size() > 0, "Built object should have dictionary entries"); - } - } - "array_primitive" => { - if let Variant::Array(arr) = built_variant { - let v0 = arr.get(0)?; - let v1 = arr.get(1)?; - assert!(matches!(v0, Variant::Int8(2)), "Expected first element to be Int8(2) for case: {}", case); - assert!(matches!(v1, Variant::Int8(1)), "Expected second element to be Int8(1) for case: {}", case); - } else { - panic!("Expected an array variant for case: {}", case); - } - } - _ => unreachable!(), - } - } Ok(()) -} \ No newline at end of file +} From ff2f23fb4c8d39de7fffd7071b4dc0209a854b08 Mon Sep 17 00:00:00 2001 From: PinkCrow007 <1053603622@qq.com> Date: Fri, 13 Jun 2025 08:16:11 -0700 Subject: [PATCH 05/15] minor fix --- parquet-variant/src/builder.rs | 19 +++++++++++++++++-- parquet-variant/tests/variant_interop.rs | 5 ++++- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index eafa38d32010..233738da32a5 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -1,3 +1,19 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. use crate::decoder::{VariantBasicType, VariantPrimitiveType}; use crate::Variant; use std::collections::HashMap; @@ -145,8 +161,7 @@ impl VariantBuilder { let metadata_size = string_start + total_dict_size; // Pre-allocate exact size to avoid reallocations - let mut metadata = Vec::with_capacity(metadata_size); - metadata.resize(metadata_size, 0); + let mut metadata = vec![0u8; metadata_size]; // Write header: version=1, not sorted, with calculated offset_size metadata[0] = 0x01 | ((offset_size - 1) << 6); diff --git a/parquet-variant/tests/variant_interop.rs b/parquet-variant/tests/variant_interop.rs index 615e86983b04..11626d3214b5 100644 --- a/parquet-variant/tests/variant_interop.rs +++ b/parquet-variant/tests/variant_interop.rs @@ -26,6 +26,9 @@ use std::path::{Path, PathBuf}; use arrow_schema::ArrowError; use parquet_variant::{Variant, VariantBuilder, VariantMetadata}; +/// Type alias for builder test cases to improve readability +type BuilderTestFn = fn(&mut VariantBuilder); + fn cases_dir() -> PathBuf { Path::new(env!("CARGO_MANIFEST_DIR")) .join("..") @@ -84,7 +87,7 @@ fn variant_primitive() -> Result<(), ArrowError> { #[test] fn variant_primitive_builder() -> Result<(), ArrowError> { - let builder_cases: [(&str, fn(&mut VariantBuilder)); 6] = [ + let builder_cases: [(&'static str, BuilderTestFn); 6] = [ ("primitive_boolean_false", |b: &mut VariantBuilder| { b.append_value(false) }), From 37cecc5ef1b4bf37989872e253e8f1a17745ef5e Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 13 Jun 2025 12:48:13 -0400 Subject: [PATCH 06/15] Add documentation and examples to VariantBuilder --- parquet-variant/src/builder.rs | 104 +++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index eafa38d32010..760d6eced744 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -56,6 +56,98 @@ fn make_room_for_header(buffer: &mut Vec, start_pos: usize, header_size: usi buffer.copy_within(src_start..src_end, dst_start); } +/// Builder for [`Variant`] values +/// +/// # Example: create a Primitive Int8 +/// ``` +/// # use parquet_variant::{Variant, VariantBuilder, VariantMetadata}; +/// let mut builder = VariantBuilder::new(); +/// builder.append_value(Variant::Int8(42)); +/// // Finish the builder to get the metadata and value +/// let (metadata, value) = builder.finish(); +/// // use the Variant API to verify the result +/// let metadata = VariantMetadata::try_new(&metadata).unwrap(); +/// let variant = Variant::try_new(&metadata, &value).unwrap(); +/// assert_eq!(variant, Variant::Int8(42)); +/// ``` +/// +/// # Example: Create an Object +/// This example shows how to create an object with two fields: +/// ```json +/// { +/// "first_name": "Jiaying", +/// "last_name": "Li" +/// } +/// ``` +/// +/// ``` +/// # use parquet_variant::{Variant, VariantBuilder, VariantMetadata}; +/// let mut builder = VariantBuilder::new(); +/// // Create an object builder that will write fields to the object +/// let mut object_builder = builder.new_object(); +/// object_builder.append_value("first_name", "Jiaying"); +/// object_builder.append_value("last_name", "Li"); +/// object_builder.finish(); +/// // Finish the builder to get the metadata and value +/// let (metadata, value) = builder.finish(); +/// // use the Variant API to verify the result +/// let metadata = VariantMetadata::try_new(&metadata).unwrap(); +/// let variant = Variant::try_new(&metadata, &value).unwrap(); +/// let Variant::Object(variant_object) = variant else { +/// panic!("unexpected variant type") +/// }; +/// /* TODO: uncomment this, but now VariantObject:field is not implemented +/// assert_eq!( +/// variant_object.field("first_name").unwrap(), +/// Variant::String("Jiaying") +/// ); +/// assert_eq!( +/// variant_object.field("last_name").unwrap(), +/// Variant::String("Li") +/// ); +/// */ +/// ``` +/// +/// # Example: Create an Array +/// +/// This example shows how to create an array of integers: `[1, 2, 3]`. +/// (this test actually fails at the moment) +/// ``` +/// # use parquet_variant::{Variant, VariantBuilder, VariantMetadata}; +/// let mut builder = VariantBuilder::new(); +/// // Create an array builder that will write elements to the array +/// let mut array_builder = builder.new_array(); +/// array_builder.append_value(1i8); +/// array_builder.append_value(2i8); +/// array_builder.append_value(3i8); +/// // Finish the builder to get the metadata and value +/// let (metadata, value) = builder.finish(); +/// // use the Variant API to verify the result +/// let metadata = VariantMetadata::try_new(&metadata).unwrap(); +/// let variant = Variant::try_new(&metadata, &value).unwrap(); +/// let Variant::Object(variant_object) = variant else { +/// panic!("unexpected variant type") +/// }; +/// // TODO: VERIFY THE RESULT this, but now VariantObject:field is not implemented +/// ``` +/// +/// # Example: Array of objects +/// +/// THis example shows how to create an array of objects: +/// ```json +/// [ +/// { +/// "first_name": "Jiaying", +/// "last_name": "Li" +/// }, +/// { +/// "first_name": "Malthe", +/// "last_name": "Karbo" +/// } +/// ``` +/// +/// TODO +/// pub struct VariantBuilder { buffer: Vec, dict: HashMap, @@ -122,10 +214,16 @@ impl VariantBuilder { self.buffer.len() } + /// Create an [`ArrayBuilder`] for creating [`Variant::Array`] values. + /// + /// See the examples on [`VariantBuilder`] for usage. pub fn new_array(&mut self) -> ArrayBuilder { ArrayBuilder::new(self) } + /// Create an [`ObjectBuilder`] for creating [`Variant::Object`] values. + /// + /// See the examples on [`VariantBuilder`] for usage. pub fn new_object(&mut self) -> ObjectBuilder { ObjectBuilder::new(self) } @@ -198,6 +296,9 @@ impl Default for VariantBuilder { } } +/// A builder for creating [`Variant::Array`] values. +/// +/// See the examples on [`VariantBuilder`] for usage. pub struct ArrayBuilder<'a> { parent: &'a mut VariantBuilder, start_pos: usize, @@ -255,6 +356,9 @@ impl<'a> ArrayBuilder<'a> { } } +/// A builder for creating [`Variant::Object`] values. +/// +/// See the examples on [`VariantBuilder`] for usage. pub struct ObjectBuilder<'a> { parent: &'a mut VariantBuilder, start_pos: usize, From c58df812f7e02c68b0b91ba299c5ee5b1ba16fef Mon Sep 17 00:00:00 2001 From: PinkCrow007 <1053603622@qq.com> Date: Fri, 13 Jun 2025 17:42:55 -0700 Subject: [PATCH 07/15] fix --- parquet-variant/src/builder.rs | 27 +++++++++++++++--------- parquet-variant/tests/variant_interop.rs | 14 ++++++------ 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index af40c6dec6a7..04d0f32c37e0 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -127,7 +127,6 @@ fn make_room_for_header(buffer: &mut Vec, start_pos: usize, header_size: usi /// # Example: Create an Array /// /// This example shows how to create an array of integers: `[1, 2, 3]`. -/// (this test actually fails at the moment) /// ``` /// # use parquet_variant::{Variant, VariantBuilder, VariantMetadata}; /// let mut builder = VariantBuilder::new(); @@ -136,15 +135,19 @@ fn make_room_for_header(buffer: &mut Vec, start_pos: usize, header_size: usi /// array_builder.append_value(1i8); /// array_builder.append_value(2i8); /// array_builder.append_value(3i8); +/// array_builder.finish(); /// // Finish the builder to get the metadata and value /// let (metadata, value) = builder.finish(); /// // use the Variant API to verify the result /// let metadata = VariantMetadata::try_new(&metadata).unwrap(); /// let variant = Variant::try_new(&metadata, &value).unwrap(); -/// let Variant::Object(variant_object) = variant else { +/// let Variant::Array(variant_array) = variant else { /// panic!("unexpected variant type") /// }; -/// // TODO: VERIFY THE RESULT this, but now VariantObject:field is not implemented +/// // Verify the array contents +/// assert_eq!(variant_array.get(0).unwrap(), Variant::Int8(1)); +/// assert_eq!(variant_array.get(1).unwrap(), Variant::Int8(2)); +/// assert_eq!(variant_array.get(2).unwrap(), Variant::Int8(3)); /// ``` /// /// # Example: Array of objects @@ -232,8 +235,11 @@ impl VariantBuilder { fn append_date(&mut self, value: chrono::NaiveDate) { self.buffer .push(primitive_header(VariantPrimitiveType::Date)); - let days_since_epoch = value.signed_duration_since(chrono::NaiveDate::from_ymd_opt(1970, 1, 1).unwrap()).num_days() as i32; - self.buffer.extend_from_slice(&days_since_epoch.to_le_bytes()); + let days_since_epoch = value + .signed_duration_since(chrono::NaiveDate::from_ymd_opt(1970, 1, 1).unwrap()) + .num_days() as i32; + self.buffer + .extend_from_slice(&days_since_epoch.to_le_bytes()); } fn append_timestamp_micros(&mut self, value: chrono::DateTime) { @@ -274,7 +280,8 @@ impl VariantBuilder { fn append_binary(&mut self, value: &[u8]) { self.buffer .push(primitive_header(VariantPrimitiveType::Binary)); - self.buffer.extend_from_slice(&(value.len() as u32).to_le_bytes()); + self.buffer + .extend_from_slice(&(value.len() as u32).to_le_bytes()); self.buffer.extend_from_slice(value); } @@ -686,18 +693,18 @@ mod tests { let (metadata, value) = builder.finish(); assert!(!metadata.is_empty()); assert!(!value.is_empty()); - + let metadata = VariantMetadata::try_new(&metadata).unwrap(); let variant = Variant::try_new(&metadata, &value).unwrap(); - + match variant { Variant::Array(array) => { let val0 = array.get(0).unwrap(); assert_eq!(val0, Variant::Int8(1)); - + let val1 = array.get(1).unwrap(); assert_eq!(val1, Variant::Int8(2)); - + let val2 = array.get(2).unwrap(); assert_eq!(val2, Variant::ShortString("test")); } diff --git a/parquet-variant/tests/variant_interop.rs b/parquet-variant/tests/variant_interop.rs index 8e82eebe7ef5..e759090948eb 100644 --- a/parquet-variant/tests/variant_interop.rs +++ b/parquet-variant/tests/variant_interop.rs @@ -24,8 +24,8 @@ use std::fs; use std::path::{Path, PathBuf}; use arrow_schema::ArrowError; -use parquet_variant::{Variant, VariantBuilder, VariantMetadata}; use chrono::NaiveDate; +use parquet_variant::{Variant, VariantBuilder, VariantMetadata}; type BuilderTestFn = fn(&mut VariantBuilder); fn cases_dir() -> PathBuf { @@ -172,10 +172,10 @@ fn variant_array_builder() -> Result<(), ArrowError> { #[test] fn variant_object_builder() -> Result<(), ArrowError> { let mut builder = VariantBuilder::new(); - + let mut obj = builder.new_object(); obj.append_value("int_field", 1i8); - + // The double field is actually encoded as decimal4 with scale 8 // Value: 123456789, Scale: 8 -> 1.23456789 obj.append_value("double_field", (123456789i32, 8u8)); @@ -184,14 +184,14 @@ fn variant_object_builder() -> Result<(), ArrowError> { obj.append_value("string_field", "Apache Parquet"); obj.append_value("null_field", ()); obj.append_value("timestamp_field", "2025-04-16T12:34:56.78"); - + obj.finish(); - + let (built_metadata, built_value) = builder.finish(); let (expected_metadata, expected_value) = load_case("object_primitive")?; - + assert_eq!(built_metadata, expected_metadata); assert_eq!(built_value, expected_value); - + Ok(()) } From 9625962d617070ea098da238cd432eb9187e84c9 Mon Sep 17 00:00:00 2001 From: PinkCrow007 <1053603622@qq.com> Date: Fri, 13 Jun 2025 17:48:12 -0700 Subject: [PATCH 08/15] minor --- parquet-variant/src/builder.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index 04d0f32c37e0..3f49b134a315 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -632,20 +632,20 @@ mod tests { { let mut builder = VariantBuilder::new(); - builder.append_value(3.14f32); + builder.append_value(1.5f32); let (metadata, value) = builder.finish(); let metadata = VariantMetadata::try_new(&metadata).unwrap(); let variant = Variant::try_new(&metadata, &value).unwrap(); - assert_eq!(variant, Variant::Float(3.14)); + assert_eq!(variant, Variant::Float(1.5)); } { let mut builder = VariantBuilder::new(); - builder.append_value(2.718281828f64); + builder.append_value(2.5f64); let (metadata, value) = builder.finish(); let metadata = VariantMetadata::try_new(&metadata).unwrap(); let variant = Variant::try_new(&metadata, &value).unwrap(); - assert_eq!(variant, Variant::Double(2.718281828)); + assert_eq!(variant, Variant::Double(2.5)); } { From 968486e17cf06cd7c1cf2e3ea688738348ee41cd Mon Sep 17 00:00:00 2001 From: PinkCrow007 <1053603622@qq.com> Date: Mon, 16 Jun 2025 00:52:00 -0700 Subject: [PATCH 09/15] minor fix --- parquet-variant/src/builder.rs | 7 +++---- parquet-variant/src/variant.rs | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index 3f49b134a315..c743f8587c1b 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -20,6 +20,7 @@ use std::collections::HashMap; const BASIC_TYPE_BITS: u8 = 2; const MAX_SHORT_STRING_SIZE: usize = 0x3F; +const UNIX_EPOCH_DATE: chrono::NaiveDate = chrono::NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(); fn primitive_header(primitive_type: VariantPrimitiveType) -> u8 { (primitive_type as u8) << 2 | VariantBasicType::Primitive as u8 @@ -56,7 +57,7 @@ fn int_size(v: usize) -> u8 { /// Write little-endian integer to buffer fn write_offset(buf: &mut [u8], value: usize, nbytes: u8) { for i in 0..nbytes { - buf[i as usize] = ((value >> (i * 8)) & 0xFF) as u8; + buf[i as usize] = (value >> (i * 8)) as u8; } } @@ -235,9 +236,7 @@ impl VariantBuilder { fn append_date(&mut self, value: chrono::NaiveDate) { self.buffer .push(primitive_header(VariantPrimitiveType::Date)); - let days_since_epoch = value - .signed_duration_since(chrono::NaiveDate::from_ymd_opt(1970, 1, 1).unwrap()) - .num_days() as i32; + let days_since_epoch = value.signed_duration_since(UNIX_EPOCH_DATE).num_days() as i32; self.buffer .extend_from_slice(&days_since_epoch.to_le_bytes()); } diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index e9d525be4ab9..8a33eb2a9964 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -1260,7 +1260,7 @@ mod tests { } /// Too short buffer test (missing one required offset). - /// Should error with "metadata shorter than dictionary_size implies". + /// Should error with “metadata shorter than dictionary_size implies”. #[test] fn try_new_missing_last_value() { let bytes = &[ From 39aa887b758f883dc408e038baffc3fcf61d7d95 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 17 Jun 2025 16:10:03 -0400 Subject: [PATCH 10/15] update examples --- parquet-variant/src/builder.rs | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index 6cbec95e26a8..5a7e2c18713d 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -77,13 +77,12 @@ fn make_room_for_header(buffer: &mut Vec, start_pos: usize, header_size: usi /// /// # Example: create a Primitive Int8 /// ``` -/// # use parquet_variant::{Variant, VariantBuilder, VariantMetadata}; +/// # use parquet_variant::{Variant, VariantBuilder}; /// let mut builder = VariantBuilder::new(); /// builder.append_value(Variant::Int8(42)); /// // Finish the builder to get the metadata and value /// let (metadata, value) = builder.finish(); /// // use the Variant API to verify the result -/// let metadata = VariantMetadata::try_new(&metadata).unwrap(); /// let variant = Variant::try_new(&metadata, &value).unwrap(); /// assert_eq!(variant, Variant::Int8(42)); /// ``` @@ -98,7 +97,7 @@ fn make_room_for_header(buffer: &mut Vec, start_pos: usize, header_size: usi /// ``` /// /// ``` -/// # use parquet_variant::{Variant, VariantBuilder, VariantMetadata}; +/// # use parquet_variant::{Variant, VariantBuilder}; /// let mut builder = VariantBuilder::new(); /// // Create an object builder that will write fields to the object /// let mut object_builder = builder.new_object(); @@ -112,23 +111,21 @@ fn make_room_for_header(buffer: &mut Vec, start_pos: usize, header_size: usi /// let Variant::Object(variant_object) = variant else { /// panic!("unexpected variant type") /// }; -/// /* TODO: uncomment this, but now VariantObject:field is not implemented /// assert_eq!( /// variant_object.field("first_name").unwrap(), -/// Variant::String("Jiaying") +/// Some(Variant::ShortString("Jiaying")) /// ); /// assert_eq!( /// variant_object.field("last_name").unwrap(), -/// Variant::String("Li") +/// Some(Variant::ShortString("Li")) /// ); -/// */ /// ``` /// /// # Example: Create an Array /// /// This example shows how to create an array of integers: `[1, 2, 3]`. /// ``` -/// # use parquet_variant::{Variant, VariantBuilder, VariantMetadata}; +/// # use parquet_variant::{Variant, VariantBuilder}; /// let mut builder = VariantBuilder::new(); /// // Create an array builder that will write elements to the array /// let mut array_builder = builder.new_array(); From 18ba160e09fc40e099e6fbf83c621bc2ec17795a Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 17 Jun 2025 16:13:42 -0400 Subject: [PATCH 11/15] Fix test --- parquet-variant/tests/variant_interop.rs | 47 +++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/parquet-variant/tests/variant_interop.rs b/parquet-variant/tests/variant_interop.rs index 7c165967c9e8..ff581f843a3a 100644 --- a/parquet-variant/tests/variant_interop.rs +++ b/parquet-variant/tests/variant_interop.rs @@ -24,7 +24,7 @@ use std::fs; use std::path::{Path, PathBuf}; use chrono::NaiveDate; -use parquet_variant::Variant; +use parquet_variant::{Variant, VariantBuilder}; fn cases_dir() -> PathBuf { Path::new(env!("CARGO_MANIFEST_DIR")) @@ -173,4 +173,49 @@ fn variant_array_primitive() { } } +#[test] +fn variant_array_builder() { + let mut builder = VariantBuilder::new(); + + let mut arr = builder.new_array(); + arr.append_value(2i8); + arr.append_value(1i8); + arr.append_value(5i8); + arr.append_value(9i8); + arr.finish(); + + let (built_metadata, built_value) = builder.finish(); + let actual = Variant::try_new(&built_metadata, &built_value).unwrap(); + let case = Case::load("array_primitive"); + let expected = case.variant(); + + assert_eq!(actual, expected); +} + +#[test] +fn variant_object_builder() { + let mut builder = VariantBuilder::new(); + + let mut obj = builder.new_object(); + obj.append_value("int_field", 1i8); + + // The double field is actually encoded as decimal4 with scale 8 + // Value: 123456789, Scale: 8 -> 1.23456789 + obj.append_value("double_field", (123456789i32, 8u8)); + obj.append_value("boolean_true_field", true); + obj.append_value("boolean_false_field", false); + obj.append_value("string_field", "Apache Parquet"); + obj.append_value("null_field", ()); + obj.append_value("timestamp_field", "2025-04-16T12:34:56.78"); + + obj.finish(); + + let (built_metadata, built_value) = builder.finish(); + let actual = Variant::try_new(&built_metadata, &built_value).unwrap(); + let case = Case::load("object_primitive"); + let expected = case.variant(); + + assert_eq!(actual, expected); +} + // TODO: Add tests for object_nested and array_nested From d7d144916ede7396c7d0f87288fdbd0e265ca71e Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 17 Jun 2025 16:41:06 -0400 Subject: [PATCH 12/15] Fix lifetimes --- parquet-variant/src/builder.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index 5a7e2c18713d..604306870e29 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -369,7 +369,7 @@ impl VariantBuilder { (metadata, self.buffer) } - pub fn append_value>>(&mut self, value: T) { + pub fn append_value<'m, 'd, T: Into>>(&mut self, value: T) { let variant = value.into(); match variant { Variant::Null => self.append_null(), @@ -421,7 +421,7 @@ impl<'a> ArrayBuilder<'a> { } } - pub fn append_value>>(&mut self, value: T) { + pub fn append_value<'m, 'd, T: Into>>(&mut self, value: T) { self.parent.append_value(value); let element_end = self.parent.offset() - self.start_pos; self.offsets.push(element_end); @@ -482,7 +482,7 @@ impl<'a> ObjectBuilder<'a> { } /// Add a field with key and value to the object - pub fn append_value>>(&mut self, key: &str, value: T) { + pub fn append_value<'m, 'd, T: Into>>(&mut self, key: &str, value: T) { let id = self.parent.add_key(key); let field_start = self.parent.offset() - self.start_pos; self.parent.append_value(value); From 2235cb88dd0e282c4ab74fd789a3ee712455c25d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 17 Jun 2025 16:54:53 -0400 Subject: [PATCH 13/15] Fix docs and update naming --- parquet-variant/src/builder.rs | 57 ++++++++++++------------ parquet-variant/tests/variant_interop.rs | 2 +- 2 files changed, 30 insertions(+), 29 deletions(-) diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index 604306870e29..3480f93cef45 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -87,7 +87,8 @@ fn make_room_for_header(buffer: &mut Vec, start_pos: usize, header_size: usi /// assert_eq!(variant, Variant::Int8(42)); /// ``` /// -/// # Example: Create an Object +/// # Example: Create a [`Variant::Object`] +/// /// This example shows how to create an object with two fields: /// ```json /// { @@ -121,18 +122,18 @@ fn make_room_for_header(buffer: &mut Vec, start_pos: usize, header_size: usi /// ); /// ``` /// -/// # Example: Create an Array +/// # Example: Create a [`Variant::List`] (an Array) /// /// This example shows how to create an array of integers: `[1, 2, 3]`. /// ``` /// # use parquet_variant::{Variant, VariantBuilder}; /// let mut builder = VariantBuilder::new(); -/// // Create an array builder that will write elements to the array -/// let mut array_builder = builder.new_array(); -/// array_builder.append_value(1i8); -/// array_builder.append_value(2i8); -/// array_builder.append_value(3i8); -/// array_builder.finish(); +/// // Create a builder that will write elements to the list +/// let mut list_builder = builder.new_list(); +/// list_builder.append_value(1i8); +/// list_builder.append_value(2i8); +/// list_builder.append_value(3i8); +/// list_builder.finish(); /// // Finish the builder to get the metadata and value /// let (metadata, value) = builder.finish(); /// // use the Variant API to verify the result @@ -140,15 +141,15 @@ fn make_room_for_header(buffer: &mut Vec, start_pos: usize, header_size: usi /// let Variant::List(variant_list) = variant else { /// panic!("unexpected variant type") /// }; -/// // Verify the array contents +/// // Verify the list contents /// assert_eq!(variant_list.get(0).unwrap(), Variant::Int8(1)); /// assert_eq!(variant_list.get(1).unwrap(), Variant::Int8(2)); /// assert_eq!(variant_list.get(2).unwrap(), Variant::Int8(3)); /// ``` /// -/// # Example: Array of objects +/// # Example: [`Variant::List`] of [`Variant::Object`]s /// -/// THis example shows how to create an array of objects: +/// THis example shows how to create an list of objects: /// ```json /// [ /// { @@ -310,11 +311,11 @@ impl VariantBuilder { self.buffer.len() } - /// Create an [`ArrayBuilder`] for creating [`Variant::Array`] values. + /// Create an [`ListBuilder`] for creating [`Variant::List`] values. /// /// See the examples on [`VariantBuilder`] for usage. - pub fn new_array(&mut self) -> ArrayBuilder { - ArrayBuilder::new(self) + pub fn new_list(&mut self) -> ListBuilder { + ListBuilder::new(self) } /// Create an [`ObjectBuilder`] for creating [`Variant::Object`] values. @@ -390,7 +391,7 @@ impl VariantBuilder { Variant::Binary(v) => self.append_binary(v), Variant::String(s) | Variant::ShortString(s) => self.append_string(s), Variant::Object(_) | Variant::List(_) => { - unreachable!("Object and Array variants cannot be created through Into") + unreachable!("Object and List variants cannot be created through Into") } } } @@ -402,16 +403,16 @@ impl Default for VariantBuilder { } } -/// A builder for creating [`Variant::Array`] values. +/// A builder for creating [`Variant::List`] values. /// /// See the examples on [`VariantBuilder`] for usage. -pub struct ArrayBuilder<'a> { +pub struct ListBuilder<'a> { parent: &'a mut VariantBuilder, start_pos: usize, offsets: Vec, } -impl<'a> ArrayBuilder<'a> { +impl<'a> ListBuilder<'a> { fn new(parent: &'a mut VariantBuilder) -> Self { let start_pos = parent.offset(); Self { @@ -660,15 +661,15 @@ mod tests { } #[test] - fn test_array() { + fn test_list() { let mut builder = VariantBuilder::new(); { - let mut array = builder.new_array(); - array.append_value(1i8); - array.append_value(2i8); - array.append_value("test"); - array.finish(); + let mut list = builder.new_list(); + list.append_value(1i8); + list.append_value(2i8); + list.append_value("test"); + list.finish(); } let (metadata, value) = builder.finish(); @@ -678,14 +679,14 @@ mod tests { let variant = Variant::try_new(&metadata, &value).unwrap(); match variant { - Variant::List(array) => { - let val0 = array.get(0).unwrap(); + Variant::List(list) => { + let val0 = list.get(0).unwrap(); assert_eq!(val0, Variant::Int8(1)); - let val1 = array.get(1).unwrap(); + let val1 = list.get(1).unwrap(); assert_eq!(val1, Variant::Int8(2)); - let val2 = array.get(2).unwrap(); + let val2 = list.get(2).unwrap(); assert_eq!(val2, Variant::ShortString("test")); } _ => panic!("Expected an array variant, got: {:?}", variant), diff --git a/parquet-variant/tests/variant_interop.rs b/parquet-variant/tests/variant_interop.rs index ff581f843a3a..14a9669c2de3 100644 --- a/parquet-variant/tests/variant_interop.rs +++ b/parquet-variant/tests/variant_interop.rs @@ -177,7 +177,7 @@ fn variant_array_primitive() { fn variant_array_builder() { let mut builder = VariantBuilder::new(); - let mut arr = builder.new_array(); + let mut arr = builder.new_list(); arr.append_value(2i8); arr.append_value(1i8); arr.append_value(5i8); From eba876f627e6b6b93561a20bd916588454fb7c93 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 17 Jun 2025 17:59:15 -0400 Subject: [PATCH 14/15] fix example --- parquet-variant/src/builder.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index 3480f93cef45..8d0dda4a4e50 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -160,6 +160,7 @@ fn make_room_for_header(buffer: &mut Vec, start_pos: usize, header_size: usi /// "first_name": "Malthe", /// "last_name": "Karbo" /// } +/// ] /// ``` /// /// TODO From b11dc537f7afc4b16e5843c288ae8b0e4015916e Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 17 Jun 2025 18:02:10 -0400 Subject: [PATCH 15/15] Fix msrv --- parquet-variant/Cargo.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/parquet-variant/Cargo.toml b/parquet-variant/Cargo.toml index 47522f469995..051c4d573a27 100644 --- a/parquet-variant/Cargo.toml +++ b/parquet-variant/Cargo.toml @@ -28,7 +28,9 @@ authors = { workspace = true } keywords = ["arrow", "parquet", "variant"] readme = "README.md" edition = { workspace = true } -rust-version = { workspace = true } +# needs a newer version than workspace due to +# rror: `Option::::unwrap` is not yet stable as a const fn +rust-version = "1.83" [dependencies] arrow-schema = "55.1.0"