From 87b1ee7dbaa7873c66198a0cd2baaaae6c44c4b9 Mon Sep 17 00:00:00 2001 From: Matthew Kim <38759997+friendlymatthew@users.noreply.github.com> Date: Sat, 21 Jun 2025 09:38:39 -0400 Subject: [PATCH 1/5] Use BTreeMap to maintain sort invariants throughout writes --- parquet-variant/src/builder.rs | 43 ++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index 6cde4ce91125..affec3bb602b 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -16,7 +16,7 @@ // under the License. use crate::decoder::{VariantBasicType, VariantPrimitiveType}; use crate::{ShortString, Variant}; -use std::collections::HashMap; +use std::collections::BTreeMap; const BASIC_TYPE_BITS: u8 = 2; const UNIX_EPOCH_DATE: chrono::NaiveDate = chrono::NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(); @@ -166,7 +166,7 @@ fn make_room_for_header(buffer: &mut Vec, start_pos: usize, header_size: usi /// pub struct VariantBuilder { buffer: Vec, - dict: HashMap, + dict: BTreeMap, dict_keys: Vec, } @@ -174,7 +174,7 @@ impl VariantBuilder { pub fn new() -> Self { Self { buffer: Vec::new(), - dict: HashMap::new(), + dict: BTreeMap::new(), dict_keys: Vec::new(), } } @@ -296,7 +296,7 @@ impl VariantBuilder { /// Add key to dictionary, return its ID fn add_key(&mut self, key: &str) -> u32 { - use std::collections::hash_map::Entry; + use std::collections::btree_map::Entry; match self.dict.entry(key.to_string()) { Entry::Occupied(entry) => *entry.get(), Entry::Vacant(entry) => { @@ -471,7 +471,7 @@ impl<'a> ListBuilder<'a> { pub struct ObjectBuilder<'a> { parent: &'a mut VariantBuilder, start_pos: usize, - fields: Vec<(u32, usize)>, // (field_id, offset) + fields: BTreeMap, // (field_id, offset) } impl<'a> ObjectBuilder<'a> { @@ -480,7 +480,7 @@ impl<'a> ObjectBuilder<'a> { Self { parent, start_pos, - fields: Vec::new(), + fields: BTreeMap::new(), } } @@ -489,25 +489,27 @@ impl<'a> ObjectBuilder<'a> { let id = self.parent.add_key(key); let field_start = self.parent.offset() - self.start_pos; self.parent.append_value(value); - self.fields.push((id, field_start)); + let res = self.fields.insert(id, field_start); + debug_assert!(res.is_none()); } /// Finalize object with sorted fields - pub fn finish(mut self) { - // Sort fields by key name - self.fields.sort_by(|a, b| { - let key_a = &self.parent.dict_keys[a.0 as usize]; - let key_b = &self.parent.dict_keys[b.0 as usize]; - key_a.cmp(key_b) - }); - + pub fn finish(self) { let data_size = self.parent.offset() - self.start_pos; let num_fields = self.fields.len(); let is_large = num_fields > u8::MAX as usize; let size_bytes = if is_large { 4 } else { 1 }; - let max_id = self.fields.iter().map(|&(id, _)| id).max().unwrap_or(0); - let id_size = int_size(max_id as usize); + let field_ids_by_sorted_field_name = self + .parent + .dict + .iter() + .filter_map(|(_, id)| self.fields.contains_key(id).then_some(*id)) + .collect::>(); + + let max_id = self.fields.keys().last().copied().unwrap_or(0) as usize; + + let id_size = int_size(max_id); let offset_size = int_size(data_size); let header_size = 1 @@ -531,17 +533,18 @@ impl<'a> ObjectBuilder<'a> { } // Write field IDs (sorted order) - for &(id, _) in &self.fields { + for id in &field_ids_by_sorted_field_name { write_offset( &mut self.parent.buffer[pos..pos + id_size as usize], - id as usize, + *id as usize, id_size, ); pos += id_size as usize; } // Write field offsets - for &(_, offset) in &self.fields { + for id in &field_ids_by_sorted_field_name { + let &offset = self.fields.get(id).unwrap(); write_offset( &mut self.parent.buffer[pos..pos + offset_size as usize], offset, From 82712f9023f64fc00dbe005590d47f86f21d6337 Mon Sep 17 00:00:00 2001 From: Matthew Kim <38759997+friendlymatthew@users.noreply.github.com> Date: Sat, 21 Jun 2025 10:14:58 -0400 Subject: [PATCH 2/5] Add test case that checks if Maps maintain invariant upon new field write --- parquet-variant/src/builder.rs | 71 ++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index affec3bb602b..1c789924e9a2 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -740,4 +740,75 @@ mod tests { // apple(1), banana(2), zebra(0) assert_eq!(field_ids, vec![1, 2, 0]); } + + #[test] + fn test_object_and_metadata_ordering() { + let mut builder = VariantBuilder::new(); + + let mut obj = builder.new_object(); + + obj.append_value("zebra", "stripes"); // ID = 0 + obj.append_value("apple", "red"); // ID = 1 + + { + // fields_map is ordered by insertion order (field id) + let fields_map = obj.fields.keys().copied().collect::>(); + assert_eq!(fields_map, vec![0, 1]); + + // dict is ordered by field names + // NOTE: when we support nested objects, we'll want to perform a filter by fields_map field ids + let dict_metadata = obj + .parent + .dict + .iter() + .map(|(f, i)| (f.as_str(), *i)) + .collect::>(); + + assert_eq!(dict_metadata, vec![("apple", 1), ("zebra", 0)]); + + // dict_keys is ordered by insertion order (field id) + let dict_keys = obj + .parent + .dict_keys + .iter() + .map(|k| k.as_str()) + .collect::>(); + assert_eq!(dict_keys, vec!["zebra", "apple"]); + } + + obj.append_value("banana", "yellow"); // ID = 2 + + { + // fields_map is ordered by insertion order (field id) + let fields_map = obj.fields.keys().copied().collect::>(); + assert_eq!(fields_map, vec![0, 1, 2]); + + // dict is ordered by field names + // NOTE: when we support nested objects, we'll want to perform a filter by fields_map field ids + let dict_metadata = obj + .parent + .dict + .iter() + .map(|(f, i)| (f.as_str(), *i)) + .collect::>(); + + assert_eq!( + dict_metadata, + vec![("apple", 1), ("banana", 2), ("zebra", 0)] + ); + + // dict_keys is ordered by insertion order (field id) + let dict_keys = obj + .parent + .dict_keys + .iter() + .map(|k| k.as_str()) + .collect::>(); + assert_eq!(dict_keys, vec!["zebra", "apple", "banana"]); + } + + obj.finish(); + + builder.finish(); + } } From e8af7ff81605679091aeae570f118a2c39b61b5d Mon Sep 17 00:00:00 2001 From: Matthew Kim <38759997+friendlymatthew@users.noreply.github.com> Date: Sat, 21 Jun 2025 19:46:09 -0400 Subject: [PATCH 3/5] Raise error when attempting to append duplicate field name in object --- parquet-variant/src/builder.rs | 60 +++++++++++++++++++----- parquet-variant/tests/variant_interop.rs | 16 ++++--- 2 files changed, 57 insertions(+), 19 deletions(-) diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index 1c789924e9a2..b297a2f7bea5 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -1,3 +1,5 @@ +use arrow_schema::ArrowError; + // Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information @@ -484,13 +486,33 @@ impl<'a> ObjectBuilder<'a> { } } + fn check_duplicate_field_name(&self, key: &str) -> Result<(), ArrowError> { + if let Some(field_name_id) = self.parent.dict.get(key) { + if self.fields.contains_key(field_name_id) { + return Err(ArrowError::InvalidArgumentError( + "field name must be unique and already exists in this object".to_string(), + )); + } + } + + Ok(()) + } + /// Add a field with key and value to the object - pub fn append_value<'m, 'd, T: Into>>(&mut self, key: &str, value: T) { - let id = self.parent.add_key(key); + pub fn append_value<'m, 'd, T: Into>>( + &mut self, + key: &str, + value: T, + ) -> Result<(), ArrowError> { + self.check_duplicate_field_name(key)?; + + let field_name_id = self.parent.add_key(key); let field_start = self.parent.offset() - self.start_pos; self.parent.append_value(value); - let res = self.fields.insert(id, field_start); - debug_assert!(res.is_none()); + let res = self.fields.insert(field_name_id, field_start); + debug_assert!(res.is_none()); // this is almost like an unreachable!(), since we check if a duplicate field name already exists + + Ok(()) } /// Finalize object with sorted fields @@ -704,8 +726,8 @@ mod tests { { let mut obj = builder.new_object(); - obj.append_value("name", "John"); - obj.append_value("age", 42i8); + obj.append_value("name", "John").unwrap(); + obj.append_value("age", 42i8).unwrap(); obj.finish(); } @@ -720,9 +742,9 @@ mod tests { { let mut obj = builder.new_object(); - obj.append_value("zebra", "stripes"); // ID = 0 - obj.append_value("apple", "red"); // ID = 1 - obj.append_value("banana", "yellow"); // ID = 2 + obj.append_value("zebra", "stripes").unwrap(); // ID = 0 + obj.append_value("apple", "red").unwrap(); // ID = 1 + obj.append_value("banana", "yellow").unwrap(); // ID = 2 obj.finish(); } @@ -747,8 +769,8 @@ mod tests { let mut obj = builder.new_object(); - obj.append_value("zebra", "stripes"); // ID = 0 - obj.append_value("apple", "red"); // ID = 1 + obj.append_value("zebra", "stripes").unwrap(); // ID = 0 + obj.append_value("apple", "red").unwrap(); // ID = 1 { // fields_map is ordered by insertion order (field id) @@ -776,7 +798,7 @@ mod tests { assert_eq!(dict_keys, vec!["zebra", "apple"]); } - obj.append_value("banana", "yellow"); // ID = 2 + obj.append_value("banana", "yellow").unwrap(); // ID = 2 { // fields_map is ordered by insertion order (field id) @@ -811,4 +833,18 @@ mod tests { builder.finish(); } + + #[test] + fn test_duplicate_field_name_in_same_object() { + let mut builder = VariantBuilder::new(); + + let mut obj = builder.new_object(); + obj.append_value("name", "John").unwrap(); + + let res = obj.append_value("name", 42i8); + assert!( + res.is_err(), + "ObjectBuilder should error when trying to append duplicate field name in same object" + ); + } } diff --git a/parquet-variant/tests/variant_interop.rs b/parquet-variant/tests/variant_interop.rs index bfa2ab267c27..0c6265c3be56 100644 --- a/parquet-variant/tests/variant_interop.rs +++ b/parquet-variant/tests/variant_interop.rs @@ -206,16 +206,18 @@ fn variant_object_builder() { let mut builder = VariantBuilder::new(); let mut obj = builder.new_object(); - obj.append_value("int_field", 1i8); + obj.append_value("int_field", 1i8).unwrap(); // The double field is actually encoded as decimal4 with scale 8 // Value: 123456789, Scale: 8 -> 1.23456789 - obj.append_value("double_field", (123456789i32, 8u8)); - obj.append_value("boolean_true_field", true); - obj.append_value("boolean_false_field", false); - obj.append_value("string_field", "Apache Parquet"); - obj.append_value("null_field", ()); - obj.append_value("timestamp_field", "2025-04-16T12:34:56.78"); + obj.append_value("double_field", (123456789i32, 8u8)) + .unwrap(); + obj.append_value("boolean_true_field", true).unwrap(); + obj.append_value("boolean_false_field", false).unwrap(); + obj.append_value("string_field", "Apache Parquet").unwrap(); + obj.append_value("null_field", ()).unwrap(); + obj.append_value("timestamp_field", "2025-04-16T12:34:56.78") + .unwrap(); obj.finish(); From 07d3841392f5e41731c7682447d47a6e47c7781e Mon Sep 17 00:00:00 2001 From: Matthew Kim <38759997+friendlymatthew@users.noreply.github.com> Date: Sun, 22 Jun 2025 08:48:41 -0400 Subject: [PATCH 4/5] Revert "Raise error when attempting to append duplicate field name in object" This reverts commit e8af7ff81605679091aeae570f118a2c39b61b5d. --- parquet-variant/src/builder.rs | 60 +++++------------------- parquet-variant/tests/variant_interop.rs | 16 +++---- 2 files changed, 19 insertions(+), 57 deletions(-) diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index b297a2f7bea5..1c789924e9a2 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -1,5 +1,3 @@ -use arrow_schema::ArrowError; - // Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information @@ -486,33 +484,13 @@ impl<'a> ObjectBuilder<'a> { } } - fn check_duplicate_field_name(&self, key: &str) -> Result<(), ArrowError> { - if let Some(field_name_id) = self.parent.dict.get(key) { - if self.fields.contains_key(field_name_id) { - return Err(ArrowError::InvalidArgumentError( - "field name must be unique and already exists in this object".to_string(), - )); - } - } - - Ok(()) - } - /// Add a field with key and value to the object - pub fn append_value<'m, 'd, T: Into>>( - &mut self, - key: &str, - value: T, - ) -> Result<(), ArrowError> { - self.check_duplicate_field_name(key)?; - - let field_name_id = self.parent.add_key(key); + pub fn append_value<'m, 'd, T: Into>>(&mut self, key: &str, value: T) { + let id = self.parent.add_key(key); let field_start = self.parent.offset() - self.start_pos; self.parent.append_value(value); - let res = self.fields.insert(field_name_id, field_start); - debug_assert!(res.is_none()); // this is almost like an unreachable!(), since we check if a duplicate field name already exists - - Ok(()) + let res = self.fields.insert(id, field_start); + debug_assert!(res.is_none()); } /// Finalize object with sorted fields @@ -726,8 +704,8 @@ mod tests { { let mut obj = builder.new_object(); - obj.append_value("name", "John").unwrap(); - obj.append_value("age", 42i8).unwrap(); + obj.append_value("name", "John"); + obj.append_value("age", 42i8); obj.finish(); } @@ -742,9 +720,9 @@ mod tests { { let mut obj = builder.new_object(); - obj.append_value("zebra", "stripes").unwrap(); // ID = 0 - obj.append_value("apple", "red").unwrap(); // ID = 1 - obj.append_value("banana", "yellow").unwrap(); // ID = 2 + obj.append_value("zebra", "stripes"); // ID = 0 + obj.append_value("apple", "red"); // ID = 1 + obj.append_value("banana", "yellow"); // ID = 2 obj.finish(); } @@ -769,8 +747,8 @@ mod tests { let mut obj = builder.new_object(); - obj.append_value("zebra", "stripes").unwrap(); // ID = 0 - obj.append_value("apple", "red").unwrap(); // ID = 1 + obj.append_value("zebra", "stripes"); // ID = 0 + obj.append_value("apple", "red"); // ID = 1 { // fields_map is ordered by insertion order (field id) @@ -798,7 +776,7 @@ mod tests { assert_eq!(dict_keys, vec!["zebra", "apple"]); } - obj.append_value("banana", "yellow").unwrap(); // ID = 2 + obj.append_value("banana", "yellow"); // ID = 2 { // fields_map is ordered by insertion order (field id) @@ -833,18 +811,4 @@ mod tests { builder.finish(); } - - #[test] - fn test_duplicate_field_name_in_same_object() { - let mut builder = VariantBuilder::new(); - - let mut obj = builder.new_object(); - obj.append_value("name", "John").unwrap(); - - let res = obj.append_value("name", 42i8); - assert!( - res.is_err(), - "ObjectBuilder should error when trying to append duplicate field name in same object" - ); - } } diff --git a/parquet-variant/tests/variant_interop.rs b/parquet-variant/tests/variant_interop.rs index 0c6265c3be56..bfa2ab267c27 100644 --- a/parquet-variant/tests/variant_interop.rs +++ b/parquet-variant/tests/variant_interop.rs @@ -206,18 +206,16 @@ fn variant_object_builder() { let mut builder = VariantBuilder::new(); let mut obj = builder.new_object(); - obj.append_value("int_field", 1i8).unwrap(); + obj.append_value("int_field", 1i8); // The double field is actually encoded as decimal4 with scale 8 // Value: 123456789, Scale: 8 -> 1.23456789 - obj.append_value("double_field", (123456789i32, 8u8)) - .unwrap(); - obj.append_value("boolean_true_field", true).unwrap(); - obj.append_value("boolean_false_field", false).unwrap(); - obj.append_value("string_field", "Apache Parquet").unwrap(); - obj.append_value("null_field", ()).unwrap(); - obj.append_value("timestamp_field", "2025-04-16T12:34:56.78") - .unwrap(); + obj.append_value("double_field", (123456789i32, 8u8)); + obj.append_value("boolean_true_field", true); + obj.append_value("boolean_false_field", false); + obj.append_value("string_field", "Apache Parquet"); + obj.append_value("null_field", ()); + obj.append_value("timestamp_field", "2025-04-16T12:34:56.78"); obj.finish(); From b9035549a71814e74cc00be2cf0fd5301e4541d3 Mon Sep 17 00:00:00 2001 From: Matthew Kim <38759997+friendlymatthew@users.noreply.github.com> Date: Mon, 23 Jun 2025 17:54:52 -0400 Subject: [PATCH 5/5] Fmt --- parquet-variant/src/builder.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index 4445f82f54f5..a5fb66a84ff4 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -822,7 +822,7 @@ mod tests { builder.finish(); } - + #[test] fn test_append_object() { let (object_metadata, object_value) = {