From ce2b73ed031a71a899e76a8a3f34ff94eb7f4dc7 Mon Sep 17 00:00:00 2001 From: Matthew Kim <38759997+friendlymatthew@users.noreply.github.com> Date: Wed, 25 Jun 2025 13:03:05 -0400 Subject: [PATCH 1/3] Support nested object and object with list building --- parquet-variant/src/builder.rs | 164 ++++++++++++++++++++++++++++++++- 1 file changed, 163 insertions(+), 1 deletion(-) diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index 43b8e59bce5e..57d271759bc6 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -579,6 +579,7 @@ pub struct ObjectBuilder<'a> { metadata_builder: &'a mut MetadataBuilder, fields: BTreeMap, // (field_id, offset) buffer: ValueBuffer, + pending: Option<(String, usize)>, } impl<'a> ObjectBuilder<'a> { @@ -588,6 +589,7 @@ impl<'a> ObjectBuilder<'a> { metadata_builder, fields: BTreeMap::new(), buffer: ValueBuffer::default(), + pending: None, } } @@ -600,8 +602,41 @@ impl<'a> ObjectBuilder<'a> { debug_assert!(res.is_none()); } + fn check_pending_field(&mut self) { + let Some((field_name, field_start)) = self.pending.as_ref() else { + return; + }; + + let field_id = self.metadata_builder.add_field_name(field_name); + self.fields.insert(field_id, *field_start); + + self.pending = None; + } + + pub fn new_object(&mut self, key: &str) -> ObjectBuilder { + self.check_pending_field(); + + let field_start = self.buffer.offset(); + let obj_builder = ObjectBuilder::new(&mut self.buffer, self.metadata_builder); + self.pending = Some((key.to_string(), field_start)); + + obj_builder + } + + pub fn new_list(&mut self, key: &str) -> ListBuilder { + self.check_pending_field(); + + let field_start = self.buffer.offset(); + let list_builder = ListBuilder::new(&mut self.buffer, self.metadata_builder); + self.pending = Some((key.to_string(), field_start)); + + list_builder + } + /// Finalize object with sorted fields - pub fn finish(self) { + pub fn finish(mut self) { + self.check_pending_field(); + let data_size = self.buffer.offset(); let num_fields = self.fields.len(); let is_large = num_fields > u8::MAX as usize; @@ -1168,4 +1203,131 @@ mod tests { assert_eq!(list.get(4).unwrap(), Variant::from(3)); } + + #[test] + fn test_nested_object() { + /* + { + "c": { + "b": "a" + } + } + + */ + + let mut builder = VariantBuilder::new(); + { + let mut outer_object_builder = builder.new_object(); + { + let mut inner_object_builder = outer_object_builder.new_object("c"); + inner_object_builder.append_value("b", "a"); + inner_object_builder.finish(); + } + + outer_object_builder.finish(); + } + + let (metadata, value) = builder.finish(); + let variant = Variant::try_new(&metadata, &value).unwrap(); + let outer_object = variant.as_object().unwrap(); + + assert_eq!(outer_object.len(), 1); + assert_eq!(outer_object.field_name(0).unwrap(), "c"); + + let inner_object_variant = outer_object.field(0).unwrap(); + let inner_object = inner_object_variant.as_object().unwrap(); + + assert_eq!(inner_object.len(), 1); + assert_eq!(inner_object.field_name(0).unwrap(), "b"); + assert_eq!(inner_object.field(0).unwrap(), Variant::from("a")); + } + + #[test] + fn test_nested_object_with_duplicate_field_names_per_object() { + /* + { + "c": { + "c": "a" + } + } + + */ + + let mut builder = VariantBuilder::new(); + { + let mut outer_object_builder = builder.new_object(); + { + let mut inner_object_builder = outer_object_builder.new_object("c"); + inner_object_builder.append_value("c", "a"); + inner_object_builder.finish(); + } + + outer_object_builder.finish(); + } + + let (metadata, value) = builder.finish(); + let variant = Variant::try_new(&metadata, &value).unwrap(); + let outer_object = variant.as_object().unwrap(); + + assert_eq!(outer_object.len(), 1); + assert_eq!(outer_object.field_name(0).unwrap(), "c"); + + let inner_object_variant = outer_object.field(0).unwrap(); + let inner_object = inner_object_variant.as_object().unwrap(); + + assert_eq!(inner_object.len(), 1); + assert_eq!(inner_object.field_name(0).unwrap(), "c"); + assert_eq!(inner_object.field(0).unwrap(), Variant::from("a")); + } + + #[test] + fn test_nested_object_with_lists() { + /* + { + "door 1": { + "items": ["apple", false ] + } + } + + */ + + let mut builder = VariantBuilder::new(); + { + let mut outer_object_builder = builder.new_object(); + { + let mut inner_object_builder = outer_object_builder.new_object("door 1"); + + { + let mut inner_object_list_builder = inner_object_builder.new_list("items"); + inner_object_list_builder.append_value("apple"); + inner_object_list_builder.append_value(false); + inner_object_list_builder.finish(); + } + + inner_object_builder.finish(); + } + + outer_object_builder.finish(); + } + + let (metadata, value) = builder.finish(); + let variant = Variant::try_new(&metadata, &value).unwrap(); + let outer_object = variant.as_object().unwrap(); + + assert_eq!(outer_object.len(), 1); + assert_eq!(outer_object.field_name(0).unwrap(), "door 1"); + + let inner_object_variant = outer_object.field(0).unwrap(); + let inner_object = inner_object_variant.as_object().unwrap(); + + assert_eq!(inner_object.len(), 1); + assert_eq!(inner_object.field_name(0).unwrap(), "items"); + + let items_variant = inner_object.field(0).unwrap(); + let items_list = items_variant.as_list().unwrap(); + + assert_eq!(items_list.len(), 2); + assert_eq!(items_list.get(0).unwrap(), Variant::from("apple")); + assert_eq!(items_list.get(1).unwrap(), Variant::from(false)); + } } From 1e8c995b00ecd42c486565e20d9437204eb388eb Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 25 Jun 2025 17:01:01 -0400 Subject: [PATCH 2/3] Avoid a clone and add some more comments --- parquet-variant/src/builder.rs | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index 57d271759bc6..8037c21b2ecb 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -184,7 +184,7 @@ impl ValueBuffer { self.0.len() } - fn append_value<'m, 'd, T: Into>>(&mut self, value: T) { + fn append_non_nested_value<'m, 'd, T: Into>>(&mut self, value: T) { let variant = value.into(); match variant { Variant::Null => self.append_null(), @@ -212,7 +212,7 @@ impl ValueBuffer { Variant::String(s) => self.append_string(s), Variant::ShortString(s) => self.append_short_string(s), Variant::Object(_) | Variant::List(_) => { - todo!("How does this work with the redesign?"); + unreachable!("Nested values are handled specially by ObjectBuilder and ListBuilder"); } } } @@ -414,7 +414,7 @@ impl VariantBuilder { } pub fn append_value<'m, 'd, T: Into>>(&mut self, value: T) { - self.buffer.append_value(value); + self.buffer.append_non_nested_value(value); } pub fn finish(self) -> (Vec, Vec) { @@ -477,6 +477,7 @@ pub struct ListBuilder<'a> { metadata_builder: &'a mut MetadataBuilder, offsets: Vec, buffer: ValueBuffer, + /// Is there a pending nested object or list that needs to be finalized? pending: bool, } @@ -523,7 +524,7 @@ impl<'a> ListBuilder<'a> { pub fn append_value<'m, 'd, T: Into>>(&mut self, value: T) { self.check_new_offset(); - self.buffer.append_value(value); + self.buffer.append_non_nested_value(value); let element_end = self.buffer.offset(); self.offsets.push(element_end); } @@ -574,15 +575,16 @@ impl<'a> ListBuilder<'a> { /// A builder for creating [`Variant::Object`] values. /// /// See the examples on [`VariantBuilder`] for usage. -pub struct ObjectBuilder<'a> { +pub struct ObjectBuilder<'a, 'b> { parent_buffer: &'a mut ValueBuffer, metadata_builder: &'a mut MetadataBuilder, fields: BTreeMap, // (field_id, offset) buffer: ValueBuffer, - pending: Option<(String, usize)>, + /// Is there a pending list or object that needs to be finalized? + pending: Option<(&'b str, usize)>, } -impl<'a> ObjectBuilder<'a> { +impl<'a, 'b> ObjectBuilder<'a, 'b> { fn new(parent_buffer: &'a mut ValueBuffer, metadata_builder: &'a mut MetadataBuilder) -> Self { Self { parent_buffer, @@ -597,7 +599,7 @@ impl<'a> ObjectBuilder<'a> { pub fn append_value<'m, 'd, T: Into>>(&mut self, key: &str, value: T) { let field_id = self.metadata_builder.add_field_name(key); let field_start = self.buffer.offset(); - self.buffer.append_value(value); + self.buffer.append_non_nested_value(value); let res = self.fields.insert(field_id, field_start); debug_assert!(res.is_none()); } @@ -613,27 +615,33 @@ impl<'a> ObjectBuilder<'a> { self.pending = None; } - pub fn new_object(&mut self, key: &str) -> ObjectBuilder { + /// Return a new [`ObjectBuilder`] to add a nested object with the specified + /// key to the object. + pub fn new_object(&mut self, key: &'b str) -> ObjectBuilder { self.check_pending_field(); let field_start = self.buffer.offset(); let obj_builder = ObjectBuilder::new(&mut self.buffer, self.metadata_builder); - self.pending = Some((key.to_string(), field_start)); + self.pending = Some((key, field_start)); obj_builder } - pub fn new_list(&mut self, key: &str) -> ListBuilder { + /// Return a new [`ListBuilder`] to add a list with the specified key to the + /// object. + pub fn new_list(&mut self, key: &'b str) -> ListBuilder { self.check_pending_field(); let field_start = self.buffer.offset(); let list_builder = ListBuilder::new(&mut self.buffer, self.metadata_builder); - self.pending = Some((key.to_string(), field_start)); + self.pending = Some((key, field_start)); list_builder } - /// Finalize object with sorted fields + /// Finalize object + /// + /// This consumes self and writes the object to the parent buffer. pub fn finish(mut self) { self.check_pending_field(); From a58ce7ab77de7388d1058454e7fb55906f4c5707 Mon Sep 17 00:00:00 2001 From: Matthew Kim <38759997+friendlymatthew@users.noreply.github.com> Date: Wed, 25 Jun 2025 17:28:28 -0400 Subject: [PATCH 3/3] Fix up merge --- parquet-variant/src/builder.rs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index c4c440f34c9a..4f88b351a1db 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -212,7 +212,9 @@ impl ValueBuffer { Variant::String(s) => self.append_string(s), Variant::ShortString(s) => self.append_short_string(s), Variant::Object(_) | Variant::List(_) => { - unreachable!("Nested values are handled specially by ObjectBuilder and ListBuilder"); + unreachable!( + "Nested values are handled specially by ObjectBuilder and ListBuilder" + ); } } } @@ -602,9 +604,9 @@ impl<'a, 'b> ObjectBuilder<'a, 'b> { pub fn insert<'m, 'd, T: Into>>(&mut self, key: &str, value: T) { let field_id = self.metadata_builder.add_field_name(key); let field_start = self.buffer.offset(); + + self.fields.insert(field_id, field_start); self.buffer.append_non_nested_value(value); - let res = self.fields.insert(field_id, field_start); - debug_assert!(res.is_none()); } fn check_pending_field(&mut self) { @@ -643,7 +645,7 @@ impl<'a, 'b> ObjectBuilder<'a, 'b> { } /// Finalize object - /// + /// /// This consumes self and writes the object to the parent buffer. pub fn finish(mut self) { self.check_pending_field(); @@ -1252,7 +1254,7 @@ mod tests { let mut outer_object_builder = builder.new_object(); { let mut inner_object_builder = outer_object_builder.new_object("c"); - inner_object_builder.append_value("b", "a"); + inner_object_builder.insert("b", "a"); inner_object_builder.finish(); } @@ -1290,7 +1292,7 @@ mod tests { let mut outer_object_builder = builder.new_object(); { let mut inner_object_builder = outer_object_builder.new_object("c"); - inner_object_builder.append_value("c", "a"); + inner_object_builder.insert("c", "a"); inner_object_builder.finish(); }