From fb5d6e3839c1743d349f425c97081610ccbb6ba4 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Thu, 3 Jul 2025 16:30:10 -0700 Subject: [PATCH 1/7] [Variant] List and object builders have no effect until finalized --- parquet-variant/src/builder.rs | 276 +++++++++++------- parquet-variant/src/from_json.rs | 4 +- parquet-variant/tests/test_json_to_variant.rs | 2 +- 3 files changed, 174 insertions(+), 108 deletions(-) diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index fe3090f70b8f..386bc2ec1776 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -299,6 +299,82 @@ impl MetadataBuilder { } } +/// Tracks information needed to correctly finalize a nested builder, for each parent builder type. +/// +/// A child builder has no effect on its parent unless/until its `finalize` method is called, at +/// which point the child appends the new value to the parent. As a (desirable) side effect, +/// creating a parent state instance captures mutable references to a subset of the parent's fields, +/// rendering the parent object completely unusable until the parent state goes out of scope. This +/// ensures that at most one child builder can exist at a time. +/// +/// The redundancy in buffer and metadata_builder is because all the references come from the +/// parent, and we cannot "split" a mutable reference across two objects (parent state and the child +/// builder that uses it). So everything has to be here. Rust layout optimizations should treat the +/// variants as a union, so that accessing a `buffer` or `metadata_builder` is branch-free. +enum ParentState<'a> { + Variant { + buffer: &'a mut ValueBuffer, + metadata_builder: &'a mut MetadataBuilder, + }, + List { + buffer: &'a mut ValueBuffer, + metadata_builder: &'a mut MetadataBuilder, + offsets: &'a mut Vec, + }, + Object { + buffer: &'a mut ValueBuffer, + metadata_builder: &'a mut MetadataBuilder, + fields: &'a mut IndexMap, + field_name: &'a str, + }, +} + +impl ParentState<'_> { + fn buffer(&mut self) -> &mut ValueBuffer { + match self { + ParentState::Variant { buffer, .. } => buffer, + ParentState::List { buffer, .. } => buffer, + ParentState::Object { buffer, .. } => buffer, + } + } + + fn metadata_builder(&mut self) -> &mut MetadataBuilder { + match self { + ParentState::Variant { + metadata_builder, .. + } => metadata_builder, + ParentState::List { + metadata_builder, .. + } => metadata_builder, + ParentState::Object { + metadata_builder, .. + } => metadata_builder, + } + } + + // Performs any parent-specific aspects of finishing, after the child has appended all necessary + // bytes to the parent's value buffer. ListBuilder captures the new value's ending offset; + // ObjectBuilder associates the new value's starting offset with its field id; VariantBuilder + // doesn't need anything special. + fn finish(&mut self, starting_offset: usize) { + match self { + ParentState::Variant { .. } => (), + ParentState::List { + buffer, offsets, .. + } => offsets.push(buffer.offset()), + ParentState::Object { + metadata_builder, + fields, + field_name, + .. + } => { + let field_id = metadata_builder.upsert_field_name(field_name); + fields.insert(field_id, starting_offset); + } + } + } +} + /// Top level builder for [`Variant`] values /// /// # Example: create a Primitive Int8 @@ -479,20 +555,29 @@ impl VariantBuilder { self } + // Returns validate_unique_fields because we can no longer reference self once this method returns. + fn parent_state(&mut self) -> (ParentState, bool) { + let state = ParentState::Variant { + buffer: &mut self.buffer, + metadata_builder: &mut self.metadata_builder, + }; + (state, self.validate_unique_fields) + } + /// Create an [`ListBuilder`] for creating [`Variant::List`] values. /// /// See the examples on [`VariantBuilder`] for usage. pub fn new_list(&mut self) -> ListBuilder { - ListBuilder::new(&mut self.buffer, &mut self.metadata_builder) - .with_validate_unique_fields(self.validate_unique_fields) + let (parent_state, validate_unique_fields) = self.parent_state(); + ListBuilder::new(parent_state, validate_unique_fields) } /// Create an [`ObjectBuilder`] for creating [`Variant::Object`] values. /// /// See the examples on [`VariantBuilder`] for usage. pub fn new_object(&mut self) -> ObjectBuilder { - ObjectBuilder::new(&mut self.buffer, &mut self.metadata_builder) - .with_validate_unique_fields(self.validate_unique_fields) + let (parent_state, validate_unique_fields) = self.parent_state(); + ObjectBuilder::new(parent_state, validate_unique_fields) } pub fn append_value<'m, 'd, T: Into>>(&mut self, value: T) { @@ -508,36 +593,20 @@ impl VariantBuilder { /// /// See the examples on [`VariantBuilder`] for usage. pub struct ListBuilder<'a> { - parent_buffer: &'a mut ValueBuffer, - metadata_builder: &'a mut MetadataBuilder, + parent_state: ParentState<'a>, offsets: Vec, buffer: ValueBuffer, - /// Is there a pending nested object or list that needs to be finalized? - pending: bool, validate_unique_fields: bool, } impl<'a> ListBuilder<'a> { - fn new(parent_buffer: &'a mut ValueBuffer, metadata_builder: &'a mut MetadataBuilder) -> Self { + fn new(parent_state: ParentState<'a>, validate_unique_fields: bool) -> Self { Self { - parent_buffer, - metadata_builder, + parent_state, offsets: vec![0], buffer: ValueBuffer::default(), - pending: false, - validate_unique_fields: false, - } - } - - fn check_new_offset(&mut self) { - if !self.pending { - return; + validate_unique_fields, } - - let element_end = self.buffer.offset(); - self.offsets.push(element_end); - - self.pending = false; } /// Enables unique field key validation for objects created within this list. @@ -549,45 +618,53 @@ impl<'a> ListBuilder<'a> { self } - pub fn new_object(&mut self) -> ObjectBuilder { - self.check_new_offset(); - - let obj_builder = ObjectBuilder::new(&mut self.buffer, self.metadata_builder) - .with_validate_unique_fields(self.validate_unique_fields); - self.pending = true; + // Returns validate_unique_fields because we can no longer reference self once this method returns. + fn parent_state(&mut self) -> (ParentState, bool) { + let state = ParentState::List { + buffer: &mut self.buffer, + metadata_builder: self.parent_state.metadata_builder(), + offsets: &mut self.offsets, + }; + (state, self.validate_unique_fields) + } - obj_builder + /// Returns an object builder that can be used to append a new (nested) object to this list. + /// + /// WARNING: The builder will have no effect unless/until [`ObjectBuilder::finish`] is called. + pub fn new_object(&mut self) -> ObjectBuilder { + let (parent_state, validate_unique_fields) = self.parent_state(); + ObjectBuilder::new(parent_state, validate_unique_fields) } + /// Returns a list builder that can be used to append a new (nested) list to this list. + /// + /// WARNING: The builder will have no effect unless/until [`ListBuilder::finish`] is called. pub fn new_list(&mut self) -> ListBuilder { - self.check_new_offset(); - - let list_builder = ListBuilder::new(&mut self.buffer, self.metadata_builder) - .with_validate_unique_fields(self.validate_unique_fields); - self.pending = true; - - list_builder + let (parent_state, validate_unique_fields) = self.parent_state(); + ListBuilder::new(parent_state, validate_unique_fields) } + /// Appends a new primitive value to this list pub fn append_value<'m, 'd, T: Into>>(&mut self, value: T) { - self.check_new_offset(); - self.buffer.append_non_nested_value(value); let element_end = self.buffer.offset(); self.offsets.push(element_end); } + /// Finalizes this list and appends it to its parent, which otherwise remains unmodified. pub fn finish(mut self) { - self.check_new_offset(); - let data_size = self.buffer.offset(); let num_elements = self.offsets.len() - 1; let is_large = num_elements > u8::MAX as usize; let offset_size = int_size(data_size); + // Get parent's buffer + let parent_buffer = self.parent_state.buffer(); + let starting_offset = parent_buffer.offset(); + // Write header write_header( - self.parent_buffer.inner_mut(), + parent_buffer.inner_mut(), array_header(is_large, offset_size), is_large, num_elements, @@ -595,61 +672,47 @@ impl<'a> ListBuilder<'a> { // Write offsets for offset in &self.offsets { - write_offset(self.parent_buffer.inner_mut(), *offset, offset_size); + write_offset(parent_buffer.inner_mut(), *offset, offset_size); } // Append values - self.parent_buffer.append_slice(self.buffer.inner()); + parent_buffer.append_slice(self.buffer.inner()); + self.parent_state.finish(starting_offset); } } /// A builder for creating [`Variant::Object`] values. /// /// See the examples on [`VariantBuilder`] for usage. -pub struct ObjectBuilder<'a, 'b> { - parent_buffer: &'a mut ValueBuffer, - metadata_builder: &'a mut MetadataBuilder, +pub struct ObjectBuilder<'a> { + parent_state: ParentState<'a>, fields: IndexMap, // (field_id, offset) buffer: ValueBuffer, - /// Is there a pending list or object that needs to be finalized? - pending: Option<(&'b str, usize)>, validate_unique_fields: bool, /// Set of duplicate fields to report for errors duplicate_fields: HashSet, } -impl<'a, 'b> ObjectBuilder<'a, 'b> { - fn new(parent_buffer: &'a mut ValueBuffer, metadata_builder: &'a mut MetadataBuilder) -> Self { +impl<'a> ObjectBuilder<'a> { + fn new(parent_state: ParentState<'a>, validate_unique_fields: bool) -> Self { Self { - parent_buffer, - metadata_builder, + parent_state, fields: IndexMap::new(), buffer: ValueBuffer::default(), - pending: None, - validate_unique_fields: false, + validate_unique_fields, duplicate_fields: HashSet::new(), } } - fn check_pending_field(&mut self) { - let Some(&(field_name, field_start)) = self.pending.as_ref() else { - return; - }; - - let field_id = self.metadata_builder.upsert_field_name(field_name); - self.fields.insert(field_id, field_start); - - self.pending = None; - } - /// Add a field with key and value to the object /// /// Note: when inserting duplicate keys, the new value overwrites the previous mapping, /// but the old value remains in the buffer, resulting in a larger variant pub fn insert<'m, 'd, T: Into>>(&mut self, key: &str, value: T) { - self.check_pending_field(); + // Get metadata_builder from parent state + let metadata_builder = self.parent_state.metadata_builder(); - let field_id = self.metadata_builder.upsert_field_name(key); + let field_id = metadata_builder.upsert_field_name(key); let field_start = self.buffer.offset(); if self.fields.insert(field_id, field_start).is_some() && self.validate_unique_fields { @@ -668,43 +731,41 @@ impl<'a, 'b> ObjectBuilder<'a, 'b> { self } - /// Return a new [`ObjectBuilder`] to add a nested object with the specified - /// key to the object. - pub fn new_object(&mut self, key: &'b str) -> ObjectBuilder { - self.check_pending_field(); - - let field_start = self.buffer.offset(); - let obj_builder = ObjectBuilder::new(&mut self.buffer, self.metadata_builder) - .with_validate_unique_fields(self.validate_unique_fields); - self.pending = Some((key, field_start)); - - obj_builder + // Returns validate_unique_fields because we can no longer reference self once this method returns. + fn parent_state<'b>(&'b mut self, key: &'b str) -> (ParentState<'b>, bool) { + let state = ParentState::Object { + buffer: &mut self.buffer, + metadata_builder: self.parent_state.metadata_builder(), + fields: &mut self.fields, + field_name: key, + }; + (state, self.validate_unique_fields) } - /// Return a new [`ListBuilder`] to add a list with the specified key to the - /// object. - pub fn new_list(&mut self, key: &'b str) -> ListBuilder { - self.check_pending_field(); - - let field_start = self.buffer.offset(); - let list_builder = ListBuilder::new(&mut self.buffer, self.metadata_builder) - .with_validate_unique_fields(self.validate_unique_fields); - self.pending = Some((key, field_start)); - - list_builder + /// Returns an object builder that can be used to append a new (nested) object to this list. + /// + /// WARNING: The builder will have no effect unless/until [`ObjectBuilder::finish`] is called. + pub fn new_object<'b>(&'b mut self, key: &'b str) -> ObjectBuilder<'b> { + let (parent_state, validate_unique_fields) = self.parent_state(key); + ObjectBuilder::new(parent_state, validate_unique_fields) } - /// Finalize object + /// Returns a list builder that can be used to append a new (nested) list to this list. /// - /// This consumes self and writes the object to the parent buffer. - pub fn finish(mut self) -> Result<(), ArrowError> { - self.check_pending_field(); + /// WARNING: The builder will have no effect unless/until [`ListBuilder::finish`] is called. + pub fn new_list<'b>(&'b mut self, key: &'b str) -> ListBuilder<'b> { + let (parent_state, validate_unique_fields) = self.parent_state(key); + ListBuilder::new(parent_state, validate_unique_fields) + } + /// Finalizes this object and appends it to its parent, which otherwise remains unmodified. + pub fn finish(mut self) -> Result<(), ArrowError> { + let metadata_builder = self.parent_state.metadata_builder(); if self.validate_unique_fields && !self.duplicate_fields.is_empty() { let mut names = self .duplicate_fields .iter() - .map(|id| self.metadata_builder.field_name(*id as usize)) + .map(|id| metadata_builder.field_name(*id as usize)) .collect::>(); names.sort_unstable(); @@ -720,8 +781,8 @@ impl<'a, 'b> ObjectBuilder<'a, 'b> { let is_large = num_fields > u8::MAX as usize; self.fields.sort_by(|&field_a_id, _, &field_b_id, _| { - let key_a = &self.metadata_builder.field_name(field_a_id as usize); - let key_b = &self.metadata_builder.field_name(field_b_id as usize); + let key_a = &metadata_builder.field_name(field_a_id as usize); + let key_b = &metadata_builder.field_name(field_b_id as usize); key_a.cmp(key_b) }); @@ -730,9 +791,13 @@ impl<'a, 'b> ObjectBuilder<'a, 'b> { let id_size = int_size(max_id as usize); let offset_size = int_size(data_size); + // Get parent's buffer + let parent_buffer = self.parent_state.buffer(); + let starting_offset = parent_buffer.offset(); + // Write header write_header( - self.parent_buffer.inner_mut(), + parent_buffer.inner_mut(), object_header(is_large, id_size, offset_size), is_large, num_fields, @@ -740,24 +805,25 @@ impl<'a, 'b> ObjectBuilder<'a, 'b> { // Write field IDs (sorted order) for (&id, _) in &self.fields { - write_offset(self.parent_buffer.inner_mut(), id as usize, id_size); + write_offset(parent_buffer.inner_mut(), id as usize, id_size); } // Write field offsets for (_, &offset) in &self.fields { - write_offset(self.parent_buffer.inner_mut(), offset, offset_size); + write_offset(parent_buffer.inner_mut(), offset, offset_size); } - write_offset(self.parent_buffer.inner_mut(), data_size, offset_size); + write_offset(parent_buffer.inner_mut(), data_size, offset_size); - self.parent_buffer.append_slice(self.buffer.inner()); + parent_buffer.append_slice(self.buffer.inner()); + self.parent_state.finish(starting_offset); Ok(()) } } -/// Trait that abstracts functionality from Variant fconstruction implementations, namely -/// `VariantBuilder`, `ListBuilder` and `ObjectFieldBuilder` to minimize code duplication. +/// Trait that abstracts functionality from Variant construction implementations, namely +/// [`VariantBuilder`], [`ListBuilder`] and [`ObjectFieldBuilder`] to minimize code duplication. pub(crate) trait VariantBuilderExt<'m, 'v> { fn append_value(&mut self, value: impl Into>); diff --git a/parquet-variant/src/from_json.rs b/parquet-variant/src/from_json.rs index 00d205f38584..c4adbd1377a8 100644 --- a/parquet-variant/src/from_json.rs +++ b/parquet-variant/src/from_json.rs @@ -131,9 +131,9 @@ fn append_json<'m, 'v>( Ok(()) } -struct ObjectFieldBuilder<'s, 'o, 'v> { +struct ObjectFieldBuilder<'o, 'v, 's> { key: &'s str, - builder: &'o mut ObjectBuilder<'v, 's>, + builder: &'o mut ObjectBuilder<'v>, } impl<'m, 'v> VariantBuilderExt<'m, 'v> for ObjectFieldBuilder<'_, '_, '_> { diff --git a/parquet-variant/tests/test_json_to_variant.rs b/parquet-variant/tests/test_json_to_variant.rs index fd6056d02d9c..737c4cd4826e 100644 --- a/parquet-variant/tests/test_json_to_variant.rs +++ b/parquet-variant/tests/test_json_to_variant.rs @@ -28,7 +28,7 @@ struct JsonToVariantTest<'a> { expected: Variant<'a, 'a>, } -impl<'a> JsonToVariantTest<'a> { +impl JsonToVariantTest<'_> { fn run(self) -> Result<(), ArrowError> { let mut variant_builder = VariantBuilder::new(); json_to_variant(self.json, &mut variant_builder)?; From c8ba561e2848f6baba2a6b57991be242479c289a Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Thu, 3 Jul 2025 16:44:20 -0700 Subject: [PATCH 2/7] add tests --- parquet-variant/src/builder.rs | 112 +++++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index 386bc2ec1776..72d715c7a708 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -1565,4 +1565,116 @@ mod tests { let valid_result = valid_obj.finish(); assert!(valid_result.is_ok()); } + + #[test] + fn test_variant_builder_to_list_builder_no_finish() { + // Create a list builder but never finish it + let mut builder = VariantBuilder::new(); + let _list_builder = builder.new_list(); + + builder.append_value(42i8); + + // The original builder should be unchanged + let (metadata, value) = builder.finish(); + let variant = Variant::try_new(&metadata, &value).unwrap(); + assert_eq!(variant, Variant::Int8(42)); + } + + #[test] + fn test_variant_builder_to_object_builder_no_finish() { + // Create an object builder but never finish it + let mut builder = VariantBuilder::new(); + let _object_builder = builder.new_object(); + + builder.append_value(42i8); + + // The original builder should be unchanged + let (metadata, value) = builder.finish(); + let variant = Variant::try_new(&metadata, &value).unwrap(); + assert_eq!(variant, Variant::Int8(42)); + } + + #[test] + fn test_list_builder_to_list_builder_no_finish() { + let mut builder = VariantBuilder::new(); + let mut list_builder = builder.new_list(); + list_builder.append_value(1i8); + + // Create a nested list builder but never finish it + let _nested_list_builder = list_builder.new_list(); + + list_builder.append_value(2i8); + + // The parent list should only contain the original values + list_builder.finish(); + let (metadata, value) = builder.finish(); + let variant = Variant::try_new(&metadata, &value).unwrap(); + let list = variant.as_list().unwrap(); + assert_eq!(list.len(), 2); + assert_eq!(list.get(0).unwrap(), Variant::Int8(1)); + assert_eq!(list.get(1).unwrap(), Variant::Int8(2)); + } + + #[test] + fn test_list_builder_to_object_builder_no_finish() { + let mut builder = VariantBuilder::new(); + let mut list_builder = builder.new_list(); + list_builder.append_value(1i8); + + // Create a nested object builder but never finish it + let _nested_object_builder = list_builder.new_object(); + + list_builder.append_value(2i8); + + // The parent list should only contain the original values + list_builder.finish(); + let (metadata, value) = builder.finish(); + let variant = Variant::try_new(&metadata, &value).unwrap(); + let list = variant.as_list().unwrap(); + assert_eq!(list.len(), 2); + assert_eq!(list.get(0).unwrap(), Variant::Int8(1)); + assert_eq!(list.get(1).unwrap(), Variant::Int8(2)); + } + + #[test] + fn test_object_builder_to_list_builder_no_finish() { + let mut builder = VariantBuilder::new(); + let mut object_builder = builder.new_object(); + object_builder.insert("first", 1i8); + + // Create a nested list builder but never finish it + let _nested_list_builder = object_builder.new_list("nested"); + + object_builder.insert("second", 2i8); + + // The parent object should only contain the original fields + object_builder.finish().unwrap(); + let (metadata, value) = builder.finish(); + let variant = Variant::try_new(&metadata, &value).unwrap(); + let obj = variant.as_object().unwrap(); + assert_eq!(obj.len(), 2); + assert_eq!(obj.get("first"), Some(Variant::Int8(1))); + assert_eq!(obj.get("second"), Some(Variant::Int8(2))); + } + + #[test] + fn test_object_builder_to_object_builder_no_finish() { + let mut builder = VariantBuilder::new(); + let mut object_builder = builder.new_object(); + object_builder.insert("first", 1i8); + + // Create a nested object builder but never finish it + let _nested_object_builder = object_builder.new_object("nested"); + + object_builder.insert("second", 2i8); + + // The parent object should only contain the original fields + object_builder.finish().unwrap(); + let (metadata, value) = builder.finish(); + let variant = Variant::try_new(&metadata, &value).unwrap(); + let obj = variant.as_object().unwrap(); + assert_eq!(obj.len(), 2); + assert_eq!(obj.get("first"), Some(Variant::Int8(1))); + assert_eq!(obj.get("second"), Some(Variant::Int8(2))); + } } From 850eacaa0207172f60c9a189471ba3c49804e579 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Thu, 3 Jul 2025 17:29:39 -0700 Subject: [PATCH 3/7] misc cleanups --- parquet-variant/src/builder.rs | 99 +++++++++++++++++----------------- 1 file changed, 49 insertions(+), 50 deletions(-) diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index 72d715c7a708..dc917b020dbd 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -61,17 +61,6 @@ fn write_offset(buf: &mut Vec, value: usize, nbytes: u8) { buf.extend_from_slice(&bytes[..nbytes as usize]); } -fn write_header(buf: &mut Vec, header_byte: u8, is_large: bool, num_items: usize) { - buf.push(header_byte); - - if is_large { - let num_items = num_items as u32; - buf.extend_from_slice(&num_items.to_le_bytes()); - } else { - let num_items = num_items as u8; - buf.push(num_items); - }; -} #[derive(Default)] struct ValueBuffer(Vec); @@ -231,6 +220,36 @@ impl ValueBuffer { } } } + + /// Writes out the header byte for a variant object or list + fn append_header(&mut self, header_byte: u8, is_large: bool, num_items: usize) { + let buf = self.inner_mut(); + buf.push(header_byte); + + if is_large { + let num_items = num_items as u32; + buf.extend_from_slice(&num_items.to_le_bytes()); + } else { + let num_items = num_items as u8; + buf.push(num_items); + }; + } + + /// Writes out the offsets for an array of offsets, including the final offset (data size). + fn append_offset_array( + &mut self, + offsets: impl IntoIterator, + data_size: Option, + nbytes: u8, + ) { + let buf = self.inner_mut(); + for offset in offsets { + write_offset(buf, offset, nbytes); + } + if let Some(data_size) = data_size { + write_offset(buf, data_size, nbytes); + } + } } #[derive(Default)] @@ -291,7 +310,7 @@ impl MetadataBuilder { write_offset(&mut metadata, cur_offset, offset_size); // Write string data - for key in self.field_names.iter() { + for key in self.field_names { metadata.extend_from_slice(key.as_bytes()); } @@ -353,15 +372,13 @@ impl ParentState<'_> { } // Performs any parent-specific aspects of finishing, after the child has appended all necessary - // bytes to the parent's value buffer. ListBuilder captures the new value's ending offset; + // bytes to the parent's value buffer. ListBuilder records the new value's starting offset; // ObjectBuilder associates the new value's starting offset with its field id; VariantBuilder // doesn't need anything special. fn finish(&mut self, starting_offset: usize) { match self { ParentState::Variant { .. } => (), - ParentState::List { - buffer, offsets, .. - } => offsets.push(buffer.offset()), + ParentState::List { offsets, .. } => offsets.push(starting_offset), ParentState::Object { metadata_builder, fields, @@ -603,7 +620,7 @@ impl<'a> ListBuilder<'a> { fn new(parent_state: ParentState<'a>, validate_unique_fields: bool) -> Self { Self { parent_state, - offsets: vec![0], + offsets: vec![], buffer: ValueBuffer::default(), validate_unique_fields, } @@ -646,15 +663,14 @@ impl<'a> ListBuilder<'a> { /// Appends a new primitive value to this list pub fn append_value<'m, 'd, T: Into>>(&mut self, value: T) { + self.offsets.push(self.buffer.offset()); self.buffer.append_non_nested_value(value); - let element_end = self.buffer.offset(); - self.offsets.push(element_end); } /// Finalizes this list and appends it to its parent, which otherwise remains unmodified. pub fn finish(mut self) { let data_size = self.buffer.offset(); - let num_elements = self.offsets.len() - 1; + let num_elements = self.offsets.len(); let is_large = num_elements > u8::MAX as usize; let offset_size = int_size(data_size); @@ -663,19 +679,11 @@ impl<'a> ListBuilder<'a> { let starting_offset = parent_buffer.offset(); // Write header - write_header( - parent_buffer.inner_mut(), - array_header(is_large, offset_size), - is_large, - num_elements, - ); - - // Write offsets - for offset in &self.offsets { - write_offset(parent_buffer.inner_mut(), *offset, offset_size); - } + let header = array_header(is_large, offset_size); + parent_buffer.append_header(header, is_large, num_elements); - // Append values + // Write out the offset array followed by the value bytes + parent_buffer.append_offset_array(self.offsets, Some(data_size), offset_size); parent_buffer.append_slice(self.buffer.inner()); self.parent_state.finish(starting_offset); } @@ -796,25 +804,16 @@ impl<'a> ObjectBuilder<'a> { let starting_offset = parent_buffer.offset(); // Write header - write_header( - parent_buffer.inner_mut(), - object_header(is_large, id_size, offset_size), - is_large, - num_fields, - ); + let header = object_header(is_large, id_size, offset_size); + parent_buffer.append_header(header, is_large, num_fields); // Write field IDs (sorted order) - for (&id, _) in &self.fields { - write_offset(parent_buffer.inner_mut(), id as usize, id_size); - } - - // Write field offsets - for (_, &offset) in &self.fields { - write_offset(parent_buffer.inner_mut(), offset, offset_size); - } - - write_offset(parent_buffer.inner_mut(), data_size, offset_size); + let ids = self.fields.keys().map(|id| *id as usize); + parent_buffer.append_offset_array(ids, None, offset_size); + // Write the field offset array, followed by the value bytes + let offsets = self.fields.into_values(); + parent_buffer.append_offset_array(offsets, Some(data_size), offset_size); parent_buffer.append_slice(self.buffer.inner()); self.parent_state.finish(starting_offset); @@ -822,8 +821,8 @@ impl<'a> ObjectBuilder<'a> { } } -/// Trait that abstracts functionality from Variant construction implementations, namely -/// [`VariantBuilder`], [`ListBuilder`] and [`ObjectFieldBuilder`] to minimize code duplication. +/// Trait that abstracts functionality from Variant construction implementations, such as +/// [`VariantBuilder`] and [`ListBuilder`], to minimize code duplication. pub(crate) trait VariantBuilderExt<'m, 'v> { fn append_value(&mut self, value: impl Into>); From ba6d38936ff09dc04df49a77bf92c69c9d72bd12 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Fri, 4 Jul 2025 06:00:07 -0700 Subject: [PATCH 4/7] fix --- parquet-variant/src/builder.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index dc917b020dbd..da1f62341def 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -809,7 +809,7 @@ impl<'a> ObjectBuilder<'a> { // Write field IDs (sorted order) let ids = self.fields.keys().map(|id| *id as usize); - parent_buffer.append_offset_array(ids, None, offset_size); + parent_buffer.append_offset_array(ids, None, id_size); // Write the field offset array, followed by the value bytes let offsets = self.fields.into_values(); From 4946d01ed930c1051a0dee9e3de1f53751dafbe5 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Fri, 4 Jul 2025 13:35:37 -0700 Subject: [PATCH 5/7] docs typo --- parquet-variant/src/builder.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index da1f62341def..8d962939e846 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -750,7 +750,7 @@ impl<'a> ObjectBuilder<'a> { (state, self.validate_unique_fields) } - /// Returns an object builder that can be used to append a new (nested) object to this list. + /// Returns an object builder that can be used to append a new (nested) object to this object. /// /// WARNING: The builder will have no effect unless/until [`ObjectBuilder::finish`] is called. pub fn new_object<'b>(&'b mut self, key: &'b str) -> ObjectBuilder<'b> { @@ -758,7 +758,7 @@ impl<'a> ObjectBuilder<'a> { ObjectBuilder::new(parent_state, validate_unique_fields) } - /// Returns a list builder that can be used to append a new (nested) list to this list. + /// Returns a list builder that can be used to append a new (nested) list to this object. /// /// WARNING: The builder will have no effect unless/until [`ListBuilder::finish`] is called. pub fn new_list<'b>(&'b mut self, key: &'b str) -> ListBuilder<'b> { From 6f443ecdee8de926e3e4cf23c6c25ec02fb93782 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Fri, 4 Jul 2025 15:04:20 -0700 Subject: [PATCH 6/7] expanded testing --- parquet-variant/src/builder.rs | 171 +++++++++++++++++++++--- parquet-variant/src/utils.rs | 1 + parquet-variant/src/variant/metadata.rs | 10 ++ 3 files changed, 166 insertions(+), 16 deletions(-) diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index e50c204d8979..78412fbb8f72 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -890,6 +890,7 @@ impl<'m, 'v> VariantBuilderExt<'m, 'v> for VariantBuilder { #[cfg(test)] mod tests { use super::*; + use crate::VariantMetadata; #[test] fn test_simple_usage() { @@ -1600,14 +1601,19 @@ mod tests { fn test_variant_builder_to_list_builder_no_finish() { // Create a list builder but never finish it let mut builder = VariantBuilder::new(); - let list_builder = builder.new_list(); + let mut list_builder = builder.new_list(); + list_builder.append_value("hi"); drop(list_builder); builder.append_value(42i8); // The original builder should be unchanged let (metadata, value) = builder.finish(); - let variant = Variant::try_new(&metadata, &value).unwrap(); + let metadata = VariantMetadata::try_new(&metadata).unwrap(); + assert!(metadata.is_empty()); + + let variant = Variant::try_new_with_metadata(metadata, &value).unwrap(); + assert!(metadata.is_empty()); assert_eq!(variant, Variant::Int8(42)); } @@ -1615,25 +1621,31 @@ mod tests { fn test_variant_builder_to_object_builder_no_finish() { // Create an object builder but never finish it let mut builder = VariantBuilder::new(); - let object_builder = builder.new_object(); + let mut object_builder = builder.new_object(); + object_builder.insert("name", "unknown"); drop(object_builder); builder.append_value(42i8); // The original builder should be unchanged let (metadata, value) = builder.finish(); - let variant = Variant::try_new(&metadata, &value).unwrap(); + let metadata = VariantMetadata::try_new(&metadata).unwrap(); + assert_eq!(metadata.len(), 1); + assert_eq!(&metadata[0], "name"); // not rolled back + + let variant = Variant::try_new_with_metadata(metadata, &value).unwrap(); assert_eq!(variant, Variant::Int8(42)); } #[test] - fn test_list_builder_to_list_builder_no_finish() { + fn test_list_builder_to_list_builder_inner_no_finish() { let mut builder = VariantBuilder::new(); let mut list_builder = builder.new_list(); list_builder.append_value(1i8); // Create a nested list builder but never finish it - let nested_list_builder = list_builder.new_list(); + let mut nested_list_builder = list_builder.new_list(); + nested_list_builder.append_value("hi"); drop(nested_list_builder); list_builder.append_value(2i8); @@ -1641,7 +1653,10 @@ mod tests { // The parent list should only contain the original values list_builder.finish(); let (metadata, value) = builder.finish(); - let variant = Variant::try_new(&metadata, &value).unwrap(); + let metadata = VariantMetadata::try_new(&metadata).unwrap(); + assert!(metadata.is_empty()); + + let variant = Variant::try_new_with_metadata(metadata, &value).unwrap(); let list = variant.as_list().unwrap(); assert_eq!(list.len(), 2); assert_eq!(list.get(0).unwrap(), Variant::Int8(1)); @@ -1649,13 +1664,39 @@ mod tests { } #[test] - fn test_list_builder_to_object_builder_no_finish() { + fn test_list_builder_to_list_builder_outer_no_finish() { + let mut builder = VariantBuilder::new(); + let mut list_builder = builder.new_list(); + list_builder.append_value(1i8); + + // Create a nested list builder and finish it + let mut nested_list_builder = list_builder.new_list(); + nested_list_builder.append_value("hi"); + nested_list_builder.finish(); + + // Drop the outer list builder without finishing it + drop(list_builder); + + builder.append_value(2i8); + + // Only the second attempt should appear in the final variant + let (metadata, value) = builder.finish(); + let metadata = VariantMetadata::try_new(&metadata).unwrap(); + assert!(metadata.is_empty()); + + let variant = Variant::try_new_with_metadata(metadata, &value).unwrap(); + assert_eq!(variant, Variant::Int8(2)); + } + + #[test] + fn test_list_builder_to_object_builder_inner_no_finish() { let mut builder = VariantBuilder::new(); let mut list_builder = builder.new_list(); list_builder.append_value(1i8); // Create a nested object builder but never finish it - let nested_object_builder = list_builder.new_object(); + let mut nested_object_builder = list_builder.new_object(); + nested_object_builder.insert("name", "unknown"); drop(nested_object_builder); list_builder.append_value(2i8); @@ -1663,7 +1704,11 @@ mod tests { // The parent list should only contain the original values list_builder.finish(); let (metadata, value) = builder.finish(); - let variant = Variant::try_new(&metadata, &value).unwrap(); + let metadata = VariantMetadata::try_new(&metadata).unwrap(); + assert_eq!(metadata.len(), 1); + assert_eq!(&metadata[0], "name"); // not rolled back + + let variant = Variant::try_new_with_metadata(metadata, &value).unwrap(); let list = variant.as_list().unwrap(); assert_eq!(list.len(), 2); assert_eq!(list.get(0).unwrap(), Variant::Int8(1)); @@ -1671,13 +1716,40 @@ mod tests { } #[test] - fn test_object_builder_to_list_builder_no_finish() { + fn test_list_builder_to_object_builder_outer_no_finish() { + let mut builder = VariantBuilder::new(); + let mut list_builder = builder.new_list(); + list_builder.append_value(1i8); + + // Create a nested object builder and finish it + let mut nested_object_builder = list_builder.new_object(); + nested_object_builder.insert("name", "unknown"); + nested_object_builder.finish().unwrap(); + + // Drop the outer list builder without finishing it + drop(list_builder); + + builder.append_value(2i8); + + // Only the second attempt should appear in the final variant + let (metadata, value) = builder.finish(); + let metadata = VariantMetadata::try_new(&metadata).unwrap(); + assert_eq!(metadata.len(), 1); + assert_eq!(&metadata[0], "name"); // not rolled back + + let variant = Variant::try_new_with_metadata(metadata, &value).unwrap(); + assert_eq!(variant, Variant::Int8(2)); + } + + #[test] + fn test_object_builder_to_list_builder_inner_no_finish() { let mut builder = VariantBuilder::new(); let mut object_builder = builder.new_object(); object_builder.insert("first", 1i8); // Create a nested list builder but never finish it - let nested_list_builder = object_builder.new_list("nested"); + let mut nested_list_builder = object_builder.new_list("nested"); + nested_list_builder.append_value("hi"); drop(nested_list_builder); object_builder.insert("second", 2i8); @@ -1685,7 +1757,12 @@ mod tests { // The parent object should only contain the original fields object_builder.finish().unwrap(); let (metadata, value) = builder.finish(); - let variant = Variant::try_new(&metadata, &value).unwrap(); + let metadata = VariantMetadata::try_new(&metadata).unwrap(); + assert_eq!(metadata.len(), 2); + assert_eq!(&metadata[0], "first"); + assert_eq!(&metadata[1], "second"); + + let variant = Variant::try_new_with_metadata(metadata, &value).unwrap(); let obj = variant.as_object().unwrap(); assert_eq!(obj.len(), 2); assert_eq!(obj.get("first"), Some(Variant::Int8(1))); @@ -1693,13 +1770,41 @@ mod tests { } #[test] - fn test_object_builder_to_object_builder_no_finish() { + fn test_object_builder_to_list_builder_outer_no_finish() { + let mut builder = VariantBuilder::new(); + let mut object_builder = builder.new_object(); + object_builder.insert("first", 1i8); + + // Create a nested list builder and finish it + let mut nested_list_builder = object_builder.new_list("nested"); + nested_list_builder.append_value("hi"); + nested_list_builder.finish(); + + // Drop the outer object builder without finishing it + drop(object_builder); + + builder.append_value(2i8); + + // Only the second attempt should appear in the final variant + let (metadata, value) = builder.finish(); + let metadata = VariantMetadata::try_new(&metadata).unwrap(); + assert_eq!(metadata.len(), 2); + assert_eq!(&metadata[0], "first"); + assert_eq!(&metadata[1], "nested"); // not rolled back + + let variant = Variant::try_new_with_metadata(metadata, &value).unwrap(); + assert_eq!(variant, Variant::Int8(2)); + } + + #[test] + fn test_object_builder_to_object_builder_inner_no_finish() { let mut builder = VariantBuilder::new(); let mut object_builder = builder.new_object(); object_builder.insert("first", 1i8); // Create a nested object builder but never finish it - let nested_object_builder = object_builder.new_object("nested"); + let mut nested_object_builder = object_builder.new_object("nested"); + nested_object_builder.insert("name", "unknown"); drop(nested_object_builder); object_builder.insert("second", 2i8); @@ -1707,10 +1812,44 @@ mod tests { // The parent object should only contain the original fields object_builder.finish().unwrap(); let (metadata, value) = builder.finish(); - let variant = Variant::try_new(&metadata, &value).unwrap(); + let metadata = VariantMetadata::try_new(&metadata).unwrap(); + assert_eq!(metadata.len(), 3); + assert_eq!(&metadata[0], "first"); + assert_eq!(&metadata[1], "name"); // not rolled back + assert_eq!(&metadata[2], "second"); + + let variant = Variant::try_new_with_metadata(metadata, &value).unwrap(); let obj = variant.as_object().unwrap(); assert_eq!(obj.len(), 2); assert_eq!(obj.get("first"), Some(Variant::Int8(1))); assert_eq!(obj.get("second"), Some(Variant::Int8(2))); } + + #[test] + fn test_object_builder_to_object_builder_outer_no_finish() { + let mut builder = VariantBuilder::new(); + let mut object_builder = builder.new_object(); + object_builder.insert("first", 1i8); + + // Create a nested object builder and finish it + let mut nested_object_builder = object_builder.new_object("nested"); + nested_object_builder.insert("name", "unknown"); + nested_object_builder.finish().unwrap(); + + // Drop the outer object builder without finishing it + drop(object_builder); + + builder.append_value(2i8); + + // Only the second attempt should appear in the final variant + let (metadata, value) = builder.finish(); + let metadata = VariantMetadata::try_new(&metadata).unwrap(); + assert_eq!(metadata.len(), 3); + assert_eq!(&metadata[0], "first"); // not rolled back + assert_eq!(&metadata[1], "name"); // not rolled back + assert_eq!(&metadata[2], "nested"); // not rolled back + + let variant = Variant::try_new_with_metadata(metadata, &value).unwrap(); + assert_eq!(variant, Variant::Int8(2)); + } } diff --git a/parquet-variant/src/utils.rs b/parquet-variant/src/utils.rs index 765ea04ae6ae..1d3f7542d70b 100644 --- a/parquet-variant/src/utils.rs +++ b/parquet-variant/src/utils.rs @@ -72,6 +72,7 @@ pub(crate) fn first_byte_from_slice(slice: &[u8]) -> Result { .first() .copied() .ok_or_else(|| ArrowError::InvalidArgumentError("Received empty bytes".to_string())) + .inspect_err(|e| panic!("{e}")) } /// Helper to get a &str from a slice at the given offset and range, or an error if invalid. diff --git a/parquet-variant/src/variant/metadata.rs b/parquet-variant/src/variant/metadata.rs index 46d89557bfae..742f586fb3b4 100644 --- a/parquet-variant/src/variant/metadata.rs +++ b/parquet-variant/src/variant/metadata.rs @@ -206,6 +206,16 @@ impl<'m> VariantMetadata<'m> { Ok(new_self) } + /// The number of metadata dictionary entries + pub fn len(&self) -> usize { + self.dictionary_size + } + + /// True if this metadata dictionary contains no entries + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + /// True if this instance is fully [validated] for panic-free infallible accesses. /// /// [validated]: Self#Validation From 6eb2b8726c1a2a9199aef5b82b6884e1c2b7c430 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Sat, 5 Jul 2025 19:38:50 -0700 Subject: [PATCH 7/7] remove stray debugging panic --- parquet-variant/src/utils.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/parquet-variant/src/utils.rs b/parquet-variant/src/utils.rs index 1d3f7542d70b..765ea04ae6ae 100644 --- a/parquet-variant/src/utils.rs +++ b/parquet-variant/src/utils.rs @@ -72,7 +72,6 @@ pub(crate) fn first_byte_from_slice(slice: &[u8]) -> Result { .first() .copied() .ok_or_else(|| ArrowError::InvalidArgumentError("Received empty bytes".to_string())) - .inspect_err(|e| panic!("{e}")) } /// Helper to get a &str from a slice at the given offset and range, or an error if invalid.