From 2b8f55825260e83b783fd7f61902a393d2f2319e Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 11 Jul 2025 11:51:01 -0400 Subject: [PATCH 1/3] Add VariantBuilder::new_with_buffers --- parquet-variant/src/builder.rs | 175 ++++++++++++++++++++++++++++++--- 1 file changed, 161 insertions(+), 14 deletions(-) diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index 33608d27cbb7..1408a5732812 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -61,9 +61,21 @@ fn write_offset(buf: &mut Vec, value: usize, nbytes: u8) { buf.extend_from_slice(&bytes[..nbytes as usize]); } -#[derive(Default)] +#[derive(Debug, Default)] struct ValueBuffer(Vec); +impl ValueBuffer { + /// Construct a ValueBuffer that will write to a new underlying `Vec` + fn new() -> Self { + Default::default() + } + + /// Construct a ValueBuffer from an existing buffer + fn from_existing(existing: Vec) -> Self { + Self(existing) + } +} + impl ValueBuffer { fn append_u8(&mut self, term: u8) { self.0.push(term); @@ -252,16 +264,26 @@ impl ValueBuffer { } } -#[derive(Default)] +#[derive(Default, Debug)] struct MetadataBuilder { // Field names -- field_ids are assigned in insert order field_names: IndexSet, // flag that checks if field names by insertion order are also lexicographically sorted is_sorted: bool, + + /// Output buffer. Metadata is written to the end of this buffer + metadata_buffer: Vec, } impl MetadataBuilder { + /// Specify that the metadata builder should write to the specified + /// buffer. + pub fn with_metadata_buffer(mut self, metadata_buffer: Vec) -> Self { + self.metadata_buffer = metadata_buffer; + self + } + /// Upsert field name to dictionary, return its ID fn upsert_field_name(&mut self, field_name: &str) -> u32 { let (id, new_entry) = self.field_names.insert_full(field_name.to_string()); @@ -307,6 +329,12 @@ impl MetadataBuilder { // Calculate metadata size let total_dict_size: usize = self.metadata_size(); + let Self { + field_names, + is_sorted, + mut metadata_buffer, + } = self; + // Determine appropriate offset size based on the larger of dict size or total string size let max_offset = std::cmp::max(total_dict_size, nkeys); let offset_size = int_size(max_offset); @@ -315,29 +343,29 @@ impl MetadataBuilder { let string_start = offset_start + (nkeys + 1) * offset_size as usize; let metadata_size = string_start + total_dict_size; - let mut metadata = Vec::with_capacity(metadata_size); + metadata_buffer.reserve(metadata_size); // Write header: version=1, field names are sorted, with calculated offset_size - metadata.push(0x01 | (self.is_sorted as u8) << 4 | ((offset_size - 1) << 6)); + metadata_buffer.push(0x01 | (is_sorted as u8) << 4 | ((offset_size - 1) << 6)); // Write dictionary size - write_offset(&mut metadata, nkeys, offset_size); + write_offset(&mut metadata_buffer, nkeys, offset_size); // Write offsets let mut cur_offset = 0; - for key in self.field_names.iter() { - write_offset(&mut metadata, cur_offset, offset_size); + for key in field_names.iter() { + write_offset(&mut metadata_buffer, cur_offset, offset_size); cur_offset += key.len(); } // Write final offset - write_offset(&mut metadata, cur_offset, offset_size); + write_offset(&mut metadata_buffer, cur_offset, offset_size); // Write string data - for key in self.field_names { - metadata.extend_from_slice(key.as_bytes()); + for key in field_names { + metadata_buffer.extend_from_slice(key.as_bytes()); } - metadata + metadata_buffer } } @@ -570,6 +598,41 @@ impl ParentState<'_> { /// ); /// /// ``` +/// # Example: Reusing Buffers +/// +/// You can use the [`VariantBuffer`] to write into existing buffers (for +/// example to write multiple variants back to back in the same buffer) +/// +/// ``` +/// // we will write two variants back to back +/// use parquet_variant::{Variant, VariantBuilder}; +/// // Append 12345 +/// let mut builder = VariantBuilder::new(); +/// builder.append_value(12345); +/// let (metadata, value) = builder.finish(); +/// // remember where the first variant ends +/// let (first_meta_offset, first_meta_len) = (0, metadata.len()); +/// let (first_value_offset, first_value_len) = (0, value.len()); +/// +/// // now, append a second variant to the same buffers +/// let mut builder = VariantBuilder::new_with_buffers(metadata, value); +/// builder.append_value("Foo"); +/// let (metadata, value) = builder.finish(); +/// +/// // The variants can be referenced in their appropriate location +/// let variant1 = Variant::new( +/// &metadata[first_meta_offset..first_meta_len], +/// &value[first_value_offset..first_value_len] +/// ); +/// assert_eq!(variant1, Variant::Int32(12345)); +/// +/// let variant2 = Variant::new( +/// &metadata[first_meta_len..], +/// &value[first_value_len..] +/// ); +/// assert_eq!(variant2, Variant::from("Foo")); +/// ``` +/// /// # Example: Unique Field Validation /// /// This example shows how enabling unique field validation will cause an error @@ -626,8 +689,7 @@ impl ParentState<'_> { /// let (metadata, value) = builder.finish(); /// let variant = Variant::try_new(&metadata, &value).unwrap(); /// ``` -/// -#[derive(Default)] +#[derive(Default, Debug)] pub struct VariantBuilder { buffer: ValueBuffer, metadata_builder: MetadataBuilder, @@ -635,14 +697,25 @@ pub struct VariantBuilder { } impl VariantBuilder { + /// Create a new VariantBuilder with new underlying buffer pub fn new() -> Self { Self { - buffer: ValueBuffer::default(), + buffer: ValueBuffer::new(), metadata_builder: MetadataBuilder::default(), validate_unique_fields: false, } } + /// Create a new VariantBuilder that will write the metadata and values to + /// the specified buffers. + pub fn new_with_buffers(metadata_buffer: Vec, value_buffer: Vec) -> Self { + Self { + buffer: ValueBuffer::from_existing(value_buffer), + metadata_builder: MetadataBuilder::default().with_metadata_buffer(metadata_buffer), + validate_unique_fields: false, + } + } + /// Enables validation of unique field keys in nested objects. /// /// This setting is propagated to all [`ObjectBuilder`]s created through this [`VariantBuilder`] @@ -1916,6 +1989,80 @@ mod tests { assert_eq!(metadata.num_field_names(), 3); } + /// Test reusing buffers with nested objects + #[test] + fn test_with_existing_buffers_nested() { + let mut builder = VariantBuilder::new(); + append_test_list(&mut builder); + let (m1, v1) = builder.finish(); + let variant1 = Variant::new(&m1, &v1); + + let mut builder = VariantBuilder::new(); + append_test_object(&mut builder); + let (m2, v2) = builder.finish(); + let variant2 = Variant::new(&m2, &v2); + + let mut builder = VariantBuilder::new(); + builder.append_value("This is a string"); + let (m3, v3) = builder.finish(); + let variant3 = Variant::new(&m3, &v3); + + // Now, append those three variants to the a new buffer that is reused + let mut builder = VariantBuilder::new(); + append_test_list(&mut builder); + let (metadata, value) = builder.finish(); + let (meta1_offset, meta1_end) = (0, metadata.len()); + let (value1_offset, value1_end) = (0, value.len()); + + // reuse same buffer + let mut builder = VariantBuilder::new_with_buffers(metadata, value); + append_test_object(&mut builder); + let (metadata, value) = builder.finish(); + let (meta2_offset, meta2_end) = (meta1_end, metadata.len()); + let (value2_offset, value2_end) = (value1_end, value.len()); + + // Append a string + let mut builder = VariantBuilder::new_with_buffers(metadata, value); + builder.append_value("This is a string"); + let (metadata, value) = builder.finish(); + let (meta3_offset, meta3_end) = (meta2_end, metadata.len()); + let (value3_offset, value3_end) = (value2_end, value.len()); + + // verify we can read the variants back correctly + let roundtrip1 = Variant::new( + &metadata[meta1_offset..meta1_end], + &value[value1_offset..value1_end], + ); + assert_eq!(roundtrip1, variant1,); + + let roundtrip2 = Variant::new( + &metadata[meta2_offset..meta2_end], + &value[value2_offset..value2_end], + ); + assert_eq!(roundtrip2, variant2,); + + let roundtrip3 = Variant::new( + &metadata[meta3_offset..meta3_end], + &value[value3_offset..value3_end], + ); + assert_eq!(roundtrip3, variant3); + } + + /// append a simple List variant + fn append_test_list(builder: &mut VariantBuilder) { + let mut list = builder.new_list(); + list.append_value(1234); + list.append_value("a string value"); + list.finish(); + } + + /// append an object variant + fn append_test_object(builder: &mut VariantBuilder) { + let mut obj = builder.new_object(); + obj.insert("a", true); + obj.finish().unwrap(); + } + #[test] fn test_variant_builder_to_list_builder_no_finish() { // Create a list builder but never finish it From 365b0047a4561a9d5e8996d4ca9a954371f7a4be Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 11 Jul 2025 14:59:21 -0400 Subject: [PATCH 2/3] Fix docs --- parquet-variant/src/builder.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index 1408a5732812..6e572e4eb0bb 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -600,7 +600,7 @@ impl ParentState<'_> { /// ``` /// # Example: Reusing Buffers /// -/// You can use the [`VariantBuffer`] to write into existing buffers (for +/// You can use the [`VariantBuilder`] to write into existing buffers (for /// example to write multiple variants back to back in the same buffer) /// /// ``` From af926fe92045108df0c0d2a5e78d0e84f1e2b45a Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 13 Jul 2025 07:54:24 -0400 Subject: [PATCH 3/3] Suggestions from @scovich --- parquet-variant/src/builder.rs | 46 +++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 12 deletions(-) diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index 6e572e4eb0bb..15ae9a964191 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -61,6 +61,13 @@ fn write_offset(buf: &mut Vec, value: usize, nbytes: u8) { buf.extend_from_slice(&bytes[..nbytes as usize]); } +/// Wrapper around a `Vec` that provides methods for appending +/// primitive values, variant types, and metadata. +/// +/// This is used internally by the builders to construct the +/// the `value` field for [`Variant`] values. +/// +/// You can reuse an existing `Vec` by using the `from` impl #[derive(Debug, Default)] struct ValueBuffer(Vec); @@ -69,10 +76,17 @@ impl ValueBuffer { fn new() -> Self { Default::default() } +} - /// Construct a ValueBuffer from an existing buffer - fn from_existing(existing: Vec) -> Self { - Self(existing) +impl From> for ValueBuffer { + fn from(value: Vec) -> Self { + Self(value) + } +} + +impl From for Vec { + fn from(value_buffer: ValueBuffer) -> Self { + value_buffer.0 } } @@ -94,7 +108,7 @@ impl ValueBuffer { } fn into_inner(self) -> Vec { - self.0 + self.into() } fn inner_mut(&mut self) -> &mut Vec { @@ -264,6 +278,11 @@ impl ValueBuffer { } } +/// Builder for constructing metadata for [`Variant`] values. +/// +/// This is used internally by the [`VariantBuilder`] to construct the metadata +/// +/// You can use an existing `Vec` as the metadata buffer by using the `from` impl. #[derive(Default, Debug)] struct MetadataBuilder { // Field names -- field_ids are assigned in insert order @@ -276,14 +295,17 @@ struct MetadataBuilder { metadata_buffer: Vec, } -impl MetadataBuilder { - /// Specify that the metadata builder should write to the specified - /// buffer. - pub fn with_metadata_buffer(mut self, metadata_buffer: Vec) -> Self { - self.metadata_buffer = metadata_buffer; - self +/// Create a new MetadataBuilder that will write to the specified metadata buffer +impl From> for MetadataBuilder { + fn from(metadata_buffer: Vec) -> Self { + Self { + metadata_buffer, + ..Default::default() + } } +} +impl MetadataBuilder { /// Upsert field name to dictionary, return its ID fn upsert_field_name(&mut self, field_name: &str) -> u32 { let (id, new_entry) = self.field_names.insert_full(field_name.to_string()); @@ -710,8 +732,8 @@ impl VariantBuilder { /// the specified buffers. pub fn new_with_buffers(metadata_buffer: Vec, value_buffer: Vec) -> Self { Self { - buffer: ValueBuffer::from_existing(value_buffer), - metadata_builder: MetadataBuilder::default().with_metadata_buffer(metadata_buffer), + buffer: ValueBuffer::from(value_buffer), + metadata_builder: MetadataBuilder::from(metadata_buffer), validate_unique_fields: false, } }