From 81e3847e50c321aaa8916fa1104e3c191b9fb682 Mon Sep 17 00:00:00 2001 From: Malthe Karbo Date: Wed, 21 May 2025 15:16:59 +0200 Subject: [PATCH 01/42] feat: Initial READ Api for Variant in parquet-variant crate --- parquet-variant/Cargo.toml | 3 + parquet-variant/src/decoder.rs | 167 ++++++++++++++++++++++ parquet-variant/src/lib.rs | 10 ++ parquet-variant/src/test_variant.rs | 49 +++++++ parquet-variant/src/variant.rs | 213 ++++++++++++++++++++++++++++ 5 files changed, 442 insertions(+) create mode 100644 parquet-variant/src/decoder.rs create mode 100644 parquet-variant/src/test_variant.rs create mode 100644 parquet-variant/src/variant.rs diff --git a/parquet-variant/Cargo.toml b/parquet-variant/Cargo.toml index 60c9b316cdf1..9c77bd5b72fc 100644 --- a/parquet-variant/Cargo.toml +++ b/parquet-variant/Cargo.toml @@ -31,6 +31,9 @@ edition = { workspace = true } rust-version = { workspace = true } [dependencies] +arrow-schema = "55.1.0" +strum = "0.27.1" +strum_macros = "0.27.1" [lib] diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs new file mode 100644 index 000000000000..d5d28d7fb7b6 --- /dev/null +++ b/parquet-variant/src/decoder.rs @@ -0,0 +1,167 @@ +use crate::variant::VariantType; +use arrow_schema::ArrowError; +use std::{array::TryFromSliceError, str}; + +#[derive(Debug, Clone, Copy)] +pub enum VariantBasicType { + Primitive = 0, + ShortString = 1, + Object = 2, + Array = 3, +} + +#[derive(Debug, Clone, Copy)] +pub enum VariantPrimitiveType { + Null = 0, + BooleanTrue = 1, + BooleanFalse = 2, + Int8 = 3, + // TODO: Add 'legs' for the rest, once API is agreed upon + String = 16, +} + +/// Extracts the basic type from a header byte +pub(crate) fn get_basic_type(header: u8) -> Result { + // See https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-encoding + let basic_type = header & 0x03; // Basic type is encoded in the first 2 bits + let basic_type = match basic_type { + 0 => VariantBasicType::Primitive, + 1 => VariantBasicType::ShortString, + 2 => VariantBasicType::Object, + 3 => VariantBasicType::Array, + _ => { + return Err(ArrowError::InvalidArgumentError(format!( + "unknown basic type: {}", + basic_type + ))) + } + }; + Ok(basic_type) +} + +/// Extracts the primitive type from a header byte +pub(crate) fn get_primitive_type(header: u8) -> Result { + // See https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-encoding + //// Primitive type is encoded in the last 6 bits of the header byte + let primitive_type = (header >> 2) & 0x3F; + let primitive_type = match primitive_type { + 0 => VariantPrimitiveType::Null, + 1 => VariantPrimitiveType::BooleanTrue, + 2 => VariantPrimitiveType::BooleanFalse, + 3 => VariantPrimitiveType::Int8, + // TODO: Add 'legs' for the rest, once API is agreed upon + 16 => VariantPrimitiveType::String, + _ => { + return Err(ArrowError::InvalidArgumentError(format!( + "unknown primitive type: {}", + primitive_type + ))) + } + }; + Ok(primitive_type) +} + +/// Extracts the variant type from the value section of a variant. The variant +/// type is defined as the set of all basic types and all primitive types. +pub fn get_variant_type(value: &[u8]) -> Result { + let header = value[0]; + let variant_type = match get_basic_type(header)? { + VariantBasicType::Primitive => match get_primitive_type(header)? { + VariantPrimitiveType::Null => VariantType::Null, + VariantPrimitiveType::Int8 => VariantType::Int8, + VariantPrimitiveType::BooleanTrue => VariantType::BooleanTrue, + VariantPrimitiveType::BooleanFalse => VariantType::BooleanFalse, + // TODO: Add 'legs' for the rest, once API is agreed upon + VariantPrimitiveType::String => VariantType::String, + }, + VariantBasicType::ShortString => VariantType::ShortString, + VariantBasicType::Object => VariantType::Object, + VariantBasicType::Array => VariantType::Array, + }; + Ok(variant_type) +} + +/// To be used in `map_err` when unpacking an integer from a slice of bytes. +fn map_try_from_slice_error(e: TryFromSliceError) -> ArrowError { + ArrowError::InvalidArgumentError(e.to_string()) +} + +/// Constructs the error message for an invalid UTF-8 string. +fn invalid_utf8_err() -> ArrowError { + ArrowError::InvalidArgumentError("invalid UTF-8 string".to_string()) +} + +/// Decodes an Int8 from the value section of a variant. +pub(crate) fn decode_int8(value: &[u8]) -> Result { + let value = i8::from_le_bytes([value[1]]); + Ok(value) +} + +/// Decodes a long string from the value section of a variant. +pub(crate) fn decode_long_string(value: &[u8]) -> Result<&str, ArrowError> { + let len = + u32::from_le_bytes(value[1..=4].try_into().map_err(map_try_from_slice_error)?) as usize; + let string_bytes = &value[5..5 + len]; + let string = str::from_utf8(string_bytes).map_err(|_| invalid_utf8_err())?; + Ok(string) +} + +/// Decodes a short string from the value section of a variant. +pub(crate) fn decode_short_string(value: &[u8]) -> Result<&str, ArrowError> { + let len = ((value[0] & 0b11111100) >> 2) as usize; + let string_bytes = &value[1..1 + len]; + let string = str::from_utf8(string_bytes).map_err(|_| invalid_utf8_err())?; + Ok(string) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_i8() -> Result<(), ArrowError> { + let value = [ + 0 | 3 << 2, // Primitive type for i8 + 42, + ]; + let result = decode_int8(&value)?; + assert_eq!(result, 42); + Ok(()) + } + + #[test] + fn test_short_string() -> Result<(), ArrowError> { + let value = [ + 1 | 5 << 2, // Basic type for short string | length of short string + 'H' as u8, + 'e' as u8, + 'l' as u8, + 'l' as u8, + 'o' as u8, + 'o' as u8, + ]; + let result = decode_short_string(&value)?; + assert_eq!(result, "Hello"); + Ok(()) + } + + #[test] + fn test_string() -> Result<(), ArrowError> { + let value = [ + 0 | 16 << 2, // Basic type for short string | length of short string + 5, + 0, + 0, + 0, // Length of string + 'H' as u8, + 'e' as u8, + 'l' as u8, + 'l' as u8, + 'o' as u8, + 'o' as u8, + ]; + let result = decode_long_string(&value)?; + assert_eq!(result, "Hello"); + Ok(()) + } +} diff --git a/parquet-variant/src/lib.rs b/parquet-variant/src/lib.rs index 6289f86a263f..8e27ebf989f2 100644 --- a/parquet-variant/src/lib.rs +++ b/parquet-variant/src/lib.rs @@ -26,3 +26,13 @@ //! If you are interested in helping, you can find more information on the GitHub [Variant issue] //! //! [Variant issue]: https://github.com/apache/arrow-rs/issues/6736 + +// TODO: dead code removal +#[allow(dead_code)] +mod decoder; +// TODO: dead code removal +#[allow(dead_code)] +mod variant; + +#[cfg(test)] +mod test_variant; diff --git a/parquet-variant/src/test_variant.rs b/parquet-variant/src/test_variant.rs new file mode 100644 index 000000000000..cfd60d436945 --- /dev/null +++ b/parquet-variant/src/test_variant.rs @@ -0,0 +1,49 @@ +//! End-to-end check: (almost) every sample from apache/parquet-testing/variant +//! can be parsed into our `Variant`. + +// NOTE: We keep this file separate rather than a test mod inside variant.rs because it should be +// moved to the test folder later +use std::fs; +use std::path::{Path, PathBuf}; + +use crate::variant::Variant; +use arrow_schema::ArrowError; + +fn cases_dir() -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")) + .join("..") + .join("parquet-testing") + .join("variant") +} + +fn load_case(name: &str) -> Result<(Vec, Vec), ArrowError> { + let root = cases_dir(); + let meta = fs::read(root.join(format!("{name}.metadata")))?; + let val = fs::read(root.join(format!("{name}.value")))?; + Ok((meta, val)) +} + +fn get_cases() -> Vec<(&'static str, Variant<'static, 'static>)> { + vec![ + ("primitive_boolean_false", Variant::BooleanFalse), + ("primitive_boolean_true", Variant::BooleanTrue), + ("primitive_int8", Variant::Int8(42)), + // Using the From trait + ("primitive_string", Variant::from("This string is longer than 64 bytes and therefore does not fit in a short_string and it also includes several non ascii characters such as 🐢, šŸ’–, ♄\u{fe0f}, šŸŽ£ and 🤦!!")), + // Using the From trait + ("short_string", Variant::from("Less than 64 bytes (ā¤\u{fe0f} with utf8)")), + // TODO Reenable when https://github.com/apache/parquet-testing/issues/81 is fixed + // ("primitive_null", Variant::Null), + ] +} + +#[test] +fn variant() -> Result<(), ArrowError> { + let cases = get_cases(); + for (case, want) in cases { + let (metadata, value) = load_case(case)?; + let got = Variant::try_new(&metadata, &value)?; + assert_eq!(got, want); + } + Ok(()) +} diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs new file mode 100644 index 000000000000..91b3dc503148 --- /dev/null +++ b/parquet-variant/src/variant.rs @@ -0,0 +1,213 @@ +use std::{borrow::Cow, ops::Index}; + +use crate::decoder::{self, get_variant_type}; +use arrow_schema::ArrowError; +use strum_macros::EnumDiscriminants; + +#[derive(Clone, Copy, Debug, PartialEq)] +pub struct VariantMetadata<'m> { + bytes: &'m [u8], +} + +impl<'m> VariantMetadata<'m> { + /// View the raw bytes (needed by very low-level decoders) + #[inline] + pub const fn as_bytes(&self) -> &'m [u8] { + self.bytes + } + + pub fn is_sorted(&self) -> bool { + todo!() + } + + pub fn dict_len(&self) -> usize { + todo!() + } +} + +#[derive(Clone, Copy, Debug, PartialEq)] +pub struct VariantObject<'m, 'v> { + pub metadata: VariantMetadata<'m>, + pub value: &'v [u8], +} + +impl<'m, 'v> VariantObject<'m, 'v> { + pub fn fields(&self) -> Result)>, ArrowError> { + todo!(); + #[allow(unreachable_code)] // Just to infer the return type + Ok(vec![].into_iter()) + } + + pub fn field(&self, _name: &'m str) -> Result, ArrowError> { + todo!() + } +} + +#[derive(Clone, Copy, Debug, PartialEq)] +pub struct VariantArray<'m, 'v> { + pub metadata: VariantMetadata<'m>, + pub value: &'v [u8], +} + +// TODO: Let's agree on the API here, also should we expose a way to get the values as a vec of +// variants for those who want it? Would require allocations. +impl<'m, 'v> VariantArray<'m, 'v> { + pub fn len(&self) -> usize { + todo!() + } + + pub fn values(&self) -> Result>, ArrowError> { + todo!(); + #[allow(unreachable_code)] // Just to infer the return type + Ok(vec![].into_iter()) + } +} + +impl<'m, 'v> Index for VariantArray<'m, 'v> { + type Output = Variant<'m, 'v>; + + fn index(&self, _index: usize) -> &Self::Output { + todo!() + } +} + +/// Variant value. May contain references to metadata and value +// TODO: Add copy if no Cow on String and Shortstring? +#[derive(Clone, Debug, PartialEq, EnumDiscriminants)] +#[strum_discriminants(name(VariantType))] +pub enum Variant<'m, 'v> { + // TODO: Add 'legs' for the rest of the primitive types, once API is agreed upon + Null, + Int8(i8), + + BooleanTrue, + BooleanFalse, + + // only need the *value* buffer + // TODO: Do we want Cow<'v, str> over &'v str? It eanbles From - discuss on PR + String(Cow<'v, str>), + ShortString(Cow<'v, str>), + + // need both metadata & value + Object(VariantObject<'m, 'v>), + Array(VariantArray<'m, 'v>), +} + +impl<'m, 'v> Variant<'m, 'v> { + /// Parse the buffers and return the appropriate variant. + pub fn try_new(metadata: &'m [u8], value: &'v [u8]) -> Result { + Ok(match get_variant_type(value)? { + VariantType::Null => Variant::Null, + VariantType::BooleanTrue => Variant::BooleanTrue, + VariantType::BooleanFalse => Variant::BooleanFalse, + + VariantType::Int8 => Variant::Int8(decoder::decode_int8(value)?), + + // TODO: Add 'legs' for the rest of the primitive types, once API is agreed upon + VariantType::String => { + Variant::String(Cow::Borrowed(decoder::decode_long_string(value)?)) + } + + VariantType::ShortString => { + Variant::ShortString(Cow::Borrowed(decoder::decode_short_string(value)?)) + } + + VariantType::Object => Variant::Object(VariantObject { + metadata: VariantMetadata { bytes: metadata }, + value, + }), + VariantType::Array => Variant::Array(VariantArray { + metadata: VariantMetadata { bytes: metadata }, + value, + }), + }) + } + + pub fn as_null(&self) -> Option<()> { + match self { + Variant::Null => Some(()), + _ => None, + } + } + + pub fn as_boolean(&self) -> Option { + match self { + Variant::BooleanTrue => Some(true), + Variant::BooleanFalse => Some(false), + _ => None, + } + } + + pub fn as_string(&'v self) -> Option<&'v str> { + match self { + Variant::String(s) | Variant::ShortString(s) => Some(s), + _ => None, + } + } + + pub fn as_int8(&self) -> Option { + match self { + Variant::Int8(i) => Some(*i), + _ => None, + } + } + + /// Borrow the raw metadata, if this variant has any. + pub fn metadata(&self) -> Option<&'m [u8]> { + match self { + Variant::Object(VariantObject { metadata, .. }) + | Variant::Array(VariantArray { metadata, .. }) => Some(metadata.as_bytes()), + _ => None, + } + } + + /// Borrow the raw value bytes, if present. + pub fn value(&'v self) -> Option<&'v [u8]> { + match self { + // Both arms bind `value` with the same type + Variant::Object(VariantObject { value, .. }) + | Variant::Array(VariantArray { value, .. }) => Some(*value), + + // Short and long strings borrow from inside the slice + Variant::String(s) | Variant::ShortString(s) => Some(s.as_bytes()), + + _ => None, + } + } +} + +impl<'m, 'v> From for Variant<'m, 'v> { + fn from(value: i8) -> Self { + Variant::Int8(value) + } +} + +impl<'m, 'v> From for Variant<'m, 'v> { + fn from(value: bool) -> Self { + if value { + Variant::BooleanTrue + } else { + Variant::BooleanFalse + } + } +} + +impl<'m, 'v> From<&'v str> for Variant<'m, 'v> { + fn from(value: &'v str) -> Self { + if value.len() < 64 { + Variant::ShortString(Cow::Borrowed(value)) + } else { + Variant::String(Cow::Borrowed(value)) + } + } +} + +impl<'m, 'v> From for Variant<'m, 'v> { + fn from(value: String) -> Self { + if value.len() < 64 { + Variant::ShortString(Cow::Owned(value)) + } else { + Variant::String(Cow::Owned(value)) + } + } +} From 9075dd4161a7cb45d827396d6ff077dd21906aa4 Mon Sep 17 00:00:00 2001 From: Malthe Karbo Date: Wed, 21 May 2025 15:17:25 +0200 Subject: [PATCH 02/42] chore: API refinements, questions for discussions --- parquet-variant/src/decoder.rs | 2 + parquet-variant/src/variant.rs | 97 +++++++++++++++++++++++++++++++++- 2 files changed, 98 insertions(+), 1 deletion(-) diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs index d5d28d7fb7b6..a92b3a53bf00 100644 --- a/parquet-variant/src/decoder.rs +++ b/parquet-variant/src/decoder.rs @@ -1,3 +1,5 @@ +// NOTE: Largely based on the implementation of @PinkCrow007 in https://github.com/apache/arrow-rs/pull/7452 +// And the feedback there. use crate::variant::VariantType; use arrow_schema::ArrowError; use std::{array::TryFromSliceError, str}; diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 91b3dc503148..0c3af4d4d9db 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -20,9 +20,104 @@ impl<'m> VariantMetadata<'m> { todo!() } - pub fn dict_len(&self) -> usize { + #[inline] + pub fn dict_len(&self) -> Result { + let dict_len_bytes = &self.bytes[1..self.offset_size()? as usize + 1]; + let dict_len = usize::from_le_bytes(dict_len_bytes.try_into().map_err(|e| { + ArrowError::InvalidArgumentError(format!( + "Unable to convert dictionary_size bytes into usize: {}", + e, + )) + })?); + Ok(dict_len) + } + pub fn version(&self) -> usize { todo!() } + + /// Get the offset by index + #[inline] + pub fn get_offset_by(&self, index: usize) -> Result { + todo!() + } + + #[inline] + pub fn header(&self) -> u8 { + self.bytes[0] + } + + #[inline] + pub fn offset_size_minus_one(&self) -> Result { + if self.bytes.is_empty() { + Err(ArrowError::InvalidArgumentError( + "Tried to get offset_size_minus_one from header, but self.bytes buffer is emtpy." + .to_string(), + )) + } else { + Ok(self.bytes[0] & (0b11 << 6)) // Grab the last 2 bits + } + } + + #[inline] + pub fn offset_size(&self) -> Result { + Ok(self.offset_size_minus_one()? + 1) + } + + /// Get the offset by index + // TODO: Do we want this kind of API? + // TODO: Test once API is agreed upon + #[inline] + pub fn offsets(&'m self) -> Result + 'm, ArrowError> { + struct OffsetIterators<'m> { + buffer: &'m [u8], + total: usize, + seen: usize, + offset_size: usize, + } + impl<'m> Iterator for OffsetIterators<'m> { + type Item = (usize, usize); // (start, end) positions of the bytes + + fn next(&mut self) -> Option { + // +1 to skip the first offset + if self.seen < self.total { + let start = usize::from_le_bytes( + self.buffer[(self.seen ) * self.offset_size + 1 // +1 to skip header + ..(self.seen ) * self.offset_size + 1] + .try_into() + .ok()?, + ); + self.seen += 1; + let end = usize::from_le_bytes( + self.buffer[(self.seen ) * self.offset_size + 1 // +1 to skip header + ..(self.seen ) * self.offset_size + 1] + .try_into() + .ok()?, + ); + + Some((start, end)) + } else { + None + } + } + } + let iterator: OffsetIterators = OffsetIterators { + buffer: self.bytes, + total: self.dict_len()?, + seen: 0, + offset_size: self.offset_size()? as usize, + }; + Ok(iterator) + } + /// Get the key-name-bytes by index + pub fn get_by(&self, index: usize) -> Result<&'m str, ArrowError> { + todo!() + } + /// Get all key-names as string + pub fn fields(&self) -> impl Iterator { + // Do the same as for offsets + todo!(); + vec![].into_iter() + } } #[derive(Clone, Copy, Debug, PartialEq)] From 1ebbad7c5f4fd9220450307395dd5f2ad34e33cf Mon Sep 17 00:00:00 2001 From: Malthe Karbo Date: Wed, 21 May 2025 15:40:18 +0200 Subject: [PATCH 03/42] chore: comments & cleanup --- parquet-variant/src/decoder.rs | 2 +- parquet-variant/src/variant.rs | 28 ++++++++++++++++++++-------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs index a92b3a53bf00..7ec691888f5f 100644 --- a/parquet-variant/src/decoder.rs +++ b/parquet-variant/src/decoder.rs @@ -18,7 +18,7 @@ pub enum VariantPrimitiveType { BooleanTrue = 1, BooleanFalse = 2, Int8 = 3, - // TODO: Add 'legs' for the rest, once API is agreed upon + // TODO: Add 'legs' for the rest of primitives, once API is agreed upon String = 16, } diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 0c3af4d4d9db..09c667bc0cc3 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -5,6 +5,7 @@ use arrow_schema::ArrowError; use strum_macros::EnumDiscriminants; #[derive(Clone, Copy, Debug, PartialEq)] +/// Encodes the Variant Metadata, see the Variant spec file for more information pub struct VariantMetadata<'m> { bytes: &'m [u8], } @@ -16,11 +17,12 @@ impl<'m> VariantMetadata<'m> { self.bytes } + /// Whether the dictionary keys are sorted and unique pub fn is_sorted(&self) -> bool { todo!() } - #[inline] + /// Get the dict length pub fn dict_len(&self) -> Result { let dict_len_bytes = &self.bytes[1..self.offset_size()? as usize + 1]; let dict_len = usize::from_le_bytes(dict_len_bytes.try_into().map_err(|e| { @@ -36,17 +38,27 @@ impl<'m> VariantMetadata<'m> { } /// Get the offset by index - #[inline] pub fn get_offset_by(&self, index: usize) -> Result { todo!() } - #[inline] + /// Get the header byte, which has the following form + /// 7 6 5 4 3 0 + /// +-------+---+---+---------------+ + /// header | | | | version | + /// +-------+---+---+---------------+ + /// ^ ^ + /// | +-- sorted_strings + /// +-- offset_size_minus_one + /// The version is a 4-bit value that must always contain the value 1. + /// - sorted_strings is a 1-bit value indicating whether dictionary strings are sorted and unique. + /// - offset_size_minus_one is a 2-bit value providing the number of bytes per dictionary size and offset field. + /// - The actual number of bytes, offset_size, is offset_size_minus_one + 1 pub fn header(&self) -> u8 { self.bytes[0] } - #[inline] + /// Get the offset_minus_one value from the header pub fn offset_size_minus_one(&self) -> Result { if self.bytes.is_empty() { Err(ArrowError::InvalidArgumentError( @@ -58,15 +70,14 @@ impl<'m> VariantMetadata<'m> { } } - #[inline] + /// Get the offset_size pub fn offset_size(&self) -> Result { Ok(self.offset_size_minus_one()? + 1) } - /// Get the offset by index + /// Get the offsets as an iterator // TODO: Do we want this kind of API? // TODO: Test once API is agreed upon - #[inline] pub fn offsets(&'m self) -> Result + 'm, ArrowError> { struct OffsetIterators<'m> { buffer: &'m [u8], @@ -112,7 +123,8 @@ impl<'m> VariantMetadata<'m> { pub fn get_by(&self, index: usize) -> Result<&'m str, ArrowError> { todo!() } - /// Get all key-names as string + /// Get all key-names as an Iterator of strings + // TODO: Result pub fn fields(&self) -> impl Iterator { // Do the same as for offsets todo!(); From b3ecdd9d366c77df0871e77dd32fcd3f6ab6abdf Mon Sep 17 00:00:00 2001 From: Malthe Karbo Date: Wed, 21 May 2025 15:41:43 +0200 Subject: [PATCH 04/42] chore: typo --- parquet-variant/src/variant.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 09c667bc0cc3..9118debac702 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -191,7 +191,7 @@ pub enum Variant<'m, 'v> { BooleanFalse, // only need the *value* buffer - // TODO: Do we want Cow<'v, str> over &'v str? It eanbles From - discuss on PR + // TODO: Do we want Cow<'v, str> over &'v str? It enables From - discuss on PR String(Cow<'v, str>), ShortString(Cow<'v, str>), From 9688fa48d6ab3850d759d5cc9fa0fddf68c14447 Mon Sep 17 00:00:00 2001 From: Malthe Karbo Date: Wed, 21 May 2025 16:09:30 +0200 Subject: [PATCH 05/42] feat: buffer access validations --- parquet-variant/src/decoder.rs | 30 ++++++++++++++++++++++++++++++ parquet-variant/src/variant.rs | 22 +++++++++++++++++++--- 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs index 7ec691888f5f..493529f2e0af 100644 --- a/parquet-variant/src/decoder.rs +++ b/parquet-variant/src/decoder.rs @@ -66,6 +66,11 @@ pub(crate) fn get_primitive_type(header: u8) -> Result Result { + if value.is_empty() { + return Err(ArrowError::InvalidArgumentError( + "Tried to get variant type from empty buffer array".to_string(), + )); + } let header = value[0]; let variant_type = match get_basic_type(header)? { VariantBasicType::Primitive => match get_primitive_type(header)? { @@ -95,14 +100,29 @@ fn invalid_utf8_err() -> ArrowError { /// Decodes an Int8 from the value section of a variant. pub(crate) fn decode_int8(value: &[u8]) -> Result { + if value.is_empty() { + return Err(ArrowError::InvalidArgumentError( + "Got empty value buffer so can't decode into int8.".to_string(), + )); + } let value = i8::from_le_bytes([value[1]]); Ok(value) } /// Decodes a long string from the value section of a variant. pub(crate) fn decode_long_string(value: &[u8]) -> Result<&str, ArrowError> { + if value.len() < 5 { + return Err(ArrowError::InvalidArgumentError( + "Tried to decode value buffer into long_string, but it's too short (len<5)." + .to_string(), + )); + } let len = u32::from_le_bytes(value[1..=4].try_into().map_err(map_try_from_slice_error)?) as usize; + if value.len() < len + 5 { + let err_str = format!("The length of the buffer for the long_string is soo short, it is {} and it should be at least {} ({} < {} + 5)", value.len(), len + 5 , value.len(), len); + return Err(ArrowError::InvalidArgumentError(err_str)); + } let string_bytes = &value[5..5 + len]; let string = str::from_utf8(string_bytes).map_err(|_| invalid_utf8_err())?; Ok(string) @@ -110,7 +130,17 @@ pub(crate) fn decode_long_string(value: &[u8]) -> Result<&str, ArrowError> { /// Decodes a short string from the value section of a variant. pub(crate) fn decode_short_string(value: &[u8]) -> Result<&str, ArrowError> { + if value.is_empty() { + return Err(ArrowError::InvalidArgumentError( + "Tried to decode value buffer into short_string, but it's empty.".to_string(), + )); + } let len = ((value[0] & 0b11111100) >> 2) as usize; + + if value.len() < len + 1 { + let err_str = format!("The length of the buffer for the short_string is too short, it is {} and it should be at least {} ({} < {} + 1)", value.len(), len + 1 , value.len(), len); + return Err(ArrowError::InvalidArgumentError(err_str)); + } let string_bytes = &value[1..1 + len]; let string = str::from_utf8(string_bytes).map_err(|_| invalid_utf8_err())?; Ok(string) diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 9118debac702..15a7921b1cb6 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -24,7 +24,16 @@ impl<'m> VariantMetadata<'m> { /// Get the dict length pub fn dict_len(&self) -> Result { - let dict_len_bytes = &self.bytes[1..self.offset_size()? as usize + 1]; + let end_location = self.offset_size()? as usize + 1; + if self.bytes.len() < end_location { + let err_str = format!( + "Invalid metadata bytes, must have at least length {} but has length {}", + &end_location, + self.bytes.len() + ); + return Err(ArrowError::InvalidArgumentError(err_str)); + } + let dict_len_bytes = &self.bytes[1..end_location]; let dict_len = usize::from_le_bytes(dict_len_bytes.try_into().map_err(|e| { ArrowError::InvalidArgumentError(format!( "Unable to convert dictionary_size bytes into usize: {}", @@ -54,8 +63,13 @@ impl<'m> VariantMetadata<'m> { /// - sorted_strings is a 1-bit value indicating whether dictionary strings are sorted and unique. /// - offset_size_minus_one is a 2-bit value providing the number of bytes per dictionary size and offset field. /// - The actual number of bytes, offset_size, is offset_size_minus_one + 1 - pub fn header(&self) -> u8 { - self.bytes[0] + pub fn header(&self) -> Result { + if self.bytes.is_empty() { + return Err(ArrowError::InvalidArgumentError( + "Can't get header from empty buffer".to_string(), + )); + } + Ok(self.bytes[0]) } /// Get the offset_minus_one value from the header @@ -88,6 +102,8 @@ impl<'m> VariantMetadata<'m> { impl<'m> Iterator for OffsetIterators<'m> { type Item = (usize, usize); // (start, end) positions of the bytes + // TODO: Check bounds here or ensure they're correct + fn next(&mut self) -> Option { // +1 to skip the first offset if self.seen < self.total { From 5705687456ba67af849ed929bd233082a599b36b Mon Sep 17 00:00:00 2001 From: Malthe Karbo Date: Wed, 21 May 2025 16:23:01 +0200 Subject: [PATCH 06/42] chore: remove useless API --- parquet-variant/src/variant.rs | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 15a7921b1cb6..b6081af15627 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -275,7 +275,6 @@ impl<'m, 'v> Variant<'m, 'v> { } } - /// Borrow the raw metadata, if this variant has any. pub fn metadata(&self) -> Option<&'m [u8]> { match self { Variant::Object(VariantObject { metadata, .. }) @@ -283,20 +282,6 @@ impl<'m, 'v> Variant<'m, 'v> { _ => None, } } - - /// Borrow the raw value bytes, if present. - pub fn value(&'v self) -> Option<&'v [u8]> { - match self { - // Both arms bind `value` with the same type - Variant::Object(VariantObject { value, .. }) - | Variant::Array(VariantArray { value, .. }) => Some(*value), - - // Short and long strings borrow from inside the slice - Variant::String(s) | Variant::ShortString(s) => Some(s.as_bytes()), - - _ => None, - } - } } impl<'m, 'v> From for Variant<'m, 'v> { From d80d1d86a8f892a73be2a185be76b1d941f011e0 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 21 May 2025 15:41:48 -0400 Subject: [PATCH 07/42] Test out avoiding strum and Cow --- parquet-variant/Cargo.toml | 2 - parquet-variant/src/decoder.rs | 25 ------------- parquet-variant/src/variant.rs | 68 ++++++++++++++-------------------- 3 files changed, 28 insertions(+), 67 deletions(-) diff --git a/parquet-variant/Cargo.toml b/parquet-variant/Cargo.toml index 9c77bd5b72fc..41b127ef14e6 100644 --- a/parquet-variant/Cargo.toml +++ b/parquet-variant/Cargo.toml @@ -32,8 +32,6 @@ rust-version = { workspace = true } [dependencies] arrow-schema = "55.1.0" -strum = "0.27.1" -strum_macros = "0.27.1" [lib] diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs index 493529f2e0af..718d5b0efa02 100644 --- a/parquet-variant/src/decoder.rs +++ b/parquet-variant/src/decoder.rs @@ -1,6 +1,5 @@ // NOTE: Largely based on the implementation of @PinkCrow007 in https://github.com/apache/arrow-rs/pull/7452 // And the feedback there. -use crate::variant::VariantType; use arrow_schema::ArrowError; use std::{array::TryFromSliceError, str}; @@ -63,30 +62,6 @@ pub(crate) fn get_primitive_type(header: u8) -> Result Result { - if value.is_empty() { - return Err(ArrowError::InvalidArgumentError( - "Tried to get variant type from empty buffer array".to_string(), - )); - } - let header = value[0]; - let variant_type = match get_basic_type(header)? { - VariantBasicType::Primitive => match get_primitive_type(header)? { - VariantPrimitiveType::Null => VariantType::Null, - VariantPrimitiveType::Int8 => VariantType::Int8, - VariantPrimitiveType::BooleanTrue => VariantType::BooleanTrue, - VariantPrimitiveType::BooleanFalse => VariantType::BooleanFalse, - // TODO: Add 'legs' for the rest, once API is agreed upon - VariantPrimitiveType::String => VariantType::String, - }, - VariantBasicType::ShortString => VariantType::ShortString, - VariantBasicType::Object => VariantType::Object, - VariantBasicType::Array => VariantType::Array, - }; - Ok(variant_type) -} /// To be used in `map_err` when unpacking an integer from a slice of bytes. fn map_try_from_slice_error(e: TryFromSliceError) -> ArrowError { diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index b6081af15627..293ed0a1e2bd 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -1,8 +1,7 @@ -use std::{borrow::Cow, ops::Index}; +use std::{ops::Index}; -use crate::decoder::{self, get_variant_type}; +use crate::decoder::{self, get_basic_type, get_primitive_type, VariantBasicType, VariantPrimitiveType}; use arrow_schema::ArrowError; -use strum_macros::EnumDiscriminants; #[derive(Clone, Copy, Debug, PartialEq)] /// Encodes the Variant Metadata, see the Variant spec file for more information @@ -196,8 +195,7 @@ impl<'m, 'v> Index for VariantArray<'m, 'v> { /// Variant value. May contain references to metadata and value // TODO: Add copy if no Cow on String and Shortstring? -#[derive(Clone, Debug, PartialEq, EnumDiscriminants)] -#[strum_discriminants(name(VariantType))] +#[derive(Clone, Debug, PartialEq)] pub enum Variant<'m, 'v> { // TODO: Add 'legs' for the rest of the primitive types, once API is agreed upon Null, @@ -206,10 +204,9 @@ pub enum Variant<'m, 'v> { BooleanTrue, BooleanFalse, - // only need the *value* buffer - // TODO: Do we want Cow<'v, str> over &'v str? It enables From - discuss on PR - String(Cow<'v, str>), - ShortString(Cow<'v, str>), + // Note: only need the *value* buffer + String(&'v str), + ShortString(& 'v str), // need both metadata & value Object(VariantObject<'m, 'v>), @@ -219,31 +216,32 @@ pub enum Variant<'m, 'v> { impl<'m, 'v> Variant<'m, 'v> { /// Parse the buffers and return the appropriate variant. pub fn try_new(metadata: &'m [u8], value: &'v [u8]) -> Result { - Ok(match get_variant_type(value)? { - VariantType::Null => Variant::Null, - VariantType::BooleanTrue => Variant::BooleanTrue, - VariantType::BooleanFalse => Variant::BooleanFalse, - - VariantType::Int8 => Variant::Int8(decoder::decode_int8(value)?), - - // TODO: Add 'legs' for the rest of the primitive types, once API is agreed upon - VariantType::String => { - Variant::String(Cow::Borrowed(decoder::decode_long_string(value)?)) - } - - VariantType::ShortString => { - Variant::ShortString(Cow::Borrowed(decoder::decode_short_string(value)?)) - } - - VariantType::Object => Variant::Object(VariantObject { + if value.is_empty() { + return Err(ArrowError::InvalidArgumentError( + "Tried to get variant type from empty buffer array".to_string(), + )); + } + let header = value[0]; + let new_self = match get_basic_type(header)? { + VariantBasicType::Primitive => match get_primitive_type(header)? { + VariantPrimitiveType::Null => Variant::Null, + VariantPrimitiveType::Int8 => Variant::Int8(decoder::decode_int8(value)?), + VariantPrimitiveType::BooleanTrue => Variant::BooleanTrue, + VariantPrimitiveType::BooleanFalse => Variant::BooleanFalse, + // TODO: Add 'legs' for the rest, once API is agreed upon + VariantPrimitiveType::String => Variant::String(decoder::decode_long_string(value)?), + }, + VariantBasicType::ShortString => Variant::ShortString(decoder::decode_short_string(value)?), + VariantBasicType::Object => Variant::Object(VariantObject { metadata: VariantMetadata { bytes: metadata }, value, }), - VariantType::Array => Variant::Array(VariantArray { + VariantBasicType::Array => Variant::Array (VariantArray { metadata: VariantMetadata { bytes: metadata }, value, }), - }) + }; + Ok(new_self) } pub fn as_null(&self) -> Option<()> { @@ -303,19 +301,9 @@ impl<'m, 'v> From for Variant<'m, 'v> { impl<'m, 'v> From<&'v str> for Variant<'m, 'v> { fn from(value: &'v str) -> Self { if value.len() < 64 { - Variant::ShortString(Cow::Borrowed(value)) - } else { - Variant::String(Cow::Borrowed(value)) - } - } -} - -impl<'m, 'v> From for Variant<'m, 'v> { - fn from(value: String) -> Self { - if value.len() < 64 { - Variant::ShortString(Cow::Owned(value)) + Variant::ShortString(value) } else { - Variant::String(Cow::Owned(value)) + Variant::String(value) } } } From 696fa9b7cd97080b1f1633062a8fd6b3d15e5f96 Mon Sep 17 00:00:00 2001 From: Malthe Karbo Date: Thu, 22 May 2025 09:30:13 +0200 Subject: [PATCH 08/42] chore: comments --- parquet-variant/src/decoder.rs | 5 ++--- parquet-variant/src/variant.rs | 27 +++++++++++++++------------ 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs index 718d5b0efa02..f4906bc5b9e0 100644 --- a/parquet-variant/src/decoder.rs +++ b/parquet-variant/src/decoder.rs @@ -17,7 +17,7 @@ pub enum VariantPrimitiveType { BooleanTrue = 1, BooleanFalse = 2, Int8 = 3, - // TODO: Add 'legs' for the rest of primitives, once API is agreed upon + // TODO: Add types for the rest of primitives, once API is agreed upon String = 16, } @@ -50,7 +50,7 @@ pub(crate) fn get_primitive_type(header: u8) -> Result VariantPrimitiveType::BooleanTrue, 2 => VariantPrimitiveType::BooleanFalse, 3 => VariantPrimitiveType::Int8, - // TODO: Add 'legs' for the rest, once API is agreed upon + // TODO: Add types for the rest, once API is agreed upon 16 => VariantPrimitiveType::String, _ => { return Err(ArrowError::InvalidArgumentError(format!( @@ -62,7 +62,6 @@ pub(crate) fn get_primitive_type(header: u8) -> Result ArrowError { ArrowError::InvalidArgumentError(e.to_string()) diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 293ed0a1e2bd..db932d74ff60 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -1,6 +1,8 @@ -use std::{ops::Index}; +use std::ops::Index; -use crate::decoder::{self, get_basic_type, get_primitive_type, VariantBasicType, VariantPrimitiveType}; +use crate::decoder::{ + self, get_basic_type, get_primitive_type, VariantBasicType, VariantPrimitiveType, +}; use arrow_schema::ArrowError; #[derive(Clone, Copy, Debug, PartialEq)] @@ -171,8 +173,6 @@ pub struct VariantArray<'m, 'v> { pub value: &'v [u8], } -// TODO: Let's agree on the API here, also should we expose a way to get the values as a vec of -// variants for those who want it? Would require allocations. impl<'m, 'v> VariantArray<'m, 'v> { pub fn len(&self) -> usize { todo!() @@ -194,10 +194,9 @@ impl<'m, 'v> Index for VariantArray<'m, 'v> { } /// Variant value. May contain references to metadata and value -// TODO: Add copy if no Cow on String and Shortstring? -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, Copy, PartialEq)] pub enum Variant<'m, 'v> { - // TODO: Add 'legs' for the rest of the primitive types, once API is agreed upon + // TODO: Add types for the rest of the primitive types, once API is agreed upon Null, Int8(i8), @@ -206,7 +205,7 @@ pub enum Variant<'m, 'v> { // Note: only need the *value* buffer String(&'v str), - ShortString(& 'v str), + ShortString(&'v str), // need both metadata & value Object(VariantObject<'m, 'v>), @@ -228,15 +227,19 @@ impl<'m, 'v> Variant<'m, 'v> { VariantPrimitiveType::Int8 => Variant::Int8(decoder::decode_int8(value)?), VariantPrimitiveType::BooleanTrue => Variant::BooleanTrue, VariantPrimitiveType::BooleanFalse => Variant::BooleanFalse, - // TODO: Add 'legs' for the rest, once API is agreed upon - VariantPrimitiveType::String => Variant::String(decoder::decode_long_string(value)?), + // TODO: Add types for the rest, once API is agreed upon + VariantPrimitiveType::String => { + Variant::String(decoder::decode_long_string(value)?) + } }, - VariantBasicType::ShortString => Variant::ShortString(decoder::decode_short_string(value)?), + VariantBasicType::ShortString => { + Variant::ShortString(decoder::decode_short_string(value)?) + } VariantBasicType::Object => Variant::Object(VariantObject { metadata: VariantMetadata { bytes: metadata }, value, }), - VariantBasicType::Array => Variant::Array (VariantArray { + VariantBasicType::Array => Variant::Array(VariantArray { metadata: VariantMetadata { bytes: metadata }, value, }), From 798b627dc9daf8a989aee41dd2516033c7a10e80 Mon Sep 17 00:00:00 2001 From: Malthe Karbo Date: Thu, 22 May 2025 09:32:41 +0200 Subject: [PATCH 09/42] chore: rename total to dict_len for consistency --- parquet-variant/src/variant.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index db932d74ff60..0636396d106d 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -96,7 +96,7 @@ impl<'m> VariantMetadata<'m> { pub fn offsets(&'m self) -> Result + 'm, ArrowError> { struct OffsetIterators<'m> { buffer: &'m [u8], - total: usize, + dict_len: usize, seen: usize, offset_size: usize, } @@ -107,7 +107,7 @@ impl<'m> VariantMetadata<'m> { fn next(&mut self) -> Option { // +1 to skip the first offset - if self.seen < self.total { + if self.seen < self.dict_len { let start = usize::from_le_bytes( self.buffer[(self.seen ) * self.offset_size + 1 // +1 to skip header ..(self.seen ) * self.offset_size + 1] @@ -130,7 +130,7 @@ impl<'m> VariantMetadata<'m> { } let iterator: OffsetIterators = OffsetIterators { buffer: self.bytes, - total: self.dict_len()?, + dict_len: self.dict_len()?, seen: 0, offset_size: self.offset_size()? as usize, }; From 8bea9b4c9a9d349bfd3e9bc7cdad3d9f780cdc10 Mon Sep 17 00:00:00 2001 From: Malthe Karbo Date: Thu, 22 May 2025 09:56:08 +0200 Subject: [PATCH 10/42] chore: remove unnecessary masks --- parquet-variant/src/decoder.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs index f4906bc5b9e0..5cd4c5b8c20d 100644 --- a/parquet-variant/src/decoder.rs +++ b/parquet-variant/src/decoder.rs @@ -44,7 +44,7 @@ pub(crate) fn get_basic_type(header: u8) -> Result pub(crate) fn get_primitive_type(header: u8) -> Result { // See https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-encoding //// Primitive type is encoded in the last 6 bits of the header byte - let primitive_type = (header >> 2) & 0x3F; + let primitive_type = header >> 2; let primitive_type = match primitive_type { 0 => VariantPrimitiveType::Null, 1 => VariantPrimitiveType::BooleanTrue, @@ -109,7 +109,7 @@ pub(crate) fn decode_short_string(value: &[u8]) -> Result<&str, ArrowError> { "Tried to decode value buffer into short_string, but it's empty.".to_string(), )); } - let len = ((value[0] & 0b11111100) >> 2) as usize; + let len = (value[0] >> 2) as usize; if value.len() < len + 1 { let err_str = format!("The length of the buffer for the short_string is too short, it is {} and it should be at least {} ({} < {} + 1)", value.len(), len + 1 , value.len(), len); From 2e262f13870b122095417beeabe7c37f3f52b5c3 Mon Sep 17 00:00:00 2001 From: Malthe Karbo Date: Thu, 22 May 2025 18:02:51 +0200 Subject: [PATCH 11/42] chore: Start work on PR comments, validating header, refactor --- parquet-variant/src/decoder.rs | 7 +- parquet-variant/src/lib.rs | 3 + parquet-variant/src/test_variant.rs | 5 +- parquet-variant/src/utils.rs | 42 ++++++ parquet-variant/src/variant.rs | 207 +++++++++++++++++++++++----- 5 files changed, 219 insertions(+), 45 deletions(-) create mode 100644 parquet-variant/src/utils.rs diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs index 5cd4c5b8c20d..c51d225b2955 100644 --- a/parquet-variant/src/decoder.rs +++ b/parquet-variant/src/decoder.rs @@ -3,6 +3,8 @@ use arrow_schema::ArrowError; use std::{array::TryFromSliceError, str}; +use crate::utils::invalid_utf8_err; + #[derive(Debug, Clone, Copy)] pub enum VariantBasicType { Primitive = 0, @@ -67,11 +69,6 @@ fn map_try_from_slice_error(e: TryFromSliceError) -> ArrowError { ArrowError::InvalidArgumentError(e.to_string()) } -/// Constructs the error message for an invalid UTF-8 string. -fn invalid_utf8_err() -> ArrowError { - ArrowError::InvalidArgumentError("invalid UTF-8 string".to_string()) -} - /// Decodes an Int8 from the value section of a variant. pub(crate) fn decode_int8(value: &[u8]) -> Result { if value.is_empty() { diff --git a/parquet-variant/src/lib.rs b/parquet-variant/src/lib.rs index 8e27ebf989f2..a31187daeb69 100644 --- a/parquet-variant/src/lib.rs +++ b/parquet-variant/src/lib.rs @@ -33,6 +33,9 @@ mod decoder; // TODO: dead code removal #[allow(dead_code)] mod variant; +// TODO: dead code removal +#[allow(dead_code)] +mod utils; #[cfg(test)] mod test_variant; diff --git a/parquet-variant/src/test_variant.rs b/parquet-variant/src/test_variant.rs index cfd60d436945..062576701604 100644 --- a/parquet-variant/src/test_variant.rs +++ b/parquet-variant/src/test_variant.rs @@ -6,7 +6,7 @@ use std::fs; use std::path::{Path, PathBuf}; -use crate::variant::Variant; +use crate::variant::{Variant, VariantMetadata}; use arrow_schema::ArrowError; fn cases_dir() -> PathBuf { @@ -42,7 +42,8 @@ fn variant() -> Result<(), ArrowError> { let cases = get_cases(); for (case, want) in cases { let (metadata, value) = load_case(case)?; - let got = Variant::try_new(&metadata, &value)?; + let metadata_header = VariantMetadata::try_new(&metadata)?; + let got = Variant::try_new(&metadata_header, &value)?; assert_eq!(got, want); } Ok(()) diff --git a/parquet-variant/src/utils.rs b/parquet-variant/src/utils.rs new file mode 100644 index 000000000000..dac2296fe3d3 --- /dev/null +++ b/parquet-variant/src/utils.rs @@ -0,0 +1,42 @@ +use std::{array::TryFromSliceError, ops::Range}; + +use arrow_schema::ArrowError; + +pub(crate) fn slice_from_slice(bytes: &[u8], range: Range) -> Result<&[u8], ArrowError> { + bytes.get(range.clone()).ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "Tried to extract {} bytes at offset {} from {}-byte buffer", + range.end - range.start, + range.start, + bytes.len(), + )) + }) +} + +pub(crate) fn array_from_slice( + bytes: &[u8], + offset: usize, +) -> Result<[u8; N], ArrowError> { + let bytes = slice_from_slice(bytes, offset..offset + N)?; + bytes.try_into().map_err(map_try_from_slice_error) +} + +/// To be used in `map_err` when unpacking an integer from a slice of bytes. +pub(crate) fn map_try_from_slice_error(e: TryFromSliceError) -> ArrowError { + ArrowError::InvalidArgumentError(e.to_string()) +} + +pub(crate) fn is_empty_slice(slice: &[u8]) -> Result<(), ArrowError> { + if slice.is_empty() { + return Err(ArrowError::InvalidArgumentError( + "Received zero bytes".to_string(), + )); + } else { + return Ok(()); + } +} + +/// Constructs the error message for an invalid UTF-8 string. +pub(crate) fn invalid_utf8_err() -> ArrowError { + ArrowError::InvalidArgumentError("invalid UTF-8 string".to_string()) +} diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 0636396d106d..9ad9bf44a208 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -1,14 +1,105 @@ -use std::ops::Index; +use crate::utils::{array_from_slice, invalid_utf8_err, slice_from_slice}; +use std::{ + num::TryFromIntError, + ops::{Index, Range}, + str, +}; use crate::decoder::{ self, get_basic_type, get_primitive_type, VariantBasicType, VariantPrimitiveType, }; use arrow_schema::ArrowError; +#[derive(Clone, Debug, Copy, PartialEq)] +enum OffsetSizeBytes { + One = 1, + Two = 2, + Three = 3, + Four = 4, +} + +impl OffsetSizeBytes { + fn try_new(offset_size_minus_one: u8) -> Result { + use OffsetSizeBytes::*; + let result = match offset_size_minus_one { + 0 => One, + 1 => Two, + 2 => Three, + 3 => Four, + _ => { + return Err(ArrowError::InvalidArgumentError( + "offset_size_minus_one must be 0–3".to_string(), + )) + } + }; + Ok(result) + } + + fn unpack_usize( + &self, + bytes: &[u8], + byte_offset: usize, // how many bytes to skip + offset_index: usize, // which offset in an array of offsets + ) -> Result { + use OffsetSizeBytes::*; + let offset = byte_offset + (*self as usize) * offset_index; + let result = match self { + One => u8::from_le_bytes(array_from_slice(bytes, offset)?).into(), + Two => u16::from_le_bytes(array_from_slice(bytes, offset)?).into(), + Three => todo!(), // ugh, endianness + Four => u32::from_le_bytes(array_from_slice(bytes, offset)?) + .try_into() + .map_err(|e: TryFromIntError| ArrowError::InvalidArgumentError(e.to_string()))?, + }; + Ok(result) + } +} + +#[derive(Clone, Debug, Copy, PartialEq)] +pub(crate) struct VariantMetadataHeader { + version: u8, + is_sorted: bool, + /// Note: This is `offset_size_minus_one` + 1 + offset_size: OffsetSizeBytes, +} + +impl<'m> VariantMetadataHeader { + /// Tries to construct the variant metadata header, which has the form + /// 7 6 5 4 3 0 + /// +-------+---+---+---------------+ + /// header | | | | version | + /// +-------+---+---+---------------+ + /// ^ ^ + /// | +-- sorted_strings + /// +-- offset_size_minus_one + /// The version is a 4-bit value that must always contain the value 1. + /// - sorted_strings is a 1-bit value indicating whether dictionary strings are sorted and unique. + /// - offset_size_minus_one is a 2-bit value providing the number of bytes per dictionary size and offset field. + /// - The actual number of bytes, offset_size, is offset_size_minus_one + 1 + pub fn try_new(bytes: &'m [u8]) -> Result { + let Some(header) = bytes.get(0) else { + return Err(ArrowError::InvalidArgumentError( + "Received zero bytes".to_string(), + )); + }; + + let version = header & 0x0F; // First four bits + let is_sorted = (header & 0x10) != 0; // Fifth bit + let offset_size_minus_one = (header >> 6) & 0x03; // Last two bits + + Ok(Self { + version, + is_sorted, + offset_size: OffsetSizeBytes::try_new(offset_size_minus_one)?, + }) + } +} + #[derive(Clone, Copy, Debug, PartialEq)] /// Encodes the Variant Metadata, see the Variant spec file for more information pub struct VariantMetadata<'m> { bytes: &'m [u8], + header: VariantMetadataHeader, } impl<'m> VariantMetadata<'m> { @@ -18,9 +109,40 @@ impl<'m> VariantMetadata<'m> { self.bytes } + pub fn try_new(bytes: &'m [u8]) -> Result { + let header = VariantMetadataHeader::try_new(bytes)?; + let dictionary_size = header.offset_size.unpack_usize(bytes, 1, 0)?; + + // TODO: Refactor, add test for validation + let valid = (0..=dictionary_size) + .map(|i| header.offset_size.unpack_usize(bytes, 1, i + 1)) + .scan(0, |prev, cur| { + let Ok(cur_offset) = cur else { + return Some(false); + }; + *prev = cur_offset; + + // Skip the first offset, which is always 0 + if *prev == 0 { + return Some(true); + } + + let valid = cur_offset > *prev; + Some(valid) + }) + .all(|valid| valid); + + if !valid { + return Err(ArrowError::InvalidArgumentError( + "Offsets are not monotonically increasing".to_string(), + )); + } + Ok(Self { bytes, header }) + } + /// Whether the dictionary keys are sorted and unique pub fn is_sorted(&self) -> bool { - todo!() + self.header.is_sorted } /// Get the dict length @@ -47,32 +169,47 @@ impl<'m> VariantMetadata<'m> { todo!() } - /// Get the offset by index - pub fn get_offset_by(&self, index: usize) -> Result { - todo!() + /// Get the offset by key-index + pub fn get_offset_by(&self, index: usize) -> Result, ArrowError> { + if index >= self.dict_len()? { + return Err(ArrowError::InvalidArgumentError(format!( + "Index {} out of bounds for dictionary of length {}", + index, + self.dict_len()? + ))); + } + + // Skipping the header byte (setting byte_offset = 1) and the dictionary_size (setting offset_index +1) + // TODO: Validate size before looking up? + // TODO: Fix location / bytes here, the index is wrong. + let start = self + .header + .offset_size + .unpack_usize(self.bytes, 1, index + 1)?; + let end = self + .header + .offset_size + .unpack_usize(self.bytes, 1, index + 2)?; + Ok(start..end) } - /// Get the header byte, which has the following form - /// 7 6 5 4 3 0 - /// +-------+---+---+---------------+ - /// header | | | | version | - /// +-------+---+---+---------------+ - /// ^ ^ - /// | +-- sorted_strings - /// +-- offset_size_minus_one - /// The version is a 4-bit value that must always contain the value 1. - /// - sorted_strings is a 1-bit value indicating whether dictionary strings are sorted and unique. - /// - offset_size_minus_one is a 2-bit value providing the number of bytes per dictionary size and offset field. - /// - The actual number of bytes, offset_size, is offset_size_minus_one + 1 - pub fn header(&self) -> Result { - if self.bytes.is_empty() { - return Err(ArrowError::InvalidArgumentError( - "Can't get header from empty buffer".to_string(), - )); + /// Get the key-name by index + pub fn get_by(&self, index: usize) -> Result<&'m str, ArrowError> { + match self.get_offset_by(index) { + Ok(range) => { + let bytes = slice_from_slice(self.bytes, 1 + range.start..1 + range.end)?; + let result = str::from_utf8(bytes).map_err(|_| invalid_utf8_err())?; + Ok(result) + } + Err(e) => Err(e), } - Ok(self.bytes[0]) } + pub fn header(&self) -> VariantMetadataHeader { + self.header + } + + // TODO: Fix this + next two /// Get the offset_minus_one value from the header pub fn offset_size_minus_one(&self) -> Result { if self.bytes.is_empty() { @@ -136,10 +273,7 @@ impl<'m> VariantMetadata<'m> { }; Ok(iterator) } - /// Get the key-name-bytes by index - pub fn get_by(&self, index: usize) -> Result<&'m str, ArrowError> { - todo!() - } + /// Get all key-names as an Iterator of strings // TODO: Result pub fn fields(&self) -> impl Iterator { @@ -151,7 +285,7 @@ impl<'m> VariantMetadata<'m> { #[derive(Clone, Copy, Debug, PartialEq)] pub struct VariantObject<'m, 'v> { - pub metadata: VariantMetadata<'m>, + pub metadata: &'m VariantMetadata<'m>, pub value: &'v [u8], } @@ -169,7 +303,7 @@ impl<'m, 'v> VariantObject<'m, 'v> { #[derive(Clone, Copy, Debug, PartialEq)] pub struct VariantArray<'m, 'v> { - pub metadata: VariantMetadata<'m>, + pub metadata: &'m VariantMetadata<'m>, pub value: &'v [u8], } @@ -214,7 +348,7 @@ pub enum Variant<'m, 'v> { impl<'m, 'v> Variant<'m, 'v> { /// Parse the buffers and return the appropriate variant. - pub fn try_new(metadata: &'m [u8], value: &'v [u8]) -> Result { + pub fn try_new(metadata: &'m VariantMetadata, value: &'v [u8]) -> Result { if value.is_empty() { return Err(ArrowError::InvalidArgumentError( "Tried to get variant type from empty buffer array".to_string(), @@ -235,14 +369,8 @@ impl<'m, 'v> Variant<'m, 'v> { VariantBasicType::ShortString => { Variant::ShortString(decoder::decode_short_string(value)?) } - VariantBasicType::Object => Variant::Object(VariantObject { - metadata: VariantMetadata { bytes: metadata }, - value, - }), - VariantBasicType::Array => Variant::Array(VariantArray { - metadata: VariantMetadata { bytes: metadata }, - value, - }), + VariantBasicType::Object => Variant::Object(VariantObject { metadata, value }), + VariantBasicType::Array => Variant::Array(VariantArray { metadata, value }), }; Ok(new_self) } @@ -272,6 +400,9 @@ impl<'m, 'v> Variant<'m, 'v> { pub fn as_int8(&self) -> Option { match self { Variant::Int8(i) => Some(*i), + // TODO: Add branches for type-widening/shortening when implemting rest of primitives for int + // Variant::Int16(i) => i.try_into().ok(), + // ... _ => None, } } From 715081b7966d3cb5ee9d0979a5b2f1b516480b80 Mon Sep 17 00:00:00 2001 From: Malthe Karbo Date: Thu, 22 May 2025 18:30:29 +0200 Subject: [PATCH 12/42] chore: minor cleanups and utils --- parquet-variant/src/decoder.rs | 45 +++++++++------------------------ parquet-variant/src/utils.rs | 46 +++++++++++++++++++++++++--------- parquet-variant/src/variant.rs | 18 +++---------- 3 files changed, 49 insertions(+), 60 deletions(-) diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs index c51d225b2955..1edc339bfa9e 100644 --- a/parquet-variant/src/decoder.rs +++ b/parquet-variant/src/decoder.rs @@ -3,7 +3,7 @@ use arrow_schema::ArrowError; use std::{array::TryFromSliceError, str}; -use crate::utils::invalid_utf8_err; +use crate::utils::{invalid_utf8_err, non_empty_slice, slice_from_slice}; #[derive(Debug, Clone, Copy)] pub enum VariantBasicType { @@ -71,49 +71,28 @@ fn map_try_from_slice_error(e: TryFromSliceError) -> ArrowError { /// Decodes an Int8 from the value section of a variant. pub(crate) fn decode_int8(value: &[u8]) -> Result { - if value.is_empty() { - return Err(ArrowError::InvalidArgumentError( - "Got empty value buffer so can't decode into int8.".to_string(), - )); - } - let value = i8::from_le_bytes([value[1]]); + let value = i8::from_le_bytes([non_empty_slice(value)?[1]]); Ok(value) } /// Decodes a long string from the value section of a variant. pub(crate) fn decode_long_string(value: &[u8]) -> Result<&str, ArrowError> { - if value.len() < 5 { - return Err(ArrowError::InvalidArgumentError( - "Tried to decode value buffer into long_string, but it's too short (len<5)." - .to_string(), - )); - } - let len = - u32::from_le_bytes(value[1..=4].try_into().map_err(map_try_from_slice_error)?) as usize; - if value.len() < len + 5 { - let err_str = format!("The length of the buffer for the long_string is soo short, it is {} and it should be at least {} ({} < {} + 5)", value.len(), len + 5 , value.len(), len); - return Err(ArrowError::InvalidArgumentError(err_str)); - } - let string_bytes = &value[5..5 + len]; - let string = str::from_utf8(string_bytes).map_err(|_| invalid_utf8_err())?; + let len = u32::from_le_bytes( + slice_from_slice(value, 1..=4)? + .try_into() + .map_err(map_try_from_slice_error)?, + ) as usize; + let string = + str::from_utf8(slice_from_slice(value, 5..5 + len)?).map_err(|_| invalid_utf8_err())?; Ok(string) } /// Decodes a short string from the value section of a variant. pub(crate) fn decode_short_string(value: &[u8]) -> Result<&str, ArrowError> { - if value.is_empty() { - return Err(ArrowError::InvalidArgumentError( - "Tried to decode value buffer into short_string, but it's empty.".to_string(), - )); - } - let len = (value[0] >> 2) as usize; + let len = (non_empty_slice(value)?[0] >> 2) as usize; - if value.len() < len + 1 { - let err_str = format!("The length of the buffer for the short_string is too short, it is {} and it should be at least {} ({} < {} + 1)", value.len(), len + 1 , value.len(), len); - return Err(ArrowError::InvalidArgumentError(err_str)); - } - let string_bytes = &value[1..1 + len]; - let string = str::from_utf8(string_bytes).map_err(|_| invalid_utf8_err())?; + let string = + str::from_utf8(slice_from_slice(value, 1..1 + len)?).map_err(|_| invalid_utf8_err())?; Ok(string) } diff --git a/parquet-variant/src/utils.rs b/parquet-variant/src/utils.rs index dac2296fe3d3..721a44788307 100644 --- a/parquet-variant/src/utils.rs +++ b/parquet-variant/src/utils.rs @@ -1,18 +1,40 @@ -use std::{array::TryFromSliceError, ops::Range}; +use std::{ + array::TryFromSliceError, + ops::{Bound, RangeBounds}, +}; use arrow_schema::ArrowError; -pub(crate) fn slice_from_slice(bytes: &[u8], range: Range) -> Result<&[u8], ArrowError> { - bytes.get(range.clone()).ok_or_else(|| { - ArrowError::InvalidArgumentError(format!( +pub(crate) fn slice_from_slice(bytes: &[u8], range: R) -> Result<&[u8], ArrowError> +where + R: RangeBounds, +{ + // ----- translate RangeBounds → concrete `[start, end_exclusive)` ----- + let start = match range.start_bound() { + Bound::Included(&s) => s, + Bound::Excluded(&s) => s.saturating_add(1), + Bound::Unbounded => 0, + }; + + let end_exclusive = match range.end_bound() { + Bound::Included(&e) => e.saturating_add(1), // inclusive → exclusive + Bound::Excluded(&e) => e, + Bound::Unbounded => bytes.len(), + }; + + // ----- bounds check -------------------------------------------------- + if start > end_exclusive || end_exclusive > bytes.len() { + return Err(ArrowError::InvalidArgumentError(format!( "Tried to extract {} bytes at offset {} from {}-byte buffer", - range.end - range.start, - range.start, + end_exclusive.saturating_sub(start), + start, bytes.len(), - )) - }) -} + ))); + } + // Safe: we just verified the range. + Ok(&bytes[start..end_exclusive]) +} pub(crate) fn array_from_slice( bytes: &[u8], offset: usize, @@ -26,13 +48,13 @@ pub(crate) fn map_try_from_slice_error(e: TryFromSliceError) -> ArrowError { ArrowError::InvalidArgumentError(e.to_string()) } -pub(crate) fn is_empty_slice(slice: &[u8]) -> Result<(), ArrowError> { +pub(crate) fn non_empty_slice(slice: &[u8]) -> Result<&[u8], ArrowError> { if slice.is_empty() { return Err(ArrowError::InvalidArgumentError( - "Received zero bytes".to_string(), + "Received empty bytes".to_string(), )); } else { - return Ok(()); + return Ok(slice); } } diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 9ad9bf44a208..3fadf841f460 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -1,4 +1,4 @@ -use crate::utils::{array_from_slice, invalid_utf8_err, slice_from_slice}; +use crate::utils::{array_from_slice, invalid_utf8_err, non_empty_slice, slice_from_slice}; use std::{ num::TryFromIntError, ops::{Index, Range}, @@ -212,14 +212,7 @@ impl<'m> VariantMetadata<'m> { // TODO: Fix this + next two /// Get the offset_minus_one value from the header pub fn offset_size_minus_one(&self) -> Result { - if self.bytes.is_empty() { - Err(ArrowError::InvalidArgumentError( - "Tried to get offset_size_minus_one from header, but self.bytes buffer is emtpy." - .to_string(), - )) - } else { - Ok(self.bytes[0] & (0b11 << 6)) // Grab the last 2 bits - } + Ok(non_empty_slice(self.bytes)?[0] & (0b11 << 6)) // Grab the last 2 bits } /// Get the offset_size @@ -349,12 +342,7 @@ pub enum Variant<'m, 'v> { impl<'m, 'v> Variant<'m, 'v> { /// Parse the buffers and return the appropriate variant. pub fn try_new(metadata: &'m VariantMetadata, value: &'v [u8]) -> Result { - if value.is_empty() { - return Err(ArrowError::InvalidArgumentError( - "Tried to get variant type from empty buffer array".to_string(), - )); - } - let header = value[0]; + let header = non_empty_slice(value)?[0]; let new_self = match get_basic_type(header)? { VariantBasicType::Primitive => match get_primitive_type(header)? { VariantPrimitiveType::Null => Variant::Null, From b774b38714b01c698f0001b7f2803693af2a762e Mon Sep 17 00:00:00 2001 From: Malthe Karbo Date: Thu, 22 May 2025 18:32:57 +0200 Subject: [PATCH 13/42] chore: remove redundant comments --- parquet-variant/src/utils.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/parquet-variant/src/utils.rs b/parquet-variant/src/utils.rs index 721a44788307..750c984d7a5d 100644 --- a/parquet-variant/src/utils.rs +++ b/parquet-variant/src/utils.rs @@ -9,7 +9,6 @@ pub(crate) fn slice_from_slice(bytes: &[u8], range: R) -> Result<&[u8], Arrow where R: RangeBounds, { - // ----- translate RangeBounds → concrete `[start, end_exclusive)` ----- let start = match range.start_bound() { Bound::Included(&s) => s, Bound::Excluded(&s) => s.saturating_add(1), @@ -22,7 +21,6 @@ where Bound::Unbounded => bytes.len(), }; - // ----- bounds check -------------------------------------------------- if start > end_exclusive || end_exclusive > bytes.len() { return Err(ArrowError::InvalidArgumentError(format!( "Tried to extract {} bytes at offset {} from {}-byte buffer", @@ -32,7 +30,6 @@ where ))); } - // Safe: we just verified the range. Ok(&bytes[start..end_exclusive]) } pub(crate) fn array_from_slice( From 5c668b4f54a266ccb8ec9b4a67ea8d68692d76a9 Mon Sep 17 00:00:00 2001 From: Malthe Karbo Date: Thu, 22 May 2025 18:43:39 +0200 Subject: [PATCH 14/42] chore: fix int_8 to use array_from_slice --- parquet-variant/src/decoder.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs index 1edc339bfa9e..4af9b935574a 100644 --- a/parquet-variant/src/decoder.rs +++ b/parquet-variant/src/decoder.rs @@ -3,7 +3,7 @@ use arrow_schema::ArrowError; use std::{array::TryFromSliceError, str}; -use crate::utils::{invalid_utf8_err, non_empty_slice, slice_from_slice}; +use crate::utils::{array_from_slice, invalid_utf8_err, non_empty_slice, slice_from_slice}; #[derive(Debug, Clone, Copy)] pub enum VariantBasicType { @@ -71,7 +71,7 @@ fn map_try_from_slice_error(e: TryFromSliceError) -> ArrowError { /// Decodes an Int8 from the value section of a variant. pub(crate) fn decode_int8(value: &[u8]) -> Result { - let value = i8::from_le_bytes([non_empty_slice(value)?[1]]); + let value = i8::from_le_bytes(array_from_slice(value, 1)?); Ok(value) } From 0ceaedc739eed948ded3dfbce95c81236b02588d Mon Sep 17 00:00:00 2001 From: Malthe Karbo Date: Thu, 22 May 2025 18:53:25 +0200 Subject: [PATCH 15/42] chore: revert slice_from_slice due to saturation issues at max usize --- parquet-variant/src/decoder.rs | 2 +- parquet-variant/src/utils.rs | 37 +++++++++------------------------- 2 files changed, 10 insertions(+), 29 deletions(-) diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs index 4af9b935574a..7454be807bda 100644 --- a/parquet-variant/src/decoder.rs +++ b/parquet-variant/src/decoder.rs @@ -78,7 +78,7 @@ pub(crate) fn decode_int8(value: &[u8]) -> Result { /// Decodes a long string from the value section of a variant. pub(crate) fn decode_long_string(value: &[u8]) -> Result<&str, ArrowError> { let len = u32::from_le_bytes( - slice_from_slice(value, 1..=4)? + slice_from_slice(value, 1..5)? .try_into() .map_err(map_try_from_slice_error)?, ) as usize; diff --git a/parquet-variant/src/utils.rs b/parquet-variant/src/utils.rs index 750c984d7a5d..532f0e89078d 100644 --- a/parquet-variant/src/utils.rs +++ b/parquet-variant/src/utils.rs @@ -1,36 +1,17 @@ -use std::{ - array::TryFromSliceError, - ops::{Bound, RangeBounds}, -}; +use std::{array::TryFromSliceError, ops::Range}; use arrow_schema::ArrowError; -pub(crate) fn slice_from_slice(bytes: &[u8], range: R) -> Result<&[u8], ArrowError> -where - R: RangeBounds, -{ - let start = match range.start_bound() { - Bound::Included(&s) => s, - Bound::Excluded(&s) => s.saturating_add(1), - Bound::Unbounded => 0, - }; - - let end_exclusive = match range.end_bound() { - Bound::Included(&e) => e.saturating_add(1), // inclusive → exclusive - Bound::Excluded(&e) => e, - Bound::Unbounded => bytes.len(), - }; - - if start > end_exclusive || end_exclusive > bytes.len() { - return Err(ArrowError::InvalidArgumentError(format!( +#[inline] +pub(crate) fn slice_from_slice(bytes: &[u8], range: Range) -> Result<&[u8], ArrowError> { + bytes.get(range.clone()).ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( "Tried to extract {} bytes at offset {} from {}-byte buffer", - end_exclusive.saturating_sub(start), - start, + range.end - range.start, + range.start, bytes.len(), - ))); - } - - Ok(&bytes[start..end_exclusive]) + )) + }) } pub(crate) fn array_from_slice( bytes: &[u8], From 130e1135c37ec6a2ed858048d915676dd15cd5dc Mon Sep 17 00:00:00 2001 From: Malthe Karbo Date: Thu, 22 May 2025 18:59:38 +0200 Subject: [PATCH 16/42] chore: comment on memoize with TODO --- parquet-variant/src/variant.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 3fadf841f460..1fb89a3437e3 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -171,6 +171,7 @@ impl<'m> VariantMetadata<'m> { /// Get the offset by key-index pub fn get_offset_by(&self, index: usize) -> Result, ArrowError> { + // TODO: Should we memoize the offsets? There could be thousands of them (https://github.com/apache/arrow-rs/pull/7535#discussion_r2101351294) if index >= self.dict_len()? { return Err(ArrowError::InvalidArgumentError(format!( "Index {} out of bounds for dictionary of length {}", From 45a71c62d04dbc86faf20fcb3d4c97c95de1d37d Mon Sep 17 00:00:00 2001 From: Malthe Karbo Date: Thu, 22 May 2025 19:19:45 +0200 Subject: [PATCH 17/42] chore: clean-up and fix some things in the variant API --- parquet-variant/src/test_variant.rs | 34 +++++++++++++--- parquet-variant/src/variant.rs | 61 +++++++++++------------------ 2 files changed, 50 insertions(+), 45 deletions(-) diff --git a/parquet-variant/src/test_variant.rs b/parquet-variant/src/test_variant.rs index 062576701604..1337bfc9f599 100644 --- a/parquet-variant/src/test_variant.rs +++ b/parquet-variant/src/test_variant.rs @@ -23,7 +23,7 @@ fn load_case(name: &str) -> Result<(Vec, Vec), ArrowError> { Ok((meta, val)) } -fn get_cases() -> Vec<(&'static str, Variant<'static, 'static>)> { +fn get_primitive_cases() -> Vec<(&'static str, Variant<'static, 'static>)> { vec![ ("primitive_boolean_false", Variant::BooleanFalse), ("primitive_boolean_true", Variant::BooleanTrue), @@ -37,14 +37,36 @@ fn get_cases() -> Vec<(&'static str, Variant<'static, 'static>)> { ] } +fn get_non_primitive_cases() -> Vec<&'static str> { + vec!["object_primitive"] +} + #[test] -fn variant() -> Result<(), ArrowError> { - let cases = get_cases(); +fn variant_primitive() -> Result<(), ArrowError> { + let cases = get_primitive_cases(); for (case, want) in cases { - let (metadata, value) = load_case(case)?; - let metadata_header = VariantMetadata::try_new(&metadata)?; - let got = Variant::try_new(&metadata_header, &value)?; + let (metadata_bytes, value) = load_case(case)?; + let metadata = VariantMetadata::try_new(&metadata_bytes)?; + let got = Variant::try_new(&metadata, &value)?; assert_eq!(got, want); } Ok(()) } + +#[test] +fn variant_non_primitive() -> Result<(), ArrowError> { + let cases = get_non_primitive_cases(); + for case in cases { + let (metadata, value) = load_case(case)?; + let metadata = VariantMetadata::try_new(&metadata)?; + let variant = Variant::try_new(&metadata, &value)?; + match case { + "object_primitive" => { + assert!(matches!(variant, Variant::Object(_))); + assert_eq!(metadata.dictionary_size(), 7); + } + _ => unreachable!(), + } + } + Ok(()) +} diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 1fb89a3437e3..d8c3c6df0d3d 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -1,15 +1,14 @@ +use crate::decoder::{ + self, get_basic_type, get_primitive_type, VariantBasicType, VariantPrimitiveType, +}; use crate::utils::{array_from_slice, invalid_utf8_err, non_empty_slice, slice_from_slice}; +use arrow_schema::ArrowError; use std::{ num::TryFromIntError, ops::{Index, Range}, str, }; -use crate::decoder::{ - self, get_basic_type, get_primitive_type, VariantBasicType, VariantPrimitiveType, -}; -use arrow_schema::ArrowError; - #[derive(Clone, Debug, Copy, PartialEq)] enum OffsetSizeBytes { One = 1, @@ -86,7 +85,6 @@ impl<'m> VariantMetadataHeader { let version = header & 0x0F; // First four bits let is_sorted = (header & 0x10) != 0; // Fifth bit let offset_size_minus_one = (header >> 6) & 0x03; // Last two bits - Ok(Self { version, is_sorted, @@ -100,6 +98,7 @@ impl<'m> VariantMetadataHeader { pub struct VariantMetadata<'m> { bytes: &'m [u8], header: VariantMetadataHeader, + dict_size: usize, } impl<'m> VariantMetadata<'m> { @@ -111,23 +110,24 @@ impl<'m> VariantMetadata<'m> { pub fn try_new(bytes: &'m [u8]) -> Result { let header = VariantMetadataHeader::try_new(bytes)?; - let dictionary_size = header.offset_size.unpack_usize(bytes, 1, 0)?; + // Offset 1, index 0 because first element after header is dictionary size + let dict_size = header.offset_size.unpack_usize(bytes, 1, 0)?; // TODO: Refactor, add test for validation - let valid = (0..=dictionary_size) + let valid = (0..=dict_size) .map(|i| header.offset_size.unpack_usize(bytes, 1, i + 1)) .scan(0, |prev, cur| { let Ok(cur_offset) = cur else { return Some(false); }; - *prev = cur_offset; - // Skip the first offset, which is always 0 if *prev == 0 { + *prev = cur_offset; return Some(true); } let valid = cur_offset > *prev; + *prev = cur_offset; Some(valid) }) .all(|valid| valid); @@ -137,7 +137,11 @@ impl<'m> VariantMetadata<'m> { "Offsets are not monotonically increasing".to_string(), )); } - Ok(Self { bytes, header }) + Ok(Self { + bytes, + header, + dict_size, + }) } /// Whether the dictionary keys are sorted and unique @@ -145,25 +149,9 @@ impl<'m> VariantMetadata<'m> { self.header.is_sorted } - /// Get the dict length - pub fn dict_len(&self) -> Result { - let end_location = self.offset_size()? as usize + 1; - if self.bytes.len() < end_location { - let err_str = format!( - "Invalid metadata bytes, must have at least length {} but has length {}", - &end_location, - self.bytes.len() - ); - return Err(ArrowError::InvalidArgumentError(err_str)); - } - let dict_len_bytes = &self.bytes[1..end_location]; - let dict_len = usize::from_le_bytes(dict_len_bytes.try_into().map_err(|e| { - ArrowError::InvalidArgumentError(format!( - "Unable to convert dictionary_size bytes into usize: {}", - e, - )) - })?); - Ok(dict_len) + /// Get the dictionary size + pub fn dictionary_size(&self) -> usize { + self.dict_size } pub fn version(&self) -> usize { todo!() @@ -172,11 +160,10 @@ impl<'m> VariantMetadata<'m> { /// Get the offset by key-index pub fn get_offset_by(&self, index: usize) -> Result, ArrowError> { // TODO: Should we memoize the offsets? There could be thousands of them (https://github.com/apache/arrow-rs/pull/7535#discussion_r2101351294) - if index >= self.dict_len()? { + if index >= self.dict_size { return Err(ArrowError::InvalidArgumentError(format!( "Index {} out of bounds for dictionary of length {}", - index, - self.dict_len()? + index, self.dict_size ))); } @@ -233,9 +220,7 @@ impl<'m> VariantMetadata<'m> { } impl<'m> Iterator for OffsetIterators<'m> { type Item = (usize, usize); // (start, end) positions of the bytes - - // TODO: Check bounds here or ensure they're correct - + // TODO: Check bounds here or ensure they're correct fn next(&mut self) -> Option { // +1 to skip the first offset if self.seen < self.dict_len { @@ -261,7 +246,7 @@ impl<'m> VariantMetadata<'m> { } let iterator: OffsetIterators = OffsetIterators { buffer: self.bytes, - dict_len: self.dict_len()?, + dict_len: self.dict_size, seen: 0, offset_size: self.offset_size()? as usize, }; @@ -282,14 +267,12 @@ pub struct VariantObject<'m, 'v> { pub metadata: &'m VariantMetadata<'m>, pub value: &'v [u8], } - impl<'m, 'v> VariantObject<'m, 'v> { pub fn fields(&self) -> Result)>, ArrowError> { todo!(); #[allow(unreachable_code)] // Just to infer the return type Ok(vec![].into_iter()) } - pub fn field(&self, _name: &'m str) -> Result, ArrowError> { todo!() } From 032a9d6b8345ce7f6ec0044b618383a1040089be Mon Sep 17 00:00:00 2001 From: Malthe Karbo Date: Thu, 22 May 2025 20:25:08 +0200 Subject: [PATCH 18/42] chore: Iterators, and fix get_by methods and some cleanups --- parquet-variant/src/test_variant.rs | 2 + parquet-variant/src/variant.rs | 139 ++++++++++++++++++---------- 2 files changed, 94 insertions(+), 47 deletions(-) diff --git a/parquet-variant/src/test_variant.rs b/parquet-variant/src/test_variant.rs index 1337bfc9f599..73a53a0a277c 100644 --- a/parquet-variant/src/test_variant.rs +++ b/parquet-variant/src/test_variant.rs @@ -64,6 +64,8 @@ fn variant_non_primitive() -> Result<(), ArrowError> { "object_primitive" => { assert!(matches!(variant, Variant::Object(_))); assert_eq!(metadata.dictionary_size(), 7); + let dict_val = metadata.get_field_by_index(0)?; + assert_eq!(dict_val, "int_field"); } _ => unreachable!(), } diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index d8c3c6df0d3d..019da20c074d 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -45,7 +45,8 @@ impl OffsetSizeBytes { let result = match self { One => u8::from_le_bytes(array_from_slice(bytes, offset)?).into(), Two => u16::from_le_bytes(array_from_slice(bytes, offset)?).into(), - Three => todo!(), // ugh, endianness + // TODO: Do this one + Three => todo!(), Four => u32::from_le_bytes(array_from_slice(bytes, offset)?) .try_into() .map_err(|e: TryFromIntError| ArrowError::InvalidArgumentError(e.to_string()))?, @@ -153,8 +154,8 @@ impl<'m> VariantMetadata<'m> { pub fn dictionary_size(&self) -> usize { self.dict_size } - pub fn version(&self) -> usize { - todo!() + pub fn version(&self) -> u8 { + self.header.version } /// Get the offset by key-index @@ -182,63 +183,61 @@ impl<'m> VariantMetadata<'m> { } /// Get the key-name by index - pub fn get_by(&self, index: usize) -> Result<&'m str, ArrowError> { + pub fn get_field_by_index(&self, index: usize) -> Result<&'m str, ArrowError> { match self.get_offset_by(index) { - Ok(range) => { - let bytes = slice_from_slice(self.bytes, 1 + range.start..1 + range.end)?; - let result = str::from_utf8(bytes).map_err(|_| invalid_utf8_err())?; - Ok(result) - } + Ok(range) => self.get_field_by_offset(range), Err(e) => Err(e), } } - pub fn header(&self) -> VariantMetadataHeader { - self.header - } - - // TODO: Fix this + next two - /// Get the offset_minus_one value from the header - pub fn offset_size_minus_one(&self) -> Result { - Ok(non_empty_slice(self.bytes)?[0] & (0b11 << 6)) // Grab the last 2 bits + /// Gets the field using an offset (Range) - helper method to keep consistent API. + pub fn get_field_by_offset(&self, offset: Range) -> Result<&'m str, ArrowError> { + let dictionary_key_start_byte = 1 // header + + self.header.offset_size as usize // dictionary_size field itself + + (self.dict_size + 1) * (self.header.offset_size as usize); // all offset entries + let dictionary_keys_bytes = + slice_from_slice(self.bytes, dictionary_key_start_byte..self.bytes.len())?; + let dictionary_key_bytes = + slice_from_slice(dictionary_keys_bytes, offset.start..offset.end)?; + let result = str::from_utf8(dictionary_key_bytes).map_err(|_| invalid_utf8_err())?; + Ok(result) } - /// Get the offset_size - pub fn offset_size(&self) -> Result { - Ok(self.offset_size_minus_one()? + 1) + pub fn header(&self) -> VariantMetadataHeader { + self.header } /// Get the offsets as an iterator - // TODO: Do we want this kind of API? - // TODO: Test once API is agreed upon - pub fn offsets(&'m self) -> Result + 'm, ArrowError> { + // TODO: Write tests + pub fn offsets( + &'m self, + ) -> Result, ArrowError>> + 'm, ArrowError> { struct OffsetIterators<'m> { buffer: &'m [u8], + header: &'m VariantMetadataHeader, dict_len: usize, seen: usize, - offset_size: usize, } impl<'m> Iterator for OffsetIterators<'m> { - type Item = (usize, usize); // (start, end) positions of the bytes - // TODO: Check bounds here or ensure they're correct + type Item = Result, ArrowError>; // Range = (start, end) positions of the bytes fn next(&mut self) -> Option { - // +1 to skip the first offset if self.seen < self.dict_len { - let start = usize::from_le_bytes( - self.buffer[(self.seen ) * self.offset_size + 1 // +1 to skip header - ..(self.seen ) * self.offset_size + 1] - .try_into() - .ok()?, - ); + let start = self + .header + .offset_size + // skip header via byte_offset=1 and self.seen + 1 because first is dictionary_size + .unpack_usize(self.buffer, 1, self.seen + 1); + + let end = self + .header + .offset_size + // skip header via byte_offset=1 and self.seen + 2 to get end offset + .unpack_usize(self.buffer, 1, self.seen + 2); self.seen += 1; - let end = usize::from_le_bytes( - self.buffer[(self.seen ) * self.offset_size + 1 // +1 to skip header - ..(self.seen ) * self.offset_size + 1] - .try_into() - .ok()?, - ); - - Some((start, end)) + match (start, end) { + (Ok(start), Ok(end)) => Some(Ok(start..end)), + (Err(e), _) | (_, Err(e)) => Some(Err(e)), + } } else { None } @@ -246,19 +245,65 @@ impl<'m> VariantMetadata<'m> { } let iterator: OffsetIterators = OffsetIterators { buffer: self.bytes, + header: &self.header, dict_len: self.dict_size, seen: 0, - offset_size: self.offset_size()? as usize, }; Ok(iterator) } /// Get all key-names as an Iterator of strings - // TODO: Result - pub fn fields(&self) -> impl Iterator { - // Do the same as for offsets - todo!(); - vec![].into_iter() + // NOTE: Duplicated code due to issues putting Impl's on structs, this is the same as `.offsets` except it + // extracts the field using the offset instead of returning the offset. + pub fn fields( + &'m self, + ) -> Result> + 'm, ArrowError> { + struct FieldIterator<'m> { + buffer: &'m [u8], + header: &'m VariantMetadataHeader, + dict_len: usize, + seen: usize, + } + impl<'m> Iterator for FieldIterator<'m> { + type Item = Result<&'m str, ArrowError>; + fn next(&mut self) -> Option { + if self.seen < self.dict_len { + let start = self + .header + .offset_size + // skip header via byte_offset=1 and self.seen + 1 because first is dictionary_size + .unpack_usize(self.buffer, 1, self.seen + 1); + + let end = self + .header + .offset_size + // skip header via byte_offset=1 and self.seen + 2 to get end offset + .unpack_usize(self.buffer, 1, self.seen + 2); + self.seen += 1; + let result = match (start, end) { + (Ok(start), Ok(end)) => { + // Try to get the slice + match slice_from_slice(self.buffer, 1 + start..1 + end) { + // Get the field and return it + Ok(bytes) => str::from_utf8(bytes).map_err(|_| invalid_utf8_err()), + Err(e) => Err(e), + } + } + (Err(e), _) | (_, Err(e)) => Err(e), + }; + Some(result) + } else { + None + } + } + } + let iterator: FieldIterator = FieldIterator { + buffer: self.bytes, + header: &self.header, + dict_len: self.dict_size, + seen: 0, + }; + Ok(iterator) } } From fe2bf4582be0ae83044c4ab7008b7a5891890a79 Mon Sep 17 00:00:00 2001 From: Malthe Karbo Date: Thu, 22 May 2025 22:50:44 +0200 Subject: [PATCH 19/42] chore: 3rd branch of unpack, docs + some tests --- parquet-variant/src/test_variant.rs | 2 +- parquet-variant/src/variant.rs | 132 +++++++++++++++++++++++++++- 2 files changed, 129 insertions(+), 5 deletions(-) diff --git a/parquet-variant/src/test_variant.rs b/parquet-variant/src/test_variant.rs index 73a53a0a277c..753d01a1ae72 100644 --- a/parquet-variant/src/test_variant.rs +++ b/parquet-variant/src/test_variant.rs @@ -64,7 +64,7 @@ fn variant_non_primitive() -> Result<(), ArrowError> { "object_primitive" => { assert!(matches!(variant, Variant::Object(_))); assert_eq!(metadata.dictionary_size(), 7); - let dict_val = metadata.get_field_by_index(0)?; + let dict_val = metadata.get_field_by(0)?; assert_eq!(dict_val, "int_field"); } _ => unreachable!(), diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 019da20c074d..f67412a382b6 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -18,6 +18,7 @@ enum OffsetSizeBytes { } impl OffsetSizeBytes { + /// Build from the `offset_size_minus_one` bits (see spec). fn try_new(offset_size_minus_one: u8) -> Result { use OffsetSizeBytes::*; let result = match offset_size_minus_one { @@ -34,6 +35,17 @@ impl OffsetSizeBytes { Ok(result) } + /// Return one unsigned little-endian value from `bytes`. + /// + /// * `bytes` – the Variant-metadata buffer. + /// * `byte_offset` – number of bytes to skip **before** reading the first + /// value (usually `1` to move past the header byte). + /// * `offset_index` – 0-based index **after** the skip + /// (`0` is the first value, `1` the next, …). + /// + /// Each value is `self as usize` bytes wide (1, 2, 3 or 4). + /// Three-byte values are zero-extended to 32 bits before the final + /// fallible cast to `usize`. fn unpack_usize( &self, bytes: &[u8], @@ -45,8 +57,16 @@ impl OffsetSizeBytes { let result = match self { One => u8::from_le_bytes(array_from_slice(bytes, offset)?).into(), Two => u16::from_le_bytes(array_from_slice(bytes, offset)?).into(), - // TODO: Do this one - Three => todo!(), + Three => { + // Let's grab the three byte le-chunk first + let b3_chunks: [u8; 3] = array_from_slice(bytes, offset)?; + // Let's pad it and construct a padded u32 from it. + let mut buf = [0u8; 4]; + buf[..3].copy_from_slice(&b3_chunks); + u32::from_le_bytes(buf) + .try_into() + .map_err(|e: TryFromIntError| ArrowError::InvalidArgumentError(e.to_string()))? + } Four => u32::from_le_bytes(array_from_slice(bytes, offset)?) .try_into() .map_err(|e: TryFromIntError| ArrowError::InvalidArgumentError(e.to_string()))?, @@ -183,7 +203,7 @@ impl<'m> VariantMetadata<'m> { } /// Get the key-name by index - pub fn get_field_by_index(&self, index: usize) -> Result<&'m str, ArrowError> { + pub fn get_field_by(&self, index: usize) -> Result<&'m str, ArrowError> { match self.get_offset_by(index) { Ok(range) => self.get_field_by_offset(range), Err(e) => Err(e), @@ -191,7 +211,7 @@ impl<'m> VariantMetadata<'m> { } /// Gets the field using an offset (Range) - helper method to keep consistent API. - pub fn get_field_by_offset(&self, offset: Range) -> Result<&'m str, ArrowError> { + pub(crate) fn get_field_by_offset(&self, offset: Range) -> Result<&'m str, ArrowError> { let dictionary_key_start_byte = 1 // header + self.header.offset_size as usize // dictionary_size field itself + (self.dict_size + 1) * (self.header.offset_size as usize); // all offset entries @@ -458,3 +478,107 @@ impl<'m, 'v> From<&'v str> for Variant<'m, 'v> { } } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_offset() { + assert_eq!(OffsetSizeBytes::try_new(0).unwrap(), OffsetSizeBytes::One); + assert_eq!(OffsetSizeBytes::try_new(1).unwrap(), OffsetSizeBytes::Two); + assert_eq!(OffsetSizeBytes::try_new(2).unwrap(), OffsetSizeBytes::Three); + assert_eq!(OffsetSizeBytes::try_new(3).unwrap(), OffsetSizeBytes::Four); + + // everything outside 0-3 must error + assert!(OffsetSizeBytes::try_new(4).is_err()); + assert!(OffsetSizeBytes::try_new(255).is_err()); + } + + #[test] + fn unpack_usize_all_widths() { + // One-byte offsets + let buf_one = [0x01u8, 0xAB, 0xCD]; + assert_eq!( + OffsetSizeBytes::One.unpack_usize(&buf_one, 0, 0).unwrap(), + 0x01 + ); + assert_eq!( + OffsetSizeBytes::One.unpack_usize(&buf_one, 0, 2).unwrap(), + 0xCD + ); + + // Two-byte offsets (little-endian 0x1234, 0x5678) + let buf_two = [0x34, 0x12, 0x78, 0x56]; + assert_eq!( + OffsetSizeBytes::Two.unpack_usize(&buf_two, 0, 0).unwrap(), + 0x1234 + ); + assert_eq!( + OffsetSizeBytes::Two.unpack_usize(&buf_two, 0, 1).unwrap(), + 0x5678 + ); + + // Three-byte offsets (0x030201 and 0x0000FF) + let buf_three = [0x01, 0x02, 0x03, 0xFF, 0x00, 0x00]; + assert_eq!( + OffsetSizeBytes::Three + .unpack_usize(&buf_three, 0, 0) + .unwrap(), + 0x0302_01 + ); + assert_eq!( + OffsetSizeBytes::Three + .unpack_usize(&buf_three, 0, 1) + .unwrap(), + 0x0000_FF + ); + + // Four-byte offsets (0x12345678, 0x90ABCDEF) + let buf_four = [0x78, 0x56, 0x34, 0x12, 0xEF, 0xCD, 0xAB, 0x90]; + assert_eq!( + OffsetSizeBytes::Four.unpack_usize(&buf_four, 0, 0).unwrap(), + 0x1234_5678 + ); + assert_eq!( + OffsetSizeBytes::Four.unpack_usize(&buf_four, 0, 1).unwrap(), + 0x90AB_CDEF + ); + } + + #[test] + fn unpack_usize_out_of_bounds() { + let tiny = [0x00u8]; // deliberately too short + assert!(OffsetSizeBytes::Two.unpack_usize(&tiny, 0, 0).is_err()); + assert!(OffsetSizeBytes::Three.unpack_usize(&tiny, 0, 0).is_err()); + } + + #[test] + fn unpack_simple() { + let buf = [ + 0x41, // header + 0x02, 0x00, // dictionary_size = 2 + 0x00, 0x00, // offset[0] = 0 + 0x05, 0x00, // offset[1] = 5 + 0x09, 0x00, // offset[2] = 9 + ]; + + let width = OffsetSizeBytes::Two; + + // dictionary_size starts immediately after the header + let dict_size = width.unpack_usize(&buf, 1, 0).unwrap(); + assert_eq!(dict_size, 2); + + let first = width.unpack_usize(&buf, 1, 1).unwrap(); + assert_eq!(first, 0); + + let second = width.unpack_usize(&buf, 1, 2).unwrap(); + assert_eq!(second, 5); + + let third = width.unpack_usize(&buf, 1, 3).unwrap(); + assert_eq!(third, 9); + + let err = width.unpack_usize(&buf, 1, 4); + assert!(err.is_err()) + } +} From 10fe6bf4f3129cbf31925508160f7dbd15f4fad5 Mon Sep 17 00:00:00 2001 From: Malthe Karbo Date: Thu, 22 May 2025 22:57:04 +0200 Subject: [PATCH 20/42] chore: cleanup old comments --- parquet-variant/src/variant.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index f67412a382b6..bc47bbedcf60 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -134,7 +134,7 @@ impl<'m> VariantMetadata<'m> { // Offset 1, index 0 because first element after header is dictionary size let dict_size = header.offset_size.unpack_usize(bytes, 1, 0)?; - // TODO: Refactor, add test for validation + // TODO: add test for validation let valid = (0..=dict_size) .map(|i| header.offset_size.unpack_usize(bytes, 1, i + 1)) .scan(0, |prev, cur| { @@ -180,7 +180,6 @@ impl<'m> VariantMetadata<'m> { /// Get the offset by key-index pub fn get_offset_by(&self, index: usize) -> Result, ArrowError> { - // TODO: Should we memoize the offsets? There could be thousands of them (https://github.com/apache/arrow-rs/pull/7535#discussion_r2101351294) if index >= self.dict_size { return Err(ArrowError::InvalidArgumentError(format!( "Index {} out of bounds for dictionary of length {}", @@ -189,8 +188,6 @@ impl<'m> VariantMetadata<'m> { } // Skipping the header byte (setting byte_offset = 1) and the dictionary_size (setting offset_index +1) - // TODO: Validate size before looking up? - // TODO: Fix location / bytes here, the index is wrong. let start = self .header .offset_size From 992a35d87302e3c610e2377726c8bf843fc1da43 Mon Sep 17 00:00:00 2001 From: Malthe Karbo Date: Thu, 22 May 2025 23:05:13 +0200 Subject: [PATCH 21/42] chore: use unreachable!() and add comment on why it's sound --- parquet-variant/src/decoder.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs index 7454be807bda..df954a135b8d 100644 --- a/parquet-variant/src/decoder.rs +++ b/parquet-variant/src/decoder.rs @@ -33,10 +33,9 @@ pub(crate) fn get_basic_type(header: u8) -> Result 2 => VariantBasicType::Object, 3 => VariantBasicType::Array, _ => { - return Err(ArrowError::InvalidArgumentError(format!( - "unknown basic type: {}", - basic_type - ))) + //NOTE: A 2-bit value has a max of 4 different values (0-3), hence this is unreachable as we + // masked `basic_type` with 0x03 above. + unreachable!(); } }; Ok(basic_type) From c2c6bfd049808f52398ac4a632842514dedd4d81 Mon Sep 17 00:00:00 2001 From: Malthe Karbo Date: Thu, 22 May 2025 23:07:06 +0200 Subject: [PATCH 22/42] chore: remove note per PR feedback --- parquet-variant/src/decoder.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs index df954a135b8d..142f8b4926dc 100644 --- a/parquet-variant/src/decoder.rs +++ b/parquet-variant/src/decoder.rs @@ -1,5 +1,3 @@ -// NOTE: Largely based on the implementation of @PinkCrow007 in https://github.com/apache/arrow-rs/pull/7452 -// And the feedback there. use arrow_schema::ArrowError; use std::{array::TryFromSliceError, str}; From 695f49dffcf89c804eb2c6b577f8c35f4ea6c854 Mon Sep 17 00:00:00 2001 From: Malthe Karbo Date: Thu, 22 May 2025 23:18:14 +0200 Subject: [PATCH 23/42] feat: impl TryFrom for VariantPrimitiveType --- parquet-variant/src/decoder.rs | 38 ++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs index 142f8b4926dc..7395ce63ca1b 100644 --- a/parquet-variant/src/decoder.rs +++ b/parquet-variant/src/decoder.rs @@ -39,26 +39,28 @@ pub(crate) fn get_basic_type(header: u8) -> Result Ok(basic_type) } -/// Extracts the primitive type from a header byte -pub(crate) fn get_primitive_type(header: u8) -> Result { - // See https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-encoding - //// Primitive type is encoded in the last 6 bits of the header byte - let primitive_type = header >> 2; - let primitive_type = match primitive_type { - 0 => VariantPrimitiveType::Null, - 1 => VariantPrimitiveType::BooleanTrue, - 2 => VariantPrimitiveType::BooleanFalse, - 3 => VariantPrimitiveType::Int8, - // TODO: Add types for the rest, once API is agreed upon - 16 => VariantPrimitiveType::String, - _ => { - return Err(ArrowError::InvalidArgumentError(format!( +impl TryFrom for VariantPrimitiveType { + type Error = ArrowError; + + fn try_from(value: u8) -> Result { + match value { + 0 => Ok(VariantPrimitiveType::Null), + 1 => Ok(VariantPrimitiveType::BooleanTrue), + 2 => Ok(VariantPrimitiveType::BooleanFalse), + 3 => Ok(VariantPrimitiveType::Int8), + // TODO: Add types for the rest, once API is agreed upon + 16 => Ok(VariantPrimitiveType::String), + _ => Err(ArrowError::InvalidArgumentError(format!( "unknown primitive type: {}", - primitive_type - ))) + value + ))), } - }; - Ok(primitive_type) + } +} +/// Extract the primitive type from a Variant value-header byte +pub(crate) fn get_primitive_type(header: u8) -> Result { + // last 6 bits contain the primitive-type, see spec + VariantPrimitiveType::try_from(header >> 2) } /// To be used in `map_err` when unpacking an integer from a slice of bytes. From fd055ef7dd336d355454d157ae30d982744ec32f Mon Sep 17 00:00:00 2001 From: Malthe Karbo Date: Fri, 23 May 2025 10:16:42 +0200 Subject: [PATCH 24/42] chore: cleanup iterators --- parquet-variant/src/variant.rs | 107 ++++++--------------------------- 1 file changed, 20 insertions(+), 87 deletions(-) diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index bc47bbedcf60..243cdec853d2 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -226,100 +226,33 @@ impl<'m> VariantMetadata<'m> { /// Get the offsets as an iterator // TODO: Write tests - pub fn offsets( - &'m self, - ) -> Result, ArrowError>> + 'm, ArrowError> { - struct OffsetIterators<'m> { - buffer: &'m [u8], - header: &'m VariantMetadataHeader, - dict_len: usize, - seen: usize, - } - impl<'m> Iterator for OffsetIterators<'m> { - type Item = Result, ArrowError>; // Range = (start, end) positions of the bytes - fn next(&mut self) -> Option { - if self.seen < self.dict_len { - let start = self - .header - .offset_size - // skip header via byte_offset=1 and self.seen + 1 because first is dictionary_size - .unpack_usize(self.buffer, 1, self.seen + 1); - - let end = self - .header - .offset_size - // skip header via byte_offset=1 and self.seen + 2 to get end offset - .unpack_usize(self.buffer, 1, self.seen + 2); - self.seen += 1; - match (start, end) { - (Ok(start), Ok(end)) => Some(Ok(start..end)), - (Err(e), _) | (_, Err(e)) => Some(Err(e)), - } - } else { - None - } + pub fn offsets(&'m self) -> impl Iterator, ArrowError>> + 'm { + let offset_size = self.header.offset_size; // `Copy` + let bytes = self.bytes; + + let iterator = (0..self.dict_size).map(move |i| { + // This wont be out of bounds as long as dict_size and offsets have been validated + // during construction via `try_new`, as it calls unpack_usize for the + // indices `1..dict_size+1` already. + let start = offset_size.unpack_usize(bytes, 1, i + 1); + let end = offset_size.unpack_usize(bytes, 1, i + 2); + + match (start, end) { + (Ok(s), Ok(e)) => Ok(s..e), + (Err(e), _) | (_, Err(e)) => Err(e), } - } - let iterator: OffsetIterators = OffsetIterators { - buffer: self.bytes, - header: &self.header, - dict_len: self.dict_size, - seen: 0, - }; - Ok(iterator) + }); + + iterator } /// Get all key-names as an Iterator of strings - // NOTE: Duplicated code due to issues putting Impl's on structs, this is the same as `.offsets` except it - // extracts the field using the offset instead of returning the offset. pub fn fields( &'m self, ) -> Result> + 'm, ArrowError> { - struct FieldIterator<'m> { - buffer: &'m [u8], - header: &'m VariantMetadataHeader, - dict_len: usize, - seen: usize, - } - impl<'m> Iterator for FieldIterator<'m> { - type Item = Result<&'m str, ArrowError>; - fn next(&mut self) -> Option { - if self.seen < self.dict_len { - let start = self - .header - .offset_size - // skip header via byte_offset=1 and self.seen + 1 because first is dictionary_size - .unpack_usize(self.buffer, 1, self.seen + 1); - - let end = self - .header - .offset_size - // skip header via byte_offset=1 and self.seen + 2 to get end offset - .unpack_usize(self.buffer, 1, self.seen + 2); - self.seen += 1; - let result = match (start, end) { - (Ok(start), Ok(end)) => { - // Try to get the slice - match slice_from_slice(self.buffer, 1 + start..1 + end) { - // Get the field and return it - Ok(bytes) => str::from_utf8(bytes).map_err(|_| invalid_utf8_err()), - Err(e) => Err(e), - } - } - (Err(e), _) | (_, Err(e)) => Err(e), - }; - Some(result) - } else { - None - } - } - } - let iterator: FieldIterator = FieldIterator { - buffer: self.bytes, - header: &self.header, - dict_len: self.dict_size, - seen: 0, - }; + let iterator = self + .offsets() + .map(move |range_res| range_res.and_then(|r| self.get_field_by_offset(r))); Ok(iterator) } } From be0a0013f6a5a20ee7b0a9d6a73b9a97defb1efc Mon Sep 17 00:00:00 2001 From: Malthe Karbo <38499319+mkarbo@users.noreply.github.com> Date: Fri, 23 May 2025 10:19:16 +0200 Subject: [PATCH 25/42] Update parquet-variant/src/variant.rs Co-authored-by: Ryan Johnson --- parquet-variant/src/variant.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 243cdec853d2..3e79dc5c2188 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -212,8 +212,7 @@ impl<'m> VariantMetadata<'m> { let dictionary_key_start_byte = 1 // header + self.header.offset_size as usize // dictionary_size field itself + (self.dict_size + 1) * (self.header.offset_size as usize); // all offset entries - let dictionary_keys_bytes = - slice_from_slice(self.bytes, dictionary_key_start_byte..self.bytes.len())?; + let dictionary_keys_bytes = slice_from_slice(self.bytes, dictionary_key_start_byte..)?; let dictionary_key_bytes = slice_from_slice(dictionary_keys_bytes, offset.start..offset.end)?; let result = str::from_utf8(dictionary_key_bytes).map_err(|_| invalid_utf8_err())?; From ed77dbf546aca7ab5d7c9132afc54bb3db80f369 Mon Sep 17 00:00:00 2001 From: Malthe Karbo Date: Fri, 23 May 2025 10:25:55 +0200 Subject: [PATCH 26/42] chore: cleanup lifetimes --- parquet-variant/src/variant.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 3e79dc5c2188..3df7ae52d845 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -212,7 +212,8 @@ impl<'m> VariantMetadata<'m> { let dictionary_key_start_byte = 1 // header + self.header.offset_size as usize // dictionary_size field itself + (self.dict_size + 1) * (self.header.offset_size as usize); // all offset entries - let dictionary_keys_bytes = slice_from_slice(self.bytes, dictionary_key_start_byte..)?; + let dictionary_keys_bytes = + slice_from_slice(self.bytes, dictionary_key_start_byte..self.bytes.len())?; let dictionary_key_bytes = slice_from_slice(dictionary_keys_bytes, offset.start..offset.end)?; let result = str::from_utf8(dictionary_key_bytes).map_err(|_| invalid_utf8_err())?; @@ -225,7 +226,7 @@ impl<'m> VariantMetadata<'m> { /// Get the offsets as an iterator // TODO: Write tests - pub fn offsets(&'m self) -> impl Iterator, ArrowError>> + 'm { + pub fn offsets(&self) -> impl Iterator, ArrowError>> + 'm { let offset_size = self.header.offset_size; // `Copy` let bytes = self.bytes; @@ -248,7 +249,7 @@ impl<'m> VariantMetadata<'m> { /// Get all key-names as an Iterator of strings pub fn fields( &'m self, - ) -> Result> + 'm, ArrowError> { + ) -> Result>, ArrowError> { let iterator = self .offsets() .map(move |range_res| range_res.and_then(|r| self.get_field_by_offset(r))); From 654963173e610184eff8e2fdc63972d7efb8c69b Mon Sep 17 00:00:00 2001 From: Malthe Karbo <38499319+mkarbo@users.noreply.github.com> Date: Fri, 23 May 2025 10:31:32 +0200 Subject: [PATCH 27/42] Update parquet-variant/src/variant.rs Co-authored-by: Ryan Johnson --- parquet-variant/src/variant.rs | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 3df7ae52d845..2c64e24b6b24 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -188,15 +188,8 @@ impl<'m> VariantMetadata<'m> { } // Skipping the header byte (setting byte_offset = 1) and the dictionary_size (setting offset_index +1) - let start = self - .header - .offset_size - .unpack_usize(self.bytes, 1, index + 1)?; - let end = self - .header - .offset_size - .unpack_usize(self.bytes, 1, index + 2)?; - Ok(start..end) + let unpack = |i| self.header.offset_size.unpack_usize(self.bytes, 1, i + 1); + Ok(unpack(index)?..unpack(index + 1)?) } /// Get the key-name by index From dafc028931f7974ae69ad91fb689e552cafe955e Mon Sep 17 00:00:00 2001 From: Malthe Karbo <38499319+mkarbo@users.noreply.github.com> Date: Fri, 23 May 2025 10:37:37 +0200 Subject: [PATCH 28/42] Apply suggestions from code review Co-authored-by: Ryan Johnson --- parquet-variant/src/utils.rs | 12 ++++-------- parquet-variant/src/variant.rs | 5 ++--- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/parquet-variant/src/utils.rs b/parquet-variant/src/utils.rs index 532f0e89078d..97d5a0d5fb22 100644 --- a/parquet-variant/src/utils.rs +++ b/parquet-variant/src/utils.rs @@ -26,14 +26,10 @@ pub(crate) fn map_try_from_slice_error(e: TryFromSliceError) -> ArrowError { ArrowError::InvalidArgumentError(e.to_string()) } -pub(crate) fn non_empty_slice(slice: &[u8]) -> Result<&[u8], ArrowError> { - if slice.is_empty() { - return Err(ArrowError::InvalidArgumentError( - "Received empty bytes".to_string(), - )); - } else { - return Ok(slice); - } +pub(crate) fn first_byte_from_slice(slice: &[u8]) -> Result { + slice.get(0).ok_or_else(|| { + ArrowError::InvalidArgumentError("Received empty bytes".to_string()) + }); } /// Constructs the error message for an invalid UTF-8 string. diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 2c64e24b6b24..ef0dfd72ba80 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -194,9 +194,8 @@ impl<'m> VariantMetadata<'m> { /// Get the key-name by index pub fn get_field_by(&self, index: usize) -> Result<&'m str, ArrowError> { - match self.get_offset_by(index) { - Ok(range) => self.get_field_by_offset(range), - Err(e) => Err(e), + let range = self.get_offset_by(index)?; + self.get_field_by_offset(range) } } From 88137b0abec2c451e1a2fa2d7362efa031c10651 Mon Sep 17 00:00:00 2001 From: Malthe Karbo Date: Fri, 23 May 2025 11:26:16 +0200 Subject: [PATCH 29/42] chore(refactor): non_empty_byte util -> first_byte_from_slice --- parquet-variant/src/decoder.rs | 4 ++-- parquet-variant/src/utils.rs | 8 ++++---- parquet-variant/src/variant.rs | 5 ++--- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs index 7395ce63ca1b..a7a66168a77a 100644 --- a/parquet-variant/src/decoder.rs +++ b/parquet-variant/src/decoder.rs @@ -1,7 +1,7 @@ use arrow_schema::ArrowError; use std::{array::TryFromSliceError, str}; -use crate::utils::{array_from_slice, invalid_utf8_err, non_empty_slice, slice_from_slice}; +use crate::utils::{array_from_slice, first_byte_from_slice, invalid_utf8_err, slice_from_slice}; #[derive(Debug, Clone, Copy)] pub enum VariantBasicType { @@ -88,7 +88,7 @@ pub(crate) fn decode_long_string(value: &[u8]) -> Result<&str, ArrowError> { /// Decodes a short string from the value section of a variant. pub(crate) fn decode_short_string(value: &[u8]) -> Result<&str, ArrowError> { - let len = (non_empty_slice(value)?[0] >> 2) as usize; + let len = (first_byte_from_slice(value)? >> 2) as usize; let string = str::from_utf8(slice_from_slice(value, 1..1 + len)?).map_err(|_| invalid_utf8_err())?; diff --git a/parquet-variant/src/utils.rs b/parquet-variant/src/utils.rs index 97d5a0d5fb22..2a0c03dc4a29 100644 --- a/parquet-variant/src/utils.rs +++ b/parquet-variant/src/utils.rs @@ -26,10 +26,10 @@ pub(crate) fn map_try_from_slice_error(e: TryFromSliceError) -> ArrowError { ArrowError::InvalidArgumentError(e.to_string()) } -pub(crate) fn first_byte_from_slice(slice: &[u8]) -> Result { - slice.get(0).ok_or_else(|| { - ArrowError::InvalidArgumentError("Received empty bytes".to_string()) - }); +pub(crate) fn first_byte_from_slice(slice: &[u8]) -> Result<&u8, ArrowError> { + slice + .get(0) + .ok_or_else(|| ArrowError::InvalidArgumentError("Received empty bytes".to_string())) } /// Constructs the error message for an invalid UTF-8 string. diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index ef0dfd72ba80..b03d0a06e113 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -1,7 +1,7 @@ use crate::decoder::{ self, get_basic_type, get_primitive_type, VariantBasicType, VariantPrimitiveType, }; -use crate::utils::{array_from_slice, invalid_utf8_err, non_empty_slice, slice_from_slice}; +use crate::utils::{array_from_slice, first_byte_from_slice, invalid_utf8_err, slice_from_slice}; use arrow_schema::ArrowError; use std::{ num::TryFromIntError, @@ -196,7 +196,6 @@ impl<'m> VariantMetadata<'m> { pub fn get_field_by(&self, index: usize) -> Result<&'m str, ArrowError> { let range = self.get_offset_by(index)?; self.get_field_by_offset(range) - } } /// Gets the field using an offset (Range) - helper method to keep consistent API. @@ -313,7 +312,7 @@ pub enum Variant<'m, 'v> { impl<'m, 'v> Variant<'m, 'v> { /// Parse the buffers and return the appropriate variant. pub fn try_new(metadata: &'m VariantMetadata, value: &'v [u8]) -> Result { - let header = non_empty_slice(value)?[0]; + let header = *first_byte_from_slice(value)?; let new_self = match get_basic_type(header)? { VariantBasicType::Primitive => match get_primitive_type(header)? { VariantPrimitiveType::Null => Variant::Null, From 5b4115b3e236f8511196b118e5db5f22e4f7d0b1 Mon Sep 17 00:00:00 2001 From: Malthe Karbo Date: Fri, 23 May 2025 11:27:06 +0200 Subject: [PATCH 30/42] chore(validation): validate version in try_new --- parquet-variant/src/variant.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index b03d0a06e113..7beb234d7478 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -83,6 +83,10 @@ pub(crate) struct VariantMetadataHeader { offset_size: OffsetSizeBytes, } +// According to the spec this is currently always = 1, and so we store this const for validation +// purposes and to make that visible. +const CORRECT_VERSION_VALUE: u8 = 1; + impl<'m> VariantMetadataHeader { /// Tries to construct the variant metadata header, which has the form /// 7 6 5 4 3 0 @@ -104,8 +108,15 @@ impl<'m> VariantMetadataHeader { }; let version = header & 0x0F; // First four bits + if version != CORRECT_VERSION_VALUE { + let err_msg = format!( + "The version bytes in the header is not {CORRECT_VERSION_VALUE}, got {:b}", + version + ); + return Err(ArrowError::InvalidArgumentError(err_msg)); + } let is_sorted = (header & 0x10) != 0; // Fifth bit - let offset_size_minus_one = (header >> 6) & 0x03; // Last two bits + let offset_size_minus_one = header >> 6; // Last two bits Ok(Self { version, is_sorted, From 362341e668baa246590d00f532419342bb1ef52d Mon Sep 17 00:00:00 2001 From: Malthe Karbo Date: Fri, 23 May 2025 12:37:30 +0200 Subject: [PATCH 31/42] chore(validation): Add final checks to try_new for VariantMetadata --- parquet-variant/src/variant.rs | 66 +++++++++++++++++++++++++--------- 1 file changed, 49 insertions(+), 17 deletions(-) diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 7beb234d7478..7ea0c1b926fb 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -145,30 +145,62 @@ impl<'m> VariantMetadata<'m> { // Offset 1, index 0 because first element after header is dictionary size let dict_size = header.offset_size.unpack_usize(bytes, 1, 0)?; + // Check that we have the correct metadata length according to dictionary_size, or return + // error early. + // Minimum number of bytes the metadata buffer must contain: + // 1 byte header + // + offset_size-byte `dictionary_size` field + // + (dict_size + 1) offset entries, each `offset_size` bytes. (Table size, essentially) + // 1 + offset_size + (dict_size + 1) * offset_size + let offset_size = header.offset_size as usize; // Cheap to copy + + let dict_bytes_offset = 1usize // 1-byte header + .checked_add(offset_size) // 1 + offset_size + .and_then(|p| { + dict_size + .checked_add(1) // dict_size + 1 + .and_then(|n| n.checked_mul(offset_size)) + .and_then(|table_size| p.checked_add(table_size)) + }) + .ok_or_else(|| ArrowError::InvalidArgumentError("metadata length overflow".into()))?; + if bytes.len() < dict_bytes_offset { + return Err(ArrowError::InvalidArgumentError( + "Metadata shorter than dictionary_size implies".to_string(), + )); + } + + // Check that all offsets are monotonically increasing // TODO: add test for validation - let valid = (0..=dict_size) + let mut prev = None; + for (i, offset) in (0..=dict_size) .map(|i| header.offset_size.unpack_usize(bytes, 1, i + 1)) - .scan(0, |prev, cur| { - let Ok(cur_offset) = cur else { - return Some(false); - }; - // Skip the first offset, which is always 0 - if *prev == 0 { - *prev = cur_offset; - return Some(true); - } + .enumerate() + { + let offset = offset?; + if i == 0 && offset != 0 { + return Err(ArrowError::InvalidArgumentError( + "First offset is non-zero".to_string(), + )); + } + if prev.is_some_and(|prev| prev >= offset) { + return Err(ArrowError::InvalidArgumentError( + "Offsets are not monotonically increasing".to_string(), + )); + } + prev = Some(offset); + } - let valid = cur_offset > *prev; - *prev = cur_offset; - Some(valid) - }) - .all(|valid| valid); + // Check that the final offset equals the length of the + // dictionary-string section. + let dict_block_len = bytes.len() - dict_bytes_offset; // actual length of the string block - if !valid { + if prev != Some(dict_block_len) { + // `prev` holds the last offset seen still return Err(ArrowError::InvalidArgumentError( - "Offsets are not monotonically increasing".to_string(), + "Last offset does not equal dictionary length".to_string(), )); } + Ok(Self { bytes, header, From 69462994b312b7ea31da06a0ed1366c2ca5ed2db Mon Sep 17 00:00:00 2001 From: Malthe Karbo Date: Fri, 23 May 2025 13:08:33 +0200 Subject: [PATCH 32/42] feat(test): tests for try_new --- parquet-variant/src/variant.rs | 92 ++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 7ea0c1b926fb..7522672202d7 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -545,4 +545,96 @@ mod tests { let err = width.unpack_usize(&buf, 1, 4); assert!(err.is_err()) } + + /// `"cat"`, `"dog"` – valid metadata + #[test] + fn try_new_ok_inline() { + let bytes = &[ + 0b0000_0001, // header, offset_size_minus_one=0 and version=1 + 0x02, // dictionary_size (2 strings) + 0x00, + 0x03, + 0x06, + b'c', + b'a', + b't', + b'd', + b'o', + b'g', + ]; + + let md = VariantMetadata::try_new(bytes).expect("should parse"); + assert_eq!(md.dictionary_size(), 2); + assert_eq!(md.get_field_by(0).unwrap(), "cat"); + assert_eq!(md.get_field_by(1).unwrap(), "dog"); + } + /// Too short buffer test (missing one required offset). + /// Should error with ā€œmetadata shorter than dictionary_size impliesā€. + #[test] + fn try_new_missing_last_value() { + let bytes = &[ + 0b0000_0001, // header, offset_size_minus_one=0 and version=1 + 0x02, // dictionary_size = 2 + 0x00, + 0x01, + 0x02, + b'a', + b'b', // <-- we'll remove this + ]; + + let working_md = VariantMetadata::try_new(bytes).expect("should parse"); + assert_eq!(working_md.dictionary_size(), 2); + assert_eq!(working_md.get_field_by(0).unwrap(), "a"); + assert_eq!(working_md.get_field_by(1).unwrap(), "b"); + + let truncated = &bytes[..bytes.len() - 1]; + + let err = VariantMetadata::try_new(truncated).unwrap_err(); + assert!( + matches!(err, ArrowError::InvalidArgumentError(ref msg) + if msg.contains("Last offset")), + "unexpected error: {err:?}" + ); + } + + #[test] + fn try_new_fails_non_monotonic() { + // 'cat', 'dog', 'lamb' + let bytes = &[ + 0b0000_0001, // header, offset_size_minus_one=0 and version=1 + 0x03, // dictionary_size + 0x00, + 0x02, + 0x01, // Doesn't increase monotonically + 0x10, + b'c', + b'a', + b't', + b'd', + b'o', + b'g', + b'l', + b'a', + b'm', + b'b', + ]; + + let err = VariantMetadata::try_new(bytes).unwrap_err(); + assert!( + matches!(err, ArrowError::InvalidArgumentError(ref msg) if msg.contains("monotonically")), + "unexpected error: {err:?}" + ); + } + + #[test] + fn try_new_truncated_offsets_inline() { + // Missing final offset + let bytes = &[0b0000_0001, 0x02, 0x00, 0x01]; + + let err = VariantMetadata::try_new(bytes).unwrap_err(); + assert!( + matches!(err, ArrowError::InvalidArgumentError(ref msg) if msg.contains("shorter")), + "unexpected error: {err:?}" + ); + } } From d3426aba6d1c9e13e53aa8df1563f6e55b97aa9a Mon Sep 17 00:00:00 2001 From: Malthe Karbo Date: Fri, 23 May 2025 13:17:28 +0200 Subject: [PATCH 33/42] chore: memoize dictionary byte offset start --- parquet-variant/src/variant.rs | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 7522672202d7..648884f0ef9c 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -131,6 +131,7 @@ pub struct VariantMetadata<'m> { bytes: &'m [u8], header: VariantMetadataHeader, dict_size: usize, + dictionary_key_start_byte: usize, } impl<'m> VariantMetadata<'m> { @@ -154,7 +155,7 @@ impl<'m> VariantMetadata<'m> { // 1 + offset_size + (dict_size + 1) * offset_size let offset_size = header.offset_size as usize; // Cheap to copy - let dict_bytes_offset = 1usize // 1-byte header + let dictionary_key_start_byte = 1usize // 1-byte header .checked_add(offset_size) // 1 + offset_size .and_then(|p| { dict_size @@ -163,14 +164,13 @@ impl<'m> VariantMetadata<'m> { .and_then(|table_size| p.checked_add(table_size)) }) .ok_or_else(|| ArrowError::InvalidArgumentError("metadata length overflow".into()))?; - if bytes.len() < dict_bytes_offset { + if bytes.len() < dictionary_key_start_byte { return Err(ArrowError::InvalidArgumentError( "Metadata shorter than dictionary_size implies".to_string(), )); } // Check that all offsets are monotonically increasing - // TODO: add test for validation let mut prev = None; for (i, offset) in (0..=dict_size) .map(|i| header.offset_size.unpack_usize(bytes, 1, i + 1)) @@ -192,7 +192,7 @@ impl<'m> VariantMetadata<'m> { // Check that the final offset equals the length of the // dictionary-string section. - let dict_block_len = bytes.len() - dict_bytes_offset; // actual length of the string block + let dict_block_len = bytes.len() - dictionary_key_start_byte; // actual length of the string block if prev != Some(dict_block_len) { // `prev` holds the last offset seen still @@ -205,6 +205,7 @@ impl<'m> VariantMetadata<'m> { bytes, header, dict_size, + dictionary_key_start_byte, }) } @@ -243,11 +244,8 @@ impl<'m> VariantMetadata<'m> { /// Gets the field using an offset (Range) - helper method to keep consistent API. pub(crate) fn get_field_by_offset(&self, offset: Range) -> Result<&'m str, ArrowError> { - let dictionary_key_start_byte = 1 // header - + self.header.offset_size as usize // dictionary_size field itself - + (self.dict_size + 1) * (self.header.offset_size as usize); // all offset entries let dictionary_keys_bytes = - slice_from_slice(self.bytes, dictionary_key_start_byte..self.bytes.len())?; + slice_from_slice(self.bytes, self.dictionary_key_start_byte..self.bytes.len())?; let dictionary_key_bytes = slice_from_slice(dictionary_keys_bytes, offset.start..offset.end)?; let result = str::from_utf8(dictionary_key_bytes).map_err(|_| invalid_utf8_err())?; From c507efb3e3be292ee04a78b590a1576c340e30d8 Mon Sep 17 00:00:00 2001 From: Malthe Karbo Date: Fri, 23 May 2025 13:29:59 +0200 Subject: [PATCH 34/42] chore(refactor): simpliy str from slice to util (feedback) --- parquet-variant/src/decoder.rs | 10 ++++------ parquet-variant/src/utils.rs | 12 ++++++++---- parquet-variant/src/variant.rs | 18 ++++++------------ 3 files changed, 18 insertions(+), 22 deletions(-) diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs index a7a66168a77a..8672141d7c72 100644 --- a/parquet-variant/src/decoder.rs +++ b/parquet-variant/src/decoder.rs @@ -1,7 +1,7 @@ use arrow_schema::ArrowError; -use std::{array::TryFromSliceError, str}; +use std::array::TryFromSliceError; -use crate::utils::{array_from_slice, first_byte_from_slice, invalid_utf8_err, slice_from_slice}; +use crate::utils::{array_from_slice, first_byte_from_slice, slice_from_slice, string_from_slice}; #[derive(Debug, Clone, Copy)] pub enum VariantBasicType { @@ -81,8 +81,7 @@ pub(crate) fn decode_long_string(value: &[u8]) -> Result<&str, ArrowError> { .try_into() .map_err(map_try_from_slice_error)?, ) as usize; - let string = - str::from_utf8(slice_from_slice(value, 5..5 + len)?).map_err(|_| invalid_utf8_err())?; + let string = string_from_slice(value, 5..5 + len)?; Ok(string) } @@ -90,8 +89,7 @@ pub(crate) fn decode_long_string(value: &[u8]) -> Result<&str, ArrowError> { pub(crate) fn decode_short_string(value: &[u8]) -> Result<&str, ArrowError> { let len = (first_byte_from_slice(value)? >> 2) as usize; - let string = - str::from_utf8(slice_from_slice(value, 1..1 + len)?).map_err(|_| invalid_utf8_err())?; + let string = string_from_slice(value, 1..1 + len)?; Ok(string) } diff --git a/parquet-variant/src/utils.rs b/parquet-variant/src/utils.rs index 2a0c03dc4a29..f3db534f03e3 100644 --- a/parquet-variant/src/utils.rs +++ b/parquet-variant/src/utils.rs @@ -1,4 +1,4 @@ -use std::{array::TryFromSliceError, ops::Range}; +use std::{array::TryFromSliceError, ops::Range, str}; use arrow_schema::ArrowError; @@ -32,7 +32,11 @@ pub(crate) fn first_byte_from_slice(slice: &[u8]) -> Result<&u8, ArrowError> { .ok_or_else(|| ArrowError::InvalidArgumentError("Received empty bytes".to_string())) } -/// Constructs the error message for an invalid UTF-8 string. -pub(crate) fn invalid_utf8_err() -> ArrowError { - ArrowError::InvalidArgumentError("invalid UTF-8 string".to_string()) +// /// Constructs the error message for an invalid UTF-8 string. +// pub(crate) fn invalid_utf8_err() -> ArrowError { +// ArrowError::InvalidArgumentError("invalid UTF-8 string".to_string()) +// } +pub(crate) fn string_from_slice(slice: &[u8], range: Range) -> Result<&str, ArrowError> { + str::from_utf8(slice_from_slice(slice, range)?) + .map_err(|_| ArrowError::InvalidArgumentError("invalid UTF-8 string".to_string())) } diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 648884f0ef9c..b76c97d44249 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -1,12 +1,11 @@ use crate::decoder::{ self, get_basic_type, get_primitive_type, VariantBasicType, VariantPrimitiveType, }; -use crate::utils::{array_from_slice, first_byte_from_slice, invalid_utf8_err, slice_from_slice}; +use crate::utils::{array_from_slice, first_byte_from_slice, slice_from_slice, string_from_slice}; use arrow_schema::ArrowError; use std::{ num::TryFromIntError, ops::{Index, Range}, - str, }; #[derive(Clone, Debug, Copy, PartialEq)] @@ -87,7 +86,7 @@ pub(crate) struct VariantMetadataHeader { // purposes and to make that visible. const CORRECT_VERSION_VALUE: u8 = 1; -impl<'m> VariantMetadataHeader { +impl VariantMetadataHeader { /// Tries to construct the variant metadata header, which has the form /// 7 6 5 4 3 0 /// +-------+---+---+---------------+ @@ -100,12 +99,8 @@ impl<'m> VariantMetadataHeader { /// - sorted_strings is a 1-bit value indicating whether dictionary strings are sorted and unique. /// - offset_size_minus_one is a 2-bit value providing the number of bytes per dictionary size and offset field. /// - The actual number of bytes, offset_size, is offset_size_minus_one + 1 - pub fn try_new(bytes: &'m [u8]) -> Result { - let Some(header) = bytes.get(0) else { - return Err(ArrowError::InvalidArgumentError( - "Received zero bytes".to_string(), - )); - }; + pub fn try_new(bytes: &[u8]) -> Result { + let header = first_byte_from_slice(bytes)?; let version = header & 0x0F; // First four bits if version != CORRECT_VERSION_VALUE { @@ -246,9 +241,8 @@ impl<'m> VariantMetadata<'m> { pub(crate) fn get_field_by_offset(&self, offset: Range) -> Result<&'m str, ArrowError> { let dictionary_keys_bytes = slice_from_slice(self.bytes, self.dictionary_key_start_byte..self.bytes.len())?; - let dictionary_key_bytes = - slice_from_slice(dictionary_keys_bytes, offset.start..offset.end)?; - let result = str::from_utf8(dictionary_key_bytes).map_err(|_| invalid_utf8_err())?; + let result = string_from_slice(dictionary_keys_bytes, offset.start..offset.end)?; + Ok(result) } From 7d2b06d0b869e87d8486f17e2194e74b953140e7 Mon Sep 17 00:00:00 2001 From: Malthe Karbo Date: Fri, 23 May 2025 13:31:30 +0200 Subject: [PATCH 35/42] chore(docstring): forgot to update docstring --- parquet-variant/src/utils.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/parquet-variant/src/utils.rs b/parquet-variant/src/utils.rs index f3db534f03e3..1515abee0d1f 100644 --- a/parquet-variant/src/utils.rs +++ b/parquet-variant/src/utils.rs @@ -32,10 +32,7 @@ pub(crate) fn first_byte_from_slice(slice: &[u8]) -> Result<&u8, ArrowError> { .ok_or_else(|| ArrowError::InvalidArgumentError("Received empty bytes".to_string())) } -// /// Constructs the error message for an invalid UTF-8 string. -// pub(crate) fn invalid_utf8_err() -> ArrowError { -// ArrowError::InvalidArgumentError("invalid UTF-8 string".to_string()) -// } +/// Helper to get a &str from a slice based on range, if it's valid or an error otherwise pub(crate) fn string_from_slice(slice: &[u8], range: Range) -> Result<&str, ArrowError> { str::from_utf8(slice_from_slice(slice, range)?) .map_err(|_| ArrowError::InvalidArgumentError("invalid UTF-8 string".to_string())) From a9354c44925ab6a7e1b0fe11530ade86b74e4e40 Mon Sep 17 00:00:00 2001 From: Malthe Karbo Date: Fri, 23 May 2025 13:47:01 +0200 Subject: [PATCH 36/42] chore: minor comment + clearer offset api --- parquet-variant/src/decoder.rs | 6 +----- parquet-variant/src/variant.rs | 33 ++++++++++++++++++++++++++++++--- 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs index 8672141d7c72..a8d97db4cff1 100644 --- a/parquet-variant/src/decoder.rs +++ b/parquet-variant/src/decoder.rs @@ -76,11 +76,7 @@ pub(crate) fn decode_int8(value: &[u8]) -> Result { /// Decodes a long string from the value section of a variant. pub(crate) fn decode_long_string(value: &[u8]) -> Result<&str, ArrowError> { - let len = u32::from_le_bytes( - slice_from_slice(value, 1..5)? - .try_into() - .map_err(map_try_from_slice_error)?, - ) as usize; + let len = u32::from_le_bytes(array_from_slice(value, 1)?) as usize; let string = string_from_slice(value, 5..5 + len)?; Ok(string) } diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index b76c97d44249..e87174c4f631 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -217,8 +217,8 @@ impl<'m> VariantMetadata<'m> { self.header.version } - /// Get the offset by key-index - pub fn get_offset_by(&self, index: usize) -> Result, ArrowError> { + /// Get the offset start and end range for a key by index + pub fn get_offsets_for_key_by(&self, index: usize) -> Result, ArrowError> { if index >= self.dict_size { return Err(ArrowError::InvalidArgumentError(format!( "Index {} out of bounds for dictionary of length {}", @@ -231,9 +231,23 @@ impl<'m> VariantMetadata<'m> { Ok(unpack(index)?..unpack(index + 1)?) } + /// Get a single offset by index + pub fn get_offset_by(&self, index: usize) -> Result { + if index >= self.dict_size { + return Err(ArrowError::InvalidArgumentError(format!( + "Index {} out of bounds for dictionary of length {}", + index, self.dict_size + ))); + } + + // Skipping the header byte (setting byte_offset = 1) and the dictionary_size (setting offset_index +1) + let unpack = |i| self.header.offset_size.unpack_usize(self.bytes, 1, i + 1); + Ok(unpack(index)?) + } + /// Get the key-name by index pub fn get_field_by(&self, index: usize) -> Result<&'m str, ArrowError> { - let range = self.get_offset_by(index)?; + let range = self.get_offsets_for_key_by(index)?; self.get_field_by_offset(range) } @@ -557,8 +571,21 @@ mod tests { let md = VariantMetadata::try_new(bytes).expect("should parse"); assert_eq!(md.dictionary_size(), 2); + // Fields assert_eq!(md.get_field_by(0).unwrap(), "cat"); assert_eq!(md.get_field_by(1).unwrap(), "dog"); + + // Offsets + assert_eq!(md.get_offset_by(0).unwrap(), 0x00); + assert_eq!(md.get_offset_by(1).unwrap(), 0x03); + // We only have 2 keys, the final offset should not be accessible using this method. + let err = md.get_offset_by(2).unwrap_err(); + + assert!( + matches!(err, ArrowError::InvalidArgumentError(ref msg) + if msg.contains("Index 2 out of bounds for dictionary of length 2")), + "unexpected error: {err:?}" + ); } /// Too short buffer test (missing one required offset). /// Should error with ā€œmetadata shorter than dictionary_size impliesā€. From a68828bd591a4048bfc051c8df6894b907b322b6 Mon Sep 17 00:00:00 2001 From: Malthe Karbo Date: Fri, 23 May 2025 13:58:47 +0200 Subject: [PATCH 37/42] chore: test fields iterator --- parquet-variant/src/variant.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index e87174c4f631..7a3032784380 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -586,6 +586,13 @@ mod tests { if msg.contains("Index 2 out of bounds for dictionary of length 2")), "unexpected error: {err:?}" ); + let fields: Vec<(usize, &str)> = md + .fields() + .unwrap() + .enumerate() + .map(|(i, r)| (i, r.unwrap())) + .collect(); + assert_eq!(fields, vec![(0usize, "cat"), (1usize, "dog")]); } /// Too short buffer test (missing one required offset). /// Should error with ā€œmetadata shorter than dictionary_size impliesā€. From f4b8f5898c078a46de286ebcbd812b38f6fc1914 Mon Sep 17 00:00:00 2001 From: Malthe Karbo Date: Fri, 23 May 2025 14:39:39 +0200 Subject: [PATCH 38/42] feat: initial get method for array (will refactor) --- parquet-variant/src/test_variant.rs | 11 +++++- parquet-variant/src/variant.rs | 60 ++++++++++++++++++++++++++--- 2 files changed, 64 insertions(+), 7 deletions(-) diff --git a/parquet-variant/src/test_variant.rs b/parquet-variant/src/test_variant.rs index 753d01a1ae72..bdb802396add 100644 --- a/parquet-variant/src/test_variant.rs +++ b/parquet-variant/src/test_variant.rs @@ -38,7 +38,7 @@ fn get_primitive_cases() -> Vec<(&'static str, Variant<'static, 'static>)> { } fn get_non_primitive_cases() -> Vec<&'static str> { - vec!["object_primitive"] + vec!["object_primitive", "array_primitive"] } #[test] @@ -67,6 +67,15 @@ fn variant_non_primitive() -> Result<(), ArrowError> { let dict_val = metadata.get_field_by(0)?; assert_eq!(dict_val, "int_field"); } + "array_primitive" => match variant { + Variant::Array(arr) => { + let v = arr.get(0)?; + assert!(matches!(v, Variant::Int8(2))); + let v = arr.get(1)?; + assert!(matches!(v, Variant::Int8(1))); + } + _ => panic!("expected an array"), + }, _ => unreachable!(), } } diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 7a3032784380..c8c8fe565b98 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -265,7 +265,6 @@ impl<'m> VariantMetadata<'m> { } /// Get the offsets as an iterator - // TODO: Write tests pub fn offsets(&self) -> impl Iterator, ArrowError>> + 'm { let offset_size = self.header.offset_size; // `Copy` let bytes = self.bytes; @@ -329,16 +328,64 @@ impl<'m, 'v> VariantArray<'m, 'v> { #[allow(unreachable_code)] // Just to infer the return type Ok(vec![].into_iter()) } -} -impl<'m, 'v> Index for VariantArray<'m, 'v> { - type Output = Variant<'m, 'v>; + pub fn get(&self, index: usize) -> Result, ArrowError> { + // The 6 first bits to the left are the value_header and the 2 bits + // to the right are the basic type, so we shift to get only the value_header + let value_header = first_byte_from_slice(self.value)? >> 2; + let is_large = (value_header & 0x04) != 0; // 3rd bit from the right + let field_offset_size_minus_one = value_header & 0x03; // Last two bits + let offset_size = OffsetSizeBytes::try_new(field_offset_size_minus_one)?; + + // The size of the num_elements entry in the array value_data is 4 bytes if + // is_large is true, otherwise 1 byte. + let num_elements_size = if is_large { 4 } else { 1 }; + + // Skip the header byte to read the num_elements + let num_elements_bytes = slice_from_slice(self.value, 1..num_elements_size + 1)?; + let num_elements = match num_elements_size { + 1 => num_elements_bytes[0] as usize, + 4 => { + let arr: [u8; 4] = + num_elements_bytes + .try_into() + .map_err(|e: std::array::TryFromSliceError| { + ArrowError::InvalidArgumentError(e.to_string()) + })?; + u32::from_le_bytes(arr) as usize + } + _ => { + return Err(ArrowError::InvalidArgumentError( + "Invalid num_elements_size".to_string(), + )) + } + }; + + // Following the spec, there are "num_elements + 1" offsets in the value section + let first_value_byte = 1 + num_elements_size + (1 + num_elements) * offset_size as usize; - fn index(&self, _index: usize) -> &Self::Output { - todo!() + // Skip header and num_elements bytes to read the offsets + let start_field_offset_from_first_value_byte = + offset_size.unpack_usize(self.value, 1 + num_elements_size, index)?; + let end_field_offset_from_first_value_byte = + offset_size.unpack_usize(self.value, 1 + num_elements_size, index + 1)?; + + // Read the value bytes from the offsets + let variant_value_bytes = slice_from_slice( + self.value, + first_value_byte + start_field_offset_from_first_value_byte + ..first_value_byte + end_field_offset_from_first_value_byte, + )?; + let variant = Variant::try_new(self.metadata, variant_value_bytes)?; + Ok(variant) } } +// impl<'m, 'v> Index for VariantArray<'m, 'v> { +// type Output = Variant<'m, 'v>; +// +// } + /// Variant value. May contain references to metadata and value #[derive(Clone, Debug, Copy, PartialEq)] pub enum Variant<'m, 'v> { @@ -594,6 +641,7 @@ mod tests { .collect(); assert_eq!(fields, vec![(0usize, "cat"), (1usize, "dog")]); } + /// Too short buffer test (missing one required offset). /// Should error with ā€œmetadata shorter than dictionary_size impliesā€. #[test] From 1273a6d64b70a302fc5f954e026cf6e93ef934d3 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 23 May 2025 17:25:52 -0400 Subject: [PATCH 39/42] Add header to parquet files --- parquet-variant/src/decoder.rs | 16 ++++++++++++++++ parquet-variant/src/test_variant.rs | 17 +++++++++++++++++ parquet-variant/src/utils.rs | 16 ++++++++++++++++ parquet-variant/src/variant.rs | 16 ++++++++++++++++ 4 files changed, 65 insertions(+) diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs index a8d97db4cff1..6d2931a9af73 100644 --- a/parquet-variant/src/decoder.rs +++ b/parquet-variant/src/decoder.rs @@ -1,3 +1,19 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. use arrow_schema::ArrowError; use std::array::TryFromSliceError; diff --git a/parquet-variant/src/test_variant.rs b/parquet-variant/src/test_variant.rs index bdb802396add..07c9eaf9c6f0 100644 --- a/parquet-variant/src/test_variant.rs +++ b/parquet-variant/src/test_variant.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + //! End-to-end check: (almost) every sample from apache/parquet-testing/variant //! can be parsed into our `Variant`. diff --git a/parquet-variant/src/utils.rs b/parquet-variant/src/utils.rs index 1515abee0d1f..27069060c599 100644 --- a/parquet-variant/src/utils.rs +++ b/parquet-variant/src/utils.rs @@ -1,3 +1,19 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. use std::{array::TryFromSliceError, ops::Range, str}; use arrow_schema::ArrowError; diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index c8c8fe565b98..f0b22a842107 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -1,3 +1,19 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. use crate::decoder::{ self, get_basic_type, get_primitive_type, VariantBasicType, VariantPrimitiveType, }; From 3fad3eb613cf9596f28c979fee0f3f45e7eb471e Mon Sep 17 00:00:00 2001 From: Malthe Karbo Date: Fri, 30 May 2025 09:55:28 +0200 Subject: [PATCH 40/42] chore: various PR comments --- parquet-variant/src/variant.rs | 92 ++++++++++++++++------------------ 1 file changed, 43 insertions(+), 49 deletions(-) diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index f0b22a842107..951ec1ad4253 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -164,17 +164,15 @@ impl<'m> VariantMetadata<'m> { // + offset_size-byte `dictionary_size` field // + (dict_size + 1) offset entries, each `offset_size` bytes. (Table size, essentially) // 1 + offset_size + (dict_size + 1) * offset_size + // = (dict_size + 2) * offset_size + 1 let offset_size = header.offset_size as usize; // Cheap to copy - let dictionary_key_start_byte = 1usize // 1-byte header - .checked_add(offset_size) // 1 + offset_size - .and_then(|p| { - dict_size - .checked_add(1) // dict_size + 1 - .and_then(|n| n.checked_mul(offset_size)) - .and_then(|table_size| p.checked_add(table_size)) - }) + let dictionary_key_start_byte = dict_size + .checked_add(2) + .and_then(|n| n.checked_mul(offset_size)) + .and_then(|n| n.checked_add(1)) .ok_or_else(|| ArrowError::InvalidArgumentError("metadata length overflow".into()))?; + if bytes.len() < dictionary_key_start_byte { return Err(ArrowError::InvalidArgumentError( "Metadata shorter than dictionary_size implies".to_string(), @@ -233,8 +231,8 @@ impl<'m> VariantMetadata<'m> { self.header.version } - /// Get the offset start and end range for a key by index - pub fn get_offsets_for_key_by(&self, index: usize) -> Result, ArrowError> { + /// Helper method to get the offset start and end range for a key by index. + fn get_offsets_for_key_by(&self, index: usize) -> Result, ArrowError> { if index >= self.dict_size { return Err(ArrowError::InvalidArgumentError(format!( "Index {} out of bounds for dictionary of length {}", @@ -263,15 +261,15 @@ impl<'m> VariantMetadata<'m> { /// Get the key-name by index pub fn get_field_by(&self, index: usize) -> Result<&'m str, ArrowError> { - let range = self.get_offsets_for_key_by(index)?; - self.get_field_by_offset(range) + let offset_range = self.get_offsets_for_key_by(index)?; + self.get_field_by_offset(offset_range) } /// Gets the field using an offset (Range) - helper method to keep consistent API. pub(crate) fn get_field_by_offset(&self, offset: Range) -> Result<&'m str, ArrowError> { let dictionary_keys_bytes = slice_from_slice(self.bytes, self.dictionary_key_start_byte..self.bytes.len())?; - let result = string_from_slice(dictionary_keys_bytes, offset.start..offset.end)?; + let result = string_from_slice(dictionary_keys_bytes, offset)?; Ok(result) } @@ -307,7 +305,7 @@ impl<'m> VariantMetadata<'m> { ) -> Result>, ArrowError> { let iterator = self .offsets() - .map(move |range_res| range_res.and_then(|r| self.get_field_by_offset(r))); + .map(move |offset_range| self.get_field_by_offset(offset_range?)); Ok(iterator) } } @@ -352,39 +350,39 @@ impl<'m, 'v> VariantArray<'m, 'v> { let is_large = (value_header & 0x04) != 0; // 3rd bit from the right let field_offset_size_minus_one = value_header & 0x03; // Last two bits let offset_size = OffsetSizeBytes::try_new(field_offset_size_minus_one)?; - // The size of the num_elements entry in the array value_data is 4 bytes if // is_large is true, otherwise 1 byte. - let num_elements_size = if is_large { 4 } else { 1 }; - - // Skip the header byte to read the num_elements - let num_elements_bytes = slice_from_slice(self.value, 1..num_elements_size + 1)?; - let num_elements = match num_elements_size { - 1 => num_elements_bytes[0] as usize, - 4 => { - let arr: [u8; 4] = - num_elements_bytes - .try_into() - .map_err(|e: std::array::TryFromSliceError| { - ArrowError::InvalidArgumentError(e.to_string()) - })?; - u32::from_le_bytes(arr) as usize - } - _ => { - return Err(ArrowError::InvalidArgumentError( - "Invalid num_elements_size".to_string(), - )) - } + let num_elements_size = match is_large { + true => OffsetSizeBytes::Four, + false => OffsetSizeBytes::One, }; + // Skip the header byte to read the num_elements + // The size of the num_elements entry in the array value_data is 4 bytes if + // is_large is true, otherwise 1 byte. + let num_elements = num_elements_size.unpack_usize(self.value, 1, 0)?; + let first_offset_byte = 1 + num_elements_size as usize; + + let overflow = + || ArrowError::InvalidArgumentError("Variant value_byte_length overflow".into()); + + // 1. num_elements + 1 + let n_offsets = num_elements.checked_add(1).ok_or_else(overflow)?; + + // 2. (num_elements + 1) * offset_size + let value_bytes = n_offsets + .checked_mul(offset_size as usize) + .ok_or_else(overflow)?; - // Following the spec, there are "num_elements + 1" offsets in the value section - let first_value_byte = 1 + num_elements_size + (1 + num_elements) * offset_size as usize; + // 3. first_offset_byte + ... + let first_value_byte = first_offset_byte + .checked_add(value_bytes) + .ok_or_else(overflow)?; // Skip header and num_elements bytes to read the offsets let start_field_offset_from_first_value_byte = - offset_size.unpack_usize(self.value, 1 + num_elements_size, index)?; + offset_size.unpack_usize(self.value, first_offset_byte, index)?; let end_field_offset_from_first_value_byte = - offset_size.unpack_usize(self.value, 1 + num_elements_size, index + 1)?; + offset_size.unpack_usize(self.value, first_offset_byte, index + 1)?; // Read the value bytes from the offsets let variant_value_bytes = slice_from_slice( @@ -446,10 +444,7 @@ impl<'m, 'v> Variant<'m, 'v> { } pub fn as_null(&self) -> Option<()> { - match self { - Variant::Null => Some(()), - _ => None, - } + matches!(self, Variant::Null).then_some(()) } pub fn as_boolean(&self) -> Option { @@ -468,8 +463,8 @@ impl<'m, 'v> Variant<'m, 'v> { } pub fn as_int8(&self) -> Option { - match self { - Variant::Int8(i) => Some(*i), + match *self { + Variant::Int8(i) => Some(i), // TODO: Add branches for type-widening/shortening when implemting rest of primitives for int // Variant::Int16(i) => i.try_into().ok(), // ... @@ -494,10 +489,9 @@ impl<'m, 'v> From for Variant<'m, 'v> { impl<'m, 'v> From for Variant<'m, 'v> { fn from(value: bool) -> Self { - if value { - Variant::BooleanTrue - } else { - Variant::BooleanFalse + match value { + true => Variant::BooleanTrue, + false => Variant::BooleanFalse, } } } From e203359707734b7148d47e49cf4516434e291a9c Mon Sep 17 00:00:00 2001 From: Malthe Karbo Date: Fri, 30 May 2025 10:13:17 +0200 Subject: [PATCH 41/42] chore: PR feedback and better monotone check based on scovich --- parquet-variant/src/decoder.rs | 2 +- parquet-variant/src/utils.rs | 15 ++++++++++----- parquet-variant/src/variant.rs | 29 ++++++++++++----------------- 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs index 6d2931a9af73..80d6947c3da6 100644 --- a/parquet-variant/src/decoder.rs +++ b/parquet-variant/src/decoder.rs @@ -17,7 +17,7 @@ use arrow_schema::ArrowError; use std::array::TryFromSliceError; -use crate::utils::{array_from_slice, first_byte_from_slice, slice_from_slice, string_from_slice}; +use crate::utils::{array_from_slice, first_byte_from_slice, string_from_slice}; #[derive(Debug, Clone, Copy)] pub enum VariantBasicType { diff --git a/parquet-variant/src/utils.rs b/parquet-variant/src/utils.rs index 27069060c599..85feb0bcb1c9 100644 --- a/parquet-variant/src/utils.rs +++ b/parquet-variant/src/utils.rs @@ -18,13 +18,18 @@ use std::{array::TryFromSliceError, ops::Range, str}; use arrow_schema::ArrowError; +use std::fmt::Debug; +use std::slice::SliceIndex; + #[inline] -pub(crate) fn slice_from_slice(bytes: &[u8], range: Range) -> Result<&[u8], ArrowError> { - bytes.get(range.clone()).ok_or_else(|| { + +pub(crate) fn slice_from_slice + Clone + Debug>( + bytes: &[u8], + index: I, +) -> Result<&I::Output, ArrowError> { + bytes.get(index.clone()).ok_or_else(|| { ArrowError::InvalidArgumentError(format!( - "Tried to extract {} bytes at offset {} from {}-byte buffer", - range.end - range.start, - range.start, + "Tried to extract byte(s) {index:?} from {}-byte buffer", bytes.len(), )) }) diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 951ec1ad4253..204a9ddaf695 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -180,30 +180,25 @@ impl<'m> VariantMetadata<'m> { } // Check that all offsets are monotonically increasing - let mut prev = None; - for (i, offset) in (0..=dict_size) - .map(|i| header.offset_size.unpack_usize(bytes, 1, i + 1)) - .enumerate() - { + let mut offsets = (0..=dict_size).map(|i| header.offset_size.unpack_usize(bytes, 1, i + 1)); + let Some(Ok(mut end @ 0)) = offsets.next() else { + return Err(ArrowError::InvalidArgumentError( + "First offset is non-zero".to_string(), + )); + }; + + for offset in offsets { let offset = offset?; - if i == 0 && offset != 0 { - return Err(ArrowError::InvalidArgumentError( - "First offset is non-zero".to_string(), - )); - } - if prev.is_some_and(|prev| prev >= offset) { + if end >= offset { return Err(ArrowError::InvalidArgumentError( "Offsets are not monotonically increasing".to_string(), )); } - prev = Some(offset); + end = offset; } - // Check that the final offset equals the length of the - // dictionary-string section. - let dict_block_len = bytes.len() - dictionary_key_start_byte; // actual length of the string block - - if prev != Some(dict_block_len) { + // Verify the buffer covers the whole dictionary-string section + if end > bytes.len() - dictionary_key_start_byte { // `prev` holds the last offset seen still return Err(ArrowError::InvalidArgumentError( "Last offset does not equal dictionary length".to_string(), From 026774d6637fbcdfd8a6ba5384cbaf4a05540a14 Mon Sep 17 00:00:00 2001 From: Malthe Karbo Date: Fri, 30 May 2025 10:15:39 +0200 Subject: [PATCH 42/42] chore: metadata return struct not bytes --- parquet-variant/src/variant.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 204a9ddaf695..cf9f51acc72d 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -467,10 +467,10 @@ impl<'m, 'v> Variant<'m, 'v> { } } - pub fn metadata(&self) -> Option<&'m [u8]> { + pub fn metadata(&self) -> Option<&'m VariantMetadata> { match self { Variant::Object(VariantObject { metadata, .. }) - | Variant::Array(VariantArray { metadata, .. }) => Some(metadata.as_bytes()), + | Variant::Array(VariantArray { metadata, .. }) => Some(*metadata), _ => None, } }