From 512b73a98b1a9168582b59c7a3e1a75665e3976b Mon Sep 17 00:00:00 2001 From: SuperSerious Dev Date: Mon, 9 Jun 2025 14:27:24 -0700 Subject: [PATCH 01/19] Implement Binary variant --- parquet-variant/src/decoder.rs | 37 +++++++++++++++++++++++- parquet-variant/src/variant.rs | 16 ++++++++++ parquet-variant/tests/variant_interop.rs | 2 +- 3 files changed, 53 insertions(+), 2 deletions(-) diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs index a3d2f87062ea..897203bd3bd3 100644 --- a/parquet-variant/src/decoder.rs +++ b/parquet-variant/src/decoder.rs @@ -17,7 +17,7 @@ use arrow_schema::ArrowError; use std::array::TryFromSliceError; -use crate::utils::{array_from_slice, first_byte_from_slice, string_from_slice}; +use crate::utils::{array_from_slice, first_byte_from_slice, slice_from_slice, string_from_slice}; #[derive(Debug, Clone, Copy)] pub enum VariantBasicType { @@ -34,6 +34,7 @@ pub enum VariantPrimitiveType { BooleanFalse = 2, Int8 = 3, // TODO: Add types for the rest of primitives, once API is agreed upon + Binary = 15, String = 16, } @@ -65,6 +66,7 @@ impl TryFrom for VariantPrimitiveType { 2 => Ok(VariantPrimitiveType::BooleanFalse), 3 => Ok(VariantPrimitiveType::Int8), // TODO: Add types for the rest, once API is agreed upon + 15 => Ok(VariantPrimitiveType::Binary), 16 => Ok(VariantPrimitiveType::String), _ => Err(ArrowError::InvalidArgumentError(format!( "unknown primitive type: {}", @@ -90,6 +92,13 @@ pub(crate) fn decode_int8(value: &[u8]) -> Result { Ok(value) } +/// Decodes a Binary from the value section of a variant. +pub(crate) fn decode_binary(value: &[u8]) -> Result<&[u8], ArrowError> { + let len = u32::from_le_bytes(array_from_slice(value, 1)?) as usize; + let value = slice_from_slice(value, 5..5 + len)?; + Ok(value) +} + /// Decodes a long string from the value section of a variant. pub(crate) fn decode_long_string(value: &[u8]) -> Result<&str, ArrowError> { let len = u32::from_le_bytes(array_from_slice(value, 1)?) as usize; @@ -120,6 +129,32 @@ mod tests { Ok(()) } + #[test] + fn test_binary() -> Result<(), ArrowError> { + let value = [ + (VariantPrimitiveType::Binary as u8) << 2, // Basic type + 9, + 0, + 0, + 0, // Length of binary data, 4-byte little-endian + 0x03, + 0x13, + 0x37, + 0xde, + 0xad, + 0xbe, + 0xef, + 0xca, + 0xfe, // Data + ]; + let result = decode_binary(&value)?; + assert_eq!( + result, + [0x03, 0x13, 0x37, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe] + ); + Ok(()) + } + #[test] fn test_short_string() -> Result<(), ArrowError> { let value = [ diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index b7b1932580b1..ce1e1656551c 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -407,6 +407,7 @@ pub enum Variant<'m, 'v> { BooleanFalse, // Note: only need the *value* buffer + Binary(&'v [u8]), String(&'v str), ShortString(&'v str), @@ -426,6 +427,7 @@ impl<'m, 'v> Variant<'m, 'v> { VariantPrimitiveType::BooleanTrue => Variant::BooleanTrue, VariantPrimitiveType::BooleanFalse => Variant::BooleanFalse, // TODO: Add types for the rest, once API is agreed upon + VariantPrimitiveType::Binary => Variant::Binary(decoder::decode_binary(value)?), VariantPrimitiveType::String => { Variant::String(decoder::decode_long_string(value)?) } @@ -451,6 +453,14 @@ impl<'m, 'v> Variant<'m, 'v> { } } + pub fn as_u8_slice(&'v self) -> Option<&'v [u8]> { + if let Variant::Binary(d) = self { + Some(d) + } else { + None + } + } + pub fn as_string(&'v self) -> Option<&'v str> { match self { Variant::String(s) | Variant::ShortString(s) => Some(s), @@ -492,6 +502,12 @@ impl<'m, 'v> From for Variant<'m, 'v> { } } +impl<'m, 'v> From<&'v [u8]> for Variant<'m, 'v> { + fn from(value: &'v [u8]) -> Self { + Variant::Binary(value) + } +} + impl<'m, 'v> From<&'v str> for Variant<'m, 'v> { fn from(value: &'v str) -> Self { if value.len() < 64 { diff --git a/parquet-variant/tests/variant_interop.rs b/parquet-variant/tests/variant_interop.rs index 617a10d63d12..2c1478147117 100644 --- a/parquet-variant/tests/variant_interop.rs +++ b/parquet-variant/tests/variant_interop.rs @@ -46,7 +46,7 @@ fn get_primitive_cases() -> Vec<(&'static str, Variant<'static, 'static>)> { // Cases are commented out // Enabling is tracked in https://github.com/apache/arrow-rs/issues/7630 vec![ - // ("primitive_binary", Variant::Binary), + ("primitive_binary", Variant::Binary(&[0x03, 0x13, 0x37, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe])), ("primitive_boolean_false", Variant::BooleanFalse), ("primitive_boolean_true", Variant::BooleanTrue), // ("primitive_date", Variant::Null), From d354c9586f07958d8981772426a5e34a42d14eea Mon Sep 17 00:00:00 2001 From: SuperSerious Dev Date: Mon, 9 Jun 2025 15:09:38 -0700 Subject: [PATCH 02/19] Implement Date variant --- parquet-variant/Cargo.toml | 3 +-- parquet-variant/src/decoder.rs | 24 ++++++++++++++++++++++++ parquet-variant/src/variant.rs | 17 +++++++++++++++++ parquet-variant/tests/variant_interop.rs | 3 ++- 4 files changed, 44 insertions(+), 3 deletions(-) diff --git a/parquet-variant/Cargo.toml b/parquet-variant/Cargo.toml index 41b127ef14e6..47522f469995 100644 --- a/parquet-variant/Cargo.toml +++ b/parquet-variant/Cargo.toml @@ -32,7 +32,6 @@ rust-version = { workspace = true } [dependencies] arrow-schema = "55.1.0" +chrono = { workspace = true } [lib] - - diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs index 897203bd3bd3..296d523c5449 100644 --- a/parquet-variant/src/decoder.rs +++ b/parquet-variant/src/decoder.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. use arrow_schema::ArrowError; +use chrono::{DateTime, Duration, NaiveDate}; use std::array::TryFromSliceError; use crate::utils::{array_from_slice, first_byte_from_slice, slice_from_slice, string_from_slice}; @@ -34,6 +35,7 @@ pub enum VariantPrimitiveType { BooleanFalse = 2, Int8 = 3, // TODO: Add types for the rest of primitives, once API is agreed upon + Date = 11, Binary = 15, String = 16, } @@ -66,6 +68,7 @@ impl TryFrom for VariantPrimitiveType { 2 => Ok(VariantPrimitiveType::BooleanFalse), 3 => Ok(VariantPrimitiveType::Int8), // TODO: Add types for the rest, once API is agreed upon + 11 => Ok(VariantPrimitiveType::Date), 15 => Ok(VariantPrimitiveType::Binary), 16 => Ok(VariantPrimitiveType::String), _ => Err(ArrowError::InvalidArgumentError(format!( @@ -92,6 +95,13 @@ pub(crate) fn decode_int8(value: &[u8]) -> Result { Ok(value) } +/// Decodes a Date from the value section of a variant. +pub(crate) fn decode_date(value: &[u8]) -> Result { + let days_since_epoch = i32::from_le_bytes(array_from_slice(value, 1)?); + let value = (DateTime::UNIX_EPOCH + Duration::days(days_since_epoch as i64)).date_naive(); + Ok(value) +} + /// Decodes a Binary from the value section of a variant. pub(crate) fn decode_binary(value: &[u8]) -> Result<&[u8], ArrowError> { let len = u32::from_le_bytes(array_from_slice(value, 1)?) as usize; @@ -129,6 +139,20 @@ mod tests { Ok(()) } + #[test] + fn test_date() -> Result<(), ArrowError> { + let value = [ + (VariantPrimitiveType::Date as u8) << 2, // Basic type + 0xe2, + 0x4e, + 0x0, + 0x0, // Data + ]; + let result = decode_date(&value)?; + assert_eq!(result, NaiveDate::from_ymd_opt(2025, 4, 16).unwrap()); + Ok(()) + } + #[test] fn test_binary() -> Result<(), ArrowError> { let value = [ diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index ce1e1656551c..77adddbf84b1 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -19,6 +19,7 @@ use crate::decoder::{ }; use crate::utils::{array_from_slice, first_byte_from_slice, slice_from_slice, string_from_slice}; use arrow_schema::ArrowError; +use chrono::NaiveDate; use std::{num::TryFromIntError, ops::Range}; #[derive(Clone, Debug, Copy, PartialEq)] @@ -402,6 +403,7 @@ pub enum Variant<'m, 'v> { // TODO: Add types for the rest of the primitive types, once API is agreed upon Null, Int8(i8), + Date(NaiveDate), BooleanTrue, BooleanFalse, @@ -427,6 +429,7 @@ impl<'m, 'v> Variant<'m, 'v> { VariantPrimitiveType::BooleanTrue => Variant::BooleanTrue, VariantPrimitiveType::BooleanFalse => Variant::BooleanFalse, // TODO: Add types for the rest, once API is agreed upon + VariantPrimitiveType::Date => Variant::Date(decoder::decode_date(value)?), VariantPrimitiveType::Binary => Variant::Binary(decoder::decode_binary(value)?), VariantPrimitiveType::String => { Variant::String(decoder::decode_long_string(value)?) @@ -453,6 +456,14 @@ impl<'m, 'v> Variant<'m, 'v> { } } + pub fn as_naive_date(self) -> Option { + if let Variant::Date(d) = self { + Some(d) + } else { + None + } + } + pub fn as_u8_slice(&'v self) -> Option<&'v [u8]> { if let Variant::Binary(d) = self { Some(d) @@ -502,6 +513,12 @@ impl<'m, 'v> From for Variant<'m, 'v> { } } +impl<'m, 'v> From for Variant<'m, 'v> { + fn from(value: NaiveDate) -> Self { + Variant::Date(value) + } +} + impl<'m, 'v> From<&'v [u8]> for Variant<'m, 'v> { fn from(value: &'v [u8]) -> Self { Variant::Binary(value) diff --git a/parquet-variant/tests/variant_interop.rs b/parquet-variant/tests/variant_interop.rs index 2c1478147117..3bcbd03c1fd7 100644 --- a/parquet-variant/tests/variant_interop.rs +++ b/parquet-variant/tests/variant_interop.rs @@ -24,6 +24,7 @@ use std::fs; use std::path::{Path, PathBuf}; use arrow_schema::ArrowError; +use chrono::NaiveDate; use parquet_variant::{Variant, VariantMetadata}; fn cases_dir() -> PathBuf { @@ -49,7 +50,7 @@ fn get_primitive_cases() -> Vec<(&'static str, Variant<'static, 'static>)> { ("primitive_binary", Variant::Binary(&[0x03, 0x13, 0x37, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe])), ("primitive_boolean_false", Variant::BooleanFalse), ("primitive_boolean_true", Variant::BooleanTrue), - // ("primitive_date", Variant::Null), + ("primitive_date", Variant::Date(NaiveDate::from_ymd_opt(2025, 4 , 16).unwrap())), //("primitive_decimal4", Variant::Null), //("primitive_decimal8", Variant::Null), //("primitive_decimal16", Variant::Null), From e0918d1a1735c8869cf129904b118e3e4ff18257 Mon Sep 17 00:00:00 2001 From: SuperSerious Dev Date: Mon, 9 Jun 2025 20:27:47 -0700 Subject: [PATCH 03/19] Implement TimestampMicros and TimestampNTZMicros variants --- parquet-variant/src/decoder.rs | 79 +++++++++++++++++++++++- parquet-variant/src/variant.rs | 36 ++++++++++- parquet-variant/tests/variant_interop.rs | 4 +- 3 files changed, 115 insertions(+), 4 deletions(-) diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs index 296d523c5449..8c8ac2fa3bfd 100644 --- a/parquet-variant/src/decoder.rs +++ b/parquet-variant/src/decoder.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. use arrow_schema::ArrowError; -use chrono::{DateTime, Duration, NaiveDate}; +use chrono::{DateTime, Duration, NaiveDate, NaiveDateTime, Utc}; use std::array::TryFromSliceError; use crate::utils::{array_from_slice, first_byte_from_slice, slice_from_slice, string_from_slice}; @@ -36,6 +36,8 @@ pub enum VariantPrimitiveType { Int8 = 3, // TODO: Add types for the rest of primitives, once API is agreed upon Date = 11, + TimestampMicros = 12, + TimestampNTZMicros = 13, Binary = 15, String = 16, } @@ -69,6 +71,8 @@ impl TryFrom for VariantPrimitiveType { 3 => Ok(VariantPrimitiveType::Int8), // TODO: Add types for the rest, once API is agreed upon 11 => Ok(VariantPrimitiveType::Date), + 12 => Ok(VariantPrimitiveType::TimestampMicros), + 13 => Ok(VariantPrimitiveType::TimestampNTZMicros), 15 => Ok(VariantPrimitiveType::Binary), 16 => Ok(VariantPrimitiveType::String), _ => Err(ArrowError::InvalidArgumentError(format!( @@ -102,6 +106,30 @@ pub(crate) fn decode_date(value: &[u8]) -> Result { Ok(value) } +/// Decodes a TimestampMicros from the value section of a variant. +pub(crate) fn decode_timestamp_micros(value: &[u8]) -> Result, ArrowError> { + let micros_since_epoch = i64::from_le_bytes(array_from_slice(value, 1)?); + if let Some(value) = DateTime::from_timestamp_micros(micros_since_epoch) { + Ok(value) + } else { + Err(ArrowError::CastError(format!( + "Could not cast `{micros_since_epoch}` microseconds into a DateTime" + ))) + } +} + +/// Decodes a TimestampNTZMicros from the value section of a variant. +pub(crate) fn decode_timestampntz_micros(value: &[u8]) -> Result { + let micros_since_epoch = i64::from_le_bytes(array_from_slice(value, 1)?); + if let Some(value) = DateTime::from_timestamp_micros(micros_since_epoch) { + Ok(value.naive_utc()) + } else { + Err(ArrowError::CastError(format!( + "Could not cast `{micros_since_epoch}` microseconds into a NaiveDateTime" + ))) + } +} + /// Decodes a Binary from the value section of a variant. pub(crate) fn decode_binary(value: &[u8]) -> Result<&[u8], ArrowError> { let len = u32::from_le_bytes(array_from_slice(value, 1)?) as usize; @@ -153,6 +181,55 @@ mod tests { Ok(()) } + #[test] + fn test_timestamp_micros() -> Result<(), ArrowError> { + let value = [ + (VariantPrimitiveType::TimestampMicros as u8) << 2, // Basic type + 0xe0, + 0x52, + 0x97, + 0xdd, + 0xe7, + 0x32, + 0x06, + 0x00, // Data + ]; + let result = decode_timestamp_micros(&value)?; + assert_eq!( + result, + NaiveDate::from_ymd_opt(2025, 4, 16) + .unwrap() + .and_hms_milli_opt(16, 34, 56, 780) + .unwrap() + .and_utc() + ); + Ok(()) + } + + #[test] + fn test_timestampntz_micros() -> Result<(), ArrowError> { + let value = [ + (VariantPrimitiveType::TimestampNTZMicros as u8) << 2, // Basic type + 0xe0, + 0x52, + 0x97, + 0xdd, + 0xe7, + 0x32, + 0x06, + 0x00, // Data + ]; + let result = decode_timestampntz_micros(&value)?; + assert_eq!( + result, + NaiveDate::from_ymd_opt(2025, 4, 16) + .unwrap() + .and_hms_milli_opt(16, 34, 56, 780) + .unwrap() + ); + Ok(()) + } + #[test] fn test_binary() -> Result<(), ArrowError> { let value = [ diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 77adddbf84b1..e3504160fa70 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -19,7 +19,7 @@ use crate::decoder::{ }; use crate::utils::{array_from_slice, first_byte_from_slice, slice_from_slice, string_from_slice}; use arrow_schema::ArrowError; -use chrono::NaiveDate; +use chrono::{DateTime, NaiveDate, NaiveDateTime, Utc}; use std::{num::TryFromIntError, ops::Range}; #[derive(Clone, Debug, Copy, PartialEq)] @@ -404,6 +404,8 @@ pub enum Variant<'m, 'v> { Null, Int8(i8), Date(NaiveDate), + TimestampMicros(DateTime), + TimestampNTZMicros(NaiveDateTime), BooleanTrue, BooleanFalse, @@ -430,6 +432,12 @@ impl<'m, 'v> Variant<'m, 'v> { VariantPrimitiveType::BooleanFalse => Variant::BooleanFalse, // TODO: Add types for the rest, once API is agreed upon VariantPrimitiveType::Date => Variant::Date(decoder::decode_date(value)?), + VariantPrimitiveType::TimestampMicros => { + Variant::TimestampMicros(decoder::decode_timestamp_micros(value)?) + } + VariantPrimitiveType::TimestampNTZMicros => { + Variant::TimestampNTZMicros(decoder::decode_timestampntz_micros(value)?) + } VariantPrimitiveType::Binary => Variant::Binary(decoder::decode_binary(value)?), VariantPrimitiveType::String => { Variant::String(decoder::decode_long_string(value)?) @@ -464,6 +472,21 @@ impl<'m, 'v> Variant<'m, 'v> { } } + pub fn as_datetime_utc(self) -> Option> { + if let Variant::TimestampMicros(d) = self { + Some(d) + } else { + None + } + } + pub fn as_naive_datetime(self) -> Option { + if let Variant::TimestampNTZMicros(d) = self { + Some(d) + } else { + None + } + } + pub fn as_u8_slice(&'v self) -> Option<&'v [u8]> { if let Variant::Binary(d) = self { Some(d) @@ -519,6 +542,17 @@ impl<'m, 'v> From for Variant<'m, 'v> { } } +impl<'m, 'v> From> for Variant<'m, 'v> { + fn from(value: DateTime) -> Self { + Variant::TimestampMicros(value) + } +} +impl<'m, 'v> From for Variant<'m, 'v> { + fn from(value: NaiveDateTime) -> Self { + Variant::TimestampNTZMicros(value) + } +} + impl<'m, 'v> From<&'v [u8]> for Variant<'m, 'v> { fn from(value: &'v [u8]) -> Self { Variant::Binary(value) diff --git a/parquet-variant/tests/variant_interop.rs b/parquet-variant/tests/variant_interop.rs index 3bcbd03c1fd7..55aaa936c5b7 100644 --- a/parquet-variant/tests/variant_interop.rs +++ b/parquet-variant/tests/variant_interop.rs @@ -61,8 +61,8 @@ fn get_primitive_cases() -> Vec<(&'static str, Variant<'static, 'static>)> { //("primitive_int64", Variant::Null), ("primitive_null", Variant::Null), ("primitive_string", Variant::String("This string is longer than 64 bytes and therefore does not fit in a short_string and it also includes several non ascii characters such as 🐢, 💖, ♥\u{fe0f}, 🎣 and 🤦!!")), - //("primitive_timestamp", Variant::Null), - //("primitive_timestampntz", Variant::Null), + ("primitive_timestamp", Variant::TimestampMicros(NaiveDate::from_ymd_opt(2025, 4, 16).unwrap().and_hms_milli_opt(16, 34, 56, 780).unwrap().and_utc())), + ("primitive_timestampntz", Variant::TimestampNTZMicros(NaiveDate::from_ymd_opt(2025, 4, 16).unwrap().and_hms_milli_opt(12, 34, 56, 780).unwrap())), ("short_string", Variant::ShortString("Less than 64 bytes (❤\u{fe0f} with utf8)")), ] } From 2eb1d227e2468faf19fcf0afdc13ba08c99f98bc Mon Sep 17 00:00:00 2001 From: SuperSerious Dev Date: Tue, 10 Jun 2025 14:45:18 -0700 Subject: [PATCH 04/19] Implement Int16, Int32, and Int64 variants --- parquet-variant/src/decoder.rs | 71 +++++++++++++++++++++++- parquet-variant/src/variant.rs | 58 ++++++++++++++++++- parquet-variant/tests/variant_interop.rs | 6 +- 3 files changed, 127 insertions(+), 8 deletions(-) diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs index 8c8ac2fa3bfd..bebb388c5aae 100644 --- a/parquet-variant/src/decoder.rs +++ b/parquet-variant/src/decoder.rs @@ -34,6 +34,9 @@ pub enum VariantPrimitiveType { BooleanTrue = 1, BooleanFalse = 2, Int8 = 3, + Int16 = 4, + Int32 = 5, + Int64 = 6, // TODO: Add types for the rest of primitives, once API is agreed upon Date = 11, TimestampMicros = 12, @@ -69,6 +72,9 @@ impl TryFrom for VariantPrimitiveType { 1 => Ok(VariantPrimitiveType::BooleanTrue), 2 => Ok(VariantPrimitiveType::BooleanFalse), 3 => Ok(VariantPrimitiveType::Int8), + 4 => Ok(VariantPrimitiveType::Int16), + 5 => Ok(VariantPrimitiveType::Int32), + 6 => Ok(VariantPrimitiveType::Int64), // TODO: Add types for the rest, once API is agreed upon 11 => Ok(VariantPrimitiveType::Date), 12 => Ok(VariantPrimitiveType::TimestampMicros), @@ -98,6 +104,23 @@ pub(crate) fn decode_int8(value: &[u8]) -> Result { let value = i8::from_le_bytes(array_from_slice(value, 1)?); Ok(value) } +/// Decodes an Int16 from the value section of a variant. +pub(crate) fn decode_int16(value: &[u8]) -> Result { + let value = i16::from_le_bytes(array_from_slice(value, 1)?); + Ok(value) +} + +/// Decodes an Int32 from the value section of a variant. +pub(crate) fn decode_int32(value: &[u8]) -> Result { + let value = i32::from_le_bytes(array_from_slice(value, 1)?); + Ok(value) +} + +/// Decodes an Int64 from the value section of a variant. +pub(crate) fn decode_int64(value: &[u8]) -> Result { + let value = i64::from_le_bytes(array_from_slice(value, 1)?); + Ok(value) +} /// Decodes a Date from the value section of a variant. pub(crate) fn decode_date(value: &[u8]) -> Result { @@ -159,14 +182,58 @@ mod tests { #[test] fn test_i8() -> Result<(), ArrowError> { let value = [ - 3 << 2, // Primitive type for i8 - 42, + (VariantPrimitiveType::Int8 as u8) << 2, // Basic type + 0x2a, // Data ]; let result = decode_int8(&value)?; assert_eq!(result, 42); Ok(()) } + #[test] + fn test_i16() -> Result<(), ArrowError> { + let value = [ + (VariantPrimitiveType::Int16 as u8) << 2, // Basic type + 0xd2, + 0x04, // Data + ]; + let result = decode_int16(&value)?; + assert_eq!(result, 1234); + Ok(()) + } + + #[test] + fn test_i32() -> Result<(), ArrowError> { + let value = [ + (VariantPrimitiveType::Int32 as u8) << 2, // Basic type + 0x40, + 0xe2, + 0x01, + 0x00, // Data + ]; + let result = decode_int32(&value)?; + assert_eq!(result, 123456); + Ok(()) + } + + #[test] + fn test_i64() -> Result<(), ArrowError> { + let value = [ + (VariantPrimitiveType::Int64 as u8) << 2, // Basic type + 0x15, + 0x81, + 0xe9, + 0x7d, + 0xf4, + 0x10, + 0x22, + 0x11, // Data + ]; + let result = decode_int64(&value)?; + assert_eq!(result, 1234567890123456789); + Ok(()) + } + #[test] fn test_date() -> Result<(), ArrowError> { let value = [ diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index e3504160fa70..7fd3ab88b168 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -403,6 +403,9 @@ pub enum Variant<'m, 'v> { // TODO: Add types for the rest of the primitive types, once API is agreed upon Null, Int8(i8), + Int16(i16), + Int32(i32), + Int64(i64), Date(NaiveDate), TimestampMicros(DateTime), TimestampNTZMicros(NaiveDateTime), @@ -428,6 +431,9 @@ impl<'m, 'v> Variant<'m, 'v> { VariantBasicType::Primitive => match get_primitive_type(header)? { VariantPrimitiveType::Null => Variant::Null, VariantPrimitiveType::Int8 => Variant::Int8(decoder::decode_int8(value)?), + VariantPrimitiveType::Int16 => Variant::Int16(decoder::decode_int16(value)?), + VariantPrimitiveType::Int32 => Variant::Int32(decoder::decode_int32(value)?), + VariantPrimitiveType::Int64 => Variant::Int64(decoder::decode_int64(value)?), VariantPrimitiveType::BooleanTrue => Variant::BooleanTrue, VariantPrimitiveType::BooleanFalse => Variant::BooleanFalse, // TODO: Add types for the rest, once API is agreed upon @@ -505,9 +511,37 @@ impl<'m, 'v> Variant<'m, 'v> { pub fn as_int8(&self) -> Option { match *self { Variant::Int8(i) => Some(i), - // TODO: Add branches for type-widening/shortening when implemting rest of primitives for int - // Variant::Int16(i) => i.try_into().ok(), - // ... + Variant::Int16(i) => i.try_into().ok(), + Variant::Int32(i) => i.try_into().ok(), + Variant::Int64(i) => i.try_into().ok(), + _ => None, + } + } + + pub fn as_int16(&self) -> Option { + match *self { + Variant::Int8(i) => Some(i.into()), + Variant::Int16(i) => Some(i), + Variant::Int32(i) => i.try_into().ok(), + Variant::Int64(i) => i.try_into().ok(), + _ => None, + } + } + pub fn as_int32(&self) -> Option { + match *self { + Variant::Int8(i) => Some(i.into()), + Variant::Int16(i) => Some(i.into()), + Variant::Int32(i) => Some(i), + Variant::Int64(i) => i.try_into().ok(), + _ => None, + } + } + pub fn as_int64(&self) -> Option { + match *self { + Variant::Int8(i) => Some(i.into()), + Variant::Int16(i) => Some(i.into()), + Variant::Int32(i) => Some(i.into()), + Variant::Int64(i) => Some(i), _ => None, } } @@ -527,6 +561,24 @@ impl<'m, 'v> From for Variant<'m, 'v> { } } +impl<'m, 'v> From for Variant<'m, 'v> { + fn from(value: i16) -> Self { + Variant::Int16(value) + } +} + +impl<'m, 'v> From for Variant<'m, 'v> { + fn from(value: i32) -> Self { + Variant::Int32(value) + } +} + +impl<'m, 'v> From for Variant<'m, 'v> { + fn from(value: i64) -> Self { + Variant::Int64(value) + } +} + impl<'m, 'v> From for Variant<'m, 'v> { fn from(value: bool) -> Self { match value { diff --git a/parquet-variant/tests/variant_interop.rs b/parquet-variant/tests/variant_interop.rs index 55aaa936c5b7..175e1ec720c3 100644 --- a/parquet-variant/tests/variant_interop.rs +++ b/parquet-variant/tests/variant_interop.rs @@ -56,9 +56,9 @@ fn get_primitive_cases() -> Vec<(&'static str, Variant<'static, 'static>)> { //("primitive_decimal16", Variant::Null), //("primitive_float", Variant::Null), ("primitive_int8", Variant::Int8(42)), - //("primitive_int16", Variant::Null), - //("primitive_int32", Variant::Null), - //("primitive_int64", Variant::Null), + ("primitive_int16", Variant::Int16(1234)), + ("primitive_int32", Variant::Int32(123456)), + ("primitive_int64", Variant::Int64(1234567890123456789)), ("primitive_null", Variant::Null), ("primitive_string", Variant::String("This string is longer than 64 bytes and therefore does not fit in a short_string and it also includes several non ascii characters such as 🐢, 💖, ♥\u{fe0f}, 🎣 and 🤦!!")), ("primitive_timestamp", Variant::TimestampMicros(NaiveDate::from_ymd_opt(2025, 4, 16).unwrap().and_hms_milli_opt(16, 34, 56, 780).unwrap().and_utc())), From f9261382dd975bd9e6eed7a72e9aea00fd2a1faa Mon Sep 17 00:00:00 2001 From: SuperSerious Dev Date: Tue, 10 Jun 2025 15:22:55 -0700 Subject: [PATCH 05/19] Implement Decimal4, Decimal8, and Decimal16 variants --- parquet-variant/src/decoder.rs | 88 ++++++++++++++++++++++++ parquet-variant/src/variant.rs | 88 +++++++++++++++++++++++- parquet-variant/tests/variant_interop.rs | 6 +- 3 files changed, 178 insertions(+), 4 deletions(-) diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs index bebb388c5aae..65f274f353a3 100644 --- a/parquet-variant/src/decoder.rs +++ b/parquet-variant/src/decoder.rs @@ -37,6 +37,9 @@ pub enum VariantPrimitiveType { Int16 = 4, Int32 = 5, Int64 = 6, + Decimal4 = 8, + Decimal8 = 9, + Decimal16 = 10, // TODO: Add types for the rest of primitives, once API is agreed upon Date = 11, TimestampMicros = 12, @@ -76,6 +79,9 @@ impl TryFrom for VariantPrimitiveType { 5 => Ok(VariantPrimitiveType::Int32), 6 => Ok(VariantPrimitiveType::Int64), // TODO: Add types for the rest, once API is agreed upon + 8 => Ok(VariantPrimitiveType::Decimal4), + 9 => Ok(VariantPrimitiveType::Decimal8), + 10 => Ok(VariantPrimitiveType::Decimal16), 11 => Ok(VariantPrimitiveType::Date), 12 => Ok(VariantPrimitiveType::TimestampMicros), 13 => Ok(VariantPrimitiveType::TimestampNTZMicros), @@ -122,6 +128,27 @@ pub(crate) fn decode_int64(value: &[u8]) -> Result { Ok(value) } +/// Decodes a Decimal4 from the value section of a variant. +pub(crate) fn decode_decimal4(value: &[u8]) -> Result<(i32, u8), ArrowError> { + let scale = u8::from_le_bytes(array_from_slice(value, 1)?); + let integer = i32::from_le_bytes(array_from_slice(value, 2)?); + Ok((integer, scale)) +} + +/// Decodes a Decimal8 from the value section of a variant. +pub(crate) fn decode_decimal8(value: &[u8]) -> Result<(i64, u8), ArrowError> { + let scale = u8::from_le_bytes(array_from_slice(value, 1)?); + let integer = i64::from_le_bytes(array_from_slice(value, 2)?); + Ok((integer, scale)) +} + +/// Decodes a Decimal16 from the value section of a variant. +pub(crate) fn decode_decimal16(value: &[u8]) -> Result<(i128, u8), ArrowError> { + let scale = u8::from_le_bytes(array_from_slice(value, 1)?); + let integer = i128::from_le_bytes(array_from_slice(value, 2)?); + Ok((integer, scale)) +} + /// Decodes a Date from the value section of a variant. pub(crate) fn decode_date(value: &[u8]) -> Result { let days_since_epoch = i32::from_le_bytes(array_from_slice(value, 1)?); @@ -234,6 +261,67 @@ mod tests { Ok(()) } + #[test] + fn test_decimal4() -> Result<(), ArrowError> { + let value = [ + (VariantPrimitiveType::Decimal4 as u8) << 2, // Basic type + 0x02, // Scale + 0xd2, + 0x04, + 0x00, + 0x00, // Integer + ]; + let result = decode_decimal4(&value)?; + assert_eq!(result, (1234, 2)); + Ok(()) + } + + #[test] + fn test_decimal8() -> Result<(), ArrowError> { + let value = [ + (VariantPrimitiveType::Decimal8 as u8) << 2, // Basic type + 0x02, // Scale + 0xd2, + 0x02, + 0x96, + 0x49, + 0x00, + 0x00, + 0x00, + 0x00, // Integer + ]; + let result = decode_decimal8(&value)?; + assert_eq!(result, (1234567890, 2)); + Ok(()) + } + + #[test] + fn test_decimal16() -> Result<(), ArrowError> { + let value = [ + (VariantPrimitiveType::Decimal16 as u8) << 2, // Basic type + 0x02, // Scale + 0xd2, + 0xb6, + 0x23, + 0xc0, + 0xf4, + 0x10, + 0x22, + 0x11, + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, // Integer + ]; + let result = decode_decimal16(&value)?; + assert_eq!(result, (1234567891234567890, 2)); + Ok(()) + } + #[test] fn test_date() -> Result<(), ArrowError> { let value = [ diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 7fd3ab88b168..6f2cef80b765 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -409,7 +409,9 @@ pub enum Variant<'m, 'v> { Date(NaiveDate), TimestampMicros(DateTime), TimestampNTZMicros(NaiveDateTime), - + Decimal4 { integer: i32, scale: u8 }, + Decimal8 { integer: i64, scale: u8 }, + Decimal16 { integer: i128, scale: u8 }, BooleanTrue, BooleanFalse, @@ -434,6 +436,18 @@ impl<'m, 'v> Variant<'m, 'v> { VariantPrimitiveType::Int16 => Variant::Int16(decoder::decode_int16(value)?), VariantPrimitiveType::Int32 => Variant::Int32(decoder::decode_int32(value)?), VariantPrimitiveType::Int64 => Variant::Int64(decoder::decode_int64(value)?), + VariantPrimitiveType::Decimal4 => { + let (integer, scale) = decoder::decode_decimal4(value)?; + Variant::Decimal4 { integer, scale } + } + VariantPrimitiveType::Decimal8 => { + let (integer, scale) = decoder::decode_decimal8(value)?; + Variant::Decimal8 { integer, scale } + } + VariantPrimitiveType::Decimal16 => { + let (integer, scale) = decoder::decode_decimal16(value)?; + Variant::Decimal16 { integer, scale } + } VariantPrimitiveType::BooleanTrue => Variant::BooleanTrue, VariantPrimitiveType::BooleanFalse => Variant::BooleanFalse, // TODO: Add types for the rest, once API is agreed upon @@ -546,6 +560,51 @@ impl<'m, 'v> Variant<'m, 'v> { } } + pub fn as_decimal_int32(&self) -> Option<(i32, u8)> { + match *self { + Variant::Decimal4 { integer, scale } => Some((integer, scale)), + Variant::Decimal8 { integer, scale } => { + if let Ok(converted_integer) = integer.try_into() { + Some((converted_integer, scale)) + } else { + None + } + } + Variant::Decimal16 { integer, scale } => { + if let Ok(converted_integer) = integer.try_into() { + Some((converted_integer, scale)) + } else { + None + } + } + _ => None, + } + } + + pub fn as_decimal_int64(&self) -> Option<(i64, u8)> { + match *self { + Variant::Decimal4 { integer, scale } => Some((integer.into(), scale)), + Variant::Decimal8 { integer, scale } => Some((integer, scale)), + Variant::Decimal16 { integer, scale } => { + if let Ok(converted_integer) = integer.try_into() { + Some((converted_integer, scale)) + } else { + None + } + } + _ => None, + } + } + + pub fn as_decimal_int128(&self) -> Option<(i128, u8)> { + match *self { + Variant::Decimal4 { integer, scale } => Some((integer.into(), scale)), + Variant::Decimal8 { integer, scale } => Some((integer.into(), scale)), + Variant::Decimal16 { integer, scale } => Some((integer, scale)), + _ => None, + } + } + pub fn metadata(&self) -> Option<&'m VariantMetadata> { match self { Variant::Object(VariantObject { metadata, .. }) @@ -579,6 +638,33 @@ impl<'m, 'v> From for Variant<'m, 'v> { } } +impl<'m, 'v> From<(i32, u8)> for Variant<'m, 'v> { + fn from(value: (i32, u8)) -> Self { + Variant::Decimal4 { + integer: value.0, + scale: value.1, + } + } +} + +impl<'m, 'v> From<(i64, u8)> for Variant<'m, 'v> { + fn from(value: (i64, u8)) -> Self { + Variant::Decimal8 { + integer: value.0, + scale: value.1, + } + } +} + +impl<'m, 'v> From<(i128, u8)> for Variant<'m, 'v> { + fn from(value: (i128, u8)) -> Self { + Variant::Decimal16 { + integer: value.0, + scale: value.1, + } + } +} + impl<'m, 'v> From for Variant<'m, 'v> { fn from(value: bool) -> Self { match value { diff --git a/parquet-variant/tests/variant_interop.rs b/parquet-variant/tests/variant_interop.rs index 175e1ec720c3..215c47585c79 100644 --- a/parquet-variant/tests/variant_interop.rs +++ b/parquet-variant/tests/variant_interop.rs @@ -51,9 +51,9 @@ fn get_primitive_cases() -> Vec<(&'static str, Variant<'static, 'static>)> { ("primitive_boolean_false", Variant::BooleanFalse), ("primitive_boolean_true", Variant::BooleanTrue), ("primitive_date", Variant::Date(NaiveDate::from_ymd_opt(2025, 4 , 16).unwrap())), - //("primitive_decimal4", Variant::Null), - //("primitive_decimal8", Variant::Null), - //("primitive_decimal16", Variant::Null), + ("primitive_decimal4", Variant::Decimal4{integer: 1234, scale: 2}), + ("primitive_decimal8", Variant::Decimal8{integer: 1234567890, scale: 2}), + ("primitive_decimal16", Variant::Decimal16{integer: 1234567891234567890, scale: 2}), //("primitive_float", Variant::Null), ("primitive_int8", Variant::Int8(42)), ("primitive_int16", Variant::Int16(1234)), From aa64f33cbaaed8e9a53525539f0e210689ffc108 Mon Sep 17 00:00:00 2001 From: SuperSerious Dev Date: Tue, 10 Jun 2025 20:17:20 -0700 Subject: [PATCH 06/19] Implement Float variant --- parquet-variant/src/decoder.rs | 23 +++++++++++++++++++++++ parquet-variant/src/variant.rs | 18 ++++++++++++++++++ parquet-variant/tests/variant_interop.rs | 2 +- 3 files changed, 42 insertions(+), 1 deletion(-) diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs index 65f274f353a3..34417990d075 100644 --- a/parquet-variant/src/decoder.rs +++ b/parquet-variant/src/decoder.rs @@ -44,6 +44,7 @@ pub enum VariantPrimitiveType { Date = 11, TimestampMicros = 12, TimestampNTZMicros = 13, + Float = 14, Binary = 15, String = 16, } @@ -85,6 +86,7 @@ impl TryFrom for VariantPrimitiveType { 11 => Ok(VariantPrimitiveType::Date), 12 => Ok(VariantPrimitiveType::TimestampMicros), 13 => Ok(VariantPrimitiveType::TimestampNTZMicros), + 14 => Ok(VariantPrimitiveType::Float), 15 => Ok(VariantPrimitiveType::Binary), 16 => Ok(VariantPrimitiveType::String), _ => Err(ArrowError::InvalidArgumentError(format!( @@ -110,6 +112,7 @@ pub(crate) fn decode_int8(value: &[u8]) -> Result { let value = i8::from_le_bytes(array_from_slice(value, 1)?); Ok(value) } + /// Decodes an Int16 from the value section of a variant. pub(crate) fn decode_int16(value: &[u8]) -> Result { let value = i16::from_le_bytes(array_from_slice(value, 1)?); @@ -149,6 +152,12 @@ pub(crate) fn decode_decimal16(value: &[u8]) -> Result<(i128, u8), ArrowError> { Ok((integer, scale)) } +/// Decodes a Float from the value section of a variant. +pub(crate) fn decode_float(value: &[u8]) -> Result { + let value = f32::from_le_bytes(array_from_slice(value, 1)?); + Ok(value) +} + /// Decodes a Date from the value section of a variant. pub(crate) fn decode_date(value: &[u8]) -> Result { let days_since_epoch = i32::from_le_bytes(array_from_slice(value, 1)?); @@ -322,6 +331,20 @@ mod tests { Ok(()) } + #[test] + fn test_float() -> Result<(), ArrowError> { + let value = [ + (VariantPrimitiveType::Float as u8) << 2, // Basic type + 0x06, + 0x2c, + 0x93, + 0x4e, // Data + ]; + let result = decode_float(&value)?; + assert_eq!(result, 1234567890.1234); + Ok(()) + } + #[test] fn test_date() -> Result<(), ArrowError> { let value = [ diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 6f2cef80b765..941fff1cb453 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -412,6 +412,7 @@ pub enum Variant<'m, 'v> { Decimal4 { integer: i32, scale: u8 }, Decimal8 { integer: i64, scale: u8 }, Decimal16 { integer: i128, scale: u8 }, + Float(f32), BooleanTrue, BooleanFalse, @@ -448,6 +449,7 @@ impl<'m, 'v> Variant<'m, 'v> { let (integer, scale) = decoder::decode_decimal16(value)?; Variant::Decimal16 { integer, scale } } + VariantPrimitiveType::Float => Variant::Float(decoder::decode_float(value)?), VariantPrimitiveType::BooleanTrue => Variant::BooleanTrue, VariantPrimitiveType::BooleanFalse => Variant::BooleanFalse, // TODO: Add types for the rest, once API is agreed upon @@ -605,6 +607,16 @@ impl<'m, 'v> Variant<'m, 'v> { } } + pub fn as_f32(&self) -> Option { + match *self { + Variant::Float(i) => Some(i), + // TODO Add Variant::Double + // TODO Add int variants? + // TODO Add decimal variants? + _ => None, + } + } + pub fn metadata(&self) -> Option<&'m VariantMetadata> { match self { Variant::Object(VariantObject { metadata, .. }) @@ -665,6 +677,12 @@ impl<'m, 'v> From<(i128, u8)> for Variant<'m, 'v> { } } +impl<'m, 'v> From for Variant<'m, 'v> { + fn from(value: f32) -> Self { + Variant::Float(value) + } +} + impl<'m, 'v> From for Variant<'m, 'v> { fn from(value: bool) -> Self { match value { diff --git a/parquet-variant/tests/variant_interop.rs b/parquet-variant/tests/variant_interop.rs index 215c47585c79..cec23cc8bd53 100644 --- a/parquet-variant/tests/variant_interop.rs +++ b/parquet-variant/tests/variant_interop.rs @@ -54,7 +54,7 @@ fn get_primitive_cases() -> Vec<(&'static str, Variant<'static, 'static>)> { ("primitive_decimal4", Variant::Decimal4{integer: 1234, scale: 2}), ("primitive_decimal8", Variant::Decimal8{integer: 1234567890, scale: 2}), ("primitive_decimal16", Variant::Decimal16{integer: 1234567891234567890, scale: 2}), - //("primitive_float", Variant::Null), + ("primitive_float", Variant::Float(1234567890.1234)), ("primitive_int8", Variant::Int8(42)), ("primitive_int16", Variant::Int16(1234)), ("primitive_int32", Variant::Int32(123456)), From 8b40263c93e1b068e52684b8cc61fd760dc7f271 Mon Sep 17 00:00:00 2001 From: SuperSerious Dev Date: Wed, 11 Jun 2025 19:34:33 -0700 Subject: [PATCH 07/19] Remove unnecessary TODOs --- parquet-variant/src/decoder.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs index 34417990d075..ea02ca758073 100644 --- a/parquet-variant/src/decoder.rs +++ b/parquet-variant/src/decoder.rs @@ -40,7 +40,6 @@ pub enum VariantPrimitiveType { Decimal4 = 8, Decimal8 = 9, Decimal16 = 10, - // TODO: Add types for the rest of primitives, once API is agreed upon Date = 11, TimestampMicros = 12, TimestampNTZMicros = 13, @@ -79,7 +78,6 @@ impl TryFrom for VariantPrimitiveType { 4 => Ok(VariantPrimitiveType::Int16), 5 => Ok(VariantPrimitiveType::Int32), 6 => Ok(VariantPrimitiveType::Int64), - // TODO: Add types for the rest, once API is agreed upon 8 => Ok(VariantPrimitiveType::Decimal4), 9 => Ok(VariantPrimitiveType::Decimal8), 10 => Ok(VariantPrimitiveType::Decimal16), From 85036ad3f6c47d304b8aaf0621efb2a470368ff7 Mon Sep 17 00:00:00 2001 From: SuperSerious Dev Date: Wed, 11 Jun 2025 20:21:16 -0700 Subject: [PATCH 08/19] Simplify offset calculations for values by separating value-metadata from value-data --- parquet-variant/src/decoder.rs | 265 ++++++++++----------------------- parquet-variant/src/variant.rs | 71 +++++---- 2 files changed, 119 insertions(+), 217 deletions(-) diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs index ea02ca758073..f77115476b0f 100644 --- a/parquet-variant/src/decoder.rs +++ b/parquet-variant/src/decoder.rs @@ -18,7 +18,7 @@ use arrow_schema::ArrowError; use chrono::{DateTime, Duration, NaiveDate, NaiveDateTime, Utc}; use std::array::TryFromSliceError; -use crate::utils::{array_from_slice, first_byte_from_slice, slice_from_slice, string_from_slice}; +use crate::utils::{array_from_slice, slice_from_slice, string_from_slice}; #[derive(Debug, Clone, Copy)] pub enum VariantBasicType { @@ -94,10 +94,10 @@ impl TryFrom for VariantPrimitiveType { } } } -/// Extract the primitive type from a Variant value-header byte -pub(crate) fn get_primitive_type(header: u8) -> Result { +/// Extract the primitive type from a Variant value-metadata byte +pub(crate) fn get_primitive_type(metadata: u8) -> Result { // last 6 bits contain the primitive-type, see spec - VariantPrimitiveType::try_from(header >> 2) + VariantPrimitiveType::try_from(metadata >> 2) } /// To be used in `map_err` when unpacking an integer from a slice of bytes. @@ -106,66 +106,66 @@ fn map_try_from_slice_error(e: TryFromSliceError) -> ArrowError { } /// Decodes an Int8 from the value section of a variant. -pub(crate) fn decode_int8(value: &[u8]) -> Result { - let value = i8::from_le_bytes(array_from_slice(value, 1)?); +pub(crate) fn decode_int8(data: &[u8]) -> Result { + let value = i8::from_le_bytes(array_from_slice(data, 0)?); Ok(value) } /// Decodes an Int16 from the value section of a variant. -pub(crate) fn decode_int16(value: &[u8]) -> Result { - let value = i16::from_le_bytes(array_from_slice(value, 1)?); +pub(crate) fn decode_int16(data: &[u8]) -> Result { + let value = i16::from_le_bytes(array_from_slice(data, 0)?); Ok(value) } /// Decodes an Int32 from the value section of a variant. -pub(crate) fn decode_int32(value: &[u8]) -> Result { - let value = i32::from_le_bytes(array_from_slice(value, 1)?); +pub(crate) fn decode_int32(data: &[u8]) -> Result { + let value = i32::from_le_bytes(array_from_slice(data, 0)?); Ok(value) } /// Decodes an Int64 from the value section of a variant. -pub(crate) fn decode_int64(value: &[u8]) -> Result { - let value = i64::from_le_bytes(array_from_slice(value, 1)?); +pub(crate) fn decode_int64(data: &[u8]) -> Result { + let value = i64::from_le_bytes(array_from_slice(data, 0)?); Ok(value) } /// Decodes a Decimal4 from the value section of a variant. -pub(crate) fn decode_decimal4(value: &[u8]) -> Result<(i32, u8), ArrowError> { - let scale = u8::from_le_bytes(array_from_slice(value, 1)?); - let integer = i32::from_le_bytes(array_from_slice(value, 2)?); +pub(crate) fn decode_decimal4(data: &[u8]) -> Result<(i32, u8), ArrowError> { + let scale = u8::from_le_bytes(array_from_slice(data, 0)?); + let integer = i32::from_le_bytes(array_from_slice(data, 1)?); Ok((integer, scale)) } /// Decodes a Decimal8 from the value section of a variant. -pub(crate) fn decode_decimal8(value: &[u8]) -> Result<(i64, u8), ArrowError> { - let scale = u8::from_le_bytes(array_from_slice(value, 1)?); - let integer = i64::from_le_bytes(array_from_slice(value, 2)?); +pub(crate) fn decode_decimal8(data: &[u8]) -> Result<(i64, u8), ArrowError> { + let scale = u8::from_le_bytes(array_from_slice(data, 0)?); + let integer = i64::from_le_bytes(array_from_slice(data, 1)?); Ok((integer, scale)) } /// Decodes a Decimal16 from the value section of a variant. -pub(crate) fn decode_decimal16(value: &[u8]) -> Result<(i128, u8), ArrowError> { - let scale = u8::from_le_bytes(array_from_slice(value, 1)?); - let integer = i128::from_le_bytes(array_from_slice(value, 2)?); +pub(crate) fn decode_decimal16(data: &[u8]) -> Result<(i128, u8), ArrowError> { + let scale = u8::from_le_bytes(array_from_slice(data, 0)?); + let integer = i128::from_le_bytes(array_from_slice(data, 1)?); Ok((integer, scale)) } /// Decodes a Float from the value section of a variant. -pub(crate) fn decode_float(value: &[u8]) -> Result { - let value = f32::from_le_bytes(array_from_slice(value, 1)?); +pub(crate) fn decode_float(data: &[u8]) -> Result { + let value = f32::from_le_bytes(array_from_slice(data, 0)?); Ok(value) } /// Decodes a Date from the value section of a variant. -pub(crate) fn decode_date(value: &[u8]) -> Result { - let days_since_epoch = i32::from_le_bytes(array_from_slice(value, 1)?); +pub(crate) fn decode_date(data: &[u8]) -> Result { + let days_since_epoch = i32::from_le_bytes(array_from_slice(data, 0)?); let value = (DateTime::UNIX_EPOCH + Duration::days(days_since_epoch as i64)).date_naive(); Ok(value) } /// Decodes a TimestampMicros from the value section of a variant. -pub(crate) fn decode_timestamp_micros(value: &[u8]) -> Result, ArrowError> { - let micros_since_epoch = i64::from_le_bytes(array_from_slice(value, 1)?); +pub(crate) fn decode_timestamp_micros(data: &[u8]) -> Result, ArrowError> { + let micros_since_epoch = i64::from_le_bytes(array_from_slice(data, 0)?); if let Some(value) = DateTime::from_timestamp_micros(micros_since_epoch) { Ok(value) } else { @@ -176,8 +176,8 @@ pub(crate) fn decode_timestamp_micros(value: &[u8]) -> Result, Arr } /// Decodes a TimestampNTZMicros from the value section of a variant. -pub(crate) fn decode_timestampntz_micros(value: &[u8]) -> Result { - let micros_since_epoch = i64::from_le_bytes(array_from_slice(value, 1)?); +pub(crate) fn decode_timestampntz_micros(data: &[u8]) -> Result { + let micros_since_epoch = i64::from_le_bytes(array_from_slice(data, 0)?); if let Some(value) = DateTime::from_timestamp_micros(micros_since_epoch) { Ok(value.naive_utc()) } else { @@ -188,24 +188,24 @@ pub(crate) fn decode_timestampntz_micros(value: &[u8]) -> Result Result<&[u8], ArrowError> { - let len = u32::from_le_bytes(array_from_slice(value, 1)?) as usize; - let value = slice_from_slice(value, 5..5 + len)?; +pub(crate) fn decode_binary(data: &[u8]) -> Result<&[u8], ArrowError> { + let len = u32::from_le_bytes(array_from_slice(data, 0)?) as usize; + let value = slice_from_slice(data, 4..4 + len)?; Ok(value) } /// Decodes a long string from the value section of a variant. -pub(crate) fn decode_long_string(value: &[u8]) -> Result<&str, ArrowError> { - let len = u32::from_le_bytes(array_from_slice(value, 1)?) as usize; - let string = string_from_slice(value, 5..5 + len)?; +pub(crate) fn decode_long_string(data: &[u8]) -> Result<&str, ArrowError> { + let len = u32::from_le_bytes(array_from_slice(data, 0)?) as usize; + let string = string_from_slice(data, 4..4 + len)?; Ok(string) } /// Decodes a short string from the value section of a variant. -pub(crate) fn decode_short_string(value: &[u8]) -> Result<&str, ArrowError> { - let len = (first_byte_from_slice(value)? >> 2) as usize; +pub(crate) fn decode_short_string(metadata: u8, data: &[u8]) -> Result<&str, ArrowError> { + let len = (metadata >> 2) as usize; - let string = string_from_slice(value, 1..1 + len)?; + let string = string_from_slice(data, 0..len)?; Ok(string) } @@ -215,162 +215,90 @@ mod tests { #[test] fn test_i8() -> Result<(), ArrowError> { - let value = [ - (VariantPrimitiveType::Int8 as u8) << 2, // Basic type - 0x2a, // Data - ]; - let result = decode_int8(&value)?; + let data = [0x2a]; + let result = decode_int8(&data)?; assert_eq!(result, 42); Ok(()) } #[test] fn test_i16() -> Result<(), ArrowError> { - let value = [ - (VariantPrimitiveType::Int16 as u8) << 2, // Basic type - 0xd2, - 0x04, // Data - ]; - let result = decode_int16(&value)?; + let data = [0xd2, 0x04]; + let result = decode_int16(&data)?; assert_eq!(result, 1234); Ok(()) } #[test] fn test_i32() -> Result<(), ArrowError> { - let value = [ - (VariantPrimitiveType::Int32 as u8) << 2, // Basic type - 0x40, - 0xe2, - 0x01, - 0x00, // Data - ]; - let result = decode_int32(&value)?; + let data = [0x40, 0xe2, 0x01, 0x00]; + let result = decode_int32(&data)?; assert_eq!(result, 123456); Ok(()) } #[test] fn test_i64() -> Result<(), ArrowError> { - let value = [ - (VariantPrimitiveType::Int64 as u8) << 2, // Basic type - 0x15, - 0x81, - 0xe9, - 0x7d, - 0xf4, - 0x10, - 0x22, - 0x11, // Data - ]; - let result = decode_int64(&value)?; + let data = [0x15, 0x81, 0xe9, 0x7d, 0xf4, 0x10, 0x22, 0x11]; + let result = decode_int64(&data)?; assert_eq!(result, 1234567890123456789); Ok(()) } #[test] fn test_decimal4() -> Result<(), ArrowError> { - let value = [ - (VariantPrimitiveType::Decimal4 as u8) << 2, // Basic type - 0x02, // Scale - 0xd2, - 0x04, - 0x00, - 0x00, // Integer + let data = [ + 0x02, // Scale + 0xd2, 0x04, 0x00, 0x00, // Integer ]; - let result = decode_decimal4(&value)?; + let result = decode_decimal4(&data)?; assert_eq!(result, (1234, 2)); Ok(()) } #[test] fn test_decimal8() -> Result<(), ArrowError> { - let value = [ - (VariantPrimitiveType::Decimal8 as u8) << 2, // Basic type - 0x02, // Scale - 0xd2, - 0x02, - 0x96, - 0x49, - 0x00, - 0x00, - 0x00, - 0x00, // Integer + let data = [ + 0x02, // Scale + 0xd2, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, // Integer ]; - let result = decode_decimal8(&value)?; + let result = decode_decimal8(&data)?; assert_eq!(result, (1234567890, 2)); Ok(()) } #[test] fn test_decimal16() -> Result<(), ArrowError> { - let value = [ - (VariantPrimitiveType::Decimal16 as u8) << 2, // Basic type - 0x02, // Scale - 0xd2, - 0xb6, - 0x23, - 0xc0, - 0xf4, - 0x10, - 0x22, - 0x11, - 0x00, - 0x00, - 0x00, - 0x00, - 0x00, - 0x00, - 0x00, - 0x00, // Integer + let data = [ + 0x02, // Scale + 0xd2, 0xb6, 0x23, 0xc0, 0xf4, 0x10, 0x22, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, // Integer ]; - let result = decode_decimal16(&value)?; + let result = decode_decimal16(&data)?; assert_eq!(result, (1234567891234567890, 2)); Ok(()) } #[test] fn test_float() -> Result<(), ArrowError> { - let value = [ - (VariantPrimitiveType::Float as u8) << 2, // Basic type - 0x06, - 0x2c, - 0x93, - 0x4e, // Data - ]; - let result = decode_float(&value)?; + let data = [0x06, 0x2c, 0x93, 0x4e]; + let result = decode_float(&data)?; assert_eq!(result, 1234567890.1234); Ok(()) } #[test] fn test_date() -> Result<(), ArrowError> { - let value = [ - (VariantPrimitiveType::Date as u8) << 2, // Basic type - 0xe2, - 0x4e, - 0x0, - 0x0, // Data - ]; - let result = decode_date(&value)?; + let data = [0xe2, 0x4e, 0x0, 0x0]; + let result = decode_date(&data)?; assert_eq!(result, NaiveDate::from_ymd_opt(2025, 4, 16).unwrap()); Ok(()) } #[test] fn test_timestamp_micros() -> Result<(), ArrowError> { - let value = [ - (VariantPrimitiveType::TimestampMicros as u8) << 2, // Basic type - 0xe0, - 0x52, - 0x97, - 0xdd, - 0xe7, - 0x32, - 0x06, - 0x00, // Data - ]; - let result = decode_timestamp_micros(&value)?; + let data = [0xe0, 0x52, 0x97, 0xdd, 0xe7, 0x32, 0x06, 0x00]; + let result = decode_timestamp_micros(&data)?; assert_eq!( result, NaiveDate::from_ymd_opt(2025, 4, 16) @@ -384,18 +312,8 @@ mod tests { #[test] fn test_timestampntz_micros() -> Result<(), ArrowError> { - let value = [ - (VariantPrimitiveType::TimestampNTZMicros as u8) << 2, // Basic type - 0xe0, - 0x52, - 0x97, - 0xdd, - 0xe7, - 0x32, - 0x06, - 0x00, // Data - ]; - let result = decode_timestampntz_micros(&value)?; + let data = [0xe0, 0x52, 0x97, 0xdd, 0xe7, 0x32, 0x06, 0x00]; + let result = decode_timestampntz_micros(&data)?; assert_eq!( result, NaiveDate::from_ymd_opt(2025, 4, 16) @@ -408,23 +326,11 @@ mod tests { #[test] fn test_binary() -> Result<(), ArrowError> { - let value = [ - (VariantPrimitiveType::Binary as u8) << 2, // Basic type - 9, - 0, - 0, - 0, // Length of binary data, 4-byte little-endian - 0x03, - 0x13, - 0x37, - 0xde, - 0xad, - 0xbe, - 0xef, - 0xca, - 0xfe, // Data + let data = [ + 9, 0, 0, 0, // Length of binary data, 4-byte little-endian + 0x03, 0x13, 0x37, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe, ]; - let result = decode_binary(&value)?; + let result = decode_binary(&data)?; assert_eq!( result, [0x03, 0x13, 0x37, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe] @@ -434,36 +340,19 @@ mod tests { #[test] fn test_short_string() -> Result<(), ArrowError> { - let value = [ - 1 | 5 << 2, // Basic type for short string | length of short string - b'H', - b'e', - b'l', - b'l', - b'o', - b'o', - ]; - let result = decode_short_string(&value)?; + let data = [b'H', b'e', b'l', b'l', b'o', b'o']; + let result = decode_short_string(1 | 5 << 2, &data)?; assert_eq!(result, "Hello"); Ok(()) } #[test] fn test_string() -> Result<(), ArrowError> { - let value = [ - 16 << 2, // Basic type for short string | length of short string - 5, - 0, - 0, - 0, // Length of string - b'H', - b'e', - b'l', - b'l', - b'o', - b'o', + let data = [ + 5, 0, 0, 0, // Length of string, 4-byte little-endian + b'H', b'e', b'l', b'l', b'o', b'o', ]; - let result = decode_long_string(&value)?; + let result = decode_long_string(&data)?; assert_eq!(result, "Hello"); Ok(()) } diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 941fff1cb453..1ba341b27fc9 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -304,7 +304,8 @@ impl<'m> VariantMetadata<'m> { #[derive(Clone, Copy, Debug, PartialEq)] pub struct VariantObject<'m, 'v> { pub metadata: &'m VariantMetadata<'m>, - pub value: &'v [u8], + pub value_metadata: u8, + pub value_data: &'v [u8], } impl<'m, 'v> VariantObject<'m, 'v> { pub fn fields(&self) -> Result)>, ArrowError> { @@ -320,7 +321,8 @@ impl<'m, 'v> VariantObject<'m, 'v> { #[derive(Clone, Copy, Debug, PartialEq)] pub struct VariantArray<'m, 'v> { pub metadata: &'m VariantMetadata<'m>, - pub value: &'v [u8], + pub value_metadata: u8, + pub value_data: &'v [u8], } impl<'m, 'v> VariantArray<'m, 'v> { @@ -343,7 +345,7 @@ impl<'m, 'v> VariantArray<'m, 'v> { pub fn get(&self, index: usize) -> Result, ArrowError> { // The 6 first bits to the left are the value_header and the 2 bits // to the right are the basic type, so we shift to get only the value_header - let value_header = first_byte_from_slice(self.value)? >> 2; + let value_header = self.value_metadata >> 2; let is_large = (value_header & 0x04) != 0; // 3rd bit from the right let field_offset_size_minus_one = value_header & 0x03; // Last two bits let offset_size = OffsetSizeBytes::try_new(field_offset_size_minus_one)?; @@ -353,11 +355,11 @@ impl<'m, 'v> VariantArray<'m, 'v> { true => OffsetSizeBytes::Four, false => OffsetSizeBytes::One, }; - // Skip the header byte to read the num_elements + // Read the num_elements // The size of the num_elements entry in the array value_data is 4 bytes if // is_large is true, otherwise 1 byte. - let num_elements = num_elements_size.unpack_usize(self.value, 1, 0)?; - let first_offset_byte = 1 + num_elements_size as usize; + let num_elements = num_elements_size.unpack_usize(self.value_data, 0, 0)?; + let first_offset_byte = num_elements_size as usize; let overflow = || ArrowError::InvalidArgumentError("Variant value_byte_length overflow".into()); @@ -375,15 +377,15 @@ impl<'m, 'v> VariantArray<'m, 'v> { .checked_add(value_bytes) .ok_or_else(overflow)?; - // Skip header and num_elements bytes to read the offsets + // Skip num_elements bytes to read the offsets let start_field_offset_from_first_value_byte = - offset_size.unpack_usize(self.value, first_offset_byte, index)?; + offset_size.unpack_usize(self.value_data, first_offset_byte, index)?; let end_field_offset_from_first_value_byte = - offset_size.unpack_usize(self.value, first_offset_byte, index + 1)?; + offset_size.unpack_usize(self.value_data, first_offset_byte, index + 1)?; // Read the value bytes from the offsets let variant_value_bytes = slice_from_slice( - self.value, + self.value_data, first_value_byte + start_field_offset_from_first_value_byte ..first_value_byte + end_field_offset_from_first_value_byte, )?; @@ -429,47 +431,58 @@ pub enum Variant<'m, 'v> { impl<'m, 'v> Variant<'m, 'v> { /// Parse the buffers and return the appropriate variant. pub fn try_new(metadata: &'m VariantMetadata, value: &'v [u8]) -> Result { - let header = *first_byte_from_slice(value)?; - let new_self = match get_basic_type(header)? { - VariantBasicType::Primitive => match get_primitive_type(header)? { + let value_metadata = *first_byte_from_slice(value)?; + let value_data = slice_from_slice(value, 1..)?; + let new_self = match get_basic_type(value_metadata)? { + VariantBasicType::Primitive => match get_primitive_type(value_metadata)? { VariantPrimitiveType::Null => Variant::Null, - VariantPrimitiveType::Int8 => Variant::Int8(decoder::decode_int8(value)?), - VariantPrimitiveType::Int16 => Variant::Int16(decoder::decode_int16(value)?), - VariantPrimitiveType::Int32 => Variant::Int32(decoder::decode_int32(value)?), - VariantPrimitiveType::Int64 => Variant::Int64(decoder::decode_int64(value)?), + VariantPrimitiveType::Int8 => Variant::Int8(decoder::decode_int8(value_data)?), + VariantPrimitiveType::Int16 => Variant::Int16(decoder::decode_int16(value_data)?), + VariantPrimitiveType::Int32 => Variant::Int32(decoder::decode_int32(value_data)?), + VariantPrimitiveType::Int64 => Variant::Int64(decoder::decode_int64(value_data)?), VariantPrimitiveType::Decimal4 => { - let (integer, scale) = decoder::decode_decimal4(value)?; + let (integer, scale) = decoder::decode_decimal4(value_data)?; Variant::Decimal4 { integer, scale } } VariantPrimitiveType::Decimal8 => { - let (integer, scale) = decoder::decode_decimal8(value)?; + let (integer, scale) = decoder::decode_decimal8(value_data)?; Variant::Decimal8 { integer, scale } } VariantPrimitiveType::Decimal16 => { - let (integer, scale) = decoder::decode_decimal16(value)?; + let (integer, scale) = decoder::decode_decimal16(value_data)?; Variant::Decimal16 { integer, scale } } - VariantPrimitiveType::Float => Variant::Float(decoder::decode_float(value)?), + VariantPrimitiveType::Float => Variant::Float(decoder::decode_float(value_data)?), VariantPrimitiveType::BooleanTrue => Variant::BooleanTrue, VariantPrimitiveType::BooleanFalse => Variant::BooleanFalse, // TODO: Add types for the rest, once API is agreed upon - VariantPrimitiveType::Date => Variant::Date(decoder::decode_date(value)?), + VariantPrimitiveType::Date => Variant::Date(decoder::decode_date(value_data)?), VariantPrimitiveType::TimestampMicros => { - Variant::TimestampMicros(decoder::decode_timestamp_micros(value)?) + Variant::TimestampMicros(decoder::decode_timestamp_micros(value_data)?) } VariantPrimitiveType::TimestampNTZMicros => { - Variant::TimestampNTZMicros(decoder::decode_timestampntz_micros(value)?) + Variant::TimestampNTZMicros(decoder::decode_timestampntz_micros(value_data)?) + } + VariantPrimitiveType::Binary => { + Variant::Binary(decoder::decode_binary(value_data)?) } - VariantPrimitiveType::Binary => Variant::Binary(decoder::decode_binary(value)?), VariantPrimitiveType::String => { - Variant::String(decoder::decode_long_string(value)?) + Variant::String(decoder::decode_long_string(value_data)?) } }, VariantBasicType::ShortString => { - Variant::ShortString(decoder::decode_short_string(value)?) + Variant::ShortString(decoder::decode_short_string(value_metadata, value_data)?) } - VariantBasicType::Object => Variant::Object(VariantObject { metadata, value }), - VariantBasicType::Array => Variant::Array(VariantArray { metadata, value }), + VariantBasicType::Object => Variant::Object(VariantObject { + metadata, + value_metadata, + value_data, + }), + VariantBasicType::Array => Variant::Array(VariantArray { + metadata, + value_metadata, + value_data, + }), }; Ok(new_self) } From ffa65fc40b1a9975ff8195244444dafa581f82fb Mon Sep 17 00:00:00 2001 From: SuperSerious Dev Date: Wed, 11 Jun 2025 21:48:42 -0700 Subject: [PATCH 09/19] Add `as_int*` doc tests --- parquet-variant/src/variant.rs | 86 ++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 1ba341b27fc9..28420ce9dc14 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -537,6 +537,28 @@ impl<'m, 'v> Variant<'m, 'v> { } } + /// Converts this variant to an `i8` if possible. + /// + /// Returns `Some(i8)` for integer variants that fit in `i8` range, + /// `None` for non-integer variants or values that would overflow. + /// + /// # Examples + /// + /// ``` + /// use parquet_variant::Variant; + /// + /// // you can read an int64 variant into an i8 if it fits + /// let v1 = Variant::from(123i64); + /// assert_eq!(v1.as_int8(), Some(123i8)); + /// + /// // but not if it would overflow + /// let v2 = Variant::from(1234i64); + /// assert_eq!(v2.as_int8(), None); + /// + /// // or if the variant cannot be cast into an integer + /// let v3 = Variant::from("hello!"); + /// assert_eq!(v3.as_int8(), None); + /// ``` pub fn as_int8(&self) -> Option { match *self { Variant::Int8(i) => Some(i), @@ -547,6 +569,28 @@ impl<'m, 'v> Variant<'m, 'v> { } } + /// Converts this variant to an `i16` if possible. + /// + /// Returns `Some(i16)` for integer variants that fit in `i16` range, + /// `None` for non-integer variants or values that would overflow. + /// + /// # Examples + /// + /// ``` + /// use parquet_variant::Variant; + /// + /// // you can read an int64 variant into an i16 if it fits + /// let v1 = Variant::from(123i64); + /// assert_eq!(v1.as_int16(), Some(123i16)); + /// + /// // but not if it would overflow + /// let v2 = Variant::from(123456i64); + /// assert_eq!(v2.as_int16(), None); + /// + /// // or if the variant cannot be cast into an integer + /// let v3 = Variant::from("hello!"); + /// assert_eq!(v3.as_int16(), None); + /// ``` pub fn as_int16(&self) -> Option { match *self { Variant::Int8(i) => Some(i.into()), @@ -556,6 +600,29 @@ impl<'m, 'v> Variant<'m, 'v> { _ => None, } } + + /// Converts this variant to an `i32` if possible. + /// + /// Returns `Some(i32)` for integer variants that fit in `i32` range, + /// `None` for non-integer variants or values that would overflow. + /// + /// # Examples + /// + /// ``` + /// use parquet_variant::Variant; + /// + /// // you can read an int64 variant into an i32 if it fits + /// let v1 = Variant::from(123i64); + /// assert_eq!(v1.as_int32(), Some(123i32)); + /// + /// // but not if it would overflow + /// let v2 = Variant::from(12345678901i64); + /// assert_eq!(v2.as_int32(), None); + /// + /// // or if the variant cannot be cast into an integer + /// let v3 = Variant::from("hello!"); + /// assert_eq!(v3.as_int32(), None); + /// ``` pub fn as_int32(&self) -> Option { match *self { Variant::Int8(i) => Some(i.into()), @@ -565,6 +632,25 @@ impl<'m, 'v> Variant<'m, 'v> { _ => None, } } + + /// Converts this variant to an `i64` if possible. + /// + /// Returns `Some(i64)` for integer variants that fit in `i64` range, + /// `None` for non-integer variants or values that would overflow. + /// + /// # Examples + /// + /// ``` + /// use parquet_variant::Variant; + /// + /// // you can read an int64 variant into an i64 + /// let v1 = Variant::from(123i64); + /// assert_eq!(v1.as_int64(), Some(123i64)); + /// + /// // but not a variant that cannot be cast into an integer + /// let v2 = Variant::from("hello!"); + /// assert_eq!(v2.as_int64(), None); + /// ``` pub fn as_int64(&self) -> Option { match *self { Variant::Int8(i) => Some(i.into()), From f6a6a7be8c94d4cbc8f7e9f6ea22dfd3c9ba174e Mon Sep 17 00:00:00 2001 From: SuperSerious Dev Date: Thu, 12 Jun 2025 07:05:29 -0700 Subject: [PATCH 10/19] Add `as_naive_date` doc tests --- parquet-variant/src/variant.rs | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 28420ce9dc14..792c58fffb74 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -499,9 +499,29 @@ impl<'m, 'v> Variant<'m, 'v> { } } - pub fn as_naive_date(self) -> Option { + /// Converts this variant to a `NaiveDate` if possible. + /// + /// Returns `Some(NaiveDate)` for date variants, + /// `None` for non-date variants. + /// + /// # Examples + /// + /// ``` + /// use parquet_variant::Variant; + /// use chrono::NaiveDate; + /// + /// // you can extract a NaiveDate from a date variant + /// let date = NaiveDate::from_ymd_opt(2025, 4, 12).unwrap(); + /// let v1 = Variant::from(date); + /// assert_eq!(v1.as_naive_date(), Some(date)); + /// + /// // but not from other variants + /// let v2 = Variant::from("hello!"); + /// assert_eq!(v2.as_naive_date(), None); + /// ``` + pub fn as_naive_date(&self) -> Option { if let Variant::Date(d) = self { - Some(d) + Some(*d) } else { None } @@ -514,6 +534,7 @@ impl<'m, 'v> Variant<'m, 'v> { None } } + pub fn as_naive_datetime(self) -> Option { if let Variant::TimestampNTZMicros(d) = self { Some(d) From 2f27c380353db016341ab521f891cb937b81d0b0 Mon Sep 17 00:00:00 2001 From: SuperSerious Dev Date: Thu, 12 Jun 2025 07:54:26 -0700 Subject: [PATCH 11/19] Add `as_naive_datetime` and `as_datetime_utc` doc tests --- parquet-variant/src/variant.rs | 70 +++++++++++++++++++++++++++++----- 1 file changed, 60 insertions(+), 10 deletions(-) diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 792c58fffb74..416fe31921be 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -527,19 +527,69 @@ impl<'m, 'v> Variant<'m, 'v> { } } - pub fn as_datetime_utc(self) -> Option> { - if let Variant::TimestampMicros(d) = self { - Some(d) - } else { - None + /// Converts this variant to a `DateTime` if possible. + /// + /// Returns `Some(DateTime)` for timestamp variants, + /// `None` for non-timestamp variants. + /// + /// # Examples + /// + /// ``` + /// use parquet_variant::Variant; + /// use chrono::NaiveDate; + /// + /// // you can extract a DateTime from a UTC-adjusted variant + /// let datetime = NaiveDate::from_ymd_opt(2025, 4, 16).unwrap().and_hms_milli_opt(12, 34, 56, 780).unwrap().and_utc(); + /// let v1 = Variant::from(datetime); + /// assert_eq!(v1.as_datetime_utc(), Some(datetime)); + /// + /// // or a non-UTC-adjusted variant + /// let datetime = NaiveDate::from_ymd_opt(2025, 4, 16).unwrap().and_hms_milli_opt(12, 34, 56, 780).unwrap(); + /// let v2 = Variant::from(datetime); + /// assert_eq!(v2.as_datetime_utc(), Some(datetime.and_utc())); + /// + /// // but not from other variants + /// let v3 = Variant::from("hello!"); + /// assert_eq!(v3.as_datetime_utc(), None); + /// ``` + pub fn as_datetime_utc(&self) -> Option> { + match *self { + Variant::TimestampMicros(d) => Some(d), + Variant::TimestampNTZMicros(d) => Some(d.and_utc()), + _ => None, } } - pub fn as_naive_datetime(self) -> Option { - if let Variant::TimestampNTZMicros(d) = self { - Some(d) - } else { - None + /// Converts this variant to a `NaiveDateTime` if possible. + /// + /// Returns `Some(NaiveDateTime)` for timestamp variants, + /// `None` for non-timestamp variants. + /// + /// # Examples + /// + /// ``` + /// use parquet_variant::Variant; + /// use chrono::NaiveDate; + /// + /// // you can extract a NaiveDateTime from a non-UTC-adjusted variant + /// let datetime = NaiveDate::from_ymd_opt(2025, 4, 16).unwrap().and_hms_milli_opt(12, 34, 56, 780).unwrap(); + /// let v1 = Variant::from(datetime); + /// assert_eq!(v1.as_naive_datetime(), Some(datetime)); + /// + /// // or a UTC-adjusted variant + /// let datetime = NaiveDate::from_ymd_opt(2025, 4, 16).unwrap().and_hms_milli_opt(12, 34, 56, 780).unwrap().and_utc(); + /// let v2 = Variant::from(datetime); + /// assert_eq!(v2.as_naive_datetime(), Some(datetime.naive_utc())); + /// + /// // but not from other variants + /// let v3 = Variant::from("hello!"); + /// assert_eq!(v3.as_naive_datetime(), None); + /// ``` + pub fn as_naive_datetime(&self) -> Option { + match *self { + Variant::TimestampNTZMicros(d) => Some(d), + Variant::TimestampMicros(d) => Some(d.naive_utc()), + _ => None, } } From 9a89cc757527d856783ee4340377047121bc88de Mon Sep 17 00:00:00 2001 From: SuperSerious Dev Date: Thu, 12 Jun 2025 08:08:40 -0700 Subject: [PATCH 12/19] Add `as_boolean` doc tests --- parquet-variant/src/variant.rs | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 416fe31921be..0179ea367b6d 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -491,6 +491,28 @@ impl<'m, 'v> Variant<'m, 'v> { matches!(self, Variant::Null).then_some(()) } + /// Converts this variant to a `bool` if possible. + /// + /// Returns `Some(bool)` for boolean variants, + /// `None` for non-boolean variants. + /// + /// # Examples + /// + /// ``` + /// use parquet_variant::Variant; + /// + /// // you can extract a bool from the true variant + /// let v1 = Variant::from(true); + /// assert_eq!(v1.as_boolean(), Some(true)); + /// + /// // and the false variant + /// let v2 = Variant::from(false); + /// assert_eq!(v2.as_boolean(), Some(false)); + /// + /// // but not from other variants + /// let v3 = Variant::from("hello!"); + /// assert_eq!(v3.as_boolean(), None); + /// ``` pub fn as_boolean(&self) -> Option { match self { Variant::BooleanTrue => Some(true), From 9fb65ac990853ddb93c8db49b0abb65e9712cf1a Mon Sep 17 00:00:00 2001 From: SuperSerious Dev Date: Thu, 12 Jun 2025 08:20:59 -0700 Subject: [PATCH 13/19] Implement `From<()> for Variant` and add `as_null` doc tests --- parquet-variant/src/variant.rs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 0179ea367b6d..e98045a1a619 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -487,6 +487,24 @@ impl<'m, 'v> Variant<'m, 'v> { Ok(new_self) } + /// Converts this variant to `()` if it is null. + /// + /// Returns `Some(())` for null variants, + /// `None` for non-null variants. + /// + /// # Examples + /// + /// ``` + /// use parquet_variant::Variant; + /// + /// // you can extract `()` from a null variant + /// let v1 = Variant::from(()); + /// assert_eq!(v1.as_null(), Some(())); + /// + /// // but not from other variants + /// let v2 = Variant::from("hello!"); + /// assert_eq!(v2.as_null(), None); + /// ``` pub fn as_null(&self) -> Option<()> { matches!(self, Variant::Null).then_some(()) } @@ -818,6 +836,12 @@ impl<'m, 'v> Variant<'m, 'v> { } } +impl<'m, 'v> From<()> for Variant<'m, 'v> { + fn from(_: ()) -> Self { + Variant::Null + } +} + impl<'m, 'v> From for Variant<'m, 'v> { fn from(value: i8) -> Self { Variant::Int8(value) From 12921400c8181f3032ccad223e7bf55faafab2d6 Mon Sep 17 00:00:00 2001 From: SuperSerious Dev Date: Thu, 12 Jun 2025 08:36:29 -0700 Subject: [PATCH 14/19] Add `as_u8_slice` doc tests --- parquet-variant/src/variant.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index e98045a1a619..5340c9f11c94 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -633,6 +633,25 @@ impl<'m, 'v> Variant<'m, 'v> { } } + /// Converts this variant to a `&[u8]` if possible. + /// + /// Returns `Some(&[u8])` for binary variants, + /// `None` for non-binary variants. + /// + /// # Examples + /// + /// ``` + /// use parquet_variant::Variant; + /// + /// // you can extract a byte slice from a binary variant + /// let data = b"hello!"; + /// let v1 = Variant::Binary(data); + /// assert_eq!(v1.as_u8_slice(), Some(data.as_slice())); + /// + /// // but not from other variant types + /// let v2 = Variant::from(123i64); + /// assert_eq!(v2.as_u8_slice(), None); + /// ``` pub fn as_u8_slice(&'v self) -> Option<&'v [u8]> { if let Variant::Binary(d) = self { Some(d) From 81ee289b5788f70838bb9a189dae2a8436709262 Mon Sep 17 00:00:00 2001 From: SuperSerious Dev Date: Thu, 12 Jun 2025 08:47:13 -0700 Subject: [PATCH 15/19] Add `as_string` doc tests --- parquet-variant/src/variant.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 5340c9f11c94..16a8d8620783 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -660,6 +660,25 @@ impl<'m, 'v> Variant<'m, 'v> { } } + /// Converts this variant to a `&str` if possible. + /// + /// Returns `Some(&str)` for string variants (both regular and short strings), + /// `None` for non-string variants. + /// + /// # Examples + /// + /// ``` + /// use parquet_variant::Variant; + /// + /// // you can extract a string from string variants + /// let s = "hello!"; + /// let v1 = Variant::ShortString(s); + /// assert_eq!(v1.as_string(), Some(s)); + /// + /// // but not from other variants + /// let v2 = Variant::from(123i64); + /// assert_eq!(v2.as_string(), None); + /// ``` pub fn as_string(&'v self) -> Option<&'v str> { match self { Variant::String(s) | Variant::ShortString(s) => Some(s), From 8e0985c9f67dac75b3dcca07596ea39f84884d3a Mon Sep 17 00:00:00 2001 From: SuperSerious Dev Date: Thu, 12 Jun 2025 09:47:39 -0700 Subject: [PATCH 16/19] Add `as_decimal_*` doc tests --- parquet-variant/src/variant.rs | 73 ++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 16a8d8620783..3dd2a865da62 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -810,6 +810,33 @@ impl<'m, 'v> Variant<'m, 'v> { } } + /// Converts this variant to tuple with a 4-byte unscaled value if possible. + /// + /// Returns `Some((i32, u8))` for decimal variants where the unscaled value + /// fits in `i32` range, + /// `None` for non-decimal variants or decimal values that would overflow. + /// + /// # Examples + /// + /// ``` + /// use parquet_variant::Variant; + /// + /// // you can extract decimal parts from smaller or equally-sized decimal variants + /// let v1 = Variant::from((1234_i32, 2)); + /// assert_eq!(v1.as_decimal_int32(), Some((1234_i32, 2))); + /// + /// // and from larger decimal variants if they fit + /// let v2 = Variant::from((1234_i64, 2)); + /// assert_eq!(v2.as_decimal_int32(), Some((1234_i32, 2))); + /// + /// // but not if the value would overflow i32 + /// let v3 = Variant::from((12345678901i64, 2)); + /// assert_eq!(v3.as_decimal_int32(), None); + /// + /// // or if the variant is not a decimal + /// let v4 = Variant::from("hello!"); + /// assert_eq!(v4.as_decimal_int32(), None); + /// ``` pub fn as_decimal_int32(&self) -> Option<(i32, u8)> { match *self { Variant::Decimal4 { integer, scale } => Some((integer, scale)), @@ -831,6 +858,33 @@ impl<'m, 'v> Variant<'m, 'v> { } } + /// Converts this variant to tuple with an 8-byte unscaled value if possible. + /// + /// Returns `Some((i64, u8))` for decimal variants where the unscaled value + /// fits in `i64` range, + /// `None` for non-decimal variants or decimal values that would overflow. + /// + /// # Examples + /// + /// ``` + /// use parquet_variant::Variant; + /// + /// // you can extract decimal parts from smaller or equally-sized decimal variants + /// let v1 = Variant::from((1234_i64, 2)); + /// assert_eq!(v1.as_decimal_int64(), Some((1234_i64, 2))); + /// + /// // and from larger decimal variants if they fit + /// let v2 = Variant::from((1234_i128, 2)); + /// assert_eq!(v2.as_decimal_int64(), Some((1234_i64, 2))); + /// + /// // but not if the value would overflow i64 + /// let v3 = Variant::from((2e19 as i128, 2)); + /// assert_eq!(v3.as_decimal_int64(), None); + /// + /// // or if the variant is not a decimal + /// let v4 = Variant::from("hello!"); + /// assert_eq!(v4.as_decimal_int64(), None); + /// ``` pub fn as_decimal_int64(&self) -> Option<(i64, u8)> { match *self { Variant::Decimal4 { integer, scale } => Some((integer.into(), scale)), @@ -846,6 +900,25 @@ impl<'m, 'v> Variant<'m, 'v> { } } + /// Converts this variant to tuple with a 16-byte unscaled value if possible. + /// + /// Returns `Some((i128, u8))` for decimal variants where the unscaled value + /// fits in `i128` range, + /// `None` for non-decimal variants or decimal values that would overflow. + /// + /// # Examples + /// + /// ``` + /// use parquet_variant::Variant; + /// + /// // you can extract decimal parts from smaller or equally-sized decimal variants + /// let v1 = Variant::from((1234_i128, 2)); + /// assert_eq!(v1.as_decimal_int128(), Some((1234_i128, 2))); + /// + /// // but not if the variant is not a decimal + /// let v2 = Variant::from("hello!"); + /// assert_eq!(v2.as_decimal_int128(), None); + /// ``` pub fn as_decimal_int128(&self) -> Option<(i128, u8)> { match *self { Variant::Decimal4 { integer, scale } => Some((integer.into(), scale)), From b62d776a29a0fc76bd5c98059b78d3351138c96e Mon Sep 17 00:00:00 2001 From: SuperSerious Dev Date: Thu, 12 Jun 2025 10:08:18 -0700 Subject: [PATCH 17/19] Implement Double variant --- parquet-variant/src/decoder.rs | 16 ++++++++++++++++ parquet-variant/src/variant.rs | 22 +++++++++++++++++++--- parquet-variant/tests/variant_interop.rs | 1 + 3 files changed, 36 insertions(+), 3 deletions(-) diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs index f77115476b0f..60886f98d31c 100644 --- a/parquet-variant/src/decoder.rs +++ b/parquet-variant/src/decoder.rs @@ -37,6 +37,7 @@ pub enum VariantPrimitiveType { Int16 = 4, Int32 = 5, Int64 = 6, + Double = 7, Decimal4 = 8, Decimal8 = 9, Decimal16 = 10, @@ -78,6 +79,7 @@ impl TryFrom for VariantPrimitiveType { 4 => Ok(VariantPrimitiveType::Int16), 5 => Ok(VariantPrimitiveType::Int32), 6 => Ok(VariantPrimitiveType::Int64), + 7 => Ok(VariantPrimitiveType::Double), 8 => Ok(VariantPrimitiveType::Decimal4), 9 => Ok(VariantPrimitiveType::Decimal8), 10 => Ok(VariantPrimitiveType::Decimal16), @@ -156,6 +158,12 @@ pub(crate) fn decode_float(data: &[u8]) -> Result { Ok(value) } +/// Decodes a Double from the value section of a variant. +pub(crate) fn decode_double(data: &[u8]) -> Result { + let value = f64::from_le_bytes(array_from_slice(data, 0)?); + Ok(value) +} + /// Decodes a Date from the value section of a variant. pub(crate) fn decode_date(data: &[u8]) -> Result { let days_since_epoch = i32::from_le_bytes(array_from_slice(data, 0)?); @@ -287,6 +295,14 @@ mod tests { Ok(()) } + #[test] + fn test_double() -> Result<(), ArrowError> { + let data = [0xc9, 0xe5, 0x87, 0xb4, 0x80, 0x65, 0xd2, 0x41]; + let result = decode_double(&data)?; + assert_eq!(result, 1234567890.1234); + Ok(()) + } + #[test] fn test_date() -> Result<(), ArrowError> { let data = [0xe2, 0x4e, 0x0, 0x0]; diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 3dd2a865da62..6d77f207f011 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -415,6 +415,7 @@ pub enum Variant<'m, 'v> { Decimal8 { integer: i64, scale: u8 }, Decimal16 { integer: i128, scale: u8 }, Float(f32), + Double(f64), BooleanTrue, BooleanFalse, @@ -453,6 +454,9 @@ impl<'m, 'v> Variant<'m, 'v> { Variant::Decimal16 { integer, scale } } VariantPrimitiveType::Float => Variant::Float(decoder::decode_float(value_data)?), + VariantPrimitiveType::Double => { + Variant::Double(decoder::decode_double(value_data)?) + } VariantPrimitiveType::BooleanTrue => Variant::BooleanTrue, VariantPrimitiveType::BooleanFalse => Variant::BooleanFalse, // TODO: Add types for the rest, once API is agreed upon @@ -931,9 +935,15 @@ impl<'m, 'v> Variant<'m, 'v> { pub fn as_f32(&self) -> Option { match *self { Variant::Float(i) => Some(i), - // TODO Add Variant::Double - // TODO Add int variants? - // TODO Add decimal variants? + Variant::Double(i) => Some(i as f32), + _ => None, + } + } + + pub fn as_f64(&self) -> Option { + match *self { + Variant::Float(i) => Some(i.into()), + Variant::Double(i) => Some(i), _ => None, } } @@ -1010,6 +1020,12 @@ impl<'m, 'v> From for Variant<'m, 'v> { } } +impl<'m, 'v> From for Variant<'m, 'v> { + fn from(value: f64) -> Self { + Variant::Double(value) + } +} + impl<'m, 'v> From for Variant<'m, 'v> { fn from(value: bool) -> Self { match value { diff --git a/parquet-variant/tests/variant_interop.rs b/parquet-variant/tests/variant_interop.rs index cec23cc8bd53..677bba240b4d 100644 --- a/parquet-variant/tests/variant_interop.rs +++ b/parquet-variant/tests/variant_interop.rs @@ -55,6 +55,7 @@ fn get_primitive_cases() -> Vec<(&'static str, Variant<'static, 'static>)> { ("primitive_decimal8", Variant::Decimal8{integer: 1234567890, scale: 2}), ("primitive_decimal16", Variant::Decimal16{integer: 1234567891234567890, scale: 2}), ("primitive_float", Variant::Float(1234567890.1234)), + ("primitive_double", Variant::Double(1234567890.1234)), ("primitive_int8", Variant::Int8(42)), ("primitive_int16", Variant::Int16(1234)), ("primitive_int32", Variant::Int32(123456)), From 932d9258393c9cfb711c866f5937cc8f28058762 Mon Sep 17 00:00:00 2001 From: SuperSerious Dev Date: Thu, 12 Jun 2025 10:33:30 -0700 Subject: [PATCH 18/19] Add `as_f32` and `as_f64` doc tests --- parquet-variant/src/variant.rs | 45 +++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 6d77f207f011..0dd9db5cdacf 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -931,7 +931,28 @@ impl<'m, 'v> Variant<'m, 'v> { _ => None, } } - + /// Converts this variant to an `f32` if possible. + /// + /// Returns `Some(f32)` for float and double variants, + /// `None` for non-floating-point variants. + /// + /// # Examples + /// + /// ``` + /// use parquet_variant::Variant; + /// + /// // you can extract an f32 from a float variant + /// let v1 = Variant::from(std::f32::consts::PI); + /// assert_eq!(v1.as_f32(), Some(std::f32::consts::PI)); + /// + /// // and from a double variant (with loss of precision to nearest f32) + /// let v2 = Variant::from(std::f64::consts::PI); + /// assert_eq!(v2.as_f32(), Some(std::f32::consts::PI)); + /// + /// // but not from other variants + /// let v3 = Variant::from("hello!"); + /// assert_eq!(v3.as_f32(), None); + /// ``` pub fn as_f32(&self) -> Option { match *self { Variant::Float(i) => Some(i), @@ -940,6 +961,28 @@ impl<'m, 'v> Variant<'m, 'v> { } } + /// Converts this variant to an `f64` if possible. + /// + /// Returns `Some(f64)` for float and double variants, + /// `None` for non-floating-point variants. + /// + /// # Examples + /// + /// ``` + /// use parquet_variant::Variant; + /// + /// // you can extract an f64 from a float variant + /// let v1 = Variant::from(std::f32::consts::PI); + /// assert_eq!(v1.as_f64(), Some(std::f32::consts::PI as f64)); + /// + /// // and from a double variant + /// let v2 = Variant::from(std::f64::consts::PI); + /// assert_eq!(v2.as_f64(), Some(std::f64::consts::PI)); + /// + /// // but not from other variants + /// let v3 = Variant::from("hello!"); + /// assert_eq!(v3.as_f64(), None); + /// ``` pub fn as_f64(&self) -> Option { match *self { Variant::Float(i) => Some(i.into()), From 346c264ebdf6a1a9c41cc56bfa967227d59c0869 Mon Sep 17 00:00:00 2001 From: SuperSerious Dev Date: Thu, 12 Jun 2025 11:30:58 -0700 Subject: [PATCH 19/19] cleanup --- parquet-variant/src/decoder.rs | 57 ++++++++++-------------- parquet-variant/src/variant.rs | 54 +++++++++++----------- parquet-variant/tests/variant_interop.rs | 2 +- 3 files changed, 53 insertions(+), 60 deletions(-) diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs index 60886f98d31c..2b2e3bc0ca3c 100644 --- a/parquet-variant/src/decoder.rs +++ b/parquet-variant/src/decoder.rs @@ -43,7 +43,7 @@ pub enum VariantPrimitiveType { Decimal16 = 10, Date = 11, TimestampMicros = 12, - TimestampNTZMicros = 13, + TimestampNtzMicros = 13, Float = 14, Binary = 15, String = 16, @@ -85,7 +85,7 @@ impl TryFrom for VariantPrimitiveType { 10 => Ok(VariantPrimitiveType::Decimal16), 11 => Ok(VariantPrimitiveType::Date), 12 => Ok(VariantPrimitiveType::TimestampMicros), - 13 => Ok(VariantPrimitiveType::TimestampNTZMicros), + 13 => Ok(VariantPrimitiveType::TimestampNtzMicros), 14 => Ok(VariantPrimitiveType::Float), 15 => Ok(VariantPrimitiveType::Binary), 16 => Ok(VariantPrimitiveType::String), @@ -109,26 +109,22 @@ fn map_try_from_slice_error(e: TryFromSliceError) -> ArrowError { /// Decodes an Int8 from the value section of a variant. pub(crate) fn decode_int8(data: &[u8]) -> Result { - let value = i8::from_le_bytes(array_from_slice(data, 0)?); - Ok(value) + Ok(i8::from_le_bytes(array_from_slice(data, 0)?)) } /// Decodes an Int16 from the value section of a variant. pub(crate) fn decode_int16(data: &[u8]) -> Result { - let value = i16::from_le_bytes(array_from_slice(data, 0)?); - Ok(value) + Ok(i16::from_le_bytes(array_from_slice(data, 0)?)) } /// Decodes an Int32 from the value section of a variant. pub(crate) fn decode_int32(data: &[u8]) -> Result { - let value = i32::from_le_bytes(array_from_slice(data, 0)?); - Ok(value) + Ok(i32::from_le_bytes(array_from_slice(data, 0)?)) } /// Decodes an Int64 from the value section of a variant. pub(crate) fn decode_int64(data: &[u8]) -> Result { - let value = i64::from_le_bytes(array_from_slice(data, 0)?); - Ok(value) + Ok(i64::from_le_bytes(array_from_slice(data, 0)?)) } /// Decodes a Decimal4 from the value section of a variant. @@ -154,45 +150,41 @@ pub(crate) fn decode_decimal16(data: &[u8]) -> Result<(i128, u8), ArrowError> { /// Decodes a Float from the value section of a variant. pub(crate) fn decode_float(data: &[u8]) -> Result { - let value = f32::from_le_bytes(array_from_slice(data, 0)?); - Ok(value) + Ok(f32::from_le_bytes(array_from_slice(data, 0)?)) } /// Decodes a Double from the value section of a variant. pub(crate) fn decode_double(data: &[u8]) -> Result { - let value = f64::from_le_bytes(array_from_slice(data, 0)?); - Ok(value) + Ok(f64::from_le_bytes(array_from_slice(data, 0)?)) } /// Decodes a Date from the value section of a variant. pub(crate) fn decode_date(data: &[u8]) -> Result { let days_since_epoch = i32::from_le_bytes(array_from_slice(data, 0)?); - let value = (DateTime::UNIX_EPOCH + Duration::days(days_since_epoch as i64)).date_naive(); - Ok(value) + let value = DateTime::UNIX_EPOCH + Duration::days(i64::from(days_since_epoch)); + Ok(value.date_naive()) } /// Decodes a TimestampMicros from the value section of a variant. pub(crate) fn decode_timestamp_micros(data: &[u8]) -> Result, ArrowError> { let micros_since_epoch = i64::from_le_bytes(array_from_slice(data, 0)?); - if let Some(value) = DateTime::from_timestamp_micros(micros_since_epoch) { - Ok(value) - } else { - Err(ArrowError::CastError(format!( + DateTime::from_timestamp_micros(micros_since_epoch).ok_or_else(|| { + ArrowError::CastError(format!( "Could not cast `{micros_since_epoch}` microseconds into a DateTime" - ))) - } + )) + }) } -/// Decodes a TimestampNTZMicros from the value section of a variant. +/// Decodes a TimestampNtzMicros from the value section of a variant. pub(crate) fn decode_timestampntz_micros(data: &[u8]) -> Result { let micros_since_epoch = i64::from_le_bytes(array_from_slice(data, 0)?); - if let Some(value) = DateTime::from_timestamp_micros(micros_since_epoch) { - Ok(value.naive_utc()) - } else { - Err(ArrowError::CastError(format!( - "Could not cast `{micros_since_epoch}` microseconds into a NaiveDateTime" - ))) - } + DateTime::from_timestamp_micros(micros_since_epoch) + .ok_or_else(|| { + ArrowError::CastError(format!( + "Could not cast `{micros_since_epoch}` microseconds into a NaiveDateTime" + )) + }) + .map(|v| v.naive_utc()) } /// Decodes a Binary from the value section of a variant. @@ -212,7 +204,6 @@ pub(crate) fn decode_long_string(data: &[u8]) -> Result<&str, ArrowError> { /// Decodes a short string from the value section of a variant. pub(crate) fn decode_short_string(metadata: u8, data: &[u8]) -> Result<&str, ArrowError> { let len = (metadata >> 2) as usize; - let string = string_from_slice(data, 0..len)?; Ok(string) } @@ -343,7 +334,7 @@ mod tests { #[test] fn test_binary() -> Result<(), ArrowError> { let data = [ - 9, 0, 0, 0, // Length of binary data, 4-byte little-endian + 0x09, 0, 0, 0, // Length of binary data, 4-byte little-endian 0x03, 0x13, 0x37, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe, ]; let result = decode_binary(&data)?; @@ -365,7 +356,7 @@ mod tests { #[test] fn test_string() -> Result<(), ArrowError> { let data = [ - 5, 0, 0, 0, // Length of string, 4-byte little-endian + 0x05, 0, 0, 0, // Length of string, 4-byte little-endian b'H', b'e', b'l', b'l', b'o', b'o', ]; let result = decode_long_string(&data)?; diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 0dd9db5cdacf..8a33eb2a9964 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -410,7 +410,7 @@ pub enum Variant<'m, 'v> { Int64(i64), Date(NaiveDate), TimestampMicros(DateTime), - TimestampNTZMicros(NaiveDateTime), + TimestampNtzMicros(NaiveDateTime), Decimal4 { integer: i32, scale: u8 }, Decimal8 { integer: i64, scale: u8 }, Decimal16 { integer: i128, scale: u8 }, @@ -464,8 +464,8 @@ impl<'m, 'v> Variant<'m, 'v> { VariantPrimitiveType::TimestampMicros => { Variant::TimestampMicros(decoder::decode_timestamp_micros(value_data)?) } - VariantPrimitiveType::TimestampNTZMicros => { - Variant::TimestampNTZMicros(decoder::decode_timestampntz_micros(value_data)?) + VariantPrimitiveType::TimestampNtzMicros => { + Variant::TimestampNtzMicros(decoder::decode_timestampntz_micros(value_data)?) } VariantPrimitiveType::Binary => { Variant::Binary(decoder::decode_binary(value_data)?) @@ -599,7 +599,7 @@ impl<'m, 'v> Variant<'m, 'v> { pub fn as_datetime_utc(&self) -> Option> { match *self { Variant::TimestampMicros(d) => Some(d), - Variant::TimestampNTZMicros(d) => Some(d.and_utc()), + Variant::TimestampNtzMicros(d) => Some(d.and_utc()), _ => None, } } @@ -631,7 +631,7 @@ impl<'m, 'v> Variant<'m, 'v> { /// ``` pub fn as_naive_datetime(&self) -> Option { match *self { - Variant::TimestampNTZMicros(d) => Some(d), + Variant::TimestampNtzMicros(d) => Some(d), Variant::TimestampMicros(d) => Some(d.naive_utc()), _ => None, } @@ -953,6 +953,7 @@ impl<'m, 'v> Variant<'m, 'v> { /// let v3 = Variant::from("hello!"); /// assert_eq!(v3.as_f32(), None); /// ``` + #[allow(clippy::cast_possible_truncation)] pub fn as_f32(&self) -> Option { match *self { Variant::Float(i) => Some(i), @@ -1000,37 +1001,37 @@ impl<'m, 'v> Variant<'m, 'v> { } } -impl<'m, 'v> From<()> for Variant<'m, 'v> { - fn from(_: ()) -> Self { +impl From<()> for Variant<'_, '_> { + fn from((): ()) -> Self { Variant::Null } } -impl<'m, 'v> From for Variant<'m, 'v> { +impl From for Variant<'_, '_> { fn from(value: i8) -> Self { Variant::Int8(value) } } -impl<'m, 'v> From for Variant<'m, 'v> { +impl From for Variant<'_, '_> { fn from(value: i16) -> Self { Variant::Int16(value) } } -impl<'m, 'v> From for Variant<'m, 'v> { +impl From for Variant<'_, '_> { fn from(value: i32) -> Self { Variant::Int32(value) } } -impl<'m, 'v> From for Variant<'m, 'v> { +impl From for Variant<'_, '_> { fn from(value: i64) -> Self { Variant::Int64(value) } } -impl<'m, 'v> From<(i32, u8)> for Variant<'m, 'v> { +impl From<(i32, u8)> for Variant<'_, '_> { fn from(value: (i32, u8)) -> Self { Variant::Decimal4 { integer: value.0, @@ -1039,7 +1040,7 @@ impl<'m, 'v> From<(i32, u8)> for Variant<'m, 'v> { } } -impl<'m, 'v> From<(i64, u8)> for Variant<'m, 'v> { +impl From<(i64, u8)> for Variant<'_, '_> { fn from(value: (i64, u8)) -> Self { Variant::Decimal8 { integer: value.0, @@ -1048,7 +1049,7 @@ impl<'m, 'v> From<(i64, u8)> for Variant<'m, 'v> { } } -impl<'m, 'v> From<(i128, u8)> for Variant<'m, 'v> { +impl From<(i128, u8)> for Variant<'_, '_> { fn from(value: (i128, u8)) -> Self { Variant::Decimal16 { integer: value.0, @@ -1057,51 +1058,52 @@ impl<'m, 'v> From<(i128, u8)> for Variant<'m, 'v> { } } -impl<'m, 'v> From for Variant<'m, 'v> { +impl From for Variant<'_, '_> { fn from(value: f32) -> Self { Variant::Float(value) } } -impl<'m, 'v> From for Variant<'m, 'v> { +impl From for Variant<'_, '_> { fn from(value: f64) -> Self { Variant::Double(value) } } -impl<'m, 'v> From for Variant<'m, 'v> { +impl From for Variant<'_, '_> { fn from(value: bool) -> Self { - match value { - true => Variant::BooleanTrue, - false => Variant::BooleanFalse, + if value { + Variant::BooleanTrue + } else { + Variant::BooleanFalse } } } -impl<'m, 'v> From for Variant<'m, 'v> { +impl From for Variant<'_, '_> { fn from(value: NaiveDate) -> Self { Variant::Date(value) } } -impl<'m, 'v> From> for Variant<'m, 'v> { +impl From> for Variant<'_, '_> { fn from(value: DateTime) -> Self { Variant::TimestampMicros(value) } } -impl<'m, 'v> From for Variant<'m, 'v> { +impl From for Variant<'_, '_> { fn from(value: NaiveDateTime) -> Self { - Variant::TimestampNTZMicros(value) + Variant::TimestampNtzMicros(value) } } -impl<'m, 'v> From<&'v [u8]> for Variant<'m, 'v> { +impl<'v> From<&'v [u8]> for Variant<'_, 'v> { fn from(value: &'v [u8]) -> Self { Variant::Binary(value) } } -impl<'m, 'v> From<&'v str> for Variant<'m, 'v> { +impl<'v> From<&'v str> for Variant<'_, 'v> { fn from(value: &'v str) -> Self { if value.len() < 64 { Variant::ShortString(value) diff --git a/parquet-variant/tests/variant_interop.rs b/parquet-variant/tests/variant_interop.rs index 677bba240b4d..0db54e969a92 100644 --- a/parquet-variant/tests/variant_interop.rs +++ b/parquet-variant/tests/variant_interop.rs @@ -63,7 +63,7 @@ fn get_primitive_cases() -> Vec<(&'static str, Variant<'static, 'static>)> { ("primitive_null", Variant::Null), ("primitive_string", Variant::String("This string is longer than 64 bytes and therefore does not fit in a short_string and it also includes several non ascii characters such as 🐢, 💖, ♥\u{fe0f}, 🎣 and 🤦!!")), ("primitive_timestamp", Variant::TimestampMicros(NaiveDate::from_ymd_opt(2025, 4, 16).unwrap().and_hms_milli_opt(16, 34, 56, 780).unwrap().and_utc())), - ("primitive_timestampntz", Variant::TimestampNTZMicros(NaiveDate::from_ymd_opt(2025, 4, 16).unwrap().and_hms_milli_opt(12, 34, 56, 780).unwrap())), + ("primitive_timestampntz", Variant::TimestampNtzMicros(NaiveDate::from_ymd_opt(2025, 4, 16).unwrap().and_hms_milli_opt(12, 34, 56, 780).unwrap())), ("short_string", Variant::ShortString("Less than 64 bytes (❤\u{fe0f} with utf8)")), ] }