diff --git a/parquet-variant/Cargo.toml b/parquet-variant/Cargo.toml index 41b127ef14e6..47522f469995 100644 --- a/parquet-variant/Cargo.toml +++ b/parquet-variant/Cargo.toml @@ -32,7 +32,6 @@ rust-version = { workspace = true } [dependencies] arrow-schema = "55.1.0" +chrono = { workspace = true } [lib] - - diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs index a3d2f87062ea..2b2e3bc0ca3c 100644 --- a/parquet-variant/src/decoder.rs +++ b/parquet-variant/src/decoder.rs @@ -15,9 +15,10 @@ // specific language governing permissions and limitations // under the License. use arrow_schema::ArrowError; +use chrono::{DateTime, Duration, NaiveDate, NaiveDateTime, Utc}; use std::array::TryFromSliceError; -use crate::utils::{array_from_slice, first_byte_from_slice, string_from_slice}; +use crate::utils::{array_from_slice, slice_from_slice, string_from_slice}; #[derive(Debug, Clone, Copy)] pub enum VariantBasicType { @@ -33,7 +34,18 @@ pub enum VariantPrimitiveType { BooleanTrue = 1, BooleanFalse = 2, Int8 = 3, - // TODO: Add types for the rest of primitives, once API is agreed upon + Int16 = 4, + Int32 = 5, + Int64 = 6, + Double = 7, + Decimal4 = 8, + Decimal8 = 9, + Decimal16 = 10, + Date = 11, + TimestampMicros = 12, + TimestampNtzMicros = 13, + Float = 14, + Binary = 15, String = 16, } @@ -64,7 +76,18 @@ impl TryFrom for VariantPrimitiveType { 1 => Ok(VariantPrimitiveType::BooleanTrue), 2 => Ok(VariantPrimitiveType::BooleanFalse), 3 => Ok(VariantPrimitiveType::Int8), - // TODO: Add types for the rest, once API is agreed upon + 4 => Ok(VariantPrimitiveType::Int16), + 5 => Ok(VariantPrimitiveType::Int32), + 6 => Ok(VariantPrimitiveType::Int64), + 7 => Ok(VariantPrimitiveType::Double), + 8 => Ok(VariantPrimitiveType::Decimal4), + 9 => Ok(VariantPrimitiveType::Decimal8), + 10 => Ok(VariantPrimitiveType::Decimal16), + 11 => Ok(VariantPrimitiveType::Date), + 12 => Ok(VariantPrimitiveType::TimestampMicros), + 13 => 
Ok(VariantPrimitiveType::TimestampNtzMicros), + 14 => Ok(VariantPrimitiveType::Float), + 15 => Ok(VariantPrimitiveType::Binary), 16 => Ok(VariantPrimitiveType::String), _ => Err(ArrowError::InvalidArgumentError(format!( "unknown primitive type: {}", @@ -73,10 +96,10 @@ impl TryFrom for VariantPrimitiveType { } } } -/// Extract the primitive type from a Variant value-header byte -pub(crate) fn get_primitive_type(header: u8) -> Result { +/// Extract the primitive type from a Variant value-metadata byte +pub(crate) fn get_primitive_type(metadata: u8) -> Result { // last 6 bits contain the primitive-type, see spec - VariantPrimitiveType::try_from(header >> 2) + VariantPrimitiveType::try_from(metadata >> 2) } /// To be used in `map_err` when unpacking an integer from a slice of bytes. @@ -85,23 +108,103 @@ fn map_try_from_slice_error(e: TryFromSliceError) -> ArrowError { } /// Decodes an Int8 from the value section of a variant. -pub(crate) fn decode_int8(value: &[u8]) -> Result { - let value = i8::from_le_bytes(array_from_slice(value, 1)?); +pub(crate) fn decode_int8(data: &[u8]) -> Result { + Ok(i8::from_le_bytes(array_from_slice(data, 0)?)) +} + +/// Decodes an Int16 from the value section of a variant. +pub(crate) fn decode_int16(data: &[u8]) -> Result { + Ok(i16::from_le_bytes(array_from_slice(data, 0)?)) +} + +/// Decodes an Int32 from the value section of a variant. +pub(crate) fn decode_int32(data: &[u8]) -> Result { + Ok(i32::from_le_bytes(array_from_slice(data, 0)?)) +} + +/// Decodes an Int64 from the value section of a variant. +pub(crate) fn decode_int64(data: &[u8]) -> Result { + Ok(i64::from_le_bytes(array_from_slice(data, 0)?)) +} + +/// Decodes a Decimal4 from the value section of a variant. 
+pub(crate) fn decode_decimal4(data: &[u8]) -> Result<(i32, u8), ArrowError> {
+    let scale = u8::from_le_bytes(array_from_slice(data, 0)?);
+    let integer = i32::from_le_bytes(array_from_slice(data, 1)?);
+    Ok((integer, scale))
+}
+
+/// Decodes a Decimal8 from the value section of a variant as `(unscaled integer, scale)`.
+pub(crate) fn decode_decimal8(data: &[u8]) -> Result<(i64, u8), ArrowError> {
+    let scale = u8::from_le_bytes(array_from_slice(data, 0)?);
+    let integer = i64::from_le_bytes(array_from_slice(data, 1)?);
+    Ok((integer, scale))
+}
+
+/// Decodes a Decimal16 from the value section of a variant as `(unscaled integer, scale)`.
+pub(crate) fn decode_decimal16(data: &[u8]) -> Result<(i128, u8), ArrowError> {
+    let scale = u8::from_le_bytes(array_from_slice(data, 0)?);
+    let integer = i128::from_le_bytes(array_from_slice(data, 1)?);
+    Ok((integer, scale))
+}
+
+/// Decodes a Float (little-endian `f32`) from the value section of a variant.
+pub(crate) fn decode_float(data: &[u8]) -> Result<f32, ArrowError> {
+    Ok(f32::from_le_bytes(array_from_slice(data, 0)?))
+}
+
+/// Decodes a Double (little-endian `f64`) from the value section of a variant.
+pub(crate) fn decode_double(data: &[u8]) -> Result<f64, ArrowError> {
+    Ok(f64::from_le_bytes(array_from_slice(data, 0)?))
+}
+
+/// Decodes a Date (days since the Unix epoch); errors instead of panicking when out of chrono's range.
+pub(crate) fn decode_date(data: &[u8]) -> Result<NaiveDate, ArrowError> {
+    let days_since_epoch = i32::from_le_bytes(array_from_slice(data, 0)?);
+    let value = DateTime::UNIX_EPOCH.checked_add_signed(Duration::days(days_since_epoch.into()));
+    value.map(|v| v.date_naive()).ok_or_else(|| ArrowError::CastError("date out of range".into()))
+}
+
+/// Decodes a TimestampMicros (microseconds since the Unix epoch) from the value section of a variant.
+pub(crate) fn decode_timestamp_micros(data: &[u8]) -> Result<DateTime<Utc>, ArrowError> {
+    let micros_since_epoch = i64::from_le_bytes(array_from_slice(data, 0)?);
+    DateTime::from_timestamp_micros(micros_since_epoch).ok_or_else(|| {
+        ArrowError::CastError(format!(
+            "Could not cast `{micros_since_epoch}` microseconds into a DateTime"
+        ))
+    })
+}
+
+/// Decodes a TimestampNtzMicros from the value section of a variant.
+pub(crate) fn decode_timestampntz_micros(data: &[u8]) -> Result { + let micros_since_epoch = i64::from_le_bytes(array_from_slice(data, 0)?); + DateTime::from_timestamp_micros(micros_since_epoch) + .ok_or_else(|| { + ArrowError::CastError(format!( + "Could not cast `{micros_since_epoch}` microseconds into a NaiveDateTime" + )) + }) + .map(|v| v.naive_utc()) +} + +/// Decodes a Binary from the value section of a variant. +pub(crate) fn decode_binary(data: &[u8]) -> Result<&[u8], ArrowError> { + let len = u32::from_le_bytes(array_from_slice(data, 0)?) as usize; + let value = slice_from_slice(data, 4..4 + len)?; Ok(value) } /// Decodes a long string from the value section of a variant. -pub(crate) fn decode_long_string(value: &[u8]) -> Result<&str, ArrowError> { - let len = u32::from_le_bytes(array_from_slice(value, 1)?) as usize; - let string = string_from_slice(value, 5..5 + len)?; +pub(crate) fn decode_long_string(data: &[u8]) -> Result<&str, ArrowError> { + let len = u32::from_le_bytes(array_from_slice(data, 0)?) as usize; + let string = string_from_slice(data, 4..4 + len)?; Ok(string) } /// Decodes a short string from the value section of a variant. -pub(crate) fn decode_short_string(value: &[u8]) -> Result<&str, ArrowError> { - let len = (first_byte_from_slice(value)? 
>> 2) as usize; - - let string = string_from_slice(value, 1..1 + len)?; +pub(crate) fn decode_short_string(metadata: u8, data: &[u8]) -> Result<&str, ArrowError> { + let len = (metadata >> 2) as usize; + let string = string_from_slice(data, 0..len)?; Ok(string) } @@ -111,47 +214,152 @@ mod tests { #[test] fn test_i8() -> Result<(), ArrowError> { - let value = [ - 3 << 2, // Primitive type for i8 - 42, - ]; - let result = decode_int8(&value)?; + let data = [0x2a]; + let result = decode_int8(&data)?; assert_eq!(result, 42); Ok(()) } #[test] - fn test_short_string() -> Result<(), ArrowError> { - let value = [ - 1 | 5 << 2, // Basic type for short string | length of short string - b'H', - b'e', - b'l', - b'l', - b'o', - b'o', + fn test_i16() -> Result<(), ArrowError> { + let data = [0xd2, 0x04]; + let result = decode_int16(&data)?; + assert_eq!(result, 1234); + Ok(()) + } + + #[test] + fn test_i32() -> Result<(), ArrowError> { + let data = [0x40, 0xe2, 0x01, 0x00]; + let result = decode_int32(&data)?; + assert_eq!(result, 123456); + Ok(()) + } + + #[test] + fn test_i64() -> Result<(), ArrowError> { + let data = [0x15, 0x81, 0xe9, 0x7d, 0xf4, 0x10, 0x22, 0x11]; + let result = decode_int64(&data)?; + assert_eq!(result, 1234567890123456789); + Ok(()) + } + + #[test] + fn test_decimal4() -> Result<(), ArrowError> { + let data = [ + 0x02, // Scale + 0xd2, 0x04, 0x00, 0x00, // Integer ]; - let result = decode_short_string(&value)?; + let result = decode_decimal4(&data)?; + assert_eq!(result, (1234, 2)); + Ok(()) + } + + #[test] + fn test_decimal8() -> Result<(), ArrowError> { + let data = [ + 0x02, // Scale + 0xd2, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, // Integer + ]; + let result = decode_decimal8(&data)?; + assert_eq!(result, (1234567890, 2)); + Ok(()) + } + + #[test] + fn test_decimal16() -> Result<(), ArrowError> { + let data = [ + 0x02, // Scale + 0xd2, 0xb6, 0x23, 0xc0, 0xf4, 0x10, 0x22, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, // Integer + ]; + let 
result = decode_decimal16(&data)?; + assert_eq!(result, (1234567891234567890, 2)); + Ok(()) + } + + #[test] + fn test_float() -> Result<(), ArrowError> { + let data = [0x06, 0x2c, 0x93, 0x4e]; + let result = decode_float(&data)?; + assert_eq!(result, 1234567890.1234); + Ok(()) + } + + #[test] + fn test_double() -> Result<(), ArrowError> { + let data = [0xc9, 0xe5, 0x87, 0xb4, 0x80, 0x65, 0xd2, 0x41]; + let result = decode_double(&data)?; + assert_eq!(result, 1234567890.1234); + Ok(()) + } + + #[test] + fn test_date() -> Result<(), ArrowError> { + let data = [0xe2, 0x4e, 0x0, 0x0]; + let result = decode_date(&data)?; + assert_eq!(result, NaiveDate::from_ymd_opt(2025, 4, 16).unwrap()); + Ok(()) + } + + #[test] + fn test_timestamp_micros() -> Result<(), ArrowError> { + let data = [0xe0, 0x52, 0x97, 0xdd, 0xe7, 0x32, 0x06, 0x00]; + let result = decode_timestamp_micros(&data)?; + assert_eq!( + result, + NaiveDate::from_ymd_opt(2025, 4, 16) + .unwrap() + .and_hms_milli_opt(16, 34, 56, 780) + .unwrap() + .and_utc() + ); + Ok(()) + } + + #[test] + fn test_timestampntz_micros() -> Result<(), ArrowError> { + let data = [0xe0, 0x52, 0x97, 0xdd, 0xe7, 0x32, 0x06, 0x00]; + let result = decode_timestampntz_micros(&data)?; + assert_eq!( + result, + NaiveDate::from_ymd_opt(2025, 4, 16) + .unwrap() + .and_hms_milli_opt(16, 34, 56, 780) + .unwrap() + ); + Ok(()) + } + + #[test] + fn test_binary() -> Result<(), ArrowError> { + let data = [ + 0x09, 0, 0, 0, // Length of binary data, 4-byte little-endian + 0x03, 0x13, 0x37, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe, + ]; + let result = decode_binary(&data)?; + assert_eq!( + result, + [0x03, 0x13, 0x37, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe] + ); + Ok(()) + } + + #[test] + fn test_short_string() -> Result<(), ArrowError> { + let data = [b'H', b'e', b'l', b'l', b'o', b'o']; + let result = decode_short_string(1 | 5 << 2, &data)?; assert_eq!(result, "Hello"); Ok(()) } #[test] fn test_string() -> Result<(), ArrowError> { - let value = [ - 16 << 2, // 
Basic type for short string | length of short string - 5, - 0, - 0, - 0, // Length of string - b'H', - b'e', - b'l', - b'l', - b'o', - b'o', + let data = [ + 0x05, 0, 0, 0, // Length of string, 4-byte little-endian + b'H', b'e', b'l', b'l', b'o', b'o', ]; - let result = decode_long_string(&value)?; + let result = decode_long_string(&data)?; assert_eq!(result, "Hello"); Ok(()) } diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index b7b1932580b1..8a33eb2a9964 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -19,6 +19,7 @@ use crate::decoder::{ }; use crate::utils::{array_from_slice, first_byte_from_slice, slice_from_slice, string_from_slice}; use arrow_schema::ArrowError; +use chrono::{DateTime, NaiveDate, NaiveDateTime, Utc}; use std::{num::TryFromIntError, ops::Range}; #[derive(Clone, Debug, Copy, PartialEq)] @@ -303,7 +304,8 @@ impl<'m> VariantMetadata<'m> { #[derive(Clone, Copy, Debug, PartialEq)] pub struct VariantObject<'m, 'v> { pub metadata: &'m VariantMetadata<'m>, - pub value: &'v [u8], + pub value_metadata: u8, + pub value_data: &'v [u8], } impl<'m, 'v> VariantObject<'m, 'v> { pub fn fields(&self) -> Result)>, ArrowError> { @@ -319,7 +321,8 @@ impl<'m, 'v> VariantObject<'m, 'v> { #[derive(Clone, Copy, Debug, PartialEq)] pub struct VariantArray<'m, 'v> { pub metadata: &'m VariantMetadata<'m>, - pub value: &'v [u8], + pub value_metadata: u8, + pub value_data: &'v [u8], } impl<'m, 'v> VariantArray<'m, 'v> { @@ -342,7 +345,7 @@ impl<'m, 'v> VariantArray<'m, 'v> { pub fn get(&self, index: usize) -> Result, ArrowError> { // The 6 first bits to the left are the value_header and the 2 bits // to the right are the basic type, so we shift to get only the value_header - let value_header = first_byte_from_slice(self.value)? 
>> 2; + let value_header = self.value_metadata >> 2; let is_large = (value_header & 0x04) != 0; // 3rd bit from the right let field_offset_size_minus_one = value_header & 0x03; // Last two bits let offset_size = OffsetSizeBytes::try_new(field_offset_size_minus_one)?; @@ -352,11 +355,11 @@ impl<'m, 'v> VariantArray<'m, 'v> { true => OffsetSizeBytes::Four, false => OffsetSizeBytes::One, }; - // Skip the header byte to read the num_elements + // Read the num_elements // The size of the num_elements entry in the array value_data is 4 bytes if // is_large is true, otherwise 1 byte. - let num_elements = num_elements_size.unpack_usize(self.value, 1, 0)?; - let first_offset_byte = 1 + num_elements_size as usize; + let num_elements = num_elements_size.unpack_usize(self.value_data, 0, 0)?; + let first_offset_byte = num_elements_size as usize; let overflow = || ArrowError::InvalidArgumentError("Variant value_byte_length overflow".into()); @@ -374,15 +377,15 @@ impl<'m, 'v> VariantArray<'m, 'v> { .checked_add(value_bytes) .ok_or_else(overflow)?; - // Skip header and num_elements bytes to read the offsets + // Skip num_elements bytes to read the offsets let start_field_offset_from_first_value_byte = - offset_size.unpack_usize(self.value, first_offset_byte, index)?; + offset_size.unpack_usize(self.value_data, first_offset_byte, index)?; let end_field_offset_from_first_value_byte = - offset_size.unpack_usize(self.value, first_offset_byte, index + 1)?; + offset_size.unpack_usize(self.value_data, first_offset_byte, index + 1)?; // Read the value bytes from the offsets let variant_value_bytes = slice_from_slice( - self.value, + self.value_data, first_value_byte + start_field_offset_from_first_value_byte ..first_value_byte + end_field_offset_from_first_value_byte, )?; @@ -402,11 +405,22 @@ pub enum Variant<'m, 'v> { // TODO: Add types for the rest of the primitive types, once API is agreed upon Null, Int8(i8), - + Int16(i16), + Int32(i32), + Int64(i64), + Date(NaiveDate), + 
TimestampMicros(DateTime), + TimestampNtzMicros(NaiveDateTime), + Decimal4 { integer: i32, scale: u8 }, + Decimal8 { integer: i64, scale: u8 }, + Decimal16 { integer: i128, scale: u8 }, + Float(f32), + Double(f64), BooleanTrue, BooleanFalse, // Note: only need the *value* buffer + Binary(&'v [u8]), String(&'v str), ShortString(&'v str), @@ -418,31 +432,109 @@ pub enum Variant<'m, 'v> { impl<'m, 'v> Variant<'m, 'v> { /// Parse the buffers and return the appropriate variant. pub fn try_new(metadata: &'m VariantMetadata, value: &'v [u8]) -> Result { - let header = *first_byte_from_slice(value)?; - let new_self = match get_basic_type(header)? { - VariantBasicType::Primitive => match get_primitive_type(header)? { + let value_metadata = *first_byte_from_slice(value)?; + let value_data = slice_from_slice(value, 1..)?; + let new_self = match get_basic_type(value_metadata)? { + VariantBasicType::Primitive => match get_primitive_type(value_metadata)? { VariantPrimitiveType::Null => Variant::Null, - VariantPrimitiveType::Int8 => Variant::Int8(decoder::decode_int8(value)?), + VariantPrimitiveType::Int8 => Variant::Int8(decoder::decode_int8(value_data)?), + VariantPrimitiveType::Int16 => Variant::Int16(decoder::decode_int16(value_data)?), + VariantPrimitiveType::Int32 => Variant::Int32(decoder::decode_int32(value_data)?), + VariantPrimitiveType::Int64 => Variant::Int64(decoder::decode_int64(value_data)?), + VariantPrimitiveType::Decimal4 => { + let (integer, scale) = decoder::decode_decimal4(value_data)?; + Variant::Decimal4 { integer, scale } + } + VariantPrimitiveType::Decimal8 => { + let (integer, scale) = decoder::decode_decimal8(value_data)?; + Variant::Decimal8 { integer, scale } + } + VariantPrimitiveType::Decimal16 => { + let (integer, scale) = decoder::decode_decimal16(value_data)?; + Variant::Decimal16 { integer, scale } + } + VariantPrimitiveType::Float => Variant::Float(decoder::decode_float(value_data)?), + VariantPrimitiveType::Double => { + 
Variant::Double(decoder::decode_double(value_data)?) + } VariantPrimitiveType::BooleanTrue => Variant::BooleanTrue, VariantPrimitiveType::BooleanFalse => Variant::BooleanFalse, // TODO: Add types for the rest, once API is agreed upon + VariantPrimitiveType::Date => Variant::Date(decoder::decode_date(value_data)?), + VariantPrimitiveType::TimestampMicros => { + Variant::TimestampMicros(decoder::decode_timestamp_micros(value_data)?) + } + VariantPrimitiveType::TimestampNtzMicros => { + Variant::TimestampNtzMicros(decoder::decode_timestampntz_micros(value_data)?) + } + VariantPrimitiveType::Binary => { + Variant::Binary(decoder::decode_binary(value_data)?) + } VariantPrimitiveType::String => { - Variant::String(decoder::decode_long_string(value)?) + Variant::String(decoder::decode_long_string(value_data)?) } }, VariantBasicType::ShortString => { - Variant::ShortString(decoder::decode_short_string(value)?) + Variant::ShortString(decoder::decode_short_string(value_metadata, value_data)?) } - VariantBasicType::Object => Variant::Object(VariantObject { metadata, value }), - VariantBasicType::Array => Variant::Array(VariantArray { metadata, value }), + VariantBasicType::Object => Variant::Object(VariantObject { + metadata, + value_metadata, + value_data, + }), + VariantBasicType::Array => Variant::Array(VariantArray { + metadata, + value_metadata, + value_data, + }), }; Ok(new_self) } + /// Converts this variant to `()` if it is null. + /// + /// Returns `Some(())` for null variants, + /// `None` for non-null variants. 
+ /// + /// # Examples + /// + /// ``` + /// use parquet_variant::Variant; + /// + /// // you can extract `()` from a null variant + /// let v1 = Variant::from(()); + /// assert_eq!(v1.as_null(), Some(())); + /// + /// // but not from other variants + /// let v2 = Variant::from("hello!"); + /// assert_eq!(v2.as_null(), None); + /// ``` pub fn as_null(&self) -> Option<()> { matches!(self, Variant::Null).then_some(()) } + /// Converts this variant to a `bool` if possible. + /// + /// Returns `Some(bool)` for boolean variants, + /// `None` for non-boolean variants. + /// + /// # Examples + /// + /// ``` + /// use parquet_variant::Variant; + /// + /// // you can extract a bool from the true variant + /// let v1 = Variant::from(true); + /// assert_eq!(v1.as_boolean(), Some(true)); + /// + /// // and the false variant + /// let v2 = Variant::from(false); + /// assert_eq!(v2.as_boolean(), Some(false)); + /// + /// // but not from other variants + /// let v3 = Variant::from("hello!"); + /// assert_eq!(v3.as_boolean(), None); + /// ``` pub fn as_boolean(&self) -> Option { match self { Variant::BooleanTrue => Some(true), @@ -451,6 +543,146 @@ impl<'m, 'v> Variant<'m, 'v> { } } + /// Converts this variant to a `NaiveDate` if possible. + /// + /// Returns `Some(NaiveDate)` for date variants, + /// `None` for non-date variants. + /// + /// # Examples + /// + /// ``` + /// use parquet_variant::Variant; + /// use chrono::NaiveDate; + /// + /// // you can extract a NaiveDate from a date variant + /// let date = NaiveDate::from_ymd_opt(2025, 4, 12).unwrap(); + /// let v1 = Variant::from(date); + /// assert_eq!(v1.as_naive_date(), Some(date)); + /// + /// // but not from other variants + /// let v2 = Variant::from("hello!"); + /// assert_eq!(v2.as_naive_date(), None); + /// ``` + pub fn as_naive_date(&self) -> Option { + if let Variant::Date(d) = self { + Some(*d) + } else { + None + } + } + + /// Converts this variant to a `DateTime` if possible. 
+    ///
+    /// Returns `Some(DateTime<Utc>)` for timestamp variants (values without a time zone
+    /// are interpreted as UTC), `None` for non-timestamp variants.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use parquet_variant::Variant;
+    /// use chrono::NaiveDate;
+    ///
+    /// // you can extract a DateTime from a UTC-adjusted variant
+    /// let datetime = NaiveDate::from_ymd_opt(2025, 4, 16).unwrap().and_hms_milli_opt(12, 34, 56, 780).unwrap().and_utc();
+    /// let v1 = Variant::from(datetime);
+    /// assert_eq!(v1.as_datetime_utc(), Some(datetime));
+    ///
+    /// // or a non-UTC-adjusted variant
+    /// let datetime = NaiveDate::from_ymd_opt(2025, 4, 16).unwrap().and_hms_milli_opt(12, 34, 56, 780).unwrap();
+    /// let v2 = Variant::from(datetime);
+    /// assert_eq!(v2.as_datetime_utc(), Some(datetime.and_utc()));
+    ///
+    /// // but not from other variants
+    /// let v3 = Variant::from("hello!");
+    /// assert_eq!(v3.as_datetime_utc(), None);
+    /// ```
+    pub fn as_datetime_utc(&self) -> Option<DateTime<Utc>> {
+        match *self {
+            Variant::TimestampMicros(d) => Some(d),
+            Variant::TimestampNtzMicros(d) => Some(d.and_utc()),
+            _ => None,
+        }
+    }
+
+    /// Converts this variant to a `NaiveDateTime` if possible.
+    ///
+    /// Returns `Some(NaiveDateTime)` for timestamp variants,
+    /// `None` for non-timestamp variants.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use parquet_variant::Variant;
+    /// use chrono::NaiveDate;
+    ///
+    /// // you can extract a NaiveDateTime from a non-UTC-adjusted variant
+    /// let datetime = NaiveDate::from_ymd_opt(2025, 4, 16).unwrap().and_hms_milli_opt(12, 34, 56, 780).unwrap();
+    /// let v1 = Variant::from(datetime);
+    /// assert_eq!(v1.as_naive_datetime(), Some(datetime));
+    ///
+    /// // or a UTC-adjusted variant
+    /// let datetime = NaiveDate::from_ymd_opt(2025, 4, 16).unwrap().and_hms_milli_opt(12, 34, 56, 780).unwrap().and_utc();
+    /// let v2 = Variant::from(datetime);
+    /// assert_eq!(v2.as_naive_datetime(), Some(datetime.naive_utc()));
+    ///
+    /// // but not from other variants
+    /// let v3 = Variant::from("hello!");
+    /// assert_eq!(v3.as_naive_datetime(), None);
+    /// ```
+    pub fn as_naive_datetime(&self) -> Option<NaiveDateTime> {
+        match *self {
+            Variant::TimestampNtzMicros(d) => Some(d),
+            Variant::TimestampMicros(d) => Some(d.naive_utc()),
+            _ => None,
+        }
+    }
+
+    /// Converts this variant to a `&[u8]` if possible.
+    ///
+    /// Returns `Some(&[u8])` for binary variants,
+    /// `None` for non-binary variants.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use parquet_variant::Variant;
+    ///
+    /// // you can extract a byte slice from a binary variant
+    /// let data = b"hello!";
+    /// let v1 = Variant::Binary(data);
+    /// assert_eq!(v1.as_u8_slice(), Some(data.as_slice()));
+    ///
+    /// // but not from other variant types
+    /// let v2 = Variant::from(123i64);
+    /// assert_eq!(v2.as_u8_slice(), None);
+    /// ```
+    pub fn as_u8_slice(&'v self) -> Option<&'v [u8]> {
+        if let Variant::Binary(d) = self {
+            Some(d)
+        } else {
+            None
+        }
+    }
+
+    /// Converts this variant to a `&str` if possible.
+    ///
+    /// Returns `Some(&str)` for string variants (both regular and short strings),
+    /// `None` for non-string variants.
+ /// + /// # Examples + /// + /// ``` + /// use parquet_variant::Variant; + /// + /// // you can extract a string from string variants + /// let s = "hello!"; + /// let v1 = Variant::ShortString(s); + /// assert_eq!(v1.as_string(), Some(s)); + /// + /// // but not from other variants + /// let v2 = Variant::from(123i64); + /// assert_eq!(v2.as_string(), None); + /// ``` pub fn as_string(&'v self) -> Option<&'v str> { match self { Variant::String(s) | Variant::ShortString(s) => Some(s), @@ -458,12 +690,304 @@ impl<'m, 'v> Variant<'m, 'v> { } } + /// Converts this variant to an `i8` if possible. + /// + /// Returns `Some(i8)` for integer variants that fit in `i8` range, + /// `None` for non-integer variants or values that would overflow. + /// + /// # Examples + /// + /// ``` + /// use parquet_variant::Variant; + /// + /// // you can read an int64 variant into an i8 if it fits + /// let v1 = Variant::from(123i64); + /// assert_eq!(v1.as_int8(), Some(123i8)); + /// + /// // but not if it would overflow + /// let v2 = Variant::from(1234i64); + /// assert_eq!(v2.as_int8(), None); + /// + /// // or if the variant cannot be cast into an integer + /// let v3 = Variant::from("hello!"); + /// assert_eq!(v3.as_int8(), None); + /// ``` pub fn as_int8(&self) -> Option { match *self { Variant::Int8(i) => Some(i), - // TODO: Add branches for type-widening/shortening when implemting rest of primitives for int - // Variant::Int16(i) => i.try_into().ok(), - // ... + Variant::Int16(i) => i.try_into().ok(), + Variant::Int32(i) => i.try_into().ok(), + Variant::Int64(i) => i.try_into().ok(), + _ => None, + } + } + + /// Converts this variant to an `i16` if possible. + /// + /// Returns `Some(i16)` for integer variants that fit in `i16` range, + /// `None` for non-integer variants or values that would overflow. 
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use parquet_variant::Variant;
+    ///
+    /// // you can read an int64 variant into an i16 if it fits
+    /// let v1 = Variant::from(123i64);
+    /// assert_eq!(v1.as_int16(), Some(123i16));
+    ///
+    /// // but not if it would overflow
+    /// let v2 = Variant::from(123456i64);
+    /// assert_eq!(v2.as_int16(), None);
+    ///
+    /// // or if the variant cannot be cast into an integer
+    /// let v3 = Variant::from("hello!");
+    /// assert_eq!(v3.as_int16(), None);
+    /// ```
+    pub fn as_int16(&self) -> Option<i16> {
+        match *self {
+            Variant::Int8(i) => Some(i.into()),
+            Variant::Int16(i) => Some(i),
+            Variant::Int32(i) => i.try_into().ok(),
+            Variant::Int64(i) => i.try_into().ok(),
+            _ => None,
+        }
+    }
+
+    /// Converts this variant to an `i32` if possible.
+    ///
+    /// Returns `Some(i32)` for integer variants that fit in `i32` range,
+    /// `None` for non-integer variants or values that would overflow.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use parquet_variant::Variant;
+    ///
+    /// // you can read an int64 variant into an i32 if it fits
+    /// let v1 = Variant::from(123i64);
+    /// assert_eq!(v1.as_int32(), Some(123i32));
+    ///
+    /// // but not if it would overflow
+    /// let v2 = Variant::from(12345678901i64);
+    /// assert_eq!(v2.as_int32(), None);
+    ///
+    /// // or if the variant cannot be cast into an integer
+    /// let v3 = Variant::from("hello!");
+    /// assert_eq!(v3.as_int32(), None);
+    /// ```
+    pub fn as_int32(&self) -> Option<i32> {
+        match *self {
+            Variant::Int8(i) => Some(i.into()),
+            Variant::Int16(i) => Some(i.into()),
+            Variant::Int32(i) => Some(i),
+            Variant::Int64(i) => i.try_into().ok(),
+            _ => None,
+        }
+    }
+
+    /// Converts this variant to an `i64` if possible.
+    ///
+    /// Returns `Some(i64)` for every integer variant (all of them fit in `i64`),
+    /// `None` for non-integer variants.
+ /// + /// # Examples + /// + /// ``` + /// use parquet_variant::Variant; + /// + /// // you can read an int64 variant into an i64 + /// let v1 = Variant::from(123i64); + /// assert_eq!(v1.as_int64(), Some(123i64)); + /// + /// // but not a variant that cannot be cast into an integer + /// let v2 = Variant::from("hello!"); + /// assert_eq!(v2.as_int64(), None); + /// ``` + pub fn as_int64(&self) -> Option { + match *self { + Variant::Int8(i) => Some(i.into()), + Variant::Int16(i) => Some(i.into()), + Variant::Int32(i) => Some(i.into()), + Variant::Int64(i) => Some(i), + _ => None, + } + } + + /// Converts this variant to tuple with a 4-byte unscaled value if possible. + /// + /// Returns `Some((i32, u8))` for decimal variants where the unscaled value + /// fits in `i32` range, + /// `None` for non-decimal variants or decimal values that would overflow. + /// + /// # Examples + /// + /// ``` + /// use parquet_variant::Variant; + /// + /// // you can extract decimal parts from smaller or equally-sized decimal variants + /// let v1 = Variant::from((1234_i32, 2)); + /// assert_eq!(v1.as_decimal_int32(), Some((1234_i32, 2))); + /// + /// // and from larger decimal variants if they fit + /// let v2 = Variant::from((1234_i64, 2)); + /// assert_eq!(v2.as_decimal_int32(), Some((1234_i32, 2))); + /// + /// // but not if the value would overflow i32 + /// let v3 = Variant::from((12345678901i64, 2)); + /// assert_eq!(v3.as_decimal_int32(), None); + /// + /// // or if the variant is not a decimal + /// let v4 = Variant::from("hello!"); + /// assert_eq!(v4.as_decimal_int32(), None); + /// ``` + pub fn as_decimal_int32(&self) -> Option<(i32, u8)> { + match *self { + Variant::Decimal4 { integer, scale } => Some((integer, scale)), + Variant::Decimal8 { integer, scale } => { + if let Ok(converted_integer) = integer.try_into() { + Some((converted_integer, scale)) + } else { + None + } + } + Variant::Decimal16 { integer, scale } => { + if let Ok(converted_integer) = 
integer.try_into() { + Some((converted_integer, scale)) + } else { + None + } + } + _ => None, + } + } + + /// Converts this variant to tuple with an 8-byte unscaled value if possible. + /// + /// Returns `Some((i64, u8))` for decimal variants where the unscaled value + /// fits in `i64` range, + /// `None` for non-decimal variants or decimal values that would overflow. + /// + /// # Examples + /// + /// ``` + /// use parquet_variant::Variant; + /// + /// // you can extract decimal parts from smaller or equally-sized decimal variants + /// let v1 = Variant::from((1234_i64, 2)); + /// assert_eq!(v1.as_decimal_int64(), Some((1234_i64, 2))); + /// + /// // and from larger decimal variants if they fit + /// let v2 = Variant::from((1234_i128, 2)); + /// assert_eq!(v2.as_decimal_int64(), Some((1234_i64, 2))); + /// + /// // but not if the value would overflow i64 + /// let v3 = Variant::from((2e19 as i128, 2)); + /// assert_eq!(v3.as_decimal_int64(), None); + /// + /// // or if the variant is not a decimal + /// let v4 = Variant::from("hello!"); + /// assert_eq!(v4.as_decimal_int64(), None); + /// ``` + pub fn as_decimal_int64(&self) -> Option<(i64, u8)> { + match *self { + Variant::Decimal4 { integer, scale } => Some((integer.into(), scale)), + Variant::Decimal8 { integer, scale } => Some((integer, scale)), + Variant::Decimal16 { integer, scale } => { + if let Ok(converted_integer) = integer.try_into() { + Some((converted_integer, scale)) + } else { + None + } + } + _ => None, + } + } + + /// Converts this variant to tuple with a 16-byte unscaled value if possible. + /// + /// Returns `Some((i128, u8))` for decimal variants where the unscaled value + /// fits in `i128` range, + /// `None` for non-decimal variants or decimal values that would overflow. 
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use parquet_variant::Variant;
+    ///
+    /// // you can extract decimal parts from smaller or equally-sized decimal variants
+    /// let v1 = Variant::from((1234_i128, 2));
+    /// assert_eq!(v1.as_decimal_int128(), Some((1234_i128, 2)));
+    ///
+    /// // but not if the variant is not a decimal
+    /// let v2 = Variant::from("hello!");
+    /// assert_eq!(v2.as_decimal_int128(), None);
+    /// ```
+    pub fn as_decimal_int128(&self) -> Option<(i128, u8)> {
+        match *self {
+            Variant::Decimal4 { integer, scale } => Some((integer.into(), scale)),
+            Variant::Decimal8 { integer, scale } => Some((integer.into(), scale)),
+            Variant::Decimal16 { integer, scale } => Some((integer, scale)),
+            _ => None,
+        }
+    }
+    /// Converts this variant to an `f32` if possible.
+    ///
+    /// Returns `Some(f32)` for float and double variants (doubles are narrowed with
+    /// `as f32`, so precision may be lost), `None` for non-floating-point variants.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use parquet_variant::Variant;
+    ///
+    /// // you can extract an f32 from a float variant
+    /// let v1 = Variant::from(std::f32::consts::PI);
+    /// assert_eq!(v1.as_f32(), Some(std::f32::consts::PI));
+    ///
+    /// // and from a double variant (with loss of precision to nearest f32)
+    /// let v2 = Variant::from(std::f64::consts::PI);
+    /// assert_eq!(v2.as_f32(), Some(std::f32::consts::PI));
+    ///
+    /// // but not from other variants
+    /// let v3 = Variant::from("hello!");
+    /// assert_eq!(v3.as_f32(), None);
+    /// ```
+    #[allow(clippy::cast_possible_truncation)]
+    pub fn as_f32(&self) -> Option<f32> {
+        match *self {
+            Variant::Float(i) => Some(i),
+            Variant::Double(i) => Some(i as f32),
+            _ => None,
+        }
+    }
+
+    /// Converts this variant to an `f64` if possible.
+    ///
+    /// Returns `Some(f64)` for float and double variants,
+    /// `None` for non-floating-point variants.
+ /// + /// # Examples + /// + /// ``` + /// use parquet_variant::Variant; + /// + /// // you can extract an f64 from a float variant + /// let v1 = Variant::from(std::f32::consts::PI); + /// assert_eq!(v1.as_f64(), Some(std::f32::consts::PI as f64)); + /// + /// // and from a double variant + /// let v2 = Variant::from(std::f64::consts::PI); + /// assert_eq!(v2.as_f64(), Some(std::f64::consts::PI)); + /// + /// // but not from other variants + /// let v3 = Variant::from("hello!"); + /// assert_eq!(v3.as_f64(), None); + /// ``` + pub fn as_f64(&self) -> Option { + match *self { + Variant::Float(i) => Some(i.into()), + Variant::Double(i) => Some(i), _ => None, } } @@ -477,22 +1001,109 @@ impl<'m, 'v> Variant<'m, 'v> { } } -impl<'m, 'v> From for Variant<'m, 'v> { +impl From<()> for Variant<'_, '_> { + fn from((): ()) -> Self { + Variant::Null + } +} + +impl From for Variant<'_, '_> { fn from(value: i8) -> Self { Variant::Int8(value) } } -impl<'m, 'v> From for Variant<'m, 'v> { +impl From for Variant<'_, '_> { + fn from(value: i16) -> Self { + Variant::Int16(value) + } +} + +impl From for Variant<'_, '_> { + fn from(value: i32) -> Self { + Variant::Int32(value) + } +} + +impl From for Variant<'_, '_> { + fn from(value: i64) -> Self { + Variant::Int64(value) + } +} + +impl From<(i32, u8)> for Variant<'_, '_> { + fn from(value: (i32, u8)) -> Self { + Variant::Decimal4 { + integer: value.0, + scale: value.1, + } + } +} + +impl From<(i64, u8)> for Variant<'_, '_> { + fn from(value: (i64, u8)) -> Self { + Variant::Decimal8 { + integer: value.0, + scale: value.1, + } + } +} + +impl From<(i128, u8)> for Variant<'_, '_> { + fn from(value: (i128, u8)) -> Self { + Variant::Decimal16 { + integer: value.0, + scale: value.1, + } + } +} + +impl From for Variant<'_, '_> { + fn from(value: f32) -> Self { + Variant::Float(value) + } +} + +impl From for Variant<'_, '_> { + fn from(value: f64) -> Self { + Variant::Double(value) + } +} + +impl From for Variant<'_, '_> { fn from(value: 
bool) -> Self { - match value { - true => Variant::BooleanTrue, - false => Variant::BooleanFalse, + if value { + Variant::BooleanTrue + } else { + Variant::BooleanFalse } } } -impl<'m, 'v> From<&'v str> for Variant<'m, 'v> { +impl From for Variant<'_, '_> { + fn from(value: NaiveDate) -> Self { + Variant::Date(value) + } +} + +impl From> for Variant<'_, '_> { + fn from(value: DateTime) -> Self { + Variant::TimestampMicros(value) + } +} +impl From for Variant<'_, '_> { + fn from(value: NaiveDateTime) -> Self { + Variant::TimestampNtzMicros(value) + } +} + +impl<'v> From<&'v [u8]> for Variant<'_, 'v> { + fn from(value: &'v [u8]) -> Self { + Variant::Binary(value) + } +} + +impl<'v> From<&'v str> for Variant<'_, 'v> { fn from(value: &'v str) -> Self { if value.len() < 64 { Variant::ShortString(value) diff --git a/parquet-variant/tests/variant_interop.rs b/parquet-variant/tests/variant_interop.rs index 617a10d63d12..0db54e969a92 100644 --- a/parquet-variant/tests/variant_interop.rs +++ b/parquet-variant/tests/variant_interop.rs @@ -24,6 +24,7 @@ use std::fs; use std::path::{Path, PathBuf}; use arrow_schema::ArrowError; +use chrono::NaiveDate; use parquet_variant::{Variant, VariantMetadata}; fn cases_dir() -> PathBuf { @@ -46,22 +47,23 @@ fn get_primitive_cases() -> Vec<(&'static str, Variant<'static, 'static>)> { // Cases are commented out // Enabling is tracked in https://github.com/apache/arrow-rs/issues/7630 vec![ - // ("primitive_binary", Variant::Binary), + ("primitive_binary", Variant::Binary(&[0x03, 0x13, 0x37, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe])), ("primitive_boolean_false", Variant::BooleanFalse), ("primitive_boolean_true", Variant::BooleanTrue), - // ("primitive_date", Variant::Null), - //("primitive_decimal4", Variant::Null), - //("primitive_decimal8", Variant::Null), - //("primitive_decimal16", Variant::Null), - //("primitive_float", Variant::Null), + ("primitive_date", Variant::Date(NaiveDate::from_ymd_opt(2025, 4 , 16).unwrap())), + 
("primitive_decimal4", Variant::Decimal4{integer: 1234, scale: 2}), + ("primitive_decimal8", Variant::Decimal8{integer: 1234567890, scale: 2}), + ("primitive_decimal16", Variant::Decimal16{integer: 1234567891234567890, scale: 2}), + ("primitive_float", Variant::Float(1234567890.1234)), + ("primitive_double", Variant::Double(1234567890.1234)), ("primitive_int8", Variant::Int8(42)), - //("primitive_int16", Variant::Null), - //("primitive_int32", Variant::Null), - //("primitive_int64", Variant::Null), + ("primitive_int16", Variant::Int16(1234)), + ("primitive_int32", Variant::Int32(123456)), + ("primitive_int64", Variant::Int64(1234567890123456789)), ("primitive_null", Variant::Null), ("primitive_string", Variant::String("This string is longer than 64 bytes and therefore does not fit in a short_string and it also includes several non ascii characters such as 🐢, 💖, ♥\u{fe0f}, 🎣 and 🤦!!")), - //("primitive_timestamp", Variant::Null), - //("primitive_timestampntz", Variant::Null), + ("primitive_timestamp", Variant::TimestampMicros(NaiveDate::from_ymd_opt(2025, 4, 16).unwrap().and_hms_milli_opt(16, 34, 56, 780).unwrap().and_utc())), + ("primitive_timestampntz", Variant::TimestampNtzMicros(NaiveDate::from_ymd_opt(2025, 4, 16).unwrap().and_hms_milli_opt(12, 34, 56, 780).unwrap())), ("short_string", Variant::ShortString("Less than 64 bytes (❤\u{fe0f} with utf8)")), ] }