diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 926a4d4efc97..7eeb4da632e4 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -591,13 +591,19 @@ mod tests { Arc::new(microsecond_array.with_timezone("+01:00".to_string())), ); - // nanoseconds should get truncated to microseconds + let timestamp = DateTime::from_timestamp_nanos(nanosecond); let nanosecond_array = TimestampNanosecondArray::from(vec![Some(nanosecond), None]); - run_array_tests( - microsecond, + run_test( Arc::new(nanosecond_array.clone()), + vec![ + Some(Variant::TimestampNtzNanos(timestamp.naive_utc())), + None, + ], + ); + run_test( Arc::new(nanosecond_array.with_timezone("+01:00".to_string())), - ) + vec![Some(Variant::TimestampNanos(timestamp)), None], + ); } #[test] diff --git a/parquet-variant-json/Cargo.toml b/parquet-variant-json/Cargo.toml index 76255f0681cd..5d8e02546b09 100644 --- a/parquet-variant-json/Cargo.toml +++ b/parquet-variant-json/Cargo.toml @@ -37,6 +37,7 @@ parquet-variant = { path = "../parquet-variant" } chrono = { workspace = true } serde_json = "1.0" base64 = "0.22" +uuid = "1.18.0" [lib] diff --git a/parquet-variant-json/src/to_json.rs b/parquet-variant-json/src/to_json.rs index 4753d6cc96ed..b1894a64f837 100644 --- a/parquet-variant-json/src/to_json.rs +++ b/parquet-variant-json/src/to_json.rs @@ -181,9 +181,14 @@ impl<'m, 'v> VariantToJson for Variant<'m, 'v> { Variant::Decimal8(decimal) => write!(buffer, "{decimal}")?, Variant::Decimal16(decimal) => write!(buffer, "{decimal}")?, Variant::Date(date) => write!(buffer, "\"{}\"", format_date_string(date))?, - Variant::TimestampMicros(ts) => write!(buffer, "\"{}\"", ts.to_rfc3339())?, + Variant::TimestampMicros(ts) | Variant::TimestampNanos(ts) => { + write!(buffer, "\"{}\"", ts.to_rfc3339())? + } Variant::TimestampNtzMicros(ts) => { - write!(buffer, "\"{}\"", format_timestamp_ntz_string(ts))? + write!(buffer, "\"{}\"", format_timestamp_ntz_string(ts, 6))? + } + Variant::TimestampNtzNanos(ts) => { + write!(buffer, "\"{}\"", format_timestamp_ntz_string(ts, 9))? } Variant::Time(time) => write!(buffer, "\"{}\"", format_time_ntz_str(time))?, Variant::Binary(bytes) => { @@ -208,6 +213,9 @@ impl<'m, 'v> VariantToJson for Variant<'m, 'v> { })?; write!(buffer, "{json_str}")? } + Variant::Uuid(uuid) => { + write!(buffer, "\"{uuid}\"")?; + } Variant::Object(obj) => { convert_object_to_json(buffer, obj)?; } @@ -297,12 +305,18 @@ impl<'m, 'v> VariantToJson for Variant<'m, 'v> { Ok(value) } Variant::Date(date) => Ok(Value::String(format_date_string(date))), - Variant::TimestampMicros(ts) => Ok(Value::String(ts.to_rfc3339())), - Variant::TimestampNtzMicros(ts) => Ok(Value::String(format_timestamp_ntz_string(ts))), + Variant::TimestampMicros(ts) | Variant::TimestampNanos(ts) => { + Ok(Value::String(ts.to_rfc3339())) + } + Variant::TimestampNtzMicros(ts) => { + Ok(Value::String(format_timestamp_ntz_string(ts, 6))) + } + Variant::TimestampNtzNanos(ts) => Ok(Value::String(format_timestamp_ntz_string(ts, 9))), Variant::Time(time) => Ok(Value::String(format_time_ntz_str(time))), Variant::Binary(bytes) => Ok(Value::String(format_binary_base64(bytes))), Variant::String(s) => Ok(Value::String(s.to_string())), Variant::ShortString(s) => Ok(Value::String(s.to_string())), + Variant::Uuid(uuid) => Ok(Value::String(uuid.to_string())), Variant::Object(obj) => { let map = obj .iter() @@ -323,15 +337,18 @@ impl<'m, 'v> VariantToJson for Variant<'m, 'v> { // Format string constants to avoid duplication and reduce errors const DATE_FORMAT: &str = "%Y-%m-%d"; -const TIMESTAMP_NTZ_FORMAT: &str = "%Y-%m-%dT%H:%M:%S%.6f"; // Helper functions for consistent formatting fn format_date_string(date: &chrono::NaiveDate) -> String { date.format(DATE_FORMAT).to_string() } -fn format_timestamp_ntz_string(ts: &chrono::NaiveDateTime) -> String { - ts.format(TIMESTAMP_NTZ_FORMAT).to_string() +fn format_timestamp_ntz_string(ts: &chrono::NaiveDateTime, precision: usize) -> String { + let format_str = format!( + "{}", + ts.format(&format!("%Y-%m-%dT%H:%M:%S%.{}f", precision)) + ); + ts.format(format_str.as_str()).to_string() } fn format_binary_base64(bytes: &[u8]) -> String { @@ -497,6 +514,34 @@ mod tests { Ok(()) } + #[test] + fn test_timestamp_nanos_to_json() -> Result<(), ArrowError> { + let timestamp = DateTime::parse_from_rfc3339("2023-12-25T10:30:45.123456789Z") + .unwrap() + .with_timezone(&Utc); + let variant = Variant::TimestampNanos(timestamp); + let json = variant.to_json_string()?; + assert_eq!(json, "\"2023-12-25T10:30:45.123456789+00:00\""); + + let json_value = variant.to_json_value()?; + assert!(matches!(json_value, Value::String(_))); + Ok(()) + } + + #[test] + fn test_timestamp_ntz_nanos_to_json() -> Result<(), ArrowError> { + let naive_timestamp = DateTime::from_timestamp(1703505045, 123456789) + .unwrap() + .naive_utc(); + let variant = Variant::TimestampNtzNanos(naive_timestamp); + let json = variant.to_json_string()?; + assert_eq!(json, "\"2023-12-25T11:50:45.123456789\""); + + let json_value = variant.to_json_value()?; + assert!(matches!(json_value, Value::String(_))); + Ok(()) + } + #[test] fn test_binary_to_json() -> Result<(), ArrowError> { let binary_data = b"Hello, World!"; @@ -546,6 +591,21 @@ mod tests { Ok(()) } + #[test] + fn test_uuid_to_json() -> Result<(), ArrowError> { + let uuid = uuid::Uuid::parse_str("123e4567-e89b-12d3-a456-426614174000").unwrap(); + let variant = Variant::Uuid(uuid); + let json = variant.to_json_string()?; + assert_eq!(json, "\"123e4567-e89b-12d3-a456-426614174000\""); + + let json_value = variant.to_json_value()?; + assert_eq!( + json_value, + Value::String("123e4567-e89b-12d3-a456-426614174000".to_string()) + ); + Ok(()) + } + #[test] fn test_string_escaping() -> Result<(), ArrowError> { let variant = Variant::from("hello\nworld\t\"quoted\""); diff --git a/parquet-variant/Cargo.toml b/parquet-variant/Cargo.toml index 51fa4cc23311..9e0fa988287b 100644 --- a/parquet-variant/Cargo.toml +++ b/parquet-variant/Cargo.toml @@ -34,6 +34,7 @@ rust-version = { workspace = true } arrow-schema = { workspace = true } chrono = { workspace = true } indexmap = "2.10.0" +uuid = { version = "1.18.0"} simdutf8 = { workspace = true , optional = true } diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index 6ab51ac23e63..fe3dd52853d1 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -23,6 +23,7 @@ use arrow_schema::ArrowError; use chrono::Timelike; use indexmap::{IndexMap, IndexSet}; use std::collections::HashSet; +use uuid::Uuid; const BASIC_TYPE_BITS: u8 = 2; const UNIX_EPOCH_DATE: chrono::NaiveDate = chrono::NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(); @@ -198,6 +199,23 @@ impl ValueBuffer { self.append_slice(µs_from_midnight.to_le_bytes()); } + fn append_timestamp_nanos(&mut self, value: chrono::DateTime) { + self.append_primitive_header(VariantPrimitiveType::TimestampNanos); + let nanos = value.timestamp_nanos_opt().unwrap(); + self.append_slice(&nanos.to_le_bytes()); + } + + fn append_timestamp_ntz_nanos(&mut self, value: chrono::NaiveDateTime) { + self.append_primitive_header(VariantPrimitiveType::TimestampNtzNanos); + let nanos = value.and_utc().timestamp_nanos_opt().unwrap(); + self.append_slice(&nanos.to_le_bytes()); + } + + fn append_uuid(&mut self, value: Uuid) { + self.append_primitive_header(VariantPrimitiveType::Uuid); + self.append_slice(&value.into_bytes()); + } + fn append_decimal4(&mut self, decimal4: VariantDecimal4) { self.append_primitive_header(VariantPrimitiveType::Decimal4); self.append_u8(decimal4.scale()); @@ -332,6 +350,8 @@ impl ValueBuffer { Variant::Date(v) => self.append_date(v), Variant::TimestampMicros(v) => self.append_timestamp_micros(v), Variant::TimestampNtzMicros(v) => self.append_timestamp_ntz_micros(v), + Variant::TimestampNanos(v) => self.append_timestamp_nanos(v), + Variant::TimestampNtzNanos(v) => self.append_timestamp_ntz_nanos(v), Variant::Decimal4(decimal4) => self.append_decimal4(decimal4), Variant::Decimal8(decimal8) => self.append_decimal8(decimal8), Variant::Decimal16(decimal16) => self.append_decimal16(decimal16), @@ -340,6 +360,7 @@ impl ValueBuffer { Variant::Binary(v) => self.append_binary(v), Variant::String(s) => self.append_string(s), Variant::ShortString(s) => self.append_short_string(s), + Variant::Uuid(v) => self.append_uuid(v), Variant::Object(obj) => self.append_object(metadata_builder, obj), Variant::List(list) => self.append_list(metadata_builder, list), Variant::Time(v) => self.append_time_micros(v), @@ -363,12 +384,15 @@ impl ValueBuffer { Variant::Date(v) => self.append_date(v), Variant::TimestampMicros(v) => self.append_timestamp_micros(v), Variant::TimestampNtzMicros(v) => self.append_timestamp_ntz_micros(v), + Variant::TimestampNanos(v) => self.append_timestamp_nanos(v), + Variant::TimestampNtzNanos(v) => self.append_timestamp_ntz_nanos(v), Variant::Decimal4(decimal4) => self.append_decimal4(decimal4), Variant::Decimal8(decimal8) => self.append_decimal8(decimal8), Variant::Decimal16(decimal16) => self.append_decimal16(decimal16), Variant::Float(v) => self.append_float(v), Variant::Double(v) => self.append_double(v), Variant::Binary(v) => self.append_binary(v), + Variant::Uuid(v) => self.append_uuid(v), Variant::String(s) => self.append_string(s), Variant::ShortString(s) => self.append_short_string(s), Variant::Object(obj) => self.try_append_object(metadata_builder, obj)?, diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs index ff870596e4de..26b4e204fa69 100644 --- a/parquet-variant/src/decoder.rs +++ b/parquet-variant/src/decoder.rs @@ -21,6 +21,7 @@ use crate::ShortString; use arrow_schema::ArrowError; use chrono::{DateTime, Duration, NaiveDate, NaiveDateTime, NaiveTime, Utc}; +use uuid::Uuid; /// The basic type of a [`Variant`] value, encoded in the first two bits of the /// header byte. @@ -64,6 +65,9 @@ pub enum VariantPrimitiveType { Binary = 15, String = 16, Time = 17, + TimestampNanos = 18, + TimestampNtzNanos = 19, + Uuid = 20, } /// Extracts the basic type from a header byte @@ -106,6 +110,9 @@ impl TryFrom for VariantPrimitiveType { 15 => Ok(VariantPrimitiveType::Binary), 16 => Ok(VariantPrimitiveType::String), 17 => Ok(VariantPrimitiveType::Time), + 18 => Ok(VariantPrimitiveType::TimestampNanos), + 19 => Ok(VariantPrimitiveType::TimestampNtzNanos), + 20 => Ok(VariantPrimitiveType::Uuid), _ => Err(ArrowError::InvalidArgumentError(format!( "unknown primitive type: {value}", ))), @@ -316,6 +323,25 @@ pub(crate) fn decode_time_ntz(data: &[u8]) -> Result { .ok_or(case_error) } +/// Decodes a TimestampNanos from the value section of a variant. +pub(crate) fn decode_timestamp_nanos(data: &[u8]) -> Result, ArrowError> { + let nanos_since_epoch = i64::from_le_bytes(array_from_slice(data, 0)?); + + // DateTime::from_timestamp_nanos would never fail + Ok(DateTime::from_timestamp_nanos(nanos_since_epoch)) +} + +/// Decodes a TimestampNtzNanos from the value section of a variant. +pub(crate) fn decode_timestampntz_nanos(data: &[u8]) -> Result { + decode_timestamp_nanos(data).map(|v| v.naive_utc()) +} + +/// Decodes a UUID from the value section of a variant. +pub(crate) fn decode_uuid(data: &[u8]) -> Result { + Uuid::from_slice(&data[0..16]) + .map_err(|_| ArrowError::CastError(format!("Cant decode uuid from {:?}", &data[0..16]))) +} + /// Decodes a Binary from the value section of a variant. pub(crate) fn decode_binary(data: &[u8]) -> Result<&[u8], ArrowError> { let len = u32::from_le_bytes(array_from_slice(data, 0)?) as usize; @@ -460,6 +486,61 @@ mod tests { .and_hms_milli_opt(16, 34, 56, 780) .unwrap() ); + + test_decoder_bounds!( + test_timestamp_nanos, + [0x15, 0x41, 0xa2, 0x5a, 0x36, 0xa2, 0x5b, 0x18], + decode_timestamp_nanos, + NaiveDate::from_ymd_opt(2025, 8, 14) + .unwrap() + .and_hms_nano_opt(12, 33, 54, 123456789) + .unwrap() + .and_utc() + ); + + test_decoder_bounds!( + test_timestamp_nanos_before_epoch, + [0x15, 0x41, 0x52, 0xd4, 0x94, 0xe5, 0xad, 0xfa], + decode_timestamp_nanos, + NaiveDate::from_ymd_opt(1957, 11, 7) + .unwrap() + .and_hms_nano_opt(12, 33, 54, 123456789) + .unwrap() + .and_utc() + ); + + test_decoder_bounds!( + test_timestampntz_nanos, + [0x15, 0x41, 0xa2, 0x5a, 0x36, 0xa2, 0x5b, 0x18], + decode_timestampntz_nanos, + NaiveDate::from_ymd_opt(2025, 8, 14) + .unwrap() + .and_hms_nano_opt(12, 33, 54, 123456789) + .unwrap() + ); + + test_decoder_bounds!( + test_timestampntz_nanos_before_epoch, + [0x15, 0x41, 0x52, 0xd4, 0x94, 0xe5, 0xad, 0xfa], + decode_timestampntz_nanos, + NaiveDate::from_ymd_opt(1957, 11, 7) + .unwrap() + .and_hms_nano_opt(12, 33, 54, 123456789) + .unwrap() + ); + } + + #[test] + fn test_uuid() { + let data = [ + 0xf2, 0x4f, 0x9b, 0x64, 0x81, 0xfa, 0x49, 0xd1, 0xb7, 0x4e, 0x8c, 0x09, 0xa6, 0xe3, + 0x1c, 0x56, + ]; + let result = decode_uuid(&data).unwrap(); + assert_eq!( + Uuid::parse_str("f24f9b64-81fa-49d1-b74e-8c09a6e31c56").unwrap(), + result + ); } mod time { diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 62da32bebdb7..0bf3eed9790a 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -27,7 +27,8 @@ use crate::utils::{first_byte_from_slice, slice_from_slice}; use std::ops::Deref; use arrow_schema::ArrowError; -use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, Utc}; +use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, Timelike, Utc}; +use uuid::Uuid; mod decimal; mod list; @@ -229,6 +230,10 @@ pub enum Variant<'m, 'v> { TimestampMicros(DateTime), /// Primitive (type_id=1): TIMESTAMP(isAdjustedToUTC=false, MICROS) TimestampNtzMicros(NaiveDateTime), + /// Primitive (type_id=1): TIMESTAMP(isAdjustedToUTC=true, NANOS) + TimestampNanos(DateTime), + /// Primitive (type_id=1): TIMESTAMP(isAdjustedToUTC=false, NANOS) + TimestampNtzNanos(NaiveDateTime), /// Primitive (type_id=1): DECIMAL(precision, scale) 32-bits Decimal4(VariantDecimal4), /// Primitive (type_id=1): DECIMAL(precision, scale) 64-bits @@ -250,6 +255,8 @@ pub enum Variant<'m, 'v> { String(&'v str), /// Primitive (type_id=1): TIME(isAdjustedToUTC=false, MICROS) Time(NaiveTime), + /// Primitive (type_id=1): UUID + Uuid(Uuid), /// Short String (type_id=2): STRING ShortString(ShortString<'v>), // need both metadata & value @@ -381,6 +388,13 @@ impl<'m, 'v> Variant<'m, 'v> { VariantPrimitiveType::TimestampNtzMicros => { Variant::TimestampNtzMicros(decoder::decode_timestampntz_micros(value_data)?) } + VariantPrimitiveType::TimestampNanos => { + Variant::TimestampNanos(decoder::decode_timestamp_nanos(value_data)?) + } + VariantPrimitiveType::TimestampNtzNanos => { + Variant::TimestampNtzNanos(decoder::decode_timestampntz_nanos(value_data)?) + } + VariantPrimitiveType::Uuid => Variant::Uuid(decoder::decode_uuid(value_data)?), VariantPrimitiveType::Binary => { Variant::Binary(decoder::decode_binary(value_data)?) } @@ -528,11 +542,9 @@ impl<'m, 'v> Variant<'m, 'v> { /// let datetime = NaiveDate::from_ymd_opt(2025, 4, 16).unwrap().and_hms_milli_opt(12, 34, 56, 780).unwrap().and_utc(); /// let v1 = Variant::from(datetime); /// assert_eq!(v1.as_datetime_utc(), Some(datetime)); - /// - /// // or a non-UTC-adjusted variant - /// let datetime = NaiveDate::from_ymd_opt(2025, 4, 16).unwrap().and_hms_milli_opt(12, 34, 56, 780).unwrap(); - /// let v2 = Variant::from(datetime); - /// assert_eq!(v2.as_datetime_utc(), Some(datetime.and_utc())); + /// let datetime_nanos = NaiveDate::from_ymd_opt(2025, 8, 14).unwrap().and_hms_nano_opt(12, 33, 54, 123456789).unwrap().and_utc(); + /// let v2 = Variant::from(datetime_nanos); + /// assert_eq!(v2.as_datetime_utc(), Some(datetime_nanos)); /// /// // but not from other variants /// let v3 = Variant::from("hello!"); @@ -540,8 +552,7 @@ impl<'m, 'v> Variant<'m, 'v> { /// ``` pub fn as_datetime_utc(&self) -> Option> { match *self { - Variant::TimestampMicros(d) => Some(d), - Variant::TimestampNtzMicros(d) => Some(d.and_utc()), + Variant::TimestampMicros(d) | Variant::TimestampNanos(d) => Some(d), _ => None, } } @@ -563,9 +574,9 @@ impl<'m, 'v> Variant<'m, 'v> { /// assert_eq!(v1.as_naive_datetime(), Some(datetime)); /// /// // or a UTC-adjusted variant - /// let datetime = NaiveDate::from_ymd_opt(2025, 4, 16).unwrap().and_hms_milli_opt(12, 34, 56, 780).unwrap().and_utc(); + /// let datetime = NaiveDate::from_ymd_opt(2025, 4, 16).unwrap().and_hms_nano_opt(12, 34, 56, 123456789).unwrap(); /// let v2 = Variant::from(datetime); - /// assert_eq!(v2.as_naive_datetime(), Some(datetime.naive_utc())); + /// assert_eq!(v2.as_naive_datetime(), Some(datetime)); /// /// // but not from other variants /// let v3 = Variant::from("hello!"); @@ -573,8 +584,7 @@ impl<'m, 'v> Variant<'m, 'v> { /// ``` pub fn as_naive_datetime(&self) -> Option { match *self { - Variant::TimestampNtzMicros(d) => Some(d), - Variant::TimestampMicros(d) => Some(d.naive_utc()), + Variant::TimestampNtzMicros(d) | Variant::TimestampNtzNanos(d) => Some(d), _ => None, } } @@ -632,6 +642,32 @@ impl<'m, 'v> Variant<'m, 'v> { } } + /// Converts this variant to a `uuid hyphenated string` if possible. + /// + /// Returns `Some(String)` for UUID variants, `None` for non-UUID variants. + /// + /// # Examples + /// + /// ``` + /// use parquet_variant::Variant; + /// + /// // You can extract a UUID from a UUID variant + /// let s = uuid::Uuid::parse_str("67e55044-10b1-426f-9247-bb680e5fe0c8").unwrap(); + /// let v1 = Variant::Uuid(s); + /// assert_eq!(s, v1.as_uuid().unwrap()); + /// assert_eq!("67e55044-10b1-426f-9247-bb680e5fe0c8", v1.as_uuid().unwrap().to_string()); + /// + /// //but not from other variants + /// let v2 = Variant::from(1234); + /// assert_eq!(None, v2.as_uuid()) + /// ``` + pub fn as_uuid(&self) -> Option { + match self { + Variant::Uuid(u) => Some(*u), + _ => None, + } + } + /// Converts this variant to an `i8` if possible. /// /// Returns `Some(i8)` for integer variants that fit in `i8` range, @@ -1262,12 +1298,21 @@ impl From for Variant<'_, '_> { impl From> for Variant<'_, '_> { fn from(value: DateTime) -> Self { - Variant::TimestampMicros(value) + if value.nanosecond() % 1000 > 0 { + Variant::TimestampNanos(value) + } else { + Variant::TimestampMicros(value) + } } } + impl From for Variant<'_, '_> { fn from(value: NaiveDateTime) -> Self { - Variant::TimestampNtzMicros(value) + if value.nanosecond() % 1000 > 0 { + Variant::TimestampNtzNanos(value) + } else { + Variant::TimestampNtzMicros(value) + } } } @@ -1367,10 +1412,13 @@ impl std::fmt::Debug for Variant<'_, '_> { Variant::TimestampNtzMicros(ts) => { f.debug_tuple("TimestampNtzMicros").field(ts).finish() } + Variant::TimestampNanos(ts) => f.debug_tuple("TimestampNanos").field(ts).finish(), + Variant::TimestampNtzNanos(ts) => f.debug_tuple("TimestampNtzNanos").field(ts).finish(), Variant::Binary(bytes) => write!(f, "Binary({:?})", HexString(bytes)), Variant::String(s) => f.debug_tuple("String").field(s).finish(), Variant::Time(s) => f.debug_tuple("Time").field(s).finish(), Variant::ShortString(s) => f.debug_tuple("ShortString").field(s).finish(), + Variant::Uuid(uuid) => f.debug_tuple("Uuid").field(&uuid).finish(), Variant::Object(obj) => { let mut map = f.debug_map(); for res in obj.iter_try() { @@ -1476,6 +1524,25 @@ mod tests { Variant::TimestampNtzMicros(timestamp_ntz), ); + let timestamp_nanos_utc = chrono::NaiveDate::from_ymd_opt(2025, 8, 15) + .unwrap() + .and_hms_nano_opt(12, 3, 4, 123456789) + .unwrap() + .and_utc(); + root_obj.insert( + "timestamp_nanos", + Variant::TimestampNanos(timestamp_nanos_utc), + ); + + let timestamp_ntz_nanos = chrono::NaiveDate::from_ymd_opt(2025, 8, 15) + .unwrap() + .and_hms_nano_opt(12, 3, 4, 123456789) + .unwrap(); + root_obj.insert( + "timestamp_ntz_nanos", + Variant::TimestampNtzNanos(timestamp_ntz_nanos), + ); + // Add decimal types let decimal4 = VariantDecimal4::try_new(1234i32, 2).unwrap(); root_obj.insert("decimal4", decimal4); @@ -1497,6 +1564,10 @@ mod tests { let time = NaiveTime::from_hms_micro_opt(1, 2, 3, 4).unwrap(); root_obj.insert("time", time); + // Add uuid + let uuid = Uuid::parse_str("67e55044-10b1-426f-9247-bb680e5fe0c8").unwrap(); + root_obj.insert("uuid", Variant::Uuid(uuid)); + // Add nested object let mut nested_obj = root_obj.new_object("nested_object"); nested_obj.insert("inner_key1", "inner_value1"); @@ -1540,17 +1611,20 @@ mod tests { assert!(debug_output.contains("\"date\": Date(2024-12-25)")); assert!(debug_output.contains("\"timestamp_micros\": TimestampMicros(")); assert!(debug_output.contains("\"timestamp_ntz_micros\": TimestampNtzMicros(")); + assert!(debug_output.contains("\"timestamp_nanos\": TimestampNanos(")); + assert!(debug_output.contains("\"timestamp_ntz_nanos\": TimestampNtzNanos(")); assert!(debug_output.contains("\"decimal4\": Decimal4(")); assert!(debug_output.contains("\"decimal8\": Decimal8(")); assert!(debug_output.contains("\"decimal16\": Decimal16(")); assert!(debug_output.contains("\"binary\": Binary(01 02 03 04 de ad be ef)")); assert!(debug_output.contains("\"string\": String(")); assert!(debug_output.contains("\"short_string\": ShortString(")); + assert!(debug_output.contains("\"uuid\": Uuid(67e55044-10b1-426f-9247-bb680e5fe0c8)")); assert!(debug_output.contains("\"time\": Time(01:02:03.000004)")); assert!(debug_output.contains("\"nested_object\":")); assert!(debug_output.contains("\"mixed_list\":")); - let expected = r#"{"binary": Binary(01 02 03 04 de ad be ef), "boolean_false": BooleanFalse, "boolean_true": BooleanTrue, "date": Date(2024-12-25), "decimal16": Decimal16(VariantDecimal16 { integer: 123456789012345678901234567890, scale: 4 }), "decimal4": Decimal4(VariantDecimal4 { integer: 1234, scale: 2 }), "decimal8": Decimal8(VariantDecimal8 { integer: 123456789, scale: 3 }), "double": Double(1.23456789), "float": Float(1.234), "int16": Int16(1234), "int32": Int32(123456), "int64": Int64(1234567890123456789), "int8": Int8(42), "mixed_list": [Int32(1), ShortString(ShortString("two")), BooleanTrue, Float(4.0), Null, [ShortString(ShortString("nested")), Int8(10)]], "nested_object": {"inner_key1": ShortString(ShortString("inner_value1")), "inner_key2": Int32(999)}, "null": Null, "short_string": ShortString(ShortString("Short string with emoji 🎉")), "string": String("This is a long string that exceeds the short string limit and contains emoji 🦀"), "time": Time(01:02:03.000004), "timestamp_micros": TimestampMicros(2024-12-25T15:30:45.123Z), "timestamp_ntz_micros": TimestampNtzMicros(2024-12-25T15:30:45.123)}"#; + let expected = r#"{"binary": Binary(01 02 03 04 de ad be ef), "boolean_false": BooleanFalse, "boolean_true": BooleanTrue, "date": Date(2024-12-25), "decimal16": Decimal16(VariantDecimal16 { integer: 123456789012345678901234567890, scale: 4 }), "decimal4": Decimal4(VariantDecimal4 { integer: 1234, scale: 2 }), "decimal8": Decimal8(VariantDecimal8 { integer: 123456789, scale: 3 }), "double": Double(1.23456789), "float": Float(1.234), "int16": Int16(1234), "int32": Int32(123456), "int64": Int64(1234567890123456789), "int8": Int8(42), "mixed_list": [Int32(1), ShortString(ShortString("two")), BooleanTrue, Float(4.0), Null, [ShortString(ShortString("nested")), Int8(10)]], "nested_object": {"inner_key1": ShortString(ShortString("inner_value1")), "inner_key2": Int32(999)}, "null": Null, "short_string": ShortString(ShortString("Short string with emoji 🎉")), "string": String("This is a long string that exceeds the short string limit and contains emoji 🦀"), "time": Time(01:02:03.000004), "timestamp_micros": TimestampMicros(2024-12-25T15:30:45.123Z), "timestamp_nanos": TimestampNanos(2025-08-15T12:03:04.123456789Z), "timestamp_ntz_micros": TimestampNtzMicros(2024-12-25T15:30:45.123), "timestamp_ntz_nanos": TimestampNtzNanos(2025-08-15T12:03:04.123456789), "uuid": Uuid(67e55044-10b1-426f-9247-bb680e5fe0c8)}"#; assert_eq!(debug_output, expected); // Test alternate Debug formatter (#?) @@ -1648,9 +1722,18 @@ mod tests { "timestamp_micros": TimestampMicros( 2024-12-25T15:30:45.123Z, ), + "timestamp_nanos": TimestampNanos( + 2025-08-15T12:03:04.123456789Z, + ), "timestamp_ntz_micros": TimestampNtzMicros( 2024-12-25T15:30:45.123, ), + "timestamp_ntz_nanos": TimestampNtzNanos( + 2025-08-15T12:03:04.123456789, + ), + "uuid": Uuid( + 67e55044-10b1-426f-9247-bb680e5fe0c8, + ), }"#; assert_eq!(alt_debug_output, expected); } diff --git a/parquet-variant/tests/variant_interop.rs b/parquet-variant/tests/variant_interop.rs index 1c5b8ed221a6..518a77f53f7a 100644 --- a/parquet-variant/tests/variant_interop.rs +++ b/parquet-variant/tests/variant_interop.rs @@ -28,6 +28,7 @@ use parquet_variant::{ use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; +use uuid::Uuid; /// Returns a directory path for the parquet variant test data. /// @@ -126,6 +127,9 @@ fn get_primitive_cases() -> Vec<(&'static str, Variant<'static, 'static>)> { ("primitive_string", Variant::String("This string is longer than 64 bytes and therefore does not fit in a short_string and it also includes several non ascii characters such as 🐢, 💖, ♥\u{fe0f}, 🎣 and 🤦!!")), ("primitive_timestamp", Variant::TimestampMicros(NaiveDate::from_ymd_opt(2025, 4, 16).unwrap().and_hms_milli_opt(16, 34, 56, 780).unwrap().and_utc())), ("primitive_timestampntz", Variant::TimestampNtzMicros(NaiveDate::from_ymd_opt(2025, 4, 16).unwrap().and_hms_milli_opt(12, 34, 56, 780).unwrap())), + ("primitive_timestamp_nanos", Variant::TimestampNanos(NaiveDate::from_ymd_opt(2024, 11, 7).unwrap().and_hms_nano_opt(12, 33, 54, 123456789).unwrap().and_utc())), + ("primitive_timestampntz_nanos", Variant::TimestampNtzNanos(NaiveDate::from_ymd_opt(2024, 11, 7).unwrap().and_hms_nano_opt(12, 33, 54, 123456789).unwrap())), + ("primitive_uuid", Variant::Uuid(Uuid::parse_str("f24f9b64-81fa-49d1-b74e-8c09a6e31c56").unwrap())), ("short_string", Variant::ShortString(ShortString::try_new("Less than 64 bytes (❤\u{fe0f} with utf8)").unwrap())), ("primitive_time", Variant::Time(NaiveTime::from_hms_micro_opt(12, 33, 54, 123456).unwrap())), ]