From 0ead79ad93dcf963ca0bad0a1eacd344e7514edb Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 12 Feb 2026 10:09:00 -0700 Subject: [PATCH] fix: [df52] timestamp nanos precision loss with nanosAsLong MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When Spark's `LEGACY_PARQUET_NANOS_AS_LONG=true` converts TIMESTAMP(NANOS) to LongType, the PhysicalExprAdapter detects a type mismatch between the file's Timestamp(Nanosecond) and the logical Int64. The DefaultAdapter creates a CastColumnExpr, which SparkPhysicalExprAdapter then replaces with Spark's Cast expression. Spark's Cast postprocess for Timestamp→Int64 unconditionally divides by MICROS_PER_SECOND (10^6), assuming microsecond precision. But the values are nanoseconds, so the raw value 1668537129123534758 becomes 1668537129123 — losing sub-millisecond precision. Fix: route Timestamp→Int64 casts through CometCastColumnExpr (which uses spark_parquet_convert → Arrow cast) instead of Spark Cast. Arrow's cast correctly reinterprets the raw i64 value without any division. Co-Authored-By: Claude Opus 4.6 --- native/core/src/parquet/schema_adapter.rs | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/native/core/src/parquet/schema_adapter.rs b/native/core/src/parquet/schema_adapter.rs index f19ec39fca..491f0a8e85 100644 --- a/native/core/src/parquet/schema_adapter.rs +++ b/native/core/src/parquet/schema_adapter.rs @@ -302,10 +302,11 @@ impl SparkPhysicalExprAdapter { let physical_type = cast.input_field().data_type(); let target_type = cast.target_field().data_type(); - // For complex nested types (Struct, List, Map) and Timestamp timezone - // mismatches, use CometCastColumnExpr with spark_parquet_convert which - // handles field-name-based selection, reordering, nested type casting, - // and metadata-only timestamp timezone relabeling correctly. + // For complex nested types (Struct, List, Map), Timestamp timezone + // mismatches, and Timestamp→Int64 (nanosAsLong), use CometCastColumnExpr + // with spark_parquet_convert which handles field-name-based selection, + // reordering, nested type casting, metadata-only timestamp timezone + // relabeling, and raw value reinterpretation correctly. // // Timestamp mismatches (e.g., Timestamp(us, None) -> Timestamp(us, Some("UTC"))) // occur when INT96 Parquet timestamps are coerced to Timestamp(us, None) by @@ -313,12 +314,18 @@ impl SparkPhysicalExprAdapter { // Using Spark's Cast here would incorrectly treat the None-timezone values as // local time (TimestampNTZ) and apply a timezone conversion, but the values are // already in UTC. spark_parquet_convert handles this as a metadata-only change. + // + // Timestamp→Int64 occurs when Spark's `nanosAsLong` config converts + // TIMESTAMP(NANOS) to LongType. Spark's Cast would divide by MICROS_PER_SECOND + // (assuming microseconds), but the values are nanoseconds. Arrow cast correctly + // reinterprets the raw i64 value without conversion. if matches!( (physical_type, target_type), (DataType::Struct(_), DataType::Struct(_)) | (DataType::List(_), DataType::List(_)) | (DataType::Map(_, _), DataType::Map(_, _)) | (DataType::Timestamp(_, _), DataType::Timestamp(_, _)) + | (DataType::Timestamp(_, _), DataType::Int64) ) { let comet_cast: Arc = Arc::new( CometCastColumnExpr::new(