From 0ead79ad93dcf963ca0bad0a1eacd344e7514edb Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Thu, 12 Feb 2026 10:09:00 -0700
Subject: [PATCH] fix: [df52] timestamp nanos precision loss with nanosAsLong
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When Spark's `LEGACY_PARQUET_NANOS_AS_LONG=true` converts TIMESTAMP(NANOS)
to LongType, the PhysicalExprAdapter detects a type mismatch between the
file's Timestamp(Nanosecond) and the logical Int64. The DefaultAdapter
creates a CastColumnExpr, which SparkPhysicalExprAdapter then replaces
with Spark's Cast expression. Spark's Cast postprocess step for
Timestamp→Int64 unconditionally divides by MICROS_PER_SECOND (10^6),
assuming microsecond precision. But the values are nanoseconds, so the
raw value 1668537129123534758 becomes 1668537129123 (a millisecond
count instead of the original nanosecond count), losing sub-millisecond
precision.

Fix: route Timestamp→Int64 casts through CometCastColumnExpr (which uses
spark_parquet_convert → Arrow cast) instead of Spark Cast. Arrow's cast
correctly reinterprets the raw i64 value without any division.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 native/core/src/parquet/schema_adapter.rs | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)
diff --git a/native/core/src/parquet/schema_adapter.rs b/native/core/src/parquet/schema_adapter.rs
index f19ec39fca..491f0a8e85 100644
--- a/native/core/src/parquet/schema_adapter.rs
+++ b/native/core/src/parquet/schema_adapter.rs
@@ -302,10 +302,11 @@ impl SparkPhysicalExprAdapter {
             let physical_type = cast.input_field().data_type();
             let target_type = cast.target_field().data_type();
 
-            // For complex nested types (Struct, List, Map) and Timestamp timezone
-            // mismatches, use CometCastColumnExpr with spark_parquet_convert which
-            // handles field-name-based selection, reordering, nested type casting,
-            // and metadata-only timestamp timezone relabeling correctly.
+            // For complex nested types (Struct, List, Map), Timestamp timezone
+            // mismatches, and Timestamp→Int64 (nanosAsLong), use CometCastColumnExpr
+            // with spark_parquet_convert which handles field-name-based selection,
+            // reordering, nested type casting, metadata-only timestamp timezone
+            // relabeling, and raw value reinterpretation correctly.
             //
             // Timestamp mismatches (e.g., Timestamp(us, None) -> Timestamp(us, Some("UTC")))
             // occur when INT96 Parquet timestamps are coerced to Timestamp(us, None) by
@@ -313,12 +314,18 @@ impl SparkPhysicalExprAdapter {
             // Using Spark's Cast here would incorrectly treat the None-timezone values as
             // local time (TimestampNTZ) and apply a timezone conversion, but the values are
             // already in UTC. spark_parquet_convert handles this as a metadata-only change.
+            //
+            // Timestamp→Int64 occurs when Spark's `nanosAsLong` config converts
+            // TIMESTAMP(NANOS) to LongType. Spark's Cast would divide by MICROS_PER_SECOND
+            // (assuming microseconds), but the values are nanoseconds. Arrow cast correctly
+            // reinterprets the raw i64 value without conversion.
             if matches!(
                 (physical_type, target_type),
                 (DataType::Struct(_), DataType::Struct(_))
                     | (DataType::List(_), DataType::List(_))
                     | (DataType::Map(_, _), DataType::Map(_, _))
                     | (DataType::Timestamp(_, _), DataType::Timestamp(_, _))
+                    | (DataType::Timestamp(_, _), DataType::Int64)
             ) {
                 let comet_cast: Arc<dyn PhysicalExpr> = Arc::new(
                     CometCastColumnExpr::new(