From 3266a3ed9a6093a5d4658113dfc18b59829d1fe7 Mon Sep 17 00:00:00 2001 From: Parth Chandra Date: Wed, 22 Apr 2026 09:41:03 -0700 Subject: [PATCH 1/5] feat: cast string to timestamp_ntz --- .../spark-expr/src/conversion_funcs/cast.rs | 6 +- .../spark-expr/src/conversion_funcs/string.rs | 452 ++++++++++++++++++ .../apache/comet/expressions/CometCast.scala | 3 +- .../expressions/cast/cast_timestamp_ntz.sql | 40 ++ .../cast/cast_timestamp_ntz_ansi.sql | 47 ++ .../org/apache/comet/CometCastSuite.scala | 31 +- 6 files changed, 570 insertions(+), 9 deletions(-) create mode 100644 spark/src/test/resources/sql-tests/expressions/cast/cast_timestamp_ntz_ansi.sql diff --git a/native/spark-expr/src/conversion_funcs/cast.rs b/native/spark-expr/src/conversion_funcs/cast.rs index 5f855a36b2..1f574f1231 100644 --- a/native/spark-expr/src/conversion_funcs/cast.rs +++ b/native/spark-expr/src/conversion_funcs/cast.rs @@ -28,7 +28,8 @@ use crate::conversion_funcs::numeric::{ }; use crate::conversion_funcs::string::{ cast_string_to_date, cast_string_to_decimal, cast_string_to_float, cast_string_to_int, - cast_string_to_timestamp, is_df_cast_from_string_spark_compatible, spark_cast_utf8_to_boolean, + cast_string_to_timestamp, cast_string_to_timestamp_ntz, + is_df_cast_from_string_spark_compatible, spark_cast_utf8_to_boolean, }; use crate::conversion_funcs::temporal::{ cast_date_to_timestamp, is_df_cast_from_date_spark_compatible, @@ -316,6 +317,9 @@ pub(crate) fn cast_array( (Null, _) => Ok(cast_with_options(&array, to_type, &native_cast_options)?), (Utf8, Boolean) => spark_cast_utf8_to_boolean::(&array, eval_mode), (LargeUtf8, Boolean) => spark_cast_utf8_to_boolean::(&array, eval_mode), + (Utf8, Timestamp(_, None)) => { + cast_string_to_timestamp_ntz(&array, eval_mode, true, cast_options.is_spark4_plus) + } (Utf8, Timestamp(_, _)) => cast_string_to_timestamp( &array, to_type, diff --git a/native/spark-expr/src/conversion_funcs/string.rs b/native/spark-expr/src/conversion_funcs/string.rs 
index b22b8e976f..3cd30592ed 100644 --- a/native/spark-expr/src/conversion_funcs/string.rs +++ b/native/spark-expr/src/conversion_funcs/string.rs @@ -83,6 +83,54 @@ macro_rules! cast_utf8_to_timestamp { }}; } +macro_rules! cast_utf8_to_timestamp_ntz { + ($array:expr, $eval_mode:expr, $cast_method:ident, $allow_tz:expr, $is_spark4_plus:expr) => {{ + let len = $array.len(); + let mut cast_array = PrimitiveArray::::builder(len); + let mut cast_err: Option = None; + for i in 0..len { + if $array.is_null(i) { + cast_array.append_null() + } else { + match $cast_method( + $array.value(i).trim_end(), + $eval_mode, + $allow_tz, + $is_spark4_plus, + ) { + Ok(Some(cast_value)) => cast_array.append_value(cast_value), + Ok(None) => cast_array.append_null(), + Err(e) => { + if $eval_mode == EvalMode::Ansi { + let raw_value = $array.value(i).to_string(); + let e = match e { + SparkError::InvalidInputInCastToDatetime { + from_type, + to_type, + .. + } => SparkError::InvalidInputInCastToDatetime { + value: raw_value, + from_type, + to_type, + }, + other => other, + }; + cast_err = Some(e); + break; + } + cast_array.append_null() + } + } + } + } + if let Some(e) = cast_err { + Err(e) + } else { + Ok(Arc::new(cast_array.finish()) as ArrayRef) + } + }}; +} + macro_rules! 
cast_utf8_to_int { ($array:expr, $array_type:ty, $parse_fn:expr) => {{ let len = $array.len(); @@ -765,6 +813,27 @@ pub(crate) fn cast_string_to_timestamp( Ok(cast_array) } +pub(crate) fn cast_string_to_timestamp_ntz( + array: &ArrayRef, + eval_mode: EvalMode, + allow_time_zone: bool, + is_spark4_plus: bool, +) -> SparkResult { + let string_array = array + .as_any() + .downcast_ref::>() + .expect("Expected a string array"); + + let cast_array: ArrayRef = cast_utf8_to_timestamp_ntz!( + string_array, + eval_mode, + timestamp_ntz_parser, + allow_time_zone, + is_spark4_plus + )?; + Ok(cast_array) +} + pub(crate) fn cast_string_to_int( to_type: &DataType, array: &ArrayRef, @@ -1253,6 +1322,48 @@ fn parse_timestamp_to_micros( } } +fn local_datetime_to_micros(timestamp_info: &TimeStampInfo) -> SparkResult> { + let year = timestamp_info.year as i64; + let m = timestamp_info.month; + let d = timestamp_info.day; + + if !(1..=12).contains(&m) { + return Ok(None); + } + let max_day = match m { + 1 | 3 | 5 | 7 | 8 | 10 | 12 => 31u32, + 4 | 6 | 9 | 11 => 30, + 2 => { + let leap = year % 4 == 0 && (year % 100 != 0 || year % 400 == 0); + if leap { + 29 + } else { + 28 + } + } + _ => return Ok(None), + }; + if d < 1 || d > max_day { + return Ok(None); + } + if timestamp_info.hour >= 24 || timestamp_info.minute >= 60 || timestamp_info.second >= 60 { + return Ok(None); + } + + let days = days_from_civil(year, m as i64, d as i64); + let time_secs = timestamp_info.hour as i64 * 3600 + + timestamp_info.minute as i64 * 60 + + timestamp_info.second as i64; + let total_secs = days + .checked_mul(86400) + .and_then(|s| s.checked_add(time_secs)); + let micros = total_secs.and_then(|s| { + let micros128 = s as i128 * 1_000_000 + timestamp_info.microsecond as i128; + i64::try_from(micros128).ok() + }); + Ok(micros) +} + fn parse_str_to_year_timestamp(value: &str, tz: &T) -> SparkResult> { get_timestamp_values(value, "year", tz) } @@ -1611,6 +1722,224 @@ fn timestamp_parser_with_tz( 
Ok(timestamp) } +fn get_timestamp_ntz_values(value: &str, timestamp_type: &str) -> SparkResult> { + let (sign, date_part) = if let Some(stripped) = value.strip_prefix('-') { + (-1i32, stripped) + } else { + (1i32, value) + }; + let mut parts = date_part.split(['T', ' ', '-', ':', '.']); + let year = sign + * parts + .next() + .unwrap_or("") + .parse::() + .unwrap_or_default(); + + if !(-290309..=294248).contains(&year) { + return Ok(None); + } + + let month = parts.next().map_or(1, |m| m.parse::().unwrap_or(1)); + let day = parts.next().map_or(1, |d| d.parse::().unwrap_or(1)); + let hour = parts.next().map_or(0, |h| h.parse::().unwrap_or(0)); + let minute = parts.next().map_or(0, |m| m.parse::().unwrap_or(0)); + let second = parts.next().map_or(0, |s| s.parse::().unwrap_or(0)); + let microsecond = parts.next().map_or(0, |ms| { + let ms = &ms[..ms.len().min(6)]; + let n = ms.len(); + ms.parse::().unwrap_or(0) * 10u32.pow((6 - n) as u32) + }); + + let mut timestamp_info = TimeStampInfo::default(); + + let timestamp_info = match timestamp_type { + "year" => timestamp_info.with_year(year), + "month" => timestamp_info.with_year(year).with_month(month), + "day" => timestamp_info + .with_year(year) + .with_month(month) + .with_day(day), + "hour" => timestamp_info + .with_year(year) + .with_month(month) + .with_day(day) + .with_hour(hour), + "minute" => timestamp_info + .with_year(year) + .with_month(month) + .with_day(day) + .with_hour(hour) + .with_minute(minute), + "second" => timestamp_info + .with_year(year) + .with_month(month) + .with_day(day) + .with_hour(hour) + .with_minute(minute) + .with_second(second), + "microsecond" => timestamp_info + .with_year(year) + .with_month(month) + .with_day(day) + .with_hour(hour) + .with_minute(minute) + .with_second(second) + .with_microsecond(microsecond), + _ => { + return Err(SparkError::InvalidInputInCastToDatetime { + value: value.to_string(), + from_type: "STRING".to_string(), + to_type: "TIMESTAMP_NTZ".to_string(), + }) + 
} + }; + local_datetime_to_micros(timestamp_info) +} + +fn timestamp_ntz_parser( + value: &str, + eval_mode: EvalMode, + allow_time_zone: bool, + _is_spark4_plus: bool, +) -> SparkResult> { + let trimmed = value.trim(); + if trimmed.is_empty() { + return Ok(None); + } + + // NTZ rejects leading whitespace for T-prefixed time-only strings on Spark 4+ + // (same logic as timestamp_parser), but time-only is rejected entirely for NTZ anyway. + + let value = trimmed; + + // Handle leading '+' the same way as timestamp_parser + let value = if let Some(rest) = value.strip_prefix('+') { + let first_non_digit = rest.find(|c: char| !c.is_ascii_digit()); + match first_non_digit { + Some(i) if i >= 1 && rest.as_bytes()[i] == b'-' => rest, + _ => return Ok(None), + } + } else { + value + }; + + // Reject time-only patterns: NTZ requires a date component + if RE_TIME_ONLY_H.is_match(value) + || RE_TIME_ONLY_HM.is_match(value) + || RE_TIME_ONLY_HMS.is_match(value) + || RE_TIME_ONLY_HMSU.is_match(value) + || RE_BARE_HM.is_match(value) + || RE_BARE_HMS.is_match(value) + || RE_BARE_HMSU.is_match(value) + { + return if eval_mode == EvalMode::Ansi { + Err(SparkError::InvalidInputInCastToDatetime { + value: value.to_string(), + from_type: "STRING".to_string(), + to_type: "TIMESTAMP_NTZ".to_string(), + }) + } else { + Ok(None) + }; + } + + // Check if value matches a date-based pattern directly + let has_direct_match = RE_YEAR.is_match(value) + || RE_MONTH.is_match(value) + || RE_DAY.is_match(value) + || RE_HOUR.is_match(value) + || RE_MINUTE.is_match(value) + || RE_SECOND.is_match(value) + || RE_MICROSECOND.is_match(value); + + // If no direct match, try stripping a timezone suffix + let value_to_parse = if !has_direct_match { + if let Some((stripped, _tz)) = extract_offset_suffix(value) { + if !allow_time_zone { + return if eval_mode == EvalMode::Ansi { + Err(SparkError::InvalidInputInCastToDatetime { + value: value.to_string(), + from_type: "STRING".to_string(), + to_type: 
"TIMESTAMP_NTZ".to_string(), + }) + } else { + Ok(None) + }; + } + stripped + } else { + value + } + } else { + value + }; + + timestamp_ntz_parser_inner(value_to_parse, eval_mode) +} + +fn timestamp_ntz_parser_inner(value: &str, eval_mode: EvalMode) -> SparkResult> { + type NtzParsePattern = (&'static Regex, fn(&str) -> SparkResult>); + + fn parse_ntz_year(value: &str) -> SparkResult> { + get_timestamp_ntz_values(value, "year") + } + fn parse_ntz_month(value: &str) -> SparkResult> { + get_timestamp_ntz_values(value, "month") + } + fn parse_ntz_day(value: &str) -> SparkResult> { + get_timestamp_ntz_values(value, "day") + } + fn parse_ntz_hour(value: &str) -> SparkResult> { + get_timestamp_ntz_values(value, "hour") + } + fn parse_ntz_minute(value: &str) -> SparkResult> { + get_timestamp_ntz_values(value, "minute") + } + fn parse_ntz_second(value: &str) -> SparkResult> { + get_timestamp_ntz_values(value, "second") + } + fn parse_ntz_microsecond(value: &str) -> SparkResult> { + get_timestamp_ntz_values(value, "microsecond") + } + + let patterns: &[NtzParsePattern] = &[ + ( + &RE_YEAR, + parse_ntz_year as fn(&str) -> SparkResult>, + ), + (&RE_MONTH, parse_ntz_month), + (&RE_DAY, parse_ntz_day), + (&RE_HOUR, parse_ntz_hour), + (&RE_MINUTE, parse_ntz_minute), + (&RE_SECOND, parse_ntz_second), + (&RE_MICROSECOND, parse_ntz_microsecond), + ]; + + let mut timestamp = None; + + for (pattern, parse_func) in patterns { + if pattern.is_match(value) { + timestamp = parse_func(value)?; + break; + } + } + + if timestamp.is_none() { + return if eval_mode == EvalMode::Ansi { + Err(SparkError::InvalidInputInCastToDatetime { + value: value.to_string(), + from_type: "STRING".to_string(), + to_type: "TIMESTAMP_NTZ".to_string(), + }) + } else { + Ok(None) + }; + } + + Ok(timestamp) +} + fn parse_str_to_time_only_timestamp(value: &str, tz: &T) -> SparkResult> { // The 'T' is optional in the time format; strip it if specified. 
let time_part = value.strip_prefix('T').unwrap_or(value); @@ -1903,6 +2232,129 @@ mod tests { } } + #[test] + fn test_cast_string_to_timestamp_ntz() { + // Helper to reduce boilerplate + fn parse(s: &str, allow_tz: bool) -> Option { + timestamp_ntz_parser(s, EvalMode::Legacy, allow_tz, false).unwrap() + } + + // Basic: "2020-01-01 12:34:56" -> local micros + // days_from_civil(2020,1,1) = 18262; 18262*86400 = 1577836800 + // + 12*3600 + 34*60 + 56 = 45296; total = 1577882096s + assert_eq!( + parse("2020-01-01 12:34:56", true), + Some(1_577_882_096_000_000) + ); + assert_eq!( + parse("2020-01-01T12:34:56", true), + Some(1_577_882_096_000_000) + ); + + // With microseconds + assert_eq!( + parse("2020-01-01 12:34:56.123456", true), + Some(1_577_882_096_123_456) + ); + + // Date only + assert_eq!(parse("2020-01-01", true), Some(1_577_836_800_000_000)); + + // Timezone discarded (allow_time_zone=true): same result as without TZ + assert_eq!( + parse("2020-01-01T12:34:56Z", true), + Some(1_577_882_096_000_000) + ); + assert_eq!( + parse("2020-01-01T12:34:56+05:30", true), + Some(1_577_882_096_000_000) + ); + assert_eq!( + parse("2020-01-01T12:34:56-08:00", true), + Some(1_577_882_096_000_000) + ); + + // Timezone rejected (allow_time_zone=false) + assert_eq!(parse("2020-01-01T12:34:56Z", false), None); + assert_eq!(parse("2020-01-01T12:34:56+05:30", false), None); + + // Time-only rejected + assert_eq!(parse("T12:34:56", true), None); + assert_eq!(parse("12:34", true), None); + assert_eq!(parse("T2", true), None); + + // Invalid -> None in Legacy + assert_eq!(parse("invalid", true), None); + assert_eq!(parse("", true), None); + + // Invalid -> Error in ANSI + assert!(timestamp_ntz_parser("invalid", EvalMode::Ansi, true, false).is_err()); + assert!(timestamp_ntz_parser("T12:34", EvalMode::Ansi, true, false).is_err()); + + // Invalid -> None in Try + assert_eq!( + timestamp_ntz_parser("invalid", EvalMode::Try, true, false).unwrap(), + None + ); + + // DST gap time works 
for NTZ (pure arithmetic, no DST) + // days_from_civil(2024,3,10) * 86400 + 2*3600 + 30*60 = 1710037800s + assert_eq!( + parse("2024-03-10 02:30:00", true), + Some(1_710_037_800_000_000) + ); + + // Invalid leap day -> None + assert_eq!(parse("2023-02-29 00:00:00", true), None); + + // Valid leap day + assert!(parse("2020-02-29 00:00:00", true).is_some()); + } + + #[test] + fn test_cast_string_to_timestamp_ntz_array() { + let array: ArrayRef = Arc::new(StringArray::from(vec![ + Some("2020-01-01T12:34:56.123456"), + Some("T2"), + Some("2020-01-01"), + None, + Some("invalid"), + Some("2020-06-15T12:30:00Z"), + ])); + let result = cast_string_to_timestamp_ntz(&array, EvalMode::Legacy, true, false).unwrap(); + let ts_array = result + .as_any() + .downcast_ref::>() + .unwrap(); + assert_eq!(ts_array.len(), 6); + assert!(!ts_array.is_null(0)); // valid + assert!(ts_array.is_null(1)); // time-only -> null + assert!(!ts_array.is_null(2)); // date-only -> valid + assert!(ts_array.is_null(3)); // null input + assert!(ts_array.is_null(4)); // invalid -> null + assert!(!ts_array.is_null(5)); // TZ discarded -> valid + // TZ discarded: "2020-06-15T12:30:00Z" should give same micros as "2020-06-15T12:30:00" + assert_eq!( + ts_array.value(5), + timestamp_ntz_parser("2020-06-15T12:30:00", EvalMode::Legacy, true, false) + .unwrap() + .unwrap() + ); + } + + #[test] + fn test_cast_string_to_timestamp_ntz_ansi_error() { + let array: ArrayRef = Arc::new(StringArray::from(vec![Some("invalid")])); + let result = cast_string_to_timestamp_ntz(&array, EvalMode::Ansi, true, false); + assert!(result.is_err()); + match result.unwrap_err() { + SparkError::InvalidInputInCastToDatetime { to_type, .. 
} => { + assert_eq!(to_type, "TIMESTAMP_NTZ"); + } + other => panic!("Expected InvalidInputInCastToDatetime, got {other:?}"), + } + } + #[test] fn test_cast_dict_string_to_timestamp() -> DataFusionResult<()> { // prepare input data diff --git a/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala b/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala index 3b4c4b3bd4..898a4f916e 100644 --- a/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala +++ b/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala @@ -218,8 +218,7 @@ object CometCast extends CometExpressionSerde[Cast] with CometExprShim { case DataTypes.TimestampType => Compatible() case _: TimestampNTZType => - // https://github.com/apache/datafusion-comet/issues/378 - Incompatible(Some("Cast from String to TimestampNTZ is not yet supported")) + Compatible() case _ => unsupported(DataTypes.StringType, toType) } diff --git a/spark/src/test/resources/sql-tests/expressions/cast/cast_timestamp_ntz.sql b/spark/src/test/resources/sql-tests/expressions/cast/cast_timestamp_ntz.sql index 39d3615301..4d88ff9b34 100644 --- a/spark/src/test/resources/sql-tests/expressions/cast/cast_timestamp_ntz.sql +++ b/spark/src/test/resources/sql-tests/expressions/cast/cast_timestamp_ntz.sql @@ -60,3 +60,43 @@ SELECT cast(TIMESTAMP_NTZ'2020-01-01 12:34:56' as date) query SELECT cast(DATE'2020-01-15' as timestamp_ntz) + +-- String -> NTZ (timezone-independent: parses local time, discards any TZ info) +query +SELECT cast('2020-01-01 12:34:56.123456' as timestamp_ntz) + +query +SELECT cast('2020-01-01T12:34:56' as timestamp_ntz) + +query +SELECT cast('2020-01-01' as timestamp_ntz) + +-- Timezone in string should be silently discarded for NTZ +query +SELECT cast('2020-06-15 12:30:00Z' as timestamp_ntz) + +query +SELECT cast('2020-06-15 12:30:00+05:30' as timestamp_ntz) + +query +SELECT cast('2020-06-15 12:30:00-08:00' as timestamp_ntz) + +-- DST transition times (same regardless of 
session TZ since NTZ has no DST) +query +SELECT cast('2024-03-10 02:30:00' as timestamp_ntz), cast('2024-11-03 01:30:00' as timestamp_ntz) + +-- Time-only strings should produce NULL for NTZ +query +SELECT cast('T12:34:56' as timestamp_ntz), cast('12:34' as timestamp_ntz) + +-- Invalid inputs -> NULL +query +SELECT cast('not a timestamp' as timestamp_ntz), cast('' as timestamp_ntz) + +-- TRY_CAST: invalid input should return NULL +query +SELECT try_cast('invalid' as timestamp_ntz), try_cast('T12:34' as timestamp_ntz) + +-- TRY_CAST: valid inputs +query +SELECT try_cast('2020-01-01 12:34:56' as timestamp_ntz) diff --git a/spark/src/test/resources/sql-tests/expressions/cast/cast_timestamp_ntz_ansi.sql b/spark/src/test/resources/sql-tests/expressions/cast/cast_timestamp_ntz_ansi.sql new file mode 100644 index 0000000000..53a21a3ed1 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/cast/cast_timestamp_ntz_ansi.sql @@ -0,0 +1,47 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. 
+ +-- Config: spark.sql.ansi.enabled=true +-- ConfigMatrix: spark.sql.session.timeZone=UTC,America/Los_Angeles + +-- ANSI mode: invalid String -> NTZ should error +query expect_error(CAST_INVALID_INPUT) +SELECT cast('invalid' as timestamp_ntz) + +query expect_error(CAST_INVALID_INPUT) +SELECT cast('T12:34:56' as timestamp_ntz) + +query expect_error(CAST_INVALID_INPUT) +SELECT cast('' as timestamp_ntz) + +-- TRY_CAST: returns NULL even in ANSI mode +query +SELECT try_cast('invalid' as timestamp_ntz), try_cast('T12:34' as timestamp_ntz) + +-- TRY_CAST: valid inputs still work +query +SELECT try_cast('2020-01-01 12:34:56' as timestamp_ntz) + +-- Valid casts should still work in ANSI mode +query +SELECT cast('2020-01-01 12:34:56.123456' as timestamp_ntz) + +query +SELECT cast('2020-06-15 12:30:00Z' as timestamp_ntz) + +query +SELECT cast('2024-03-10 02:30:00' as timestamp_ntz) diff --git a/spark/src/test/scala/org/apache/comet/CometCastSuite.scala b/spark/src/test/scala/org/apache/comet/CometCastSuite.scala index dadfbfc93a..7b7324810d 100644 --- a/spark/src/test/scala/org/apache/comet/CometCastSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometCastSuite.scala @@ -1268,12 +1268,31 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { } } - ignore("cast StringType to TimestampNTZType") { - // Phase 5: String → NTZ parsing not yet implemented - // https://github.com/apache/datafusion-comet/issues/378 - withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> "UTC") { - val values = Seq("2020-01-01T12:34:56.123456", "2020-01-01T12:34:56", "2020-01-01") - castTimestampTest(values.toDF("a"), DataTypes.TimestampNTZType) + test("cast StringType to TimestampNTZType") { + representativeTimezones.foreach { tz => + withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> tz) { + val values = Seq( + "2020-01-01T12:34:56.123456", + "2020-01-01T12:34:56", + "2020-01-01 12:34:56", + "2020-01-01", + "2020-01", + "2020", + "2020-06-15T12:30:00Z", + 
"2020-06-15T12:30:00+05:30", + "2020-06-15T12:30:00-08:00", + "2020-06-15T12:30:00 UTC", + "2024-03-10 02:30:00", + "2024-11-03 01:30:00", + "not a timestamp", + "", + "T12:34:56", + "12:34:56", + "2020-02-29 00:00:00", + "2023-02-29 00:00:00", + null) + castTimestampTest(values.toDF("a"), DataTypes.TimestampNTZType, assertNative = true) + } } } From fb28f5e1447528083cb847d12bb81f7552c87d5b Mon Sep 17 00:00:00 2001 From: Parth Chandra Date: Wed, 22 Apr 2026 13:30:55 -0700 Subject: [PATCH 2/5] fix trailing space --- native/spark-expr/src/conversion_funcs/string.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/native/spark-expr/src/conversion_funcs/string.rs b/native/spark-expr/src/conversion_funcs/string.rs index 3cd30592ed..d498c2d7e5 100644 --- a/native/spark-expr/src/conversion_funcs/string.rs +++ b/native/spark-expr/src/conversion_funcs/string.rs @@ -1867,7 +1867,7 @@ fn timestamp_ntz_parser( Ok(None) }; } - stripped + stripped.trim_end() } else { value } @@ -2273,6 +2273,11 @@ mod tests { parse("2020-01-01T12:34:56-08:00", true), Some(1_577_882_096_000_000) ); + // Space-separated offset (e.g. 
"2021-11-22 10:54:27 +08:00") + assert_eq!( + parse("2021-11-22 10:54:27 +08:00", true), + parse("2021-11-22 10:54:27", true) + ); // Timezone rejected (allow_time_zone=false) assert_eq!(parse("2020-01-01T12:34:56Z", false), None); From 09c647122cba430b15b5b0fd1b13720a01fca6be Mon Sep 17 00:00:00 2001 From: Parth Chandra Date: Thu, 23 Apr 2026 14:07:59 -0700 Subject: [PATCH 3/5] reviewd revert doc --- .../spark-expr/src/conversion_funcs/string.rs | 259 ++++-------------- .../expressions/cast/cast_timestamp_ntz.sql | 14 + 2 files changed, 69 insertions(+), 204 deletions(-) diff --git a/native/spark-expr/src/conversion_funcs/string.rs b/native/spark-expr/src/conversion_funcs/string.rs index d498c2d7e5..00ddc1f34a 100644 --- a/native/spark-expr/src/conversion_funcs/string.rs +++ b/native/spark-expr/src/conversion_funcs/string.rs @@ -33,71 +33,19 @@ use std::num::Wrapping; use std::str::FromStr; use std::sync::{Arc, LazyLock}; +// Shared macro for casting UTF-8 string arrays to timestamp types (both TZ and NTZ). +// $builder is a PrimitiveBuilder expression; $extra_args are forwarded to $cast_method +// after (value, eval_mode). macro_rules! cast_utf8_to_timestamp { - // $tz is a Timezone:Tz object and contains the session timezone. - // $to_tz_str is a string containing the to_type timezone - ($array:expr, $eval_mode:expr, $array_type:ty, $cast_method:ident, $tz:expr, $to_tz_str:expr, $is_spark4_plus:expr) => {{ - let len = $array.len(); - let mut cast_array = PrimitiveArray::<$array_type>::builder(len).with_timezone($to_tz_str); - let mut cast_err: Option = None; - for i in 0..len { - if $array.is_null(i) { - cast_array.append_null() - } else { - // we use trim_end instead of trim because strings with leading spaces are interpreted differently - // by Spark in cases where the string has only the time component starting with T. - // The string " T2" results in null while "T2" results in a valid timestamp. 
- match $cast_method($array.value(i).trim_end(), $eval_mode, $tz, $is_spark4_plus) { - Ok(Some(cast_value)) => cast_array.append_value(cast_value), - Ok(None) => cast_array.append_null(), - Err(e) => { - if $eval_mode == EvalMode::Ansi { - // Replace the error value with the raw (untrimmed) input to match - // Spark's behavior: Spark reports the original string in CAST_INVALID_INPUT. - let raw_value = $array.value(i).to_string(); - let e = match e { - SparkError::InvalidInputInCastToDatetime { - from_type, - to_type, - .. - } => SparkError::InvalidInputInCastToDatetime { - value: raw_value, - from_type, - to_type, - }, - other => other, - }; - cast_err = Some(e); - break; - } - cast_array.append_null() - } - } - } - } - if let Some(e) = cast_err { - Err(e) - } else { - Ok(Arc::new(cast_array.finish()) as ArrayRef) - } - }}; -} - -macro_rules! cast_utf8_to_timestamp_ntz { - ($array:expr, $eval_mode:expr, $cast_method:ident, $allow_tz:expr, $is_spark4_plus:expr) => {{ - let len = $array.len(); - let mut cast_array = PrimitiveArray::::builder(len); + ($array:expr, $eval_mode:expr, $builder:expr, $cast_method:ident $(, $extra_arg:expr)*) => {{ + let mut cast_array = $builder; let mut cast_err: Option = None; - for i in 0..len { + for i in 0..$array.len() { if $array.is_null(i) { cast_array.append_null() } else { - match $cast_method( - $array.value(i).trim_end(), - $eval_mode, - $allow_tz, - $is_spark4_plus, - ) { + // trim_end only: leading spaces affect parsing (e.g. " T2" -> null, "T2" -> valid) + match $cast_method($array.value(i).trim_end(), $eval_mode $(, $extra_arg)*) { Ok(Some(cast_value)) => cast_array.append_value(cast_value), Ok(None) => cast_array.append_null(), Err(e) => { @@ -160,6 +108,7 @@ macro_rules! 
cast_utf8_to_int { }}; } +#[derive(Clone)] struct TimeStampInfo { year: i32, month: u32, @@ -801,10 +750,10 @@ pub(crate) fn cast_string_to_timestamp( cast_utf8_to_timestamp!( string_array, eval_mode, - TimestampMicrosecondType, + PrimitiveArray::::builder(string_array.len()) + .with_timezone(to_tz), timestamp_parser, tz, - to_tz, is_spark4_plus )? } @@ -824,9 +773,10 @@ pub(crate) fn cast_string_to_timestamp_ntz( .downcast_ref::>() .expect("Expected a string array"); - let cast_array: ArrayRef = cast_utf8_to_timestamp_ntz!( + let cast_array: ArrayRef = cast_utf8_to_timestamp!( string_array, eval_mode, + PrimitiveArray::::builder(string_array.len()), timestamp_ntz_parser, allow_time_zone, is_spark4_plus @@ -1106,12 +1056,10 @@ pub fn invalid_value(value: &str, from_type: &str, to_type: &str) -> SparkError } } -fn get_timestamp_values( +fn parse_to_timestamp_info( value: &str, timestamp_type: &str, - tz: &T, -) -> SparkResult> { - // Handle negative year: strip leading '-' and remember the sign. +) -> SparkResult> { let (sign, date_part) = if let Some(stripped) = value.strip_prefix('-') { (-1i32, stripped) } else { @@ -1139,8 +1087,6 @@ fn get_timestamp_values( let minute = parts.next().map_or(0, |m| m.parse::().unwrap_or(0)); let second = parts.next().map_or(0, |s| s.parse::().unwrap_or(0)); let microsecond = parts.next().map_or(0, |ms| { - // Truncate to at most 6 digits then scale to fill the microsecond field. - // E.g. ".123" -> 123 * 10^3 = 123_000 µs; ".1234567" -> truncated to 123_456 µs. let ms = &ms[..ms.len().min(6)]; let n = ms.len(); ms.parse::().unwrap_or(0) * 10u32.pow((6 - n) as u32) @@ -1189,7 +1135,18 @@ fn get_timestamp_values( }) } }; - parse_timestamp_to_micros(timestamp_info, tz) + Ok(Some(timestamp_info.to_owned())) +} + +fn get_timestamp_values( + value: &str, + timestamp_type: &str, + tz: &T, +) -> SparkResult> { + match parse_to_timestamp_info(value, timestamp_type)? 
{ + Some(info) => parse_timestamp_to_micros(&info, tz), + None => Ok(None), + } } /// Howard Hinnant's algorithm: proleptic Gregorian days since 1970-01-01 for any i64 year. @@ -1722,81 +1679,6 @@ fn timestamp_parser_with_tz( Ok(timestamp) } -fn get_timestamp_ntz_values(value: &str, timestamp_type: &str) -> SparkResult> { - let (sign, date_part) = if let Some(stripped) = value.strip_prefix('-') { - (-1i32, stripped) - } else { - (1i32, value) - }; - let mut parts = date_part.split(['T', ' ', '-', ':', '.']); - let year = sign - * parts - .next() - .unwrap_or("") - .parse::() - .unwrap_or_default(); - - if !(-290309..=294248).contains(&year) { - return Ok(None); - } - - let month = parts.next().map_or(1, |m| m.parse::().unwrap_or(1)); - let day = parts.next().map_or(1, |d| d.parse::().unwrap_or(1)); - let hour = parts.next().map_or(0, |h| h.parse::().unwrap_or(0)); - let minute = parts.next().map_or(0, |m| m.parse::().unwrap_or(0)); - let second = parts.next().map_or(0, |s| s.parse::().unwrap_or(0)); - let microsecond = parts.next().map_or(0, |ms| { - let ms = &ms[..ms.len().min(6)]; - let n = ms.len(); - ms.parse::().unwrap_or(0) * 10u32.pow((6 - n) as u32) - }); - - let mut timestamp_info = TimeStampInfo::default(); - - let timestamp_info = match timestamp_type { - "year" => timestamp_info.with_year(year), - "month" => timestamp_info.with_year(year).with_month(month), - "day" => timestamp_info - .with_year(year) - .with_month(month) - .with_day(day), - "hour" => timestamp_info - .with_year(year) - .with_month(month) - .with_day(day) - .with_hour(hour), - "minute" => timestamp_info - .with_year(year) - .with_month(month) - .with_day(day) - .with_hour(hour) - .with_minute(minute), - "second" => timestamp_info - .with_year(year) - .with_month(month) - .with_day(day) - .with_hour(hour) - .with_minute(minute) - .with_second(second), - "microsecond" => timestamp_info - .with_year(year) - .with_month(month) - .with_day(day) - .with_hour(hour) - .with_minute(minute) - 
.with_second(second) - .with_microsecond(microsecond), - _ => { - return Err(SparkError::InvalidInputInCastToDatetime { - value: value.to_string(), - from_type: "STRING".to_string(), - to_type: "TIMESTAMP_NTZ".to_string(), - }) - } - }; - local_datetime_to_micros(timestamp_info) -} - fn timestamp_ntz_parser( value: &str, eval_mode: EvalMode, @@ -1879,65 +1761,34 @@ fn timestamp_ntz_parser( } fn timestamp_ntz_parser_inner(value: &str, eval_mode: EvalMode) -> SparkResult> { - type NtzParsePattern = (&'static Regex, fn(&str) -> SparkResult>); - - fn parse_ntz_year(value: &str) -> SparkResult> { - get_timestamp_ntz_values(value, "year") - } - fn parse_ntz_month(value: &str) -> SparkResult> { - get_timestamp_ntz_values(value, "month") - } - fn parse_ntz_day(value: &str) -> SparkResult> { - get_timestamp_ntz_values(value, "day") - } - fn parse_ntz_hour(value: &str) -> SparkResult> { - get_timestamp_ntz_values(value, "hour") - } - fn parse_ntz_minute(value: &str) -> SparkResult> { - get_timestamp_ntz_values(value, "minute") - } - fn parse_ntz_second(value: &str) -> SparkResult> { - get_timestamp_ntz_values(value, "second") - } - fn parse_ntz_microsecond(value: &str) -> SparkResult> { - get_timestamp_ntz_values(value, "microsecond") - } - - let patterns: &[NtzParsePattern] = &[ - ( - &RE_YEAR, - parse_ntz_year as fn(&str) -> SparkResult>, - ), - (&RE_MONTH, parse_ntz_month), - (&RE_DAY, parse_ntz_day), - (&RE_HOUR, parse_ntz_hour), - (&RE_MINUTE, parse_ntz_minute), - (&RE_SECOND, parse_ntz_second), - (&RE_MICROSECOND, parse_ntz_microsecond), + let patterns: &[(&Regex, &str)] = &[ + (&RE_YEAR, "year"), + (&RE_MONTH, "month"), + (&RE_DAY, "day"), + (&RE_HOUR, "hour"), + (&RE_MINUTE, "minute"), + (&RE_SECOND, "second"), + (&RE_MICROSECOND, "microsecond"), ]; - let mut timestamp = None; - - for (pattern, parse_func) in patterns { - if pattern.is_match(value) { - timestamp = parse_func(value)?; - break; + for (re, ts_type) in patterns { + if re.is_match(value) { + return match 
parse_to_timestamp_info(value, ts_type)? { + Some(info) => local_datetime_to_micros(&info), + None => Ok(None), + }; } } - if timestamp.is_none() { - return if eval_mode == EvalMode::Ansi { - Err(SparkError::InvalidInputInCastToDatetime { - value: value.to_string(), - from_type: "STRING".to_string(), - to_type: "TIMESTAMP_NTZ".to_string(), - }) - } else { - Ok(None) - }; + if eval_mode == EvalMode::Ansi { + Err(SparkError::InvalidInputInCastToDatetime { + value: value.to_string(), + from_type: "STRING".to_string(), + to_type: "TIMESTAMP_NTZ".to_string(), + }) + } else { + Ok(None) } - - Ok(timestamp) } fn parse_str_to_time_only_timestamp(value: &str, tz: &T) -> SparkResult> { @@ -2151,10 +2002,10 @@ mod tests { let result = cast_utf8_to_timestamp!( &string_array, eval_mode, - TimestampMicrosecondType, + PrimitiveArray::::builder(string_array.len()) + .with_timezone("UTC"), timestamp_parser, tz, - "UTC", true ) .unwrap(); @@ -2186,10 +2037,10 @@ mod tests { let result = cast_utf8_to_timestamp!( &string_array, eval_mode, - TimestampMicrosecondType, + PrimitiveArray::::builder(string_array.len()) + .with_timezone("UTC"), timestamp_parser, tz, - "UTC", true ); assert!( @@ -2215,10 +2066,10 @@ mod tests { let result = cast_utf8_to_timestamp!( &string_array, eval_mode, - TimestampMicrosecondType, + PrimitiveArray::::builder(string_array.len()) + .with_timezone("UTC"), timestamp_parser, tz, - "UTC", true ); match result { diff --git a/spark/src/test/resources/sql-tests/expressions/cast/cast_timestamp_ntz.sql b/spark/src/test/resources/sql-tests/expressions/cast/cast_timestamp_ntz.sql index 4d88ff9b34..b78750859d 100644 --- a/spark/src/test/resources/sql-tests/expressions/cast/cast_timestamp_ntz.sql +++ b/spark/src/test/resources/sql-tests/expressions/cast/cast_timestamp_ntz.sql @@ -51,6 +51,20 @@ SELECT cast(d as timestamp_ntz), id FROM test_ts_ntz ORDER BY id query SELECT cast(ts as timestamp_ntz), id FROM test_ts_ntz ORDER BY id +-- String -> NTZ via column reference 
(not constant-folded) +statement +CREATE TABLE test_str_to_ntz(s string, id int) USING parquet + +statement +INSERT INTO test_str_to_ntz VALUES + ('2020-01-01 12:34:56', 1), + ('2020-06-15T12:30:00Z', 2), + ('2021-11-22 10:54:27 +08:00', 3), + (NULL, 4) + +query +SELECT cast(s as timestamp_ntz), id FROM test_str_to_ntz ORDER BY id + -- Literal casts query SELECT cast(TIMESTAMP_NTZ'2020-01-01 12:34:56.789' as string) From af117c7528e2858d7392c95ae3df3d7334430dc4 Mon Sep 17 00:00:00 2001 From: Parth Chandra Date: Thu, 23 Apr 2026 16:37:11 -0700 Subject: [PATCH 4/5] add back doc --- .../user-guide/latest/compatibility/expressions/cast.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/source/user-guide/latest/compatibility/expressions/cast.md b/docs/source/user-guide/latest/compatibility/expressions/cast.md index 76457db8c9..54e9900f68 100644 --- a/docs/source/user-guide/latest/compatibility/expressions/cast.md +++ b/docs/source/user-guide/latest/compatibility/expressions/cast.md @@ -58,6 +58,14 @@ suffixes (e.g. `Europe/Moscow`), and the full Spark timestamp year range (-290308 to 294247). Note that `CAST(string AS DATE)` is only compatible for years between 262143 BC and 262142 AD due to an underlying library limitation. +## String to TimestampNTZ + +Comet's native `CAST(string AS TIMESTAMP_NTZ)` implementation matches Apache Spark's behavior. +Unlike `CAST(string AS TIMESTAMP)`, this cast is timezone-independent: any timezone offset in +the input string (e.g. `+08:00`, `Z`, `UTC`) is silently discarded, and the local date-time +components are preserved as-is. Time-only strings (e.g. `T12:34:56`, `12:34`) produce `NULL` (or raise a `CAST_INVALID_INPUT` error in ANSI mode). +The result is always a wall-clock timestamp with no timezone conversion or DST adjustment.
+ ## Decimal with Negative Scale to String Casting a `DecimalType` with a negative scale to `StringType` is marked as incompatible when From e070b72df9151f612a572ae351379a7c630fd8f5 Mon Sep 17 00:00:00 2001 From: Parth Chandra Date: Fri, 24 Apr 2026 07:51:00 -0700 Subject: [PATCH 5/5] Address review comments --- .../spark-expr/src/conversion_funcs/string.rs | 32 ++++++++++++++++++- .../cast/cast_timestamp_ntz_ansi.sql | 4 +++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/native/spark-expr/src/conversion_funcs/string.rs b/native/spark-expr/src/conversion_funcs/string.rs index 00ddc1f34a..adfd6e2390 100644 --- a/native/spark-expr/src/conversion_funcs/string.rs +++ b/native/spark-expr/src/conversion_funcs/string.rs @@ -1774,7 +1774,17 @@ fn timestamp_ntz_parser_inner(value: &str, eval_mode: EvalMode) -> SparkResult local_datetime_to_micros(&info), + Some(info) => match local_datetime_to_micros(&info)? { + some @ Some(_) => Ok(some), + None if eval_mode == EvalMode::Ansi => { + Err(SparkError::InvalidInputInCastToDatetime { + value: value.to_string(), + from_type: "STRING".to_string(), + to_type: "TIMESTAMP_NTZ".to_string(), + }) + } + None => Ok(None), + }, None => Ok(None), }; } @@ -2211,6 +2221,26 @@ mod tests { } } + #[test] + fn test_cast_string_to_timestamp_ntz_ansi_invalid_date() { + // 2023-02-29 is parseable but invalid (not a leap year). + // In ANSI mode this must error, not return NULL. + let result = timestamp_ntz_parser("2023-02-29", EvalMode::Ansi, false, false); + assert!( + result.is_err(), + "ANSI mode should error on invalid date 2023-02-29" + ); + match result.unwrap_err() { + SparkError::InvalidInputInCastToDatetime { to_type, .. } => { + assert_eq!(to_type, "TIMESTAMP_NTZ"); + } + other => panic!("Expected InvalidInputInCastToDatetime, got {other:?}"), + } + // In Legacy mode, same input should return None (null). 
+ let result = timestamp_ntz_parser("2023-02-29", EvalMode::Legacy, false, false); + assert_eq!(result.unwrap(), None); + } + #[test] fn test_cast_dict_string_to_timestamp() -> DataFusionResult<()> { // prepare input data diff --git a/spark/src/test/resources/sql-tests/expressions/cast/cast_timestamp_ntz_ansi.sql b/spark/src/test/resources/sql-tests/expressions/cast/cast_timestamp_ntz_ansi.sql index 53a21a3ed1..f9ffe5318f 100644 --- a/spark/src/test/resources/sql-tests/expressions/cast/cast_timestamp_ntz_ansi.sql +++ b/spark/src/test/resources/sql-tests/expressions/cast/cast_timestamp_ntz_ansi.sql @@ -28,6 +28,10 @@ SELECT cast('T12:34:56' as timestamp_ntz) query expect_error(CAST_INVALID_INPUT) SELECT cast('' as timestamp_ntz) +-- ANSI mode: parseable but invalid date should error, not return NULL +query expect_error(CAST_INVALID_INPUT) +SELECT cast('2023-02-29' as timestamp_ntz) + -- TRY_CAST: returns NULL even in ANSI mode query SELECT try_cast('invalid' as timestamp_ntz), try_cast('T12:34' as timestamp_ntz)