From 73c91562abc7f98e00d5e1e77d16c1e3a798a37d Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 14 Jan 2026 14:30:07 -0700 Subject: [PATCH 01/12] feat: add support for last_day expression Adds native Comet support for Spark's last_day function, which returns the last day of the month for a given date. Uses the SparkLastDay implementation from datafusion-spark crate. Closes #3090 Co-Authored-By: Claude Opus 4.5 --- docs/source/user-guide/latest/configs.md | 1 + native/core/src/execution/jni_api.rs | 2 ++ .../apache/comet/serde/QueryPlanSerde.scala | 1 + .../org/apache/comet/serde/datetime.scala | 4 ++- .../comet/CometTemporalExpressionSuite.scala | 26 +++++++++++++++++++ 5 files changed, 33 insertions(+), 1 deletion(-) diff --git a/docs/source/user-guide/latest/configs.md b/docs/source/user-guide/latest/configs.md index 1a273ad033..5a2450444d 100644 --- a/docs/source/user-guide/latest/configs.md +++ b/docs/source/user-guide/latest/configs.md @@ -266,6 +266,7 @@ These settings can be used to determine which parts of the plan are accelerated | `spark.comet.expression.IsNull.enabled` | Enable Comet acceleration for `IsNull` | true | | `spark.comet.expression.JsonToStructs.enabled` | Enable Comet acceleration for `JsonToStructs` | true | | `spark.comet.expression.KnownFloatingPointNormalized.enabled` | Enable Comet acceleration for `KnownFloatingPointNormalized` | true | +| `spark.comet.expression.LastDay.enabled` | Enable Comet acceleration for `LastDay` | true | | `spark.comet.expression.Length.enabled` | Enable Comet acceleration for `Length` | true | | `spark.comet.expression.LessThan.enabled` | Enable Comet acceleration for `LessThan` | true | | `spark.comet.expression.LessThanOrEqual.enabled` | Enable Comet acceleration for `LessThanOrEqual` | true | diff --git a/native/core/src/execution/jni_api.rs b/native/core/src/execution/jni_api.rs index 75c53198b8..56569bc69c 100644 --- a/native/core/src/execution/jni_api.rs +++ b/native/core/src/execution/jni_api.rs @@ -44,6 +44,7 @@ use datafusion_spark::function::bitwise::bit_get::SparkBitGet; use datafusion_spark::function::bitwise::bitwise_not::SparkBitwiseNot; use datafusion_spark::function::datetime::date_add::SparkDateAdd; use datafusion_spark::function::datetime::date_sub::SparkDateSub; +use datafusion_spark::function::datetime::last_day::SparkLastDay; use datafusion_spark::function::hash::sha1::SparkSha1; use datafusion_spark::function::hash::sha2::SparkSha2; use datafusion_spark::function::math::expm1::SparkExpm1; @@ -345,6 +346,7 @@ fn register_datafusion_spark_function(session_ctx: &SessionContext) { session_ctx.register_udf(ScalarUDF::new_from_impl(SparkBitGet::default())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkDateAdd::default())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkDateSub::default())); + session_ctx.register_udf(ScalarUDF::new_from_impl(SparkLastDay::default())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkSha1::default())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkConcat::default())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkBitwiseNot::default())); diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala index e50b1d80e6..f68d79d9cc 100644 --- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala @@ -187,6 +187,7 @@ object QueryPlanSerde extends Logging with CometExprShim { 
classOf[DateAdd] -> CometDateAdd, classOf[DateSub] -> CometDateSub, classOf[FromUnixTime] -> CometFromUnixTime, + classOf[LastDay] -> CometLastDay, classOf[Hour] -> CometHour, classOf[Minute] -> CometMinute, classOf[Second] -> CometSecond, diff --git a/spark/src/main/scala/org/apache/comet/serde/datetime.scala b/spark/src/main/scala/org/apache/comet/serde/datetime.scala index ef2b0f793c..628e190908 100644 --- a/spark/src/main/scala/org/apache/comet/serde/datetime.scala +++ b/spark/src/main/scala/org/apache/comet/serde/datetime.scala @@ -21,7 +21,7 @@ package org.apache.comet.serde import java.util.Locale -import org.apache.spark.sql.catalyst.expressions.{Attribute, DateAdd, DateSub, DayOfMonth, DayOfWeek, DayOfYear, GetDateField, Hour, Literal, Minute, Month, Quarter, Second, TruncDate, TruncTimestamp, WeekDay, WeekOfYear, Year} +import org.apache.spark.sql.catalyst.expressions.{Attribute, DateAdd, DateSub, DayOfMonth, DayOfWeek, DayOfYear, GetDateField, Hour, LastDay, Literal, Minute, Month, Quarter, Second, TruncDate, TruncTimestamp, WeekDay, WeekOfYear, Year} import org.apache.spark.sql.types.{DateType, IntegerType} import org.apache.spark.unsafe.types.UTF8String @@ -258,6 +258,8 @@ object CometDateAdd extends CometScalarFunction[DateAdd]("date_add") object CometDateSub extends CometScalarFunction[DateSub]("date_sub") +object CometLastDay extends CometScalarFunction[LastDay]("last_day") + object CometTruncDate extends CometExpressionSerde[TruncDate] { val supportedFormats: Seq[String] = diff --git a/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala b/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala index 9a23c76d82..53ffd483d7 100644 --- a/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala @@ -122,4 +122,30 @@ class CometTemporalExpressionSuite extends CometTestBase with AdaptiveSparkPlanH StructField("fmt", DataTypes.StringType, true))) FuzzDataGenerator.generateDataFrame(r, spark, schema, 1000, DataGenOptions()) } + + test("last_day") { + val r = new Random(42) + val schema = StructType(Seq(StructField("c0", DataTypes.DateType, true))) + val df = FuzzDataGenerator.generateDataFrame(r, spark, schema, 1000, DataGenOptions()) + df.createOrReplaceTempView("tbl") + + // Basic test with random dates + checkSparkAnswerAndOperator("SELECT c0, last_day(c0) FROM tbl ORDER BY c0") + + // Disable constant folding to ensure literal expressions are executed by Comet + withSQLConf( + SQLConf.OPTIMIZER_EXCLUDED_RULES.key -> + "org.apache.spark.sql.catalyst.optimizer.ConstantFolding") { + // Test with literal dates - various months + checkSparkAnswerAndOperator( + "SELECT last_day(DATE('2024-01-15')), last_day(DATE('2024-02-15')), last_day(DATE('2024-12-01'))") + + // Test leap year handling (February) + checkSparkAnswerAndOperator( + "SELECT last_day(DATE('2024-02-01')), last_day(DATE('2023-02-01'))") + + // Test null handling + checkSparkAnswerAndOperator("SELECT last_day(NULL)") + } + } } From 77436851749b832e20a5401062d479d7c6ad36d5 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 14 Jan 2026 15:48:03 -0700 Subject: [PATCH 02/12] feat: add support for date_from_unix_date expression Adds native Comet support for Spark's date_from_unix_date function, which converts an integer representing days since Unix epoch (1970-01-01) to a Date32 value. 
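Because Arrow's Date32 and Int32 share the same physical representation
(an i32 count of days since 1970-01-01), the kernel reduces to a buffer
reinterpretation rather than a per-row computation. A minimal standalone
sketch of the idea against the arrow crate (the free function below is
illustrative, not the Comet entry point):

    use arrow::array::{Array, Date32Array, Int32Array};

    fn date_from_unix_date(days: &Int32Array) -> Date32Array {
        // Reuse the i32 value buffer and null mask unchanged; only the
        // logical type switches from Int32 to Date32.
        Date32Array::new(days.values().clone(), days.nulls().cloned())
    }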
Closes #3089 Co-Authored-By: Claude Opus 4.5 --- native/spark-expr/src/comet_scalar_funcs.rs | 5 +- .../src/datetime_funcs/date_from_unix_date.rs | 110 ++++++++++++++++++ native/spark-expr/src/datetime_funcs/mod.rs | 2 + native/spark-expr/src/lib.rs | 4 +- .../apache/comet/serde/QueryPlanSerde.scala | 1 + .../org/apache/comet/serde/datetime.scala | 4 +- .../comet/CometTemporalExpressionSuite.scala | 38 +++++- 7 files changed, 159 insertions(+), 5 deletions(-) create mode 100644 native/spark-expr/src/datetime_funcs/date_from_unix_date.rs diff --git a/native/spark-expr/src/comet_scalar_funcs.rs b/native/spark-expr/src/comet_scalar_funcs.rs index 8384a4646a..1bcf4701c2 100644 --- a/native/spark-expr/src/comet_scalar_funcs.rs +++ b/native/spark-expr/src/comet_scalar_funcs.rs @@ -22,8 +22,8 @@ use crate::math_funcs::modulo_expr::spark_modulo; use crate::{ spark_array_repeat, spark_ceil, spark_decimal_div, spark_decimal_integral_div, spark_floor, spark_isnan, spark_lpad, spark_make_decimal, spark_read_side_padding, spark_round, spark_rpad, - spark_unhex, spark_unscaled_value, EvalMode, SparkBitwiseCount, SparkDateTrunc, SparkSizeFunc, - SparkStringSpace, + spark_unhex, spark_unscaled_value, EvalMode, SparkBitwiseCount, SparkDateFromUnixDate, + SparkDateTrunc, SparkSizeFunc, SparkStringSpace, }; use arrow::datatypes::DataType; use datafusion::common::{DataFusionError, Result as DataFusionResult}; @@ -192,6 +192,7 @@ pub fn create_comet_physical_fun_with_eval_mode( fn all_scalar_functions() -> Vec> { vec![ Arc::new(ScalarUDF::new_from_impl(SparkBitwiseCount::default())), + Arc::new(ScalarUDF::new_from_impl(SparkDateFromUnixDate::default())), Arc::new(ScalarUDF::new_from_impl(SparkDateTrunc::default())), Arc::new(ScalarUDF::new_from_impl(SparkStringSpace::default())), Arc::new(ScalarUDF::new_from_impl(SparkSizeFunc::default())), diff --git a/native/spark-expr/src/datetime_funcs/date_from_unix_date.rs b/native/spark-expr/src/datetime_funcs/date_from_unix_date.rs new file mode 100644 index 0000000000..c4b9c36e1e --- /dev/null +++ b/native/spark-expr/src/datetime_funcs/date_from_unix_date.rs @@ -0,0 +1,110 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{Array, Date32Array, Int32Array}; +use arrow::datatypes::DataType; +use datafusion::common::{utils::take_function_args, DataFusionError, Result}; +use datafusion::logical_expr::{ + ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility, +}; +use std::any::Any; +use std::sync::Arc; + +/// Spark-compatible date_from_unix_date function. +/// Converts an integer representing days since Unix epoch (1970-01-01) to a Date32 value. 
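+///
+/// For example, an input of 0 maps to 1970-01-01, 18993 maps to 2022-01-01,
+/// and -1 maps to 1969-12-31.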
+#[derive(Debug, PartialEq, Eq, Hash)] +pub struct SparkDateFromUnixDate { + signature: Signature, + aliases: Vec, +} + +impl SparkDateFromUnixDate { + pub fn new() -> Self { + Self { + signature: Signature::exact(vec![DataType::Int32], Volatility::Immutable), + aliases: vec![], + } + } +} + +impl Default for SparkDateFromUnixDate { + fn default() -> Self { + Self::new() + } +} + +impl ScalarUDFImpl for SparkDateFromUnixDate { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "date_from_unix_date" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _: &[DataType]) -> Result { + Ok(DataType::Date32) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + let [unix_date] = take_function_args(self.name(), args.args)?; + match unix_date { + ColumnarValue::Array(arr) => { + let int_array = arr + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Execution( + "date_from_unix_date expects Int32Array input".to_string(), + ) + })?; + + // Date32 and Int32 both represent days since epoch, so we can directly + // reinterpret the values. The only operation needed is creating a Date32Array + // from the same underlying i32 values. + let date_array = Date32Array::new( + int_array.values().clone(), + int_array.nulls().cloned(), + ); + + Ok(ColumnarValue::Array(Arc::new(date_array))) + } + ColumnarValue::Scalar(scalar) => { + // Handle scalar case by converting to single-element array and back + let arr = scalar.to_array()?; + let int_array = arr.as_any().downcast_ref::().ok_or_else(|| { + DataFusionError::Execution( + "date_from_unix_date expects Int32 scalar input".to_string(), + ) + })?; + + let date_array = + Date32Array::new(int_array.values().clone(), int_array.nulls().cloned()); + + Ok(ColumnarValue::Array(Arc::new(date_array))) + } + } + } + + fn aliases(&self) -> &[String] { + &self.aliases + } +} diff --git a/native/spark-expr/src/datetime_funcs/mod.rs b/native/spark-expr/src/datetime_funcs/mod.rs index ef8041e5fe..6022c5c2c7 100644 --- a/native/spark-expr/src/datetime_funcs/mod.rs +++ b/native/spark-expr/src/datetime_funcs/mod.rs @@ -15,10 +15,12 @@ // specific language governing permissions and limitations // under the License. 
+mod date_from_unix_date; mod date_trunc; mod extract_date_part; mod timestamp_trunc; +pub use date_from_unix_date::SparkDateFromUnixDate; pub use date_trunc::SparkDateTrunc; pub use extract_date_part::SparkHour; pub use extract_date_part::SparkMinute; diff --git a/native/spark-expr/src/lib.rs b/native/spark-expr/src/lib.rs index f26fd911d8..1ac60343e4 100644 --- a/native/spark-expr/src/lib.rs +++ b/native/spark-expr/src/lib.rs @@ -69,7 +69,9 @@ pub use comet_scalar_funcs::{ create_comet_physical_fun, create_comet_physical_fun_with_eval_mode, register_all_comet_functions, }; -pub use datetime_funcs::{SparkDateTrunc, SparkHour, SparkMinute, SparkSecond, TimestampTruncExpr}; +pub use datetime_funcs::{ + SparkDateFromUnixDate, SparkDateTrunc, SparkHour, SparkMinute, SparkSecond, TimestampTruncExpr, +}; pub use error::{SparkError, SparkResult}; pub use hash_funcs::*; pub use json_funcs::{FromJson, ToJson}; diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala index f68d79d9cc..80268aedb4 100644 --- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala @@ -185,6 +185,7 @@ object QueryPlanSerde extends Logging with CometExprShim { private val temporalExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map( classOf[DateAdd] -> CometDateAdd, + classOf[DateFromUnixDate] -> CometDateFromUnixDate, classOf[DateSub] -> CometDateSub, classOf[FromUnixTime] -> CometFromUnixTime, classOf[LastDay] -> CometLastDay, diff --git a/spark/src/main/scala/org/apache/comet/serde/datetime.scala b/spark/src/main/scala/org/apache/comet/serde/datetime.scala index 628e190908..2f4918342b 100644 --- a/spark/src/main/scala/org/apache/comet/serde/datetime.scala +++ b/spark/src/main/scala/org/apache/comet/serde/datetime.scala @@ -21,7 +21,7 @@ package org.apache.comet.serde import java.util.Locale -import org.apache.spark.sql.catalyst.expressions.{Attribute, DateAdd, DateSub, DayOfMonth, DayOfWeek, DayOfYear, GetDateField, Hour, LastDay, Literal, Minute, Month, Quarter, Second, TruncDate, TruncTimestamp, WeekDay, WeekOfYear, Year} +import org.apache.spark.sql.catalyst.expressions.{Attribute, DateAdd, DateFromUnixDate, DateSub, DayOfMonth, DayOfWeek, DayOfYear, GetDateField, Hour, LastDay, Literal, Minute, Month, Quarter, Second, TruncDate, TruncTimestamp, WeekDay, WeekOfYear, Year} import org.apache.spark.sql.types.{DateType, IntegerType} import org.apache.spark.unsafe.types.UTF8String @@ -260,6 +260,8 @@ object CometDateSub extends CometScalarFunction[DateSub]("date_sub") object CometLastDay extends CometScalarFunction[LastDay]("last_day") +object CometDateFromUnixDate extends CometScalarFunction[DateFromUnixDate]("date_from_unix_date") + object CometTruncDate extends CometExpressionSerde[TruncDate] { val supportedFormats: Seq[String] = diff --git a/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala b/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala index 53ffd483d7..81293c046f 100644 --- a/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala @@ -21,7 +21,7 @@ package org.apache.comet import scala.util.Random -import org.apache.spark.sql.{CometTestBase, SaveMode} +import org.apache.spark.sql.{CometTestBase, Row, SaveMode} import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper 
import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{DataTypes, StructField, StructType} @@ -148,4 +148,40 @@ class CometTemporalExpressionSuite extends CometTestBase with AdaptiveSparkPlanH checkSparkAnswerAndOperator("SELECT last_day(NULL)") } } + + test("date_from_unix_date") { + // Create test data with unix dates in a reasonable range (1900-2100) + // -25567 = 1900-01-01, 47482 = 2100-01-01 + val r = new Random(42) + val testData = (1 to 1000).map { _ => + val unixDate = r.nextInt(73049) - 25567 // range from 1900 to 2100 + Row(if (r.nextDouble() < 0.1) null else unixDate) + } + val schema = StructType(Seq(StructField("c0", DataTypes.IntegerType, true))) + val df = spark.createDataFrame(spark.sparkContext.parallelize(testData), schema) + df.createOrReplaceTempView("tbl") + + // Basic test with random unix dates in a reasonable range + checkSparkAnswerAndOperator("SELECT c0, date_from_unix_date(c0) FROM tbl ORDER BY c0") + + // Disable constant folding to ensure literal expressions are executed by Comet + withSQLConf( + SQLConf.OPTIMIZER_EXCLUDED_RULES.key -> + "org.apache.spark.sql.catalyst.optimizer.ConstantFolding") { + // Test epoch (0 = 1970-01-01) + checkSparkAnswerAndOperator("SELECT date_from_unix_date(0)") + + // Test day after epoch (1 = 1970-01-02) + checkSparkAnswerAndOperator("SELECT date_from_unix_date(1)") + + // Test day before epoch (-1 = 1969-12-31) + checkSparkAnswerAndOperator("SELECT date_from_unix_date(-1)") + + // Test a known date (18993 = 2022-01-01, calculated as days from 1970-01-01) + checkSparkAnswerAndOperator("SELECT date_from_unix_date(18993)") + + // Test null handling + checkSparkAnswerAndOperator("SELECT date_from_unix_date(NULL)") + } + } } From c18720bf9f9359a54908a10294e801e41f95d28d Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 14 Jan 2026 17:22:02 -0700 Subject: [PATCH 03/12] update docs --- docs/source/user-guide/latest/configs.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/user-guide/latest/configs.md b/docs/source/user-guide/latest/configs.md index 5a2450444d..705be019bb 100644 --- a/docs/source/user-guide/latest/configs.md +++ b/docs/source/user-guide/latest/configs.md @@ -234,6 +234,7 @@ These settings can be used to determine which parts of the plan are accelerated | `spark.comet.expression.CreateArray.enabled` | Enable Comet acceleration for `CreateArray` | true | | `spark.comet.expression.CreateNamedStruct.enabled` | Enable Comet acceleration for `CreateNamedStruct` | true | | `spark.comet.expression.DateAdd.enabled` | Enable Comet acceleration for `DateAdd` | true | +| `spark.comet.expression.DateFromUnixDate.enabled` | Enable Comet acceleration for `DateFromUnixDate` | true | | `spark.comet.expression.DateSub.enabled` | Enable Comet acceleration for `DateSub` | true | | `spark.comet.expression.DayOfMonth.enabled` | Enable Comet acceleration for `DayOfMonth` | true | | `spark.comet.expression.DayOfWeek.enabled` | Enable Comet acceleration for `DayOfWeek` | true | From 5dcd64d88f4e72c4a517d816351a491314ba592c Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 14 Jan 2026 17:28:44 -0700 Subject: [PATCH 04/12] cargo fmt --- .../src/datetime_funcs/date_from_unix_date.rs | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/native/spark-expr/src/datetime_funcs/date_from_unix_date.rs b/native/spark-expr/src/datetime_funcs/date_from_unix_date.rs index c4b9c36e1e..0671a9001d 100644 --- a/native/spark-expr/src/datetime_funcs/date_from_unix_date.rs +++ 
b/native/spark-expr/src/datetime_funcs/date_from_unix_date.rs @@ -68,22 +68,17 @@ impl ScalarUDFImpl for SparkDateFromUnixDate { let [unix_date] = take_function_args(self.name(), args.args)?; match unix_date { ColumnarValue::Array(arr) => { - let int_array = arr - .as_any() - .downcast_ref::() - .ok_or_else(|| { - DataFusionError::Execution( - "date_from_unix_date expects Int32Array input".to_string(), - ) - })?; + let int_array = arr.as_any().downcast_ref::().ok_or_else(|| { + DataFusionError::Execution( + "date_from_unix_date expects Int32Array input".to_string(), + ) + })?; // Date32 and Int32 both represent days since epoch, so we can directly // reinterpret the values. The only operation needed is creating a Date32Array // from the same underlying i32 values. - let date_array = Date32Array::new( - int_array.values().clone(), - int_array.nulls().cloned(), - ); + let date_array = + Date32Array::new(int_array.values().clone(), int_array.nulls().cloned()); Ok(ColumnarValue::Array(Arc::new(date_array))) } From 4b02d52d97eac67784f1a1a259e06bba4b708422 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 10 Feb 2026 11:37:47 -0700 Subject: [PATCH 05/12] test: migrate date_from_unix_date tests to SQL file-based approach Co-Authored-By: Claude Opus 4.6 --- .../datetime/date_from_unix_date.sql | 31 ++++++++++++++++ .../comet/CometTemporalExpressionSuite.scala | 36 ------------------- 2 files changed, 31 insertions(+), 36 deletions(-) create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/date_from_unix_date.sql diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/date_from_unix_date.sql b/spark/src/test/resources/sql-tests/expressions/datetime/date_from_unix_date.sql new file mode 100644 index 0000000000..caaf18ec02 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/date_from_unix_date.sql @@ -0,0 +1,31 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. 
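+
+-- Test values: 0, 1, and -1 exercise the epoch boundary (1970-01-01),
+-- 18993 = 2022-01-01, and -25567 = 1900-01-01.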
+ +-- ConfigMatrix: parquet.enable.dictionary=false,true + +statement +CREATE TABLE test_date_from_unix_date(i int) USING parquet + +statement +INSERT INTO test_date_from_unix_date VALUES (0), (1), (-1), (18993), (-25567), (NULL) + +query +SELECT date_from_unix_date(i) FROM test_date_from_unix_date + +-- literal arguments +query +SELECT date_from_unix_date(0), date_from_unix_date(1), date_from_unix_date(-1), date_from_unix_date(18993), date_from_unix_date(NULL) diff --git a/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala b/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala index 6e6c66d6a6..1ae6926e05 100644 --- a/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala @@ -208,42 +208,6 @@ class CometTemporalExpressionSuite extends CometTestBase with AdaptiveSparkPlanH } } - test("date_from_unix_date") { - // Create test data with unix dates in a reasonable range (1900-2100) - // -25567 = 1900-01-01, 47482 = 2100-01-01 - val r = new Random(42) - val testData = (1 to 1000).map { _ => - val unixDate = r.nextInt(73049) - 25567 // range from 1900 to 2100 - Row(if (r.nextDouble() < 0.1) null else unixDate) - } - val schema = StructType(Seq(StructField("c0", DataTypes.IntegerType, true))) - val df = spark.createDataFrame(spark.sparkContext.parallelize(testData), schema) - df.createOrReplaceTempView("tbl") - - // Basic test with random unix dates in a reasonable range - checkSparkAnswerAndOperator("SELECT c0, date_from_unix_date(c0) FROM tbl ORDER BY c0") - - // Disable constant folding to ensure literal expressions are executed by Comet - withSQLConf( - SQLConf.OPTIMIZER_EXCLUDED_RULES.key -> - "org.apache.spark.sql.catalyst.optimizer.ConstantFolding") { - // Test epoch (0 = 1970-01-01) - checkSparkAnswerAndOperator("SELECT date_from_unix_date(0)") - - // Test day after epoch (1 = 1970-01-02) - checkSparkAnswerAndOperator("SELECT date_from_unix_date(1)") - - // Test day before epoch (-1 = 1969-12-31) - checkSparkAnswerAndOperator("SELECT date_from_unix_date(-1)") - - // Test a known date (18993 = 2022-01-01, calculated as days from 1970-01-01) - checkSparkAnswerAndOperator("SELECT date_from_unix_date(18993)") - - // Test null handling - checkSparkAnswerAndOperator("SELECT date_from_unix_date(NULL)") - } - } - test("datediff") { val r = new Random(42) val schema = StructType( From ddd0a655f18a27becd59b6cb9b771e2ca33584e0 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 7 Mar 2026 08:28:59 -0700 Subject: [PATCH 06/12] upmerge --- native/spark-expr/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/native/spark-expr/src/lib.rs b/native/spark-expr/src/lib.rs index 23f88e0959..5838cd1b3e 100644 --- a/native/spark-expr/src/lib.rs +++ b/native/spark-expr/src/lib.rs @@ -70,8 +70,8 @@ pub use comet_scalar_funcs::{ }; pub use csv_funcs::*; pub use datetime_funcs::{ - SparkDateDiff, SparkDateFromUnixDate, SparkDateTrunc, SparkHour, SparkMakeDate, SparkMinute, SparkSecond, - SparkUnixTimestamp, TimestampTruncExpr, + SparkDateDiff, SparkDateFromUnixDate, SparkDateTrunc, SparkHour, SparkMakeDate, SparkMinute, + SparkSecond, SparkUnixTimestamp, TimestampTruncExpr, }; pub use error::{SparkError, SparkResult}; pub use hash_funcs::*; From 0ec82c9d200e7c3f571777d563e9e64b9c328d41 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 13 Apr 2026 16:14:45 -0600 Subject: [PATCH 07/12] chore: apply cargo fmt --- 
native/spark-expr/src/comet_scalar_funcs.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/native/spark-expr/src/comet_scalar_funcs.rs b/native/spark-expr/src/comet_scalar_funcs.rs index 010fefe119..f6f6ef29c4 100644 --- a/native/spark-expr/src/comet_scalar_funcs.rs +++ b/native/spark-expr/src/comet_scalar_funcs.rs @@ -24,8 +24,7 @@ use crate::{ spark_ceil, spark_decimal_div, spark_decimal_integral_div, spark_floor, spark_isnan, spark_lpad, spark_make_decimal, spark_read_side_padding, spark_round, spark_rpad, spark_unhex, spark_unscaled_value, EvalMode, SparkArrayCompact, SparkContains, SparkDateDiff, - SparkDateFromUnixDate, - SparkDateTrunc, SparkMakeDate, SparkSizeFunc, + SparkDateFromUnixDate, SparkDateTrunc, SparkMakeDate, SparkSizeFunc, }; use arrow::datatypes::DataType; use datafusion::common::{DataFusionError, Result as DataFusionResult}; From 5becf49c59820c55bf228ae15f03a3bd4653a696 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 14 Apr 2026 09:26:05 -0600 Subject: [PATCH 08/12] fix: address review feedback for date_from_unix_date - Return ScalarValue::Date32 directly in the scalar path instead of converting to a 1-element array, preserving scalar-in/scalar-out contract for proper broadcast semantics - Mark date_from_unix_date as supported in spark_expressions_support.md - Add Int32 boundary values (2147483647, -2147483648) to test coverage --- docs/spark_expressions_support.md | 2 +- .../src/datetime_funcs/date_from_unix_date.rs | 27 +++++++++---------- .../datetime/date_from_unix_date.sql | 2 +- 3 files changed, 14 insertions(+), 17 deletions(-) diff --git a/docs/spark_expressions_support.md b/docs/spark_expressions_support.md index 6892eb358d..889fe3255f 100644 --- a/docs/spark_expressions_support.md +++ b/docs/spark_expressions_support.md @@ -171,7 +171,7 @@ - [ ] date_add - [ ] date_diff - [ ] date_format -- [ ] date_from_unix_date +- [x] date_from_unix_date - [x] date_part - [ ] date_sub - [ ] date_trunc diff --git a/native/spark-expr/src/datetime_funcs/date_from_unix_date.rs b/native/spark-expr/src/datetime_funcs/date_from_unix_date.rs index 0671a9001d..1c88fc47ab 100644 --- a/native/spark-expr/src/datetime_funcs/date_from_unix_date.rs +++ b/native/spark-expr/src/datetime_funcs/date_from_unix_date.rs @@ -17,7 +17,7 @@ use arrow::array::{Array, Date32Array, Int32Array}; use arrow::datatypes::DataType; -use datafusion::common::{utils::take_function_args, DataFusionError, Result}; +use datafusion::common::{utils::take_function_args, DataFusionError, Result, ScalarValue}; use datafusion::logical_expr::{ ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility, }; @@ -82,20 +82,17 @@ impl ScalarUDFImpl for SparkDateFromUnixDate { Ok(ColumnarValue::Array(Arc::new(date_array))) } - ColumnarValue::Scalar(scalar) => { - // Handle scalar case by converting to single-element array and back - let arr = scalar.to_array()?; - let int_array = arr.as_any().downcast_ref::().ok_or_else(|| { - DataFusionError::Execution( - "date_from_unix_date expects Int32 scalar input".to_string(), - ) - })?; - - let date_array = - Date32Array::new(int_array.values().clone(), int_array.nulls().cloned()); - - Ok(ColumnarValue::Array(Arc::new(date_array))) - } + ColumnarValue::Scalar(scalar) => match scalar { + ScalarValue::Int32(Some(days)) => { + Ok(ColumnarValue::Scalar(ScalarValue::Date32(Some(days)))) + } + ScalarValue::Int32(None) | ScalarValue::Null => { + Ok(ColumnarValue::Scalar(ScalarValue::Date32(None))) + } + _ => Err(DataFusionError::Execution( + 
"date_from_unix_date expects Int32 scalar input".to_string(), + )), + }, } } diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/date_from_unix_date.sql b/spark/src/test/resources/sql-tests/expressions/datetime/date_from_unix_date.sql index caaf18ec02..673c0d544a 100644 --- a/spark/src/test/resources/sql-tests/expressions/datetime/date_from_unix_date.sql +++ b/spark/src/test/resources/sql-tests/expressions/datetime/date_from_unix_date.sql @@ -21,7 +21,7 @@ statement CREATE TABLE test_date_from_unix_date(i int) USING parquet statement -INSERT INTO test_date_from_unix_date VALUES (0), (1), (-1), (18993), (-25567), (NULL) +INSERT INTO test_date_from_unix_date VALUES (0), (1), (-1), (18993), (-25567), (2147483647), (-2147483648), (NULL) query SELECT date_from_unix_date(i) FROM test_date_from_unix_date From 0e1b28dc91ede53c147c2d7372afe7abac57454a Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 14 Apr 2026 13:09:35 -0600 Subject: [PATCH 09/12] cargo fmt --- native/spark-expr/src/lib.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/native/spark-expr/src/lib.rs b/native/spark-expr/src/lib.rs index 2a1dbb0320..963ce62e2b 100644 --- a/native/spark-expr/src/lib.rs +++ b/native/spark-expr/src/lib.rs @@ -72,8 +72,7 @@ pub use comet_scalar_funcs::{ pub use csv_funcs::*; pub use datetime_funcs::{ SparkDateDiff, SparkDateFromUnixDate, SparkDateTrunc, SparkHour, SparkHoursTransform, - SparkMakeDate, SparkMinute, - SparkSecond, SparkUnixTimestamp, TimestampTruncExpr, + SparkMakeDate, SparkMinute, SparkSecond, SparkUnixTimestamp, TimestampTruncExpr, }; pub use error::{decimal_overflow_error, SparkError, SparkErrorWithContext, SparkResult}; pub use hash_funcs::*; From fc69b9a56108c025b09b723774dc13555f17b1ce Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 14 Apr 2026 13:10:29 -0600 Subject: [PATCH 10/12] address feedback --- .../sql-tests/expressions/datetime/date_from_unix_date.sql | 2 -- 1 file changed, 2 deletions(-) diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/date_from_unix_date.sql b/spark/src/test/resources/sql-tests/expressions/datetime/date_from_unix_date.sql index 673c0d544a..b123a50c8a 100644 --- a/spark/src/test/resources/sql-tests/expressions/datetime/date_from_unix_date.sql +++ b/spark/src/test/resources/sql-tests/expressions/datetime/date_from_unix_date.sql @@ -15,8 +15,6 @@ -- specific language governing permissions and limitations -- under the License. --- ConfigMatrix: parquet.enable.dictionary=false,true - statement CREATE TABLE test_date_from_unix_date(i int) USING parquet From 461c3ed47e41c8b8e71cc85ae2f5b2b9af25b558 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 14 Apr 2026 14:03:35 -0600 Subject: [PATCH 11/12] fix: use valid Spark date boundaries instead of INT_MAX/MIN in test Replace INT_MAX (2147483647) and INT_MIN (-2147483648) with Spark's actual date boundaries (-719162 for 0001-01-01 and 2932896 for 9999-12-31) to fix EXPRESSION_DECODING_FAILED error when Spark tries to convert out-of-range dates to Java Date objects. 
--- .../sql-tests/expressions/datetime/date_from_unix_date.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/date_from_unix_date.sql b/spark/src/test/resources/sql-tests/expressions/datetime/date_from_unix_date.sql index b123a50c8a..2e4538c120 100644 --- a/spark/src/test/resources/sql-tests/expressions/datetime/date_from_unix_date.sql +++ b/spark/src/test/resources/sql-tests/expressions/datetime/date_from_unix_date.sql @@ -19,7 +19,7 @@ statement CREATE TABLE test_date_from_unix_date(i int) USING parquet statement -INSERT INTO test_date_from_unix_date VALUES (0), (1), (-1), (18993), (-25567), (2147483647), (-2147483648), (NULL) +INSERT INTO test_date_from_unix_date VALUES (0), (1), (-1), (18993), (-25567), (-719162), (2932896), (NULL) query SELECT date_from_unix_date(i) FROM test_date_from_unix_date From 0d325c7a11434bb30a883c374394f7749a4e6679 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 14 Apr 2026 14:04:02 -0600 Subject: [PATCH 12/12] docs: add comment explaining date boundary values in test --- .../sql-tests/expressions/datetime/date_from_unix_date.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/date_from_unix_date.sql b/spark/src/test/resources/sql-tests/expressions/datetime/date_from_unix_date.sql index 2e4538c120..69a56eedc3 100644 --- a/spark/src/test/resources/sql-tests/expressions/datetime/date_from_unix_date.sql +++ b/spark/src/test/resources/sql-tests/expressions/datetime/date_from_unix_date.sql @@ -18,6 +18,7 @@ statement CREATE TABLE test_date_from_unix_date(i int) USING parquet +-- -719162 = 0001-01-01 (Spark min date), 2932896 = 9999-12-31 (Spark max date) statement INSERT INTO test_date_from_unix_date VALUES (0), (1), (-1), (18993), (-25567), (-719162), (2932896), (NULL)