From 83e076e7782210efaf799b10d79298ae84ad7edd Mon Sep 17 00:00:00 2001 From: Cyprien Huet Date: Fri, 16 Jan 2026 09:27:48 +0400 Subject: [PATCH 1/2] feat(spark): implement Spark `date_diff` function --- .../spark/src/function/datetime/date_diff.rs | 118 ++++++++++++++++++ datafusion/spark/src/function/datetime/mod.rs | 8 ++ .../test_files/spark/datetime/date_diff.slt | 112 ++++++++++++++--- .../test_files/spark/datetime/datediff.slt | 32 ----- 4 files changed, 223 insertions(+), 47 deletions(-) create mode 100644 datafusion/spark/src/function/datetime/date_diff.rs delete mode 100644 datafusion/sqllogictest/test_files/spark/datetime/datediff.slt diff --git a/datafusion/spark/src/function/datetime/date_diff.rs b/datafusion/spark/src/function/datetime/date_diff.rs new file mode 100644 index 0000000000000..bcf413a1fc76f --- /dev/null +++ b/datafusion/spark/src/function/datetime/date_diff.rs @@ -0,0 +1,118 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::sync::Arc; + +use arrow::datatypes::{DataType, Field, FieldRef}; +use datafusion_common::types::{NativeType, logical_date, logical_string}; +use datafusion_common::utils::take_function_args; +use datafusion_common::{Result, internal_err}; +use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyContext}; +use datafusion_expr::{ + Coercion, ColumnarValue, Expr, ExprSchemable, Operator, ReturnFieldArgs, + ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignatureClass, Volatility, + binary_expr, +}; + +/// +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct SparkDateDiff { + signature: Signature, + aliases: Vec, +} + +impl Default for SparkDateDiff { + fn default() -> Self { + Self::new() + } +} + +impl SparkDateDiff { + pub fn new() -> Self { + Self { + signature: Signature::coercible( + vec![ + Coercion::new_implicit( + TypeSignatureClass::Native(logical_date()), + vec![ + TypeSignatureClass::Native(logical_string()), + TypeSignatureClass::Timestamp, + ], + NativeType::Date, + ), + Coercion::new_implicit( + TypeSignatureClass::Native(logical_date()), + vec![ + TypeSignatureClass::Native(logical_string()), + TypeSignatureClass::Timestamp, + ], + NativeType::Date, + ), + ], + Volatility::Immutable, + ), + aliases: vec!["datediff".to_string()], + } + } +} + +impl ScalarUDFImpl for SparkDateDiff { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "date_diff" + } + + fn aliases(&self) -> &[String] { + &self.aliases + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + internal_err!("return_field_from_args should be used instead") + } + + fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result { + let nullable = args.arg_fields.iter().any(|f| f.is_nullable()); + Ok(Arc::new(Field::new(self.name(), DataType::Int32, nullable))) + } + + fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result { + internal_err!( + "spark date_diff should have been simplified to standard subtraction" + ) + } + + fn simplify( + &self, + args: Vec, + info: &SimplifyContext, + ) -> Result { + let [end, start] = take_function_args(self.name(), args)?; + Ok(ExprSimplifyResult::Simplified(binary_expr( + end.cast_to(&DataType::Int32, info.schema())?, + Operator::Minus, + start.cast_to(&DataType::Int32, info.schema())?, + ))) + } +} diff --git a/datafusion/spark/src/function/datetime/mod.rs b/datafusion/spark/src/function/datetime/mod.rs index 849aa20895990..8054281cbd89a 100644 --- a/datafusion/spark/src/function/datetime/mod.rs +++ b/datafusion/spark/src/function/datetime/mod.rs @@ -16,6 +16,7 @@ // under the License. pub mod date_add; +pub mod date_diff; pub mod date_sub; pub mod extract; pub mod last_day; @@ -36,6 +37,7 @@ make_udf_function!(last_day::SparkLastDay, last_day); make_udf_function!(make_dt_interval::SparkMakeDtInterval, make_dt_interval); make_udf_function!(make_interval::SparkMakeInterval, make_interval); make_udf_function!(next_day::SparkNextDay, next_day); +make_udf_function!(date_diff::SparkDateDiff, date_diff); pub mod expr_fn { use datafusion_functions::export_functions; @@ -83,6 +85,11 @@ pub mod expr_fn { "Returns the first date which is later than start_date and named as indicated. The function returns NULL if at least one of the input parameters is NULL.", arg1 arg2 )); + export_functions!(( + date_diff, + "Returns the number of days from start `start` to end `end`.", + end start + )); } pub fn functions() -> Vec> { @@ -96,5 +103,6 @@ pub fn functions() -> Vec> { make_dt_interval(), make_interval(), next_day(), + date_diff(), ] } diff --git a/datafusion/sqllogictest/test_files/spark/datetime/date_diff.slt b/datafusion/sqllogictest/test_files/spark/datetime/date_diff.slt index c5871ab41e183..fd7a2b5c6b93c 100644 --- a/datafusion/sqllogictest/test_files/spark/datetime/date_diff.slt +++ b/datafusion/sqllogictest/test_files/spark/datetime/date_diff.slt @@ -15,18 +15,100 @@ # specific language governing permissions and limitations # under the License. -# This file was originally created by a porting script from: -# https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function -# This file is part of the implementation of the datafusion-spark function library. -# For more information, please see: -# https://github.com/apache/datafusion/issues/15914 - -## Original Query: SELECT date_diff('2009-07-30', '2009-07-31'); -## PySpark 3.5.5 Result: {'date_diff(2009-07-30, 2009-07-31)': -1, 'typeof(date_diff(2009-07-30, 2009-07-31))': 'int', 'typeof(2009-07-30)': 'string', 'typeof(2009-07-31)': 'string'} -#query -#SELECT date_diff('2009-07-30'::string, '2009-07-31'::string); - -## Original Query: SELECT date_diff('2009-07-31', '2009-07-30'); -## PySpark 3.5.5 Result: {'date_diff(2009-07-31, 2009-07-30)': 1, 'typeof(date_diff(2009-07-31, 2009-07-30))': 'int', 'typeof(2009-07-31)': 'string', 'typeof(2009-07-30)': 'string'} -#query -#SELECT date_diff('2009-07-31'::string, '2009-07-30'::string); +# date input +query I +SELECT date_diff('2009-07-30'::date, '2009-07-31'::date); +---- +-1 + +query I +SELECT date_diff('2009-07-31'::date, '2009-07-30'::date); +---- +1 + +# Same date returns 0 +query I +SELECT date_diff('2009-07-30'::date, '2009-07-30'::date); +---- +0 + +# Large difference +query I +SELECT date_diff('2020-01-01'::date, '1970-01-01'::date); +---- +18262 + +# timestamp input +query I +SELECT date_diff('2009-07-30 12:34:56'::timestamp, '2009-07-31 23:45:01'::timestamp); +---- +-1 + +query I +SELECT date_diff('2009-07-31 23:45:01'::timestamp, '2009-07-30 12:34:56'::timestamp); +---- +1 + +# string input +query I +SELECT date_diff('2009-07-30', '2009-07-31'); +---- +-1 + +query I +SELECT date_diff('2009-07-31', '2009-07-30'); +---- +1 + +# NULL handling +query I +SELECT date_diff(NULL::date, '2009-07-30'::date); +---- +NULL + +query I +SELECT date_diff('2009-07-31'::date, NULL::date); +---- +NULL + +query I +SELECT date_diff(NULL::date, NULL::date); +---- +NULL + +query I +SELECT date_diff(column1, column2) +FROM VALUES +('2009-07-30'::date, '2009-07-31'::date), +('2009-07-31'::date, '2009-07-30'::date), +(NULL::date, '2009-07-30'::date), +('2009-07-31'::date, NULL::date), +(NULL::date, NULL::date); +---- +-1 +1 +NULL +NULL +NULL + + +# Alias datediff +query I +SELECT datediff('2009-07-30'::date, '2009-07-31'::date); +---- +-1 + +query I +SELECT datediff(column1, column2) +FROM VALUES +('2009-07-30'::date, '2009-07-31'::date), +('2009-07-31'::date, '2009-07-30'::date), +(NULL::date, '2009-07-30'::date), +('2009-07-31'::date, NULL::date), +(NULL::date, NULL::date); +---- +-1 +1 +NULL +NULL +NULL diff --git a/datafusion/sqllogictest/test_files/spark/datetime/datediff.slt b/datafusion/sqllogictest/test_files/spark/datetime/datediff.slt deleted file mode 100644 index 223e2c313ae86..0000000000000 --- a/datafusion/sqllogictest/test_files/spark/datetime/datediff.slt +++ /dev/null @@ -1,32 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# This file was originally created by a porting script from: -# https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function -# This file is part of the implementation of the datafusion-spark function library. -# For more information, please see: -# https://github.com/apache/datafusion/issues/15914 - -## Original Query: SELECT datediff('2009-07-30', '2009-07-31'); -## PySpark 3.5.5 Result: {'datediff(2009-07-30, 2009-07-31)': -1, 'typeof(datediff(2009-07-30, 2009-07-31))': 'int', 'typeof(2009-07-30)': 'string', 'typeof(2009-07-31)': 'string'} -#query -#SELECT datediff('2009-07-30'::string, '2009-07-31'::string); - -## Original Query: SELECT datediff('2009-07-31', '2009-07-30'); -## PySpark 3.5.5 Result: {'datediff(2009-07-31, 2009-07-30)': 1, 'typeof(datediff(2009-07-31, 2009-07-30))': 'int', 'typeof(2009-07-31)': 'string', 'typeof(2009-07-30)': 'string'} -#query -#SELECT datediff('2009-07-31'::string, '2009-07-30'::string); From 2668fee946cfb25812c69af77565da58442c0b75 Mon Sep 17 00:00:00 2001 From: Cyprien Huet Date: Sat, 17 Jan 2026 12:12:23 +0400 Subject: [PATCH 2/2] feat(spark): enhance `date_diff` function to support various input types including string and timestamp --- .../spark/src/function/datetime/date_diff.rs | 13 ++++--- .../test_files/spark/datetime/date_diff.slt | 38 +++++++++++++++++++ 2 files changed, 45 insertions(+), 6 deletions(-) diff --git a/datafusion/spark/src/function/datetime/date_diff.rs b/datafusion/spark/src/function/datetime/date_diff.rs index bcf413a1fc76f..094c35eec56b5 100644 --- a/datafusion/spark/src/function/datetime/date_diff.rs +++ b/datafusion/spark/src/function/datetime/date_diff.rs @@ -99,7 +99,7 @@ impl ScalarUDFImpl for SparkDateDiff { fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result { internal_err!( - "spark date_diff should have been simplified to standard subtraction" + "Apache Spark `date_diff` should have been simplified to standard subtraction" ) } @@ -109,10 +109,11 @@ impl ScalarUDFImpl for SparkDateDiff { info: &SimplifyContext, ) -> Result { let [end, start] = take_function_args(self.name(), args)?; - Ok(ExprSimplifyResult::Simplified(binary_expr( - end.cast_to(&DataType::Int32, info.schema())?, - Operator::Minus, - start.cast_to(&DataType::Int32, info.schema())?, - ))) + let end = end.cast_to(&DataType::Date32, info.schema())?; + let start = start.cast_to(&DataType::Date32, info.schema())?; + Ok(ExprSimplifyResult::Simplified( + binary_expr(end, Operator::Minus, start) + .cast_to(&DataType::Int32, info.schema())?, + )) } } diff --git a/datafusion/sqllogictest/test_files/spark/datetime/date_diff.slt b/datafusion/sqllogictest/test_files/spark/datetime/date_diff.slt index fd7a2b5c6b93c..b0952d6a43510 100644 --- a/datafusion/sqllogictest/test_files/spark/datetime/date_diff.slt +++ b/datafusion/sqllogictest/test_files/spark/datetime/date_diff.slt @@ -26,6 +26,39 @@ SELECT date_diff('2009-07-31'::date, '2009-07-30'::date); ---- 1 +query I +SELECT date_diff('2009-07-31'::string, '2009-07-30'::date); +---- +1 + +query I +SELECT date_diff('2009-07-31'::timestamp, '2009-07-30'::date); +---- +1 + +# Date64 input +query I +SELECT date_diff(arrow_cast('2009-07-31', 'Date64'), arrow_cast('2009-07-30', 'Date64')); +---- +1 + +query I +SELECT date_diff(arrow_cast('2009-07-30', 'Date64'), arrow_cast('2009-07-31', 'Date64')); +---- +-1 + +# Mixed Date32 and Date64 input +query I +SELECT date_diff('2009-07-31'::date, arrow_cast('2009-07-30', 'Date64')); +---- +1 + +query I +SELECT date_diff(arrow_cast('2009-07-31', 'Date64'), '2009-07-30'::date); +---- +1 + + # Same date returns 0 query I SELECT date_diff('2009-07-30'::date, '2009-07-30'::date); @@ -49,6 +82,11 @@ SELECT date_diff('2009-07-31 23:45:01'::timestamp, '2009-07-30 12:34:56'::timest ---- 1 +query I +SELECT date_diff('2009-07-31 23:45:01'::string, '2009-07-30 12:34:56'::timestamp); +---- +1 + # string input query I SELECT date_diff('2009-07-30', '2009-07-31');