From f70abf14f80414687087f3753290328d9b667c59 Mon Sep 17 00:00:00 2001 From: Martin Hilton Date: Wed, 29 Oct 2025 15:44:23 +0000 Subject: [PATCH 1/3] fix: correct date_trunc for times before the epoch The array-based implementation of date_trunc can produce incorrect results for negative timestamps (i.e. dates before 1970-01-01). Check for any such incorrect values and compensate accordingly. --- datafusion/functions/src/datetime/date_trunc.rs | 15 +++++++++++++-- datafusion/sqllogictest/test_files/timestamps.slt | 7 +++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/datafusion/functions/src/datetime/date_trunc.rs b/datafusion/functions/src/datetime/date_trunc.rs index 405aabfde9917..b70c8c3cb4cd1 100644 --- a/datafusion/functions/src/datetime/date_trunc.rs +++ b/datafusion/functions/src/datetime/date_trunc.rs @@ -482,9 +482,20 @@ fn general_date_trunc_array_fine_granularity( if let Some(unit) = unit { let original_type = array.data_type(); - let array = arrow::compute::cast(array, &DataType::Int64)?; - let array = arrow::compute::kernels::numeric::div(&array, &unit)?; + let input = arrow::compute::cast(array, &DataType::Int64)?; + let array = arrow::compute::kernels::numeric::div(&input, &unit)?; let array = arrow::compute::kernels::numeric::mul(&array, &unit)?; + // For timestamps before 1970-01-01T00:00:00Z (negative values) + // it is possible that the truncated value is actually later + // than the original value. Correct any such cases by + // subtracting `unit`. + let too_late = arrow::compute::kernels::cmp::gt(&array, &input)?; + let array = if too_late.true_count() > 0 { + let earlier = arrow::compute::kernels::numeric::sub(&array, &unit)?; + arrow::compute::kernels::zip::zip(&too_late, &earlier, &array)? + } else { + array + }; let array = arrow::compute::cast(&array, original_type)?; Ok(array) } else { diff --git a/datafusion/sqllogictest/test_files/timestamps.slt b/datafusion/sqllogictest/test_files/timestamps.slt index 84dd7098a2ee5..3d03bc3263425 100644 --- a/datafusion/sqllogictest/test_files/timestamps.slt +++ b/datafusion/sqllogictest/test_files/timestamps.slt @@ -1687,6 +1687,13 @@ SELECT DATE_TRUNC('second', '2022-08-03 14:38:50Z'); ---- 2022-08-03T14:38:50 +# DATE_TRUNC handling of times before the unix epoch (issue 18334) +query PPP +SELECT d, DATE_TRUNC('hour', d), DATE_TRUNC('hour', TIMESTAMP '1900-06-15 07:09:00') +FROM (VALUES (TIMESTAMP '1900-06-15 07:09:00')) AS t(d); +---- +1900-06-15T07:09:00 1900-06-15T07:00:00 1900-06-15T07:00:00 + # Test that interval can add a timestamp query P SELECT timestamp '2013-07-01 12:00:00' + INTERVAL '8' DAY; From 0bd2f92c3882806c9a00d346ce13533702106e70 Mon Sep 17 00:00:00 2001 From: Martin Hilton Date: Thu, 30 Oct 2025 09:04:05 +0000 Subject: [PATCH 2/3] test: add additional date_trunc cases Add some additional test cases to the DATE_TRUNC logic tests. These were suggested by @sadboy. --- .../sqllogictest/test_files/timestamps.slt | 27 +++++++++++++++---- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/datafusion/sqllogictest/test_files/timestamps.slt b/datafusion/sqllogictest/test_files/timestamps.slt index 3d03bc3263425..250d4e9830e58 100644 --- a/datafusion/sqllogictest/test_files/timestamps.slt +++ b/datafusion/sqllogictest/test_files/timestamps.slt @@ -1688,11 +1688,28 @@ SELECT DATE_TRUNC('second', '2022-08-03 14:38:50Z'); 2022-08-03T14:38:50 # DATE_TRUNC handling of times before the unix epoch (issue 18334) -query PPP -SELECT d, DATE_TRUNC('hour', d), DATE_TRUNC('hour', TIMESTAMP '1900-06-15 07:09:00') -FROM (VALUES (TIMESTAMP '1900-06-15 07:09:00')) AS t(d); ----- -1900-06-15T07:09:00 1900-06-15T07:00:00 1900-06-15T07:00:00 +query PPPPPPPPPPP +SELECT + d, + DATE_TRUNC('year', d), + DATE_TRUNC('quarter', d), + DATE_TRUNC('month', d), + DATE_TRUNC('week', d), + DATE_TRUNC('day', d), + DATE_TRUNC('hour', d), + DATE_TRUNC('minute', d), + DATE_TRUNC('second', d), + DATE_TRUNC('millisecond', d), + DATE_TRUNC('microsecond', d), +FROM (VALUES + (TIMESTAMP '1900-06-15 07:09:00'), + (TIMESTAMP '1970-01-01 00:00:00'), + (TIMESTAMP '2024-12-31 23:39:01.123456789') +) AS t(d); +---- +1900-06-15T07:09:00 1900-01-01T00:00:00 1900-04-01T00:00:00 1900-06-01T00:00:00 1900-06-11T00:00:00 1900-06-15T00:00:00 1900-06-15T07:00:00 1900-06-15T07:09:00 1900-06-15T07:09:00 1900-06-15T07:09:00 1900-06-15T07:09:00 +1970-01-01T00:00:00 1970-01-01T00:00:00 1970-01-01T00:00:00 1970-01-01T00:00:00 1969-12-29T00:00:00 1970-01-01T00:00:00 1970-01-01T00:00:00 1970-01-01T00:00:00 1970-01-01T00:00:00 1970-01-01T00:00:00 1970-01-01T00:00:00 +2024-12-31T23:39:01.123456789 2024-01-01T00:00:00 2024-10-01T00:00:00 2024-12-01T00:00:00 2024-12-30T00:00:00 2024-12-31T00:00:00 2024-12-31T23:00:00 2024-12-31T23:39:00 2024-12-31T23:39:01 2024-12-31T23:39:01.123 2024-12-31T23:39:01.123456 # Test that interval can add a timestamp query P From 587856d282c6e5fadc1cbffc16577c5ec2c357a4 Mon Sep 17 00:00:00 2001 From: Martin Hilton Date: Thu, 30 Oct 2025 10:28:33 +0000 Subject: [PATCH 3/3] feat: make date_trunc fast case faster Apply the review suggestions for @alamb to avoid unnecessary allocations, and from @findepi to perform the calculation directly using rem_euclid. This is a fairly nieve iterator-based implementation, but it is showing a significant speed improvement, probably due to avoiding memory allocations. --- .../functions/src/datetime/date_trunc.rs | 69 +++++++++---------- 1 file changed, 31 insertions(+), 38 deletions(-) diff --git a/datafusion/functions/src/datetime/date_trunc.rs b/datafusion/functions/src/datetime/date_trunc.rs index b70c8c3cb4cd1..543ed8038b2f3 100644 --- a/datafusion/functions/src/datetime/date_trunc.rs +++ b/datafusion/functions/src/datetime/date_trunc.rs @@ -16,6 +16,7 @@ // under the License. use std::any::Any; +use std::num::NonZeroI64; use std::ops::{Add, Sub}; use std::str::FromStr; use std::sync::Arc; @@ -28,7 +29,7 @@ use arrow::array::types::{ ArrowTimestampType, TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, }; -use arrow::array::{Array, ArrayRef, Int64Array, PrimitiveArray}; +use arrow::array::{Array, ArrayRef, PrimitiveArray}; use arrow::datatypes::DataType::{self, Null, Timestamp, Utf8, Utf8View}; use arrow::datatypes::TimeUnit::{self, Microsecond, Millisecond, Nanosecond, Second}; use datafusion_common::cast::as_primitive_array; @@ -456,48 +457,40 @@ fn general_date_trunc_array_fine_granularity( granularity: &str, ) -> Result { let unit = match (tu, granularity) { - (Second, "minute") => Some(Int64Array::new_scalar(60)), - (Second, "hour") => Some(Int64Array::new_scalar(3600)), - (Second, "day") => Some(Int64Array::new_scalar(86400)), - - (Millisecond, "second") => Some(Int64Array::new_scalar(1_000)), - (Millisecond, "minute") => Some(Int64Array::new_scalar(60_000)), - (Millisecond, "hour") => Some(Int64Array::new_scalar(3_600_000)), - (Millisecond, "day") => Some(Int64Array::new_scalar(86_400_000)), - - (Microsecond, "millisecond") => Some(Int64Array::new_scalar(1_000)), - (Microsecond, "second") => Some(Int64Array::new_scalar(1_000_000)), - (Microsecond, "minute") => Some(Int64Array::new_scalar(60_000_000)), - (Microsecond, "hour") => Some(Int64Array::new_scalar(3_600_000_000)), - (Microsecond, "day") => Some(Int64Array::new_scalar(86_400_000_000)), - - (Nanosecond, "microsecond") => Some(Int64Array::new_scalar(1_000)), - (Nanosecond, "millisecond") => Some(Int64Array::new_scalar(1_000_000)), - (Nanosecond, "second") => Some(Int64Array::new_scalar(1_000_000_000)), - (Nanosecond, "minute") => Some(Int64Array::new_scalar(60_000_000_000)), - (Nanosecond, "hour") => Some(Int64Array::new_scalar(3_600_000_000_000)), - (Nanosecond, "day") => Some(Int64Array::new_scalar(86_400_000_000_000)), + (Second, "minute") => NonZeroI64::new(60), + (Second, "hour") => NonZeroI64::new(3600), + (Second, "day") => NonZeroI64::new(86400), + + (Millisecond, "second") => NonZeroI64::new(1_000), + (Millisecond, "minute") => NonZeroI64::new(60_000), + (Millisecond, "hour") => NonZeroI64::new(3_600_000), + (Millisecond, "day") => NonZeroI64::new(86_400_000), + + (Microsecond, "millisecond") => NonZeroI64::new(1_000), + (Microsecond, "second") => NonZeroI64::new(1_000_000), + (Microsecond, "minute") => NonZeroI64::new(60_000_000), + (Microsecond, "hour") => NonZeroI64::new(3_600_000_000), + (Microsecond, "day") => NonZeroI64::new(86_400_000_000), + + (Nanosecond, "microsecond") => NonZeroI64::new(1_000), + (Nanosecond, "millisecond") => NonZeroI64::new(1_000_000), + (Nanosecond, "second") => NonZeroI64::new(1_000_000_000), + (Nanosecond, "minute") => NonZeroI64::new(60_000_000_000), + (Nanosecond, "hour") => NonZeroI64::new(3_600_000_000_000), + (Nanosecond, "day") => NonZeroI64::new(86_400_000_000_000), _ => None, }; if let Some(unit) = unit { - let original_type = array.data_type(); - let input = arrow::compute::cast(array, &DataType::Int64)?; - let array = arrow::compute::kernels::numeric::div(&input, &unit)?; - let array = arrow::compute::kernels::numeric::mul(&array, &unit)?; - // For timestamps before 1970-01-01T00:00:00Z (negative values) - // it is possible that the truncated value is actually later - // than the original value. Correct any such cases by - // subtracting `unit`. - let too_late = arrow::compute::kernels::cmp::gt(&array, &input)?; - let array = if too_late.true_count() > 0 { - let earlier = arrow::compute::kernels::numeric::sub(&array, &unit)?; - arrow::compute::kernels::zip::zip(&too_late, &earlier, &array)? - } else { + let unit = unit.get(); + let array = PrimitiveArray::::from_iter_values_with_nulls( array - }; - let array = arrow::compute::cast(&array, original_type)?; - Ok(array) + .values() + .iter() + .map(|v| *v - i64::rem_euclid(*v, unit)), + array.nulls().cloned(), + ); + Ok(Arc::new(array)) } else { // truncate to the same or smaller unit Ok(Arc::new(array.clone()))