From ace316486515694ddf45f5b9f8e7719f598a68e0 Mon Sep 17 00:00:00 2001 From: Kumar Ujjawal Date: Sat, 27 Dec 2025 18:17:23 +0530 Subject: [PATCH 1/3] fix:Median() encountered integer overflow and truncates integer results --- datafusion/functions-aggregate/src/median.rs | 22 +++++++++++++-- .../sqllogictest/test_files/aggregate.slt | 28 +++++++++++++++++++ 2 files changed, 47 insertions(+), 3 deletions(-) diff --git a/datafusion/functions-aggregate/src/median.rs b/datafusion/functions-aggregate/src/median.rs index 29b8857254dd3..f137ae0801f09 100644 --- a/datafusion/functions-aggregate/src/median.rs +++ b/datafusion/functions-aggregate/src/median.rs @@ -604,9 +604,25 @@ fn calculate_median(values: &mut [T::Native]) -> Option(low); - let median = left_max - .add_wrapping(*high) - .div_wrapping(T::Native::usize_as(2)); + // Calculate median as the average of the two middle values. + // Use checked arithmetic to detect overflow and fall back to safe formula. + let two = T::Native::usize_as(2); + let median = match left_max.add_checked(*high) { + Ok(sum) => sum.div_wrapping(two), + Err(_) => { + // Overflow detected - use safe midpoint formula: + // a/2 + b/2 + ((a%2 + b%2) / 2) + // This avoids overflow by dividing before adding. + let half_left = left_max.div_wrapping(two); + let half_right = (*high).div_wrapping(two); + let rem_left = left_max.mod_wrapping(two); + let rem_right = (*high).mod_wrapping(two); + // The sum of remainders (0, 1, or 2 for unsigned; -2 to 2 for signed) + // divided by 2 gives the correction factor (0 or 1 for unsigned; -1, 0, or 1 for signed) + let correction = rem_left.add_wrapping(rem_right).div_wrapping(two); + half_left.add_wrapping(half_right).add_wrapping(correction) + } + }; Some(median) } else { let (_, median, _) = values.select_nth_unstable_by(len / 2, cmp); diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index a3fab065dc097..400c2f5c1f475 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -990,6 +990,34 @@ SELECT approx_median(col_f64_nan) FROM median_table ---- NaN + +# median_i8_overflow_negative +query I +SELECT median(v) FROM (VALUES (arrow_cast(-85, 'Int8')), (arrow_cast(-56, 'Int8'))) AS t(v); +---- +-70 + +# median_i8_overflow_positive +# Test overflow with positive values: 100 + 120 = 220 > 127 (max i8) +query I +SELECT median(v) FROM (VALUES (arrow_cast(100, 'Int8')), (arrow_cast(120, 'Int8'))) AS t(v); +---- +110 + +# median_u8_overflow +# Test unsigned overflow: 200 + 250 = 450 > 255 (max u8) +query I +SELECT median(v) FROM (VALUES (arrow_cast(200, 'UInt8')), (arrow_cast(250, 'UInt8'))) AS t(v); +---- +225 + +# median_i8_no_overflow_normal_case +# Normal case that doesn't overflow for comparison +query I +SELECT median(v) FROM (VALUES (arrow_cast(4, 'Int8')), (arrow_cast(5, 'Int8'))) AS t(v); +---- +4 + # median_sliding_window statement ok CREATE TABLE median_window_test ( From 041d930cd6b0885788f7c04aa71513ad96900157 Mon Sep 17 00:00:00 2001 From: Kumar Ujjawal Date: Sat, 27 Dec 2025 19:11:50 +0530 Subject: [PATCH 2/3] update insta snapshot --- datafusion/core/tests/dataframe/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs index 7b5e853465a92..c09db371912b0 100644 --- a/datafusion/core/tests/dataframe/mod.rs +++ b/datafusion/core/tests/dataframe/mod.rs @@ -1112,7 +1112,7 @@ async fn window_using_aggregates() -> Result<()> { | -85 | -48 | 6 | -35 | -36 | 83 | -85 | 2 | -43 | | -85 | -5 | 4 | -37 | -40 | -5 | -85 | 1 | 83 | | -85 | -54 | 15 | -17 | -18 | 83 | -101 | 4 | -38 | - | -85 | -56 | 2 | -70 | 57 | -56 | -85 | 1 | -25 | + | -85 | -56 | 2 | -70 | -70 | -56 | -85 | 1 | -25 | | -85 | -72 | 9 | -43 | -43 | 83 | -85 | 3 | -12 | | -85 | -85 | 1 | -85 | -85 | -85 | -85 | 1 | -56 | | -85 | 13 | 11 | -17 | -18 | 83 | -85 | 3 | 14 | From 2a145f8b3b9a074b71127f765a431427bc55e288 Mon Sep 17 00:00:00 2001 From: Kumar Ujjawal Date: Sat, 27 Dec 2025 20:22:28 +0530 Subject: [PATCH 3/3] added more tests for overflow --- .../sqllogictest/test_files/aggregate.slt | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index 400c2f5c1f475..2a4daeb92979d 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -1018,6 +1018,34 @@ SELECT median(v) FROM (VALUES (arrow_cast(4, 'Int8')), (arrow_cast(5, 'Int8'))) ---- 4 +# median_i8_max_values +# Test with both i8::MAX values: 127 + 127 = 254 > 127, overflow +query I +SELECT median(v) FROM (VALUES (arrow_cast(127, 'Int8')), (arrow_cast(127, 'Int8'))) AS t(v); +---- +127 + +# median_i8_min_values +# Test with both i8::MIN values: -128 + -128 = -256 < -128, underflow +query I +SELECT median(v) FROM (VALUES (arrow_cast(-128, 'Int8')), (arrow_cast(-128, 'Int8'))) AS t(v); +---- +-128 + +# median_i8_min_max_values +# Test with i8::MIN and i8::MAX: -128 + 127 = -1, no overflow, median = 0 (truncated from -0.5) +query I +SELECT median(v) FROM (VALUES (arrow_cast(-128, 'Int8')), (arrow_cast(127, 'Int8'))) AS t(v); +---- +0 + +# median_u8_max_values +# Test with both u8::MAX values: 255 + 255 = 510 > 255, overflow +query I +SELECT median(v) FROM (VALUES (arrow_cast(255, 'UInt8')), (arrow_cast(255, 'UInt8'))) AS t(v); +---- +255 + # median_sliding_window statement ok CREATE TABLE median_window_test (