From 44893600bc0f6ef7f447c18fccf8047f92eb0314 Mon Sep 17 00:00:00 2001 From: jonahgao Date: Fri, 16 Aug 2024 00:13:54 +0800 Subject: [PATCH] fix: incorrect aggregation result of `bool_and` --- .../aggregate/groups_accumulator/bool_op.rs | 13 ++++-- .../functions-aggregate/src/bool_and_or.rs | 9 ++-- .../sqllogictest/test_files/aggregate.slt | 45 +++++++++++++++++++ 3 files changed, 60 insertions(+), 7 deletions(-) diff --git a/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/bool_op.rs b/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/bool_op.rs index be2b5e48a8db9..110f7b4c22713 100644 --- a/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/bool_op.rs +++ b/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator/bool_op.rs @@ -46,17 +46,22 @@ where /// Function that computes the output bool_fn: F, + + /// The identity element for the boolean operation. + /// Any value combined with this returns the original value. + identity: bool, } impl BooleanGroupsAccumulator where F: Fn(bool, bool) -> bool + Send + Sync, { - pub fn new(bitop_fn: F) -> Self { + pub fn new(bool_fn: F, identity: bool) -> Self { Self { values: BooleanBufferBuilder::new(0), null_state: NullState::new(), - bool_fn: bitop_fn, + bool_fn, + identity, } } } @@ -77,7 +82,9 @@ where if self.values.len() < total_num_groups { let new_groups = total_num_groups - self.values.len(); - self.values.append_n(new_groups, Default::default()); + // Fill with the identity element, so that when the first non-null value is encountered, + // it will combine with the identity and the result will be the first non-null value itself. + self.values.append_n(new_groups, self.identity); } // NullState dispatches / handles tracking nulls and groups that saw no values diff --git a/datafusion/functions-aggregate/src/bool_and_or.rs b/datafusion/functions-aggregate/src/bool_and_or.rs index b993b2a4979c8..7cc7d9ff7fec3 100644 --- a/datafusion/functions-aggregate/src/bool_and_or.rs +++ b/datafusion/functions-aggregate/src/bool_and_or.rs @@ -151,7 +151,7 @@ impl AggregateUDFImpl for BoolAnd { ) -> Result> { match args.return_type { DataType::Boolean => { - Ok(Box::new(BooleanGroupsAccumulator::new(|x, y| x && y))) + Ok(Box::new(BooleanGroupsAccumulator::new(|x, y| x && y, true))) } _ => not_impl_err!( "GroupsAccumulator not supported for {} with {}", @@ -270,9 +270,10 @@ impl AggregateUDFImpl for BoolOr { args: AccumulatorArgs, ) -> Result> { match args.return_type { - DataType::Boolean => { - Ok(Box::new(BooleanGroupsAccumulator::new(|x, y| x || y))) - } + DataType::Boolean => Ok(Box::new(BooleanGroupsAccumulator::new( + |x, y| x || y, + false, + ))), _ => not_impl_err!( "GroupsAccumulator not supported for {} with {}", args.name, diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index 322ddcdb047b3..46a9ae3ca2f36 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -3724,6 +3724,51 @@ SELECT bool_or(distinct c1), bool_or(distinct c2), bool_or(distinct c3), bool_or ---- true true true false true true false NULL +# Test issue: https://github.com/apache/datafusion/issues/11846 +statement ok +create table t1(v1 int, v2 boolean); + +statement ok +insert into t1 values (1, true), (1, true); + +statement ok +insert into t1 values (3, null), (3, true); + +statement ok +insert into t1 values (2, false), (2, true); + +statement ok +insert into t1 values (6, false), (6, false); + +statement ok +insert into t1 values (4, null), (4, null); + +statement ok +insert into t1 values (5, false), (5, null); + +query IB +select v1, bool_and(v2) from t1 group by v1 order by v1; +---- +1 true +2 false +3 true +4 NULL +5 false +6 false + +query IB +select v1, bool_or(v2) from t1 group by v1 order by v1; +---- +1 true +2 true +3 true +4 NULL +5 false +6 false + +statement ok +drop table t1; + # All supported timestamp types # "nanos" --> TimestampNanosecondArray