-
Notifications
You must be signed in to change notification settings - Fork 4k
ARROW-9055: [C++] Add sum/mean/minmax kernels for Boolean type #7478
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
ebfa27a
88dd1c7
4673699
2bfa185
db7d4cd
fd8f217
3717230
8f99d16
48da080
03e05bb
bb4f12c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -255,6 +255,28 @@ struct SumState { | |
| } | ||
| }; | ||
|
|
||
| template <> | ||
| struct SumState<BooleanType> { | ||
| using SumType = typename FindAccumulatorType<BooleanType>::Type; | ||
| using ThisType = SumState<BooleanType, SumType>; | ||
|
|
||
| ThisType& operator+=(const ThisType& rhs) { | ||
| this->count += rhs.count; | ||
| this->sum += rhs.sum; | ||
| return *this; | ||
| } | ||
|
|
||
| public: | ||
| void Consume(const Array& input) { | ||
| const BooleanArray& array = static_cast<const BooleanArray&>(input); | ||
| count += array.length() - array.null_count(); | ||
| sum += array.true_count(); | ||
| } | ||
|
|
||
| size_t count = 0; | ||
| typename SumType::c_type sum = 0; | ||
| }; | ||
|
|
||
| template <typename ArrowType> | ||
| struct SumImpl : public ScalarAggregator { | ||
| using ArrayType = typename TypeTraits<ArrowType>::ArrayType; | ||
|
|
@@ -311,6 +333,11 @@ struct SumLikeInit { | |
| return Status::NotImplemented("No sum implemented"); | ||
| } | ||
|
|
||
| Status Visit(const BooleanType&) { | ||
| state.reset(new KernelClass<BooleanType>()); | ||
| return Status::OK(); | ||
| } | ||
|
|
||
| template <typename Type> | ||
| enable_if_number<Type, Status> Visit(const Type&) { | ||
| state.reset(new KernelClass<Type>()); | ||
|
|
@@ -339,13 +366,38 @@ std::unique_ptr<KernelState> MeanInit(KernelContext* ctx, const KernelInitArgs& | |
| template <typename ArrowType, typename Enable = void> | ||
| struct MinMaxState {}; | ||
|
|
||
| template <typename ArrowType> | ||
| struct MinMaxState<ArrowType, enable_if_boolean<ArrowType>> { | ||
| using ThisType = MinMaxState<ArrowType>; | ||
| using T = typename ArrowType::c_type; | ||
|
|
||
| ThisType& operator+=(const ThisType& rhs) { | ||
| this->has_nulls |= rhs.has_nulls; | ||
| this->has_values |= rhs.has_values; | ||
wesm marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| this->min = this->min && rhs.min; | ||
| this->max = this->max || rhs.max; | ||
| return *this; | ||
| } | ||
|
|
||
| void MergeOne(T value) { | ||
| this->min = this->min && value; | ||
| this->max = this->max || value; | ||
| } | ||
|
|
||
| T min = true; | ||
| T max = false; | ||
| bool has_nulls = false; | ||
| bool has_values = false; | ||
| }; | ||
|
|
||
| template <typename ArrowType> | ||
| struct MinMaxState<ArrowType, enable_if_integer<ArrowType>> { | ||
| using ThisType = MinMaxState<ArrowType>; | ||
| using T = typename ArrowType::c_type; | ||
|
|
||
| ThisType& operator+=(const ThisType& rhs) { | ||
| this->has_nulls |= rhs.has_nulls; | ||
| this->has_values |= rhs.has_values; | ||
| this->min = std::min(this->min, rhs.min); | ||
| this->max = std::max(this->max, rhs.max); | ||
| return *this; | ||
|
|
@@ -359,6 +411,7 @@ struct MinMaxState<ArrowType, enable_if_integer<ArrowType>> { | |
| T min = std::numeric_limits<T>::max(); | ||
| T max = std::numeric_limits<T>::min(); | ||
| bool has_nulls = false; | ||
| bool has_values = false; | ||
| }; | ||
|
|
||
| template <typename ArrowType> | ||
|
|
@@ -368,6 +421,7 @@ struct MinMaxState<ArrowType, enable_if_floating_point<ArrowType>> { | |
|
|
||
| ThisType& operator+=(const ThisType& rhs) { | ||
| this->has_nulls |= rhs.has_nulls; | ||
| this->has_values |= rhs.has_values; | ||
| this->min = std::fmin(this->min, rhs.min); | ||
| this->max = std::fmax(this->max, rhs.max); | ||
| return *this; | ||
|
|
@@ -381,6 +435,7 @@ struct MinMaxState<ArrowType, enable_if_floating_point<ArrowType>> { | |
| T min = std::numeric_limits<T>::infinity(); | ||
| T max = -std::numeric_limits<T>::infinity(); | ||
| bool has_nulls = false; | ||
| bool has_values = false; | ||
| }; | ||
|
|
||
| template <typename ArrowType> | ||
|
|
@@ -397,24 +452,26 @@ struct MinMaxImpl : public ScalarAggregator { | |
|
|
||
| ArrayType arr(batch[0].array()); | ||
|
|
||
| local.has_nulls = arr.null_count() > 0; | ||
| const auto null_count = arr.null_count(); | ||
| local.has_nulls = null_count > 0; | ||
| local.has_values = (arr.length() - null_count) > 0; | ||
|
|
||
| if (local.has_nulls && options.null_handling == MinMaxOptions::OUTPUT_NULL) { | ||
| this->state = local; | ||
| return; | ||
| } | ||
|
|
||
| const auto values = arr.raw_values(); | ||
|
||
| if (arr.null_count() > 0) { | ||
| if (local.has_nulls) { | ||
| BitmapReader reader(arr.null_bitmap_data(), arr.offset(), arr.length()); | ||
| for (int64_t i = 0; i < arr.length(); i++) { | ||
| if (reader.IsSet()) { | ||
| local.MergeOne(values[i]); | ||
| local.MergeOne(arr.Value(i)); | ||
| } | ||
| reader.Next(); | ||
| } | ||
| } else { | ||
| for (int64_t i = 0; i < arr.length(); i++) { | ||
| local.MergeOne(values[i]); | ||
| local.MergeOne(arr.Value(i)); | ||
| } | ||
wesm marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } | ||
| this->state = local; | ||
|
|
@@ -429,7 +486,8 @@ struct MinMaxImpl : public ScalarAggregator { | |
| using ScalarType = typename TypeTraits<ArrowType>::ScalarType; | ||
|
|
||
| std::vector<std::shared_ptr<Scalar>> values; | ||
| if (state.has_nulls && options.null_handling == MinMaxOptions::OUTPUT_NULL) { | ||
| if (!state.has_values || | ||
| (state.has_nulls && options.null_handling == MinMaxOptions::OUTPUT_NULL)) { | ||
| // (null, null) | ||
| values = {std::make_shared<ScalarType>(), std::make_shared<ScalarType>()}; | ||
| } else { | ||
|
|
@@ -444,6 +502,33 @@ struct MinMaxImpl : public ScalarAggregator { | |
| MinMaxState<ArrowType> state; | ||
| }; | ||
|
|
||
| struct BooleanMinMaxImpl : public MinMaxImpl<BooleanType> { | ||
| using MinMaxImpl::MinMaxImpl; | ||
|
|
||
| void Consume(KernelContext*, const ExecBatch& batch) override { | ||
| StateType local; | ||
| ArrayType arr(batch[0].array()); | ||
|
|
||
| const auto arr_length = arr.length(); | ||
| const auto null_count = arr.null_count(); | ||
| const auto valid_count = arr_length - null_count; | ||
|
|
||
| local.has_nulls = null_count > 0; | ||
| local.has_values = valid_count > 0; | ||
| if (local.has_nulls && options.null_handling == MinMaxOptions::OUTPUT_NULL) { | ||
| this->state = local; | ||
| return; | ||
| } | ||
|
|
||
| const auto true_count = arr.true_count(); | ||
| const auto false_count = valid_count - true_count; | ||
| local.max = true_count > 0; | ||
| local.min = false_count == 0; | ||
|
|
||
| this->state = local; | ||
| } | ||
| }; | ||
|
|
||
| struct MinMaxInitState { | ||
| std::unique_ptr<KernelState> state; | ||
| KernelContext* ctx; | ||
|
|
@@ -463,6 +548,11 @@ struct MinMaxInitState { | |
| return Status::NotImplemented("No sum implemented"); | ||
| } | ||
|
|
||
| Status Visit(const BooleanType&) { | ||
| state.reset(new BooleanMinMaxImpl(out_type, options)); | ||
| return Status::OK(); | ||
| } | ||
|
|
||
| template <typename Type> | ||
| enable_if_number<Type, Status> Visit(const Type&) { | ||
| state.reset(new MinMaxImpl<Type>(out_type, options)); | ||
|
|
@@ -525,18 +615,21 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) { | |
| DCHECK_OK(registry->AddFunction(std::move(func))); | ||
|
|
||
| func = std::make_shared<ScalarAggregateFunction>("sum", Arity::Unary()); | ||
| AddBasicAggKernels(SumInit, {boolean()}, int64(), func.get()); | ||
| AddBasicAggKernels(SumInit, SignedIntTypes(), int64(), func.get()); | ||
| AddBasicAggKernels(SumInit, UnsignedIntTypes(), uint64(), func.get()); | ||
| AddBasicAggKernels(SumInit, FloatingPointTypes(), float64(), func.get()); | ||
| DCHECK_OK(registry->AddFunction(std::move(func))); | ||
|
|
||
| func = std::make_shared<ScalarAggregateFunction>("mean", Arity::Unary()); | ||
| AddBasicAggKernels(MeanInit, {boolean()}, float64(), func.get()); | ||
| AddBasicAggKernels(MeanInit, NumericTypes(), float64(), func.get()); | ||
| DCHECK_OK(registry->AddFunction(std::move(func))); | ||
|
|
||
| static auto default_minmax_options = MinMaxOptions::Defaults(); | ||
| func = std::make_shared<ScalarAggregateFunction>("minmax", Arity::Unary(), | ||
| &default_minmax_options); | ||
| AddMinMaxKernels(MinMaxInit, {boolean()}, func.get()); | ||
| AddMinMaxKernels(MinMaxInit, NumericTypes(), func.get()); | ||
| DCHECK_OK(registry->AddFunction(std::move(func))); | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -127,6 +127,41 @@ void ValidateSum(const Array& array) { | |
| ValidateSum<ArrowType>(array, NaiveSum<ArrowType>(array)); | ||
| } | ||
|
|
||
| using UnaryOp = Result<Datum>(const Datum&, ExecContext*); | ||
|
|
||
| template <UnaryOp& Op, typename ScalarType> | ||
| void ValidateBooleanAgg(const std::string& json, | ||
| const std::shared_ptr<ScalarType>& expected) { | ||
| auto array = ArrayFromJSON(boolean(), json); | ||
| auto exp = Datum(expected); | ||
| ASSERT_OK_AND_ASSIGN(Datum result, Op(array, nullptr)); | ||
| ASSERT_TRUE(result.Equals(exp)); | ||
| } | ||
|
|
||
| TEST(TestBooleanAggregation, Sum) { | ||
| ValidateBooleanAgg<Sum>("[]", std::make_shared<UInt64Scalar>()); | ||
|
||
| ValidateBooleanAgg<Sum>("[null]", std::make_shared<UInt64Scalar>()); | ||
| ValidateBooleanAgg<Sum>("[null, false]", std::make_shared<UInt64Scalar>(0)); | ||
| ValidateBooleanAgg<Sum>("[true]", std::make_shared<UInt64Scalar>(1)); | ||
| ValidateBooleanAgg<Sum>("[true, false, true]", std::make_shared<UInt64Scalar>(2)); | ||
| ValidateBooleanAgg<Sum>("[true, false, true, true, null]", | ||
| std::make_shared<UInt64Scalar>(3)); | ||
| } | ||
|
|
||
| TEST(TestBooleanAggregation, Mean) { | ||
| ValidateBooleanAgg<Mean>("[]", std::make_shared<DoubleScalar>()); | ||
| ValidateBooleanAgg<Mean>("[null]", std::make_shared<DoubleScalar>()); | ||
| ValidateBooleanAgg<Mean>("[null, false]", std::make_shared<DoubleScalar>(0)); | ||
| ValidateBooleanAgg<Mean>("[true]", std::make_shared<DoubleScalar>(1)); | ||
| ValidateBooleanAgg<Mean>("[true, false, true, false]", | ||
| std::make_shared<DoubleScalar>(0.5)); | ||
| ValidateBooleanAgg<Mean>("[true, null]", std::make_shared<DoubleScalar>(1)); | ||
| ValidateBooleanAgg<Mean>("[true, null, false, true, true]", | ||
| std::make_shared<DoubleScalar>(0.75)); | ||
| ValidateBooleanAgg<Mean>("[true, null, false, false, false]", | ||
| std::make_shared<DoubleScalar>(0.25)); | ||
kszucs marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| template <typename ArrowType> | ||
| class TestNumericSumKernel : public ::testing::Test {}; | ||
|
|
||
|
|
@@ -346,10 +381,10 @@ TYPED_TEST(TestRandomNumericMeanKernel, RandomArrayMean) { | |
| /// | ||
|
|
||
| template <typename ArrowType> | ||
| class TestNumericMinMaxKernel : public ::testing::Test { | ||
| class TestPrimitiveMinMaxKernel : public ::testing::Test { | ||
| using Traits = TypeTraits<ArrowType>; | ||
| using ArrayType = typename Traits::ArrayType; | ||
| using c_type = typename ArrayType::value_type; | ||
| using c_type = typename ArrowType::c_type; | ||
| using ScalarType = typename Traits::ScalarType; | ||
|
|
||
| public: | ||
|
|
@@ -401,15 +436,57 @@ class TestNumericMinMaxKernel : public ::testing::Test { | |
| }; | ||
|
|
||
| template <typename ArrowType> | ||
| class TestFloatingMinMaxKernel : public TestNumericMinMaxKernel<ArrowType> {}; | ||
| class TestIntegerMinMaxKernel : public TestPrimitiveMinMaxKernel<ArrowType> {}; | ||
|
|
||
| template <typename ArrowType> | ||
| class TestFloatingMinMaxKernel : public TestPrimitiveMinMaxKernel<ArrowType> {}; | ||
|
|
||
| class TestBooleanMinMaxKernel : public TestPrimitiveMinMaxKernel<BooleanType> {}; | ||
|
|
||
| TEST_F(TestBooleanMinMaxKernel, Basics) { | ||
| MinMaxOptions options; | ||
| std::vector<std::string> chunked_input0 = {"[]", "[]"}; | ||
| std::vector<std::string> chunked_input1 = {"[true, true, null]", "[true, null]"}; | ||
| std::vector<std::string> chunked_input2 = {"[false, false, false]", "[false]"}; | ||
| std::vector<std::string> chunked_input3 = {"[true, null]", "[null, false]"}; | ||
|
|
||
| // SKIP nulls by default | ||
| this->AssertMinMaxIsNull("[]", options); | ||
| this->AssertMinMaxIsNull("[null, null, null]", options); | ||
| this->AssertMinMaxIs("[false, false, false]", false, false, options); | ||
| this->AssertMinMaxIs("[false, false, false, null]", false, false, options); | ||
| this->AssertMinMaxIs("[true, null, true, true]", true, true, options); | ||
| this->AssertMinMaxIs("[true, null, true, true]", true, true, options); | ||
| this->AssertMinMaxIs("[true, null, false, true]", false, true, options); | ||
| this->AssertMinMaxIsNull(chunked_input0, options); | ||
| this->AssertMinMaxIs(chunked_input1, true, true, options); | ||
| this->AssertMinMaxIs(chunked_input2, false, false, options); | ||
| this->AssertMinMaxIs(chunked_input3, false, true, options); | ||
|
|
||
| options = MinMaxOptions(MinMaxOptions::OUTPUT_NULL); | ||
| this->AssertMinMaxIsNull("[]", options); | ||
| this->AssertMinMaxIsNull("[null, null, null]", options); | ||
| this->AssertMinMaxIsNull("[false, null, false]", options); | ||
| this->AssertMinMaxIsNull("[true, null]", options); | ||
| this->AssertMinMaxIs("[true, true, true]", true, true, options); | ||
| this->AssertMinMaxIs("[false, false]", false, false, options); | ||
| this->AssertMinMaxIs("[false, true]", false, true, options); | ||
| this->AssertMinMaxIsNull(chunked_input0, options); | ||
| this->AssertMinMaxIsNull(chunked_input1, options); | ||
| this->AssertMinMaxIs(chunked_input2, false, false, options); | ||
| this->AssertMinMaxIsNull(chunked_input3, options); | ||
| } | ||
|
|
||
| TYPED_TEST_SUITE(TestNumericMinMaxKernel, IntegralArrowTypes); | ||
| TYPED_TEST(TestNumericMinMaxKernel, Basics) { | ||
| TYPED_TEST_SUITE(TestIntegerMinMaxKernel, IntegralArrowTypes); | ||
| TYPED_TEST(TestIntegerMinMaxKernel, Basics) { | ||
| MinMaxOptions options; | ||
| std::vector<std::string> chunked_input1 = {"[5, 1, 2, 3, 4]", "[9, 1, null, 3, 4]"}; | ||
| std::vector<std::string> chunked_input2 = {"[5, null, 2, 3, 4]", "[9, 1, 2, 3, 4]"}; | ||
| std::vector<std::string> chunked_input3 = {"[5, 1, 2, 3, null]", "[9, 1, null, 3, 4]"}; | ||
|
|
||
| // SKIP nulls by default | ||
| this->AssertMinMaxIsNull("[]", options); | ||
| this->AssertMinMaxIsNull("[null, null, null]", options); | ||
|
||
| this->AssertMinMaxIs("[5, 1, 2, 3, 4]", 1, 5, options); | ||
| this->AssertMinMaxIs("[5, null, 2, 3, 4]", 2, 5, options); | ||
| this->AssertMinMaxIs(chunked_input1, 1, 9, options); | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.