From cdfb05462b3926297742a4f34ed1850de755f182 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sun, 27 Sep 2020 22:14:30 -0400 Subject: [PATCH 01/18] ARROW-1846: [C++] Implement "any" reduction kernel for boolean data --- cpp/src/arrow/compute/api_aggregate.cc | 8 +- cpp/src/arrow/compute/api_aggregate.h | 17 +++- .../arrow/compute/kernels/aggregate_basic.cc | 20 +++++ .../kernels/aggregate_basic_internal.h | 57 +++++++++++++ .../arrow/compute/kernels/aggregate_test.cc | 83 +++++++++++++++++++ docs/source/cpp/compute.rst | 3 + python/pyarrow/tests/test_compute.py | 9 ++ 7 files changed, 194 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/compute/api_aggregate.cc b/cpp/src/arrow/compute/api_aggregate.cc index dde5118ceae..ea0e2e977b4 100644 --- a/cpp/src/arrow/compute/api_aggregate.cc +++ b/cpp/src/arrow/compute/api_aggregate.cc @@ -41,8 +41,12 @@ Result MinMax(const Datum& value, const MinMaxOptions& options, ExecConte return CallFunction("min_max", {value}, &options, ctx); } -Result Mode(const Datum& value, const ModeOptions& options, ExecContext* ctx) { - return CallFunction("mode", {value}, &options, ctx); +Result Any(const Datum& value, ExecContext* ctx) { + return CallFunction("any", {value}, ctx); +} + +Result Mode(const Datum& value, ExecContext* ctx) { + return CallFunction("mode", {value}, ctx); } Result Stddev(const Datum& value, const VarianceOptions& options, diff --git a/cpp/src/arrow/compute/api_aggregate.h b/cpp/src/arrow/compute/api_aggregate.h index 991af5b9721..19d4acfc678 100644 --- a/cpp/src/arrow/compute/api_aggregate.h +++ b/cpp/src/arrow/compute/api_aggregate.h @@ -154,7 +154,22 @@ Result MinMax(const Datum& value, const MinMaxOptions& options = MinMaxOptions::Defaults(), ExecContext* ctx = NULLPTR); -/// \brief Calculate the modal (most common) values of a numeric array +/// \brief Test whether any element in a boolean array evaluates to true. +/// +/// This function returns true if any of the elements in the array evaluates +/// to true and false otherwise. Null values are taken to evaluate to false. +/// +/// \param[in] value input datum, expecting a boolean array +/// \param[in] ctx the function execution context, optional +/// \return resulting datum as a BooleanScalar + +/// \since 2.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Any(const Datum& value, ExecContext* ctx = NULLPTR); + + +/// \brief Calculate the modal (most common) value of a numeric array /// /// This function returns top-n most common values and number of times they occur as /// an array of `struct`, where T is the input type. diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 11c1e2b1730..99e709e0c38 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -173,6 +173,12 @@ void AddMinMaxKernels(KernelInit init, } } +void AddAnyKernel(KernelInit init, ScalarAggregateFunction* func) { + auto sig = + KernelSignature::Make({InputType::Array(boolean())}, ValueDescr::Scalar(boolean())); + AddAggKernel(std::move(sig), init, func); +} + } // namespace aggregate namespace internal { @@ -268,6 +274,20 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) { #endif DCHECK_OK(registry->AddFunction(std::move(func))); +<<<<<<< HEAD +======= + + // any + func = std::make_shared("any", Arity::Unary(), + &default_minmax_options); + aggregate::AddAnyKernel(aggregate::AnyInit, func.get()); + + DCHECK_OK(registry->AddFunction(std::move(func))); + + DCHECK_OK(registry->AddFunction(aggregate::AddModeAggKernels())); + DCHECK_OK(registry->AddFunction(aggregate::AddStddevAggKernels())); + DCHECK_OK(registry->AddFunction(aggregate::AddVarianceAggKernels())); +>>>>>>> 34f3bbad9 (ARROW-1846: [C++] Implement "any" reduction kernel for boolean data) } } // namespace internal diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index 733e6d1d0a6..1365c276c12 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -535,6 +535,36 @@ struct BooleanMinMaxImpl : public MinMaxImpl { } }; +template +struct BooleanAnyImpl : public MinMaxImpl { + using StateType = MinMaxState; + using ArrayType = typename TypeTraits::ArrayType; + using MinMaxImpl::MinMaxImpl; + + void Consume(KernelContext*, const ExecBatch& batch) override { + // short-circuit if seen a True already + if (this->state.max == true) { + return; + } + + ArrayType arr(batch[0].array()); + const auto true_count = arr.true_count(); + if (true_count > 0) { + this->state.max = true; + } + } + + void Finalize(KernelContext*, Datum* out) override { + using ScalarType = typename TypeTraits::ScalarType; + + if (this->state.max == true) { + out->value = std::make_shared(true); + } else { + out->value = std::make_shared(false); + } + } +}; + template struct MinMaxInitState { std::unique_ptr state; @@ -572,6 +602,33 @@ struct MinMaxInitState { } }; +template +struct AnyInitState { + std::unique_ptr state; + KernelContext* ctx; + const DataType& in_type; + const std::shared_ptr& out_type; + const MinMaxOptions& options; + + AnyInitState(KernelContext* ctx, const DataType& in_type, + const std::shared_ptr& out_type, const MinMaxOptions& options) + : ctx(ctx), in_type(in_type), out_type(out_type), options(options) {} + + Status Visit(const DataType&) { + return Status::NotImplemented("No any kernel implemented"); + } + + Status Visit(const BooleanType&) { + state.reset(new BooleanAnyImpl(out_type, options)); + return Status::OK(); + } + + std::unique_ptr Create() { + ctx->SetStatus(VisitTypeInline(in_type, this)); + return std::move(state); + } +}; + } // namespace aggregate } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index ad2633b4f97..d1477306571 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -727,6 +727,89 @@ TYPED_TEST(TestRandomNumericMinMaxKernel, RandomArrayMinMax) { } } +// +// Any +// + +class TestPrimitiveAnyKernel : public ::testing::Test { + public: + void AssertAnyIs(const Datum& array, bool expected) { + ASSERT_OK_AND_ASSIGN(Datum out, Any(array)); + const BooleanScalar& out_any = out.scalar_as(); + const auto expected_any = static_cast(expected); + ASSERT_EQ(out_any, expected_any); + } + + void AssertAnyIs(const std::string& json, bool expected) { + auto array = ArrayFromJSON(type_singleton(), json); + AssertAnyIs(array, expected); + } + + void AssertAnyIs(const std::vector& json, bool expected) { + auto array = ChunkedArrayFromJSON(type_singleton(), json); + AssertAnyIs(array, expected); + } + + std::shared_ptr type_singleton() { + return TypeTraits::type_singleton(); + } +}; + +class TestAnyKernel : public TestPrimitiveAnyKernel {}; + +TEST_F(TestAnyKernel, Basics) { + std::vector chunked_input0 = {"[]", "[true]"}; + std::vector chunked_input1 = {"[true, true, null]", "[true, null]"}; + std::vector chunked_input2 = {"[false, false, false]", "[false]"}; + std::vector chunked_input3 = {"[false, null]", "[null, false]"}; + + this->AssertAnyIs("[false]", false); + this->AssertAnyIs("[true, false]", true); + this->AssertAnyIs("[null, null, null]", false); + this->AssertAnyIs("[false, false, false]", false); + this->AssertAnyIs("[false, false, false, null]", false); + this->AssertAnyIs("[true, null, true, true]", true); + this->AssertAnyIs("[false, null, false, true]", true); + this->AssertAnyIs("[true, null, false, true]", true); + this->AssertAnyIs(chunked_input0, true); + this->AssertAnyIs(chunked_input1, true); + this->AssertAnyIs(chunked_input2, false); + this->AssertAnyIs(chunked_input3, false); +} + +static bool NaiveAny(const Array& array) { + const auto& array_numeric = checked_cast(array); + const auto true_count = array_numeric.true_count(); + return true_count > 0; +} + +void ValidateAny(const Array& array) { + ASSERT_OK_AND_ASSIGN(Datum out, Any(array)); + const BooleanScalar& out_any = out.scalar_as(); + + bool expected = NaiveAny(array); + BooleanScalar expected_any = static_cast(expected); + + ASSERT_EQ(out_any, expected_any); +} + +class TestRandomBooleanAnyKernel : public ::testing::Test {}; + +TEST_F(TestRandomBooleanAnyKernel, RandomArrayAny) { + auto rand = random::RandomArrayGenerator(0x8afc055); + // Test size up to 1<<11 (2048). + for (size_t i = 3; i < 12; i += 2) { + for (auto null_probability : {0.0, 0.01, 0.1, 0.5, 0.99, 1.0}) { + int64_t base_length = (1UL << i) + 2; + auto array = rand.Boolean(base_length, null_probability, null_probability); + for (auto length_adjust : {-2, -1, 0, 1, 2}) { + int64_t length = (1UL << i) + length_adjust; + ValidateAny(*array->Slice(0, length)); + } + } + } +} + // // Mode // diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index c1d3ac7e61b..1f87ea4c251 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -134,6 +134,9 @@ Aggregations +--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ | Function name | Arity | Input types | Output type | Options class | +==========================+============+====================+=======================+============================================+ ++--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ +| any | Unary | Boolean | Scalar Boolean | | ++--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ | count | Unary | Any | Scalar Int64 | :struct:`CountOptions` | +--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ | mean | Unary | Numeric | Scalar Float64 | | diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 981121a3672..3ff73ec226a 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -350,6 +350,15 @@ def test_min_max(): s = pc.min_max() +def test_any(): + # ARROW-1846 + a = pa.array([False, None, True]) + assert pc.any(a).as_py() is True + + a = pa.array([False, None, False]) + assert pc.any(a).as_py() is False + + def test_is_valid(): # An example generated function wrapper without options data = [4, 5, None] From 1926af3c7f4d33f7a123838c0e025771f89ac0b7 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 29 Sep 2020 13:54:01 -0400 Subject: [PATCH 02/18] feedback --- .../kernels/aggregate_basic_internal.h | 12 ++----- .../arrow/compute/kernels/aggregate_test.cc | 33 ------------------- 2 files changed, 2 insertions(+), 43 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index 1365c276c12..34830bd4429 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -537,8 +537,6 @@ struct BooleanMinMaxImpl : public MinMaxImpl { template struct BooleanAnyImpl : public MinMaxImpl { - using StateType = MinMaxState; - using ArrayType = typename TypeTraits::ArrayType; using MinMaxImpl::MinMaxImpl; void Consume(KernelContext*, const ExecBatch& batch) override { @@ -547,7 +545,7 @@ struct BooleanAnyImpl : public MinMaxImpl { return; } - ArrayType arr(batch[0].array()); + BooleanArray arr(batch[0].array()); const auto true_count = arr.true_count(); if (true_count > 0) { this->state.max = true; @@ -555,13 +553,7 @@ struct BooleanAnyImpl : public MinMaxImpl { } void Finalize(KernelContext*, Datum* out) override { - using ScalarType = typename TypeTraits::ScalarType; - - if (this->state.max == true) { - out->value = std::make_shared(true); - } else { - out->value = std::make_shared(false); - } + out->value = std::make_shared(this->state.max); } }; diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index d1477306571..3d2aa8462a3 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -777,39 +777,6 @@ TEST_F(TestAnyKernel, Basics) { this->AssertAnyIs(chunked_input3, false); } -static bool NaiveAny(const Array& array) { - const auto& array_numeric = checked_cast(array); - const auto true_count = array_numeric.true_count(); - return true_count > 0; -} - -void ValidateAny(const Array& array) { - ASSERT_OK_AND_ASSIGN(Datum out, Any(array)); - const BooleanScalar& out_any = out.scalar_as(); - - bool expected = NaiveAny(array); - BooleanScalar expected_any = static_cast(expected); - - ASSERT_EQ(out_any, expected_any); -} - -class TestRandomBooleanAnyKernel : public ::testing::Test {}; - -TEST_F(TestRandomBooleanAnyKernel, RandomArrayAny) { - auto rand = random::RandomArrayGenerator(0x8afc055); - // Test size up to 1<<11 (2048). - for (size_t i = 3; i < 12; i += 2) { - for (auto null_probability : {0.0, 0.01, 0.1, 0.5, 0.99, 1.0}) { - int64_t base_length = (1UL << i) + 2; - auto array = rand.Boolean(base_length, null_probability, null_probability); - for (auto length_adjust : {-2, -1, 0, 1, 2}) { - int64_t length = (1UL << i) + length_adjust; - ValidateAny(*array->Slice(0, length)); - } - } - } -} - // // Mode // From f7f394e337b8e8cde799463272daf1a725d3c7d9 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Mon, 12 Oct 2020 01:14:55 -0400 Subject: [PATCH 03/18] add test for empty case --- cpp/src/arrow/compute/kernels/aggregate_test.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index 3d2aa8462a3..f9c974165fe 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -763,6 +763,7 @@ TEST_F(TestAnyKernel, Basics) { std::vector chunked_input2 = {"[false, false, false]", "[false]"}; std::vector chunked_input3 = {"[false, null]", "[null, false]"}; + this->AssertAnyIs("[]", false); this->AssertAnyIs("[false]", false); this->AssertAnyIs("[true, false]", true); this->AssertAnyIs("[null, null, null]", false); From f01f079eb15b2184342ec84c2bf3af08ada7c3cb Mon Sep 17 00:00:00 2001 From: arw2019 Date: Mon, 12 Oct 2020 21:40:09 -0400 Subject: [PATCH 04/18] feedback: aggressive short-circuiting --- .../compute/kernels/aggregate_basic_internal.h | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index 34830bd4429..4a825f1bf8e 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -545,10 +545,17 @@ struct BooleanAnyImpl : public MinMaxImpl { return; } - BooleanArray arr(batch[0].array()); - const auto true_count = arr.true_count(); - if (true_count > 0) { - this->state.max = true; + const auto& data = *batch[0].array(); + arrow::internal::OptionalBinaryBitBlockCounter counter( + data.buffers[0], data.offset, data.buffers[1], data.offset, data.length); + int64_t position = 0; + while (position < data.length) { + const auto block = counter.NextAndBlock(); + if (block.popcount > 0) { + this->state.max = true; + break; + } + position += block.length; } } From 59524439527f9505bfbd61870c2e313f5bbba2d5 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 13 Oct 2020 11:58:00 -0400 Subject: [PATCH 05/18] feedback: simplify AnyImpl --- .../arrow/compute/kernels/aggregate_basic.cc | 57 +++++++++++++++++-- .../kernels/aggregate_basic_internal.h | 56 ------------------ 2 files changed, 53 insertions(+), 60 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 99e709e0c38..5d7af0ba865 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -151,6 +151,56 @@ std::unique_ptr MinMaxInit(KernelContext* ctx, const KernelInitArgs return visitor.Create(); } +// ---------------------------------------------------------------------- +// Any implementation + + +struct BooleanAnyImpl: public ScalarAggregator { + + void Consume(KernelContext*, const ExecBatch& batch) override { + // short-circuit if seen a True already + if (this->max == true) { + return; + } + + const auto& data = *batch[0].array(); + arrow::internal::OptionalBinaryBitBlockCounter counter( + data.buffers[0], data.offset, data.buffers[1], data.offset, data.length); + int64_t position = 0; + while (position < data.length) { + const auto block = counter.NextAndBlock(); + if (block.popcount > 0) { + this->max = true; + break; + } + position += block.length; + } + } + + void MergeFrom(KernelContext*, KernelState&& src) override { + const auto& other = checked_cast(src); + this->max += other.max; + } + + void Finalize(KernelContext*, Datum* out) override { + out->value = std::make_shared(this->max); + } + bool max = false; +}; + +std::unique_ptr AnyInit(KernelContext*, const KernelInitArgs& args) { + return ::arrow::internal::make_unique(); +} + +void AddAggKernel(std::shared_ptr sig, KernelInit init, + ScalarAggregateFunction* func, SimdLevel::type simd_level) { + ScalarAggregateKernel kernel(std::move(sig), init, AggregateConsume, AggregateMerge, + AggregateFinalize); + // Set the simd level + kernel.simd_level = simd_level; + DCHECK_OK(func->AddKernel(kernel)); +} + void AddBasicAggKernels(KernelInit init, const std::vector>& types, std::shared_ptr out_ty, ScalarAggregateFunction* func, @@ -278,10 +328,9 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) { ======= // any - func = std::make_shared("any", Arity::Unary(), - &default_minmax_options); - aggregate::AddAnyKernel(aggregate::AnyInit, func.get()); - + func = std::make_shared("any", Arity::Unary()); + auto sig = KernelSignature::Make({InputType::Array(boolean())}, ValueDescr::Scalar(boolean())); + aggregate::AddAggKernel(std::move(sig), aggregate::AnyInit, func.get()); DCHECK_OK(registry->AddFunction(std::move(func))); DCHECK_OK(registry->AddFunction(aggregate::AddModeAggKernels())); diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index 4a825f1bf8e..733e6d1d0a6 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -535,35 +535,6 @@ struct BooleanMinMaxImpl : public MinMaxImpl { } }; -template -struct BooleanAnyImpl : public MinMaxImpl { - using MinMaxImpl::MinMaxImpl; - - void Consume(KernelContext*, const ExecBatch& batch) override { - // short-circuit if seen a True already - if (this->state.max == true) { - return; - } - - const auto& data = *batch[0].array(); - arrow::internal::OptionalBinaryBitBlockCounter counter( - data.buffers[0], data.offset, data.buffers[1], data.offset, data.length); - int64_t position = 0; - while (position < data.length) { - const auto block = counter.NextAndBlock(); - if (block.popcount > 0) { - this->state.max = true; - break; - } - position += block.length; - } - } - - void Finalize(KernelContext*, Datum* out) override { - out->value = std::make_shared(this->state.max); - } -}; - template struct MinMaxInitState { std::unique_ptr state; @@ -601,33 +572,6 @@ struct MinMaxInitState { } }; -template -struct AnyInitState { - std::unique_ptr state; - KernelContext* ctx; - const DataType& in_type; - const std::shared_ptr& out_type; - const MinMaxOptions& options; - - AnyInitState(KernelContext* ctx, const DataType& in_type, - const std::shared_ptr& out_type, const MinMaxOptions& options) - : ctx(ctx), in_type(in_type), out_type(out_type), options(options) {} - - Status Visit(const DataType&) { - return Status::NotImplemented("No any kernel implemented"); - } - - Status Visit(const BooleanType&) { - state.reset(new BooleanAnyImpl(out_type, options)); - return Status::OK(); - } - - std::unique_ptr Create() { - ctx->SetStatus(VisitTypeInline(in_type, this)); - return std::move(state); - } -}; - } // namespace aggregate } // namespace compute } // namespace arrow From 4cd692afa89fbd312000a904e99392b500e4ffb9 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 13 Oct 2020 12:29:27 -0400 Subject: [PATCH 06/18] update docstring --- cpp/src/arrow/compute/api_aggregate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/api_aggregate.h b/cpp/src/arrow/compute/api_aggregate.h index 19d4acfc678..749db8315b6 100644 --- a/cpp/src/arrow/compute/api_aggregate.h +++ b/cpp/src/arrow/compute/api_aggregate.h @@ -157,7 +157,7 @@ Result MinMax(const Datum& value, /// \brief Test whether any element in a boolean array evaluates to true. /// /// This function returns true if any of the elements in the array evaluates -/// to true and false otherwise. Null values are taken to evaluate to false. +/// to true and false otherwise. Null values are skipped. /// /// \param[in] value input datum, expecting a boolean array /// \param[in] ctx the function execution context, optional From c55d20f7a8fbbaac37ac7b004260e881d55e3b5b Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 13 Oct 2020 12:30:34 -0400 Subject: [PATCH 07/18] update Python api reference --- docs/source/python/api/compute.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index 2ade10291ef..1e08da1a5d5 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -81,6 +81,7 @@ logic variants are provided (suffixed ``_kleene``). See User Guide for details. and_ and_kleene + any invert or_ or_kleene From 9f099c6b23fea2abdade28289fa8f9629af3fd5b Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 13 Oct 2020 14:39:06 -0400 Subject: [PATCH 08/18] some fixes --- cpp/src/arrow/compute/api_aggregate.h | 3 +-- cpp/src/arrow/compute/kernels/aggregate_basic.cc | 13 +++---------- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/cpp/src/arrow/compute/api_aggregate.h b/cpp/src/arrow/compute/api_aggregate.h index 749db8315b6..404fbc0b89c 100644 --- a/cpp/src/arrow/compute/api_aggregate.h +++ b/cpp/src/arrow/compute/api_aggregate.h @@ -157,7 +157,7 @@ Result MinMax(const Datum& value, /// \brief Test whether any element in a boolean array evaluates to true. /// /// This function returns true if any of the elements in the array evaluates -/// to true and false otherwise. Null values are skipped. +/// to true and false otherwise. Null values are skipped. /// /// \param[in] value input datum, expecting a boolean array /// \param[in] ctx the function execution context, optional @@ -168,7 +168,6 @@ Result MinMax(const Datum& value, ARROW_EXPORT Result Any(const Datum& value, ExecContext* ctx = NULLPTR); - /// \brief Calculate the modal (most common) value of a numeric array /// /// This function returns top-n most common values and number of times they occur as diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 5d7af0ba865..668d420ae9b 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -154,9 +154,7 @@ std::unique_ptr MinMaxInit(KernelContext* ctx, const KernelInitArgs // ---------------------------------------------------------------------- // Any implementation - -struct BooleanAnyImpl: public ScalarAggregator { - +struct BooleanAnyImpl : public ScalarAggregator { void Consume(KernelContext*, const ExecBatch& batch) override { // short-circuit if seen a True already if (this->max == true) { @@ -223,12 +221,6 @@ void AddMinMaxKernels(KernelInit init, } } -void AddAnyKernel(KernelInit init, ScalarAggregateFunction* func) { - auto sig = - KernelSignature::Make({InputType::Array(boolean())}, ValueDescr::Scalar(boolean())); - AddAggKernel(std::move(sig), init, func); -} - } // namespace aggregate namespace internal { @@ -329,7 +321,8 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) { // any func = std::make_shared("any", Arity::Unary()); - auto sig = KernelSignature::Make({InputType::Array(boolean())}, ValueDescr::Scalar(boolean())); + auto sig = + KernelSignature::Make({InputType::Array(boolean())}, ValueDescr::Scalar(boolean())); aggregate::AddAggKernel(std::move(sig), aggregate::AnyInit, func.get()); DCHECK_OK(registry->AddFunction(std::move(func))); From 83c653b29477693adebeeb6742e1b67097e2ffde Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 13 Oct 2020 23:38:09 -0400 Subject: [PATCH 09/18] in BooleanAnyImpl rename local var max->any --- cpp/src/arrow/compute/kernels/aggregate_basic.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 668d420ae9b..01abf8e7da6 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -157,7 +157,7 @@ std::unique_ptr MinMaxInit(KernelContext* ctx, const KernelInitArgs struct BooleanAnyImpl : public ScalarAggregator { void Consume(KernelContext*, const ExecBatch& batch) override { // short-circuit if seen a True already - if (this->max == true) { + if (this->any == true) { return; } @@ -168,7 +168,7 @@ struct BooleanAnyImpl : public ScalarAggregator { while (position < data.length) { const auto block = counter.NextAndBlock(); if (block.popcount > 0) { - this->max = true; + this->any = true; break; } position += block.length; @@ -177,13 +177,13 @@ struct BooleanAnyImpl : public ScalarAggregator { void MergeFrom(KernelContext*, KernelState&& src) override { const auto& other = checked_cast(src); - this->max += other.max; + this->any += other.any; } void Finalize(KernelContext*, Datum* out) override { - out->value = std::make_shared(this->max); + out->value = std::make_shared(this->any); } - bool max = false; + bool any = false; }; std::unique_ptr AnyInit(KernelContext*, const KernelInitArgs& args) { From bed57a655453b973fd1c5d8c3b70659f9123c7e4 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Wed, 14 Oct 2020 00:04:55 -0400 Subject: [PATCH 10/18] BooleanAnyImpl: use |= instead of += for bool to fix Windows build --- cpp/src/arrow/compute/kernels/aggregate_basic.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 01abf8e7da6..c41e5a4c0c2 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -177,7 +177,7 @@ struct BooleanAnyImpl : public ScalarAggregator { void MergeFrom(KernelContext*, KernelState&& src) override { const auto& other = checked_cast(src); - this->any += other.any; + this->any |= other.any; } void Finalize(KernelContext*, Datum* out) override { From 90f911e4600ce7378051c87f10c64712846495be Mon Sep 17 00:00:00 2001 From: arw2019 Date: Wed, 14 Oct 2020 01:07:37 -0400 Subject: [PATCH 11/18] add testcase --- cpp/src/arrow/compute/kernels/aggregate_test.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index f9c974165fe..990c9348036 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -762,6 +762,7 @@ TEST_F(TestAnyKernel, Basics) { std::vector chunked_input1 = {"[true, true, null]", "[true, null]"}; std::vector chunked_input2 = {"[false, false, false]", "[false]"}; std::vector chunked_input3 = {"[false, null]", "[null, false]"}; + std::vector chunked_input4 = {"[true, null]", "[null, false]"}; this->AssertAnyIs("[]", false); this->AssertAnyIs("[false]", false); @@ -776,6 +777,7 @@ TEST_F(TestAnyKernel, Basics) { this->AssertAnyIs(chunked_input1, true); this->AssertAnyIs(chunked_input2, false); this->AssertAnyIs(chunked_input3, false); + this->AssertAnyIs(chunked_input4, true); } // From 2b22096aea32985d9255fe9b53c9db44344dcd61 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Mon, 2 Nov 2020 13:25:35 -0500 Subject: [PATCH 12/18] DOC: add python doc in aggregate_basic.cc --- cpp/src/arrow/compute/kernels/aggregate_basic.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index c41e5a4c0c2..df317fdfa83 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -246,6 +246,11 @@ const FunctionDoc min_max_doc{"Compute the minimum and maximum values of a numer {"array"}, "MinMaxOptions"}; +const FunctionDoc any_doc{ + "Test whether any element in a boolean array evaluates to true.", + ("Null values are ignored."), + {"array"}}; + } // namespace void RegisterScalarAggregateBasic(FunctionRegistry* registry) { @@ -320,7 +325,7 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) { ======= // any - func = std::make_shared("any", Arity::Unary()); + func = std::make_shared("any", Arity::Unary(), &any_doc); auto sig = KernelSignature::Make({InputType::Array(boolean())}, ValueDescr::Scalar(boolean())); aggregate::AddAggKernel(std::move(sig), aggregate::AnyInit, func.get()); From 70a9d7feefcfcbbb80feeac3403d54a36a996c6c Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Wed, 11 Nov 2020 23:34:19 -0500 Subject: [PATCH 13/18] resolve merge conflict --- cpp/src/arrow/compute/kernels/aggregate_basic.cc | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index df317fdfa83..a98bf539b16 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -321,20 +321,14 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) { #endif DCHECK_OK(registry->AddFunction(std::move(func))); -<<<<<<< HEAD -======= // any func = std::make_shared("any", Arity::Unary(), &any_doc); auto sig = KernelSignature::Make({InputType::Array(boolean())}, ValueDescr::Scalar(boolean())); - aggregate::AddAggKernel(std::move(sig), aggregate::AnyInit, func.get()); + aggregate::AddBasicAggKernels(std::move(sig), aggregate::AnyInit, func.get()); DCHECK_OK(registry->AddFunction(std::move(func))); - DCHECK_OK(registry->AddFunction(aggregate::AddModeAggKernels())); - DCHECK_OK(registry->AddFunction(aggregate::AddStddevAggKernels())); - DCHECK_OK(registry->AddFunction(aggregate::AddVarianceAggKernels())); ->>>>>>> 34f3bbad9 (ARROW-1846: [C++] Implement "any" reduction kernel for boolean data) } } // namespace internal From 21c1ef624ed62fbd3dbaaf89357384d08ef03053 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Fri, 13 Nov 2020 00:28:38 -0500 Subject: [PATCH 14/18] fix merge conflict --- cpp/src/arrow/compute/kernels/aggregate_basic.cc | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index a98bf539b16..221c2c4df97 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -190,15 +190,6 @@ std::unique_ptr AnyInit(KernelContext*, const KernelInitArgs& args) return ::arrow::internal::make_unique(); } -void AddAggKernel(std::shared_ptr sig, KernelInit init, - ScalarAggregateFunction* func, SimdLevel::type simd_level) { - ScalarAggregateKernel kernel(std::move(sig), init, AggregateConsume, AggregateMerge, - AggregateFinalize); - // Set the simd level - kernel.simd_level = simd_level; - DCHECK_OK(func->AddKernel(kernel)); -} - void AddBasicAggKernels(KernelInit init, const std::vector>& types, std::shared_ptr out_ty, ScalarAggregateFunction* func, @@ -324,9 +315,7 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) { // any func = std::make_shared("any", Arity::Unary(), &any_doc); - auto sig = - KernelSignature::Make({InputType::Array(boolean())}, ValueDescr::Scalar(boolean())); - aggregate::AddBasicAggKernels(std::move(sig), aggregate::AnyInit, func.get()); + aggregate::AddBasicAggKernels(aggregate::AnyInit, {boolean()}, boolean(), func.get()); DCHECK_OK(registry->AddFunction(std::move(func))); } From 579a96dca2db0f8101e25f5b00a0f20c90973156 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Fri, 13 Nov 2020 00:39:31 -0500 Subject: [PATCH 15/18] linting --- cpp/src/arrow/compute/kernels/aggregate_basic.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 221c2c4df97..42e2baa0c01 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -317,7 +317,6 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) { func = std::make_shared("any", Arity::Unary(), &any_doc); aggregate::AddBasicAggKernels(aggregate::AnyInit, {boolean()}, boolean(), func.get()); DCHECK_OK(registry->AddFunction(std::move(func))); - } } // namespace internal From a66661d8b3168b266a3ab6cbb02de0d2286b32f6 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Fri, 13 Nov 2020 11:23:52 -0500 Subject: [PATCH 16/18] Update version in C++ docstring Co-authored-by: Joris Van den Bossche --- cpp/src/arrow/compute/api_aggregate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/api_aggregate.h b/cpp/src/arrow/compute/api_aggregate.h index 404fbc0b89c..5651ecbd777 100644 --- a/cpp/src/arrow/compute/api_aggregate.h +++ b/cpp/src/arrow/compute/api_aggregate.h @@ -163,7 +163,7 @@ Result MinMax(const Datum& value, /// \param[in] ctx the function execution context, optional /// \return resulting datum as a BooleanScalar -/// \since 2.0.0 +/// \since 3.0.0 /// \note API not yet finalized ARROW_EXPORT Result Any(const Datum& value, ExecContext* ctx = NULLPTR); From 8a58bdfdb1e5c475cd9514abecf0ce1953109daf Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Fri, 13 Nov 2020 11:36:46 -0500 Subject: [PATCH 17/18] fix rebase error --- cpp/src/arrow/compute/api_aggregate.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/api_aggregate.cc b/cpp/src/arrow/compute/api_aggregate.cc index ea0e2e977b4..67165f48727 100644 --- a/cpp/src/arrow/compute/api_aggregate.cc +++ b/cpp/src/arrow/compute/api_aggregate.cc @@ -45,7 +45,7 @@ Result Any(const Datum& value, ExecContext* ctx) { return CallFunction("any", {value}, ctx); } -Result Mode(const Datum& value, ExecContext* ctx) { +Result Mode(const Datum& value, const ModeOptions& options, ExecContext* ctx) { return CallFunction("mode", {value}, ctx); } From 656bd8268011069eeba5271b55b5b9b8b8975d86 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Fri, 13 Nov 2020 12:19:33 -0500 Subject: [PATCH 18/18] fix rebase error --- cpp/src/arrow/compute/api_aggregate.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/api_aggregate.cc b/cpp/src/arrow/compute/api_aggregate.cc index 67165f48727..8711ae381e7 100644 --- a/cpp/src/arrow/compute/api_aggregate.cc +++ b/cpp/src/arrow/compute/api_aggregate.cc @@ -46,7 +46,7 @@ Result Any(const Datum& value, ExecContext* ctx) { } Result Mode(const Datum& value, const ModeOptions& options, ExecContext* ctx) { - return CallFunction("mode", {value}, ctx); + return CallFunction("mode", {value}, &options, ctx); } Result Stddev(const Datum& value, const VarianceOptions& options,