diff --git a/cpp/src/arrow/compute/api_aggregate.cc b/cpp/src/arrow/compute/api_aggregate.cc index dde5118ceae..8711ae381e7 100644 --- a/cpp/src/arrow/compute/api_aggregate.cc +++ b/cpp/src/arrow/compute/api_aggregate.cc @@ -41,6 +41,10 @@ Result MinMax(const Datum& value, const MinMaxOptions& options, ExecConte return CallFunction("min_max", {value}, &options, ctx); } +Result Any(const Datum& value, ExecContext* ctx) { + return CallFunction("any", {value}, ctx); +} + Result Mode(const Datum& value, const ModeOptions& options, ExecContext* ctx) { return CallFunction("mode", {value}, &options, ctx); } diff --git a/cpp/src/arrow/compute/api_aggregate.h b/cpp/src/arrow/compute/api_aggregate.h index 991af5b9721..5651ecbd777 100644 --- a/cpp/src/arrow/compute/api_aggregate.h +++ b/cpp/src/arrow/compute/api_aggregate.h @@ -154,7 +154,21 @@ Result MinMax(const Datum& value, const MinMaxOptions& options = MinMaxOptions::Defaults(), ExecContext* ctx = NULLPTR); -/// \brief Calculate the modal (most common) values of a numeric array +/// \brief Test whether any element in a boolean array evaluates to true. +/// +/// This function returns true if any of the elements in the array evaluates +/// to true and false otherwise. Null values are skipped. +/// +/// \param[in] value input datum, expecting a boolean array +/// \param[in] ctx the function execution context, optional +/// \return resulting datum as a BooleanScalar +/// +/// \since 3.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Any(const Datum& value, ExecContext* ctx = NULLPTR); + +/// \brief Calculate the modal (most common) value of a numeric array /// /// This function returns top-n most common values and number of times they occur as /// an array of `struct`, where T is the input type. 
diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 11c1e2b1730..42e2baa0c01 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -151,6 +151,45 @@ std::unique_ptr MinMaxInit(KernelContext* ctx, const KernelInitArgs return visitor.Create(); } +// ---------------------------------------------------------------------- +// Any implementation + +struct BooleanAnyImpl : public ScalarAggregator { + void Consume(KernelContext*, const ExecBatch& batch) override { + // short-circuit if seen a True already + if (this->any == true) { + return; + } + + const auto& data = *batch[0].array(); + arrow::internal::OptionalBinaryBitBlockCounter counter( + data.buffers[0], data.offset, data.buffers[1], data.offset, data.length); + int64_t position = 0; + while (position < data.length) { + const auto block = counter.NextAndBlock(); + if (block.popcount > 0) { + this->any = true; + break; + } + position += block.length; + } + } + + void MergeFrom(KernelContext*, KernelState&& src) override { + const auto& other = checked_cast(src); + this->any |= other.any; + } + + void Finalize(KernelContext*, Datum* out) override { + out->value = std::make_shared(this->any); + } + bool any = false; +}; + +std::unique_ptr AnyInit(KernelContext*, const KernelInitArgs& args) { + return ::arrow::internal::make_unique(); +} + void AddBasicAggKernels(KernelInit init, const std::vector>& types, std::shared_ptr out_ty, ScalarAggregateFunction* func, @@ -198,6 +237,11 @@ const FunctionDoc min_max_doc{"Compute the minimum and maximum values of a numer {"array"}, "MinMaxOptions"}; +const FunctionDoc any_doc{ + "Test whether any element in a boolean array evaluates to true.", + ("Null values are ignored."), + {"array"}}; + } // namespace void RegisterScalarAggregateBasic(FunctionRegistry* registry) { @@ -268,6 +312,11 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) 
{ #endif DCHECK_OK(registry->AddFunction(std::move(func))); + + // any + func = std::make_shared("any", Arity::Unary(), &any_doc); + aggregate::AddBasicAggKernels(aggregate::AnyInit, {boolean()}, boolean(), func.get()); + DCHECK_OK(registry->AddFunction(std::move(func))); } } // namespace internal diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index ad2633b4f97..990c9348036 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -727,6 +727,59 @@ TYPED_TEST(TestRandomNumericMinMaxKernel, RandomArrayMinMax) { } } +// +// Any +// + +class TestPrimitiveAnyKernel : public ::testing::Test { + public: + void AssertAnyIs(const Datum& array, bool expected) { + ASSERT_OK_AND_ASSIGN(Datum out, Any(array)); + const BooleanScalar& out_any = out.scalar_as(); + const auto expected_any = static_cast(expected); + ASSERT_EQ(out_any, expected_any); + } + + void AssertAnyIs(const std::string& json, bool expected) { + auto array = ArrayFromJSON(type_singleton(), json); + AssertAnyIs(array, expected); + } + + void AssertAnyIs(const std::vector& json, bool expected) { + auto array = ChunkedArrayFromJSON(type_singleton(), json); + AssertAnyIs(array, expected); + } + + std::shared_ptr type_singleton() { + return TypeTraits::type_singleton(); + } +}; + +class TestAnyKernel : public TestPrimitiveAnyKernel {}; + +TEST_F(TestAnyKernel, Basics) { + std::vector chunked_input0 = {"[]", "[true]"}; + std::vector chunked_input1 = {"[true, true, null]", "[true, null]"}; + std::vector chunked_input2 = {"[false, false, false]", "[false]"}; + std::vector chunked_input3 = {"[false, null]", "[null, false]"}; + std::vector chunked_input4 = {"[true, null]", "[null, false]"}; + + this->AssertAnyIs("[]", false); + this->AssertAnyIs("[false]", false); + this->AssertAnyIs("[true, false]", true); + this->AssertAnyIs("[null, null, null]", false); + this->AssertAnyIs("[false, false, 
false]", false); + this->AssertAnyIs("[false, false, false, null]", false); + this->AssertAnyIs("[true, null, true, true]", true); + this->AssertAnyIs("[false, null, false, true]", true); + this->AssertAnyIs("[true, null, false, true]", true); + this->AssertAnyIs(chunked_input0, true); + this->AssertAnyIs(chunked_input1, true); + this->AssertAnyIs(chunked_input2, false); + this->AssertAnyIs(chunked_input3, false); + this->AssertAnyIs(chunked_input4, true); +} + // // Mode // diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index c1d3ac7e61b..1f87ea4c251 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -134,6 +134,8 @@ Aggregations +--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ | Function name | Arity | Input types | Output type | Options class | +==========================+============+====================+=======================+============================================+ +| any | Unary | Boolean | Scalar Boolean | | ++--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ | count | Unary | Any | Scalar Int64 | :struct:`CountOptions` | +--------------------------+------------+--------------------+-----------------------+--------------------------------------------+ | mean | Unary | Numeric | Scalar Float64 | | diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index 2ade10291ef..1e08da1a5d5 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -81,6 +81,7 @@ logic variants are provided (suffixed ``_kleene``). See User Guide for details. 
and_ and_kleene + any invert or_ or_kleene diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 981121a3672..3ff73ec226a 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -350,6 +350,15 @@ def test_min_max(): s = pc.min_max() +def test_any(): + # ARROW-1846 + a = pa.array([False, None, True]) + assert pc.any(a).as_py() is True + + a = pa.array([False, None, False]) + assert pc.any(a).as_py() is False + + def test_is_valid(): # An example generated function wrapper without options data = [4, 5, None]