From 162c7cb39195f3465e6d926165eb193f633b530c Mon Sep 17 00:00:00 2001 From: Rok Date: Tue, 8 Jun 2021 03:22:06 +0200 Subject: [PATCH 1/8] Adding ScalarAggregateOptions to Any and All kernels. --- cpp/src/arrow/compute/api_aggregate.cc | 10 +- cpp/src/arrow/compute/api_aggregate.h | 16 +- .../arrow/compute/kernels/aggregate_basic.cc | 66 ++++++-- .../arrow/compute/kernels/aggregate_test.cc | 143 ++++++++++++------ docs/source/cpp/compute.rst | 4 +- python/pyarrow/tests/test_compute.py | 10 ++ r/R/compute.R | 23 +-- r/src/compute.cpp | 2 +- r/tests/testthat/test-compute-aggregate.R | 7 +- 9 files changed, 186 insertions(+), 95 deletions(-) diff --git a/cpp/src/arrow/compute/api_aggregate.cc b/cpp/src/arrow/compute/api_aggregate.cc index be05c3c11d0..1b00c366bfd 100644 --- a/cpp/src/arrow/compute/api_aggregate.cc +++ b/cpp/src/arrow/compute/api_aggregate.cc @@ -155,12 +155,14 @@ Result MinMax(const Datum& value, const ScalarAggregateOptions& options, return CallFunction("min_max", {value}, &options, ctx); } -Result Any(const Datum& value, ExecContext* ctx) { - return CallFunction("any", {value}, ctx); +Result Any(const Datum& value, const ScalarAggregateOptions& options, + ExecContext* ctx) { + return CallFunction("any", {value}, &options, ctx); } -Result All(const Datum& value, ExecContext* ctx) { - return CallFunction("all", {value}, ctx); +Result All(const Datum& value, const ScalarAggregateOptions& options, + ExecContext* ctx) { + return CallFunction("all", {value}, &options, ctx); } Result Mode(const Datum& value, const ModeOptions& options, ExecContext* ctx) { diff --git a/cpp/src/arrow/compute/api_aggregate.h b/cpp/src/arrow/compute/api_aggregate.h index 7b6e2ef96de..05362724625 100644 --- a/cpp/src/arrow/compute/api_aggregate.h +++ b/cpp/src/arrow/compute/api_aggregate.h @@ -205,30 +205,38 @@ Result MinMax( /// \brief Test whether any element in a boolean array evaluates to true. /// /// This function returns true if any of the elements in the array evaluates -/// to true and false otherwise. Null values are skipped. +/// to true and false otherwise. Null values are ignored by default. /// /// \param[in] value input datum, expecting a boolean array +/// \param[in] options see ScalarAggregateOptions for more information /// \param[in] ctx the function execution context, optional /// \return resulting datum as a BooleanScalar /// /// \since 3.0.0 /// \note API not yet finalized ARROW_EXPORT -Result Any(const Datum& value, ExecContext* ctx = NULLPTR); +Result Any( + const Datum& value, + const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), + ExecContext* ctx = NULLPTR); /// \brief Test whether all elements in a boolean array evaluate to true. /// /// This function returns true if all of the elements in the array evaluate -/// to true and false otherwise. Null values are skipped. +/// to true and false otherwise. Null values are ignored by default. /// /// \param[in] value input datum, expecting a boolean array +/// \param[in] options see ScalarAggregateOptions for more information /// \param[in] ctx the function execution context, optional /// \return resulting datum as a BooleanScalar /// \since 3.0.0 /// \note API not yet finalized ARROW_EXPORT -Result All(const Datum& value, ExecContext* ctx = NULLPTR); +Result All( + const Datum& value, + const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), + ExecContext* ctx = NULLPTR); /// \brief Calculate the modal (most common) value of a numeric array /// diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 6a844817686..10077138fdb 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -142,13 +142,19 @@ Result> MinMaxInit(KernelContext* ctx, // Any implementation struct BooleanAnyImpl : public ScalarAggregator { + explicit BooleanAnyImpl(ScalarAggregateOptions options) : options(std::move(options)) {} + Status Consume(KernelContext*, const ExecBatch& batch) override { // short-circuit if seen a True already - if (this->any == true) { + if (options.skip_nulls && this->any == true) { + return Status::OK(); + } + // short-circuit if seen a null already + if (!options.skip_nulls && this->has_nulls) { return Status::OK(); } - const auto& data = *batch[0].array(); + this->has_nulls = data.GetNullCount() > 0; arrow::internal::OptionalBinaryBitBlockCounter counter( data.buffers[0], data.offset, data.buffers[1], data.offset, data.length); int64_t position = 0; @@ -166,32 +172,48 @@ struct BooleanAnyImpl : public ScalarAggregator { Status MergeFrom(KernelContext*, KernelState&& src) override { const auto& other = checked_cast(src); this->any |= other.any; + this->has_nulls |= other.has_nulls; return Status::OK(); } - Status Finalize(KernelContext*, Datum* out) override { - out->value = std::make_shared(this->any); + Status Finalize(KernelContext* ctx, Datum* out) override { + if (!options.skip_nulls && this->has_nulls) { + out->value = std::make_shared(); + } else { + out->value = std::make_shared(this->any); + } return Status::OK(); } bool any = false; + bool has_nulls = false; + ScalarAggregateOptions options; }; Result> AnyInit(KernelContext*, const KernelInitArgs& args) { - return ::arrow::internal::make_unique(); + const ScalarAggregateOptions options = + static_cast(*args.options); + return ::arrow::internal::make_unique( + static_cast(*args.options)); } // ---------------------------------------------------------------------- // All implementation struct BooleanAllImpl : public ScalarAggregator { + explicit BooleanAllImpl(ScalarAggregateOptions options) : options(std::move(options)) {} + Status Consume(KernelContext*, const ExecBatch& batch) override { // short-circuit if seen a false already - if (this->all == false) { + if (options.skip_nulls && this->all == false) { + return Status::OK(); + } + // short-circuit if seen a null already + if (!options.skip_nulls && this->has_nulls) { return Status::OK(); } - const auto& data = *batch[0].array(); + this->has_nulls = data.GetNullCount() > 0; arrow::internal::OptionalBinaryBitBlockCounter counter( data.buffers[1], data.offset, data.buffers[0], data.offset, data.length); int64_t position = 0; @@ -210,19 +232,27 @@ struct BooleanAllImpl : public ScalarAggregator { Status MergeFrom(KernelContext*, KernelState&& src) override { const auto& other = checked_cast(src); this->all &= other.all; + this->has_nulls |= other.has_nulls; return Status::OK(); } Status Finalize(KernelContext*, Datum* out) override { - out->value = std::make_shared(this->all); + if (!options.skip_nulls && this->has_nulls) { + out->value = std::make_shared(); + } else { + out->value = std::make_shared(this->all); + } return Status::OK(); } bool all = true; + bool has_nulls = false; + ScalarAggregateOptions options; }; Result> AllInit(KernelContext*, const KernelInitArgs& args) { - return ::arrow::internal::make_unique(); + return ::arrow::internal::make_unique( + static_cast(*args.options)); } // ---------------------------------------------------------------------- @@ -407,12 +437,16 @@ const FunctionDoc min_max_doc{"Compute the minimum and maximum values of a numer "ScalarAggregateOptions"}; const FunctionDoc any_doc{"Test whether any element in a boolean array evaluates to true", - ("Null values are ignored."), - {"array"}}; + ("Null values are ignored by default.\n" + "This can be changed through ScalarAggregateOptions."), + {"array"}, + "ScalarAggregateOptions"}; const FunctionDoc all_doc{"Test whether all elements in a boolean array evaluate to true", - ("Null values are ignored."), - {"array"}}; + ("Null values are ignored by default.\n" + "This can be changed through ScalarAggregateOptions."), + {"array"}, + "ScalarAggregateOptions"}; const FunctionDoc index_doc{"Find the index of the first occurrence of a given value", ("The result is always computed as an int64_t, regardless\n" @@ -496,12 +530,14 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunction(std::move(func))); // any - func = std::make_shared("any", Arity::Unary(), &any_doc); + func = std::make_shared("any", Arity::Unary(), &any_doc, + &default_scalar_aggregate_options); aggregate::AddBasicAggKernels(aggregate::AnyInit, {boolean()}, boolean(), func.get()); DCHECK_OK(registry->AddFunction(std::move(func))); // all - func = std::make_shared("all", Arity::Unary(), &all_doc); + func = std::make_shared("all", Arity::Unary(), &all_doc, + &default_scalar_aggregate_options); aggregate::AddBasicAggKernels(aggregate::AllInit, {boolean()}, boolean(), func.get()); DCHECK_OK(registry->AddFunction(std::move(func))); diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index 4bce02a990b..887f017d08e 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -942,21 +942,26 @@ TYPED_TEST(TestRandomNumericMinMaxKernel, RandomArrayMinMax) { class TestPrimitiveAnyKernel : public ::testing::Test { public: - void AssertAnyIs(const Datum& array, bool expected) { - ASSERT_OK_AND_ASSIGN(Datum out, Any(array)); + void AssertAnyIs(const Datum& array, const std::shared_ptr& expected, + const ScalarAggregateOptions& options) { + ASSERT_OK_AND_ASSIGN(Datum out, Any(array, options, nullptr)); const BooleanScalar& out_any = out.scalar_as(); - const auto expected_any = static_cast(expected); - ASSERT_EQ(out_any, expected_any); + ASSERT_EQ(out_any, *expected); } - void AssertAnyIs(const std::string& json, bool expected) { + void AssertAnyIs( + const std::string& json, const std::shared_ptr& expected, + const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults()) { auto array = ArrayFromJSON(type_singleton(), json); - AssertAnyIs(array, expected); + AssertAnyIs(array, expected, options); } - void AssertAnyIs(const std::vector& json, bool expected) { + void AssertAnyIs( + const std::vector& json, + const std::shared_ptr& expected, + const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults()) { auto array = ChunkedArrayFromJSON(type_singleton(), json); - AssertAnyIs(array, expected); + AssertAnyIs(array, expected, options); } std::shared_ptr type_singleton() { @@ -967,26 +972,47 @@ class TestPrimitiveAnyKernel : public ::testing::Test { class TestAnyKernel : public TestPrimitiveAnyKernel {}; TEST_F(TestAnyKernel, Basics) { + auto true_value = std::make_shared(true); + auto false_value = std::make_shared(false); + auto null_value = std::make_shared(); + null_value->is_valid = false; + std::vector chunked_input0 = {"[]", "[true]"}; std::vector chunked_input1 = {"[true, true, null]", "[true, null]"}; std::vector chunked_input2 = {"[false, false, false]", "[false]"}; std::vector chunked_input3 = {"[false, null]", "[null, false]"}; std::vector chunked_input4 = {"[true, null]", "[null, false]"}; - this->AssertAnyIs("[]", false); - this->AssertAnyIs("[false]", false); - this->AssertAnyIs("[true, false]", true); - this->AssertAnyIs("[null, null, null]", false); - this->AssertAnyIs("[false, false, false]", false); - this->AssertAnyIs("[false, false, false, null]", false); - this->AssertAnyIs("[true, null, true, true]", true); - this->AssertAnyIs("[false, null, false, true]", true); - this->AssertAnyIs("[true, null, false, true]", true); - this->AssertAnyIs(chunked_input0, true); - this->AssertAnyIs(chunked_input1, true); - this->AssertAnyIs(chunked_input2, false); - this->AssertAnyIs(chunked_input3, false); - this->AssertAnyIs(chunked_input4, true); + this->AssertAnyIs("[]", false_value); + this->AssertAnyIs("[false]", false_value); + this->AssertAnyIs("[true, false]", true_value); + this->AssertAnyIs("[null, null, null]", false_value); + this->AssertAnyIs("[false, false, false]", false_value); + this->AssertAnyIs("[false, false, false, null]", false_value); + this->AssertAnyIs("[true, null, true, true]", true_value); + this->AssertAnyIs("[false, null, false, true]", true_value); + this->AssertAnyIs("[true, null, false, true]", true_value); + this->AssertAnyIs(chunked_input0, true_value); + this->AssertAnyIs(chunked_input1, true_value); + this->AssertAnyIs(chunked_input2, false_value); + this->AssertAnyIs(chunked_input3, false_value); + this->AssertAnyIs(chunked_input4, true_value); + + const ScalarAggregateOptions& keep_nulls = ScalarAggregateOptions(/*skip_nulls=*/false); + this->AssertAnyIs("[]", false_value, keep_nulls); + this->AssertAnyIs("[false]", false_value, keep_nulls); + this->AssertAnyIs("[true, false]", true_value, keep_nulls); + this->AssertAnyIs("[null, null, null]", null_value, keep_nulls); + this->AssertAnyIs("[false, false, false]", false_value, keep_nulls); + this->AssertAnyIs("[false, false, false, null]", null_value, keep_nulls); + this->AssertAnyIs("[true, null, true, true]", null_value, keep_nulls); + this->AssertAnyIs("[false, null, false, true]", null_value, keep_nulls); + this->AssertAnyIs("[true, null, false, true]", null_value, keep_nulls); + this->AssertAnyIs(chunked_input0, true_value, keep_nulls); + this->AssertAnyIs(chunked_input1, null_value, keep_nulls); + this->AssertAnyIs(chunked_input2, false_value, keep_nulls); + this->AssertAnyIs(chunked_input3, null_value, keep_nulls); + this->AssertAnyIs(chunked_input4, null_value, keep_nulls); } // @@ -995,21 +1021,26 @@ TEST_F(TestAnyKernel, Basics) { class TestPrimitiveAllKernel : public ::testing::Test { public: - void AssertAllIs(const Datum& array, bool expected) { - ASSERT_OK_AND_ASSIGN(Datum out, All(array)); + void AssertAllIs(const Datum& array, const std::shared_ptr& expected, + const ScalarAggregateOptions& options) { + ASSERT_OK_AND_ASSIGN(Datum out, All(array, options, nullptr)); const BooleanScalar& out_all = out.scalar_as(); - const auto expected_all = static_cast(expected); - ASSERT_EQ(out_all, expected_all); + ASSERT_EQ(out_all, *expected); } - void AssertAllIs(const std::string& json, bool expected) { + void AssertAllIs( + const std::string& json, const std::shared_ptr& expected, + const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults()) { auto array = ArrayFromJSON(type_singleton(), json); - AssertAllIs(array, expected); + AssertAllIs(array, expected, options); } - void AssertAllIs(const std::vector& json, bool expected) { + void AssertAllIs( + const std::vector& json, + const std::shared_ptr& expected, + const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults()) { auto array = ChunkedArrayFromJSON(type_singleton(), json); - AssertAllIs(array, expected); + AssertAllIs(array, expected, options); } std::shared_ptr type_singleton() { @@ -1020,6 +1051,11 @@ class TestPrimitiveAllKernel : public ::testing::Test { class TestAllKernel : public TestPrimitiveAllKernel {}; TEST_F(TestAllKernel, Basics) { + auto true_value = std::make_shared(true); + auto false_value = std::make_shared(false); + auto null_value = std::make_shared(); + null_value->is_valid = false; + std::vector chunked_input0 = {"[]", "[true]"}; std::vector chunked_input1 = {"[true, true, null]", "[true, null]"}; std::vector chunked_input2 = {"[false, false, false]", "[false]"}; @@ -1027,21 +1063,38 @@ TEST_F(TestAllKernel, Basics) { std::vector chunked_input4 = {"[true, null]", "[null, false]"}; std::vector chunked_input5 = {"[false, null]", "[null, true]"}; - this->AssertAllIs("[]", true); - this->AssertAllIs("[false]", false); - this->AssertAllIs("[true, false]", false); - this->AssertAllIs("[null, null, null]", true); - this->AssertAllIs("[false, false, false]", false); - this->AssertAllIs("[false, false, false, null]", false); - this->AssertAllIs("[true, null, true, true]", true); - this->AssertAllIs("[false, null, false, true]", false); - this->AssertAllIs("[true, null, false, true]", false); - this->AssertAllIs(chunked_input0, true); - this->AssertAllIs(chunked_input1, true); - this->AssertAllIs(chunked_input2, false); - this->AssertAllIs(chunked_input3, false); - this->AssertAllIs(chunked_input4, false); - this->AssertAllIs(chunked_input5, false); + this->AssertAllIs("[]", true_value); + this->AssertAllIs("[false]", false_value); + this->AssertAllIs("[true, false]", false_value); + this->AssertAllIs("[null, null, null]", true_value); + this->AssertAllIs("[false, false, false]", false_value); + this->AssertAllIs("[false, false, false, null]", false_value); + this->AssertAllIs("[true, null, true, true]", true_value); + this->AssertAllIs("[false, null, false, true]", false_value); + this->AssertAllIs("[true, null, false, true]", false_value); + this->AssertAllIs(chunked_input0, true_value); + this->AssertAllIs(chunked_input1, true_value); + this->AssertAllIs(chunked_input2, false_value); + this->AssertAllIs(chunked_input3, false_value); + this->AssertAllIs(chunked_input4, false_value); + this->AssertAllIs(chunked_input5, false_value); + + const ScalarAggregateOptions keep_nulls = ScalarAggregateOptions(/*skip_nulls=*/false); + this->AssertAllIs("[]", true_value, keep_nulls); + this->AssertAllIs("[false]", false_value, keep_nulls); + this->AssertAllIs("[true, false]", false_value, keep_nulls); + this->AssertAllIs("[null, null, null]", null_value, keep_nulls); + this->AssertAllIs("[false, false, false]", false_value, keep_nulls); + this->AssertAllIs("[false, false, false, null]", null_value, keep_nulls); + this->AssertAllIs("[true, null, true, true]", null_value, keep_nulls); + this->AssertAllIs("[false, null, false, true]", null_value, keep_nulls); + this->AssertAllIs("[true, null, false, true]", null_value, keep_nulls); + this->AssertAllIs(chunked_input0, true_value, keep_nulls); + this->AssertAllIs(chunked_input1, null_value, keep_nulls); + this->AssertAllIs(chunked_input2, false_value, keep_nulls); + this->AssertAllIs(chunked_input3, null_value, keep_nulls); + this->AssertAllIs(chunked_input4, null_value, keep_nulls); + this->AssertAllIs(chunked_input5, null_value, keep_nulls); } // diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index fc6c8b7c7e1..c9c8a709668 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -186,9 +186,9 @@ Aggregations +---------------+-------+-------------+----------------+----------------------------------+-------+ | Function name | Arity | Input types | Output type | Options class | Notes | +===============+=======+=============+================+==================================+=======+ -| all | Unary | Boolean | Scalar Boolean | | | +| all | Unary | Boolean | Scalar Boolean | :struct:`ScalarAggregateOptions` | | +---------------+-------+-------------+----------------+----------------------------------+-------+ -| any | Unary | Boolean | Scalar Boolean | | | +| any | Unary | Boolean | Scalar Boolean | :struct:`ScalarAggregateOptions` | | +---------------+-------+-------------+----------------+----------------------------------+-------+ | count | Unary | Any | Scalar Int64 | :struct:`ScalarAggregateOptions` | | +---------------+-------+-------------+----------------+----------------------------------+-------+ diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 37040ec86b5..19e171f0a65 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -548,30 +548,40 @@ def test_min_max(): def test_any(): # ARROW-1846 + + options = pc.ScalarAggregateOptions(skip_nulls=False) a = pa.array([False, None, True]) assert pc.any(a).as_py() is True + assert pc.any(a, options=options).as_py() is None a = pa.array([False, None, False]) assert pc.any(a).as_py() is False + assert pc.any(a, options=options).as_py() is None def test_all(): # ARROW-10301 + options = pc.ScalarAggregateOptions(skip_nulls=False) a = pa.array([], type='bool') assert pc.all(a).as_py() is True + assert pc.all(a, options=options).as_py() is True a = pa.array([False, True]) assert pc.all(a).as_py() is False + assert pc.all(a, options=options).as_py() is False a = pa.array([True, None]) assert pc.all(a).as_py() is True + assert pc.all(a, options=options).as_py() is None a = pa.chunked_array([[True], [True, None]]) assert pc.all(a).as_py() is True + assert pc.all(a, options=options).as_py() is None a = pa.chunked_array([[True], [False]]) assert pc.all(a).as_py() is False + assert pc.all(a, options=options).as_py() is False def test_is_valid(): diff --git a/r/R/compute.R b/r/R/compute.R index 5a00e884980..4277ad8d6df 100644 --- a/r/R/compute.R +++ b/r/R/compute.R @@ -203,31 +203,12 @@ unique.ArrowDatum <- function(x, incomparables = FALSE, ...) { #' @export any.ArrowDatum <- function(..., na.rm = FALSE) { - - a <- collect_arrays_from_dots(list(...)) - result <- call_function("any", a) - - if (!na.rm && a$null_count > 0 && !as.vector(result)) { - # Three-valued logic: with na.rm = FALSE, any(c(TRUE, NA)) returns TRUE but any(c(FALSE, NA)) returns NA - # TODO: C++ library should take na.rm for any/all (like ARROW-9054) - Scalar$create(NA) - } else { - result - } + scalar_aggregate("any", ..., na.rm = na.rm) } #' @export all.ArrowDatum <- function(..., na.rm = FALSE) { - - a <- collect_arrays_from_dots(list(...)) - result <- call_function("all", a) - - if (!na.rm && a$null_count > 0 && as.vector(result)) { - # See comment above in any() about three-valued logic - Scalar$create(NA) - } else { - result - } + scalar_aggregate("all", ..., na.rm = na.rm) } #' `match` and `%in%` for Arrow objects diff --git a/r/src/compute.cpp b/r/src/compute.cpp index cfa895ecb1e..6bb55cbe208 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -172,7 +172,7 @@ std::shared_ptr make_compute_options( } if (func_name == "min_max" || func_name == "sum" || func_name == "mean" || - func_name == "count") { + func_name == "count" || func_name == "any" || func_name == "all") { using Options = arrow::compute::ScalarAggregateOptions; auto out = std::make_shared(Options::Defaults()); out->min_count = cpp11::as_cpp(options["na.min_count"]); diff --git a/r/tests/testthat/test-compute-aggregate.R b/r/tests/testthat/test-compute-aggregate.R index 41418014bea..3f252cfb3f5 100644 --- a/r/tests/testthat/test-compute-aggregate.R +++ b/r/tests/testthat/test-compute-aggregate.R @@ -393,13 +393,13 @@ test_that("any.Array and any.ChunkedArray", { data <- c(1:10, NA, NA) - expect_vector_equal(any(input > 5), data) + expect_vector_equal(any(input > 5, na.rm = TRUE), data) expect_vector_equal(any(input < 1), data) expect_vector_equal(any(input < 1, na.rm = TRUE), data) data_logical <- c(TRUE, FALSE, TRUE, NA, FALSE) - expect_vector_equal(any(input), data_logical) + expect_vector_equal(any(input, na.rm = TRUE), data_logical) expect_vector_equal(any(input, na.rm = TRUE), data_logical) }) @@ -408,7 +408,8 @@ test_that("all.Array and all.ChunkedArray", { data <- c(1:10, NA, NA) - expect_vector_equal(all(input > 5), data) + expect_vector_equal(all(input > 5, na.rm = TRUE), data) + expect_vector_equal(all(input < 11), data) expect_vector_equal(all(input < 11, na.rm = TRUE), data) From 37624829b86547992d7ed76401530e009f7ebcae Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 8 Jun 2021 19:10:39 +0200 Subject: [PATCH 2/8] Apply suggestions from code review Co-authored-by: Ian Cook --- r/tests/testthat/test-compute-aggregate.R | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/r/tests/testthat/test-compute-aggregate.R b/r/tests/testthat/test-compute-aggregate.R index 3f252cfb3f5..7eb9f4b02cd 100644 --- a/r/tests/testthat/test-compute-aggregate.R +++ b/r/tests/testthat/test-compute-aggregate.R @@ -396,10 +396,16 @@ test_that("any.Array and any.ChunkedArray", { expect_vector_equal(any(input > 5, na.rm = TRUE), data) expect_vector_equal(any(input < 1), data) expect_vector_equal(any(input < 1, na.rm = TRUE), data) - - data_logical <- c(TRUE, FALSE, TRUE, NA, FALSE) - + + data_logical <- c(NA, FALSE, FALSE, NA, FALSE) + expect_vector_equal(any(input), data_logical) + expect_vector_equal(any(input, na.rm = FALSE), data_logical) expect_vector_equal(any(input, na.rm = TRUE), data_logical) + + data_logical <- c(TRUE, FALSE, TRUE, NA, FALSE) + + expect_vector_equal(any(input), data_logical) + expect_vector_equal(any(input, na.rm = FALSE), data_logical) expect_vector_equal(any(input, na.rm = TRUE), data_logical) }) From 55904919a87eff9bbc7fef0f7470ff8f598c11f6 Mon Sep 17 00:00:00 2001 From: Rok Date: Wed, 9 Jun 2021 02:24:33 +0200 Subject: [PATCH 3/8] Fixing behaviour. --- .../arrow/compute/kernels/aggregate_basic.cc | 12 ++++------ .../arrow/compute/kernels/aggregate_test.cc | 22 +++++++++---------- python/pyarrow/tests/test_compute.py | 2 +- r/tests/testthat/test-compute-aggregate.R | 15 ++++++++----- 4 files changed, 25 insertions(+), 26 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 10077138fdb..d1c71c47bce 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -146,11 +146,7 @@ struct BooleanAnyImpl : public ScalarAggregator { Status Consume(KernelContext*, const ExecBatch& batch) override { // short-circuit if seen a True already - if (options.skip_nulls && this->any == true) { - return Status::OK(); - } - // short-circuit if seen a null already - if (!options.skip_nulls && this->has_nulls) { + if (this->any == true) { return Status::OK(); } const auto& data = *batch[0].array(); @@ -177,7 +173,7 @@ struct BooleanAnyImpl : public ScalarAggregator { } Status Finalize(KernelContext* ctx, Datum* out) override { - if (!options.skip_nulls && this->has_nulls) { + if (!options.skip_nulls && !this->any && this->has_nulls) { out->value = std::make_shared(); } else { out->value = std::make_shared(this->any); @@ -205,7 +201,7 @@ struct BooleanAllImpl : public ScalarAggregator { Status Consume(KernelContext*, const ExecBatch& batch) override { // short-circuit if seen a false already - if (options.skip_nulls && this->all == false) { + if (this->all == false) { return Status::OK(); } // short-circuit if seen a null already @@ -237,7 +233,7 @@ struct BooleanAllImpl : public ScalarAggregator { } Status Finalize(KernelContext*, Datum* out) override { - if (!options.skip_nulls && this->has_nulls) { + if (!options.skip_nulls && this->all && this->has_nulls) { out->value = std::make_shared(); } else { out->value = std::make_shared(this->all); diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index 887f017d08e..7318539df7f 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -1005,14 +1005,14 @@ TEST_F(TestAnyKernel, Basics) { this->AssertAnyIs("[null, null, null]", null_value, keep_nulls); this->AssertAnyIs("[false, false, false]", false_value, keep_nulls); this->AssertAnyIs("[false, false, false, null]", null_value, keep_nulls); - this->AssertAnyIs("[true, null, true, true]", null_value, keep_nulls); - this->AssertAnyIs("[false, null, false, true]", null_value, keep_nulls); - this->AssertAnyIs("[true, null, false, true]", null_value, keep_nulls); + this->AssertAnyIs("[true, null, true, true]", true_value, keep_nulls); + this->AssertAnyIs("[false, null, false, true]", true_value, keep_nulls); + this->AssertAnyIs("[true, null, false, true]", true_value, keep_nulls); this->AssertAnyIs(chunked_input0, true_value, keep_nulls); - this->AssertAnyIs(chunked_input1, null_value, keep_nulls); + this->AssertAnyIs(chunked_input1, true_value, keep_nulls); this->AssertAnyIs(chunked_input2, false_value, keep_nulls); this->AssertAnyIs(chunked_input3, null_value, keep_nulls); - this->AssertAnyIs(chunked_input4, null_value, keep_nulls); + this->AssertAnyIs(chunked_input4, true_value, keep_nulls); } // @@ -1085,16 +1085,16 @@ TEST_F(TestAllKernel, Basics) { this->AssertAllIs("[true, false]", false_value, keep_nulls); this->AssertAllIs("[null, null, null]", null_value, keep_nulls); this->AssertAllIs("[false, false, false]", false_value, keep_nulls); - this->AssertAllIs("[false, false, false, null]", null_value, keep_nulls); + this->AssertAllIs("[false, false, false, null]", false_value, keep_nulls); this->AssertAllIs("[true, null, true, true]", null_value, keep_nulls); - this->AssertAllIs("[false, null, false, true]", null_value, keep_nulls); - this->AssertAllIs("[true, null, false, true]", null_value, keep_nulls); + this->AssertAllIs("[false, null, false, true]", false_value, keep_nulls); + this->AssertAllIs("[true, null, false, true]", false_value, keep_nulls); this->AssertAllIs(chunked_input0, true_value, keep_nulls); this->AssertAllIs(chunked_input1, null_value, keep_nulls); this->AssertAllIs(chunked_input2, false_value, keep_nulls); - this->AssertAllIs(chunked_input3, null_value, keep_nulls); - this->AssertAllIs(chunked_input4, null_value, keep_nulls); - this->AssertAllIs(chunked_input5, null_value, keep_nulls); + this->AssertAllIs(chunked_input3, false_value, keep_nulls); + this->AssertAllIs(chunked_input4, false_value, keep_nulls); + this->AssertAllIs(chunked_input5, false_value, keep_nulls); } // diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 19e171f0a65..b65970745ec 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -552,7 +552,7 @@ def test_any(): options = pc.ScalarAggregateOptions(skip_nulls=False) a = pa.array([False, None, True]) assert pc.any(a).as_py() is True - assert pc.any(a, options=options).as_py() is None + assert pc.any(a, options=options).as_py() is True a = pa.array([False, None, False]) assert pc.any(a).as_py() is False diff --git a/r/tests/testthat/test-compute-aggregate.R b/r/tests/testthat/test-compute-aggregate.R index 7eb9f4b02cd..23b8a615bee 100644 --- a/r/tests/testthat/test-compute-aggregate.R +++ b/r/tests/testthat/test-compute-aggregate.R @@ -390,20 +390,22 @@ test_that("value_counts", { }) test_that("any.Array and any.ChunkedArray", { - + data <- c(1:10, NA, NA) + expect_vector_equal(any(input > 5), data) expect_vector_equal(any(input > 5, na.rm = TRUE), data) expect_vector_equal(any(input < 1), data) expect_vector_equal(any(input < 1, na.rm = TRUE), data) - data_logical <- c(NA, FALSE, FALSE, NA, FALSE) + data_logical <- c(TRUE, FALSE, TRUE, NA, FALSE) + expect_vector_equal(any(input), data_logical) expect_vector_equal(any(input, na.rm = FALSE), data_logical) expect_vector_equal(any(input, na.rm = TRUE), data_logical) - data_logical <- c(TRUE, FALSE, TRUE, NA, FALSE) - + data_logical <- c(NA, FALSE, FALSE, NA, FALSE) + expect_vector_equal(any(input), data_logical) expect_vector_equal(any(input, na.rm = FALSE), data_logical) expect_vector_equal(any(input, na.rm = TRUE), data_logical) @@ -413,7 +415,8 @@ test_that("any.Array and any.ChunkedArray", { test_that("all.Array and all.ChunkedArray", { data <- c(1:10, NA, NA) - + + expect_vector_equal(all(input > 5), data) expect_vector_equal(all(input > 5, na.rm = TRUE), data) expect_vector_equal(all(input < 11), data) @@ -423,7 +426,7 @@ test_that("all.Array and all.ChunkedArray", { expect_vector_equal(all(input), data_logical) expect_vector_equal(all(input, na.rm = TRUE), data_logical) - + }) test_that("variance", { From a67de917f58b43c220ff21846448f8f5d5faa089 Mon Sep 17 00:00:00 2001 From: Rok Date: Tue, 15 Jun 2021 00:07:35 +0200 Subject: [PATCH 4/8] Documenting any/all as kleen when skip_null=True. --- cpp/src/arrow/compute/api_aggregate.h | 4 ++++ docs/source/cpp/compute.rst | 25 +++++++++++++++---------- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/compute/api_aggregate.h b/cpp/src/arrow/compute/api_aggregate.h index 05362724625..0d11f37bcd0 100644 --- a/cpp/src/arrow/compute/api_aggregate.h +++ b/cpp/src/arrow/compute/api_aggregate.h @@ -206,6 +206,8 @@ Result MinMax( /// /// This function returns true if any of the elements in the array evaluates /// to true and false otherwise. Null values are ignored by default. +/// If null values are taken into account by setting ScalarAggregateOptions +/// parameter skip_nulls = false then kleen logic is applied. /// /// \param[in] value input datum, expecting a boolean array /// \param[in] options see ScalarAggregateOptions for more information @@ -224,6 +226,8 @@ Result Any( /// /// This function returns true if all of the elements in the array evaluate /// to true and false otherwise. Null values are ignored by default. +/// If null values are taken into account by setting ScalarAggregateOptions +/// parameter skip_nulls = false then kleen logic is applied. /// /// \param[in] value input datum, expecting a boolean array /// \param[in] options see ScalarAggregateOptions for more information diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index c9c8a709668..736602af198 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -186,9 +186,9 @@ Aggregations +---------------+-------+-------------+----------------+----------------------------------+-------+ | Function name | Arity | Input types | Output type | Options class | Notes | +===============+=======+=============+================+==================================+=======+ -| all | Unary | Boolean | Scalar Boolean | :struct:`ScalarAggregateOptions` | | +| all | Unary | Boolean | Scalar Boolean | :struct:`ScalarAggregateOptions` | \(1) | +---------------+-------+-------------+----------------+----------------------------------+-------+ -| any | Unary | Boolean | Scalar Boolean | :struct:`ScalarAggregateOptions` | | +| any | Unary | Boolean | Scalar Boolean | :struct:`ScalarAggregateOptions` | \(1) | +---------------+-------+-------------+----------------+----------------------------------+-------+ | count | Unary | Any | Scalar Int64 | :struct:`ScalarAggregateOptions` | | +---------------+-------+-------------+----------------+----------------------------------+-------+ @@ -196,15 +196,15 @@ Aggregations +---------------+-------+-------------+----------------+----------------------------------+-------+ | mean | Unary | Numeric | Scalar Float64 | :struct:`ScalarAggregateOptions` | | +---------------+-------+-------------+----------------+----------------------------------+-------+ -| min_max | Unary | Numeric | Scalar Struct | :struct:`ScalarAggregateOptions` | \(1) | +| min_max | Unary | Numeric | Scalar Struct | :struct:`ScalarAggregateOptions` | \(2) | +---------------+-------+-------------+----------------+----------------------------------+-------+ -| mode | Unary | Numeric | Struct | :struct:`ModeOptions` | \(2) | +| mode | Unary | Numeric | Struct | :struct:`ModeOptions` | \(3) | +---------------+-------+-------------+----------------+----------------------------------+-------+ -| quantile | Unary | Numeric | Scalar Numeric | :struct:`QuantileOptions` | \(3) | +| quantile | Unary | Numeric | Scalar Numeric | :struct:`QuantileOptions` | \(4) | +---------------+-------+-------------+----------------+----------------------------------+-------+ | stddev | Unary | Numeric | Scalar Float64 | :struct:`VarianceOptions` | | +---------------+-------+-------------+----------------+----------------------------------+-------+ -| sum | Unary | Numeric | Scalar Numeric | :struct:`ScalarAggregateOptions` | \(4) | +| sum | Unary | Numeric | Scalar Numeric | :struct:`ScalarAggregateOptions` | \(5) | +---------------+-------+-------------+----------------+----------------------------------+-------+ | tdigest | Unary | Numeric | Scalar Float64 | :struct:`TDigestOptions` | | +---------------+-------+-------------+----------------+----------------------------------+-------+ @@ -213,18 +213,23 @@ Aggregations Notes: -* \(1) Output is a ``{"min": input type, "max": input type}`` Struct. +* \(1) If null values are taken into account by setting ScalarAggregateOptions + parameter skip_nulls = false then `Kleene logic`_ logic is applied. -* \(2) Output is an array of ``{"mode": input type, "count": Int64}`` Struct. +* \(2) Output is a ``{"min": input type, "max": input type}`` Struct. + +* \(3) Output is an array of ``{"mode": input type, "count": Int64}`` Struct. It contains the *N* most common elements in the input, in descending order, where *N* is given in :member:`ModeOptions::n`. If two values have the same count, the smallest one comes first. Note that the output can have less than *N* elements if the input has less than *N* distinct values. -* \(3) Output is Float64 or input type, depending on QuantileOptions. +* \(4) Output is Float64 or input type, depending on QuantileOptions. + +* \(5) Output is Int64, UInt64 or Float64, depending on the input type. -* \(4) Output is Int64, UInt64 or Float64, depending on the input type. +.. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic#Kleene_and_Priest_logics Element-wise ("scalar") functions --------------------------------- From 263fd7afdf7b1c161cbe577ae4433f6cbf689d02 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 15 Jun 2021 14:38:36 +0200 Subject: [PATCH 5/8] Apply suggestions from code review Co-authored-by: Joris Van den Bossche --- cpp/src/arrow/compute/api_aggregate.h | 6 ++++-- docs/source/cpp/compute.rst | 2 -- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/compute/api_aggregate.h b/cpp/src/arrow/compute/api_aggregate.h index 0d11f37bcd0..79af4978223 100644 --- a/cpp/src/arrow/compute/api_aggregate.h +++ b/cpp/src/arrow/compute/api_aggregate.h @@ -207,7 +207,8 @@ Result MinMax( /// This function returns true if any of the elements in the array evaluates /// to true and false otherwise. Null values are ignored by default. /// If null values are taken into account by setting ScalarAggregateOptions -/// parameter skip_nulls = false then kleen logic is applied. +/// parameter skip_nulls = false then Kleene logic is used. +/// See KleeneOr for more details on Kleene logic. /// /// \param[in] value input datum, expecting a boolean array /// \param[in] options see ScalarAggregateOptions for more information @@ -227,7 +228,8 @@ Result Any( /// This function returns true if all of the elements in the array evaluate /// to true and false otherwise. Null values are ignored by default. /// If null values are taken into account by setting ScalarAggregateOptions -/// parameter skip_nulls = false then kleen logic is applied. +/// parameter skip_nulls = false then Kleene logic is used. +/// See KleeneOr for more details on Kleene logic. /// /// \param[in] value input datum, expecting a boolean array /// \param[in] options see ScalarAggregateOptions for more information diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 736602af198..94ecd5ac45a 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -229,8 +229,6 @@ Notes: * \(5) Output is Int64, UInt64 or Float64, depending on the input type. -.. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic#Kleene_and_Priest_logics - Element-wise ("scalar") functions --------------------------------- From 2fa7505c393638fbce5bd7f725f5f1446733e86b Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 15 Jun 2021 16:02:09 +0200 Subject: [PATCH 6/8] Update cpp/src/arrow/compute/api_aggregate.h Co-authored-by: Joris Van den Bossche --- cpp/src/arrow/compute/api_aggregate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/api_aggregate.h b/cpp/src/arrow/compute/api_aggregate.h index 79af4978223..7a6c44bd923 100644 --- a/cpp/src/arrow/compute/api_aggregate.h +++ b/cpp/src/arrow/compute/api_aggregate.h @@ -229,7 +229,7 @@ Result Any( /// to true and false otherwise. Null values are ignored by default. /// If null values are taken into account by setting ScalarAggregateOptions /// parameter skip_nulls = false then Kleene logic is used. -/// See KleeneOr for more details on Kleene logic. +/// See KleeneAnd for more details on Kleene logic. /// /// \param[in] value input datum, expecting a boolean array /// \param[in] options see ScalarAggregateOptions for more information From 5150ec9871067315a69508b6ebfa3cf96e2c8cbe Mon Sep 17 00:00:00 2001 From: Rok Date: Thu, 24 Jun 2021 20:49:54 +0200 Subject: [PATCH 7/8] Adding comments to aggregate_basic.cc. --- cpp/src/arrow/compute/kernels/aggregate_basic.cc | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index d1c71c47bce..5e0454c9c4d 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -434,13 +434,19 @@ const FunctionDoc min_max_doc{"Compute the minimum and maximum values of a numer const FunctionDoc any_doc{"Test whether any element in a boolean array evaluates to true", ("Null values are ignored by default.\n" - "This can be changed through ScalarAggregateOptions."), + "If null values are taken into account by setting " + "ScalarAggregateOptions parameter skip_nulls = false then " + "Kleene logic is used.\n" + "See KleeneOr for more details on Kleene logic."), {"array"}, "ScalarAggregateOptions"}; const FunctionDoc all_doc{"Test whether all elements in a boolean array evaluate to true", ("Null values are ignored by default.\n" - "This can be changed through ScalarAggregateOptions."), + "If null values are taken into account by setting " + "ScalarAggregateOptions parameter skip_nulls = false then " + "Kleene logic is used.\n" + "See KleeneAnd for more details on Kleene logic."), {"array"}, "ScalarAggregateOptions"}; From 2743729209d954b754e8f8c01c7e47454ac331c5 Mon Sep 17 00:00:00 2001 From: Rok Date: Tue, 13 Jul 2021 20:04:02 +0200 Subject: [PATCH 8/8] Adding lole32 to configure.win. --- r/tests/testthat/test-compute-aggregate.R | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/r/tests/testthat/test-compute-aggregate.R b/r/tests/testthat/test-compute-aggregate.R index 23b8a615bee..428f799c97b 100644 --- a/r/tests/testthat/test-compute-aggregate.R +++ b/r/tests/testthat/test-compute-aggregate.R @@ -390,21 +390,15 @@ test_that("value_counts", { }) test_that("any.Array and any.ChunkedArray", { - + data <- c(1:10, NA, NA) expect_vector_equal(any(input > 5), data) expect_vector_equal(any(input > 5, na.rm = TRUE), data) expect_vector_equal(any(input < 1), data) expect_vector_equal(any(input < 1, na.rm = TRUE), data) - + data_logical <- c(TRUE, FALSE, TRUE, NA, FALSE) - - expect_vector_equal(any(input), data_logical) - expect_vector_equal(any(input, na.rm = FALSE), data_logical) - expect_vector_equal(any(input, na.rm = TRUE), data_logical) - - data_logical <- c(NA, FALSE, FALSE, NA, FALSE) expect_vector_equal(any(input), data_logical) expect_vector_equal(any(input, na.rm = FALSE), data_logical) @@ -415,7 +409,7 @@ test_that("any.Array and any.ChunkedArray", { test_that("all.Array and all.ChunkedArray", { data <- c(1:10, NA, NA) - + expect_vector_equal(all(input > 5), data) expect_vector_equal(all(input > 5, na.rm = TRUE), data) @@ -426,7 +420,7 @@ test_that("all.Array and all.ChunkedArray", { expect_vector_equal(all(input), data_logical) expect_vector_equal(all(input, na.rm = TRUE), data_logical) - + }) test_that("variance", {