From ebfa27a4aae364f6eac5e532290c76b6cb5a67f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 18 Jun 2020 15:36:16 +0200 Subject: [PATCH 01/11] sum mean for boolean --- .../arrow/compute/kernels/aggregate_basic.cc | 31 ++++++++++++++++ .../compute/kernels/aggregate_internal.h | 5 +++ .../arrow/compute/kernels/aggregate_test.cc | 35 +++++++++++++++++++ 3 files changed, 71 insertions(+) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index abc2665a2d0..33452bfaecd 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -92,6 +92,8 @@ std::unique_ptr CountInit(KernelContext*, const KernelInitArgs& arg // ---------------------------------------------------------------------- // Sum implementation +// factor out common part to a SumStateBase + template ::Type> struct SumState { @@ -255,6 +257,28 @@ struct SumState { } }; +template <> +struct SumState { + using SumType = typename FindAccumulatorType::Type; + using ThisType = SumState; + + ThisType& operator+=(const ThisType& rhs) { + this->count += rhs.count; + this->sum += rhs.sum; + return *this; + } + + public: + void Consume(const Array& input) { + const BooleanArray& array = static_cast(input); + count += array.length(); + sum += array.true_count(); + } + + size_t count = 0; + typename SumType::c_type sum = 0; +}; + template struct SumImpl : public ScalarAggregator { using ArrayType = typename TypeTraits::ArrayType; @@ -311,6 +335,11 @@ struct SumLikeInit { return Status::NotImplemented("No sum implemented"); } + Status Visit(const BooleanType&) { + state.reset(new KernelClass()); + return Status::OK(); + } + template enable_if_number Visit(const Type&) { state.reset(new KernelClass()); @@ -525,12 +554,14 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunction(std::move(func))); func = std::make_shared("sum", Arity::Unary()); + AddBasicAggKernels(SumInit, {boolean()}, int64(), func.get()); AddBasicAggKernels(SumInit, SignedIntTypes(), int64(), func.get()); AddBasicAggKernels(SumInit, UnsignedIntTypes(), uint64(), func.get()); AddBasicAggKernels(SumInit, FloatingPointTypes(), float64(), func.get()); DCHECK_OK(registry->AddFunction(std::move(func))); func = std::make_shared("mean", Arity::Unary()); + AddBasicAggKernels(MeanInit, {boolean()}, float64(), func.get()); AddBasicAggKernels(MeanInit, NumericTypes(), float64(), func.get()); DCHECK_OK(registry->AddFunction(std::move(func))); diff --git a/cpp/src/arrow/compute/kernels/aggregate_internal.h b/cpp/src/arrow/compute/kernels/aggregate_internal.h index de3584588ea..5f2f50c0b06 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_internal.h @@ -27,6 +27,11 @@ namespace compute { template struct FindAccumulatorType {}; +template +struct FindAccumulatorType> { + using Type = UInt64Type; +}; + template struct FindAccumulatorType> { using Type = Int64Type; diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index 8cbb987356f..b6efb66937a 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -127,6 +127,41 @@ void ValidateSum(const Array& array) { ValidateSum(array, NaiveSum(array)); } +using UnaryOp = Result(const Datum&, ExecContext*); + +template +void ValidateBooleanAgg(const std::string& json, + const std::shared_ptr& expected) { + auto array = ArrayFromJSON(boolean(), json); + auto exp = Datum(expected); + ASSERT_OK_AND_ASSIGN(Datum result, Op(array, nullptr)); + ASSERT_TRUE(result.Equals(exp)); +} + +TEST(TestBooleanAggregation, Sum) { + ValidateBooleanAgg("[]", std::make_shared()); + ValidateBooleanAgg("[null]", std::make_shared(0)); + ValidateBooleanAgg("[null, false]", std::make_shared(0)); + ValidateBooleanAgg("[true]", std::make_shared(1)); + ValidateBooleanAgg("[true, false, true]", std::make_shared(2)); + ValidateBooleanAgg("[true, false, true, true, null]", + std::make_shared(3)); +} + +TEST(TestBooleanAggregation, Mean) { + ValidateBooleanAgg("[]", std::make_shared()); + ValidateBooleanAgg("[null]", std::make_shared(0)); + ValidateBooleanAgg("[null, false]", std::make_shared(0)); + ValidateBooleanAgg("[true]", std::make_shared(1)); + ValidateBooleanAgg("[true, false, true, false]", + std::make_shared(0.5)); + ValidateBooleanAgg("[true, null]", std::make_shared(0.5)); + ValidateBooleanAgg("[true, null, false, true]", + std::make_shared(0.5)); + ValidateBooleanAgg("[true, null, false, false]", + std::make_shared(0.25)); +} + template class TestNumericSumKernel : public ::testing::Test {}; From 88dd1c703bbb6057ef2e74304aed82d4f950e864 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Thu, 18 Jun 2020 15:37:16 +0200 Subject: [PATCH 02/11] remove comment --- cpp/src/arrow/compute/kernels/aggregate_basic.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 33452bfaecd..b370b0c7a47 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -92,8 +92,6 @@ std::unique_ptr CountInit(KernelContext*, const KernelInitArgs& arg // ---------------------------------------------------------------------- // Sum implementation -// factor out common part to a SumStateBase - template ::Type> struct SumState { From 46736990bc19a7d927c7e03ec8862c7668135f00 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Thu, 18 Jun 2020 13:45:49 -0700 Subject: [PATCH 03/11] Use sum/mean on boolean from R --- r/R/compute.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/r/R/compute.R b/r/R/compute.R index 9b18f22ae27..2eca36fc990 100644 --- a/r/R/compute.R +++ b/r/R/compute.R @@ -62,9 +62,8 @@ scalar_aggregate <- function(FUN, ..., na.rm = FALSE) { } } - if (inherits(a$type, "Boolean")) { - # Bool sum/mean not implemented so cast to int - # https://issues.apache.org/jira/browse/ARROW-9055 + if (inherits(a$type, "Boolean") && FUN %in% "minmax") { + # Bool minmax not implemented so cast to int a <- a$cast(int8()) } Scalar$create(call_function(FUN, a, options = list(na.rm = na.rm))) From 2bfa1859749c190ff4fd4eac0bbfafa9f097e302 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Thu, 18 Jun 2020 14:03:34 -0700 Subject: [PATCH 04/11] Add tests for na.rm boolean behavior --- r/R/compute.R | 2 +- r/tests/testthat/test-compute.R | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/r/R/compute.R b/r/R/compute.R index 2eca36fc990..5153d355345 100644 --- a/r/R/compute.R +++ b/r/R/compute.R @@ -58,7 +58,7 @@ scalar_aggregate <- function(FUN, ..., na.rm = FALSE) { if (FUN %in% c("mean", "sum")) { # Arrow sum/mean function always drops NAs so handle that here # https://issues.apache.org/jira/browse/ARROW-9054 - return(Scalar$create(NA_integer_, type = a$type)) + return(Scalar$create(NA_real_)) } } diff --git a/r/tests/testthat/test-compute.R b/r/tests/testthat/test-compute.R index 3d4b5f102b1..5cd94fe9087 100644 --- a/r/tests/testthat/test-compute.R +++ b/r/tests/testthat/test-compute.R @@ -33,9 +33,10 @@ test_that("sum.Array", { expect_is(sum(na, na.rm = TRUE), "Scalar") expect_identical(as.numeric(sum(na, na.rm = TRUE)), sum(floats, na.rm = TRUE)) - bools <- c(TRUE, TRUE, FALSE) + bools <- c(TRUE, NA, TRUE, FALSE) b <- Array$create(bools) expect_identical(as.integer(sum(b)), sum(bools)) + expect_identical(as.integer(sum(b, na.rm = TRUE)), sum(bools, na.rm = TRUE)) }) test_that("sum.ChunkedArray", { @@ -73,9 +74,10 @@ test_that("mean.Array", { expect_is(mean(na, na.rm = TRUE), "Scalar") expect_identical(as.vector(mean(na, na.rm = TRUE)), mean(floats, na.rm = TRUE)) - bools <- c(TRUE, TRUE, FALSE) + bools <- c(TRUE, NA, TRUE, FALSE) b <- Array$create(bools) expect_identical(as.vector(mean(b)), mean(bools)) + expect_identical(as.integer(sum(b, na.rm = TRUE)), sum(bools, na.rm = TRUE)) }) test_that("mean.ChunkedArray", { From db7d4cdd2c978af58e5f1221148c4685045bb59e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Fri, 19 Jun 2020 14:54:08 +0200 Subject: [PATCH 05/11] don't count nulls --- cpp/src/arrow/compute/kernels/aggregate_basic.cc | 2 +- cpp/src/arrow/compute/kernels/aggregate_test.cc | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index b370b0c7a47..f17df7fbb92 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -269,7 +269,7 @@ struct SumState { public: void Consume(const Array& input) { const BooleanArray& array = static_cast(input); - count += array.length(); + count += array.length() - array.null_count(); sum += array.true_count(); } diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index b6efb66937a..321293f973b 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -140,7 +140,7 @@ void ValidateBooleanAgg(const std::string& json, TEST(TestBooleanAggregation, Sum) { ValidateBooleanAgg("[]", std::make_shared()); - ValidateBooleanAgg("[null]", std::make_shared(0)); + ValidateBooleanAgg("[null]", std::make_shared()); ValidateBooleanAgg("[null, false]", std::make_shared(0)); ValidateBooleanAgg("[true]", std::make_shared(1)); ValidateBooleanAgg("[true, false, true]", std::make_shared(2)); @@ -150,15 +150,15 @@ TEST(TestBooleanAggregation, Sum) { TEST(TestBooleanAggregation, Mean) { ValidateBooleanAgg("[]", std::make_shared()); - ValidateBooleanAgg("[null]", std::make_shared(0)); + ValidateBooleanAgg("[null]", std::make_shared()); ValidateBooleanAgg("[null, false]", std::make_shared(0)); ValidateBooleanAgg("[true]", std::make_shared(1)); ValidateBooleanAgg("[true, false, true, false]", std::make_shared(0.5)); - ValidateBooleanAgg("[true, null]", std::make_shared(0.5)); - ValidateBooleanAgg("[true, null, false, true]", - std::make_shared(0.5)); - ValidateBooleanAgg("[true, null, false, false]", + ValidateBooleanAgg("[true, null]", std::make_shared(1)); + ValidateBooleanAgg("[true, null, false, true, true]", + std::make_shared(0.75)); + ValidateBooleanAgg("[true, null, false, false, false]", std::make_shared(0.25)); } From fd8f2171d7e4855eb65e44c8af0062ef82f657b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Fri, 19 Jun 2020 16:08:57 +0200 Subject: [PATCH 06/11] fix minmax kernel for empty array and add support for boolean type --- .../arrow/compute/kernels/aggregate_basic.cc | 50 ++++++++++++++--- .../arrow/compute/kernels/aggregate_test.cc | 54 +++++++++++++++++-- cpp/src/arrow/testing/gtest_util.h | 2 + 3 files changed, 95 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index f17df7fbb92..9fcea1bdad0 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -366,6 +366,30 @@ std::unique_ptr MeanInit(KernelContext* ctx, const KernelInitArgs& template struct MinMaxState {}; +template +struct MinMaxState> { + using ThisType = MinMaxState; + using T = typename ArrowType::c_type; + + ThisType& operator+=(const ThisType& rhs) { + this->has_nulls |= rhs.has_nulls; + this->has_values |= rhs.has_values; + this->min = this->min && rhs.min; + this->max = this->max || rhs.max; + return *this; + } + + void MergeOne(T value) { + this->min = this->min && value; + this->max = this->max || value; + } + + T min = true; + T max = false; + bool has_nulls = false; + bool has_values = false; +}; + template struct MinMaxState> { using ThisType = MinMaxState; @@ -373,6 +397,7 @@ struct MinMaxState> { ThisType& operator+=(const ThisType& rhs) { this->has_nulls |= rhs.has_nulls; + this->has_values |= rhs.has_values; this->min = std::min(this->min, rhs.min); this->max = std::max(this->max, rhs.max); return *this; @@ -386,6 +411,7 @@ struct MinMaxState> { T min = std::numeric_limits::max(); T max = std::numeric_limits::min(); bool has_nulls = false; + bool has_values = false; }; template @@ -395,6 +421,7 @@ struct MinMaxState> { ThisType& operator+=(const ThisType& rhs) { this->has_nulls |= rhs.has_nulls; + this->has_values |= rhs.has_values; this->min = std::fmin(this->min, rhs.min); this->max = std::fmax(this->max, rhs.max); return *this; @@ -408,6 +435,7 @@ struct MinMaxState> { T min = std::numeric_limits::infinity(); T max = -std::numeric_limits::infinity(); bool has_nulls = false; + bool has_values = false; }; template @@ -424,24 +452,27 @@ struct MinMaxImpl : public ScalarAggregator { ArrayType arr(batch[0].array()); - local.has_nulls = arr.null_count() > 0; + const auto null_count = arr.null_count(); + local.has_nulls = null_count > 0; + local.has_values = (arr.length() - null_count) > 0; + if (local.has_nulls && options.null_handling == MinMaxOptions::OUTPUT_NULL) { this->state = local; return; } - const auto values = arr.raw_values(); - if (arr.null_count() > 0) { + // const auto values = arr.raw_values(); + if (local.has_nulls) { BitmapReader reader(arr.null_bitmap_data(), arr.offset(), arr.length()); for (int64_t i = 0; i < arr.length(); i++) { if (reader.IsSet()) { - local.MergeOne(values[i]); + local.MergeOne(arr.Value(i)); } reader.Next(); } } else { for (int64_t i = 0; i < arr.length(); i++) { - local.MergeOne(values[i]); + local.MergeOne(arr.Value(i)); } } this->state = local; @@ -456,7 +487,8 @@ struct MinMaxImpl : public ScalarAggregator { using ScalarType = typename TypeTraits::ScalarType; std::vector> values; - if (state.has_nulls && options.null_handling == MinMaxOptions::OUTPUT_NULL) { + if (!state.has_values || + (state.has_nulls && options.null_handling == MinMaxOptions::OUTPUT_NULL)) { // (null, null) values = {std::make_shared(), std::make_shared()}; } else { @@ -490,6 +522,11 @@ struct MinMaxInitState { return Status::NotImplemented("No sum implemented"); } + Status Visit(const BooleanType&) { + state.reset(new MinMaxImpl(out_type, options)); + return Status::OK(); + } + template enable_if_number Visit(const Type&) { state.reset(new MinMaxImpl(out_type, options)); @@ -566,6 +603,7 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) { static auto default_minmax_options = MinMaxOptions::Defaults(); func = std::make_shared("minmax", Arity::Unary(), &default_minmax_options); + AddMinMaxKernels(MinMaxInit, {boolean()}, func.get()); AddMinMaxKernels(MinMaxInit, NumericTypes(), func.get()); DCHECK_OK(registry->AddFunction(std::move(func))); } diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index 321293f973b..97f8041f9a3 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -381,10 +381,10 @@ TYPED_TEST(TestRandomNumericMeanKernel, RandomArrayMean) { /// template -class TestNumericMinMaxKernel : public ::testing::Test { +class TestPrimitiveMinMaxKernel : public ::testing::Test { using Traits = TypeTraits; using ArrayType = typename Traits::ArrayType; - using c_type = typename ArrayType::value_type; + using c_type = typename ArrowType::c_type; using ScalarType = typename Traits::ScalarType; public: @@ -436,15 +436,59 @@ class TestNumericMinMaxKernel : public ::testing::Test { }; template -class TestFloatingMinMaxKernel : public TestNumericMinMaxKernel {}; +class TestBooleanMinMaxKernel : public TestPrimitiveMinMaxKernel {}; -TYPED_TEST_SUITE(TestNumericMinMaxKernel, IntegralArrowTypes); -TYPED_TEST(TestNumericMinMaxKernel, Basics) { +template +class TestIntegerMinMaxKernel : public TestPrimitiveMinMaxKernel {}; + +template +class TestFloatingMinMaxKernel : public TestPrimitiveMinMaxKernel {}; + +TYPED_TEST_SUITE(TestBooleanMinMaxKernel, BooleanArrowType); +TYPED_TEST(TestBooleanMinMaxKernel, Basics) { + MinMaxOptions options; + std::vector chunked_input0 = {"[]", "[]"}; + std::vector chunked_input1 = {"[true, true, null]", "[true, null]"}; + std::vector chunked_input2 = {"[false, false, false]", "[false]"}; + std::vector chunked_input3 = {"[true, null]", "[null, false]"}; + + // SKIP nulls by default + this->AssertMinMaxIsNull("[]", options); + this->AssertMinMaxIsNull("[null, null, null]", options); + this->AssertMinMaxIs("[false, false, false]", false, false, options); + this->AssertMinMaxIs("[false, false, false, null]", false, false, options); + this->AssertMinMaxIs("[true, null, true, true]", true, true, options); + this->AssertMinMaxIs("[true, null, true, true]", true, true, options); + this->AssertMinMaxIs("[true, null, false, true]", false, true, options); + this->AssertMinMaxIsNull(chunked_input0, options); + this->AssertMinMaxIs(chunked_input1, true, true, options); + this->AssertMinMaxIs(chunked_input2, false, false, options); + this->AssertMinMaxIs(chunked_input3, false, true, options); + + options = MinMaxOptions(MinMaxOptions::OUTPUT_NULL); + this->AssertMinMaxIsNull("[]", options); + this->AssertMinMaxIsNull("[null, null, null]", options); + this->AssertMinMaxIsNull("[false, null, false]", options); + this->AssertMinMaxIsNull("[true, null]", options); + this->AssertMinMaxIs("[true, true, true]", true, true, options); + this->AssertMinMaxIs("[false, false]", false, false, options); + this->AssertMinMaxIs("[false, true]", false, true, options); + this->AssertMinMaxIsNull(chunked_input0, options); + this->AssertMinMaxIsNull(chunked_input1, options); + this->AssertMinMaxIs(chunked_input2, false, false, options); + this->AssertMinMaxIsNull(chunked_input3, options); +} + +TYPED_TEST_SUITE(TestIntegerMinMaxKernel, IntegralArrowTypes); +TYPED_TEST(TestIntegerMinMaxKernel, Basics) { MinMaxOptions options; std::vector chunked_input1 = {"[5, 1, 2, 3, 4]", "[9, 1, null, 3, 4]"}; std::vector chunked_input2 = {"[5, null, 2, 3, 4]", "[9, 1, 2, 3, 4]"}; std::vector chunked_input3 = {"[5, 1, 2, 3, null]", "[9, 1, null, 3, 4]"}; + // SKIP nulls by default + this->AssertMinMaxIsNull("[]", options); + this->AssertMinMaxIsNull("[null, null, null]", options); this->AssertMinMaxIs("[5, 1, 2, 3, 4]", 1, 5, options); this->AssertMinMaxIs("[5, null, 2, 3, 4]", 2, 5, options); this->AssertMinMaxIs(chunked_input1, 1, 9, options); diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h index 89e870da9d2..5e4af37e655 100644 --- a/cpp/src/arrow/testing/gtest_util.h +++ b/cpp/src/arrow/testing/gtest_util.h @@ -137,6 +137,8 @@ namespace arrow { // ---------------------------------------------------------------------- // Useful testing::Types declarations +typedef ::testing::Types BooleanArrowType; + typedef ::testing::Types NumericArrowTypes; From 37172303a79b103d800c24718c5a4812358b864f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Fri, 19 Jun 2020 16:10:06 +0200 Subject: [PATCH 07/11] remove comment --- cpp/src/arrow/compute/kernels/aggregate_basic.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 9fcea1bdad0..7b9a550eb02 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -461,7 +461,6 @@ struct MinMaxImpl : public ScalarAggregator { return; } - // const auto values = arr.raw_values(); if (local.has_nulls) { BitmapReader reader(arr.null_bitmap_data(), arr.offset(), arr.length()); for (int64_t i = 0; i < arr.length(); i++) { From 8f99d169e6131b52e5517954234e569bee4a6cbb Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Fri, 19 Jun 2020 08:28:18 -0700 Subject: [PATCH 08/11] [R] remove special bool case in aggregate function --- r/R/compute.R | 4 ---- r/tests/testthat/test-compute.R | 3 ++- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/r/R/compute.R b/r/R/compute.R index 5153d355345..f28c53a6b01 100644 --- a/r/R/compute.R +++ b/r/R/compute.R @@ -62,10 +62,6 @@ scalar_aggregate <- function(FUN, ..., na.rm = FALSE) { } } - if (inherits(a$type, "Boolean") && FUN %in% "minmax") { - # Bool minmax not implemented so cast to int - a <- a$cast(int8()) - } Scalar$create(call_function(FUN, a, options = list(na.rm = na.rm))) } diff --git a/r/tests/testthat/test-compute.R b/r/tests/testthat/test-compute.R index 5cd94fe9087..811c27d05e5 100644 --- a/r/tests/testthat/test-compute.R +++ b/r/tests/testthat/test-compute.R @@ -115,5 +115,6 @@ test_that("min/max.Array", { bools <- c(TRUE, TRUE, FALSE) b <- Array$create(bools) - expect_identical(as.vector(min(b)), min(bools)) + # R is inconsistent here: typeof(min(NA)) == "integer", not "logical" + expect_identical(as.vector(min(b)), as.logical(min(bools))) }) From 48da080a1b3b58619865e1534a355064472d32e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Fri, 26 Jun 2020 16:43:15 +0200 Subject: [PATCH 09/11] don't use typed test --- cpp/src/arrow/compute/kernels/aggregate_test.cc | 8 +++----- cpp/src/arrow/testing/gtest_util.h | 2 -- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index 97f8041f9a3..72531cf0c5f 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -435,17 +435,15 @@ class TestPrimitiveMinMaxKernel : public ::testing::Test { std::shared_ptr type_singleton() { return Traits::type_singleton(); } }; -template -class TestBooleanMinMaxKernel : public TestPrimitiveMinMaxKernel {}; - template class TestIntegerMinMaxKernel : public TestPrimitiveMinMaxKernel {}; template class TestFloatingMinMaxKernel : public TestPrimitiveMinMaxKernel {}; -TYPED_TEST_SUITE(TestBooleanMinMaxKernel, BooleanArrowType); -TYPED_TEST(TestBooleanMinMaxKernel, Basics) { +class TestBooleanMinMaxKernel : public TestPrimitiveMinMaxKernel {}; + +TEST_F(TestBooleanMinMaxKernel, Basics) { MinMaxOptions options; std::vector chunked_input0 = {"[]", "[]"}; std::vector chunked_input1 = {"[true, true, null]", "[true, null]"}; diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h index 5e4af37e655..89e870da9d2 100644 --- a/cpp/src/arrow/testing/gtest_util.h +++ b/cpp/src/arrow/testing/gtest_util.h @@ -137,8 +137,6 @@ namespace arrow { // ---------------------------------------------------------------------- // Useful testing::Types declarations -typedef ::testing::Types BooleanArrowType; - typedef ::testing::Types NumericArrowTypes; From 03e05bb9f09b3276279e57cd20b9ffadf9e9f436 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Fri, 26 Jun 2020 17:45:42 +0200 Subject: [PATCH 10/11] specialize minmax for boolean --- .../arrow/compute/kernels/aggregate_basic.cc | 30 ++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 7b9a550eb02..45015ba4558 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -502,6 +502,34 @@ struct MinMaxImpl : public ScalarAggregator { MinMaxState state; }; +struct BooleanMinMaxImpl : public MinMaxImpl { + + using MinMaxImpl::MinMaxImpl; + + void Consume(KernelContext*, const ExecBatch& batch) override { + StateType local; + ArrayType arr(batch[0].array()); + + const auto arr_length = arr.length(); + const auto null_count = arr.null_count(); + const auto valid_count = arr_length - null_count; + + local.has_nulls = null_count > 0; + local.has_values = valid_count > 0; + if (local.has_nulls && options.null_handling == MinMaxOptions::OUTPUT_NULL) { + this->state = local; + return; + } + + const auto true_count = arr.true_count(); + const auto false_count = valid_count - true_count; + local.max = true_count > 0; + local.min = false_count == 0; + + this->state = local; + } +}; + struct MinMaxInitState { std::unique_ptr state; KernelContext* ctx; @@ -522,7 +550,7 @@ struct MinMaxInitState { } Status Visit(const BooleanType&) { - state.reset(new MinMaxImpl(out_type, options)); + state.reset(new BooleanMinMaxImpl(out_type, options)); return Status::OK(); } From bb4f12c816e6d313615fbbfbc71044f90012ae1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Fri, 26 Jun 2020 18:13:37 +0200 Subject: [PATCH 11/11] lint --- cpp/src/arrow/compute/kernels/aggregate_basic.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 45015ba4558..a8f839c33ea 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -503,7 +503,6 @@ struct MinMaxImpl : public ScalarAggregator { }; struct BooleanMinMaxImpl : public MinMaxImpl { - using MinMaxImpl::MinMaxImpl; void Consume(KernelContext*, const ExecBatch& batch) override {