From e1726ccefa5e756a56e036a1131ceebc6522c844 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 16 Jun 2020 21:31:00 -0500 Subject: [PATCH 1/3] Implement BooleanArray::true_count and false_count, add Python bindings --- cpp/src/arrow/array/array_primitive.cc | 32 ++++++++++++++++++++++++++ cpp/src/arrow/array/array_primitive.h | 8 +++++++ cpp/src/arrow/array/array_test.cc | 27 ++++++++++++++++++++++ python/pyarrow/array.pxi | 7 ++++++ python/pyarrow/includes/libarrow.pxd | 2 ++ python/pyarrow/tests/test_array.py | 7 ++++++ 6 files changed, 83 insertions(+) diff --git a/cpp/src/arrow/array/array_primitive.cc b/cpp/src/arrow/array/array_primitive.cc index 0f70e3c280e..f42c5dcebf3 100644 --- a/cpp/src/arrow/array/array_primitive.cc +++ b/cpp/src/arrow/array/array_primitive.cc @@ -22,6 +22,7 @@ #include "arrow/array/array_base.h" #include "arrow/type.h" +#include "arrow/util/bit_block_counter.h" #include "arrow/util/logging.h" namespace arrow { @@ -49,6 +50,37 @@ BooleanArray::BooleanArray(int64_t length, const std::shared_ptr<Buffer>& data, int64_t offset) : PrimitiveArray(boolean(), length, data, null_bitmap, null_count, offset) {} +int64_t BooleanArray::false_count() const { + return this->length() - this->null_count() - this->true_count(); +} + +int64_t BooleanArray::true_count() const { + int64_t count = 0; + if (data_->buffers[0] != nullptr) { + internal::BinaryBitBlockCounter bit_counter(data_->buffers[0]->data(), data_->offset, + data_->buffers[1]->data(), data_->offset, + data_->length); + while (true) { + internal::BitBlockCount block = bit_counter.NextAndWord(); + if (block.length == 0) { + break; + } + count += block.popcount; + } + } else { + internal::BitBlockCounter bit_counter(data_->buffers[1]->data(), data_->offset, + data_->length); + while (true) { + internal::BitBlockCount block = bit_counter.NextFourWords(); + if (block.length == 0) { + break; + } + count += block.popcount; + } + } + return count; +} + // 
---------------------------------------------------------------------- // Day time interval diff --git a/cpp/src/arrow/array/array_primitive.h b/cpp/src/arrow/array/array_primitive.h index e58f5f4c8b6..c58fee77cef 100644 --- a/cpp/src/arrow/array/array_primitive.h +++ b/cpp/src/arrow/array/array_primitive.h @@ -84,6 +84,14 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray { bool GetView(int64_t i) const { return Value(i); } + /// \brief Return the number of false (0) values among the valid + /// values. Result is not cached. + int64_t false_count() const; + + /// \brief Return the number of true (1) values among the valid + /// values. Result is not cached. + int64_t true_count() const; + protected: using PrimitiveArray::PrimitiveArray; }; diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 21c49093585..24f180f73c5 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -656,6 +656,33 @@ void TestPrimitiveBuilder::Check(const std::unique_ptr ASSERT_EQ(0, builder->null_count()); } +TEST(TestBooleanArray, TrueCountFalseCount) { + random::RandomArrayGenerator rng(/*seed=*/0); + + const int64_t length = 10000; + auto arr = rng.Boolean(length, /*true_probability=*/0.5, /*null_probability=*/0.1); + + auto CheckArray = [&](const BooleanArray& values) { + int64_t expected_false = 0; + int64_t expected_true = 0; + for (int64_t i = 0; i < values.length(); ++i) { + if (values.IsValid(i)) { + if (values.Value(i)) { + ++expected_true; + } else { + ++expected_false; + } + } + } + ASSERT_EQ(values.true_count(), expected_true); + ASSERT_EQ(values.false_count(), expected_false); + }; + + CheckArray(checked_cast<const BooleanArray&>(*arr)); + CheckArray(checked_cast<const BooleanArray&>(*arr->Slice(5))); + CheckArray(checked_cast<const BooleanArray&>(*arr->Slice(0, 0))); +} + TEST(TestPrimitiveAdHoc, TestType) { Int8Builder i8(default_memory_pool()); ASSERT_TRUE(i8.type()->Equals(int8())); diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 
94e30c8fe6b..8be7ebac7f4 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1158,6 +1158,13 @@ cdef class BooleanArray(Array): """ Concrete class for Arrow arrays of boolean data type. """ + @property + def false_count(self): + return (<CBooleanArray*> self.ap).false_count() + + @property + def true_count(self): + return (<CBooleanArray*> self.ap).true_count() cdef class NumericArray(Array): diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index d8084f0c401..757d8a70fdb 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -445,6 +445,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CBooleanArray" arrow::BooleanArray"(CArray): c_bool Value(int i) + int64_t false_count() + int64_t true_count() cdef cppclass CUInt8Array" arrow::UInt8Array"(CArray): uint8_t Value(int i) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 8bfb072702a..883261e2031 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -1905,6 +1905,13 @@ def test_array_from_strided_bool(): assert result.equals(expected) +def test_boolean_true_count_false_count(): + # ARROW-9145 + arr = pa.array([True, True, None, False, None, True] * 1000) + assert arr.true_count == 3000 + assert arr.false_count == 1000 + + def test_buffers_primitive(): a = pa.array([1, 2, None, 4], type=pa.int16()) buffers = a.buffers() From e3ee747928f7436e4dfa98da46a9ef7b310c33dc Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 17 Jun 2020 15:48:25 -0500 Subject: [PATCH 2/3] Use CountSetBits in the appropriate place --- cpp/src/arrow/array/array_primitive.cc | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/array/array_primitive.cc b/cpp/src/arrow/array/array_primitive.cc index f42c5dcebf3..6c434fb6429 100644 --- a/cpp/src/arrow/array/array_primitive.cc +++ b/cpp/src/arrow/array/array_primitive.cc @@ -55,11 
+55,11 @@ int64_t BooleanArray::false_count() const { } int64_t BooleanArray::true_count() const { - int64_t count = 0; if (data_->buffers[0] != nullptr) { internal::BinaryBitBlockCounter bit_counter(data_->buffers[0]->data(), data_->offset, data_->buffers[1]->data(), data_->offset, data_->length); + int64_t count = 0; while (true) { internal::BitBlockCount block = bit_counter.NextAndWord(); if (block.length == 0) { @@ -67,18 +67,11 @@ int64_t BooleanArray::true_count() const { } count += block.popcount; } + return count; } else { - internal::BitBlockCounter bit_counter(data_->buffers[1]->data(), data_->offset, - data_->length); - while (true) { - internal::BitBlockCount block = bit_counter.NextFourWords(); - if (block.length == 0) { - break; - } - count += block.popcount; - } + return internal::CountSetBits(data_->buffers[1]->data(), data_->offset, + data_->length); } - return count; } // ---------------------------------------------------------------------- From bebb62976c06318c03ca01af8f9751657e53e2ba Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 17 Jun 2020 15:49:51 -0500 Subject: [PATCH 3/3] Use CountSetBits in more cases --- cpp/src/arrow/array/array_primitive.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/array/array_primitive.cc b/cpp/src/arrow/array/array_primitive.cc index 6c434fb6429..519a7f21f43 100644 --- a/cpp/src/arrow/array/array_primitive.cc +++ b/cpp/src/arrow/array/array_primitive.cc @@ -55,7 +55,8 @@ int64_t BooleanArray::false_count() const { } int64_t BooleanArray::true_count() const { - if (data_->buffers[0] != nullptr) { + if (data_->null_count.load() != 0) { + DCHECK(data_->buffers[0]); internal::BinaryBitBlockCounter bit_counter(data_->buffers[0]->data(), data_->offset, data_->buffers[1]->data(), data_->offset, data_->length);