diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc
index d894df1311e..8777d9ff358 100644
--- a/cpp/src/arrow/array-test.cc
+++ b/cpp/src/arrow/array-test.cc
@@ -33,6 +33,7 @@
 #include "arrow/test-util.h"
 #include "arrow/type.h"
 #include "arrow/type_traits.h"
+#include "arrow/util/bit-util.h"
 #include "arrow/util/decimal.h"
 
 namespace arrow {
@@ -150,30 +151,92 @@ TEST_F(TestArray, TestIsNullIsValid) {
                                  1, 0, 1, 1, 0, 1, 0, 0,
                                  1, 0, 0, 1};
   // clang-format on
+  vector<uint8_t> valid_bitmap;
   int64_t null_count = 0;
   for (uint8_t x : null_bitmap) {
     if (x == 0) {
       ++null_count;
+      valid_bitmap.push_back(1);
+    } else {
+      valid_bitmap.push_back(0);
     }
   }
 
   std::shared_ptr<Buffer> null_buf;
+  std::shared_ptr<Buffer> null_arr_buf;
+  std::shared_ptr<Buffer> valid_arr_buf;
   ASSERT_OK(BitUtil::BytesToBits(null_bitmap, default_memory_pool(), &null_buf));
+  ASSERT_OK(null_buf->Copy(0, null_buf->size(), &valid_arr_buf));
+  ASSERT_OK(BitUtil::BytesToBits(valid_bitmap, default_memory_pool(), &null_arr_buf));
 
   std::unique_ptr<Array> arr;
   arr.reset(new Int32Array(null_bitmap.size(), nullptr, null_buf, null_count));
 
+  std::unique_ptr<Array> null_arr;
+  std::unique_ptr<Array> valid_arr;
+  null_arr.reset(new BooleanArray(valid_bitmap.size(), null_arr_buf, nullptr, 0));
+  valid_arr.reset(new BooleanArray(null_bitmap.size(), valid_arr_buf, nullptr, 0));
+
   ASSERT_EQ(null_count, arr->null_count());
   ASSERT_EQ(5, null_buf->size());
 
   ASSERT_TRUE(arr->null_bitmap()->Equals(*null_buf.get()));
 
+  EXPECT_TRUE(arr->IsNull()->Equals(*null_arr.get()));
+  EXPECT_TRUE(arr->IsValid()->Equals(*valid_arr.get()));
   for (size_t i = 0; i < null_bitmap.size(); ++i) {
     EXPECT_EQ(null_bitmap[i] != 0, !arr->IsNull(i)) << i;
     EXPECT_EQ(null_bitmap[i] != 0, arr->IsValid(i)) << i;
   }
 }
 
+TEST_F(TestArray, TestIsNullIsValidLarge) {
+  // clang-format off
+  vector<uint8_t> null_bitmap = {1, 0, 1, 1, 0, 1, 0, 0,
+                                 1, 0, 1, 1, 0, 1, 0, 0,
+                                 1, 0, 1, 1, 0, 1, 0, 0,
+                                 1, 0, 1, 1, 0, 1, 0, 0,
+                                 1, 0, 0, 1};
+  // clang-format on
+  const size_t initial_size = null_bitmap.size();
+  const size_t generate_size = (3 * BitUtil::kSimdWidth * 8) / null_bitmap.size() + 1;
+
+  for (size_t i = 1; i < generate_size; i++) {
+    for (size_t j = 0; j < initial_size; j++) {
+      null_bitmap.push_back(null_bitmap[j]);
+    }
+  }
+
+  vector<uint8_t> valid_bitmap;
+  int64_t null_count = 0;
+  for (uint8_t x : null_bitmap) {
+    if (x == 0) {
+      ++null_count;
+      valid_bitmap.push_back(1);
+    } else {
+      valid_bitmap.push_back(0);
+    }
+  }
+
+  std::shared_ptr<Buffer> null_buf;
+  std::shared_ptr<Buffer> null_arr_buf;
+  std::shared_ptr<Buffer> valid_arr_buf;
+  ASSERT_OK(BitUtil::BytesToBits(null_bitmap, default_memory_pool(), &null_buf));
+  ASSERT_OK(null_buf->Copy(0, null_buf->size(), &valid_arr_buf));
+  ASSERT_OK(BitUtil::BytesToBits(valid_bitmap, default_memory_pool(), &null_arr_buf));
+
+  std::unique_ptr<Array> arr;
+  arr.reset(new Int32Array(null_bitmap.size(), nullptr, null_buf, null_count));
+
+  std::unique_ptr<Array> null_arr;
+  std::unique_ptr<Array> valid_arr;
+  null_arr.reset(new BooleanArray(valid_bitmap.size(), null_arr_buf, nullptr, 0));
+  valid_arr.reset(new BooleanArray(null_bitmap.size(), valid_arr_buf, nullptr, 0));
+
+  EXPECT_TRUE(arr->IsNull()->Equals(*null_arr.get()));
+  EXPECT_TRUE(arr->IsValid()->Equals(*valid_arr.get()));
+}
+
 TEST_F(TestArray, BuildLargeInMemoryArray) {
   const int64_t length = static_cast<int64_t>(std::numeric_limits<int32_t>::max()) + 1;
 
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index 0b235cc199c..44d83508a4b 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -24,6 +24,7 @@
 
 #include "arrow/buffer.h"
 #include "arrow/compare.h"
+#include "arrow/memory_pool.h"
 #include "arrow/pretty_print.h"
 #include "arrow/status.h"
 #include "arrow/type_traits.h"
@@ -52,6 +53,52 @@ std::shared_ptr<ArrayData> ArrayData::Make(const std::shared_ptr<DataType>& type
 
 // ----------------------------------------------------------------------
 // Base array class
+
+// Return a BooleanArray (with no null bitmap of its own) whose i-th value is
+// true when slot i of this array is null. Allocation uses the default memory
+// pool; a failed Status is handed to ARROW_CHECK_OK.
+std::shared_ptr<Array> Array::IsNull() const {
+  auto pool = default_memory_pool();
+  const auto validity = null_bitmap();
+  std::shared_ptr<Buffer> is_null_bits;
+
+  if (validity == NULLPTR) {
+    // No validity bitmap: nothing is null, so the result is all zeros.
+    ARROW_CHECK_OK(GetEmptyBitmap(pool, length(), &is_null_bits));
+  } else {
+    // Read through data(): the validity buffer is not being modified here.
+    const uint8_t* bits = validity->data();
+    std::shared_ptr<Buffer> rebased;
+    if (data_->offset != 0) {
+      // CopyFlipedBitmap reads from bit 0, so shift the slice's bits down first.
+      ARROW_CHECK_OK(CopyBitmap(pool, bits, data_->offset, length(), &rebased));
+      bits = rebased->data();
+    }
+    ARROW_CHECK_OK(CopyFlipedBitmap(pool, bits, length(), &is_null_bits));
+  }
+
+  return std::make_shared<BooleanArray>(length(), is_null_bits, NULLPTR, 0);
+}
+
+// Return a BooleanArray (with no null bitmap of its own) whose i-th value is
+// true when slot i of this array is valid. Allocation uses the default memory
+// pool; a failed Status is handed to ARROW_CHECK_OK.
+std::shared_ptr<Array> Array::IsValid() const {
+  auto pool = default_memory_pool();
+  const auto validity = null_bitmap();
+  std::shared_ptr<Buffer> is_valid_bits;
+
+  if (validity == NULLPTR) {
+    // No validity bitmap: every slot is valid, so the result is all ones.
+    ARROW_CHECK_OK(GetFullBitmap(pool, length(), &is_valid_bits));
+  } else {
+    // Copy from data_->offset so that a sliced array's bits start at bit 0.
+    ARROW_CHECK_OK(
+        CopyBitmap(pool, validity->data(), data_->offset, length(), &is_valid_bits));
+  }
+
+  return std::make_shared<BooleanArray>(length(), is_valid_bits, NULLPTR, 0);
+}
 
 int64_t Array::null_count() const {
   if (ARROW_PREDICT_FALSE(data_->null_count < 0)) {
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index ebe54adcb9e..7b87348d08d 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -203,6 +203,10 @@ class ARROW_EXPORT Array {
            BitUtil::BitNotSet(null_bitmap_data_, i + data_->offset);
   }
 
+  /// \brief Return a boolean array of this array's length whose value at
+  /// each index is true if the slot at that index is null
+  std::shared_ptr<Array> IsNull() const;
+
   /// \brief Return true if value at index is valid (not null). Does not
   /// boundscheck
   bool IsValid(int64_t i) const {
@@ -210,6 +214,10 @@ class ARROW_EXPORT Array {
            BitUtil::GetBit(null_bitmap_data_, i + data_->offset);
   }
 
+  /// \brief Return a boolean array of this array's length whose value at
+  /// each index is true if the slot at that index is valid (not null)
+  std::shared_ptr<Array> IsValid() const;
+
   /// Size in the number of elements this array contains.
   int64_t length() const { return data_->length; }
 
diff --git a/cpp/src/arrow/util/bit-util.cc b/cpp/src/arrow/util/bit-util.cc
index 4dd91e99ad9..0e584484b33 100644
--- a/cpp/src/arrow/util/bit-util.cc
+++ b/cpp/src/arrow/util/bit-util.cc
@@ -104,6 +104,13 @@ Status GetEmptyBitmap(MemoryPool* pool, int64_t length, std::shared_ptr<Buffer>*
   return Status::OK();
 }
 
+// Allocate a bitmap of at least `length` bits in which every bit is set.
+Status GetFullBitmap(MemoryPool* pool, int64_t length, std::shared_ptr<Buffer>* result) {
+  RETURN_NOT_OK(AllocateBuffer(pool, BitUtil::BytesForBits(length), result));
+  memset((*result)->mutable_data(), 0xff, static_cast<size_t>((*result)->size()));
+  return Status::OK();
+}
+
 Status CopyBitmap(MemoryPool* pool, const uint8_t* data, int64_t offset, int64_t length,
                   std::shared_ptr<Buffer>* out) {
   std::shared_ptr<Buffer> buffer;
@@ -116,6 +123,34 @@ Status CopyBitmap(MemoryPool* pool, const uint8_t* data, int64_t offset, int64_t
   return Status::OK();
 }
 
+// Copy the first `length` bits of `data` into a newly allocated bitmap with
+// every bit inverted. `data` is read from bit 0 and must provide at least
+// BitUtil::BytesForBits(length) bytes.
+Status CopyFlipedBitmap(MemoryPool* pool, const uint8_t* data, int64_t length,
+                        std::shared_ptr<Buffer>* out) {
+  std::shared_ptr<Buffer> buffer;
+  RETURN_NOT_OK(GetEmptyBitmap(pool, length, &buffer));
+  uint8_t* dest = buffer->mutable_data();
+
+  // A plain byte loop instead of SSE intrinsics: it places no alignment
+  // requirement on `data`, builds on non-x86 targets, and is simple enough for
+  // the compiler to vectorize.
+  const int64_t num_bytes = BitUtil::BytesForBits(length);
+  for (int64_t i = 0; i < num_bytes; ++i) {
+    dest[i] = static_cast<uint8_t>(~data[i]);
+  }
+
+  // Inverting whole bytes also sets the padding bits past `length` in the last
+  // byte; clear them again (bitmaps are LSB-ordered).
+  const int64_t trailing_bits = length % 8;
+  if (trailing_bits != 0) {
+    dest[num_bytes - 1] &= static_cast<uint8_t>((1 << trailing_bits) - 1);
+  }
+
+  *out = buffer;
+  return Status::OK();
+}
+
 bool BitmapEquals(const uint8_t* left, int64_t left_offset, const uint8_t* right,
                   int64_t right_offset, int64_t bit_length) {
   if (left_offset % 8 == 0 && right_offset % 8 == 0) {
diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h
index cab3c9ee703..1de0954b4c8 100644
--- a/cpp/src/arrow/util/bit-util.h
+++ b/cpp/src/arrow/util/bit-util.h
@@ -106,6 +106,8 @@ class Status;
 
 namespace BitUtil {
 
+static constexpr size_t kSimdWidth = 16;
+
 static constexpr uint8_t kBitmask[] = {1, 2, 4, 8, 16, 32, 64, 128};
 
 // the ~i byte version of kBitmaks
@@ -535,6 +537,14 @@ class BitmapWriter {
 ARROW_EXPORT
 Status GetEmptyBitmap(MemoryPool* pool, int64_t length, std::shared_ptr<Buffer>* result);
 
+/// Allocate a bitmap of at least `length` bits with every bit set
+///
+/// \param[in] pool memory pool to allocate memory from
+/// \param[in] length number of bits the bitmap must hold
+/// \param[out] result the resulting bitmap buffer
+ARROW_EXPORT
+Status GetFullBitmap(MemoryPool* pool, int64_t length, std::shared_ptr<Buffer>* result);
+
 /// Copy a bit range of an existing bitmap
 ///
 /// \param[in] pool memory pool to allocate memory from
@@ -548,6 +558,16 @@ ARROW_EXPORT
 Status CopyBitmap(MemoryPool* pool, const uint8_t* bitmap, int64_t offset, int64_t length,
                   std::shared_ptr<Buffer>* out);
 
+/// Copy the leading bits of an existing bitmap, inverting every bit
+///
+/// \param[in] pool memory pool to allocate memory from
+/// \param[in] bitmap source data, read starting at bit 0
+/// \param[in] length number of bits to copy
+/// \param[out] out the resulting inverted copy
+ARROW_EXPORT
+Status CopyFlipedBitmap(MemoryPool* pool, const uint8_t* bitmap, int64_t length,
+                        std::shared_ptr<Buffer>* out);
+
 /// Compute the number of 1's in the given data array
 ///
 /// \param[in] data a packed LSB-ordered bitmap as a byte array
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index cca9425881b..279e5d62d2b 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -361,7 +361,20 @@ cdef class Array:
         return 0
 
     def isnull(self):
-        raise NotImplemented
+        """
+        Return an Array flagging which elements of this array are null
+        """
+        null_arr = Array()
+        null_arr.init(self.sp_array.get().IsNull())
+        return null_arr
+
+    def notnull(self):
+        """
+        Return an Array flagging which elements of this array are not null
+        """
+        notnull_arr = Array()
+        notnull_arr.init(self.sp_array.get().IsValid())
+        return notnull_arr
 
     def __getitem__(self, key):
         cdef Py_ssize_t n = len(self)
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 91bc96dc63f..662950bd5ab 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -109,6 +109,9 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
 
         c_bool Equals(const CArray& arr)
         c_bool IsNull(int i)
+        shared_ptr[CArray] IsNull()
+        c_bool IsValid(int i)
+        shared_ptr[CArray] IsValid()
 
         shared_ptr[CArrayData] data()
 