diff --git a/cpp/src/arrow/util/bit_util.cc b/cpp/src/arrow/util/bit_util.cc index 5ec278acae8..07951d0c963 100644 --- a/cpp/src/arrow/util/bit_util.cc +++ b/cpp/src/arrow/util/bit_util.cc @@ -598,5 +598,49 @@ Result> BitmapAllButOne(MemoryPool* pool, int64_t length return std::move(buffer); } +BitBlockCounter::Block BitBlockCounter::NextBlock() { + auto load_word = [](const uint8_t* bytes) -> uint64_t { + return BitUtil::ToLittleEndian(util::SafeLoadAs(bytes)); + }; + auto shift_word = [](uint64_t current, uint64_t next, int64_t shift) -> uint64_t { + return (current >> shift) | (next << (64 - shift)); + }; + + // When the offset is > 0, we need there to be a word beyond the last aligned + // word in the bitmap for the bit shifting logic. + const int64_t bits_required_to_scan_words = offset_ == 0 ? 256 : 256 + (64 - offset_); + if (bits_remaining_ < bits_required_to_scan_words) { + // End of the bitmap, leave it to the caller to decide how to best check + // these bits, no need to do redundant computation here. + const int16_t run_length = static_cast(bits_remaining_); + bits_remaining_ -= run_length; + return {run_length, static_cast(CountSetBits(bitmap_, offset_, run_length))}; + } + + int64_t total_popcount = 0; + if (offset_ == 0) { + total_popcount += __builtin_popcountll(load_word(bitmap_)); + total_popcount += __builtin_popcountll(load_word(bitmap_ + 8)); + total_popcount += __builtin_popcountll(load_word(bitmap_ + 16)); + total_popcount += __builtin_popcountll(load_word(bitmap_ + 24)); + } else { + auto current = load_word(bitmap_); + auto next = load_word(bitmap_ + 8); + total_popcount += __builtin_popcountll(shift_word(current, next, offset_)); + current = next; + next = load_word(bitmap_ + 16); + total_popcount += __builtin_popcountll(shift_word(current, next, offset_)); + current = next; + next = load_word(bitmap_ + 24); + total_popcount += __builtin_popcountll(shift_word(current, next, offset_)); + current = next; + next = load_word(bitmap_ + 32); + total_popcount += __builtin_popcountll(shift_word(current, next, offset_)); + } + bitmap_ += BitUtil::BytesForBits(kTargetBlockLength); + bits_remaining_ -= 256; + return {256, static_cast(total_popcount)}; +} + } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h index 12b1a13c59a..d1d6a52f9bb 100644 --- a/cpp/src/arrow/util/bit_util.h +++ b/cpp/src/arrow/util/bit_util.h @@ -1170,5 +1170,32 @@ class BitsetStack { std::vector offsets_; }; +/// \brief A class that scans through a true/false bitmap to yield blocks of up +/// to 256 bits at a time along with their popcount. This is used to accelerate +/// processing of mostly-not-null array data. +class ARROW_EXPORT BitBlockCounter { + public: + struct Block { + int16_t length; + int16_t popcount; + }; + + static constexpr int16_t kTargetBlockLength = 256; + + BitBlockCounter(const uint8_t* bitmap, int64_t start_offset, int64_t length) + : bitmap_(bitmap + start_offset / 8), + bits_remaining_(length), + offset_(start_offset % 8) {} + + /// \brief Return the next run of available bits, up to 256. The returned + /// pair contains the size of run and the number of true values + Block NextBlock(); + + private: + const uint8_t* bitmap_; + int64_t bits_remaining_; + int64_t offset_; +}; + } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/util/bit_util_benchmark.cc b/cpp/src/arrow/util/bit_util_benchmark.cc index 81ccb4726a9..ae4ddc12d5d 100644 --- a/cpp/src/arrow/util/bit_util_benchmark.cc +++ b/cpp/src/arrow/util/bit_util_benchmark.cc @@ -22,10 +22,12 @@ #include #include +#include "arrow/array/array_base.h" #include "arrow/buffer.h" #include "arrow/builder.h" #include "arrow/memory_pool.h" #include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" #include "arrow/testing/util.h" #include "arrow/util/bit_util.h" @@ -387,6 +389,114 @@ static void BitmapEquals(benchmark::State& state) { state.SetBytesProcessed(state.iterations() * buffer_size); } +template +static void BitBlockCounterSumNotNull(benchmark::State& state) { + using internal::BitBlockCounter; + + random::RandomArrayGenerator rng(/*seed=*/0); + + const int64_t bitmap_length = 1 << 20; + + // State parameter is the average number of total values for each false + // value. So 100 means that 1 out of 100 on average are false. + double true_probability = 1. - 1. / state.range(0); + auto arr = rng.Int8(bitmap_length, 0, 100, true_probability); + + const uint8_t* bitmap = arr->null_bitmap_data(); + + // Compute the expected result + int64_t expected = 0; + const auto& int8_arr = static_cast(*arr); + for (int64_t i = Offset; i < bitmap_length; ++i) { + if (int8_arr.IsValid(i)) { + expected += int8_arr.Value(i); + } + } + for (auto _ : state) { + BitBlockCounter scanner(bitmap, Offset, bitmap_length - Offset); + int64_t result = 0; + int64_t position = Offset; + while (true) { + BitBlockCounter::Block block = scanner.NextBlock(); + if (block.length == 0) { + break; + } + if (block.length == block.popcount) { + // All not-null + for (int64_t i = 0; i < block.length; ++i) { + result += int8_arr.Value(position + i); + } + } else if (block.popcount > 0) { + // Some but not all not-null + for (int64_t i = 0; i < block.length; ++i) { + if (BitUtil::GetBit(bitmap, position + i)) { + result += int8_arr.Value(position + i); + } + } + } + position += block.length; + } + // Sanity check + if (result != expected) { + std::abort(); + } + } + state.SetItemsProcessed(state.iterations() * bitmap_length); +} + +template +static void BitmapReaderSumNotNull(benchmark::State& state) { + random::RandomArrayGenerator rng(/*seed=*/0); + + const int64_t bitmap_length = 1 << 20; + + // State parameter is the average number of total values for each false + // value. So 100 means that 1 out of 100 on average are false. + double true_probability = 1. - 1. / state.range(0); + auto arr = rng.Int8(bitmap_length, 0, 100, true_probability); + + const uint8_t* bitmap = arr->null_bitmap_data(); + // Compute the expected result + int64_t expected = 0; + const auto& int8_arr = static_cast(*arr); + for (int64_t i = Offset; i < bitmap_length; ++i) { + if (int8_arr.IsValid(i)) { + expected += int8_arr.Value(i); + } + } + for (auto _ : state) { + internal::BitmapReader bit_reader(bitmap, Offset, bitmap_length - Offset); + int64_t result = 0; + for (int64_t i = Offset; i < bitmap_length; ++i) { + if (bit_reader.IsSet()) { + result += int8_arr.Value(i); + } + bit_reader.Next(); + } + // Sanity check + if (result != expected) { + std::abort(); + } + } + state.SetItemsProcessed(state.iterations() * bitmap_length); +} + +static void BitBlockCounterSumNotNull(benchmark::State& state) { + BitBlockCounterSumNotNull<0>(state); +} + +static void BitBlockCounterSumNotNullWithOffset(benchmark::State& state) { + BitBlockCounterSumNotNull<4>(state); +} + +static void BitmapReaderSumNotNull(benchmark::State& state) { + BitmapReaderSumNotNull<0>(state); +} + +static void BitmapReaderSumNotNullWithOffset(benchmark::State& state) { + BitmapReaderSumNotNull<4>(state); +} + static void BitmapEqualsWithoutOffset(benchmark::State& state) { BitmapEquals<0>(state); } static void BitmapEqualsWithOffset(benchmark::State& state) { BitmapEquals<4>(state); } @@ -424,6 +534,12 @@ BENCHMARK(CopyBitmapWithOffsetBoth)->Arg(kBufferSize); BENCHMARK(BitmapEqualsWithoutOffset)->Arg(kBufferSize); BENCHMARK(BitmapEqualsWithOffset)->Arg(kBufferSize); +// Range value: average number of total values per null +BENCHMARK(BitBlockCounterSumNotNull)->Range(8, 1 << 16); +BENCHMARK(BitBlockCounterSumNotNullWithOffset)->Range(8, 1 << 16); +BENCHMARK(BitmapReaderSumNotNull)->Range(8, 1 << 16); +BENCHMARK(BitmapReaderSumNotNullWithOffset)->Range(8, 1 << 16); + #define AND_BENCHMARK_RANGES \ { \ {kBufferSize * 4, kBufferSize * 16}, { 0, 2 } \ diff --git a/cpp/src/arrow/util/bit_util_test.cc b/cpp/src/arrow/util/bit_util_test.cc index 8b87e698bc3..45f6daff4ba 100644 --- a/cpp/src/arrow/util/bit_util_test.cc +++ b/cpp/src/arrow/util/bit_util_test.cc @@ -1367,5 +1367,107 @@ TEST(Bitmap, VisitWordsAnd) { } } } + +class TestBitBlockCounter : public ::testing::Test { + public: + void Create(int64_t nbytes, int64_t offset, int64_t length) { + ASSERT_OK_AND_ASSIGN(buf_, AllocateBuffer(nbytes)); + // Start with data zeroed out + std::memset(buf_->mutable_data(), 0, nbytes); + scanner_.reset(new BitBlockCounter(buf_->data(), offset, length)); + } + + protected: + std::shared_ptr buf_; + std::unique_ptr scanner_; +}; + +static constexpr int64_t kWordSize = 64; + +TEST_F(TestBitBlockCounter, Basics) { + const int64_t nbytes = 1024; + + Create(nbytes, 0, nbytes * 8); + + int64_t bits_scanned = 0; + for (int64_t i = 0; i < nbytes / 32; ++i) { + BitBlockCounter::Block block = scanner_->NextBlock(); + ASSERT_EQ(block.length, 4 * kWordSize); + ASSERT_EQ(block.popcount, 0); + bits_scanned += block.length; + } + ASSERT_EQ(bits_scanned, 1024 * 8); + + auto block = scanner_->NextBlock(); + ASSERT_EQ(block.length, 0); + ASSERT_EQ(block.popcount, 0); +} + +TEST_F(TestBitBlockCounter, Offsets) { + auto CheckWithOffset = [&](int64_t offset) { + const int64_t nwords = 15; + + const int64_t total_bytes = nwords * 8 + 1; + // Trim a bit from the end of the bitmap so we can check the remainder bits + // behavior + Create(total_bytes, offset, nwords * kWordSize - offset - 1); + + // Start with data all set + std::memset(buf_->mutable_data(), 0xFF, total_bytes); + + BitBlockCounter::Block block = scanner_->NextBlock(); + ASSERT_EQ(4 * kWordSize, block.length); + ASSERT_EQ(block.popcount, 256); + + // Add some false values to the next 3 shifted words + BitUtil::SetBitTo(buf_->mutable_data(), 4 * kWordSize + offset, false); + BitUtil::SetBitTo(buf_->mutable_data(), 5 * kWordSize + offset, false); + BitUtil::SetBitTo(buf_->mutable_data(), 6 * kWordSize + offset, false); + block = scanner_->NextBlock(); + + ASSERT_EQ(block.length, 256); + ASSERT_EQ(block.popcount, 253); + + BitUtil::SetBitsTo(buf_->mutable_data(), 8 * kWordSize + offset, 2 * kWordSize, + false); + + block = scanner_->NextBlock(); + ASSERT_EQ(block.length, 256); + ASSERT_EQ(block.popcount, 128); + + // Last block + block = scanner_->NextBlock(); + ASSERT_EQ(block.length, 3 * kWordSize - offset - 1); + ASSERT_EQ(block.length, block.popcount); + + // We can keep calling NextBlock safely + block = scanner_->NextBlock(); + ASSERT_EQ(block.length, 0); + ASSERT_EQ(block.popcount, 0); + }; + + for (int64_t offset_i = 0; offset_i < 7; ++offset_i) { + CheckWithOffset(offset_i); + } +} + +TEST_F(TestBitBlockCounter, RandomData) { + const int64_t nbytes = 1024; + auto buffer = *AllocateBuffer(nbytes); + random_bytes(nbytes, 0, buffer->mutable_data()); + + auto CheckWithOffset = [&](int64_t offset) { + BitBlockCounter scanner(buffer->data(), offset, nbytes * 8 - offset); + for (int64_t i = 0; i < nbytes / 32; ++i) { + BitBlockCounter::Block block = scanner.NextBlock(); + ASSERT_EQ(block.popcount, + CountSetBits(buffer->data(), i * 256 + offset, block.length)); + } + }; + for (int64_t offset_i = 0; offset_i < 7; ++offset_i) { + CheckWithOffset(offset_i); + } +} + } // namespace internal } // namespace arrow