-
Notifications
You must be signed in to change notification settings - Fork 4k
ARROW-9029: [C++] Implement BitBlockCounter for much faster block popcounts of bitmaps #7346
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
feedc84
20e7069
3dbd47d
af93766
f9dda29
ffff8c1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -598,5 +598,49 @@ Result<std::shared_ptr<Buffer>> BitmapAllButOne(MemoryPool* pool, int64_t length | |
| return std::move(buffer); | ||
| } | ||
|
|
||
| BitBlockCounter::Block BitBlockCounter::NextBlock() { | ||
| auto load_word = [](const uint8_t* bytes) -> uint64_t { | ||
| return BitUtil::ToLittleEndian(util::SafeLoadAs<uint64_t>(bytes)); | ||
| }; | ||
| auto shift_word = [](uint64_t current, uint64_t next, int64_t shift) -> uint64_t { | ||
| return (current >> shift) | (next << (64 - shift)); | ||
| }; | ||
|
|
||
| // When the offset is > 0, we need there to be a word beyond the last aligned | ||
| // word in the bitmap for the bit shifting logic. | ||
| const int64_t bits_required_to_scan_words = offset_ == 0 ? 256 : 256 + (64 - offset_); | ||
| if (bits_remaining_ < bits_required_to_scan_words) { | ||
| // End of the bitmap, leave it to the caller to decide how to best check | ||
| // these bits, no need to do redundant computation here. | ||
| const int16_t run_length = static_cast<int16_t>(bits_remaining_); | ||
| bits_remaining_ -= run_length; | ||
| return {run_length, static_cast<int16_t>(CountSetBits(bitmap_, offset_, run_length))}; | ||
| } | ||
|
|
||
| int64_t total_popcount = 0; | ||
| if (offset_ == 0) { | ||
| total_popcount += __builtin_popcountll(load_word(bitmap_)); | ||
| total_popcount += __builtin_popcountll(load_word(bitmap_ + 8)); | ||
| total_popcount += __builtin_popcountll(load_word(bitmap_ + 16)); | ||
| total_popcount += __builtin_popcountll(load_word(bitmap_ + 24)); | ||
| } else { | ||
| auto current = load_word(bitmap_); | ||
| auto next = load_word(bitmap_ + 8); | ||
| total_popcount += __builtin_popcountll(shift_word(current, next, offset_)); | ||
wesm marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| current = next; | ||
| next = load_word(bitmap_ + 16); | ||
| total_popcount += __builtin_popcountll(shift_word(current, next, offset_)); | ||
| current = next; | ||
| next = load_word(bitmap_ + 24); | ||
| total_popcount += __builtin_popcountll(shift_word(current, next, offset_)); | ||
| current = next; | ||
| next = load_word(bitmap_ + 32); | ||
| total_popcount += __builtin_popcountll(shift_word(current, next, offset_)); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm okay with the implementation. Just thinking if it would be faster to drop "shift_word" with below steps:
I think we can leave deeper optimization as follow up tasks. And I guess we can handle larger blocks with SIMD more efficiently.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This raised an interesting question of should this be tied in with existing popcount methods and the improvements applied in both places?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sounds good to tackle this as follow up, but this is a good starting point in any case. |
||
| } | ||
| bitmap_ += BitUtil::BytesForBits(kTargetBlockLength); | ||
| bits_remaining_ -= 256; | ||
| return {256, static_cast<int16_t>(total_popcount)}; | ||
| } | ||
|
|
||
| } // namespace internal | ||
| } // namespace arrow | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1170,5 +1170,32 @@ class BitsetStack { | |
| std::vector<int> offsets_; | ||
| }; | ||
|
|
||
| /// \brief A class that scans through a true/false bitmap to yield blocks of up | ||
| /// to 256 bits at a time along with their popcount. This is used to accelerate | ||
| /// processing of mostly-not-null array data. | ||
| class ARROW_EXPORT BitBlockCounter { | ||
| public: | ||
| struct Block { | ||
| int16_t length; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. did you check if this affects benchmarks, I think popcount returns ints so there might be some miniscule advantage to keep it at a larger size.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It didn't seem to affect them |
||
| int16_t popcount; | ||
| }; | ||
|
|
||
| static constexpr int16_t kTargetBlockLength = 256; | ||
|
|
||
| BitBlockCounter(const uint8_t* bitmap, int64_t start_offset, int64_t length) | ||
| : bitmap_(bitmap + start_offset / 8), | ||
| bits_remaining_(length), | ||
| offset_(start_offset % 8) {} | ||
|
|
||
| /// \brief Return the next run of available bits, up to 256. The returned | ||
| /// pair contains the size of run and the number of true values | ||
| Block NextBlock(); | ||
|
|
||
| private: | ||
| const uint8_t* bitmap_; | ||
| int64_t bits_remaining_; | ||
| int64_t offset_; | ||
| }; | ||
|
|
||
| } // namespace internal | ||
| } // namespace arrow | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,10 +22,12 @@ | |
| #include <bitset> | ||
| #include <vector> | ||
|
|
||
| #include "arrow/array/array_base.h" | ||
| #include "arrow/buffer.h" | ||
| #include "arrow/builder.h" | ||
| #include "arrow/memory_pool.h" | ||
| #include "arrow/testing/gtest_util.h" | ||
| #include "arrow/testing/random.h" | ||
| #include "arrow/testing/util.h" | ||
| #include "arrow/util/bit_util.h" | ||
|
|
||
|
|
@@ -387,6 +389,114 @@ static void BitmapEquals(benchmark::State& state) { | |
| state.SetBytesProcessed(state.iterations() * buffer_size); | ||
| } | ||
|
|
||
| template <int64_t Offset = 0> | ||
| static void BitBlockCounterSumNotNull(benchmark::State& state) { | ||
| using internal::BitBlockCounter; | ||
|
|
||
| random::RandomArrayGenerator rng(/*seed=*/0); | ||
|
|
||
| const int64_t bitmap_length = 1 << 20; | ||
|
|
||
| // State parameter is the average number of total values for each false | ||
| // value. So 100 means that 1 out of 100 on average are false. | ||
| double true_probability = 1. - 1. / state.range(0); | ||
| auto arr = rng.Int8(bitmap_length, 0, 100, true_probability); | ||
|
|
||
| const uint8_t* bitmap = arr->null_bitmap_data(); | ||
|
|
||
| // Compute the expected result | ||
| int64_t expected = 0; | ||
| const auto& int8_arr = static_cast<const Int8Array&>(*arr); | ||
| for (int64_t i = Offset; i < bitmap_length; ++i) { | ||
| if (int8_arr.IsValid(i)) { | ||
| expected += int8_arr.Value(i); | ||
| } | ||
| } | ||
| for (auto _ : state) { | ||
| BitBlockCounter scanner(bitmap, Offset, bitmap_length - Offset); | ||
| int64_t result = 0; | ||
| int64_t position = Offset; | ||
| while (true) { | ||
| BitBlockCounter::Block block = scanner.NextBlock(); | ||
| if (block.length == 0) { | ||
| break; | ||
| } | ||
| if (block.length == block.popcount) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So basically you don't need a popcount at all for this optimization. You just need to check whether all bits in the block are 1, or whether all bits in the block are 0. This should be trivial (and could perhaps be folded in VisitBits and/or VisitArrayDataInline).
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I need the popcount in order to calculate the number of "on" elements for filters, but yes for this particular optimization between "all-on" and "not-all-on" you don't need the popcount. |
||
| // All not-null | ||
| for (int64_t i = 0; i < block.length; ++i) { | ||
| result += int8_arr.Value(position + i); | ||
| } | ||
| } else if (block.popcount > 0) { | ||
| // Some but not all not-null | ||
| for (int64_t i = 0; i < block.length; ++i) { | ||
| if (BitUtil::GetBit(bitmap, position + i)) { | ||
| result += int8_arr.Value(position + i); | ||
| } | ||
| } | ||
| } | ||
| position += block.length; | ||
| } | ||
| // Sanity check | ||
| if (result != expected) { | ||
| std::abort(); | ||
| } | ||
| } | ||
| state.SetItemsProcessed(state.iterations() * bitmap_length); | ||
| } | ||
|
|
||
| template <int64_t Offset = 0> | ||
| static void BitmapReaderSumNotNull(benchmark::State& state) { | ||
| random::RandomArrayGenerator rng(/*seed=*/0); | ||
|
|
||
| const int64_t bitmap_length = 1 << 20; | ||
|
|
||
| // State parameter is the average number of total values for each false | ||
| // value. So 100 means that 1 out of 100 on average are false. | ||
| double true_probability = 1. - 1. / state.range(0); | ||
| auto arr = rng.Int8(bitmap_length, 0, 100, true_probability); | ||
|
|
||
| const uint8_t* bitmap = arr->null_bitmap_data(); | ||
| // Compute the expected result | ||
| int64_t expected = 0; | ||
| const auto& int8_arr = static_cast<const Int8Array&>(*arr); | ||
| for (int64_t i = Offset; i < bitmap_length; ++i) { | ||
| if (int8_arr.IsValid(i)) { | ||
| expected += int8_arr.Value(i); | ||
| } | ||
| } | ||
| for (auto _ : state) { | ||
| internal::BitmapReader bit_reader(bitmap, Offset, bitmap_length - Offset); | ||
| int64_t result = 0; | ||
| for (int64_t i = Offset; i < bitmap_length; ++i) { | ||
| if (bit_reader.IsSet()) { | ||
| result += int8_arr.Value(i); | ||
| } | ||
| bit_reader.Next(); | ||
| } | ||
| // Sanity check | ||
| if (result != expected) { | ||
| std::abort(); | ||
| } | ||
| } | ||
| state.SetItemsProcessed(state.iterations() * bitmap_length); | ||
| } | ||
|
|
||
| static void BitBlockCounterSumNotNull(benchmark::State& state) { | ||
| BitBlockCounterSumNotNull<0>(state); | ||
| } | ||
|
|
||
| static void BitBlockCounterSumNotNullWithOffset(benchmark::State& state) { | ||
| BitBlockCounterSumNotNull<4>(state); | ||
| } | ||
|
|
||
| static void BitmapReaderSumNotNull(benchmark::State& state) { | ||
| BitmapReaderSumNotNull<0>(state); | ||
| } | ||
|
|
||
| static void BitmapReaderSumNotNullWithOffset(benchmark::State& state) { | ||
| BitmapReaderSumNotNull<4>(state); | ||
| } | ||
|
|
||
| static void BitmapEqualsWithoutOffset(benchmark::State& state) { BitmapEquals<0>(state); } | ||
|
|
||
| static void BitmapEqualsWithOffset(benchmark::State& state) { BitmapEquals<4>(state); } | ||
|
|
@@ -424,6 +534,12 @@ BENCHMARK(CopyBitmapWithOffsetBoth)->Arg(kBufferSize); | |
| BENCHMARK(BitmapEqualsWithoutOffset)->Arg(kBufferSize); | ||
| BENCHMARK(BitmapEqualsWithOffset)->Arg(kBufferSize); | ||
|
|
||
| // Range value: average number of total values per null | ||
| BENCHMARK(BitBlockCounterSumNotNull)->Range(8, 1 << 16); | ||
| BENCHMARK(BitBlockCounterSumNotNullWithOffset)->Range(8, 1 << 16); | ||
| BENCHMARK(BitmapReaderSumNotNull)->Range(8, 1 << 16); | ||
| BENCHMARK(BitmapReaderSumNotNullWithOffset)->Range(8, 1 << 16); | ||
|
|
||
| #define AND_BENCHMARK_RANGES \ | ||
| { \ | ||
| {kBufferSize * 4, kBufferSize * 16}, { 0, 2 } \ | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
will this have different semantics on big endian?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This
shift_wordhelper is used in other functions -- the unit tests pass on big endian it seemsThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I added some more tests with random data