Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions cpp/src/arrow/util/bit_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -598,5 +598,49 @@ Result<std::shared_ptr<Buffer>> BitmapAllButOne(MemoryPool* pool, int64_t length
return std::move(buffer);
}

BitBlockCounter::Block BitBlockCounter::NextBlock() {
auto load_word = [](const uint8_t* bytes) -> uint64_t {
return BitUtil::ToLittleEndian(util::SafeLoadAs<uint64_t>(bytes));
};
auto shift_word = [](uint64_t current, uint64_t next, int64_t shift) -> uint64_t {
return (current >> shift) | (next << (64 - shift));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

will this have different semantics on big endian?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This shift_word helper is used in other functions -- the unit tests pass on big endian it seems

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added some more tests with random data

};

// When the offset is > 0, we need there to be a word beyond the last aligned
// word in the bitmap for the bit shifting logic.
const int64_t bits_required_to_scan_words = offset_ == 0 ? 256 : 256 + (64 - offset_);
if (bits_remaining_ < bits_required_to_scan_words) {
// End of the bitmap, leave it to the caller to decide how to best check
// these bits, no need to do redundant computation here.
const int16_t run_length = static_cast<int16_t>(bits_remaining_);
bits_remaining_ -= run_length;
return {run_length, static_cast<int16_t>(CountSetBits(bitmap_, offset_, run_length))};
}

int64_t total_popcount = 0;
if (offset_ == 0) {
total_popcount += __builtin_popcountll(load_word(bitmap_));
total_popcount += __builtin_popcountll(load_word(bitmap_ + 8));
total_popcount += __builtin_popcountll(load_word(bitmap_ + 16));
total_popcount += __builtin_popcountll(load_word(bitmap_ + 24));
} else {
auto current = load_word(bitmap_);
auto next = load_word(bitmap_ + 8);
total_popcount += __builtin_popcountll(shift_word(current, next, offset_));
current = next;
next = load_word(bitmap_ + 16);
total_popcount += __builtin_popcountll(shift_word(current, next, offset_));
current = next;
next = load_word(bitmap_ + 24);
total_popcount += __builtin_popcountll(shift_word(current, next, offset_));
current = next;
next = load_word(bitmap_ + 32);
total_popcount += __builtin_popcountll(shift_word(current, next, offset_));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm okay with the implementation.

Just thinking if it would be faster to drop "shift_word" with below steps:

  1. simply accumulate total pops of 4 continuous uint64
  2. mask first byte with offset bitmask, count pops, minus from total pops
  3. mask next byte after last uint64 with offset bitmask, count pops, add to total pops

I think we can leave deeper optimization as follow up tasks. And I guess we can handle larger blocks with SIMD more efficiently.
Breaking large blocks to smaller trunks to leverage non-null processing is a great idea.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This raised an interesting question of should this be tied in with existing popcount methods and the improvements applied in both places?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good to tackle this as follow up, but this is a good starting point in any case.

}
bitmap_ += BitUtil::BytesForBits(kTargetBlockLength);
bits_remaining_ -= 256;
return {256, static_cast<int16_t>(total_popcount)};
}

} // namespace internal
} // namespace arrow
27 changes: 27 additions & 0 deletions cpp/src/arrow/util/bit_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -1170,5 +1170,32 @@ class BitsetStack {
std::vector<int> offsets_;
};

/// \brief A class that scans through a true/false bitmap to yield blocks of up
/// to 256 bits at a time along with their popcount. This is used to accelerate
/// processing of mostly-not-null array data.
class ARROW_EXPORT BitBlockCounter {
public:
struct Block {
int16_t length;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

did you check if this affects benchmarks, I think popcount returns ints so there might be some miniscule advantage to keep it at a larger size.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It didn't seem to affect them

int16_t popcount;
};

static constexpr int16_t kTargetBlockLength = 256;

BitBlockCounter(const uint8_t* bitmap, int64_t start_offset, int64_t length)
: bitmap_(bitmap + start_offset / 8),
bits_remaining_(length),
offset_(start_offset % 8) {}

/// \brief Return the next run of available bits, up to 256. The returned
/// pair contains the size of run and the number of true values
Block NextBlock();

private:
const uint8_t* bitmap_;
int64_t bits_remaining_;
int64_t offset_;
};

} // namespace internal
} // namespace arrow
116 changes: 116 additions & 0 deletions cpp/src/arrow/util/bit_util_benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,12 @@
#include <bitset>
#include <vector>

#include "arrow/array/array_base.h"
#include "arrow/buffer.h"
#include "arrow/builder.h"
#include "arrow/memory_pool.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/testing/random.h"
#include "arrow/testing/util.h"
#include "arrow/util/bit_util.h"

Expand Down Expand Up @@ -387,6 +389,114 @@ static void BitmapEquals(benchmark::State& state) {
state.SetBytesProcessed(state.iterations() * buffer_size);
}

template <int64_t Offset = 0>
static void BitBlockCounterSumNotNull(benchmark::State& state) {
using internal::BitBlockCounter;

random::RandomArrayGenerator rng(/*seed=*/0);

const int64_t bitmap_length = 1 << 20;

// State parameter is the average number of total values for each false
// value. So 100 means that 1 out of 100 on average are false.
double true_probability = 1. - 1. / state.range(0);
auto arr = rng.Int8(bitmap_length, 0, 100, true_probability);

const uint8_t* bitmap = arr->null_bitmap_data();

// Compute the expected result
int64_t expected = 0;
const auto& int8_arr = static_cast<const Int8Array&>(*arr);
for (int64_t i = Offset; i < bitmap_length; ++i) {
if (int8_arr.IsValid(i)) {
expected += int8_arr.Value(i);
}
}
for (auto _ : state) {
BitBlockCounter scanner(bitmap, Offset, bitmap_length - Offset);
int64_t result = 0;
int64_t position = Offset;
while (true) {
BitBlockCounter::Block block = scanner.NextBlock();
if (block.length == 0) {
break;
}
if (block.length == block.popcount) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So basically you don't need a popcount at all for this optimization. You just need to check whether all bits in the block are 1, or whether all bits in the block are 0. This should be trivial (and could perhaps be folded in VisitBits and/or VisitArrayDataInline).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I need the popcount in order to calculate the number of "on" elements for filters, but yes for this particular optimization between "all-on" and "not-all-on" you don't need the popcount.

// All not-null
for (int64_t i = 0; i < block.length; ++i) {
result += int8_arr.Value(position + i);
}
} else if (block.popcount > 0) {
// Some but not all not-null
for (int64_t i = 0; i < block.length; ++i) {
if (BitUtil::GetBit(bitmap, position + i)) {
result += int8_arr.Value(position + i);
}
}
}
position += block.length;
}
// Sanity check
if (result != expected) {
std::abort();
}
}
state.SetItemsProcessed(state.iterations() * bitmap_length);
}

template <int64_t Offset = 0>
static void BitmapReaderSumNotNull(benchmark::State& state) {
random::RandomArrayGenerator rng(/*seed=*/0);

const int64_t bitmap_length = 1 << 20;

// State parameter is the average number of total values for each false
// value. So 100 means that 1 out of 100 on average are false.
double true_probability = 1. - 1. / state.range(0);
auto arr = rng.Int8(bitmap_length, 0, 100, true_probability);

const uint8_t* bitmap = arr->null_bitmap_data();
// Compute the expected result
int64_t expected = 0;
const auto& int8_arr = static_cast<const Int8Array&>(*arr);
for (int64_t i = Offset; i < bitmap_length; ++i) {
if (int8_arr.IsValid(i)) {
expected += int8_arr.Value(i);
}
}
for (auto _ : state) {
internal::BitmapReader bit_reader(bitmap, Offset, bitmap_length - Offset);
int64_t result = 0;
for (int64_t i = Offset; i < bitmap_length; ++i) {
if (bit_reader.IsSet()) {
result += int8_arr.Value(i);
}
bit_reader.Next();
}
// Sanity check
if (result != expected) {
std::abort();
}
}
state.SetItemsProcessed(state.iterations() * bitmap_length);
}

static void BitBlockCounterSumNotNull(benchmark::State& state) {
BitBlockCounterSumNotNull<0>(state);
}

static void BitBlockCounterSumNotNullWithOffset(benchmark::State& state) {
BitBlockCounterSumNotNull<4>(state);
}

static void BitmapReaderSumNotNull(benchmark::State& state) {
BitmapReaderSumNotNull<0>(state);
}

static void BitmapReaderSumNotNullWithOffset(benchmark::State& state) {
BitmapReaderSumNotNull<4>(state);
}

static void BitmapEqualsWithoutOffset(benchmark::State& state) { BitmapEquals<0>(state); }

static void BitmapEqualsWithOffset(benchmark::State& state) { BitmapEquals<4>(state); }
Expand Down Expand Up @@ -424,6 +534,12 @@ BENCHMARK(CopyBitmapWithOffsetBoth)->Arg(kBufferSize);
BENCHMARK(BitmapEqualsWithoutOffset)->Arg(kBufferSize);
BENCHMARK(BitmapEqualsWithOffset)->Arg(kBufferSize);

// Range value: average number of total values per null
BENCHMARK(BitBlockCounterSumNotNull)->Range(8, 1 << 16);
BENCHMARK(BitBlockCounterSumNotNullWithOffset)->Range(8, 1 << 16);
BENCHMARK(BitmapReaderSumNotNull)->Range(8, 1 << 16);
BENCHMARK(BitmapReaderSumNotNullWithOffset)->Range(8, 1 << 16);

#define AND_BENCHMARK_RANGES \
{ \
{kBufferSize * 4, kBufferSize * 16}, { 0, 2 } \
Expand Down
102 changes: 102 additions & 0 deletions cpp/src/arrow/util/bit_util_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1367,5 +1367,107 @@ TEST(Bitmap, VisitWordsAnd) {
}
}
}

class TestBitBlockCounter : public ::testing::Test {
public:
void Create(int64_t nbytes, int64_t offset, int64_t length) {
ASSERT_OK_AND_ASSIGN(buf_, AllocateBuffer(nbytes));
// Start with data zeroed out
std::memset(buf_->mutable_data(), 0, nbytes);
scanner_.reset(new BitBlockCounter(buf_->data(), offset, length));
}

protected:
std::shared_ptr<Buffer> buf_;
std::unique_ptr<BitBlockCounter> scanner_;
};

static constexpr int64_t kWordSize = 64;

TEST_F(TestBitBlockCounter, Basics) {
const int64_t nbytes = 1024;

Create(nbytes, 0, nbytes * 8);

int64_t bits_scanned = 0;
for (int64_t i = 0; i < nbytes / 32; ++i) {
BitBlockCounter::Block block = scanner_->NextBlock();
ASSERT_EQ(block.length, 4 * kWordSize);
ASSERT_EQ(block.popcount, 0);
bits_scanned += block.length;
}
ASSERT_EQ(bits_scanned, 1024 * 8);

auto block = scanner_->NextBlock();
ASSERT_EQ(block.length, 0);
ASSERT_EQ(block.popcount, 0);
}

TEST_F(TestBitBlockCounter, Offsets) {
auto CheckWithOffset = [&](int64_t offset) {
const int64_t nwords = 15;

const int64_t total_bytes = nwords * 8 + 1;
// Trim a bit from the end of the bitmap so we can check the remainder bits
// behavior
Create(total_bytes, offset, nwords * kWordSize - offset - 1);

// Start with data all set
std::memset(buf_->mutable_data(), 0xFF, total_bytes);

BitBlockCounter::Block block = scanner_->NextBlock();
ASSERT_EQ(4 * kWordSize, block.length);
ASSERT_EQ(block.popcount, 256);

// Add some false values to the next 3 shifted words
BitUtil::SetBitTo(buf_->mutable_data(), 4 * kWordSize + offset, false);
BitUtil::SetBitTo(buf_->mutable_data(), 5 * kWordSize + offset, false);
BitUtil::SetBitTo(buf_->mutable_data(), 6 * kWordSize + offset, false);
block = scanner_->NextBlock();

ASSERT_EQ(block.length, 256);
ASSERT_EQ(block.popcount, 253);

BitUtil::SetBitsTo(buf_->mutable_data(), 8 * kWordSize + offset, 2 * kWordSize,
false);

block = scanner_->NextBlock();
ASSERT_EQ(block.length, 256);
ASSERT_EQ(block.popcount, 128);

// Last block
block = scanner_->NextBlock();
ASSERT_EQ(block.length, 3 * kWordSize - offset - 1);
ASSERT_EQ(block.length, block.popcount);

// We can keep calling NextBlock safely
block = scanner_->NextBlock();
ASSERT_EQ(block.length, 0);
ASSERT_EQ(block.popcount, 0);
};

for (int64_t offset_i = 0; offset_i < 7; ++offset_i) {
CheckWithOffset(offset_i);
}
}

TEST_F(TestBitBlockCounter, RandomData) {
const int64_t nbytes = 1024;
auto buffer = *AllocateBuffer(nbytes);
random_bytes(nbytes, 0, buffer->mutable_data());

auto CheckWithOffset = [&](int64_t offset) {
BitBlockCounter scanner(buffer->data(), offset, nbytes * 8 - offset);
for (int64_t i = 0; i < nbytes / 32; ++i) {
BitBlockCounter::Block block = scanner.NextBlock();
ASSERT_EQ(block.popcount,
CountSetBits(buffer->data(), i * 256 + offset, block.length));
}
};
for (int64_t offset_i = 0; offset_i < 7; ++offset_i) {
CheckWithOffset(offset_i);
}
}

} // namespace internal
} // namespace arrow