diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt b/cpp/src/arrow/compute/kernels/CMakeLists.txt
index 326578588a7..3362d91cbe8 100644
--- a/cpp/src/arrow/compute/kernels/CMakeLists.txt
+++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt
@@ -37,6 +37,7 @@ add_arrow_benchmark(scalar_arithmetic_benchmark PREFIX "arrow-compute")
 add_arrow_benchmark(scalar_boolean_benchmark PREFIX "arrow-compute")
 add_arrow_benchmark(scalar_cast_benchmark PREFIX "arrow-compute")
 add_arrow_benchmark(scalar_compare_benchmark PREFIX "arrow-compute")
+add_arrow_benchmark(scalar_if_else_benchmark PREFIX "arrow-compute")
 add_arrow_benchmark(scalar_set_lookup_benchmark PREFIX "arrow-compute")
 add_arrow_benchmark(scalar_string_benchmark PREFIX "arrow-compute")

diff --git a/cpp/src/arrow/compute/kernels/scalar_boolean.cc b/cpp/src/arrow/compute/kernels/scalar_boolean.cc
index 89107120fa3..7a0e3654edb 100644
--- a/cpp/src/arrow/compute/kernels/scalar_boolean.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_boolean.cc
@@ -30,60 +30,60 @@ namespace compute {
 namespace {

-enum BitmapIndex { LEFT_VALID, LEFT_DATA, RIGHT_VALID, RIGHT_DATA };
-
 template <typename ComputeWord>
 void ComputeKleene(ComputeWord&& compute_word, KernelContext* ctx, const ArrayData& left,
                    const ArrayData& right, ArrayData* out) {
   DCHECK(left.null_count != 0 || right.null_count != 0)
       << "ComputeKleene is unnecessarily expensive for the non-null case";

-  Bitmap bitmaps[4];
-  bitmaps[LEFT_VALID] = {left.buffers[0], left.offset, left.length};
-  bitmaps[LEFT_DATA] = {left.buffers[1], left.offset, left.length};
+  Bitmap left_valid_bm{left.buffers[0], left.offset, left.length};
+  Bitmap left_data_bm{left.buffers[1], left.offset, left.length};

-  bitmaps[RIGHT_VALID] = {right.buffers[0], right.offset, right.length};
-  bitmaps[RIGHT_DATA] = {right.buffers[1], right.offset, right.length};
+  Bitmap right_valid_bm{right.buffers[0], right.offset, right.length};
+  Bitmap right_data_bm{right.buffers[1], right.offset, right.length};

-  auto out_validity = out->GetMutableValues<uint64_t>(0);
-  auto out_data = out->GetMutableValues<uint64_t>(1);
+  std::array<Bitmap, 2> out_bms{Bitmap(out->buffers[0], out->offset, out->length),
+                                Bitmap(out->buffers[1], out->offset, out->length)};

-  int64_t i = 0;
   auto apply = [&](uint64_t left_valid, uint64_t left_data, uint64_t right_valid,
-                   uint64_t right_data) {
+                   uint64_t right_data, uint64_t* out_validity, uint64_t* out_data) {
     auto left_true = left_valid & left_data;
     auto left_false = left_valid & ~left_data;

     auto right_true = right_valid & right_data;
     auto right_false = right_valid & ~right_data;

-    compute_word(left_true, left_false, right_true, right_false, &out_validity[i],
-                 &out_data[i]);
-    ++i;
+    compute_word(left_true, left_false, right_true, right_false, out_validity, out_data);
   };

   if (right.null_count == 0) {
-    // bitmaps[RIGHT_VALID] might be null; override to make it safe for Visit()
-    bitmaps[RIGHT_VALID] = bitmaps[RIGHT_DATA];
-    Bitmap::VisitWords(bitmaps, [&](std::array<uint64_t, 4> words) {
-      apply(words[LEFT_VALID], words[LEFT_DATA], ~uint64_t(0), words[RIGHT_DATA]);
-    });
+    std::array<Bitmap, 3> in_bms{left_valid_bm, left_data_bm, right_data_bm};
+    Bitmap::VisitWordsAndWrite(
+        in_bms, &out_bms,
+        [&](const std::array<uint64_t, 3>& in, std::array<uint64_t, 2>* out) {
+          apply(in[0], in[1], ~uint64_t(0), in[2], &(out->at(0)), &(out->at(1)));
+        });
     return;
   }

   if (left.null_count == 0) {
-    // bitmaps[LEFT_VALID] might be null; override to make it safe for Visit()
-    bitmaps[LEFT_VALID] = bitmaps[LEFT_DATA];
-    Bitmap::VisitWords(bitmaps, [&](std::array<uint64_t, 4> words) {
-      apply(~uint64_t(0), words[LEFT_DATA], words[RIGHT_VALID], words[RIGHT_DATA]);
-    });
+    std::array<Bitmap, 3> in_bms{left_data_bm, right_valid_bm, right_data_bm};
+    Bitmap::VisitWordsAndWrite(
+        in_bms, &out_bms,
+        [&](const std::array<uint64_t, 3>& in, std::array<uint64_t, 2>* out) {
+          apply(~uint64_t(0), in[0], in[1], in[2], &(out->at(0)), &(out->at(1)));
+        });
     return;
   }

   DCHECK(left.null_count != 0 && right.null_count != 0);
-  Bitmap::VisitWords(bitmaps, [&](std::array<uint64_t, 4> words) {
-    apply(words[LEFT_VALID], words[LEFT_DATA], words[RIGHT_VALID], words[RIGHT_DATA]);
-  });
+  std::array<Bitmap, 4> in_bms{left_valid_bm, left_data_bm, right_valid_bm,
+                               right_data_bm};
+  Bitmap::VisitWordsAndWrite(
+      in_bms, &out_bms,
+      [&](const std::array<uint64_t, 4>& in, std::array<uint64_t, 2>* out) {
+        apply(in[0], in[1], in[2], in[3], &(out->at(0)), &(out->at(1)));
+      });
 }

 inline BooleanScalar InvertScalar(const Scalar& in) {
@@ -204,7 +204,8 @@ struct KleeneAndOp : Commutative<KleeneAndOp> {
                    ArrayData* out) {
     if (left.GetNullCount() == 0 && right.GetNullCount() == 0) {
       out->null_count = 0;
-      out->buffers[0] = nullptr;
+      // Kleene kernels have the validity bitmap pre-allocated. Therefore, set it to 1
+      BitUtil::SetBitmap(out->buffers[0]->mutable_data(), out->offset, out->length);
       return AndOp::Call(ctx, left, right, out);
     }
     auto compute_word = [](uint64_t left_true, uint64_t left_false, uint64_t right_true,
@@ -307,7 +308,8 @@ struct KleeneOrOp : Commutative<KleeneOrOp> {
                    ArrayData* out) {
     if (left.GetNullCount() == 0 && right.GetNullCount() == 0) {
       out->null_count = 0;
-      out->buffers[0] = nullptr;
+      // Kleene kernels have the validity bitmap pre-allocated. Therefore, set it to 1
+      BitUtil::SetBitmap(out->buffers[0]->mutable_data(), out->offset, out->length);
       return OrOp::Call(ctx, left, right, out);
     }

@@ -437,7 +439,8 @@ struct KleeneAndNotOp {
                    ArrayData* out) {
     if (left.GetNullCount() == 0 && right.GetNullCount() == 0) {
       out->null_count = 0;
-      out->buffers[0] = nullptr;
+      // Kleene kernels have the validity bitmap pre-allocated. 
Therefore, set it to 1 + BitUtil::SetBitmap(out->buffers[0]->mutable_data(), out->offset, out->length); return AndNotOp::Call(ctx, left, right, out); } @@ -453,9 +456,8 @@ struct KleeneAndNotOp { } }; -void MakeFunction(std::string name, int arity, ArrayKernelExec exec, +void MakeFunction(const std::string& name, int arity, ArrayKernelExec exec, const FunctionDoc* doc, FunctionRegistry* registry, - bool can_write_into_slices = true, NullHandling::type null_handling = NullHandling::INTERSECTION) { auto func = std::make_shared(name, Arity(arity), doc); @@ -463,7 +465,6 @@ void MakeFunction(std::string name, int arity, ArrayKernelExec exec, std::vector in_types(arity, InputType(boolean())); ScalarKernel kernel(std::move(in_types), boolean(), exec); kernel.null_handling = null_handling; - kernel.can_write_into_slices = can_write_into_slices; DCHECK_OK(func->AddKernel(kernel)); DCHECK_OK(registry->AddFunction(std::move(func))); @@ -549,16 +550,12 @@ void RegisterScalarBoolean(FunctionRegistry* registry) { MakeFunction("or", 2, applicator::SimpleBinary, &or_doc, registry); MakeFunction("xor", 2, applicator::SimpleBinary, &xor_doc, registry); - // The Kleene logic kernels cannot write into sliced output bitmaps MakeFunction("and_kleene", 2, applicator::SimpleBinary, &and_kleene_doc, - registry, - /*can_write_into_slices=*/false, NullHandling::COMPUTED_PREALLOCATE); + registry, NullHandling::COMPUTED_PREALLOCATE); MakeFunction("and_not_kleene", 2, applicator::SimpleBinary, - &and_not_kleene_doc, registry, - /*can_write_into_slices=*/false, NullHandling::COMPUTED_PREALLOCATE); + &and_not_kleene_doc, registry, NullHandling::COMPUTED_PREALLOCATE); MakeFunction("or_kleene", 2, applicator::SimpleBinary, &or_kleene_doc, - registry, - /*can_write_into_slices=*/false, NullHandling::COMPUTED_PREALLOCATE); + registry, NullHandling::COMPUTED_PREALLOCATE); } } // namespace internal diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc index 7a0defaccd6..54e0725fce7 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc @@ -21,11 +21,13 @@ #include #include #include +#include namespace arrow { using internal::BitBlockCount; using internal::BitBlockCounter; using internal::Bitmap; +using internal::BitmapWordReader; namespace compute { @@ -72,116 +74,267 @@ Status PromoteNullsVisitor(KernelContext* ctx, const Datum& cond_d, const Datum& Bitmap cond_valid{cond.buffers[0], cond.offset, cond.length}; Bitmap left_valid = GetBitmap(left_d, 0); Bitmap right_valid = GetBitmap(right_d, 0); - // sometimes Bitmaps will be ignored, in which case we replace access to them with - // duplicated (probably elided) access to cond_data - const Bitmap& _ = cond_data; - - // lambda function that will be used inside the visitor - uint64_t* out_validity = nullptr; - int64_t i = 0; - auto apply = [&](uint64_t c_valid, uint64_t c_data, uint64_t l_valid, - uint64_t r_valid) { - out_validity[i] = c_valid & ((c_data & l_valid) | (~c_data & r_valid)); - i++; - }; // cond.valid & (cond.data & left.valid | ~cond.data & right.valid) // In the following cases, we dont need to allocate out_valid bitmap - // if cond & left & right all ones, then output is all valid --> out_valid = nullptr + // if cond & left & right all ones, then output is all valid. 
output validity buffer + // is already allocated, hence set all bits if (cond_const == kAllValid && left_const == kAllValid && right_const == kAllValid) { + BitUtil::SetBitmap(output->buffers[0]->mutable_data(), output->offset, + output->length); return Status::OK(); } if (left_const == kAllValid && right_const == kAllValid) { - // if both left and right are valid, no need to calculate out_valid bitmap. Pass + // if both left and right are valid, no need to calculate out_valid bitmap. Copy // cond validity buffer - // if there's an offset, copy bitmap (cannot slice a bitmap) - if (cond.offset) { - ARROW_ASSIGN_OR_RAISE( - output->buffers[0], - arrow::internal::CopyBitmap(ctx->memory_pool(), cond.buffers[0]->data(), - cond.offset, cond.length)); - } else { // just copy assign cond validity buffer - output->buffers[0] = cond.buffers[0]; - } + arrow::internal::CopyBitmap(cond.buffers[0]->data(), cond.offset, cond.length, + output->buffers[0]->mutable_data(), output->offset); return Status::OK(); } - // following cases requires a separate out_valid buffer - ARROW_ASSIGN_OR_RAISE(output->buffers[0], ctx->AllocateBitmap(cond.length)); - out_validity = output->GetMutableValues(0); + // lambda function that will be used inside the visitor + auto apply = [&](uint64_t c_valid, uint64_t c_data, uint64_t l_valid, + uint64_t r_valid) { + return c_valid & ((c_data & l_valid) | (~c_data & r_valid)); + }; - enum { C_VALID, C_DATA, L_VALID, R_VALID }; + std::array out_bitmaps{ + Bitmap{output->buffers[0], output->offset, output->length}}; switch (flag) { case COND_CONST | LEFT_CONST | RIGHT_CONST: { - Bitmap bitmaps[] = {_, cond_data, _, _}; - Bitmap::VisitWords(bitmaps, [&](std::array words) { - apply(*cond_const, words[C_DATA], *left_const, *right_const); - }); + std::array bitmaps{cond_data}; + Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps, + [&](const std::array& words_in, + std::array* word_out) { + word_out->at(0) = apply(*cond_const, words_in[0], + *left_const, *right_const); + }); break; } case LEFT_CONST | RIGHT_CONST: { - Bitmap bitmaps[] = {cond_valid, cond_data, _, _}; - Bitmap::VisitWords(bitmaps, [&](std::array words) { - apply(words[C_VALID], words[C_DATA], *left_const, *right_const); - }); + std::array bitmaps{cond_valid, cond_data}; + Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps, + [&](const std::array& words_in, + std::array* word_out) { + word_out->at(0) = apply(words_in[0], words_in[1], + *left_const, *right_const); + }); break; } case COND_CONST | RIGHT_CONST: { // bitmaps[C_VALID], bitmaps[R_VALID] might be null; override to make it safe for // Visit() - Bitmap bitmaps[] = {_, cond_data, left_valid, _}; - Bitmap::VisitWords(bitmaps, [&](std::array words) { - apply(*cond_const, words[C_DATA], words[L_VALID], *right_const); - }); + std::array bitmaps{cond_data, left_valid}; + Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps, + [&](const std::array& words_in, + std::array* word_out) { + word_out->at(0) = apply(*cond_const, words_in[0], + words_in[1], *right_const); + }); break; } case RIGHT_CONST: { // bitmaps[R_VALID] might be null; override to make it safe for Visit() - Bitmap bitmaps[] = {cond_valid, cond_data, left_valid, _}; - Bitmap::VisitWords(bitmaps, [&](std::array words) { - apply(words[C_VALID], words[C_DATA], words[L_VALID], *right_const); - }); + std::array bitmaps{cond_valid, cond_data, left_valid}; + Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps, + [&](const std::array& words_in, + std::array* word_out) { + word_out->at(0) = apply(words_in[0], words_in[1], + 
words_in[2], *right_const);
+                                 });
       break;
     }
     case COND_CONST | LEFT_CONST: {
       // bitmaps[C_VALID], bitmaps[L_VALID] might be null; override to make it safe for
       // Visit()
-      Bitmap bitmaps[] = {_, cond_data, _, right_valid};
-      Bitmap::VisitWords(bitmaps, [&](std::array<uint64_t, 4> words) {
-        apply(*cond_const, words[C_DATA], *left_const, words[R_VALID]);
-      });
+      std::array<Bitmap, 2> bitmaps{cond_data, right_valid};
+      Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
+                                 [&](const std::array<uint64_t, 2>& words_in,
+                                     std::array<uint64_t, 1>* word_out) {
+                                   word_out->at(0) = apply(*cond_const, words_in[0],
+                                                           *left_const, words_in[1]);
+                                 });
       break;
     }
     case LEFT_CONST: {
       // bitmaps[L_VALID] might be null; override to make it safe for Visit()
-      Bitmap bitmaps[] = {cond_valid, cond_data, _, right_valid};
-      Bitmap::VisitWords(bitmaps, [&](std::array<uint64_t, 4> words) {
-        apply(words[C_VALID], words[C_DATA], *left_const, words[R_VALID]);
-      });
+      std::array<Bitmap, 3> bitmaps{cond_valid, cond_data, right_valid};
+      Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
+                                 [&](const std::array<uint64_t, 3>& words_in,
+                                     std::array<uint64_t, 1>* word_out) {
+                                   word_out->at(0) = apply(words_in[0], words_in[1],
+                                                           *left_const, words_in[2]);
+                                 });
       break;
     }
     case COND_CONST: {
       // bitmaps[C_VALID] might be null; override to make it safe for Visit()
-      Bitmap bitmaps[] = {_, cond_data, left_valid, right_valid};
-      Bitmap::VisitWords(bitmaps, [&](std::array<uint64_t, 4> words) {
-        apply(*cond_const, words[C_DATA], words[L_VALID], words[R_VALID]);
-      });
+      std::array<Bitmap, 3> bitmaps{cond_data, left_valid, right_valid};
+      Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
+                                 [&](const std::array<uint64_t, 3>& words_in,
+                                     std::array<uint64_t, 1>* word_out) {
+                                   word_out->at(0) = apply(*cond_const, words_in[0],
+                                                           words_in[1], words_in[2]);
+                                 });
       break;
     }
     case 0: {
-      Bitmap bitmaps[] = {cond_valid, cond_data, left_valid, right_valid};
-      Bitmap::VisitWords(bitmaps, [&](std::array<uint64_t, 4> words) {
-        apply(words[C_VALID], words[C_DATA], words[L_VALID], words[R_VALID]);
-      });
+      std::array<Bitmap, 4> bitmaps{cond_valid, cond_data, left_valid, right_valid};
+      Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
+                                 [&](const std::array<uint64_t, 4>& words_in,
+                                     std::array<uint64_t, 1>* word_out) {
+                                   word_out->at(0) = apply(words_in[0], words_in[1],
+                                                           words_in[2], words_in[3]);
+                                 });
       break;
     }
   }
   return Status::OK();
 }

+using Word = uint64_t;
+static constexpr int64_t word_len = sizeof(Word) * 8;
+
+/// Runs the main if_else loop. Here, it is expected that the right data has already
+/// been copied to the output.
+/// `invert` inverts the meaning of cond.data: if it is set to `true`, the condition
+/// buffer is inverted before calling the handle_bulk or handle_each functions.
+/// This is useful when left is an array and right is a scalar. Then, rather than
+/// copying data from the right to the output, we can copy left data to the output and
+/// invert the cond data to fill in the right values. Filling out with a scalar is
+/// presumed to be more efficient than filling with an array.
+template <typename HandleBulk, typename HandleEach, bool invert = false>
+static void RunIfElseLoop(const ArrayData& cond, HandleBulk handle_bulk,
+                          HandleEach handle_each) {
+  int64_t data_offset = 0;
+  int64_t bit_offset = cond.offset;
+  const auto* cond_data = cond.buffers[1]->data();  // this is a BoolArray
+
+  BitmapWordReader<Word> cond_reader(cond_data, cond.offset, cond.length);
+
+  int64_t cnt = cond_reader.words();
+  while (cnt--) {
+    Word word = cond_reader.NextWord();
+    if (invert) {
+      if (word == 0) {
+        handle_bulk(data_offset, word_len);
+      } else if (word != UINT64_MAX) {
+        for (int64_t i = 0; i < word_len; ++i) {
+          if (!BitUtil::GetBit(cond_data, bit_offset + i)) {
+            handle_each(data_offset + i);
+          }
+        }
+      }
+    } else {
+      if (word == UINT64_MAX) {
+        handle_bulk(data_offset, word_len);
+      } else if (word) {
+        for (int64_t i = 0; i < word_len; ++i) {
+          if (BitUtil::GetBit(cond_data, bit_offset + i)) {
+            handle_each(data_offset + i);
+          }
+        }
+      }
+    }
+    data_offset += word_len;
+    bit_offset += word_len;
+  }
+
+  cnt = cond_reader.trailing_bytes();
+  while (cnt--) {
+    int valid_bits;
+    uint8_t byte = cond_reader.NextTrailingByte(valid_bits);
+    if (invert) {
+      if (byte == 0 && valid_bits == 8) {
+        handle_bulk(data_offset, 8);
+      } else if (byte != UINT8_MAX) {
+        for (int i = 0; i < valid_bits; ++i) {
+          if (!BitUtil::GetBit(cond_data, bit_offset + i)) {
+            handle_each(data_offset + i);
+          }
+        }
+      }
+    } else {
+      if (byte == UINT8_MAX && valid_bits == 8) {
+        handle_bulk(data_offset, 8);
+      } else if (byte) {
+        for (int i = 0; i < valid_bits; ++i) {
+          if (BitUtil::GetBit(cond_data, bit_offset + i)) {
+            handle_each(data_offset + i);
+          }
+        }
+      }
+    }
+    data_offset += 8;
+    bit_offset += 8;
+  }
+}
+
+template <typename HandleBulk, typename HandleEach>
+static void RunIfElseLoopInverted(const ArrayData& cond, HandleBulk handle_bulk,
+                                  HandleEach handle_each) {
+  return RunIfElseLoop<HandleBulk, HandleEach, true>(cond, handle_bulk, handle_each);
+}
+
+/// Runs if_else when cond is a scalar. Two specialized functions are required:
+/// 1. CopyArrayData, 2. BroadcastScalar
+template <typename CopyArrayData, typename BroadcastScalar>
+static Status RunIfElseScalar(const BooleanScalar& cond, const Datum& left,
+                              const Datum& right, Datum* out,
+                              CopyArrayData copy_array_data,
+                              BroadcastScalar broadcast_scalar) {
+  if (left.is_scalar() && right.is_scalar()) {  // output will be a scalar
+    if (cond.is_valid) {
+      *out = cond.value ? left.scalar() : right.scalar();
+    } else {
+      *out = MakeNullScalar(left.type());
+    }
+    return Status::OK();
+  }
+
+  // either left or right is an array. Output is always an array
+  const std::shared_ptr<ArrayData>& out_array = out->array();
+  if (!cond.is_valid) {
+    // cond is null; output is all null --> clear validity buffer
+    BitUtil::ClearBitmap(out_array->buffers[0]->mutable_data(), out_array->offset,
+                         out_array->length);
+    return Status::OK();
+  }
+
+  // cond is a non-null scalar
+  const auto& valid_data = cond.value ? left : right;
+  if (valid_data.is_array()) {
+    // valid_data is an array. 
Hence copy data to the output buffers + const auto& valid_array = valid_data.array(); + if (valid_array->MayHaveNulls()) { + arrow::internal::CopyBitmap( + valid_array->buffers[0]->data(), valid_array->offset, valid_array->length, + out_array->buffers[0]->mutable_data(), out_array->offset); + } else { // validity buffer is nullptr --> set all bits + BitUtil::SetBitmap(out_array->buffers[0]->mutable_data(), out_array->offset, + out_array->length); + } + copy_array_data(*valid_array, out_array.get()); + return Status::OK(); + + } else { // valid data is scalar + // valid data is a scalar that needs to be broadcasted + const auto& valid_scalar = *valid_data.scalar(); + if (valid_scalar.is_valid) { // if the scalar is non-null, broadcast + BitUtil::SetBitmap(out_array->buffers[0]->mutable_data(), out_array->offset, + out_array->length); + broadcast_scalar(*valid_data.scalar(), out_array.get()); + } else { // scalar is null, clear the output validity buffer + BitUtil::ClearBitmap(out_array->buffers[0]->mutable_data(), out_array->offset, + out_array->length); + } + return Status::OK(); + } +} + template struct IfElseFunctor {}; @@ -191,178 +344,148 @@ struct IfElseFunctor {}; template struct IfElseFunctor> { using T = typename TypeTraits::CType; - // A - Array - // S - Scalar + // A - Array, S - Scalar, X = Array/Scalar + + // SXX + static Status Call(KernelContext* ctx, const BooleanScalar& cond, const Datum& left, + const Datum& right, Datum* out) { + return RunIfElseScalar( + cond, left, right, out, + /*CopyArrayData*/ + [&](const ArrayData& valid_array, ArrayData* out_array) { + std::memcpy(out_array->GetMutableValues(1), valid_array.GetValues(1), + valid_array.length * sizeof(T)); + }, + /*BroadcastScalar*/ + [&](const Scalar& scalar, ArrayData* out_array) { + T scalar_data = internal::UnboxScalar::Unbox(scalar); + std::fill(out_array->GetMutableValues(1), + out_array->GetMutableValues(1) + out_array->length, scalar_data); + }); + } // AAA static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left, const ArrayData& right, ArrayData* out) { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr out_buf, - ctx->Allocate(cond.length * sizeof(T))); - T* out_values = reinterpret_cast(out_buf->mutable_data()); + T* out_values = out->template GetMutableValues(1); // copy right data to out_buff const T* right_data = right.GetValues(1); std::memcpy(out_values, right_data, right.length * sizeof(T)); - const auto* cond_data = cond.buffers[1]->data(); // this is a BoolArray - BitBlockCounter bit_counter(cond_data, cond.offset, cond.length); - // selectively copy values from left data const T* left_data = left.GetValues(1); - int64_t offset = cond.offset; - - // todo this can be improved by intrinsics. 
ex: _mm*_mask_store_e* (vmovdqa*) - while (offset < cond.offset + cond.length) { - const BitBlockCount& block = bit_counter.NextWord(); - if (block.AllSet()) { // all from left - std::memcpy(out_values, left_data, block.length * sizeof(T)); - } else if (block.popcount) { // selectively copy from left - for (int64_t i = 0; i < block.length; ++i) { - if (BitUtil::GetBit(cond_data, offset + i)) { - out_values[i] = left_data[i]; - } - } - } - offset += block.length; - out_values += block.length; - left_data += block.length; - } + RunIfElseLoop( + cond, + [&](int64_t data_offset, int64_t num_elems) { + std::memcpy(out_values + data_offset, left_data + data_offset, + num_elems * sizeof(T)); + }, + [&](int64_t data_offset) { out_values[data_offset] = left_data[data_offset]; }); - out->buffers[1] = std::move(out_buf); return Status::OK(); } // ASA static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left, const ArrayData& right, ArrayData* out) { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr out_buf, - ctx->Allocate(cond.length * sizeof(T))); - T* out_values = reinterpret_cast(out_buf->mutable_data()); + T* out_values = out->template GetMutableValues(1); // copy right data to out_buff const T* right_data = right.GetValues(1); std::memcpy(out_values, right_data, right.length * sizeof(T)); - const auto* cond_data = cond.buffers[1]->data(); // this is a BoolArray - BitBlockCounter bit_counter(cond_data, cond.offset, cond.length); - // selectively copy values from left data T left_data = internal::UnboxScalar::Unbox(left); - int64_t offset = cond.offset; - - // todo this can be improved by intrinsics. ex: _mm*_mask_store_e* (vmovdqa*) - while (offset < cond.offset + cond.length) { - const BitBlockCount& block = bit_counter.NextWord(); - if (block.AllSet()) { // all from left - std::fill(out_values, out_values + block.length, left_data); - } else if (block.popcount) { // selectively copy from left - for (int64_t i = 0; i < block.length; ++i) { - if (BitUtil::GetBit(cond_data, offset + i)) { - out_values[i] = left_data; - } - } - } - offset += block.length; - out_values += block.length; - } + RunIfElseLoop( + cond, + [&](int64_t data_offset, int64_t num_elems) { + std::fill(out_values + data_offset, out_values + data_offset + num_elems, + left_data); + }, + [&](int64_t data_offset) { out_values[data_offset] = left_data; }); - out->buffers[1] = std::move(out_buf); return Status::OK(); } // AAS static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left, const Scalar& right, ArrayData* out) { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr out_buf, - ctx->Allocate(cond.length * sizeof(T))); - T* out_values = reinterpret_cast(out_buf->mutable_data()); + T* out_values = out->template GetMutableValues(1); // copy left data to out_buff const T* left_data = left.GetValues(1); std::memcpy(out_values, left_data, left.length * sizeof(T)); - const auto* cond_data = cond.buffers[1]->data(); // this is a BoolArray - BitBlockCounter bit_counter(cond_data, cond.offset, cond.length); - - // selectively copy values from left data T right_data = internal::UnboxScalar::Unbox(right); - int64_t offset = cond.offset; - - // todo this can be improved by intrinsics. ex: _mm*_mask_store_e* (vmovdqa*) - // left data is already in the output buffer. 
Therefore, mask needs to be inverted - while (offset < cond.offset + cond.length) { - const BitBlockCount& block = bit_counter.NextWord(); - if (block.NoneSet()) { // all from right - std::fill(out_values, out_values + block.length, right_data); - } else if (block.popcount) { // selectively copy from right - for (int64_t i = 0; i < block.length; ++i) { - if (!BitUtil::GetBit(cond_data, offset + i)) { - out_values[i] = right_data; - } - } - } - offset += block.length; - out_values += block.length; - } + RunIfElseLoopInverted( + cond, + [&](int64_t data_offset, int64_t num_elems) { + std::fill(out_values + data_offset, out_values + data_offset + num_elems, + right_data); + }, + [&](int64_t data_offset) { out_values[data_offset] = right_data; }); - out->buffers[1] = std::move(out_buf); return Status::OK(); } // ASS static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left, const Scalar& right, ArrayData* out) { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr out_buf, - ctx->Allocate(cond.length * sizeof(T))); - T* out_values = reinterpret_cast(out_buf->mutable_data()); + T* out_values = out->template GetMutableValues(1); // copy right data to out_buff T right_data = internal::UnboxScalar::Unbox(right); std::fill(out_values, out_values + cond.length, right_data); - const auto* cond_data = cond.buffers[1]->data(); // this is a BoolArray - BitBlockCounter bit_counter(cond_data, cond.offset, cond.length); - // selectively copy values from left data T left_data = internal::UnboxScalar::Unbox(left); - int64_t offset = cond.offset; - - // todo this can be improved by intrinsics. ex: _mm*_mask_store_e* (vmovdqa*) - while (offset < cond.offset + cond.length) { - const BitBlockCount& block = bit_counter.NextWord(); - if (block.AllSet()) { // all from left - std::fill(out_values, out_values + block.length, left_data); - } else if (block.popcount) { // selectively copy from left - for (int64_t i = 0; i < block.length; ++i) { - if (BitUtil::GetBit(cond_data, offset + i)) { - out_values[i] = left_data; - } - } - } - - offset += block.length; - out_values += block.length; - } + RunIfElseLoop( + cond, + [&](int64_t data_offset, int64_t num_elems) { + std::fill(out_values + data_offset, out_values + data_offset + num_elems, + left_data); + }, + [&](int64_t data_offset) { out_values[data_offset] = left_data; }); - out->buffers[1] = std::move(out_buf); return Status::OK(); } }; template struct IfElseFunctor> { + // A - Array, S - Scalar, X = Array/Scalar + + // SXX + static Status Call(KernelContext* ctx, const BooleanScalar& cond, const Datum& left, + const Datum& right, Datum* out) { + return RunIfElseScalar( + cond, left, right, out, + /*CopyArrayData*/ + [&](const ArrayData& valid_array, ArrayData* out_array) { + arrow::internal::CopyBitmap( + valid_array.buffers[1]->data(), valid_array.offset, valid_array.length, + out_array->buffers[1]->mutable_data(), out_array->offset); + }, + /*BroadcastScalar*/ + [&](const Scalar& scalar, ArrayData* out_array) { + bool scalar_data = internal::UnboxScalar::Unbox(scalar); + BitUtil::SetBitsTo(out_array->buffers[1]->mutable_data(), out_array->offset, + out_array->length, scalar_data); + }); + } + // AAA static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left, const ArrayData& right, ArrayData* out) { // out_buff = right & ~cond - ARROW_ASSIGN_OR_RAISE(std::shared_ptr out_buf, - arrow::internal::BitmapAndNot( - ctx->memory_pool(), right.buffers[1]->data(), right.offset, - cond.buffers[1]->data(), cond.offset, cond.length, 0)); + 
const auto& out_buf = out->buffers[1];
+    arrow::internal::BitmapAndNot(right.buffers[1]->data(), right.offset,
+                                  cond.buffers[1]->data(), cond.offset, cond.length,
+                                  out->offset, out_buf->mutable_data());

     // out_buff = left & cond
     ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> temp_buf,
@@ -370,9 +493,9 @@ struct IfElseFunctor<Type, enable_if_boolean<Type>> {
                               ctx->memory_pool(), left.buffers[1]->data(), left.offset,
                               cond.buffers[1]->data(), cond.offset, cond.length, 0));

-    arrow::internal::BitmapOr(out_buf->data(), 0, temp_buf->data(), 0, cond.length, 0,
-                              out_buf->mutable_data());
-    out->buffers[1] = std::move(out_buf);
+    arrow::internal::BitmapOr(out_buf->data(), out->offset, temp_buf->data(), 0,
+                              cond.length, out->offset, out_buf->mutable_data());
+
     return Status::OK();
   }

@@ -380,19 +503,19 @@ struct IfElseFunctor<Type, enable_if_boolean<Type>> {
   static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
                      const ArrayData& right, ArrayData* out) {
     // out_buff = right & ~cond
-    ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> out_buf,
-                          arrow::internal::BitmapAndNot(
-                              ctx->memory_pool(), right.buffers[1]->data(), right.offset,
-                              cond.buffers[1]->data(), cond.offset, cond.length, 0));
+    const auto& out_buf = out->buffers[1];
+    arrow::internal::BitmapAndNot(right.buffers[1]->data(), right.offset,
+                                  cond.buffers[1]->data(), cond.offset, cond.length,
+                                  out->offset, out_buf->mutable_data());

     // out_buff = left & cond
     bool left_data = internal::UnboxScalar<BooleanType>::Unbox(left);
     if (left_data) {
-      arrow::internal::BitmapOr(out_buf->data(), 0, cond.buffers[1]->data(), cond.offset,
-                                cond.length, 0, out_buf->mutable_data());
+      arrow::internal::BitmapOr(out_buf->data(), out->offset, cond.buffers[1]->data(),
+                                cond.offset, cond.length, out->offset,
+                                out_buf->mutable_data());
     }

-    out->buffers[1] = std::move(out_buf);
     return Status::OK();
   }

@@ -400,20 +523,20 @@ struct IfElseFunctor<Type, enable_if_boolean<Type>> {
   static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
                      const Scalar& right, ArrayData* out) {
     // out_buff = left & cond
-    ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> out_buf,
-                          arrow::internal::BitmapAnd(
-                              ctx->memory_pool(), left.buffers[1]->data(), left.offset,
-                              cond.buffers[1]->data(), cond.offset, cond.length, 0));
+    const auto& out_buf = out->buffers[1];
+    arrow::internal::BitmapAnd(left.buffers[1]->data(), left.offset,
+                               cond.buffers[1]->data(), cond.offset, cond.length,
+                               out->offset, out_buf->mutable_data());

     bool right_data = internal::UnboxScalar<BooleanType>::Unbox(right);

     // out_buff = left & cond | right & ~cond
     if (right_data) {
-      arrow::internal::BitmapOrNot(out_buf->data(), 0, cond.buffers[1]->data(),
-                                   cond.offset, cond.length, 0, out_buf->mutable_data());
+      arrow::internal::BitmapOrNot(out_buf->data(), out->offset, cond.buffers[1]->data(),
+                                   cond.offset, cond.length, out->offset,
+                                   out_buf->mutable_data());
     }

-    out->buffers[1] = std::move(out_buf);
     return Status::OK();
   }

@@ -423,66 +546,32 @@ struct IfElseFunctor<Type, enable_if_boolean<Type>> {
     bool left_data = internal::UnboxScalar<BooleanType>::Unbox(left);
     bool right_data = internal::UnboxScalar<BooleanType>::Unbox(right);

+    const auto& out_buf = out->buffers[1];
+
     // out_buf = left & cond | right & ~cond
-    std::shared_ptr<Buffer> out_buf = nullptr;
     if (left_data) {
       if (right_data) {
         // out_buf = ones
-        ARROW_ASSIGN_OR_RAISE(out_buf, ctx->AllocateBitmap(cond.length));
-        // filling with UINT8_MAX upto the buffer's size (in bytes)
-        std::memset(out_buf->mutable_data(), UINT8_MAX, out_buf->size());
+        BitUtil::SetBitmap(out_buf->mutable_data(), out->offset, cond.length);
       } else {
         // out_buf = cond
-        out_buf = 
SliceBuffer(cond.buffers[1], cond.offset, cond.length); + arrow::internal::CopyBitmap(cond.buffers[1]->data(), cond.offset, cond.length, + out_buf->mutable_data(), out->offset); } } else { if (right_data) { // out_buf = ~cond - ARROW_ASSIGN_OR_RAISE(out_buf, arrow::internal::InvertBitmap( - ctx->memory_pool(), cond.buffers[1]->data(), - cond.offset, cond.length)) + arrow::internal::InvertBitmap(cond.buffers[1]->data(), cond.offset, cond.length, + out_buf->mutable_data(), out->offset); } else { // out_buf = zeros - ARROW_ASSIGN_OR_RAISE(out_buf, ctx->AllocateBitmap(cond.length)); + BitUtil::ClearBitmap(out_buf->mutable_data(), out->offset, cond.length); } } - out->buffers[1] = std::move(out_buf); - return Status::OK(); - } -}; -template -struct IfElseFunctor> { - template - static inline Status ReturnCopy(const T& in, T* out) { - // Nothing preallocated, so we assign in into the output - *out = in; return Status::OK(); } - - // AAA - static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left, - const ArrayData& right, ArrayData* out) { - return ReturnCopy(left, out); - } - - // ASA - static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left, - const ArrayData& right, ArrayData* out) { - return ReturnCopy(right, out); - } - - // AAS - static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left, - const Scalar& right, ArrayData* out) { - return ReturnCopy(left, out); - } - - // ASS - static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left, - const Scalar& right, ArrayData* out) { - return ReturnCopy(cond, out); - } }; template @@ -491,32 +580,7 @@ struct ResolveIfElseExec { // cond is scalar if (batch[0].is_scalar()) { const auto& cond = batch[0].scalar_as(); - if (batch[1].is_scalar() && batch[2].is_scalar()) { - if (cond.is_valid) { - *out = cond.value ? batch[1].scalar() : batch[2].scalar(); - } else { - *out = MakeNullScalar(batch[1].type()); - } - return Status::OK(); - } - // either left or right is an array. Output is always an array - if (!cond.is_valid) { - // cond is null; just create a null array - ARROW_ASSIGN_OR_RAISE( - *out, MakeArrayOfNull(batch[1].type(), batch.length, ctx->memory_pool())) - return Status::OK(); - } - - const auto& valid_data = cond.value ? batch[1] : batch[2]; - if (valid_data.is_array()) { - *out = valid_data; - } else { - // valid data is a scalar that needs to be broadcasted - ARROW_ASSIGN_OR_RAISE( - *out, - MakeArrayFromScalar(*valid_data.scalar(), batch.length, ctx->memory_pool())); - } - return Status::OK(); + return IfElseFunctor::Call(ctx, cond, batch[1], batch[2], out); } // cond is array. 
Use functors to sort things out @@ -543,6 +607,20 @@ struct ResolveIfElseExec { } }; +template <> +struct ResolveIfElseExec { + static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + if (batch[0].is_scalar()) { + *out = MakeNullScalar(null()); + } else { + const std::shared_ptr& cond_array = batch[0].array(); + ARROW_ASSIGN_OR_RAISE( + *out, MakeArrayOfNull(null(), cond_array->length, ctx->memory_pool())); + } + return Status::OK(); + } +}; + struct IfElseFunction : ScalarFunction { using ScalarFunction::ScalarFunction; @@ -574,14 +652,25 @@ struct IfElseFunction : ScalarFunction { } }; -void AddPrimitiveIfElseKernels(const std::shared_ptr& scalar_function, +void AddNullIfElseKernel(const std::shared_ptr& scalar_function) { + ScalarKernel kernel({boolean(), null(), null()}, null(), + ResolveIfElseExec::Exec); + kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; + kernel.mem_allocation = MemAllocation::NO_PREALLOCATE; + kernel.can_write_into_slices = false; + + DCHECK_OK(scalar_function->AddKernel(std::move(kernel))); +} + +void AddPrimitiveIfElseKernels(const std::shared_ptr& scalar_function, const std::vector>& types) { for (auto&& type : types) { auto exec = internal::GenerateTypeAgnosticPrimitive(*type); // cond array needs to be boolean always ScalarKernel kernel({boolean(), type, type}, type, exec); - kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; - kernel.mem_allocation = MemAllocation::NO_PREALLOCATE; + kernel.null_handling = NullHandling::COMPUTED_PREALLOCATE; + kernel.mem_allocation = MemAllocation::PREALLOCATE; + kernel.can_write_into_slices = true; DCHECK_OK(scalar_function->AddKernel(std::move(kernel))); } @@ -607,7 +696,8 @@ void RegisterScalarIfElse(FunctionRegistry* registry) { AddPrimitiveIfElseKernels(func, NumericTypes()); AddPrimitiveIfElseKernels(func, TemporalTypes()); - AddPrimitiveIfElseKernels(func, {boolean(), null()}); + AddPrimitiveIfElseKernels(func, {boolean()}); + AddNullIfElseKernel(func); // todo add binary kernels DCHECK_OK(registry->AddFunction(std::move(func))); diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc new file mode 100644 index 00000000000..98fb675da40 --- /dev/null +++ b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc @@ -0,0 +1,113 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
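[Editorial aside, not lines of this patch.] The scalar-cond fast path added above is reachable through the IfElse convenience wrapper that the benchmarks below call. A minimal sketch, assuming the Arrow compute API header and the testing helper ArrayFromJSON; the function name IfElseScalarCondSketch is hypothetical:

#include "arrow/compute/api.h"
#include "arrow/testing/gtest_util.h"

// Sketch (not part of the patch): exercising the path handled by RunIfElseScalar.
arrow::Result<arrow::Datum> IfElseScalarCondSketch() {
  arrow::Datum cond(true);  // non-null scalar condition
  arrow::Datum left(arrow::ArrayFromJSON(arrow::int32(), "[1, 2, 3]"));
  arrow::Datum right(arrow::ArrayFromJSON(arrow::int32(), "[4, 5, 6]"));
  // With a non-null scalar cond, the kernel copies/broadcasts one side:
  // this yields [1, 2, 3]; cond == false would yield [4, 5, 6], and a
  // null cond an all-null array.
  return arrow::compute::IfElse(cond, left, right);
}
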
+ +#include +#include +#include +#include +#include + +namespace arrow { +namespace compute { + +const int64_t elems = 1024 * 1024; + +template +static void IfElseBench(benchmark::State& state) { + using CType = typename Type::c_type; + auto type = TypeTraits::type_singleton(); + using ArrayType = typename TypeTraits::ArrayType; + + int64_t len = state.range(0); + int64_t offset = state.range(1); + + random::RandomArrayGenerator rand(/*seed=*/0); + + auto cond = std::static_pointer_cast( + rand.ArrayOf(boolean(), len, /*null_probability=*/0.01)); + auto left = std::static_pointer_cast( + rand.ArrayOf(type, len, /*null_probability=*/0.01)); + auto right = std::static_pointer_cast( + rand.ArrayOf(type, len, /*null_probability=*/0.01)); + + for (auto _ : state) { + ABORT_NOT_OK(IfElse(cond->Slice(offset), left->Slice(offset), right->Slice(offset))); + } + + state.SetBytesProcessed(state.iterations() * + ((len - offset) / 8 + 2 * (len - offset) * sizeof(CType))); +} + +template +static void IfElseBenchContiguous(benchmark::State& state) { + using CType = typename Type::c_type; + auto type = TypeTraits::type_singleton(); + using ArrayType = typename TypeTraits::ArrayType; + + int64_t len = state.range(0); + int64_t offset = state.range(1); + + ASSERT_OK_AND_ASSIGN(auto temp1, MakeArrayFromScalar(BooleanScalar(true), len / 2)); + ASSERT_OK_AND_ASSIGN(auto temp2, + MakeArrayFromScalar(BooleanScalar(false), len - len / 2)); + ASSERT_OK_AND_ASSIGN(auto concat, Concatenate({temp1, temp2})); + auto cond = std::static_pointer_cast(concat); + + random::RandomArrayGenerator rand(/*seed=*/0); + auto left = std::static_pointer_cast( + rand.ArrayOf(type, len, /*null_probability=*/0.01)); + auto right = std::static_pointer_cast( + rand.ArrayOf(type, len, /*null_probability=*/0.01)); + + for (auto _ : state) { + ABORT_NOT_OK(IfElse(cond->Slice(offset), left->Slice(offset), right->Slice(offset))); + } + + state.SetBytesProcessed(state.iterations() * + ((len - offset) / 8 + 2 * (len - offset) * sizeof(CType))); +} + +static void IfElseBench64(benchmark::State& state) { + return IfElseBench(state); +} + +static void IfElseBench32(benchmark::State& state) { + return IfElseBench(state); +} + +static void IfElseBench64Contiguous(benchmark::State& state) { + return IfElseBenchContiguous(state); +} + +static void IfElseBench32Contiguous(benchmark::State& state) { + return IfElseBenchContiguous(state); +} + +BENCHMARK(IfElseBench32)->Args({elems, 0}); +BENCHMARK(IfElseBench64)->Args({elems, 0}); + +BENCHMARK(IfElseBench32)->Args({elems, 99}); +BENCHMARK(IfElseBench64)->Args({elems, 99}); + +BENCHMARK(IfElseBench32Contiguous)->Args({elems, 0}); +BENCHMARK(IfElseBench64Contiguous)->Args({elems, 0}); + +BENCHMARK(IfElseBench32Contiguous)->Args({elems, 99}); +BENCHMARK(IfElseBench64Contiguous)->Args({elems, 99}); + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc index 2b63af2f26f..670a2d42a3a 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc @@ -16,6 +16,7 @@ // under the License. 
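[Editorial aside, not lines of this patch.] The {elems, 99} benchmark arguments above exist to exercise the unaligned code paths: Array::Slice is zero-copy and only bumps ArrayData::offset, so a slice offset that is not a multiple of 8 (99 % 8 == 3) forces the kernels through the bit-shifting word reads. A short sketch of that effect using the existing Array::Slice and offset() APIs; the function name is hypothetical:

#include "arrow/array.h"

// Sketch (not part of the patch): slicing shares buffers with `cond`;
// only the logical offset/length change.
std::shared_ptr<arrow::Array> SliceForUnalignedPath(
    const std::shared_ptr<arrow::Array>& cond) {
  auto sliced = cond->Slice(/*offset=*/99);
  // sliced->offset() == cond->offset() + 99, and the length shrinks by 99
  return sliced;
}
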
#include +#include #include #include #include @@ -56,8 +57,15 @@ TYPED_TEST(TestIfElsePrimitive, IfElseFixedSizeRand) { random::RandomArrayGenerator rand(/*seed=*/0); int64_t len = 1000; - auto cond = std::static_pointer_cast( - rand.ArrayOf(boolean(), len, /*null_probability=*/0.01)); + + // adding 64 consecutive 1's and 0's in the cond array to test all-true/ all-false + // word code paths + ASSERT_OK_AND_ASSIGN(auto temp1, MakeArrayFromScalar(BooleanScalar(true), 64)); + ASSERT_OK_AND_ASSIGN(auto temp2, MakeArrayFromScalar(BooleanScalar(false), 64)); + auto temp3 = rand.ArrayOf(boolean(), len - 64 * 2, /*null_probability=*/0.01); + ASSERT_OK_AND_ASSIGN(auto concat, Concatenate({temp1, temp2, temp3})); + auto cond = std::static_pointer_cast(concat); + auto left = std::static_pointer_cast( rand.ArrayOf(type, len, /*null_probability=*/0.01)); auto right = std::static_pointer_cast( diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index e26a17120cd..660fb2657b6 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -79,6 +79,7 @@ add_arrow_test(threading-utility-test add_arrow_benchmark(bit_block_counter_benchmark) add_arrow_benchmark(bit_util_benchmark) +add_arrow_benchmark(bitmap_reader_benchmark) add_arrow_benchmark(cache_benchmark) add_arrow_benchmark(compression_benchmark) add_arrow_benchmark(decimal_benchmark) diff --git a/cpp/src/arrow/util/bit_util.cc b/cpp/src/arrow/util/bit_util.cc index 6e23678ddf9..ee4bcde7713 100644 --- a/cpp/src/arrow/util/bit_util.cc +++ b/cpp/src/arrow/util/bit_util.cc @@ -20,6 +20,8 @@ #include #include +#include "arrow/util/logging.h" + namespace arrow { namespace BitUtil { @@ -67,5 +69,59 @@ void SetBitsTo(uint8_t* bits, int64_t start_offset, int64_t length, bool bits_ar bits[bytes_end - 1] |= static_cast(fill_byte & ~last_byte_mask); } +template +void SetBitmapImpl(uint8_t* data, int64_t offset, int64_t length) { + // offset length + // data |<------------->| + // |--------|...|--------|...|--------| + // |<--->| |<--->| + // pro epi + if (ARROW_PREDICT_FALSE(length == 0)) { + return; + } + + constexpr uint8_t set_byte = value ? UINT8_MAX : 0; + + auto prologue = static_cast(BitUtil::RoundUp(offset, 8) - offset); + DCHECK_LT(prologue, 8); + + if (length < prologue) { // special case where a mask is required + // offset length + // data |<->| + // |--------|...|--------|... + // mask --> |111| + // |<---->| + // pro + uint8_t mask = BitUtil::kPrecedingBitmask[8 - prologue] ^ + BitUtil::kPrecedingBitmask[8 - prologue + length]; + data[offset / 8] = value ? 
data[offset / 8] | mask : data[offset / 8] & ~mask;
+    return;
+  }
+
+  // align to a byte boundary
+  data[offset / 8] = BitUtil::SpliceWord(8 - prologue, data[offset / 8], set_byte);
+  offset += prologue;
+  length -= prologue;
+
+  // set values per byte
+  DCHECK_EQ(offset % 8, 0);
+  std::memset(data + offset / 8, set_byte, length / 8);
+  offset += BitUtil::RoundDown(length, 8);
+  length -= BitUtil::RoundDown(length, 8);
+
+  // clean up
+  DCHECK_LT(length, 8);
+  data[offset / 8] =
+      BitUtil::SpliceWord(static_cast<int>(length), set_byte, data[offset / 8]);
+}
+
+void SetBitmap(uint8_t* data, int64_t offset, int64_t length) {
+  SetBitmapImpl<true>(data, offset, length);
+}
+
+void ClearBitmap(uint8_t* data, int64_t offset, int64_t length) {
+  SetBitmapImpl<false>(data, offset, length);
+}
+
 }  // namespace BitUtil
 }  // namespace arrow

diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h
index 01845791faa..1e97e467610 100644
--- a/cpp/src/arrow/util/bit_util.h
+++ b/cpp/src/arrow/util/bit_util.h
@@ -316,5 +316,37 @@ static inline void SetBitTo(uint8_t* bits, int64_t i, bool bit_is_set) {
 ARROW_EXPORT
 void SetBitsTo(uint8_t* bits, int64_t start_offset, int64_t length, bool bits_are_set);

+/// \brief Sets all bits in the bitmap to true
+ARROW_EXPORT
+void SetBitmap(uint8_t* data, int64_t offset, int64_t length);
+
+/// \brief Clears all bits in the bitmap (set to false)
+ARROW_EXPORT
+void ClearBitmap(uint8_t* data, int64_t offset, int64_t length);
+
+/// Returns a mask with lower i bits set to 1. If i >= sizeof(Word)*8, all-ones will be
+/// returned
+/// ex: PrecedingWordBitmask<uint8_t>(4) == 0x0f
+/// ref: https://stackoverflow.com/a/59523400
+template <typename Word>
+constexpr Word PrecedingWordBitmask(unsigned int const i) {
+  return (static_cast<Word>(i < sizeof(Word) * 8) << (i & (sizeof(Word) * 8 - 1))) - 1;
+}
+static_assert(PrecedingWordBitmask<uint8_t>(0) == 0x00, "");
+static_assert(PrecedingWordBitmask<uint8_t>(4) == 0x0f, "");
+static_assert(PrecedingWordBitmask<uint8_t>(8) == 0xff, "");
+static_assert(PrecedingWordBitmask<uint16_t>(8) == 0x00ff, "");
+
+/// \brief Create a word with low `n` bits from `low` and high `sizeof(Word)-n` bits
+/// from `high`.
+/// Word ret
+/// for (i = 0; i < sizeof(Word)*8; i++){
+///   ret[i] = i < n ? low[i] : high[i];
+/// }
+template <typename Word>
+constexpr Word SpliceWord(int n, Word low, Word high) {
+  return (high & ~PrecedingWordBitmask<Word>(n)) | (low & PrecedingWordBitmask<Word>(n));
+}
+
 }  // namespace BitUtil
 }  // namespace arrow

diff --git a/cpp/src/arrow/util/bit_util_test.cc b/cpp/src/arrow/util/bit_util_test.cc
index e5a5e4c39be..ded37398f95 100644
--- a/cpp/src/arrow/util/bit_util_test.cc
+++ b/cpp/src/arrow/util/bit_util_test.cc
@@ -1532,6 +1532,43 @@ TEST(BitUtilTests, TestSetBitsTo) {
   }
 }

+TEST(BitUtilTests, TestSetBitmap) {
+  using BitUtil::SetBitsTo;
+  for (const auto fill_byte_int : {0xff}) {
+    const uint8_t fill_byte = static_cast<uint8_t>(fill_byte_int);
+    {
+      // test set within a byte
+      uint8_t bitmap[] = {fill_byte, fill_byte, fill_byte, fill_byte};
+      BitUtil::SetBitmap(bitmap, 2, 2);
+      BitUtil::ClearBitmap(bitmap, 4, 2);
+      ASSERT_BYTES_EQ(bitmap, {static_cast<uint8_t>((fill_byte & ~0x3C) | 0xC)});
+    }
+    {
+      // test straddling a single byte boundary
+      uint8_t bitmap[] = {fill_byte, fill_byte, fill_byte, fill_byte};
+      BitUtil::SetBitmap(bitmap, 4, 7);
+      BitUtil::ClearBitmap(bitmap, 11, 7);
+      ASSERT_BYTES_EQ(bitmap, {static_cast<uint8_t>((fill_byte & 0xF) | 0xF0), 0x7,
+                               static_cast<uint8_t>(fill_byte & ~0x3)});
+    }
+    {
+      // test byte aligned end
+      uint8_t bitmap[] = {fill_byte, fill_byte, fill_byte, fill_byte};
+      BitUtil::SetBitmap(bitmap, 4, 4);
+      BitUtil::ClearBitmap(bitmap, 8, 8);
+      ASSERT_BYTES_EQ(bitmap,
+                      {static_cast<uint8_t>((fill_byte & 0xF) | 0xF0), 0x00, fill_byte});
+    }
+    {
+      // test byte aligned end, multiple bytes
+      uint8_t bitmap[] = {fill_byte, fill_byte, fill_byte, fill_byte};
+      BitUtil::ClearBitmap(bitmap, 0, 24);
+      uint8_t false_byte = static_cast<uint8_t>(0);
+      ASSERT_BYTES_EQ(bitmap, {false_byte, false_byte, false_byte, fill_byte});
+    }
+  }
+}
+
 TEST(BitUtilTests, TestCopyBitmap) {
   const int kBufferSize = 1000;

@@ -1975,6 +2012,34 @@ TEST(BitUtil, BitsetStack) {
   ASSERT_EQ(stack.TopSize(), 0);
 }

+TEST(SpliceWord, SpliceWord) {
+  static_assert(
+      BitUtil::PrecedingWordBitmask<uint8_t>(0) == BitUtil::kPrecedingBitmask[0], "");
+  static_assert(
+      BitUtil::PrecedingWordBitmask<uint8_t>(5) == BitUtil::kPrecedingBitmask[5], "");
+  static_assert(BitUtil::PrecedingWordBitmask<uint8_t>(8) == UINT8_MAX, "");
+
+  static_assert(BitUtil::PrecedingWordBitmask<uint64_t>(0) == uint64_t(0), "");
+  static_assert(BitUtil::PrecedingWordBitmask<uint64_t>(33) == 8589934591, "");
+  static_assert(BitUtil::PrecedingWordBitmask<uint64_t>(64) == UINT64_MAX, "");
+  static_assert(BitUtil::PrecedingWordBitmask<uint64_t>(65) == UINT64_MAX, "");
+
+  ASSERT_EQ(BitUtil::SpliceWord<uint8_t>(0, 0x12, 0xef), 0xef);
+  ASSERT_EQ(BitUtil::SpliceWord<uint8_t>(8, 0x12, 0xef), 0x12);
+  ASSERT_EQ(BitUtil::SpliceWord<uint8_t>(3, 0x12, 0xef), 0xea);
+
+  ASSERT_EQ(BitUtil::SpliceWord<uint32_t>(0, 0x12345678, 0xfedcba98), 0xfedcba98);
+  ASSERT_EQ(BitUtil::SpliceWord<uint32_t>(32, 0x12345678, 0xfedcba98), 0x12345678);
+  ASSERT_EQ(BitUtil::SpliceWord<uint32_t>(24, 0x12345678, 0xfedcba98), 0xfe345678);
+
+  ASSERT_EQ(BitUtil::SpliceWord<uint64_t>(0, 0x0123456789abcdef, 0xfedcba9876543210),
+            0xfedcba9876543210);
+  ASSERT_EQ(BitUtil::SpliceWord<uint64_t>(64, 0x0123456789abcdef, 0xfedcba9876543210),
+            0x0123456789abcdef);
+  ASSERT_EQ(BitUtil::SpliceWord<uint64_t>(48, 0x0123456789abcdef, 0xfedcba9876543210),
+            0xfedc456789abcdef);
+}
+
 // test the basic assumption of word level Bitmap::Visit
 TEST(Bitmap, ShiftingWordsOptimization) {
   // single word
@@ -2156,5 +2221,72 @@ TEST(Bitmap, VisitWordsAnd) {
   }
 }

+void DoBitmapVisitAndWrite(int64_t part, bool with_offset) {
+  int64_t bits = part * 4;
+
+  random::RandomArrayGenerator rand(/*seed=*/0);
+  auto arrow_data = rand.ArrayOf(boolean(), bits, 0);
+
+  std::shared_ptr<Buffer>& 
arrow_buffer = arrow_data->data()->buffers[1]; + + Bitmap bm0(arrow_buffer, 0, part); + Bitmap bm1(arrow_buffer, part * 1, part); + Bitmap bm2(arrow_buffer, part * 2, part); + + std::array out_bms; + if (with_offset) { + ASSERT_OK_AND_ASSIGN(auto out, AllocateBitmap(part * 4)); + out_bms[0] = Bitmap(out, part, part); + out_bms[1] = Bitmap(out, part * 2, part); + } else { + ASSERT_OK_AND_ASSIGN(auto out0, AllocateBitmap(part)); + ASSERT_OK_AND_ASSIGN(auto out1, AllocateBitmap(part)); + out_bms[0] = Bitmap(out0, 0, part); + out_bms[1] = Bitmap(out1, 0, part); + } + + // out0 = bm0 & bm1, out1= bm0 | bm2 + std::array in_bms{bm0, bm1, bm2}; + Bitmap::VisitWordsAndWrite( + in_bms, &out_bms, + [](const std::array& in, std::array* out) { + out->at(0) = in[0] & in[1]; + out->at(1) = in[0] | in[2]; + }); + + auto pool = MemoryPool::CreateDefault(); + ASSERT_OK_AND_ASSIGN(auto exp_0, + BitmapAnd(pool.get(), bm0.buffer()->data(), bm0.offset(), + bm1.buffer()->data(), bm1.offset(), part, 0)); + ASSERT_OK_AND_ASSIGN(auto exp_1, + BitmapOr(pool.get(), bm0.buffer()->data(), bm0.offset(), + bm2.buffer()->data(), bm2.offset(), part, 0)); + + ASSERT_TRUE(BitmapEquals(exp_0->data(), 0, out_bms[0].buffer()->data(), + out_bms[0].offset(), part)) + << "exp: " << Bitmap(exp_0->data(), 0, part).ToString() << std::endl + << "got: " << out_bms[0].ToString(); + + ASSERT_TRUE(BitmapEquals(exp_1->data(), 0, out_bms[1].buffer()->data(), + out_bms[1].offset(), part)) + << "exp: " << Bitmap(exp_1->data(), 0, part).ToString() << std::endl + << "got: " << out_bms[1].ToString(); +} + +class TestBitmapVisitAndWrite : public ::testing::TestWithParam {}; + +INSTANTIATE_TEST_SUITE_P(VisitWriteGeneral, TestBitmapVisitAndWrite, + testing::Values(199, 256, 1000)); + +INSTANTIATE_TEST_SUITE_P(VisitWriteEdgeCases, TestBitmapVisitAndWrite, + testing::Values(5, 13, 21, 29, 37, 41, 51, 59, 64, 97)); + +INSTANTIATE_TEST_SUITE_P(VisitWriteEdgeCases2, TestBitmapVisitAndWrite, + testing::Values(8, 16, 24, 32, 40, 48, 56, 64)); + +TEST_P(TestBitmapVisitAndWrite, NoOffset) { DoBitmapVisitAndWrite(GetParam(), false); } + +TEST_P(TestBitmapVisitAndWrite, WithOffset) { DoBitmapVisitAndWrite(GetParam(), true); } + } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/util/bitmap.h b/cpp/src/arrow/util/bitmap.h index 8562c55e3d5..141f863c0b8 100644 --- a/cpp/src/arrow/util/bitmap.h +++ b/cpp/src/arrow/util/bitmap.h @@ -29,6 +29,9 @@ #include "arrow/buffer.h" #include "arrow/util/bit_util.h" +#include "arrow/util/bitmap_ops.h" +#include "arrow/util/bitmap_reader.h" +#include "arrow/util/bitmap_writer.h" #include "arrow/util/compare.h" #include "arrow/util/endian.h" #include "arrow/util/functional.h" @@ -109,6 +112,21 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable, } } + /// \brief Visit bits from each bitmap as bitset + /// + /// All bitmaps must have identical length. + template + static void VisitBits(const std::array& bitmaps, Visitor&& visitor) { + int64_t bit_length = BitLength(bitmaps); + std::bitset bits; + for (int64_t bit_i = 0; bit_i < bit_length; ++bit_i) { + for (size_t i = 0; i < N; ++i) { + bits[i] = bitmaps[i].GetBit(bit_i); + } + visitor(bits); + } + } + /// \brief Visit words of bits from each bitmap as array /// /// All bitmaps must have identical length. 
The first bit in a visited bitmap @@ -225,6 +243,132 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable, return min_offset; } + template >::type::value_type> + static void RunVisitWordsAndWriteLoop(int64_t bit_length, + std::array& readers, + std::array& writers, + Visitor&& visitor) { + constexpr int64_t kBitWidth = sizeof(Word) * 8; + + std::array visited_words; + std::array output_words; + + // every reader will have same number of words, since they are same length'ed + // TODO($JIRA) this will be inefficient in some cases. When there are offsets beyond + // Word boundary, every Word would have to be created from 2 adjoining Words + auto n_words = readers[0].words(); + bit_length -= n_words * kBitWidth; + while (n_words--) { + // first collect all words to visited_words array + for (size_t i = 0; i < N; i++) { + visited_words[i] = readers[i].NextWord(); + } + visitor(visited_words, &output_words); + for (size_t i = 0; i < M; i++) { + writers[i].PutNextWord(output_words[i]); + } + } + + // every reader will have same number of trailing bytes, because of the above reason + // tailing portion could be more than one word! (ref: BitmapWordReader constructor) + // remaining full/ partial words to write + + if (bit_length) { + // convert the word visitor lambda to a byte_visitor + auto byte_visitor = [&](const std::array& in, + std::array* out) { + std::array in_words; + std::array out_words; + std::copy(in.begin(), in.end(), in_words.begin()); + visitor(in_words, &out_words); + for (size_t i = 0; i < M; i++) { + out->at(i) = static_cast(out_words[i]); + } + }; + + std::array visited_bytes; + std::array output_bytes; + int n_bytes = readers[0].trailing_bytes(); + while (n_bytes--) { + visited_bytes.fill(0); + output_bytes.fill(0); + int valid_bits; + for (size_t i = 0; i < N; i++) { + visited_bytes[i] = readers[i].NextTrailingByte(valid_bits); + } + byte_visitor(visited_bytes, &output_bytes); + for (size_t i = 0; i < M; i++) { + writers[i].PutNextTrailingByte(output_bytes[i], valid_bits); + } + } + } + } + + /// \brief Visit words of bits from each input bitmap as array and collects + /// outputs to an array, to be written into the output bitmaps accordingly. + /// + /// All bitmaps must have identical length. The first bit in a visited bitmap + /// may be offset within the first visited word, but words will otherwise contain + /// densely packed bits loaded from the bitmap. That offset within the first word is + /// returned. + /// Visitor is expected to have the following signature + /// [](const std::array& in_words, std::array* out_words){...} + /// + // NOTE: this function is efficient on 3+ sufficiently large bitmaps. + // It also has a large prolog / epilog overhead and should be used + // carefully in other cases. + // For 2 bitmaps or less, and/or smaller bitmaps, see also VisitTwoBitBlocksVoid + // and BitmapUInt64Reader. 
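[Editorial usage sketch, not lines of this patch.] The expected call shape, mirroring the DoBitmapVisitAndWrite unit test added above — here two bitmap ops are fused into one pass; all five bitmaps are assumed preallocated and of identical length, and the function name FusedAndOr is hypothetical:

#include <array>
#include <cstdint>
#include "arrow/util/bitmap.h"

// Sketch (not part of the patch): one traversal produces two output bitmaps.
void FusedAndOr(const arrow::internal::Bitmap& bm0, const arrow::internal::Bitmap& bm1,
                const arrow::internal::Bitmap& bm2, const arrow::internal::Bitmap& out0,
                const arrow::internal::Bitmap& out1) {
  using arrow::internal::Bitmap;
  std::array<Bitmap, 3> in{bm0, bm1, bm2};
  std::array<Bitmap, 2> out{out0, out1};  // copies share the underlying buffers
  Bitmap::VisitWordsAndWrite(
      in, &out,
      [](const std::array<uint64_t, 3>& in_words, std::array<uint64_t, 2>* out_words) {
        out_words->at(0) = in_words[0] & in_words[1];  // out0 = bm0 & bm1
        out_words->at(1) = in_words[0] | in_words[2];  // out1 = bm0 | bm2
      });
}
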
+ template >::type::value_type> + static void VisitWordsAndWrite(const std::array& bitmaps_arg, + std::array* out_bitmaps_arg, + Visitor&& visitor) { + int64_t bit_length = BitLength(bitmaps_arg); + assert(bit_length == BitLength(*out_bitmaps_arg)); + + // if both input and output bitmaps have no byte offset, then use special template + if (std::all_of(bitmaps_arg.begin(), bitmaps_arg.end(), + [](const Bitmap& b) { return b.offset_ % 8 == 0; }) && + std::all_of(out_bitmaps_arg->begin(), out_bitmaps_arg->end(), + [](const Bitmap& b) { return b.offset_ % 8 == 0; })) { + std::array, N> readers; + for (size_t i = 0; i < N; ++i) { + const Bitmap& in_bitmap = bitmaps_arg[i]; + readers[i] = BitmapWordReader( + in_bitmap.buffer_->data(), in_bitmap.offset_, in_bitmap.length_); + } + + std::array, M> writers; + for (size_t i = 0; i < M; ++i) { + const Bitmap& out_bitmap = out_bitmaps_arg->at(i); + writers[i] = BitmapWordWriter( + out_bitmap.buffer_->mutable_data(), out_bitmap.offset_, out_bitmap.length_); + } + + RunVisitWordsAndWriteLoop(bit_length, readers, writers, visitor); + } else { + std::array, N> readers; + for (size_t i = 0; i < N; ++i) { + const Bitmap& in_bitmap = bitmaps_arg[i]; + readers[i] = BitmapWordReader(in_bitmap.buffer_->data(), in_bitmap.offset_, + in_bitmap.length_); + } + + std::array, M> writers; + for (size_t i = 0; i < M; ++i) { + const Bitmap& out_bitmap = out_bitmaps_arg->at(i); + writers[i] = BitmapWordWriter(out_bitmap.buffer_->mutable_data(), + out_bitmap.offset_, out_bitmap.length_); + } + + RunVisitWordsAndWriteLoop(bit_length, readers, writers, visitor); + } + } + const std::shared_ptr& buffer() const { return buffer_; } /// offset of first bit relative to buffer().data() @@ -301,6 +445,14 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable, /// assert bitmaps have identical length and return that length static int64_t BitLength(const Bitmap* bitmaps, size_t N); + template + static int64_t BitLength(const std::array& bitmaps) { + for (size_t i = 1; i < N; ++i) { + assert(bitmaps[i].length() == bitmaps[0].length()); + } + return bitmaps[0].length(); + } + std::shared_ptr buffer_; int64_t offset_ = 0, length_ = 0; }; diff --git a/cpp/src/arrow/util/bitmap_ops.cc b/cpp/src/arrow/util/bitmap_ops.cc index a27a61cadf3..63c8b008f4a 100644 --- a/cpp/src/arrow/util/bitmap_ops.cc +++ b/cpp/src/arrow/util/bitmap_ops.cc @@ -28,9 +28,7 @@ #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_reader.h" #include "arrow/util/bitmap_writer.h" -#include "arrow/util/endian.h" #include "arrow/util/logging.h" -#include "arrow/util/ubsan.h" namespace arrow { namespace internal { @@ -85,222 +83,6 @@ int64_t CountSetBits(const uint8_t* data, int64_t bit_offset, int64_t length) { return count; } -namespace { - -// BitmapWordReader here is faster than BitmapUInt64Reader (in bitmap_reader.h) -// on sufficiently large inputs. However, it has a larger prolog / epilog overhead -// and should probably not be used for small bitmaps. 
diff --git a/cpp/src/arrow/util/bitmap_ops.cc b/cpp/src/arrow/util/bitmap_ops.cc
index a27a61cadf3..63c8b008f4a 100644
--- a/cpp/src/arrow/util/bitmap_ops.cc
+++ b/cpp/src/arrow/util/bitmap_ops.cc
@@ -28,9 +28,7 @@
 #include "arrow/util/bit_util.h"
 #include "arrow/util/bitmap_reader.h"
 #include "arrow/util/bitmap_writer.h"
-#include "arrow/util/endian.h"
 #include "arrow/util/logging.h"
-#include "arrow/util/ubsan.h"
 
 namespace arrow {
 namespace internal {
@@ -85,222 +83,6 @@ int64_t CountSetBits(const uint8_t* data, int64_t bit_offset, int64_t length) {
   return count;
 }
 
-namespace {
-
-// BitmapWordReader here is faster than BitmapUInt64Reader (in bitmap_reader.h)
-// on sufficiently large inputs. However, it has a larger prolog / epilog overhead
-// and should probably not be used for small bitmaps.
-
-template <typename Word>
-class BitmapWordReader {
- public:
-  BitmapWordReader(const uint8_t* bitmap, int64_t offset, int64_t length) {
-    bitmap_ = bitmap + offset / 8;
-    offset_ = offset % 8;
-    bitmap_end_ = bitmap_ + BitUtil::BytesForBits(offset_ + length);
-
-    // decrement word count by one as we may touch two adjacent words in one iteration
-    nwords_ = length / (sizeof(Word) * 8) - 1;
-    if (nwords_ < 0) {
-      nwords_ = 0;
-    }
-    trailing_bits_ = static_cast<int>(length - nwords_ * sizeof(Word) * 8);
-    trailing_bytes_ = static_cast<int>(BitUtil::BytesForBits(trailing_bits_));
-
-    if (nwords_ > 0) {
-      current_word_ = load<Word>(bitmap_);
-    } else if (length > 0) {
-      current_byte_ = load<uint8_t>(bitmap_);
-    }
-  }
-
-  Word NextWord() {
-    bitmap_ += sizeof(Word);
-    const Word next_word = load<Word>(bitmap_);
-    Word word = current_word_;
-    if (offset_) {
-      // combine two adjacent words into one word
-      // |<------ next ----->|<---- current ---->|
-      // +-------------+-----+-------------+-----+
-      // |     ---     |  A  |      B      | --- |
-      // +-------------+-----+-------------+-----+
-      //                  |         |  offset
-      //                  v         v
-      //               +-----+-------------+
-      //               |  A  |      B      |
-      //               +-----+-------------+
-      //               |<------ word ----->|
-      word >>= offset_;
-      word |= next_word << (sizeof(Word) * 8 - offset_);
-    }
-    current_word_ = next_word;
-    return word;
-  }
-
-  uint8_t NextTrailingByte(int& valid_bits) {
-    uint8_t byte;
-    DCHECK_GT(trailing_bits_, 0);
-
-    if (trailing_bits_ <= 8) {
-      // last byte
-      valid_bits = trailing_bits_;
-      trailing_bits_ = 0;
-      byte = 0;
-      internal::BitmapReader reader(bitmap_, offset_, valid_bits);
-      for (int i = 0; i < valid_bits; ++i) {
-        byte >>= 1;
-        if (reader.IsSet()) {
-          byte |= 0x80;
-        }
-        reader.Next();
-      }
-      byte >>= (8 - valid_bits);
-    } else {
-      ++bitmap_;
-      const uint8_t next_byte = load<uint8_t>(bitmap_);
-      byte = current_byte_;
-      if (offset_) {
-        byte >>= offset_;
-        byte |= next_byte << (8 - offset_);
-      }
-      current_byte_ = next_byte;
-      trailing_bits_ -= 8;
-      valid_bits = 8;
-    }
-    return byte;
-  }
-
-  int64_t words() const { return nwords_; }
-  int trailing_bytes() const { return trailing_bytes_; }
-
- private:
-  int64_t offset_;
-  const uint8_t* bitmap_;
-
-  const uint8_t* bitmap_end_;
-  int64_t nwords_;
-  int trailing_bits_;
-  int trailing_bytes_;
-  union {
-    Word current_word_;
-    struct {
-#if ARROW_LITTLE_ENDIAN == 0
-      uint8_t padding_bytes_[sizeof(Word) - 1];
-#endif
-      uint8_t current_byte_;
-    };
-  };
-
-  template <typename DType>
-  DType load(const uint8_t* bitmap) {
-    DCHECK_LE(bitmap + sizeof(DType), bitmap_end_);
-    return BitUtil::ToLittleEndian(util::SafeLoadAs<DType>(bitmap));
-  }
-};
-
-template <typename Word>
-class BitmapWordWriter {
- public:
-  BitmapWordWriter(uint8_t* bitmap, int64_t offset, int64_t length) {
-    bitmap_ = bitmap + offset / 8;
-    offset_ = offset % 8;
-    bitmap_end_ = bitmap_ + BitUtil::BytesForBits(offset_ + length);
-    mask_ = (1U << offset_) - 1;
-
-    if (offset_) {
-      if (length >= static_cast<int64_t>(sizeof(Word) * 8)) {
-        current_word_ = load<Word>(bitmap_);
-      } else if (length > 0) {
-        current_byte_ = load<uint8_t>(bitmap_);
-      }
-    }
-  }
-
-  void PutNextWord(Word word) {
-    if (offset_) {
-      // split one word into two adjacent words, don't touch unused bits
-      //               |<------ word ----->|
-      //               +-----+-------------+
-      //               |  A  |      B      |
-      //               +-----+-------------+
-      //                  |         |
-      //                  v         v       offset
-      // +-------------+-----+-------------+-----+
-      // |     ---     |  A  |      B      | --- |
-      // +-------------+-----+-------------+-----+
-      // |<------ next ----->|<---- current ---->|
-      word = (word << offset_) | (word >> (sizeof(Word) * 8 - offset_));
-      Word next_word = load<Word>(bitmap_ + sizeof(Word));
-      current_word_ = (current_word_ & mask_) | (word & ~mask_);
-      next_word = (next_word & ~mask_) | (word & mask_);
-      store(bitmap_, current_word_);
-      store(bitmap_ + sizeof(Word), next_word);
-      current_word_ = next_word;
-    } else {
-      store(bitmap_, word);
-    }
-    bitmap_ += sizeof(Word);
-  }
-
-  void PutNextTrailingByte(uint8_t byte, int valid_bits) {
-    if (valid_bits == 8) {
-      if (offset_) {
-        byte = (byte << offset_) | (byte >> (8 - offset_));
-        uint8_t next_byte = load<uint8_t>(bitmap_ + 1);
-        current_byte_ = (current_byte_ & mask_) | (byte & ~mask_);
-        next_byte = (next_byte & ~mask_) | (byte & mask_);
-        store(bitmap_, current_byte_);
-        store(bitmap_ + 1, next_byte);
-        current_byte_ = next_byte;
-      } else {
-        store(bitmap_, byte);
-      }
-      ++bitmap_;
-    } else {
-      DCHECK_GT(valid_bits, 0);
-      DCHECK_LT(valid_bits, 8);
-      DCHECK_LE(bitmap_ + BitUtil::BytesForBits(offset_ + valid_bits), bitmap_end_);
-      internal::BitmapWriter writer(bitmap_, offset_, valid_bits);
-      for (int i = 0; i < valid_bits; ++i) {
-        (byte & 0x01) ? writer.Set() : writer.Clear();
-        writer.Next();
-        byte >>= 1;
-      }
-      writer.Finish();
-    }
-  }
-
- private:
-  int64_t offset_;
-  uint8_t* bitmap_;
-
-  const uint8_t* bitmap_end_;
-  uint64_t mask_;
-  union {
-    Word current_word_;
-    struct {
-#if ARROW_LITTLE_ENDIAN == 0
-      uint8_t padding_bytes_[sizeof(Word) - 1];
-#endif
-      uint8_t current_byte_;
-    };
-  };
-
-  template <typename DType>
-  DType load(const uint8_t* bitmap) {
-    DCHECK_LE(bitmap + sizeof(DType), bitmap_end_);
-    return BitUtil::ToLittleEndian(util::SafeLoadAs<DType>(bitmap));
-  }
-
-  template <typename DType>
-  void store(uint8_t* bitmap, DType data) {
-    DCHECK_LE(bitmap + sizeof(DType), bitmap_end_);
-    util::SafeStore(bitmap, BitUtil::FromLittleEndian(data));
-  }
-};
-
-}  // namespace
-
 enum class TransferMode : bool { Copy, Invert };
 
 template <TransferMode mode>
diff --git a/cpp/src/arrow/util/bitmap_reader.h b/cpp/src/arrow/util/bitmap_reader.h
index cf4f5e7db8b..7c43747fafb 100644
--- a/cpp/src/arrow/util/bitmap_reader.h
+++ b/cpp/src/arrow/util/bitmap_reader.h
@@ -142,6 +142,118 @@ class BitmapUInt64Reader {
   uint64_t carry_bits_;
 };
 
+// BitmapWordReader is faster than BitmapUInt64Reader (above) on sufficiently
+// large inputs. However, it has a larger prolog / epilog overhead and should
+// probably not be used for small bitmaps.
+
+template <typename Word, bool may_have_byte_offset = true>
+class BitmapWordReader {
+ public:
+  BitmapWordReader() = default;
+  BitmapWordReader(const uint8_t* bitmap, int64_t offset, int64_t length)
+      : offset_(static_cast<int64_t>(may_have_byte_offset) * (offset % 8)),
+        bitmap_(bitmap + offset / 8),
+        bitmap_end_(bitmap_ + BitUtil::BytesForBits(offset_ + length)) {
+    // decrement word count by one as we may touch two adjacent words in one iteration
+    nwords_ = length / (sizeof(Word) * 8) - 1;
+    if (nwords_ < 0) {
+      nwords_ = 0;
+    }
+    trailing_bits_ = static_cast<int>(length - nwords_ * sizeof(Word) * 8);
+    trailing_bytes_ = static_cast<int>(BitUtil::BytesForBits(trailing_bits_));
+
+    if (nwords_ > 0) {
+      current_word_ = load<Word>(bitmap_);
+    } else if (length > 0) {
+      current_byte_ = load<uint8_t>(bitmap_);
+    }
+  }
+
+  Word NextWord() {
+    bitmap_ += sizeof(Word);
+    const Word next_word = load<Word>(bitmap_);
+    Word word = current_word_;
+    if (may_have_byte_offset && offset_) {
+      // combine two adjacent words into one word
+      // |<------ next ----->|<---- current ---->|
+      // +-------------+-----+-------------+-----+
+      // |     ---     |  A  |      B      | --- |
+      // +-------------+-----+-------------+-----+
+      //                  |         |  offset
+      //                  v         v
+      //               +-----+-------------+
+      //               |  A  |      B      |
+      //               +-----+-------------+
+      //               |<------ word ----->|
+      word >>= offset_;
+      word |= next_word << (sizeof(Word) * 8 - offset_);
+    }
+    current_word_ = next_word;
+    return word;
+  }
+
+  uint8_t NextTrailingByte(int& valid_bits) {
+    uint8_t byte;
+    assert(trailing_bits_ > 0);
+
+    if (trailing_bits_ <= 8) {
+      // last byte
+      valid_bits = trailing_bits_;
+      trailing_bits_ = 0;
+      byte = 0;
+      internal::BitmapReader reader(bitmap_, offset_, valid_bits);
+      for (int i = 0; i < valid_bits; ++i) {
+        byte >>= 1;
+        if (reader.IsSet()) {
+          byte |= 0x80;
+        }
+        reader.Next();
+      }
+      byte >>= (8 - valid_bits);
+    } else {
+      ++bitmap_;
+      const uint8_t next_byte = load<uint8_t>(bitmap_);
+      byte = current_byte_;
+      if (may_have_byte_offset && offset_) {
+        byte >>= offset_;
+        byte |= next_byte << (8 - offset_);
+      }
+      current_byte_ = next_byte;
+      trailing_bits_ -= 8;
+      trailing_bytes_--;
+      valid_bits = 8;
+    }
+    return byte;
+  }
+
+  int64_t words() const { return nwords_; }
+  int trailing_bytes() const { return trailing_bytes_; }
+
+ private:
+  int64_t offset_;
+  const uint8_t* bitmap_;
+
+  const uint8_t* bitmap_end_;
+  int64_t nwords_;
+  int trailing_bits_;
+  int trailing_bytes_;
+  union {
+    Word current_word_;
+    struct {
+#if ARROW_LITTLE_ENDIAN == 0
+      uint8_t padding_bytes_[sizeof(Word) - 1];
+#endif
+      uint8_t current_byte_;
+    };
+  };
+
+  template <typename DType>
+  DType load(const uint8_t* bitmap) {
+    assert(bitmap + sizeof(DType) <= bitmap_end_);
+    return BitUtil::ToLittleEndian(util::SafeLoadAs<DType>(bitmap));
+  }
+};
+
 /// \brief Index into a possibly non-existent bitmap
 struct OptionalBitIndexer {
   const uint8_t* bitmap;
@@ -151,7 +263,7 @@ struct OptionalBitIndexer {
       : bitmap(buffer == NULLPTR ? NULLPTR : buffer->data()), offset(offset) {}
 
   bool operator[](int64_t i) const {
-    return bitmap == NULLPTR ? true : BitUtil::GetBit(bitmap, offset + i);
+    return bitmap == NULLPTR || BitUtil::GetBit(bitmap, offset + i);
  }
 };
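
To make the reader's contract concrete, here is a sketch of the canonical consumption loop (the function name is hypothetical and not part of the patch): drain words() with NextWord(), then drain trailing_bytes() with NextTrailingByte(), which reports how many low bits of each returned byte are meaningful.

#include <cstdint>

#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_reader.h"

// Count the set bits in bitmap[offset, offset + length) with BitmapWordReader.
// The word loop and the trailing-byte loop together cover every bit exactly once.
int64_t CountSetBitsExample(const uint8_t* bitmap, int64_t offset, int64_t length) {
  arrow::internal::BitmapWordReader<uint64_t> reader(bitmap, offset, length);
  int64_t set_bits = 0;
  int64_t n_words = reader.words();
  while (n_words--) {
    set_bits += arrow::BitUtil::PopCount(reader.NextWord());
  }
  int n_bytes = reader.trailing_bytes();
  while (n_bytes--) {
    int valid_bits;
    // NextTrailingByte() zeroes any bits above valid_bits, so the result can be
    // popcounted directly.
    uint8_t byte = reader.NextTrailingByte(valid_bits);
    set_bits += arrow::BitUtil::PopCount(static_cast<uint64_t>(byte));
  }
  return set_bits;
}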
diff --git a/cpp/src/arrow/util/bitmap_reader_benchmark.cc b/cpp/src/arrow/util/bitmap_reader_benchmark.cc
new file mode 100644
index 00000000000..359653c9644
--- /dev/null
+++ b/cpp/src/arrow/util/bitmap_reader_benchmark.cc
@@ -0,0 +1,113 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <random>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/testing/gtest_util.h"
+#include "arrow/testing/util.h"
+#include "arrow/util/bit_block_counter.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_reader.h"
+#include "benchmark/benchmark.h"
+
+namespace arrow {
+namespace BitUtil {
+
+using internal::BitBlockCount;
+using internal::BitBlockCounter;
+using internal::BitmapWordReader;
+
+const int64_t kBufferSize = 1024 * (std::rand() % 25 + 1000);
+
+static std::shared_ptr<Buffer> CreateRandomBuffer(int64_t nbytes) {
+  auto buffer = *AllocateBuffer(nbytes);
+  memset(buffer->mutable_data(), 0, nbytes);
+  random_bytes(nbytes, /*seed=*/0, buffer->mutable_data());
+  return std::move(buffer);
+}
+
+static void BitBlockCounterBench(benchmark::State& state) {
+  int64_t nbytes = state.range(0);
+  std::shared_ptr<Buffer> cond_buf = CreateRandomBuffer(nbytes);
+  for (auto _ : state) {
+    BitBlockCounter counter(cond_buf->data(), 0, nbytes * 8);
+
+    int64_t offset = 0;
+    uint64_t set_bits = 0;
+
+    while (offset < nbytes * 8) {
+      const BitBlockCount& word = counter.NextWord();
+      set_bits += word.popcount;
+      benchmark::DoNotOptimize(set_bits);
+      offset += word.length;
+    }
+    benchmark::ClobberMemory();
+  }
+
+  state.SetBytesProcessed(state.iterations() * nbytes);
+}
+
+static void BitmapWordReaderBench(benchmark::State& state) {
+  int64_t nbytes = state.range(0);
+  std::shared_ptr<Buffer> cond_buf = CreateRandomBuffer(nbytes);
+  for (auto _ : state) {
+    BitmapWordReader<uint64_t> counter(cond_buf->data(), 0, nbytes * 8);
+
+    int64_t set_bits = 0;
+
+    int64_t cnt = counter.words();
+    while (cnt--) {
+      const uint64_t word = counter.NextWord();
+      set_bits += PopCount(word);
+      benchmark::DoNotOptimize(set_bits);
+    }
+
+    cnt = counter.trailing_bytes();
+    while (cnt--) {
+      int valid_bits;
+      // NextTrailingByte() zeroes any bits above valid_bits, so the byte can be
+      // popcounted directly (masking with kPrecedingBitmask[valid_bits] would
+      // read out of bounds when valid_bits == 8).
+      const uint8_t byte = counter.NextTrailingByte(valid_bits);
+      set_bits += PopCount(static_cast<uint64_t>(byte));
+      benchmark::DoNotOptimize(set_bits);
+    }
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * nbytes);
+}
+
+BENCHMARK(BitBlockCounterBench)->Arg(kBufferSize);
+BENCHMARK(BitmapWordReaderBench)->Arg(kBufferSize);
+
+}  // namespace BitUtil
+}  // namespace arrow
diff --git a/cpp/src/arrow/util/bitmap_writer.h b/cpp/src/arrow/util/bitmap_writer.h
index d4f02f37a41..d5c6d909df0 100644
--- a/cpp/src/arrow/util/bitmap_writer.h
+++ b/cpp/src/arrow/util/bitmap_writer.h
@@ -180,5 +180,106 @@ class FirstTimeBitmapWriter {
   int64_t byte_offset_;
 };
 
+template <typename Word, bool may_have_byte_offset = true>
+class BitmapWordWriter {
+ public:
+  BitmapWordWriter() = default;
+  BitmapWordWriter(uint8_t* bitmap, int64_t offset, int64_t length)
+      : offset_(static_cast<int64_t>(may_have_byte_offset) * (offset % 8)),
+        bitmap_(bitmap + offset / 8),
+        bitmap_end_(bitmap_ + BitUtil::BytesForBits(offset_ + length)),
+        mask_((1U << offset_) - 1) {
+    if (offset_) {
+      if (length >= static_cast<int64_t>(sizeof(Word) * 8)) {
+        current_word_ = load<Word>(bitmap_);
+      } else if (length > 0) {
+        current_byte_ = load<uint8_t>(bitmap_);
+      }
+    }
+  }
+
+  void PutNextWord(Word word) {
+    if (may_have_byte_offset && offset_) {
+      // split one word into two adjacent words, don't touch unused bits
+      //               |<------ word ----->|
+      //               +-----+-------------+
+      //               |  A  |      B      |
+      //               +-----+-------------+
+      //                  |         |
+      //                  v         v       offset
+      // +-------------+-----+-------------+-----+
+      // |     ---     |  A  |      B      | --- |
+      // +-------------+-----+-------------+-----+
+      // |<------ next ----->|<---- current ---->|
+      word = (word << offset_) | (word >> (sizeof(Word) * 8 - offset_));
+      Word next_word = load<Word>(bitmap_ + sizeof(Word));
+      current_word_ = (current_word_ & mask_) | (word & ~mask_);
+      next_word = (next_word & ~mask_) | (word & mask_);
+      store(bitmap_, current_word_);
+      store(bitmap_ + sizeof(Word), next_word);
+      current_word_ = next_word;
+    } else {
+      store(bitmap_, word);
+    }
+    bitmap_ += sizeof(Word);
+  }
+
+  void PutNextTrailingByte(uint8_t byte, int valid_bits) {
+    if (valid_bits == 8) {
+      if (may_have_byte_offset && offset_) {
+        byte = (byte << offset_) | (byte >> (8 - offset_));
+        uint8_t next_byte = load<uint8_t>(bitmap_ + 1);
+        current_byte_ = (current_byte_ & mask_) | (byte & ~mask_);
+        next_byte = (next_byte & ~mask_) | (byte & mask_);
+        store(bitmap_, current_byte_);
+        store(bitmap_ + 1, next_byte);
+        current_byte_ = next_byte;
+      } else {
+        store(bitmap_, byte);
+      }
+      ++bitmap_;
+    } else {
+      assert(valid_bits > 0);
+      assert(valid_bits < 8);
+      assert(bitmap_ + BitUtil::BytesForBits(offset_ + valid_bits) <= bitmap_end_);
+      internal::BitmapWriter writer(bitmap_, offset_, valid_bits);
+      for (int i = 0; i < valid_bits; ++i) {
+        (byte & 0x01) ? writer.Set() : writer.Clear();
+        writer.Next();
+        byte >>= 1;
+      }
+      writer.Finish();
+    }
+  }
+
+ private:
+  int64_t offset_;
+  uint8_t* bitmap_;
+
+  const uint8_t* bitmap_end_;
+  uint64_t mask_;
+  union {
+    Word current_word_;
+    struct {
+#if ARROW_LITTLE_ENDIAN == 0
+      uint8_t padding_bytes_[sizeof(Word) - 1];
+#endif
+      uint8_t current_byte_;
+    };
+  };
+
+  template <typename DType>
+  DType load(const uint8_t* bitmap) {
+    assert(bitmap + sizeof(DType) <= bitmap_end_);
+    return BitUtil::ToLittleEndian(util::SafeLoadAs<DType>(bitmap));
+  }
+
+  template <typename DType>
+  void store(uint8_t* bitmap, DType data) {
+    assert(bitmap + sizeof(DType) <= bitmap_end_);
+    util::SafeStore(bitmap, BitUtil::FromLittleEndian(data));
+  }
+};
+
 }  // namespace internal
 }  // namespace arrow
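
The writer is designed to run in lock-step with BitmapWordReader over ranges of equal length, which is exactly how RunVisitWordsAndWriteLoop drives the two. A minimal sketch (illustrative only, not part of the patch) that copies bits between two bitmaps with arbitrary bit offsets:

#include <cstdint>

#include "arrow/util/bitmap_reader.h"
#include "arrow/util/bitmap_writer.h"

// Copy `length` bits from src (starting at src_offset) to dest (starting at
// dest_offset), word-at-a-time, then byte-at-a-time for the trailing bits.
void CopyBitsExample(const uint8_t* src, int64_t src_offset, uint8_t* dest,
                     int64_t dest_offset, int64_t length) {
  arrow::internal::BitmapWordReader<uint64_t> reader(src, src_offset, length);
  arrow::internal::BitmapWordWriter<uint64_t> writer(dest, dest_offset, length);
  int64_t n_words = reader.words();
  while (n_words--) {
    writer.PutNextWord(reader.NextWord());
  }
  int n_bytes = reader.trailing_bytes();
  while (n_bytes--) {
    int valid_bits;
    uint8_t byte = reader.NextTrailingByte(valid_bits);
    writer.PutNextTrailingByte(byte, valid_bits);
  }
}

Because reader and writer are constructed with the same length, their words() and trailing_bytes() counts always agree, so every PutNextWord()/PutNextTrailingByte() call consumes exactly one matching read of the same width.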