diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index 026bb5c77e0..a0096f8822b 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -501,6 +501,7 @@ set(ARROW_UTIL_SRCS
     util/decimal.cc
     util/delimiting.cc
     util/dict_util.cc
+    util/fixed_width_internal.cc
     util/float16.cc
     util/formatting.cc
     util/future.cc
diff --git a/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc
index 8825d697fdf..d5e5e5ad289 100644
--- a/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc
+++ b/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc
@@ -40,6 +40,7 @@
 #include "arrow/util/bit_run_reader.h"
 #include "arrow/util/bit_util.h"
 #include "arrow/util/bitmap_ops.h"
+#include "arrow/util/fixed_width_internal.h"
 
 namespace arrow {
 
@@ -158,9 +159,11 @@ class PrimitiveFilterImpl {
   PrimitiveFilterImpl(const ArraySpan& values, const ArraySpan& filter,
                       FilterOptions::NullSelectionBehavior null_selection,
                       ArrayData* out_arr)
-      : byte_width_(values.type->byte_width()),
+      : byte_width_(util::FixedWidthInBytes(*values.type)),
         values_is_valid_(values.buffers[0].data),
-        values_data_(values.buffers[1].data),
+        // No offset applied for boolean because it's a bitmap
+        values_data_(kIsBoolean ? values.buffers[1].data
+                                : util::OffsetPointerOfFixedWidthValues(values)),
         values_null_count_(values.null_count),
         values_offset_(values.offset),
         values_length_(values.length),
@@ -169,17 +172,13 @@ class PrimitiveFilterImpl {
     if constexpr (kByteWidth >= 0 && !kIsBoolean) {
       DCHECK_EQ(kByteWidth, byte_width_);
     }
-    if constexpr (!kIsBoolean) {
-      // No offset applied for boolean because it's a bitmap
-      values_data_ += values.offset * byte_width();
-    }
 
+    DCHECK_EQ(out_arr->offset, 0);
     if (out_arr->buffers[0] != nullptr) {
       // May be unallocated if neither filter nor values contain nulls
       out_is_valid_ = out_arr->buffers[0]->mutable_data();
     }
-    out_data_ = out_arr->buffers[1]->mutable_data();
-    DCHECK_EQ(out_arr->offset, 0);
+    out_data_ = util::MutableFixedWidthValuesPointer(out_arr);
     out_length_ = out_arr->length;
     out_position_ = 0;
   }
@@ -416,7 +415,7 @@ class PrimitiveFilterImpl {
     out_position_ += length;
   }
 
-  constexpr int32_t byte_width() const {
+  constexpr int64_t byte_width() const {
     if constexpr (kByteWidth >= 0) {
       return kByteWidth;
     } else {
@@ -425,7 +424,7 @@
   }
 
  private:
-  int32_t byte_width_;
+  int64_t byte_width_;
   const uint8_t* values_is_valid_;
   const uint8_t* values_data_;
   int64_t values_null_count_;
@@ -439,6 +438,8 @@ class PrimitiveFilterImpl {
   int64_t out_position_;
 };
 
+}  // namespace
+
 Status PrimitiveFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
   const ArraySpan& values = batch[0].array;
   const ArraySpan& filter = batch[1].array;
@@ -468,9 +469,10 @@ Status PrimitiveFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult
   // validity bitmap.
   const bool allocate_validity = values.null_count != 0 || !filter_null_count_is_zero;
-  const int bit_width = values.type->bit_width();
-  RETURN_NOT_OK(PreallocatePrimitiveArrayData(ctx, output_length, bit_width,
-                                              allocate_validity, out_arr));
+  DCHECK(util::IsFixedWidthLike(values, /*force_null_count=*/false));
+  const int64_t bit_width = util::FixedWidthInBits(*values.type);
+  RETURN_NOT_OK(util::internal::PreallocateFixedWidthArrayData(
+      ctx, output_length, /*source=*/values, allocate_validity, out_arr));
 
   switch (bit_width) {
     case 1:
@@ -505,6 +507,8 @@ Status PrimitiveFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult
   return Status::OK();
 }
 
+namespace {
+
 // ----------------------------------------------------------------------
 // Optimized filter for base binary types (32-bit and 64-bit)
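Before the next file, a standalone look at the two helpers that replace the inline buffer arithmetic in PrimitiveFilterImpl above. This is a sketch under my reading of fixed_width_internal.h, not code from the patch; the wrapper function and the fixed_size_list<int32, 3> example are illustrative.

// Illustrative sketch (not in the patch): the two helpers PrimitiveFilterImpl
// now relies on, shown for a fixed_size_list<int32, 3> values array.
#include <cstdint>

#include "arrow/array/data.h"  // arrow::ArraySpan
#include "arrow/util/fixed_width_internal.h"

int64_t FixedWidthHelperSketch(const arrow::ArraySpan& values) {
  // Flattened width of one list element: 3 * sizeof(int32_t) == 12 bytes.
  const int64_t byte_width = arrow::util::FixedWidthInBytes(*values.type);
  // Offset-aware pointer to the first logical value; for non-boolean types
  // this is values.buffers[1].data + values.offset * byte_width, i.e. exactly
  // the arithmetic the old constructor performed inline.
  const uint8_t* data = arrow::util::OffsetPointerOfFixedWidthValues(values);
  (void)data;
  return byte_width;
}

Because the width is measured on the flattened representation, the same filter loop that copies 4-byte int32 values can copy 12-byte list elements without any type-specific code.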
diff --git a/cpp/src/arrow/compute/kernels/vector_selection_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_internal.cc
index a0fe2808e3e..9f50dcafdbe 100644
--- a/cpp/src/arrow/compute/kernels/vector_selection_internal.cc
+++ b/cpp/src/arrow/compute/kernels/vector_selection_internal.cc
@@ -37,6 +37,7 @@
 #include "arrow/util/bit_block_counter.h"
 #include "arrow/util/bit_run_reader.h"
 #include "arrow/util/bit_util.h"
+#include "arrow/util/fixed_width_internal.h"
 #include "arrow/util/int_util.h"
 #include "arrow/util/logging.h"
 #include "arrow/util/ree_util.h"
@@ -65,24 +66,6 @@ void RegisterSelectionFunction(const std::string& name, FunctionDoc doc,
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
 
-Status PreallocatePrimitiveArrayData(KernelContext* ctx, int64_t length, int bit_width,
-                                     bool allocate_validity, ArrayData* out) {
-  // Preallocate memory
-  out->length = length;
-  out->buffers.resize(2);
-
-  if (allocate_validity) {
-    ARROW_ASSIGN_OR_RAISE(out->buffers[0], ctx->AllocateBitmap(length));
-  }
-  if (bit_width == 1) {
-    ARROW_ASSIGN_OR_RAISE(out->buffers[1], ctx->AllocateBitmap(length));
-  } else {
-    ARROW_ASSIGN_OR_RAISE(out->buffers[1],
-                          ctx->Allocate(bit_util::BytesForBits(length * bit_width)));
-  }
-  return Status::OK();
-}
-
 namespace {
 
 /// \brief Iterate over a REE filter, emitting ranges of a plain values array that
@@ -564,39 +547,6 @@ struct VarBinarySelectionImpl : public Selection<VarBinarySelectionImpl<Type>, T
   }
 };
 
-struct FSBSelectionImpl : public Selection<FSBSelectionImpl, FixedSizeBinaryType> {
-  using Base = Selection<FSBSelectionImpl, FixedSizeBinaryType>;
-  LIFT_BASE_MEMBERS();
-
-  TypedBufferBuilder<uint8_t> data_builder;
-
-  FSBSelectionImpl(KernelContext* ctx, const ExecSpan& batch, int64_t output_length,
-                   ExecResult* out)
-      : Base(ctx, batch, output_length, out), data_builder(ctx->memory_pool()) {}
-
-  template <typename Adapter>
-  Status GenerateOutput() {
-    FixedSizeBinaryArray typed_values(this->values.ToArrayData());
-    int32_t value_size = typed_values.byte_width();
-
-    RETURN_NOT_OK(data_builder.Reserve(value_size * output_length));
-    Adapter adapter(this);
-    return adapter.Generate(
-        [&](int64_t index) {
-          auto val = typed_values.GetView(index);
-          data_builder.UnsafeAppend(reinterpret_cast<const uint8_t*>(val.data()),
-                                    value_size);
-          return Status::OK();
-        },
-        [&]() {
-          data_builder.UnsafeAppend(value_size, static_cast<uint8_t>(0x00));
-          return Status::OK();
-        });
-  }
-
-  Status Finish() override { return data_builder.Finish(&out->buffers[1]); }
-};
-
 template <typename Type>
 struct ListSelectionImpl : public Selection<ListSelectionImpl<Type>, Type> {
   using offset_type = typename Type::offset_type;
@@ -909,6 +859,24 @@ Status LargeListFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult
 }
 
 Status FSLFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
+  const ArraySpan& values = batch[0].array;
+
+  // If a FixedSizeList wraps a fixed-width type we can, in some cases, use
+  // PrimitiveFilterExec for a fixed-size list array.
+  if (util::IsFixedWidthLike(values,
+                             /*force_null_count=*/true,
+                             /*extra_predicate=*/[](auto& fixed_width_type) {
+                               // DICTIONARY is fixed-width but not supported by
+                               // PrimitiveFilterExec.
+                               return fixed_width_type.id() != Type::DICTIONARY;
+                             })) {
+    const auto byte_width = util::FixedWidthInBytes(*values.type);
+    // 0 is a valid byte width for FixedSizeList, but PrimitiveFilterExec
+    // might not handle it correctly.
+    if (byte_width > 0) {
+      return PrimitiveFilterExec(ctx, batch, out);
+    }
+  }
   return FilterExec<FSLSelectionImpl>(ctx, batch, out);
 }
 
@@ -942,23 +910,6 @@ Status LargeVarBinaryTakeExec(KernelContext* ctx, const ExecSpan& batch,
   return TakeExec<VarBinarySelectionImpl<LargeBinaryType>>(ctx, batch, out);
 }
 
-Status FSBTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
-  const ArraySpan& values = batch[0].array;
-  const auto byte_width = values.type->byte_width();
-  // Use primitive Take implementation (presumably faster) for some byte widths
-  switch (byte_width) {
-    case 1:
-    case 2:
-    case 4:
-    case 8:
-    case 16:
-    case 32:
-      return PrimitiveTakeExec(ctx, batch, out);
-    default:
-      return TakeExec<FSBSelectionImpl>(ctx, batch, out);
-  }
-}
-
 Status ListTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
   return TakeExec<ListSelectionImpl<ListType>>(ctx, batch, out);
 }
@@ -968,6 +919,19 @@ Status LargeListTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult*
 }
 
 Status FSLTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
+  const ArraySpan& values = batch[0].array;
+
+  // If a FixedSizeList wraps a fixed-width type we can, in some cases, use
+  // FixedWidthTakeExec for a fixed-size list array.
+  if (util::IsFixedWidthLike(values,
+                             /*force_null_count=*/true,
+                             /*extra_predicate=*/[](auto& fixed_width_type) {
+                               // DICTIONARY is fixed-width but not supported by
+                               // FixedWidthTakeExec.
+                               return fixed_width_type.id() != Type::DICTIONARY;
+                             })) {
+    return FixedWidthTakeExec(ctx, batch, out);
+  }
   return TakeExec<FSLSelectionImpl>(ctx, batch, out);
 }
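The IsFixedWidthLike guard above appears, with the same lambda, in both FSLFilterExec and FSLTakeExec. The following sketch isolates it; the wrapper name and the expected classifications in the comments are my assumptions based on this patch, not code from it.

// Sketch (not in the patch): the dispatch predicate shared by the two
// functions above. Expected outcomes, per fixed_width_internal.h:
//   fixed_size_list<int32, 3>  -> true  (flattened width of 12 bytes)
//   fixed_size_list<utf8, 3>   -> false (child is not fixed-width)
//   dictionary-encoded values  -> false (excluded by the lambda)
#include "arrow/array/data.h"
#include "arrow/type.h"
#include "arrow/util/fixed_width_internal.h"

bool TakesFixedWidthFastPath(const arrow::ArraySpan& values) {
  return arrow::util::IsFixedWidthLike(
      values,
      /*force_null_count=*/true,
      [](auto& fixed_width_type) {
        // Dictionaries are fixed-width but are handled by dedicated kernels,
        // not by the primitive filter/take paths.
        return fixed_width_type.id() != arrow::Type::DICTIONARY;
      });
}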
diff --git a/cpp/src/arrow/compute/kernels/vector_selection_internal.h b/cpp/src/arrow/compute/kernels/vector_selection_internal.h
index 95f3e51cd67..c5075d6dfe8 100644
--- a/cpp/src/arrow/compute/kernels/vector_selection_internal.h
+++ b/cpp/src/arrow/compute/kernels/vector_selection_internal.h
@@ -45,12 +45,6 @@ void RegisterSelectionFunction(const std::string& name, FunctionDoc doc,
                                const FunctionOptions* default_options,
                                FunctionRegistry* registry);
 
-/// \brief Allocate an ArrayData for a primitive array with a given length and bit width
-///
-/// \param[in] bit_width 1 or a multiple of 8
-Status PreallocatePrimitiveArrayData(KernelContext* ctx, int64_t length, int bit_width,
-                                     bool allocate_validity, ArrayData* out);
-
 /// \brief Callback type for VisitPlainxREEFilterOutputSegments.
 ///
 /// position is the logical position in the values array relative to its offset.
@@ -70,6 +64,7 @@ void VisitPlainxREEFilterOutputSegments(
     FilterOptions::NullSelectionBehavior null_selection,
     const EmitREEFilterSegment& emit_segment);
 
+Status PrimitiveFilterExec(KernelContext*, const ExecSpan&, ExecResult*);
 Status ListFilterExec(KernelContext*, const ExecSpan&, ExecResult*);
 Status LargeListFilterExec(KernelContext*, const ExecSpan&, ExecResult*);
 Status FSLFilterExec(KernelContext*, const ExecSpan&, ExecResult*);
@@ -78,8 +73,7 @@ Status MapFilterExec(KernelContext*, const ExecSpan&, ExecResult*);
 
 Status VarBinaryTakeExec(KernelContext*, const ExecSpan&, ExecResult*);
 Status LargeVarBinaryTakeExec(KernelContext*, const ExecSpan&, ExecResult*);
-Status PrimitiveTakeExec(KernelContext*, const ExecSpan&, ExecResult*);
-Status FSBTakeExec(KernelContext*, const ExecSpan&, ExecResult*);
+Status FixedWidthTakeExec(KernelContext*, const ExecSpan&, ExecResult*);
 Status ListTakeExec(KernelContext*, const ExecSpan&, ExecResult*);
 Status LargeListTakeExec(KernelContext*, const ExecSpan&, ExecResult*);
 Status FSLTakeExec(KernelContext*, const ExecSpan&, ExecResult*);
diff --git a/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc
index 5cd37108284..b23b2c624c6 100644
--- a/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc
+++ b/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc
@@ -19,6 +19,7 @@
 #include <algorithm>
 #include <cstdint>
 #include <memory>
+#include <utility>
 #include <vector>
 
 #include "arrow/array/builder_primitive.h"
@@ -37,6 +38,8 @@
 #include "arrow/util/bit_block_counter.h"
 #include "arrow/util/bit_run_reader.h"
 #include "arrow/util/bit_util.h"
+#include "arrow/util/fixed_width_internal.h"
+#include "arrow/util/gather_internal.h"
 #include "arrow/util/int_util.h"
 #include "arrow/util/ree_util.h"
 
@@ -323,235 +326,88 @@ namespace {
 using TakeState = OptionsWrapper<TakeOptions>;
 
 // ----------------------------------------------------------------------
-// Implement optimized take for primitive types from boolean to 1/2/4/8-byte
-// C-type based types. Use common implementation for every byte width and only
-// generate code for unsigned integer indices, since after boundschecking to
-// check for negative numbers in the indices we can safely reinterpret_cast
-// signed integers as unsigned.
-
-/// \brief The Take implementation for primitive (fixed-width) types does not
-/// use the logical Arrow type but rather the physical C type. This way we
-/// only generate one take function for each byte width.
-///
-/// This function assumes that the indices have been boundschecked.
-template <typename ValueWidthConstant, typename IndexCType>
-struct PrimitiveTakeImpl {
-  static constexpr int kValueWidth = ValueWidthConstant::value;
-
-  static void Exec(const ArraySpan& values, const ArraySpan& indices,
-                   ArrayData* out_arr) {
-    DCHECK_EQ(values.type->byte_width(), kValueWidth);
-    const auto* values_data =
-        values.GetValues<uint8_t>(1, 0) + kValueWidth * values.offset;
-    const uint8_t* values_is_valid = values.buffers[0].data;
-    auto values_offset = values.offset;
-
-    const auto* indices_data = indices.GetValues<IndexCType>(1);
-    const uint8_t* indices_is_valid = indices.buffers[0].data;
-    auto indices_offset = indices.offset;
-
-    auto out = out_arr->GetMutableValues<uint8_t>(1, 0) + kValueWidth * out_arr->offset;
-    auto out_is_valid = out_arr->buffers[0]->mutable_data();
-    auto out_offset = out_arr->offset;
-    DCHECK_EQ(out_offset, 0);
-
-    // If either the values or indices have nulls, we preemptively zero out the
-    // out validity bitmap so that we don't have to use ClearBit in each
-    // iteration for nulls.
-    if (values.null_count != 0 || indices.null_count != 0) {
-      bit_util::SetBitsTo(out_is_valid, out_offset, indices.length, false);
-    }
-
-    auto WriteValue = [&](int64_t position) {
-      memcpy(out + position * kValueWidth,
-             values_data + indices_data[position] * kValueWidth, kValueWidth);
-    };
-
-    auto WriteZero = [&](int64_t position) {
-      memset(out + position * kValueWidth, 0, kValueWidth);
-    };
-
-    auto WriteZeroSegment = [&](int64_t position, int64_t length) {
-      memset(out + position * kValueWidth, 0, kValueWidth * length);
-    };
+// Implement optimized take for primitive types from boolean to
+// 1/2/4/8/16/32-byte C-type based types and fixed-size binary (0 or more
+// bytes).
+//
+// Use one specialization for each of these primitive byte widths so the
+// compiler can specialize the memcpy to dedicated CPU instructions. For
+// fixed-width binary, use the 1-byte specialization but pass WithFactor=true,
+// which makes the kernel consider the factor parameter provided at runtime.
+//
+// Only unsigned index types need to be instantiated, since after
+// boundschecking the indices for negative numbers we can safely
+// reinterpret_cast signed integers as unsigned.
 
-    OptionalBitBlockCounter indices_bit_counter(indices_is_valid, indices_offset,
-                                                indices.length);
-    int64_t position = 0;
-    int64_t valid_count = 0;
-    while (position < indices.length) {
-      BitBlockCount block = indices_bit_counter.NextBlock();
-      if (values.null_count == 0) {
-        // Values are never null, so things are easier
-        valid_count += block.popcount;
-        if (block.popcount == block.length) {
-          // Fastest path: neither values nor index nulls
-          bit_util::SetBitsTo(out_is_valid, out_offset + position, block.length, true);
-          for (int64_t i = 0; i < block.length; ++i) {
-            WriteValue(position);
-            ++position;
-          }
-        } else if (block.popcount > 0) {
-          // Slow path: some indices but not all are null
-          for (int64_t i = 0; i < block.length; ++i) {
-            if (bit_util::GetBit(indices_is_valid, indices_offset + position)) {
-              // index is not null
-              bit_util::SetBit(out_is_valid, out_offset + position);
-              WriteValue(position);
-            } else {
-              WriteZero(position);
-            }
-            ++position;
-          }
-        } else {
-          WriteZeroSegment(position, block.length);
-          position += block.length;
-        }
-      } else {
-        // Values have nulls, so we must do random access into the values bitmap
-        if (block.popcount == block.length) {
-          // Faster path: indices are not null but values may be
-          for (int64_t i = 0; i < block.length; ++i) {
-            if (bit_util::GetBit(values_is_valid,
-                                 values_offset + indices_data[position])) {
-              // value is not null
-              WriteValue(position);
-              bit_util::SetBit(out_is_valid, out_offset + position);
-              ++valid_count;
-            } else {
-              WriteZero(position);
-            }
-            ++position;
-          }
-        } else if (block.popcount > 0) {
-          // Slow path: some but not all indices are null. Since we are doing
-          // random access in general we have to check the value nullness one by
-          // one.
-          for (int64_t i = 0; i < block.length; ++i) {
-            if (bit_util::GetBit(indices_is_valid, indices_offset + position) &&
-                bit_util::GetBit(values_is_valid,
-                                 values_offset + indices_data[position])) {
-              // index is not null && value is not null
-              WriteValue(position);
-              bit_util::SetBit(out_is_valid, out_offset + position);
-              ++valid_count;
-            } else {
-              WriteZero(position);
-            }
-            ++position;
-          }
-        } else {
-          WriteZeroSegment(position, block.length);
-          position += block.length;
-        }
-      }
-    }
+/// \brief The Take implementation for primitive types and fixed-width binary.
+///
+/// Note that this function can also handle fixed-size-list arrays, provided
+/// they fit the criteria described in fixed_width_internal.h, so use the
+/// functions defined in that file to access values and destination pointers
+/// and DO NOT ASSUME `values.type()` is a primitive type.
+///
+/// \pre the indices have been boundschecked
+template <typename ValueBitWidthConstant, typename IndexCType, typename WithFactor>
+struct FixedWidthTakeImpl {
+  static constexpr int kValueWidthInBits = ValueBitWidthConstant::value;
+
+  // The offset returned is expressed as a number of kValueWidthInBits blocks
+  static std::pair<int64_t, const uint8_t*> SourceOffsetAndValuesPointer(
+      const ArraySpan& values) {
+    if constexpr (kValueWidthInBits == 1) {
+      DCHECK_EQ(values.type->id(), Type::BOOL);
+      return {values.offset, values.GetValues<uint8_t>(1, 0)};
+    } else {
+      static_assert(kValueWidthInBits % 8 == 0,
                    "kValueWidthInBits must be 1 or a multiple of 8");
+      return {0, util::OffsetPointerOfFixedWidthValues(values)};
+    }
+  }
-    out_arr->null_count = out_arr->length - valid_count;
-  }
-};
-
-template <typename IndexCType>
-struct BooleanTakeImpl {
-  static void Exec(const ArraySpan& values, const ArraySpan& indices,
-                   ArrayData* out_arr) {
-    const uint8_t* values_data = values.buffers[1].data;
-    const uint8_t* values_is_valid = values.buffers[0].data;
-    auto values_offset = values.offset;
-
-    const auto* indices_data = indices.GetValues<IndexCType>(1);
-    const uint8_t* indices_is_valid = indices.buffers[0].data;
-    auto indices_offset = indices.offset;
-
-    auto out = out_arr->buffers[1]->mutable_data();
-    auto out_is_valid = out_arr->buffers[0]->mutable_data();
-    auto out_offset = out_arr->offset;
-
-    // If either the values or indices have nulls, we preemptively zero out the
-    // out validity bitmap so that we don't have to use ClearBit in each
-    // iteration for nulls.
-    if (values.null_count != 0 || indices.null_count != 0) {
-      bit_util::SetBitsTo(out_is_valid, out_offset, indices.length, false);
-    }
-    // Avoid uninitialized data in values array
-    bit_util::SetBitsTo(out, out_offset, indices.length, false);
-
-    auto PlaceDataBit = [&](int64_t loc, IndexCType index) {
-      bit_util::SetBitTo(out, out_offset + loc,
-                         bit_util::GetBit(values_data, values_offset + index));
-    };
-
-    OptionalBitBlockCounter indices_bit_counter(indices_is_valid, indices_offset,
-                                                indices.length);
-    int64_t position = 0;
+
+  static void Exec(KernelContext* ctx, const ArraySpan& values, const ArraySpan& indices,
+                   ArrayData* out_arr, size_t factor) {
+#ifndef NDEBUG
+    int64_t bit_width = util::FixedWidthInBits(*values.type);
+    DCHECK(WithFactor::value || (kValueWidthInBits == bit_width && factor == 1));
+    DCHECK(!WithFactor::value ||
+           (factor > 0 && kValueWidthInBits == 8 &&  // factors are used with bytes
+            static_cast<int64_t>(factor * kValueWidthInBits) == bit_width));
+#endif
+    const bool out_has_validity = values.MayHaveNulls() || indices.MayHaveNulls();
+
+    const uint8_t* src;
+    int64_t src_offset;
+    std::tie(src_offset, src) = SourceOffsetAndValuesPointer(values);
+    uint8_t* out = util::MutableFixedWidthValuesPointer(out_arr);
     int64_t valid_count = 0;
-    while (position < indices.length) {
-      BitBlockCount block = indices_bit_counter.NextBlock();
-      if (values.null_count == 0) {
-        // Values are never null, so things are easier
-        valid_count += block.popcount;
-        if (block.popcount == block.length) {
-          // Fastest path: neither values nor index nulls
-          bit_util::SetBitsTo(out_is_valid, out_offset + position, block.length, true);
-          for (int64_t i = 0; i < block.length; ++i) {
-            PlaceDataBit(position, indices_data[position]);
-            ++position;
-          }
-        } else if (block.popcount > 0) {
-          // Slow path: some but not all indices are null
-          for (int64_t i = 0; i < block.length; ++i) {
-            if (bit_util::GetBit(indices_is_valid, indices_offset + position)) {
-              // index is not null
-              bit_util::SetBit(out_is_valid, out_offset + position);
-              PlaceDataBit(position, indices_data[position]);
-            }
-            ++position;
-          }
-        } else {
-          position += block.length;
-        }
-      } else {
-        // Values have nulls, so we must do random access into the values bitmap
-        if (block.popcount == block.length) {
-          // Faster path: indices are not null but values may be
-          for (int64_t i = 0; i < block.length; ++i) {
-            if (bit_util::GetBit(values_is_valid,
-                                 values_offset + indices_data[position])) {
-              // value is not null
-              bit_util::SetBit(out_is_valid, out_offset + position);
-              PlaceDataBit(position, indices_data[position]);
-              ++valid_count;
-            }
-            ++position;
-          }
-        } else if (block.popcount > 0) {
-          // Slow path: some but not all indices are null. Since we are doing
-          // random access in general we have to check the value nullness one by
-          // one.
-          for (int64_t i = 0; i < block.length; ++i) {
-            if (bit_util::GetBit(indices_is_valid, indices_offset + position)) {
-              // index is not null
-              if (bit_util::GetBit(values_is_valid,
-                                   values_offset + indices_data[position])) {
-                // value is not null
-                PlaceDataBit(position, indices_data[position]);
-                bit_util::SetBit(out_is_valid, out_offset + position);
-                ++valid_count;
-              }
-            }
-            ++position;
-          }
-        } else {
-          position += block.length;
-        }
-      }
-    }
+    arrow::internal::Gather<kValueWidthInBits, IndexCType, WithFactor::value> gather{
+        /*src_length=*/values.length,
+        src,
+        src_offset,
+        /*idx_length=*/indices.length,
+        /*idx=*/indices.GetValues<IndexCType>(1),
+        out,
+        factor};
+    if (out_has_validity) {
+      DCHECK_EQ(out_arr->offset, 0);
+      // out_is_valid must be zero-initialized, because Gather::Execute
+      // saves time by not having to ClearBit on every null element.
+      auto out_is_valid = out_arr->GetMutableValues<uint8_t>(0);
+      memset(out_is_valid, 0, bit_util::BytesForBits(out_arr->length));
+      valid_count = gather.template Execute(
+          /*src_validity=*/values, /*idx_validity=*/indices, out_is_valid);
+    } else {
+      valid_count = gather.Execute();
+    }
     out_arr->null_count = out_arr->length - valid_count;
   }
 };
 
 template
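The hunk above collapses both hand-rolled loops (byte-wide and boolean) into a single call to the new Gather helper. As a closing illustration, a caller might drive Gather directly as below for a null-free int32 take; the constructor argument order mirrors the call in FixedWidthTakeImpl::Exec, while the template signature and the wrapper function are assumptions, not code from the patch.

// Sketch (not in the patch): a direct use of Gather mirroring
// FixedWidthTakeImpl::Exec for int32 values, uint32 indices, no nulls.
#include <cstdint>

#include "arrow/util/gather_internal.h"

int64_t TakeInt32NoNulls(const int32_t* values, int64_t num_values,
                         const uint32_t* indices, int64_t num_indices,
                         int32_t* out) {
  // 32 = value width in bits; the last template argument mirrors
  // WithFactor::value and is false for plain primitive values.
  arrow::internal::Gather<32, uint32_t, /*WithFactor=*/false> gather{
      /*src_length=*/num_values,
      /*src=*/reinterpret_cast<const uint8_t*>(values),
      /*src_offset=*/0,
      /*idx_length=*/num_indices,
      /*idx=*/indices,
      /*out=*/reinterpret_cast<uint8_t*>(out),
      /*factor=*/1};
  // With no validity bitmaps in play, Execute() copies every element and
  // returns the number of valid (here: all) elements.
  return gather.Execute();
}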