From cbd492d76b1a3872f18e3e65bf9332cf2e15fffc Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Sun, 7 Jun 2020 19:57:31 -0500
Subject: [PATCH 1/8] New Take implementation and support infrastructure

---
 cpp/src/arrow/compute/api_vector.h            |   7 +-
 cpp/src/arrow/compute/benchmark_util.h        |  15 +-
 cpp/src/arrow/compute/kernel.cc               |  82 +-
 cpp/src/arrow/compute/kernel.h                |  13 +
 .../kernels/vector_selection_benchmark.cc     |  12 +-
 cpp/src/arrow/compute/kernels/vector_take.cc  | 791 +++++++++++++++++-
 .../arrow/compute/kernels/vector_take_test.cc | 156 +++-
 cpp/src/arrow/type_traits.h                   |   1 -
 cpp/src/arrow/util/bit_block_counter.cc       |  12 +
 cpp/src/arrow/util/bit_block_counter.h        |  42 +
 cpp/src/arrow/util/bit_block_counter_test.cc  |  23 +
 cpp/src/arrow/util/bitmap_reader.h            |  14 +
 cpp/src/arrow/util/int_util.cc                |  90 ++
 cpp/src/arrow/util/int_util.h                 |  10 +
 cpp/src/arrow/util/int_util_test.cc           | 101 +++
 cpp/src/arrow/util/macros.h                   |   2 +
 python/pyarrow/_compute.pyx                   |  11 +
 python/pyarrow/compute.py                     |  11 +-
 python/pyarrow/includes/libarrow.pxd          |   7 +-
 python/pyarrow/tests/test_pandas.py           |   1 +
 20 files changed, 1314 insertions(+), 87 deletions(-)

diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h
index 84086802f18..8aefbcf1ccf 100644
--- a/cpp/src/arrow/compute/api_vector.h
+++ b/cpp/src/arrow/compute/api_vector.h
@@ -65,7 +65,12 @@ Result<Datum> Filter(const Datum& values, const Datum& filter,
                      ExecContext* ctx = NULLPTR);
 
 struct ARROW_EXPORT TakeOptions : public FunctionOptions {
-  static TakeOptions Defaults() { return TakeOptions{}; }
+  explicit TakeOptions(bool boundscheck = true) : boundscheck(boundscheck) {}
+
+  bool boundscheck = true;
+  static TakeOptions Boundscheck() { return TakeOptions(true); }
+  static TakeOptions NoBoundscheck() { return TakeOptions(false); }
+  static TakeOptions Defaults() { return Boundscheck(); }
 };
 
 /// \brief Take from an array of values at indices in another array
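The new boundscheck toggle lets callers that have already validated their indices skip the bounds check. A minimal usage sketch against the Take API declared in this header (hypothetical caller, error handling elided):

```cpp
#include "arrow/compute/api_vector.h"

// Sketch: a caller that has already guaranteed all indices fall in
// [0, values.length()) can disable bounds checking for a faster gather.
arrow::Result<arrow::Datum> UncheckedTake(const arrow::Datum& values,
                                          const arrow::Datum& indices) {
  return arrow::compute::Take(values, indices,
                              arrow::compute::TakeOptions::NoBoundscheck());
}
```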
diff --git a/cpp/src/arrow/compute/benchmark_util.h b/cpp/src/arrow/compute/benchmark_util.h
index 77149f91aab..702f1c7056a 100644
--- a/cpp/src/arrow/compute/benchmark_util.h
+++ b/cpp/src/arrow/compute/benchmark_util.h
@@ -56,9 +56,10 @@ void BenchmarkSetArgsWithSizes(benchmark::internal::Benchmark* bench,
                                const std::vector<int64_t>& sizes = kMemorySizes) {
   bench->Unit(benchmark::kMicrosecond);
 
+  // 0 is treated as "no nulls"
   for (const auto size : sizes) {
     for (const auto inverse_null_proportion :
-         std::vector<int64_t>({10000, 100, 10, 2, 1})) {
+         std::vector<int64_t>({10000, 100, 50, 10, 2, 1, 0})) {
       bench->Args({static_cast<int64_t>(size), inverse_null_proportion});
     }
   }
@@ -80,12 +81,16 @@ struct RegressionArgs {
   const int64_t size;
 
   // proportion of nulls in generated arrays
-  const double null_proportion;
+  double null_proportion;
 
   explicit RegressionArgs(benchmark::State& state)
-      : size(state.range(0)),
-        null_proportion(std::min(1., 1. / static_cast<double>(state.range(1)))),
-        state_(state) {}
+      : size(state.range(0)), null_proportion(), state_(state) {
+    if (state.range(1) == 0) {
+      this->null_proportion = 0.0;
+    } else {
+      this->null_proportion = std::min(1., 1. / static_cast<double>(state.range(1)));
+    }
+  }
 
   ~RegressionArgs() {
     state_.counters["size"] = static_cast<double>(size);

diff --git a/cpp/src/arrow/compute/kernel.cc b/cpp/src/arrow/compute/kernel.cc
index 1a6f1bd1031..4747e63c159 100644
--- a/cpp/src/arrow/compute/kernel.cc
+++ b/cpp/src/arrow/compute/kernel.cc
@@ -26,6 +26,7 @@
 #include "arrow/compute/exec.h"
 #include "arrow/compute/util_internal.h"
 #include "arrow/result.h"
+#include "arrow/type_traits.h"
 #include "arrow/util/bit_util.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/hash_util.h"
@@ -96,7 +97,6 @@ class SameTypeIdMatcher : public TypeMatcher {
     if (this == &other) {
       return true;
     }
-
     auto casted = dynamic_cast<const SameTypeIdMatcher*>(&other);
     if (casted == nullptr) {
       return false;
@@ -150,6 +150,86 @@ std::shared_ptr<TypeMatcher> TimestampUnit(TimeUnit::type unit) {
   return std::make_shared<TimestampUnitMatcher>(unit);
 }
 
+class IntegerMatcher : public TypeMatcher {
+ public:
+  IntegerMatcher() {}
+
+  bool Matches(const DataType& type) const override { return is_integer(type.id()); }
+
+  bool Equals(const TypeMatcher& other) const override {
+    if (this == &other) {
+      return true;
+    }
+    auto casted = dynamic_cast<const IntegerMatcher*>(&other);
+    return casted != nullptr;
+  }
+
+  std::string ToString() const override { return "integer"; }
+};
+
+std::shared_ptr<TypeMatcher> Integer() { return std::make_shared<IntegerMatcher>(); }
+
+class PrimitiveMatcher : public TypeMatcher {
+ public:
+  PrimitiveMatcher() {}
+
+  bool Matches(const DataType& type) const override { return is_primitive(type.id()); }
+
+  bool Equals(const TypeMatcher& other) const override {
+    if (this == &other) {
+      return true;
+    }
+    auto casted = dynamic_cast<const PrimitiveMatcher*>(&other);
+    return casted != nullptr;
+  }
+
+  std::string ToString() const override { return "primitive"; }
+};
+
+std::shared_ptr<TypeMatcher> Primitive() { return std::make_shared<PrimitiveMatcher>(); }
+
+class BinaryLikeMatcher : public TypeMatcher {
+ public:
+  BinaryLikeMatcher() {}
+
+  bool Matches(const DataType& type) const override {
+    return is_binary_like(type.id());
+  }
+
+  bool Equals(const TypeMatcher& other) const override {
+    if (this == &other) {
+      return true;
+    }
+    auto casted = dynamic_cast<const BinaryLikeMatcher*>(&other);
+    return casted != nullptr;
+  }
+
+  std::string ToString() const override { return "binary-like"; }
+};
+
+std::shared_ptr<TypeMatcher> BinaryLike() {
+  return std::make_shared<BinaryLikeMatcher>();
+}
+
+class LargeBinaryLikeMatcher : public TypeMatcher {
+ public:
+  LargeBinaryLikeMatcher() {}
+
+  bool Matches(const DataType& type) const override {
+    return is_large_binary_like(type.id());
+  }
+
+  bool Equals(const TypeMatcher& other) const override {
+    if (this == &other) {
+      return true;
+    }
+    auto casted = dynamic_cast<const LargeBinaryLikeMatcher*>(&other);
+    return casted != nullptr;
+  }
+
+  std::string ToString() const override { return "large-binary-like"; }
+};
+
+std::shared_ptr<TypeMatcher> LargeBinaryLike() {
+  return std::make_shared<LargeBinaryLikeMatcher>();
+}
+
 }  // namespace match
 
 // ----------------------------------------------------------------------
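The four matchers added above let a single kernel signature accept a whole class of types rather than one concrete type. A minimal sketch of the observable behavior, using only the factory functions defined in this file:

```cpp
#include "arrow/compute/kernel.h"
#include "arrow/type.h"

// Sketch: Integer() matches any signed or unsigned integer type and
// rejects everything else.
void MatcherExample() {
  auto matcher = arrow::compute::match::Integer();
  bool matches_int32 = matcher->Matches(*arrow::int32());      // true
  bool matches_float64 = matcher->Matches(*arrow::float64());  // false
  (void)matches_int32;
  (void)matches_float64;
}
```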
diff --git a/cpp/src/arrow/compute/kernel.h b/cpp/src/arrow/compute/kernel.h
index bdd2e9d1079..059a7c2001a 100644
--- a/cpp/src/arrow/compute/kernel.h
+++ b/cpp/src/arrow/compute/kernel.h
@@ -150,6 +150,13 @@ ARROW_EXPORT std::shared_ptr<TypeMatcher> SameTypeId(Type::type type_id);
 /// zones can be different.
 ARROW_EXPORT std::shared_ptr<TypeMatcher> TimestampUnit(TimeUnit::type unit);
 
+/// \brief Match any integer type
+ARROW_EXPORT std::shared_ptr<TypeMatcher> Integer();
+
+/// \brief Match types using the 32-bit offset varbinary representation
+ARROW_EXPORT std::shared_ptr<TypeMatcher> BinaryLike();
+
+/// \brief Match types using the 64-bit offset varbinary representation
+ARROW_EXPORT std::shared_ptr<TypeMatcher> LargeBinaryLike();
+
+/// \brief Match any primitive type (boolean or any type representable as a C
+/// type)
+ARROW_EXPORT std::shared_ptr<TypeMatcher> Primitive();
+
 }  // namespace match
 
 /// \brief An object used for type- and shape-checking arguments to be passed

diff --git a/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc b/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc
index 6031a907dee..822363ac554 100644
--- a/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc
+++ b/cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc
@@ -90,7 +90,7 @@ struct TakeBenchmark {
         args(state),
         rand(kSeed),
         indices_have_nulls(indices_have_nulls),
-        monotonic_indices(false) {}
+        monotonic_indices(monotonic_indices) {}
 
   void Int64() {
     const int64_t array_size = args.size / sizeof(int64_t);
@@ -123,10 +123,10 @@ struct TakeBenchmark {
   }
 
   void Bench(const std::shared_ptr<Array>& values) {
-    bool indices_null_proportion = indices_have_nulls ? args.null_proportion : 0;
+    double indices_null_proportion = indices_have_nulls ? args.null_proportion : 0;
     auto indices =
-        rand.Int32(static_cast<int32_t>(values->length()), 0,
-                   static_cast<int32_t>(values->length() - 1), indices_null_proportion);
+        rand.Int32(values->length(), 0, static_cast<int32_t>(values->length() - 1),
+                   indices_null_proportion);
 
     if (monotonic_indices) {
       auto arg_sorter = *SortToIndices(*indices);
@@ -277,12 +277,12 @@ void TakeSetArgs(benchmark::internal::Benchmark* bench) {
 
 BENCHMARK(TakeInt64RandomIndicesNoNulls)->Apply(TakeSetArgs);
 BENCHMARK(TakeInt64RandomIndicesWithNulls)->Apply(TakeSetArgs);
+BENCHMARK(TakeInt64MonotonicIndices)->Apply(TakeSetArgs);
 BENCHMARK(TakeFSLInt64RandomIndicesNoNulls)->Apply(TakeSetArgs);
 BENCHMARK(TakeFSLInt64RandomIndicesWithNulls)->Apply(TakeSetArgs);
+BENCHMARK(TakeFSLInt64MonotonicIndices)->Apply(TakeSetArgs);
 BENCHMARK(TakeStringRandomIndicesNoNulls)->Apply(TakeSetArgs);
 BENCHMARK(TakeStringRandomIndicesWithNulls)->Apply(TakeSetArgs);
-BENCHMARK(TakeInt64MonotonicIndices)->Apply(TakeSetArgs);
-BENCHMARK(TakeFSLInt64MonotonicIndices)->Apply(TakeSetArgs);
 BENCHMARK(TakeStringMonotonicIndices)->Apply(TakeSetArgs);
 
 }  // namespace compute
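With the constructor fix above (monotonic_indices was previously hardwired to false), the *MonotonicIndices benchmarks actually exercise sorted indices. A sketch of how monotonic take indices can be derived, assuming the SortToIndices and Take compute functions used by this benchmark file (hypothetical helper, error handling via ARROW_ASSIGN_OR_RAISE):

```cpp
#include "arrow/array.h"
#include "arrow/compute/api_vector.h"
#include "arrow/result.h"

// Sketch: produce monotonically non-decreasing indices by gathering the
// random indices through the permutation that sorts them, mirroring
// TakeBenchmark::Bench above.
arrow::Result<arrow::Datum> MakeMonotonic(
    const std::shared_ptr<arrow::Array>& indices) {
  ARROW_ASSIGN_OR_RAISE(auto sort_perm, arrow::compute::SortToIndices(*indices));
  return arrow::compute::Take(arrow::Datum(indices), arrow::Datum(sort_perm));
}
```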
diff --git a/cpp/src/arrow/compute/kernels/vector_take.cc b/cpp/src/arrow/compute/kernels/vector_take.cc
index 1658f965212..8aae6811a2a 100644
--- a/cpp/src/arrow/compute/kernels/vector_take.cc
+++ b/cpp/src/arrow/compute/kernels/vector_take.cc
@@ -15,15 +15,32 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+
 #include "arrow/array/array_base.h"
+#include "arrow/array/builder_primitive.h"
 #include "arrow/array/concatenate.h"
+#include "arrow/array/data.h"
+#include "arrow/buffer_builder.h"
 #include "arrow/compute/api_vector.h"
 #include "arrow/compute/kernels/common.h"
-#include "arrow/compute/kernels/vector_selection_internal.h"
 #include "arrow/record_batch.h"
 #include "arrow/result.h"
+#include "arrow/util/bit_block_counter.h"
+#include "arrow/util/bitmap_reader.h"
+#include "arrow/util/int_util.h"
 
 namespace arrow {
+
+using internal::BitBlockCount;
+using internal::BitmapReader;
+using internal::GetArrayView;
+using internal::IndexBoundscheck;
+using internal::OptionalBitBlockCounter;
+using internal::OptionalBitIndexer;
+
 namespace compute {
 namespace internal {
 
@@ -38,44 +55,715 @@ std::unique_ptr<KernelState> InitTake(KernelContext*, const KernelInitArgs& args
   return std::unique_ptr<KernelState>(new TakeState{*take_options});
 }
 
-template <typename ValueType, typename IndexType>
-struct TakeFunctor {
-  using ValueArrayType = typename TypeTraits<ValueType>::ArrayType;
-  using IndexArrayType = typename TypeTraits<IndexType>::ArrayType;
-  using IS = ArrayIndexSequence<IndexType>;
+namespace {}  // namespace
+
+// ----------------------------------------------------------------------
+// Implement optimized take for primitive types from boolean to 1/2/4/8-byte
+// C-type based types. We use a common implementation for every byte width
+// and only generate code for unsigned integer indices, since after
+// boundschecking the indices (which rules out negative values) we can
+// safely reinterpret_cast signed integers as unsigned.
+
+struct PrimitiveTakeArgs {
+  const uint8_t* values;
+  const uint8_t* values_bitmap = nullptr;
+  int values_bit_width;
+  int64_t values_length;
+  int64_t values_offset;
+  int64_t values_null_count;
+  const uint8_t* indices;
+  const uint8_t* indices_bitmap = nullptr;
+  int indices_bit_width;
+  int64_t indices_length;
+  int64_t indices_offset;
+  int64_t indices_null_count;
+};
+
+// Reduce code size by dealing with the unboxing of the kernel inputs once
+// rather than duplicating the compiled unboxing code in each kernel.
+PrimitiveTakeArgs GetPrimitiveTakeArgs(const ExecBatch& batch) {
+  PrimitiveTakeArgs args;
+
+  const ArrayData& arg0 = *batch[0].array();
+  const ArrayData& arg1 = *batch[1].array();
+
+  // Values
+  args.values_bit_width = static_cast<const FixedWidthType&>(*arg0.type).bit_width();
+  args.values = arg0.buffers[1]->data();
+  if (args.values_bit_width > 1) {
+    args.values += arg0.offset * args.values_bit_width / 8;
+  }
+  args.values_length = arg0.length;
+  args.values_offset = arg0.offset;
+  args.values_null_count = arg0.GetNullCount();
+  if (arg0.buffers[0]) {
+    args.values_bitmap = arg0.buffers[0]->data();
+  }
+
+  // Indices
+  args.indices_bit_width = static_cast<const FixedWidthType&>(*arg1.type).bit_width();
+  args.indices = arg1.buffers[1]->data() + arg1.offset * args.indices_bit_width / 8;
+  args.indices_length = arg1.length;
+  args.indices_offset = arg1.offset;
+  args.indices_null_count = arg1.GetNullCount();
+  if (arg1.buffers[0]) {
+    args.indices_bitmap = arg1.buffers[0]->data();
+  }
+
+  return args;
+}
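Note the asymmetry in the pointer adjustment above: for byte-addressable widths the array offset is folded directly into the data pointer, while for booleans (bit_width == 1) the offset stays bit-level and is carried separately in values_offset. A standalone sketch of that arithmetic (not part of the patch):

```cpp
#include <cstdint>

// Sketch: advance a primitive array's data pointer to its first logical
// element. An int32 array with offset 5 advances 5 * 32 / 8 = 20 bytes;
// a boolean array cannot be byte-adjusted, so the caller keeps using the
// bit-level offset instead.
const uint8_t* AdjustForOffset(const uint8_t* data, int bit_width, int64_t offset) {
  if (bit_width > 1) {
    return data + offset * bit_width / 8;
  }
  return data;
}
```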
+/// \brief The Take implementation for primitive (fixed-width) types does
+/// not use the logical Arrow type but rather the physical C type. This way
+/// we only generate one take function for each byte width.
+///
+/// This function assumes that the indices have been boundschecked.
+template <typename ValueCType, typename IndexCType>
+struct PrimitiveTakeImpl {
+  static void Exec(const PrimitiveTakeArgs& args, Datum* out_datum) {
+    auto values = reinterpret_cast<const ValueCType*>(args.values);
+    auto values_bitmap = args.values_bitmap;
+    auto values_offset = args.values_offset;
+
+    auto indices = reinterpret_cast<const IndexCType*>(args.indices);
+    auto indices_bitmap = args.indices_bitmap;
+    auto indices_offset = args.indices_offset;
+
+    ArrayData* out_arr = out_datum->mutable_array();
+    auto out = out_arr->GetMutableValues<ValueCType>(1);
+    auto out_bitmap = out_arr->buffers[0]->mutable_data();
+    auto out_offset = out_arr->offset;
+
+    // If either the values or indices have nulls, we preemptively zero out
+    // the output validity bitmap so that we don't have to use ClearBit in
+    // each iteration for nulls.
+    if (args.values_null_count > 0 || args.indices_null_count > 0) {
+      BitUtil::SetBitsTo(out_bitmap, out_offset, args.indices_length, false);
+    }
+
+    OptionalBitBlockCounter indices_bit_counter(indices_bitmap, indices_offset,
+                                                args.indices_length);
+    int64_t position = 0;
+    int64_t valid_count = 0;
+    while (true) {
+      BitBlockCount block = indices_bit_counter.NextBlock();
+      if (block.length == 0) {
+        // All indices processed.
+        break;
+      }
+      if (args.values_null_count == 0) {
+        // Values are never null, so things are easier
+        valid_count += block.popcount;
+        if (block.popcount == block.length) {
+          // Fastest path: neither values nor indices have nulls
+          BitUtil::SetBitsTo(out_bitmap, out_offset + position, block.length, true);
+          for (int64_t i = 0; i < block.length; ++i) {
+            out[position] = values[indices[position]];
+            ++position;
+          }
+        } else if (block.popcount > 0) {
+          // Slow path: some but not all indices are null
+          for (int64_t i = 0; i < block.length; ++i) {
+            if (BitUtil::GetBit(indices_bitmap, indices_offset + position)) {
+              // index is not null
+              BitUtil::SetBit(out_bitmap, out_offset + position);
+              out[position] = values[indices[position]];
+            }
+            ++position;
+          }
+        } else {
+          // All indices in this block are null
+          position += block.length;
+        }
+      } else {
+        // Values have nulls, so we must do random access into the values bitmap
+        if (block.popcount == block.length) {
+          // Faster path: indices are not null but values may be
+          for (int64_t i = 0; i < block.length; ++i) {
+            if (BitUtil::GetBit(values_bitmap, values_offset + indices[position])) {
+              // value is not null
+              out[position] = values[indices[position]];
+              BitUtil::SetBit(out_bitmap, out_offset + position);
+              ++valid_count;
+            }
+            ++position;
+          }
+        } else if (block.popcount > 0) {
+          // Slow path: some but not all indices are null. Since we are doing
+          // random access in general we have to check the value nullness one
+          // by one.
+          for (int64_t i = 0; i < block.length; ++i) {
+            if (BitUtil::GetBit(indices_bitmap, indices_offset + position)) {
+              // index is not null
+              if (BitUtil::GetBit(values_bitmap, values_offset + indices[position])) {
+                // value is not null
+                out[position] = values[indices[position]];
+                BitUtil::SetBit(out_bitmap, out_offset + position);
+                ++valid_count;
+              }
+            }
+            ++position;
+          }
+        } else {
+          // All indices in this block are null
+          position += block.length;
+        }
+      }
+    }
+    out_arr->null_count = out_arr->length - valid_count;
+  }
+};
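Because PrimitiveTakeImpl is parameterized only on physical C types, a small dispatcher has to route each (value width, index width) pair to the right instantiation. That dispatcher lives further down in vector_take.cc and is not part of this excerpt; a plausible sketch of the pattern it follows (names and structure assumed, not the patch's actual code):

```cpp
// Hypothetical sketch of a width dispatcher. After boundschecking rules out
// negative indices, signed index types can be reinterpreted as unsigned, so
// only the byte width of the indices matters.
template <typename ValueCType>
void DispatchOnIndexWidth(const PrimitiveTakeArgs& args, Datum* out) {
  switch (args.indices_bit_width) {
    case 8:
      return PrimitiveTakeImpl<ValueCType, uint8_t>::Exec(args, out);
    case 16:
      return PrimitiveTakeImpl<ValueCType, uint16_t>::Exec(args, out);
    case 32:
      return PrimitiveTakeImpl<ValueCType, uint32_t>::Exec(args, out);
    default:
      return PrimitiveTakeImpl<ValueCType, uint64_t>::Exec(args, out);
  }
}
```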
+template <typename IndexCType>
+struct BooleanTakeImpl {
+  static void Exec(const PrimitiveTakeArgs& args, Datum* out_datum) {
+    auto values = args.values;
+    auto values_bitmap = args.values_bitmap;
+    auto values_offset = args.values_offset;
+
+    auto indices = reinterpret_cast<const IndexCType*>(args.indices);
+    auto indices_bitmap = args.indices_bitmap;
+    auto indices_offset = args.indices_offset;
+
+    ArrayData* out_arr = out_datum->mutable_array();
+    auto out = out_arr->buffers[1]->mutable_data();
+    auto out_bitmap = out_arr->buffers[0]->mutable_data();
+    auto out_offset = out_arr->offset;
 
-  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
-    ValueArrayType values(batch[0].array());
-    IndexArrayType indices(batch[1].array());
-    std::shared_ptr<Array> result;
-    KERNEL_RETURN_IF_ERROR(ctx, Select(ctx, values, IS(indices), &result));
-    out->value = result->data();
+    // If either the values or indices have nulls, we preemptively zero out
+    // the output validity bitmap so that we don't have to use ClearBit in
+    // each iteration for nulls.
+    if (args.values_null_count > 0 || args.indices_null_count > 0) {
+      BitUtil::SetBitsTo(out_bitmap, out_offset, args.indices_length, false);
+    }
+
+    auto PlaceDataBit = [&](int64_t loc, IndexCType index) {
+      BitUtil::SetBitTo(out, out_offset + loc,
+                        BitUtil::GetBit(values, values_offset + index));
+    };
+
+    OptionalBitBlockCounter indices_bit_counter(indices_bitmap, indices_offset,
+                                                args.indices_length);
+    int64_t position = 0;
+    int64_t valid_count = 0;
+    while (true) {
+      BitBlockCount block = indices_bit_counter.NextBlock();
+      if (block.length == 0) {
+        // All indices processed.
+        break;
+      }
+      if (args.values_null_count == 0) {
+        // Values are never null, so things are easier
+        valid_count += block.popcount;
+        if (block.popcount == block.length) {
+          // Fastest path: neither values nor indices have nulls
+          BitUtil::SetBitsTo(out_bitmap, out_offset + position, block.length, true);
+          for (int64_t i = 0; i < block.length; ++i) {
+            PlaceDataBit(position, indices[position]);
+            ++position;
+          }
+        } else if (block.popcount > 0) {
+          // Slow path: some but not all indices are null
+          for (int64_t i = 0; i < block.length; ++i) {
+            if (BitUtil::GetBit(indices_bitmap, indices_offset + position)) {
+              // index is not null
+              BitUtil::SetBit(out_bitmap, out_offset + position);
+              PlaceDataBit(position, indices[position]);
+            }
+            ++position;
+          }
+        } else {
+          // All indices in this block are null
+          position += block.length;
+        }
+      } else {
+        // Values have nulls, so we must do random access into the values bitmap
+        if (block.popcount == block.length) {
+          // Faster path: indices are not null but values may be
+          for (int64_t i = 0; i < block.length; ++i) {
+            if (BitUtil::GetBit(values_bitmap, values_offset + indices[position])) {
+              // value is not null
+              BitUtil::SetBit(out_bitmap, out_offset + position);
+              PlaceDataBit(position, indices[position]);
+              ++valid_count;
+            }
+            ++position;
+          }
+        } else if (block.popcount > 0) {
+          // Slow path: some but not all indices are null. Since we are doing
+          // random access in general we have to check the value nullness one
+          // by one.
+          for (int64_t i = 0; i < block.length; ++i) {
+            if (BitUtil::GetBit(indices_bitmap, indices_offset + position)) {
+              // index is not null
+              if (BitUtil::GetBit(values_bitmap, values_offset + indices[position])) {
+                // value is not null
+                PlaceDataBit(position, indices[position]);
+                BitUtil::SetBit(out_bitmap, out_offset + position);
+                ++valid_count;
+              }
+            }
+            ++position;
+          }
+        } else {
+          // All indices in this block are null
+          position += block.length;
+        }
+      }
+    }
+    out_arr->null_count = out_arr->length - valid_count;
+  }
+};
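Unlike the generic primitive path, boolean values are bit-packed, so BooleanTakeImpl cannot copy elements with a plain array store; each output bit is moved individually through PlaceDataBit. A standalone illustration of that single-bit move, using the same BitUtil helpers the patch calls (not part of the patch):

```cpp
#include <cstdint>
#include "arrow/util/bit_util.h"

// Sketch: copy the bit at position `from` in `src` to position `to` in
// `dest`; this is the read-modify-write PlaceDataBit performs per element.
void CopyBit(const uint8_t* src, int64_t from, uint8_t* dest, int64_t to) {
  arrow::BitUtil::SetBitTo(dest, to, arrow::BitUtil::GetBit(src, from));
}
```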
 
-struct TakeKernelVisitor {
-  TakeKernelVisitor(const DataType& value_type, const DataType& index_type)
-      : value_type(value_type), index_type(index_type) {}
 
+template