From dcfb181f68e2518ee7d6a65ee29a17733ca4fda4 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 6 Jun 2022 21:51:52 -0500 Subject: [PATCH 01/15] Introduce ArraySpan, ExecSpan data structures and new lighter-weight method of scalar kernel evaluation Checkpoint, getting closer to compilation libarrow builds again Get everything compiling again, compute internals tests passing Get Bitmap test cases passing again Don't try filling validity bitmap if none was allocated Fix bit block visitors and a few arithmetic kernels Refactor int_util.h to use ArraySpan Some more tests passing Fix some more unit tests, compilation Another fix Fix some more tests Fix some more bugs Fixed some more tests All scalar_if_else.cc tests passing again Work partway through scalar_nested.cc Down to only a few failing scalar tests scalar kernels tests passing again fix scalar executor, tests all passing now --- cpp/src/arrow/array/array_dict.cc | 6 +- cpp/src/arrow/array/array_nested.cc | 7 +- cpp/src/arrow/array/array_nested.h | 2 +- cpp/src/arrow/array/array_test.cc | 29 +- cpp/src/arrow/array/builder_base.h | 2 +- cpp/src/arrow/array/builder_binary.h | 4 +- cpp/src/arrow/array/builder_dict.h | 9 +- cpp/src/arrow/array/builder_nested.h | 26 +- cpp/src/arrow/array/builder_primitive.h | 6 +- cpp/src/arrow/array/builder_union.cc | 8 +- cpp/src/arrow/array/builder_union.h | 4 +- cpp/src/arrow/array/data.cc | 140 ++ cpp/src/arrow/array/data.h | 124 ++ cpp/src/arrow/array/util.cc | 4 +- cpp/src/arrow/builder.cc | 2 +- cpp/src/arrow/chunked_array.h | 2 +- cpp/src/arrow/compute/cast.cc | 2 +- cpp/src/arrow/compute/cast.h | 2 +- cpp/src/arrow/compute/exec.cc | 591 +++++-- cpp/src/arrow/compute/exec.h | 188 ++ cpp/src/arrow/compute/exec_internal.h | 65 +- cpp/src/arrow/compute/exec_test.cc | 501 +++++- cpp/src/arrow/compute/function.cc | 4 +- cpp/src/arrow/compute/function.h | 4 +- cpp/src/arrow/compute/function_benchmark.cc | 22 +- cpp/src/arrow/compute/function_test.cc | 34 +- cpp/src/arrow/compute/kernel.h | 97 +- .../arrow/compute/kernels/aggregate_basic.cc | 4 +- .../arrow/compute/kernels/aggregate_mode.cc | 5 +- .../compute/kernels/aggregate_quantile.cc | 2 +- .../arrow/compute/kernels/codegen_internal.cc | 16 +- .../arrow/compute/kernels/codegen_internal.h | 660 +++++-- .../arrow/compute/kernels/hash_aggregate.cc | 4 +- .../compute/kernels/scalar_arithmetic.cc | 83 +- .../arrow/compute/kernels/scalar_boolean.cc | 304 ++-- .../compute/kernels/scalar_cast_boolean.cc | 7 +- .../compute/kernels/scalar_cast_dictionary.cc | 45 +- .../compute/kernels/scalar_cast_internal.cc | 118 +- .../compute/kernels/scalar_cast_internal.h | 18 +- .../compute/kernels/scalar_cast_nested.cc | 51 +- .../compute/kernels/scalar_cast_numeric.cc | 65 +- .../compute/kernels/scalar_cast_string.cc | 131 +- .../compute/kernels/scalar_cast_temporal.cc | 75 +- .../arrow/compute/kernels/scalar_compare.cc | 138 +- .../arrow/compute/kernels/scalar_if_else.cc | 1512 +++++++++-------- .../arrow/compute/kernels/scalar_nested.cc | 364 ++-- .../compute/kernels/scalar_nested_test.cc | 13 +- .../arrow/compute/kernels/scalar_random.cc | 20 +- .../compute/kernels/scalar_set_lookup.cc | 98 +- .../compute/kernels/scalar_set_lookup_test.cc | 12 +- .../compute/kernels/scalar_string_ascii.cc | 440 ++--- .../compute/kernels/scalar_string_internal.h | 66 +- .../compute/kernels/scalar_string_utf8.cc | 32 +- .../compute/kernels/scalar_temporal_binary.cc | 20 +- .../compute/kernels/scalar_temporal_unary.cc | 129 +- .../arrow/compute/kernels/scalar_validity.cc | 127 +- 
.../compute/kernels/scalar_validity_test.cc | 6 - .../arrow/compute/kernels/temporal_internal.h | 30 +- .../arrow/compute/kernels/util_internal.cc | 65 +- cpp/src/arrow/compute/kernels/util_internal.h | 18 +- .../compute/kernels/vector_array_sort.cc | 10 +- .../compute/kernels/vector_cumulative_ops.cc | 6 +- .../arrow/compute/kernels/vector_replace.cc | 6 +- .../arrow/compute/kernels/vector_selection.cc | 13 +- cpp/src/arrow/datum.h | 2 + cpp/src/arrow/pretty_print.cc | 24 +- cpp/src/arrow/python/arrow_to_pandas.cc | 9 +- cpp/src/arrow/python/udf.cc | 50 +- cpp/src/arrow/scalar.cc | 2 +- cpp/src/arrow/scalar.h | 24 +- cpp/src/arrow/type.h | 26 +- cpp/src/arrow/util/bit_block_counter.h | 105 +- cpp/src/arrow/util/bit_util_benchmark.cc | 15 +- cpp/src/arrow/util/bit_util_test.cc | 35 +- cpp/src/arrow/util/bitmap.cc | 24 +- cpp/src/arrow/util/bitmap.h | 61 +- cpp/src/arrow/util/formatting.h | 18 +- cpp/src/arrow/util/formatting_util_test.cc | 33 +- cpp/src/arrow/util/int_util.cc | 149 +- cpp/src/arrow/util/int_util.h | 13 +- cpp/src/arrow/util/int_util_benchmark.cc | 4 +- cpp/src/arrow/util/int_util_test.cc | 32 +- 82 files changed, 4517 insertions(+), 2707 deletions(-) diff --git a/cpp/src/arrow/array/array_dict.cc b/cpp/src/arrow/array/array_dict.cc index dbfc7bd7586..0a4d33e03da 100644 --- a/cpp/src/arrow/array/array_dict.cc +++ b/cpp/src/arrow/array/array_dict.cc @@ -125,7 +125,7 @@ Result> DictionaryArray::FromArrays( "Dictionary type's index type does not match " "indices array's type"); } - RETURN_NOT_OK(internal::CheckIndexBounds(*indices->data(), + RETURN_NOT_OK(internal::CheckIndexBounds(ArraySpan(*indices->data()), static_cast(dictionary->length()))); return std::make_shared(type, indices, dictionary); } @@ -290,8 +290,8 @@ class DictionaryUnifierImpl : public DictionaryUnifier { Status GetResultWithIndexType(const std::shared_ptr& index_type, std::shared_ptr* out_dict) override { - int64_t dict_length = memo_table_.size(); - if (!internal::IntegersCanFit(Datum(dict_length), *index_type).ok()) { + Int64Scalar dict_length(memo_table_.size()); + if (!internal::IntegersCanFit(dict_length, *index_type).ok()) { return Status::Invalid( "These dictionaries cannot be combined. 
The unified dictionary requires a " "larger index type."); diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index 757ab8e9c3f..64edec0c7aa 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -569,7 +569,7 @@ const ArrayVector& StructArray::fields() const { return boxed_fields_; } -std::shared_ptr StructArray::field(int i) const { +const std::shared_ptr& StructArray::field(int i) const { std::shared_ptr result = internal::atomic_load(&boxed_fields_[i]); if (!result) { std::shared_ptr field_data; @@ -578,10 +578,11 @@ std::shared_ptr StructArray::field(int i) const { } else { field_data = data_->child_data[i]; } - result = MakeArray(field_data); + std::shared_ptr result = MakeArray(field_data); internal::atomic_store(&boxed_fields_[i], result); + return boxed_fields_[i]; } - return result; + return boxed_fields_[i]; } std::shared_ptr StructArray::GetFieldByName(const std::string& name) const { diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h index be6371941df..5d04bef4f9e 100644 --- a/cpp/src/arrow/array/array_nested.h +++ b/cpp/src/arrow/array/array_nested.h @@ -378,7 +378,7 @@ class ARROW_EXPORT StructArray : public Array { // Return a shared pointer in case the requestor desires to share ownership // with this array. The returned array has its offset, length and null // count adjusted. - std::shared_ptr field(int pos) const; + const std::shared_ptr& field(int pos) const; const ArrayVector& fields() const; diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 929c4d3bf4d..6057796875c 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -696,6 +696,7 @@ TEST_F(TestArray, TestMakeEmptyArray) { TEST_F(TestArray, TestAppendArraySlice) { auto scalars = GetScalars(); + ArraySpan span; for (const auto& scalar : scalars) { ARROW_SCOPED_TRACE(*scalar->type); ASSERT_OK_AND_ASSIGN(auto array, MakeArrayFromScalar(*scalar, 16)); @@ -704,31 +705,33 @@ TEST_F(TestArray, TestAppendArraySlice) { std::unique_ptr builder; ASSERT_OK(MakeBuilder(pool_, scalar->type, &builder)); - ASSERT_OK(builder->AppendArraySlice(*array->data(), 0, 4)); + span.SetMembers(*array->data()); + ASSERT_OK(builder->AppendArraySlice(span, 0, 4)); ASSERT_EQ(4, builder->length()); - ASSERT_OK(builder->AppendArraySlice(*array->data(), 0, 0)); + ASSERT_OK(builder->AppendArraySlice(span, 0, 0)); ASSERT_EQ(4, builder->length()); - ASSERT_OK(builder->AppendArraySlice(*array->data(), 1, 0)); + ASSERT_OK(builder->AppendArraySlice(span, 1, 0)); ASSERT_EQ(4, builder->length()); - ASSERT_OK(builder->AppendArraySlice(*array->data(), 1, 4)); + ASSERT_OK(builder->AppendArraySlice(span, 1, 4)); ASSERT_EQ(8, builder->length()); - ASSERT_OK(builder->AppendArraySlice(*nulls->data(), 0, 4)); + span.SetMembers(*nulls->data()); + ASSERT_OK(builder->AppendArraySlice(span, 0, 4)); ASSERT_EQ(12, builder->length()); if (!is_union(scalar->type->id())) { ASSERT_EQ(4, builder->null_count()); } - ASSERT_OK(builder->AppendArraySlice(*nulls->data(), 0, 0)); + ASSERT_OK(builder->AppendArraySlice(span, 0, 0)); ASSERT_EQ(12, builder->length()); if (!is_union(scalar->type->id())) { ASSERT_EQ(4, builder->null_count()); } - ASSERT_OK(builder->AppendArraySlice(*nulls->data(), 1, 0)); + ASSERT_OK(builder->AppendArraySlice(span, 1, 0)); ASSERT_EQ(12, builder->length()); if (!is_union(scalar->type->id())) { ASSERT_EQ(4, builder->null_count()); } - 
ASSERT_OK(builder->AppendArraySlice(*nulls->data(), 1, 4)); + ASSERT_OK(builder->AppendArraySlice(span, 1, 4)); ASSERT_EQ(16, builder->length()); if (!is_union(scalar->type->id())) { ASSERT_EQ(8, builder->null_count()); @@ -746,13 +749,15 @@ TEST_F(TestArray, TestAppendArraySlice) { { ASSERT_OK_AND_ASSIGN(auto array, MakeArrayOfNull(null(), 16)); NullBuilder builder(pool_); - ASSERT_OK(builder.AppendArraySlice(*array->data(), 0, 4)); + + span.SetMembers(*array->data()); + ASSERT_OK(builder.AppendArraySlice(span, 0, 4)); ASSERT_EQ(4, builder.length()); - ASSERT_OK(builder.AppendArraySlice(*array->data(), 0, 0)); + ASSERT_OK(builder.AppendArraySlice(span, 0, 0)); ASSERT_EQ(4, builder.length()); - ASSERT_OK(builder.AppendArraySlice(*array->data(), 1, 0)); + ASSERT_OK(builder.AppendArraySlice(span, 1, 0)); ASSERT_EQ(4, builder.length()); - ASSERT_OK(builder.AppendArraySlice(*array->data(), 1, 4)); + ASSERT_OK(builder.AppendArraySlice(span, 1, 4)); ASSERT_EQ(8, builder.length()); std::shared_ptr result; ASSERT_OK(builder.Finish(&result)); diff --git a/cpp/src/arrow/array/builder_base.h b/cpp/src/arrow/array/builder_base.h index 4d0b477dcb8..bc4932a4b83 100644 --- a/cpp/src/arrow/array/builder_base.h +++ b/cpp/src/arrow/array/builder_base.h @@ -147,7 +147,7 @@ class ARROW_EXPORT ArrayBuilder { /// \brief Append a range of values from an array. /// /// The given array must be the same type as the builder. - virtual Status AppendArraySlice(const ArrayData& array, int64_t offset, + virtual Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) { return Status::NotImplemented("AppendArraySlice for builder for ", *type()); } diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index 703355bf278..25cec5c1e25 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -278,7 +278,7 @@ class BaseBinaryBuilder : public ArrayBuilder { return Status::OK(); } - Status AppendArraySlice(const ArrayData& array, int64_t offset, + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override { auto bitmap = array.GetValues(0, 0); auto offsets = array.GetValues(1); @@ -516,7 +516,7 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { Status AppendEmptyValue() final; Status AppendEmptyValues(int64_t length) final; - Status AppendArraySlice(const ArrayData& array, int64_t offset, + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override { return AppendValues( array.GetValues(1, 0) + ((array.offset + offset) * byte_width_), length, diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h index 90a7c48d9f0..b720f73d7d2 100644 --- a/cpp/src/arrow/array/builder_dict.h +++ b/cpp/src/arrow/array/builder_dict.h @@ -366,10 +366,11 @@ class DictionaryBuilderBase : public ArrayBuilder { return Status::OK(); } - Status AppendArraySlice(const ArrayData& array, int64_t offset, int64_t length) final { + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) final { // Visit the indices and insert the unpacked values. 
const auto& dict_ty = internal::checked_cast(*array.type); - const typename TypeTraits::ArrayType dict(array.dictionary); + // See if possible to avoid using ToArrayData here + const typename TypeTraits::ArrayType dict(array.dictionary().ToArrayData()); ARROW_RETURN_NOT_OK(Reserve(length)); switch (dict_ty.index_type()->id()) { case Type::UINT8: @@ -490,10 +491,10 @@ class DictionaryBuilderBase : public ArrayBuilder { protected: template Status AppendArraySliceImpl(const typename TypeTraits::ArrayType& dict, - const ArrayData& array, int64_t offset, int64_t length) { + const ArraySpan& array, int64_t offset, int64_t length) { const c_type* values = array.GetValues(1) + offset; return VisitBitBlocks( - array.buffers[0], array.offset + offset, length, + array.buffers[0].data, array.offset + offset, length, [&](const int64_t position) { const int64_t index = static_cast(values[position]); if (dict.IsValid(index)) { diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index 4f39ce86294..3d36cb5f65e 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -126,15 +126,15 @@ class BaseListBuilder : public ArrayBuilder { return Status::OK(); } - Status AppendArraySlice(const ArrayData& array, int64_t offset, + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override { const offset_type* offsets = array.GetValues(1); - const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR; + const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR; for (int64_t row = offset; row < offset + length; row++) { if (!validity || bit_util::GetBit(validity, array.offset + row)) { ARROW_RETURN_NOT_OK(Append()); int64_t slot_length = offsets[row + 1] - offsets[row]; - ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice(*array.child_data[0], + ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice(array.child_data[0], offsets[row], slot_length)); } else { ARROW_RETURN_NOT_OK(AppendNull()); @@ -296,18 +296,18 @@ class ARROW_EXPORT MapBuilder : public ArrayBuilder { Status AppendEmptyValues(int64_t length) final; - Status AppendArraySlice(const ArrayData& array, int64_t offset, + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override { const int32_t* offsets = array.GetValues(1); - const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR; + const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR; for (int64_t row = offset; row < offset + length; row++) { if (!validity || bit_util::GetBit(validity, array.offset + row)) { ARROW_RETURN_NOT_OK(Append()); const int64_t slot_length = offsets[row + 1] - offsets[row]; ARROW_RETURN_NOT_OK(key_builder_->AppendArraySlice( - *array.child_data[0]->child_data[0], offsets[row], slot_length)); + array.child_data[0].child_data[0], offsets[row], slot_length)); ARROW_RETURN_NOT_OK(item_builder_->AppendArraySlice( - *array.child_data[0]->child_data[1], offsets[row], slot_length)); + array.child_data[0].child_data[1], offsets[row], slot_length)); } else { ARROW_RETURN_NOT_OK(AppendNull()); } @@ -425,12 +425,12 @@ class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder { Status AppendEmptyValues(int64_t length) final; - Status AppendArraySlice(const ArrayData& array, int64_t offset, int64_t length) final { - const uint8_t* validity = array.MayHaveNulls() ? 
array.buffers[0]->data() : NULLPTR; + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) final { + const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR; for (int64_t row = offset; row < offset + length; row++) { if (!validity || bit_util::GetBit(validity, array.offset + row)) { ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice( - *array.child_data[0], list_size_ * (array.offset + row), list_size_)); + array.child_data[0], list_size_ * (array.offset + row), list_size_)); ARROW_RETURN_NOT_OK(Append()); } else { ARROW_RETURN_NOT_OK(AppendNull()); @@ -532,13 +532,13 @@ class ARROW_EXPORT StructBuilder : public ArrayBuilder { return Status::OK(); } - Status AppendArraySlice(const ArrayData& array, int64_t offset, + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override { for (int i = 0; static_cast(i) < children_.size(); i++) { - ARROW_RETURN_NOT_OK(children_[i]->AppendArraySlice(*array.child_data[i], + ARROW_RETURN_NOT_OK(children_[i]->AppendArraySlice(array.child_data[i], array.offset + offset, length)); } - const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR; + const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR; ARROW_RETURN_NOT_OK(Reserve(length)); UnsafeAppendToBitmap(validity, array.offset + offset, length); return Status::OK(); diff --git a/cpp/src/arrow/array/builder_primitive.h b/cpp/src/arrow/array/builder_primitive.h index c72b48f0b32..8f2dcc8b09b 100644 --- a/cpp/src/arrow/array/builder_primitive.h +++ b/cpp/src/arrow/array/builder_primitive.h @@ -53,7 +53,7 @@ class ARROW_EXPORT NullBuilder : public ArrayBuilder { Status Append(std::nullptr_t) { return AppendNull(); } - Status AppendArraySlice(const ArrayData&, int64_t, int64_t length) override { + Status AppendArraySlice(const ArraySpan&, int64_t, int64_t length) override { return AppendNulls(length); } @@ -279,7 +279,7 @@ class NumericBuilder : public ArrayBuilder { return Status::OK(); } - Status AppendArraySlice(const ArrayData& array, int64_t offset, + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override { return AppendValues(array.GetValues(1) + offset, length, array.GetValues(0, 0), array.offset + offset); @@ -513,7 +513,7 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { Status AppendValues(int64_t length, bool value); - Status AppendArraySlice(const ArrayData& array, int64_t offset, + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override { return AppendValues(array.GetValues(1, 0), length, array.GetValues(0, 0), array.offset + offset); diff --git a/cpp/src/arrow/array/builder_union.cc b/cpp/src/arrow/array/builder_union.cc index 6096b76ff21..883cda3d8b7 100644 --- a/cpp/src/arrow/array/builder_union.cc +++ b/cpp/src/arrow/array/builder_union.cc @@ -45,7 +45,7 @@ Status BasicUnionBuilder::FinishInternal(std::shared_ptr* out) { return Status::OK(); } -Status DenseUnionBuilder::AppendArraySlice(const ArrayData& array, const int64_t offset, +Status DenseUnionBuilder::AppendArraySlice(const ArraySpan& array, const int64_t offset, const int64_t length) { const int8_t* type_codes = array.GetValues(1); const int32_t* offsets = array.GetValues(2); @@ -55,7 +55,7 @@ Status DenseUnionBuilder::AppendArraySlice(const ArrayData& array, const int64_t const int32_t union_offset = offsets[row]; RETURN_NOT_OK(Append(type_code)); RETURN_NOT_OK(type_id_to_children_[type_code]->AppendArraySlice( - 
*array.child_data[child_id], union_offset, /*length=*/1)); + array.child_data[child_id], union_offset, /*length=*/1)); } return Status::OK(); } @@ -137,11 +137,11 @@ int8_t BasicUnionBuilder::NextTypeId() { return dense_type_id_++; } -Status SparseUnionBuilder::AppendArraySlice(const ArrayData& array, const int64_t offset, +Status SparseUnionBuilder::AppendArraySlice(const ArraySpan& array, const int64_t offset, const int64_t length) { for (size_t i = 0; i < type_codes_.size(); i++) { RETURN_NOT_OK(type_id_to_children_[type_codes_[i]]->AppendArraySlice( - *array.child_data[i], array.offset + offset, length)); + array.child_data[i], array.offset + offset, length)); } const int8_t* type_codes = array.GetValues(1); RETURN_NOT_OK(types_builder_.Append(type_codes + offset, length)); diff --git a/cpp/src/arrow/array/builder_union.h b/cpp/src/arrow/array/builder_union.h index f1629939ce3..eb8c5d3af0e 100644 --- a/cpp/src/arrow/array/builder_union.h +++ b/cpp/src/arrow/array/builder_union.h @@ -160,7 +160,7 @@ class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder { return offsets_builder_.Append(offset); } - Status AppendArraySlice(const ArrayData& array, int64_t offset, + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override; Status FinishInternal(std::shared_ptr* out) override; @@ -239,7 +239,7 @@ class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder { /// is called, and all other child builders must have null or empty value appended. Status Append(int8_t next_type) { return types_builder_.Append(next_type); } - Status AppendArraySlice(const ArrayData& array, int64_t offset, + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override; }; diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index 9dfc76c01ce..e1587695d6b 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -25,9 +25,12 @@ #include #include +#include "arrow/array/util.h" #include "arrow/buffer.h" +#include "arrow/scalar.h" #include "arrow/status.h" #include "arrow/type.h" +#include "arrow/type_traits.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" @@ -128,6 +131,143 @@ int64_t ArrayData::GetNullCount() const { return precomputed; } +// ---------------------------------------------------------------------- +// Methods for ArraySpan + +void ArraySpan::SetMembers(const ArrayData& data) { + this->type = data.type.get(); + this->length = data.length; + this->null_count = data.null_count.load(); + this->offset = data.offset; + + for (size_t i = 0; i < data.buffers.size(); ++i) { + const std::shared_ptr& buffer = data.buffers[i]; + // It is the invoker-of-kernels's responsibility to ensure that + // const buffers are not written to accidentally. + if (buffer) { + SetBuffer(i, buffer); + } else { + ClearBuffer(i); + } + } + + // Makes sure any other buffers are seen as null / non-existent + for (size_t i = data.buffers.size(); i < 3; ++i) { + ClearBuffer(i); + } + + // TODO(wesm): what about extension arrays? 
+ + if (this->type->id() == Type::DICTIONARY) { + this->child_data.resize(1); + this->child_data[0].SetMembers(*data.dictionary); + } else { + this->child_data.resize(data.child_data.size()); + for (size_t child_index = 0; child_index < data.child_data.size(); ++child_index) { + this->child_data[child_index].SetMembers(*data.child_data[child_index]); + } + } +} + +void ArraySpan::FillFromScalar(const Scalar& value) { + static const uint8_t kValidByte = 0x01; + static const uint8_t kNullByte = 0x00; + + this->type = value.type.get(); + this->length = 1; + + // Populate null count and validity bitmap + this->null_count = value.is_valid ? 0 : 1; + this->buffers[0].data = const_cast(value.is_valid ? &kValidByte : &kNullByte); + this->buffers[0].size = 1; + + if (is_primitive(value.type->id())) { + const auto& scalar = + internal::checked_cast(value); + const uint8_t* scalar_data = reinterpret_cast(scalar.data()); + this->buffers[1].data = const_cast(scalar_data); + this->buffers[1].size = scalar.type->byte_width(); + } else { + // TODO(wesm): implement for other types + DCHECK(false) << "need to implement for other types"; + } +} + +int64_t ArraySpan::GetNullCount() const { + int64_t precomputed = this->null_count; + if (ARROW_PREDICT_FALSE(precomputed == kUnknownNullCount)) { + if (this->buffers[0].data != nullptr) { + precomputed = + this->length - CountSetBits(this->buffers[0].data, this->offset, this->length); + } else { + precomputed = 0; + } + this->null_count = precomputed; + } + return precomputed; +} + +int GetNumBuffers(const DataType& type) { + switch (type.id()) { + case Type::NA: + return 0; + case Type::STRUCT: + case Type::FIXED_SIZE_LIST: + return 1; + case Type::BINARY: + case Type::LARGE_BINARY: + case Type::STRING: + case Type::LARGE_STRING: + case Type::DENSE_UNION: + return 3; + case Type::EXTENSION: + // The number of buffers depends on the storage type + return GetNumBuffers( + *internal::checked_cast(type).storage_type()); + default: + // Everything else has 2 buffers + return 2; + } +} + +int ArraySpan::num_buffers() const { return GetNumBuffers(*this->type); } + +std::shared_ptr ArraySpan::ToArrayData() const { + auto result = std::make_shared(this->type->GetSharedPtr(), this->length, + kUnknownNullCount, this->offset); + + for (int i = 0; i < this->num_buffers(); ++i) { + if (this->buffers[i].owner) { + result->buffers.emplace_back(this->GetBuffer(i)); + } else { + result->buffers.push_back(nullptr); + } + } + + if (this->type->id() == Type::NA) { + result->null_count = this->length; + } else if (this->buffers[0].data == nullptr) { + // No validity bitmap, so the null count is 0 + result->null_count = 0; + } + + // TODO(wesm): what about extension arrays? 
+ + if (this->type->id() == Type::DICTIONARY) { + result->dictionary = this->dictionary().ToArrayData(); + } else { + // Emit children, too + for (size_t i = 0; i < this->child_data.size(); ++i) { + result->child_data.push_back(this->child_data[i].ToArrayData()); + } + } + return result; +} + +std::shared_ptr ArraySpan::ToArray() const { + return MakeArray(this->ToArrayData()); +} + // ---------------------------------------------------------------------- // Implement ArrayData::View diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h index 418d09def6b..a6cbdcbe8ee 100644 --- a/cpp/src/arrow/array/data.h +++ b/cpp/src/arrow/array/data.h @@ -25,11 +25,14 @@ #include "arrow/buffer.h" #include "arrow/result.h" +#include "arrow/util/bit_util.h" #include "arrow/util/macros.h" #include "arrow/util/visibility.h" namespace arrow { +class Array; + // When slicing, we do not know the null count of the sliced range without // doing some computation. To avoid doing this eagerly, we set the null count // to -1 (any negative number will do). When Array::null_count is called the @@ -242,6 +245,127 @@ struct ARROW_EXPORT ArrayData { std::shared_ptr dictionary; }; +/// \brief A non-owning Buffer reference +struct ARROW_EXPORT BufferRef { + // It is the user of this class's responsibility to ensure that + // buffers that were const originally are not written to + // accidentally. + uint8_t* data = NULLPTR; + int64_t size = 0; + // Pointer back to buffer that owns this memory + const std::shared_ptr* owner = NULLPTR; +}; + +/// \brief EXPERIMENTAL: A non-owning ArrayData reference that is cheaply +/// copyable and does not contain any shared_ptr objects. Do not use in public +/// APIs aside from compute kernels for now +struct ARROW_EXPORT ArraySpan { + const DataType* type; + int64_t length = 0; + mutable int64_t null_count = kUnknownNullCount; + int64_t offset = 0; + BufferRef buffers[3]; + + ArraySpan() = default; + + explicit ArraySpan(const DataType* type, int64_t length) : type(type), length(length) {} + explicit ArraySpan(const ArrayData& data) { SetMembers(data); } + explicit ArraySpan(const Scalar& data) { FillFromScalar(data); } + + /// If dictionary-encoded, put dictionary in the first entry + // TODO(wesm): would a std::unique_ptr> be better? 
+ std::vector child_data; + + /// \brief Populate ArraySpan to look like an array of length 1 pointing at + /// the data members of a Scalar value + void FillFromScalar(const Scalar& value); + + void SetMembers(const ArrayData& data); + + void SetBuffer(int index, const std::shared_ptr& buffer) { + this->buffers[index].data = const_cast(buffer->data()); + this->buffers[index].size = buffer->size(); + this->buffers[index].owner = &buffer; + } + + void ClearBuffer(int index) { + this->buffers[index].data = NULLPTR; + this->buffers[index].size = 0; + this->buffers[index].owner = NULLPTR; + } + + const ArraySpan& dictionary() const { return child_data[0]; } + + /// \brief Return the number of buffers (out of 3) that are used to + /// constitute this array + int num_buffers() const; + + // Access a buffer's data as a typed C pointer + template + inline T* GetValues(int i, int64_t absolute_offset) { + return reinterpret_cast(buffers[i].data) + absolute_offset; + } + + template + inline T* GetValues(int i) { + return GetValues(i, this->offset); + } + + // Access a buffer's data as a typed C pointer + template + inline const T* GetValues(int i, int64_t absolute_offset) const { + return reinterpret_cast(buffers[i].data) + absolute_offset; + } + + template + inline const T* GetValues(int i) const { + return GetValues(i, this->offset); + } + + bool IsNull(int64_t i) const { + return ((this->buffers[0].data != NULLPTR) + ? !bit_util::GetBit(this->buffers[0].data, i + this->offset) + : this->null_count == this->length); + } + + bool IsValid(int64_t i) const { + return ((this->buffers[0].data != NULLPTR) + ? bit_util::GetBit(this->buffers[0].data, i + this->offset) + : this->null_count != this->length); + } + + std::shared_ptr ToArrayData() const; + + std::shared_ptr ToArray() const; + + std::shared_ptr GetBuffer(int index) const { + if (this->buffers[index].owner == NULLPTR) { + return NULLPTR; + } else { + return *this->buffers[index].owner; + } + } + + void AddOffset(int64_t offset) { + this->offset += offset; + this->null_count = kUnknownNullCount; + } + + void SetOffset(int64_t offset) { + this->offset = offset; + this->null_count = kUnknownNullCount; + } + + /// \brief Return null count, or compute and set it if it's not known + int64_t GetNullCount() const; + + bool MayHaveNulls() const { + // If an ArrayData is slightly malformed it may have kUnknownNullCount set + // but no buffer + return null_count != 0 && buffers[0].data != NULLPTR; + } +}; + namespace internal { /// Construct a zero-copy view of this ArrayData with the given type. 
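As a quick orientation on the ArraySpan/BufferRef members declared above, here is a hypothetical caller-side sketch (not part of this patch, and illustrative only since ArraySpan is documented above as experimental and intended for compute kernels): wrap an existing ArrayData in a non-owning ArraySpan, read typed values through GetValues, and only materialize an owning ArrayData again via ToArrayData when needed.

#include "arrow/api.h"

// Illustrative only: exercises the ArraySpan members declared in data.h above.
arrow::Status UseArraySpan() {
  arrow::Int32Builder builder;
  ARROW_RETURN_NOT_OK(builder.AppendValues({1, 2, 3}));
  std::shared_ptr<arrow::Array> arr;
  ARROW_RETURN_NOT_OK(builder.Finish(&arr));

  // Non-owning view: no shared_ptr copies; buffers are referenced through BufferRef
  arrow::ArraySpan span(*arr->data());

  // For primitive types buffer 1 holds the values; GetValues applies span.offset
  const int32_t* values = span.GetValues<int32_t>(1);
  const bool first_is_valid = span.IsValid(0);
  (void)values;
  (void)first_is_valid;

  // Re-acquire the owning buffers when an ArrayData/Array is actually required
  std::shared_ptr<arrow::ArrayData> owned = span.ToArrayData();
  (void)owned;
  return arrow::Status::OK();
}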
diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 413182de0df..e5b4ab39493 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -432,7 +432,9 @@ class NullArrayFactory { RETURN_NOT_OK(CreateBuffer()); } std::vector> child_data(type_->num_fields()); - out_ = ArrayData::Make(type_, length_, {buffer_}, child_data, length_, 0); + out_ = ArrayData::Make(type_, length_, + {SliceBuffer(buffer_, 0, bit_util::BytesForBits(length_))}, + child_data, length_, 0); RETURN_NOT_OK(VisitTypeInline(*type_, this)); return out_; } diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index f3d12dbece9..7c027c9b1e8 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -119,7 +119,7 @@ class ARROW_EXPORT TypeErasedIntBuilder : public ArrayBuilder { Status AppendScalars(const ScalarVector& scalars) override { return builder_->AppendScalars(scalars); } - Status AppendArraySlice(const ArrayData& array, int64_t offset, + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override { return builder_->AppendArraySlice(array, offset, length); } diff --git a/cpp/src/arrow/chunked_array.h b/cpp/src/arrow/chunked_array.h index 956595b117b..6ec7d11ac83 100644 --- a/cpp/src/arrow/chunked_array.h +++ b/cpp/src/arrow/chunked_array.h @@ -111,7 +111,7 @@ class ARROW_EXPORT ChunkedArray { int num_chunks() const { return static_cast(chunks_.size()); } /// \return chunk a particular chunk from the chunked array - std::shared_ptr chunk(int i) const { return chunks_[i]; } + const std::shared_ptr& chunk(int i) const { return chunks_[i]; } /// \return an ArrayVector of chunks const ArrayVector& chunks() const { return chunks_; } diff --git a/cpp/src/arrow/compute/cast.cc b/cpp/src/arrow/compute/cast.cc index bd49041b4f3..0acab32e447 100644 --- a/cpp/src/arrow/compute/cast.cc +++ b/cpp/src/arrow/compute/cast.cc @@ -165,7 +165,7 @@ Status CastFunction::AddKernel(Type::type in_type_id, ScalarKernel kernel) { } Status CastFunction::AddKernel(Type::type in_type_id, std::vector in_types, - OutputType out_type, ArrayKernelExec exec, + OutputType out_type, ScalarKernel::ExecFunc exec, NullHandling::type null_handling, MemAllocation::type mem_allocation) { ScalarKernel kernel; diff --git a/cpp/src/arrow/compute/cast.h b/cpp/src/arrow/compute/cast.h index e9c3cf55da9..a27dafe97c3 100644 --- a/cpp/src/arrow/compute/cast.h +++ b/cpp/src/arrow/compute/cast.h @@ -84,7 +84,7 @@ class CastFunction : public ScalarFunction { const std::vector& in_type_ids() const { return in_type_ids_; } Status AddKernel(Type::type in_type_id, std::vector in_types, - OutputType out_type, ArrayKernelExec exec, + OutputType out_type, ScalarKernel::ExecFunc exec, NullHandling::type = NullHandling::INTERSECTION, MemAllocation::type = MemAllocation::PREALLOCATE); diff --git a/cpp/src/arrow/compute/exec.cc b/cpp/src/arrow/compute/exec.cc index 186a3cdf3c1..04bb29e9bfa 100644 --- a/cpp/src/arrow/compute/exec.cc +++ b/cpp/src/arrow/compute/exec.cc @@ -229,18 +229,45 @@ Status CheckAllValues(const std::vector& values) { return Status::OK(); } -ExecBatchIterator::ExecBatchIterator(std::vector args, int64_t length, +Status GetBatchLength(const std::vector& values, int64_t* out_length) { + for (const auto& arg : values) { + if (!(arg.is_arraylike() || arg.is_scalar())) { + return Status::Invalid( + "Batch iteration only works with Scalar, Array, and " + "ChunkedArray arguments"); + } + } + + // If the arguments are all scalars, then the length is 1 + int64_t length = 1; + 
+ bool length_set = false; + for (auto& arg : values) { + if (arg.is_scalar()) { + continue; + } + if (!length_set) { + length = arg.length(); + length_set = true; + } else { + if (arg.length() != length) { + return Status::Invalid("Array arguments must all be the same length"); + } + } + } + *out_length = length; + return Status::OK(); +} + +ExecBatchIterator::ExecBatchIterator(const std::vector& args, int64_t length, int64_t max_chunksize) - : args_(std::move(args)), - position_(0), - length_(length), - max_chunksize_(max_chunksize) { + : args_(args), position_(0), length_(length), max_chunksize_(max_chunksize) { chunk_indexes_.resize(args_.size(), 0); chunk_positions_.resize(args_.size(), 0); } Result> ExecBatchIterator::Make( - std::vector args, int64_t max_chunksize) { + const std::vector& args, int64_t max_chunksize) { for (const auto& arg : args) { if (!(arg.is_arraylike() || arg.is_scalar())) { return Status::Invalid( @@ -270,7 +297,7 @@ Result> ExecBatchIterator::Make( max_chunksize = std::min(length, max_chunksize); return std::unique_ptr( - new ExecBatchIterator(std::move(args), length, max_chunksize)); + new ExecBatchIterator(args, length, max_chunksize)); } bool ExecBatchIterator::Next(ExecBatch* batch) { @@ -325,35 +352,158 @@ bool ExecBatchIterator::Next(ExecBatch* batch) { return true; } +// ---------------------------------------------------------------------- +// ExecSpanIterator; to eventually replace ExecBatchIterator + +ExecSpanIterator::ExecSpanIterator(const std::vector& args, int64_t length, + int64_t max_chunksize) + : args_(args), position_(0), length_(length), max_chunksize_(max_chunksize) { + chunk_indexes_.resize(args_.size(), 0); + value_positions_.resize(args_.size(), 0); + value_offsets_.resize(args_.size(), 0); +} + +Result> ExecSpanIterator::Make( + const std::vector& args, int64_t max_chunksize) { + int64_t length = 1; + RETURN_NOT_OK(GetBatchLength(args, &length)); + max_chunksize = std::min(length, max_chunksize); + return std::unique_ptr( + new ExecSpanIterator(args, length, max_chunksize)); +} + +int64_t ExecSpanIterator::GetNextChunkSpan(int64_t iteration_size, ExecSpan* span) { + for (size_t i = 0; i < args_.size() && iteration_size > 0; ++i) { + // If the argument is not a chunked array, it's either a Scalar or Array, + // in which case it doesn't influence the size of this span. Note that if + // the args are all scalars the span length is 1 + if (!args_[i].is_chunked_array()) { + continue; + } + const ChunkedArray* arg = args_[i].chunked_array().get(); + const Array* current_chunk; + while (true) { + current_chunk = arg->chunk(chunk_indexes_[i]).get(); + if (value_positions_[i] == current_chunk->length()) { + // Chunk is zero-length, or was exhausted in the previous + // iteration. 
Move to the next chunk
+        ++chunk_indexes_[i];
+        current_chunk = arg->chunk(chunk_indexes_[i]).get();
+        span->values[i].SetArray(*current_chunk->data());
+        value_positions_[i] = 0;
+        value_offsets_[i] = current_chunk->offset();
+        continue;
+      }
+      break;
+    }
+    iteration_size =
+        std::min(current_chunk->length() - value_positions_[i], iteration_size);
+  }
+  return iteration_size;
+}
+
+bool ExecSpanIterator::Next(ExecSpan* span) {
+  if (position_ == length_) {
+    // This also protects from degenerate cases like ChunkedArrays
+    // without any chunks
+    return false;
+  }
+
+  if (!initialized_) {
+    span->length = 0;
+
+    // The first time this is called, we populate the output span with
+    // any Scalar or Array arguments in the ExecValue struct, and then
+    // just increment array offsets below. If any arguments are
+    // ChunkedArray, then the internal ArraySpans will see their
+    // members updated during the iteration
+    span->values.resize(args_.size());
+    for (size_t i = 0; i < args_.size(); ++i) {
+      if (args_[i].is_scalar()) {
+        span->values[i].SetScalar(args_[i].scalar().get());
+      } else if (args_[i].is_array()) {
+        const ArrayData& arr = *args_[i].array();
+        span->values[i].SetArray(arr);
+        value_offsets_[i] = arr.offset;
+      } else {
+        // Populate members from the first chunk
+        const Array* first_chunk = args_[i].chunked_array()->chunk(0).get();
+        const ArrayData& arr = *first_chunk->data();
+        span->values[i].SetArray(arr);
+        value_offsets_[i] = arr.offset;
+        have_chunked_arrays_ = true;
+      }
+    }
+    initialized_ = true;
+  }
+
+  if (position_ == length_) {
+    return false;
+  }
+
+  // Determine how large the common contiguous "slice" of all the arguments is
+  int64_t iteration_size = std::min(length_ - position_, max_chunksize_);
+  if (have_chunked_arrays_) {
+    iteration_size = GetNextChunkSpan(iteration_size, span);
+  }
+
+  // Now, adjust the span
+  span->length = iteration_size;
+  for (size_t i = 0; i < args_.size(); ++i) {
+    const Datum& arg = args_[i];
+    if (!arg.is_scalar()) {
+      ArraySpan* arr = &span->values[i].array;
+      arr->length = iteration_size;
+      arr->SetOffset(value_positions_[i] + value_offsets_[i]);
+      value_positions_[i] += iteration_size;
+    }
+  }
+  position_ += iteration_size;
+  DCHECK_LE(position_, length_);
+  return true;
+}
+
 namespace {
 
 struct NullGeneralization {
   enum type { PERHAPS_NULL, ALL_VALID, ALL_NULL };
 
-  static type Get(const Datum& datum) {
-    const auto dtype_id = datum.type()->id();
+  static type Get(const ExecValue& value) {
+    const auto dtype_id = value.type()->id();
     if (dtype_id == Type::NA) {
       return ALL_NULL;
     }
     if (!arrow::internal::HasValidityBitmap(dtype_id)) {
       return ALL_VALID;
     }
-    if (datum.is_scalar()) {
-      return datum.scalar()->is_valid ? ALL_VALID : ALL_NULL;
-    }
-    if (datum.is_array()) {
-      const auto& arr = *datum.array();
+    if (value.is_scalar()) {
+      return value.scalar->is_valid ?
ALL_VALID : ALL_NULL; + } else { + const ArraySpan& arr = value.array; // Do not count the bits if they haven't been counted already - const int64_t known_null_count = arr.null_count.load(); - if ((known_null_count == 0) || (arr.buffers[0] == NULLPTR)) { + if ((arr.null_count == 0) || (arr.buffers[0].data == nullptr)) { return ALL_VALID; } - if (known_null_count == arr.length) { + if (arr.null_count == arr.length) { return ALL_NULL; } } return PERHAPS_NULL; } + + static type Get(const Datum& datum) { + // Temporary workaround to help with ARROW-16756 + ExecValue value; + if (datum.is_array()) { + value.SetArray(*datum.array()); + } else if (datum.is_scalar()) { + value.SetScalar(datum.scalar().get()); + } else { + // TODO(wesm): ChunkedArray, I think + return PERHAPS_NULL; + } + return Get(value); + } }; // Null propagation implementation that deals both with preallocated bitmaps @@ -369,35 +519,29 @@ struct NullGeneralization { // * Otherwise, we allocate the bitmap and populate it class NullPropagator { public: - NullPropagator(KernelContext* ctx, const ExecBatch& batch, ArrayData* output) + NullPropagator(KernelContext* ctx, const ExecSpan& batch, ArrayData* output) : ctx_(ctx), batch_(batch), output_(output) { - for (const Datum& datum : batch_.values) { - auto null_generalization = NullGeneralization::Get(datum); - + for (const ExecValue& value : batch_.values) { + auto null_generalization = NullGeneralization::Get(value); if (null_generalization == NullGeneralization::ALL_NULL) { is_all_null_ = true; } - - if (null_generalization != NullGeneralization::ALL_VALID && - datum.kind() == Datum::ARRAY) { - arrays_with_nulls_.push_back(datum.array().get()); + if (null_generalization != NullGeneralization::ALL_VALID && value.is_array()) { + arrays_with_nulls_.push_back(&value.array); } } - if (output->buffers[0] != nullptr) { bitmap_preallocated_ = true; - SetBitmap(output_->buffers[0].get()); + bitmap_ = output_->buffers[0]->mutable_data(); } } - void SetBitmap(Buffer* bitmap) { bitmap_ = bitmap->mutable_data(); } - Status EnsureAllocated() { if (bitmap_preallocated_) { return Status::OK(); } ARROW_ASSIGN_OR_RAISE(output_->buffers[0], ctx_->AllocateBitmap(output_->length)); - SetBitmap(output_->buffers[0].get()); + bitmap_ = output_->buffers[0]->mutable_data(); return Status::OK(); } @@ -412,10 +556,10 @@ class NullPropagator { // Walk all the values with nulls instead of breaking on the first in case // we find a bitmap that can be reused in the non-preallocated case - for (const ArrayData* arr : arrays_with_nulls_) { - if (arr->null_count.load() == arr->length && arr->buffers[0] != nullptr) { + for (const ArraySpan* arr : arrays_with_nulls_) { + if (arr->null_count == arr->length && arr->buffers[0].owner != nullptr) { // Reuse this all null bitmap - output_->buffers[0] = arr->buffers[0]; + output_->buffers[0] = arr->GetBuffer(0); return Status::OK(); } } @@ -427,14 +571,14 @@ class NullPropagator { Status PropagateSingle() { // One array - const ArrayData& arr = *arrays_with_nulls_[0]; - const std::shared_ptr& arr_bitmap = arr.buffers[0]; + const ArraySpan& arr = *arrays_with_nulls_[0]; + const uint8_t* arr_bitmap = arr.buffers[0].data; // Reuse the null count if it's known - output_->null_count = arr.null_count.load(); + output_->null_count = arr.null_count; if (bitmap_preallocated_) { - CopyBitmap(arr_bitmap->data(), arr.offset, arr.length, bitmap_, output_->offset); + CopyBitmap(arr_bitmap, arr.offset, arr.length, bitmap_, output_->offset); return Status::OK(); } @@ -448,14 +592,13 @@ 
class NullPropagator { // the bitmap is not preallocated, and that precondition is asserted // higher in the call stack. if (arr.offset == 0) { - output_->buffers[0] = arr_bitmap; + output_->buffers[0] = arr.GetBuffer(0); } else if (arr.offset % 8 == 0) { - output_->buffers[0] = - SliceBuffer(arr_bitmap, arr.offset / 8, bit_util::BytesForBits(arr.length)); + output_->buffers[0] = SliceBuffer(arr.GetBuffer(0), arr.offset / 8, + bit_util::BytesForBits(arr.length)); } else { RETURN_NOT_OK(EnsureAllocated()); - CopyBitmap(arr_bitmap->data(), arr.offset, arr.length, bitmap_, - /*dst_offset=*/0); + CopyBitmap(arr_bitmap, arr.offset, arr.length, bitmap_, /*dst_offset=*/0); } return Status::OK(); } @@ -466,22 +609,22 @@ class NullPropagator { // Do not compute the intersection null count until it's needed RETURN_NOT_OK(EnsureAllocated()); - auto Accumulate = [&](const ArrayData& left, const ArrayData& right) { - DCHECK(left.buffers[0]); - DCHECK(right.buffers[0]); - BitmapAnd(left.buffers[0]->data(), left.offset, right.buffers[0]->data(), - right.offset, output_->length, output_->offset, - output_->buffers[0]->mutable_data()); + auto Accumulate = [&](const uint8_t* left_data, int64_t left_offset, + const uint8_t* right_data, int64_t right_offset) { + BitmapAnd(left_data, left_offset, right_data, right_offset, output_->length, + output_->offset, bitmap_); }; DCHECK_GT(arrays_with_nulls_.size(), 1); // Seed the output bitmap with the & of the first two bitmaps - Accumulate(*arrays_with_nulls_[0], *arrays_with_nulls_[1]); + Accumulate(arrays_with_nulls_[0]->buffers[0].data, arrays_with_nulls_[0]->offset, + arrays_with_nulls_[1]->buffers[0].data, arrays_with_nulls_[1]->offset); // Accumulate the rest for (size_t i = 2; i < arrays_with_nulls_.size(); ++i) { - Accumulate(*output_, *arrays_with_nulls_[i]); + Accumulate(bitmap_, output_->offset, arrays_with_nulls_[i]->buffers[0].data, + arrays_with_nulls_[i]->offset); } return Status::OK(); } @@ -527,8 +670,8 @@ class NullPropagator { private: KernelContext* ctx_; - const ExecBatch& batch_; - std::vector arrays_with_nulls_; + const ExecSpan& batch_; + std::vector arrays_with_nulls_; bool is_all_null_ = false; ArrayData* output_; uint8_t* bitmap_; @@ -573,13 +716,9 @@ class KernelExecutorImpl : public KernelExecutor { } protected: - // This is overridden by the VectorExecutor - virtual Status SetupArgIteration(const std::vector& args) { - ARROW_ASSIGN_OR_RAISE( - batch_iterator_, ExecBatchIterator::Make(args, exec_context()->exec_chunksize())); - return Status::OK(); - } - + // Prepare an output ArrayData to be written to. 
If + // Kernel::mem_allocation is not MemAllocation::PREALLOCATE, then no + // data buffers will be set Result> PrepareOutput(int64_t length) { auto out = std::make_shared(output_descr_.type, length); out->buffers.resize(output_num_buffers_); @@ -619,7 +758,6 @@ class KernelExecutorImpl : public KernelExecutor { KernelContext* kernel_ctx_; const KernelType* kernel_; - std::unique_ptr batch_iterator_; ValueDescr output_descr_; int output_num_buffers_; @@ -636,22 +774,35 @@ class KernelExecutorImpl : public KernelExecutor { class ScalarExecutor : public KernelExecutorImpl { public: Status Execute(const std::vector& args, ExecListener* listener) override { - RETURN_NOT_OK(PrepareExecute(args)); - ExecBatch batch; - while (batch_iterator_->Next(&batch)) { - RETURN_NOT_OK(ExecuteBatch(batch, listener)); + ARROW_ASSIGN_OR_RAISE(span_iterator_, + ExecSpanIterator::Make(args, exec_context()->exec_chunksize())); + + // TODO(wesm): remove if with ARROW-16757 + if (output_descr_.shape != ValueDescr::SCALAR) { + // If the executor is configured to produce a single large Array output for + // kernels supporting preallocation, then we do so up front and then + // iterate over slices of that large array. Otherwise, we preallocate prior + // to processing each span emitted from the ExecSpanIterator + RETURN_NOT_OK(SetupPreallocation(span_iterator_->length(), args)); } - if (preallocate_contiguous_) { - // If we preallocated one big chunk, since the kernel execution is - // completed, we can now emit it - RETURN_NOT_OK(listener->OnResult(std::move(preallocated_))); + + // ARROW-16756: Here we have to accommodate the distinct cases + // + // * Fully-preallocated contiguous output + // * Fully-preallocated, non-contiguous kernel output + // * Not-fully-preallocated kernel output: we pass an empty or + // partially-filled ArrayData to the kernel + if (preallocating_all_buffers_) { + return ExecuteSpans(listener); + } else { + return ExecuteNonSpans(listener); } - return Status::OK(); } Datum WrapResults(const std::vector& inputs, const std::vector& outputs) override { if (output_descr_.shape == ValueDescr::SCALAR) { + // TODO(wesm): to remove, see ARROW-16757 DCHECK_EQ(outputs.size(), 1); // Return as SCALAR return outputs[0]; @@ -674,101 +825,112 @@ class ScalarExecutor : public KernelExecutorImpl { } protected: - Status ExecuteBatch(const ExecBatch& batch, ExecListener* listener) { - Datum out; - RETURN_NOT_OK(PrepareNextOutput(batch, &out)); - - if (output_descr_.shape == ValueDescr::ARRAY) { - ArrayData* out_arr = out.mutable_array(); - if (output_descr_.type->id() == Type::NA) { - out_arr->null_count = out_arr->length; - } else if (kernel_->null_handling == NullHandling::INTERSECTION) { - RETURN_NOT_OK(PropagateNulls(kernel_ctx_, batch, out_arr)); - } else if (kernel_->null_handling == NullHandling::OUTPUT_NOT_NULL) { - out_arr->null_count = 0; - } - } else { - if (kernel_->null_handling == NullHandling::INTERSECTION) { - // set scalar validity - out.scalar()->is_valid = - std::all_of(batch.values.begin(), batch.values.end(), - [](const Datum& input) { return input.scalar()->is_valid; }); - } else if (kernel_->null_handling == NullHandling::OUTPUT_NOT_NULL) { - out.scalar()->is_valid = true; - } - } - - RETURN_NOT_OK(kernel_->exec(kernel_ctx_, batch, &out)); + Status ExecuteSpans(ExecListener* listener) { + // We put the preallocation in an ArraySpan to be passed to the + // kernel which is expecting to receive that. More + // performance-critical code (e.g. 
expression evaluation) should + // eventually skip the creation of ArrayData altogether + std::shared_ptr preallocation; + ExecSpan input; + ExecResult output; + ArraySpan* output_span = output.array_span(); if (preallocate_contiguous_) { - // Some kernels may like to simply nullify the validity bitmap when - // they know the output will have 0 nulls. However, this is not compatible - // with writing into slices. - if (output_descr_.shape == ValueDescr::ARRAY) { - DCHECK(out.array()->buffers[0]) - << "Null bitmap deleted by kernel but can_write_into_slices = true"; + // Make one big output allocation + ARROW_ASSIGN_OR_RAISE(preallocation, PrepareOutput(span_iterator_->length())); + + // Populate and then reuse the ArraySpan inside + output_span->SetMembers(*preallocation); + output_span->offset = 0; + while (span_iterator_->Next(&input)) { + // Set absolute output span position and length + output_span->length = input.length; + RETURN_NOT_OK(ExecuteSingleSpan(input, &output)); + output_span->SetOffset(span_iterator_->position()); } + + // Kernel execution is complete; emit result + RETURN_NOT_OK(listener->OnResult(std::move(preallocation))); } else { - // If we are producing chunked output rather than one big array, then - // emit each chunk as soon as it's available - RETURN_NOT_OK(listener->OnResult(std::move(out))); + // Fully preallocating, but not contiguously + // We preallocate (maybe) only for the output of processing the current + // chunk + while (span_iterator_->Next(&input)) { + ARROW_ASSIGN_OR_RAISE(preallocation, PrepareOutput(input.length)); + output_span->SetMembers(*preallocation); + RETURN_NOT_OK(ExecuteSingleSpan(input, &output)); + // Emit the result for this chunk + RETURN_NOT_OK(listener->OnResult(std::move(preallocation))); + } } return Status::OK(); } - Status PrepareExecute(const std::vector& args) { - RETURN_NOT_OK(this->SetupArgIteration(args)); - - if (output_descr_.shape == ValueDescr::ARRAY) { - // If the executor is configured to produce a single large Array output for - // kernels supporting preallocation, then we do so up front and then - // iterate over slices of that large array. Otherwise, we preallocate prior - // to processing each batch emitted from the ExecBatchIterator - RETURN_NOT_OK(SetupPreallocation(batch_iterator_->length(), args)); + Status ExecuteSingleSpan(const ExecSpan& input, ExecResult* out) { + ArraySpan* result_span = out->array_span(); + if (output_descr_.type->id() == Type::NA) { + result_span->null_count = result_span->length; + } else if (kernel_->null_handling == NullHandling::INTERSECTION) { + if (!elide_validity_bitmap_) { + PropagateNullsSpans(input, result_span); + } + } else if (kernel_->null_handling == NullHandling::OUTPUT_NOT_NULL) { + result_span->null_count = 0; } - return Status::OK(); + return kernel_->exec(kernel_ctx_, input, out); } - // We must accommodate two different modes of execution for preallocated - // execution - // - // * A single large ("contiguous") allocation that we populate with results - // on a chunkwise basis according to the ExecBatchIterator. This permits - // parallelization even if the objective is to obtain a single Array or - // ChunkedArray at the end - // * A standalone buffer preallocation for each chunk emitted from the - // ExecBatchIterator - // - // When data buffer preallocation is not possible (e.g. with BINARY / STRING - // outputs), then contiguous results are only possible if the input is - // contiguous. 
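The comment above describes the two preallocation modes the span-based rewrite has to keep supporting. As a rough caller-side sketch (hypothetical, not part of this patch; it assumes ExecContext's exec_chunksize/preallocate_contiguous knobs referenced above and the "add" scalar function, with an arbitrary chunk size):

#include "arrow/compute/api.h"

// Illustrative only: how contiguous vs. per-chunk preallocation surfaces to a caller.
arrow::Result<arrow::Datum> AddWithChunkedExecution(const arrow::Datum& lhs,
                                                    const arrow::Datum& rhs) {
  arrow::compute::ExecContext ctx;
  ctx.set_exec_chunksize(16 * 1024);     // execute the kernel over 16K-row slices
  ctx.set_preallocate_contiguous(true);  // one output allocation, written slice by slice
  // When contiguous preallocation applies, the wrapped result is a single Array;
  // otherwise each executed slice is emitted separately and wrapped as a ChunkedArray.
  return arrow::compute::CallFunction("add", {lhs, rhs}, &ctx);
}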
- - Status PrepareNextOutput(const ExecBatch& batch, Datum* out) { - if (output_descr_.shape == ValueDescr::ARRAY) { - if (preallocate_contiguous_) { - // The output is already fully preallocated - const int64_t batch_start_position = batch_iterator_->position() - batch.length; - - if (batch.length < batch_iterator_->length()) { - // If this is a partial execution, then we write into a slice of - // preallocated_ - out->value = preallocated_->Slice(batch_start_position, batch.length); - } else { - // Otherwise write directly into preallocated_. The main difference - // computationally (versus the Slice approach) is that the null_count - // may not need to be recomputed in the result - out->value = preallocated_; + Status ExecuteNonSpans(ExecListener* listener) { + // ARROW-16756: Kernel is going to allocate some memory and so + // for the time being we pass in an empty or partially-filled + // shared_ptr or shared_ptr to be populated + // by the kernel. + // + // We will eventually delete the Scalar output path per + // ARROW-16757. + ExecSpan input; + ExecResult output; + while (span_iterator_->Next(&input)) { + if (output_descr_.shape == ValueDescr::ARRAY) { + ARROW_ASSIGN_OR_RAISE(output.value, PrepareOutput(input.length)); + DCHECK(output.is_array_data()); + } else { + // For scalar outputs, we set a null scalar of the correct type to + // communicate the output type to the kernel if needed + // + // XXX: Is there some way to avoid this step? + // TODO: Remove this path in ARROW-16757 + output.value = MakeNullScalar(output_descr_.type); + } + + if (output_descr_.shape == ValueDescr::ARRAY) { + ArrayData* out_arr = output.array_data().get(); + if (output_descr_.type->id() == Type::NA) { + out_arr->null_count = out_arr->length; + } else if (kernel_->null_handling == NullHandling::INTERSECTION) { + RETURN_NOT_OK(PropagateNulls(kernel_ctx_, input, out_arr)); + } else if (kernel_->null_handling == NullHandling::OUTPUT_NOT_NULL) { + out_arr->null_count = 0; } } else { - // We preallocate (maybe) only for the output of processing the current - // batch - ARROW_ASSIGN_OR_RAISE(out->value, PrepareOutput(batch.length)); + // TODO(wesm): to remove, see ARROW-16757 + if (kernel_->null_handling == NullHandling::INTERSECTION) { + // set scalar validity + output.scalar()->is_valid = + std::all_of(input.values.begin(), input.values.end(), + [](const ExecValue& input) { return input.scalar->is_valid; }); + } else if (kernel_->null_handling == NullHandling::OUTPUT_NOT_NULL) { + output.scalar()->is_valid = true; + } + } + + RETURN_NOT_OK(kernel_->exec(kernel_ctx_, input, &output)); + + // Emit a result for each chunk + if (output_descr_.shape == ValueDescr::ARRAY) { + RETURN_NOT_OK(listener->OnResult(output.array_data())); + } else { + RETURN_NOT_OK(listener->OnResult(output.scalar())); } - } else { - // For scalar outputs, we set a null scalar of the correct type to - // communicate the output type to the kernel if needed - // - // XXX: Is there some way to avoid this step? 
- out->value = MakeNullScalar(output_descr_.type); } return Status::OK(); } @@ -780,23 +942,42 @@ class ScalarExecutor : public KernelExecutorImpl { // - Output Array is NullArray // - kernel_->null_handling is COMPUTED_NO_PREALLOCATE or OUTPUT_NOT_NULL validity_preallocated_ = false; + if (out_type_id != Type::NA) { if (kernel_->null_handling == NullHandling::COMPUTED_PREALLOCATE) { // Override the flag if kernel asks for pre-allocation validity_preallocated_ = true; } else if (kernel_->null_handling == NullHandling::INTERSECTION) { - bool are_all_inputs_valid = true; + elide_validity_bitmap_ = true; for (const auto& arg : args) { auto null_gen = NullGeneralization::Get(arg) == NullGeneralization::ALL_VALID; - are_all_inputs_valid = are_all_inputs_valid && null_gen; + + // If not all valid, this becomes false + elide_validity_bitmap_ = elide_validity_bitmap_ && null_gen; } - validity_preallocated_ = !are_all_inputs_valid; + validity_preallocated_ = !elide_validity_bitmap_; + } else if (kernel_->null_handling == NullHandling::OUTPUT_NOT_NULL) { + elide_validity_bitmap_ = true; } } if (kernel_->mem_allocation == MemAllocation::PREALLOCATE) { ComputeDataPreallocate(*output_descr_.type, &data_preallocated_); } + // Validity bitmap either preallocated or elided, and all data + // buffers allocated. This is basically only true for primitive + // types that are not dictionary-encoded + preallocating_all_buffers_ = + ((validity_preallocated_ || elide_validity_bitmap_) && + data_preallocated_.size() == static_cast(output_num_buffers_ - 1) && + !is_nested(out_type_id) && !is_dictionary(out_type_id)); + + // TODO(wesm): why was this check ever here? Fixed width binary + // can be 0-width but anything else? + DCHECK(std::all_of( + data_preallocated_.begin(), data_preallocated_.end(), + [](const BufferPreallocation& prealloc) { return prealloc.bit_width >= 0; })); + // Contiguous preallocation only possible on non-nested types if all // buffers are preallocated. Otherwise, we must go chunk-by-chunk. // @@ -804,26 +985,25 @@ class ScalarExecutor : public KernelExecutorImpl { // kernel's attributes. preallocate_contiguous_ = (exec_context()->preallocate_contiguous() && kernel_->can_write_into_slices && - validity_preallocated_ && !is_nested(out_type_id) && - !is_dictionary(out_type_id) && - data_preallocated_.size() == static_cast(output_num_buffers_ - 1) && - std::all_of(data_preallocated_.begin(), data_preallocated_.end(), - [](const BufferPreallocation& prealloc) { - return prealloc.bit_width >= 0; - })); - if (preallocate_contiguous_) { - ARROW_ASSIGN_OR_RAISE(preallocated_, PrepareOutput(total_length)); - } + preallocating_all_buffers_); return Status::OK(); } + // Used to account for the case where we do not preallocate a + // validity bitmap because the inputs are all non-null and we're + // using NullHandling::INTERSECTION to compute the validity bitmap + bool elide_validity_bitmap_ = false; + + // All memory is preallocated for output, contiguous and + // non-contiguous + bool preallocating_all_buffers_ = false; + // If true, and the kernel and output type supports preallocation (for both // the validity and data buffers), then we allocate one big array and then // iterate through it while executing the kernel in chunks bool preallocate_contiguous_ = false; - // For storing a contiguous preallocation per above. 
Unused otherwise - std::shared_ptr preallocated_; + std::unique_ptr span_iterator_; }; Status PackBatchNoChunks(const std::vector& args, ExecBatch* out) { @@ -888,7 +1068,7 @@ class VectorExecutor : public KernelExecutorImpl { if (kernel_->null_handling == NullHandling::INTERSECTION && output_descr_.shape == ValueDescr::ARRAY) { - RETURN_NOT_OK(PropagateNulls(kernel_ctx_, batch, out.mutable_array())); + RETURN_NOT_OK(PropagateNulls(kernel_ctx_, ExecSpan(batch), out.mutable_array())); } RETURN_NOT_OK(kernel_->exec(kernel_ctx_, batch, &out)); if (!kernel_->finalize) { @@ -913,16 +1093,11 @@ class VectorExecutor : public KernelExecutorImpl { return Status::OK(); } - Status SetupArgIteration(const std::vector& args) override { + Status PrepareExecute(const std::vector& args) { if (kernel_->can_execute_chunkwise) { ARROW_ASSIGN_OR_RAISE(batch_iterator_, ExecBatchIterator::Make( args, exec_context()->exec_chunksize())); } - return Status::OK(); - } - - Status PrepareExecute(const std::vector& args) { - RETURN_NOT_OK(this->SetupArgIteration(args)); output_num_buffers_ = static_cast(output_descr_.type->layout().buffers.size()); // Decide if we need to preallocate memory for this kernel @@ -935,6 +1110,7 @@ class VectorExecutor : public KernelExecutorImpl { return Status::OK(); } + std::unique_ptr batch_iterator_; std::vector results_; }; @@ -947,7 +1123,8 @@ class ScalarAggExecutor : public KernelExecutorImpl { } Status Execute(const std::vector& args, ExecListener* listener) override { - RETURN_NOT_OK(this->SetupArgIteration(args)); + ARROW_ASSIGN_OR_RAISE( + batch_iterator_, ExecBatchIterator::Make(args, exec_context()->exec_chunksize())); ExecBatch batch; while (batch_iterator_->Next(&batch)) { @@ -971,7 +1148,7 @@ class ScalarAggExecutor : public KernelExecutorImpl { private: Status Consume(const ExecBatch& batch) { - // FIXME(ARROW-11840) don't merge *any* aggegates for every batch + // FIXME(ARROW-11840) don't merge *any* aggregates for every batch ARROW_ASSIGN_OR_RAISE( auto batch_state, kernel_->init(kernel_ctx_, {kernel_, *input_descrs_, options_})); @@ -988,6 +1165,7 @@ class ScalarAggExecutor : public KernelExecutorImpl { return Status::OK(); } + std::unique_ptr batch_iterator_; const std::vector* input_descrs_; const FunctionOptions* options_; }; @@ -1004,7 +1182,7 @@ Result> MakeExecutor(ExecContext* ctx, } // namespace -Status PropagateNulls(KernelContext* ctx, const ExecBatch& batch, ArrayData* output) { +Status PropagateNulls(KernelContext* ctx, const ExecSpan& batch, ArrayData* output) { DCHECK_NE(nullptr, output); DCHECK_GT(output->buffers.size(), 0); @@ -1026,6 +1204,67 @@ Status PropagateNulls(KernelContext* ctx, const ExecBatch& batch, ArrayData* out return propagator.Execute(); } +void PropagateNullsSpans(const ExecSpan& batch, ArraySpan* out) { + if (out->type->id() == Type::NA) { + // Null output type is a no-op (rare when this would happen but we at least + // will test for it) + return; + } + + std::vector arrays_with_nulls; + bool is_all_null = false; + for (const ExecValue& value : batch.values) { + auto null_generalization = NullGeneralization::Get(value); + if (null_generalization == NullGeneralization::ALL_NULL) { + is_all_null = true; + } + if (null_generalization != NullGeneralization::ALL_VALID && value.is_array()) { + arrays_with_nulls.push_back(&value.array); + } + } + uint8_t* out_bitmap = out->buffers[0].data; + if (is_all_null) { + // An all-null value (scalar null or all-null array) gives us a short + // circuit opportunity + // OK, the output should be all 
null + out->null_count = out->length; + bit_util::SetBitsTo(out_bitmap, out->offset, out->length, false); + return; + } + + out->null_count = kUnknownNullCount; + if (arrays_with_nulls.empty()) { + // No arrays with nulls case + out->null_count = 0; + if (out_bitmap != nullptr) { + // An output buffer was allocated, so we fill it with all valid + bit_util::SetBitsTo(out_bitmap, out->offset, out->length, true); + } + } else if (arrays_with_nulls.size() == 1) { + // One array + const ArraySpan& arr = *arrays_with_nulls[0]; + + // Reuse the null count if it's known + out->null_count = arr.null_count; + CopyBitmap(arr.buffers[0].data, arr.offset, arr.length, out_bitmap, out->offset); + } else { + // More than one array. We use BitmapAnd to intersect their bitmaps + auto Accumulate = [&](const ArraySpan& left, const ArraySpan& right) { + DCHECK(left.buffers[0].data != nullptr); + DCHECK(right.buffers[0].data != nullptr); + BitmapAnd(left.buffers[0].data, left.offset, right.buffers[0].data, right.offset, + out->length, out->offset, out_bitmap); + }; + // Seed the output bitmap with the & of the first two bitmaps + Accumulate(*arrays_with_nulls[0], *arrays_with_nulls[1]); + + // Accumulate the rest + for (size_t i = 2; i < arrays_with_nulls.size(); ++i) { + Accumulate(*out, *arrays_with_nulls[i]); + } + } +} + std::unique_ptr KernelExecutor::MakeScalar() { return ::arrow::internal::make_unique(); } diff --git a/cpp/src/arrow/compute/exec.h b/cpp/src/arrow/compute/exec.h index 742c3794416..994254ffb70 100644 --- a/cpp/src/arrow/compute/exec.h +++ b/cpp/src/arrow/compute/exec.h @@ -20,6 +20,8 @@ #pragma once +#include +#include #include #include #include @@ -252,6 +254,192 @@ struct ARROW_EXPORT ExecBatch { inline bool operator==(const ExecBatch& l, const ExecBatch& r) { return l.Equals(r); } inline bool operator!=(const ExecBatch& l, const ExecBatch& r) { return !l.Equals(r); } +struct ExecValue { + enum Kind { ARRAY, SCALAR }; + Kind kind = ARRAY; + ArraySpan array; + const Scalar* scalar; + + ExecValue(Scalar* scalar) // NOLINT implicit conversion + : kind(SCALAR), scalar(scalar) {} + + ExecValue(ArraySpan array) // NOLINT implicit conversion + : kind(ARRAY), array(std::move(array)) {} + + ExecValue(const ArrayData& array) // NOLINT implicit conversion + : kind(ARRAY) { + this->array.SetMembers(array); + } + + ExecValue() = default; + ExecValue(const ExecValue& other) = default; + ExecValue& operator=(const ExecValue& other) = default; + ExecValue(ExecValue&& other) = default; + ExecValue& operator=(ExecValue&& other) = default; + + int64_t length() const { return this->is_array() ? this->array.length : 1; } + + bool is_array() const { return this->kind == ARRAY; } + bool is_scalar() const { return this->kind == SCALAR; } + + void SetArray(const ArrayData& array) { + this->kind = ARRAY; + this->array.SetMembers(array); + } + + void SetScalar(const Scalar* scalar) { + this->kind = SCALAR; + this->scalar = scalar; + } + + template + const ExactType& scalar_as() const { + return ::arrow::internal::checked_cast(*this->scalar); + } + + /// XXX: here only temporarily until type resolution can be cleaned + /// up to not use ValueDescr + ValueDescr descr() const { + ValueDescr::Shape shape = this->is_array() ? ValueDescr::ARRAY : ValueDescr::SCALAR; + return ValueDescr(const_cast(this->type())->shared_from_this(), shape); + } + + /// XXX: here temporarily for compatibility with datum, see + /// e.g. 
MakeStructExec in scalar_nested.cc + int64_t null_count() const { + if (this->is_array()) { + return this->array.GetNullCount(); + } else { + return this->scalar->is_valid ? 0 : 1; + } + } + + const DataType* type() const { + if (this->kind == ARRAY) { + return array.type; + } else { + return scalar->type.get(); + } + } +}; + +struct ARROW_EXPORT ExecResult { + // The default value of the variant is ArraySpan + // TODO(wesm): remove Scalar output modality in ARROW-16577 + util::Variant, std::shared_ptr> value; + + int64_t length() const { + if (this->is_array_span()) { + return this->array_span()->length; + } else if (this->is_array_data()) { + return this->array_data()->length; + } else { + // Should not reach here + return 1; + } + } + + const DataType* type() const { + switch (this->value.index()) { + case 0: + return this->array_span()->type; + case 1: + return this->array_data()->type.get(); + default: + // scalar + return this->scalar()->type.get(); + }; + } + + ArraySpan* array_span() const { + return const_cast(&util::get(this->value)); + } + bool is_array_span() const { return this->value.index() == 0; } + + const std::shared_ptr& array_data() const { + return util::get>(this->value); + } + + bool is_array_data() const { return this->value.index() == 1; } + + const std::shared_ptr& scalar() const { + return util::get>(this->value); + } + + bool is_scalar() const { return this->value.index() == 2; } +}; + +/// \brief A "lightweight" column batch object which contains no +/// std::shared_ptr objects and does not have any memory ownership +/// semantics. Can represent a view onto an "owning" ExecBatch. +struct ARROW_EXPORT ExecSpan { + ExecSpan() = default; + ExecSpan(const ExecSpan& other) = default; + ExecSpan& operator=(const ExecSpan& other) = default; + ExecSpan(ExecSpan&& other) = default; + ExecSpan& operator=(ExecSpan&& other) = default; + + explicit ExecSpan(std::vector values, int64_t length) + : length(length), values(std::move(values)) {} + + explicit ExecSpan(const ExecBatch& batch) { + this->length = batch.length; + this->values.resize(batch.values.size()); + for (size_t i = 0; i < batch.values.size(); ++i) { + const Datum& in_value = batch[i]; + ExecValue* out_value = &this->values[i]; + if (in_value.is_array()) { + out_value->SetArray(*in_value.array()); + } else { + out_value->SetScalar(in_value.scalar().get()); + } + } + } + + bool is_all_scalar() const { + return std::all_of(this->values.begin(), this->values.end(), + [](const ExecValue& v) { return v.is_scalar(); }); + } + + /// \brief Return the value at the i-th index + template + inline const ExecValue& operator[](index_type i) const { + return values[i]; + } + + void AddOffset(int64_t offset) { + for (ExecValue& value : values) { + if (value.kind == ExecValue::ARRAY) { + value.array.AddOffset(offset); + } + } + } + + void SetOffset(int64_t offset) { + for (ExecValue& value : values) { + if (value.kind == ExecValue::ARRAY) { + value.array.SetOffset(offset); + } + } + } + + /// \brief A convenience for the number of values / arguments. 
+ int num_values() const { return static_cast(values.size()); } + + // XXX: eliminate the need for ValueDescr; copied temporarily from + // ExecBatch + std::vector GetDescriptors() const { + std::vector result; + for (const auto& value : this->values) { + result.emplace_back(value.descr()); + } + return result; + } + + int64_t length; + std::vector values; +}; + /// \defgroup compute-call-function One-shot calls to compute functions /// /// @{ diff --git a/cpp/src/arrow/compute/exec_internal.h b/cpp/src/arrow/compute/exec_internal.h index 74124f02267..1219c39a2df 100644 --- a/cpp/src/arrow/compute/exec_internal.h +++ b/cpp/src/arrow/compute/exec_internal.h @@ -39,8 +39,9 @@ static constexpr int64_t kDefaultMaxChunksize = std::numeric_limits::ma namespace detail { -/// \brief Break std::vector into a sequence of ExecBatch for kernel -/// execution +/// \brief Break std::vector into a sequence of ExecBatch for +/// kernel execution. The lifetime of the Datum vector must be longer +/// than the lifetime of this object class ARROW_EXPORT ExecBatchIterator { public: /// \brief Construct iterator and do basic argument validation @@ -49,7 +50,7 @@ class ARROW_EXPORT ExecBatchIterator { /// \param[in] max_chunksize the maximum length of each ExecBatch. Depending /// on the chunk layout of ChunkedArray. static Result> Make( - std::vector args, int64_t max_chunksize = kDefaultMaxChunksize); + const std::vector& args, int64_t max_chunksize = kDefaultMaxChunksize); /// \brief Compute the next batch. Always returns at least one batch. Return /// false if the iterator is exhausted @@ -62,9 +63,10 @@ class ARROW_EXPORT ExecBatchIterator { int64_t max_chunksize() const { return max_chunksize_; } private: - ExecBatchIterator(std::vector args, int64_t length, int64_t max_chunksize); + ExecBatchIterator(const std::vector& args, int64_t length, + int64_t max_chunksize); - std::vector args_; + const std::vector& args_; std::vector chunk_indexes_; std::vector chunk_positions_; int64_t position_; @@ -72,6 +74,54 @@ class ARROW_EXPORT ExecBatchIterator { int64_t max_chunksize_; }; +/// \brief Break std::vector into a sequence of non-owning +/// ExecSpan for kernel execution. The lifetime of the Datum vector +/// must be longer than the lifetime of this object +class ARROW_EXPORT ExecSpanIterator { + public: + /// \brief Construct iterator and do basic argument validation + /// + /// \param[in] args the Datum argument, must be all array-like or scalar + /// \param[in] max_chunksize the maximum length of each ExecSpan. Depending + /// on the chunk layout of ChunkedArray. + static Result> Make( + const std::vector& args, int64_t max_chunksize = kDefaultMaxChunksize); + + /// \brief Compute the next span by updating the state of the + /// previous span object. You must keep passing in the previous + /// value for the results to be consistent. If you need to process + /// in parallel, make a copy of the in-use ExecSpan while it's being + /// used by another thread and pass it into Next. This function + /// always populates at least one span. If you call this function + /// with a blank ExecSpan after the first iteration, it will not + /// work correctly (maybe we will change this later). 
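// Usage sketch (illustration only) of the iterator declared here: Next()
// updates the same ExecSpan in place, so the caller keeps passing the same
// span object back in, as the comment above explains. The helper name is
// invented; it assumes the arrow and arrow::compute::detail namespaces from
// this header are in scope.
inline Status CountRows(const std::vector<Datum>& args, int64_t* total_rows) {
  ARROW_ASSIGN_OR_RAISE(std::unique_ptr<ExecSpanIterator> it,
                        ExecSpanIterator::Make(args));
  ExecSpan span;
  *total_rows = 0;
  while (it->Next(&span)) {
    // span is a non-owning view: arrays are ArraySpans, scalars are raw
    // const Scalar* borrowed from the input Datums.
    *total_rows += span.length;
  }
  return Status::OK();
}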
Return false + /// if the iteration is exhausted + bool Next(ExecSpan* span); + + int64_t length() const { return length_; } + int64_t position() const { return position_; } + + private: + ExecSpanIterator(const std::vector& args, int64_t length, int64_t max_chunksize); + + int64_t GetNextChunkSpan(int64_t iteration_size, ExecSpan* span); + + bool initialized_ = false; + bool have_chunked_arrays_ = false; + const std::vector& args_; + std::vector chunk_indexes_; + std::vector value_positions_; + + // Keep track of the array offset in the "active" array (e.g. the + // array or the particular chunk of an array) in each slot, separate + // from the relative position within each chunk (which is in + // value_positions_) + std::vector value_offsets_; + int64_t position_; + int64_t length_; + int64_t max_chunksize_; +}; + // "Push" / listener API like IPC reader so that consumers can receive // processed chunks as soon as they're available. @@ -138,7 +188,10 @@ class ARROW_EXPORT KernelExecutor { /// \param[in] batch the data batch /// \param[in] out the output ArrayData, must not be null ARROW_EXPORT -Status PropagateNulls(KernelContext* ctx, const ExecBatch& batch, ArrayData* out); +Status PropagateNulls(KernelContext* ctx, const ExecSpan& batch, ArrayData* out); + +ARROW_EXPORT +void PropagateNullsSpans(const ExecSpan& batch, ArraySpan* out); } // namespace detail } // namespace compute diff --git a/cpp/src/arrow/compute/exec_test.cc b/cpp/src/arrow/compute/exec_test.cc index b00bcb319c3..c74d24d6ecf 100644 --- a/cpp/src/arrow/compute/exec_test.cc +++ b/cpp/src/arrow/compute/exec_test.cc @@ -90,13 +90,20 @@ TEST(SelectionVector, Basics) { ASSERT_EQ(3, sel_vector->indices()[1]); } +void AssertValidityZeroExtraBits(const uint8_t* data, int64_t length, int64_t offset) { + const int64_t bit_extent = ((offset + length + 7) / 8) * 8; + for (int64_t i = offset + length; i < bit_extent; ++i) { + EXPECT_FALSE(bit_util::GetBit(data, i)) << i; + } +} + +void AssertValidityZeroExtraBits(const ArraySpan& arr) { + return AssertValidityZeroExtraBits(arr.buffers[0].data, arr.length, arr.offset); +} + void AssertValidityZeroExtraBits(const ArrayData& arr) { const Buffer& buf = *arr.buffers[0]; - - const int64_t bit_extent = ((arr.offset + arr.length + 7) / 8) * 8; - for (int64_t i = arr.offset + arr.length; i < bit_extent; ++i) { - EXPECT_FALSE(bit_util::GetBit(buf.data(), i)) << i; - } + return AssertValidityZeroExtraBits(buf.data(), arr.length, arr.offset); } class TestComputeInternals : public ::testing::Test { @@ -137,6 +144,9 @@ class TestComputeInternals : public ::testing::Test { std::unique_ptr rng_; }; +// ---------------------------------------------------------------------- +// Test PropagateNulls + class TestPropagateNulls : public TestComputeInternals {}; TEST_F(TestPropagateNulls, UnknownNullCountWithNullsZeroCopies) { @@ -149,7 +159,7 @@ TEST_F(TestPropagateNulls, UnknownNullCountWithNullsZeroCopies) { ArrayData input(boolean(), length, {nulls, nullptr}, kUnknownNullCount); ExecBatch batch({input}, length); - ASSERT_OK(PropagateNulls(ctx_.get(), batch, &output)); + ASSERT_OK(PropagateNulls(ctx_.get(), ExecSpan(batch), &output)); ASSERT_EQ(nulls.get(), output.buffers[0].get()); ASSERT_EQ(kUnknownNullCount, output.null_count); ASSERT_EQ(9, output.GetNullCount()); @@ -164,7 +174,7 @@ TEST_F(TestPropagateNulls, UnknownNullCountWithoutNulls) { ArrayData input(boolean(), length, {nulls, nullptr}, kUnknownNullCount); ExecBatch batch({input}, length); - ASSERT_OK(PropagateNulls(ctx_.get(), batch, 
&output)); + ASSERT_OK(PropagateNulls(ctx_.get(), ExecSpan(batch), &output)); EXPECT_EQ(-1, output.null_count); EXPECT_EQ(nulls.get(), output.buffers[0].get()); } @@ -185,7 +195,7 @@ TEST_F(TestPropagateNulls, SetAllNulls) { ArrayData output(boolean(), length, buffers); ExecBatch batch(values, length); - ASSERT_OK(PropagateNulls(ctx_.get(), batch, &output)); + ASSERT_OK(PropagateNulls(ctx_.get(), ExecSpan(batch), &output)); if (preallocate) { // Ensure that buffer object the same when we pass in preallocated memory @@ -228,7 +238,7 @@ TEST_F(TestPropagateNulls, SetAllNulls) { // null-scalar earlier in the batch ArrayData output(boolean(), length, {nullptr, nullptr}); ExecBatch batch({MakeNullScalar(boolean()), arr_all_nulls}, length); - ASSERT_OK(PropagateNulls(ctx_.get(), batch, &output)); + ASSERT_OK(PropagateNulls(ctx_.get(), ExecSpan(batch), &output)); ASSERT_EQ(arr_all_nulls->data()->buffers[0].get(), output.buffers[0].get()); } } @@ -260,7 +270,7 @@ TEST_F(TestPropagateNulls, SingleValueWithNulls) { ASSERT_EQ(0, output.offset); } - ASSERT_OK(PropagateNulls(ctx_.get(), batch, &output)); + ASSERT_OK(PropagateNulls(ctx_.get(), ExecSpan(batch), &output)); if (!preallocate) { const Buffer* parent_buf = arr->data()->buffers[0].get(); @@ -308,14 +318,14 @@ TEST_F(TestPropagateNulls, ZeroCopyWhenZeroNullsOnOneInput) { ArrayData output(boolean(), length, {nullptr, nullptr}); ExecBatch batch({some_nulls, no_nulls}, length); - ASSERT_OK(PropagateNulls(ctx_.get(), batch, &output)); + ASSERT_OK(PropagateNulls(ctx_.get(), ExecSpan(batch), &output)); ASSERT_EQ(nulls.get(), output.buffers[0].get()); ASSERT_EQ(9, output.null_count); // Flip order of args output = ArrayData(boolean(), length, {nullptr, nullptr}); batch.values = {no_nulls, no_nulls, some_nulls}; - ASSERT_OK(PropagateNulls(ctx_.get(), batch, &output)); + ASSERT_OK(PropagateNulls(ctx_.get(), ExecSpan(batch), &output)); ASSERT_EQ(nulls.get(), output.buffers[0].get()); ASSERT_EQ(9, output.null_count); @@ -324,7 +334,7 @@ TEST_F(TestPropagateNulls, ZeroCopyWhenZeroNullsOnOneInput) { auto preallocated_mem = std::make_shared(bitmap_data, 2); output.null_count = kUnknownNullCount; output.buffers[0] = preallocated_mem; - ASSERT_OK(PropagateNulls(ctx_.get(), batch, &output)); + ASSERT_OK(PropagateNulls(ctx_.get(), ExecSpan(batch), &output)); ASSERT_EQ(preallocated_mem.get(), output.buffers[0].get()); ASSERT_EQ(9, output.null_count); @@ -366,7 +376,7 @@ TEST_F(TestPropagateNulls, IntersectsNulls) { ArrayData output(boolean(), length, {nulls, nullptr}); output.offset = output_offset; - ASSERT_OK(PropagateNulls(ctx_.get(), batch, &output)); + ASSERT_OK(PropagateNulls(ctx_.get(), ExecSpan(batch), &output)); // Preallocated memory used if (preallocate) { @@ -405,19 +415,257 @@ TEST_F(TestPropagateNulls, NullOutputTypeNoop) { ExecBatch batch({rng_->Boolean(100, 0.5, 0.5)}, length); ArrayData output(null(), length, {nullptr}); - ASSERT_OK(PropagateNulls(ctx_.get(), batch, &output)); + ASSERT_OK(PropagateNulls(ctx_.get(), ExecSpan(batch), &output)); ASSERT_EQ(nullptr, output.buffers[0]); } +// ---------------------------------------------------------------------- +// Test PropagateNullsSpans (new span-based implementation). 
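// Minimal sketch (mirroring the tests below) of calling PropagateNullsSpans
// directly: the caller owns the output validity buffer, wraps it in a
// non-owning ArraySpan, and the function intersects the input bitmaps into
// it. The helper name is invented for illustration.
Result<int64_t> IntersectedNullCount(const ExecSpan& batch,
                                     const std::shared_ptr<DataType>& out_type) {
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> validity,
                        AllocateEmptyBitmap(batch.length));
  ArraySpan out(out_type.get(), batch.length);
  out.SetBuffer(0, validity);  // ArraySpan does not take ownership
  PropagateNullsSpans(batch, &out);
  return out.GetNullCount();
}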
Some of +// the tests above had to be rewritten because the span-based +// implementation does not deal with zero-copy optimizations right now + +class TestPropagateNullsSpans : public TestComputeInternals {}; + +TEST_F(TestPropagateNullsSpans, UnknownNullCountWithNullsZeroCopies) { + const int64_t length = 16; + + const uint8_t validity_bitmap[8] = {254, 0, 0, 0, 0, 0, 0, 0}; + auto nulls = std::make_shared(validity_bitmap, 8); + auto ty = boolean(); + ArrayData input(ty, length, {nulls, nullptr}, kUnknownNullCount); + + uint8_t validity_bitmap2[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + auto nulls2 = std::make_shared(validity_bitmap2, 8); + ArraySpan output(ty.get(), length); + output.buffers[0].data = validity_bitmap2; + output.buffers[0].size = 0; + + ExecSpan span(ExecBatch({input}, length)); + PropagateNullsSpans(span, &output); + ASSERT_EQ(kUnknownNullCount, output.null_count); + ASSERT_EQ(9, output.GetNullCount()); +} + +TEST_F(TestPropagateNullsSpans, UnknownNullCountWithoutNulls) { + const int64_t length = 16; + constexpr uint8_t validity_bitmap[8] = {255, 255, 0, 0, 0, 0, 0, 0}; + auto nulls = std::make_shared(validity_bitmap, 8); + + auto ty = boolean(); + ArrayData input(ty, length, {nulls, nullptr}, kUnknownNullCount); + + uint8_t validity_bitmap2[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + auto nulls2 = std::make_shared(validity_bitmap2, 8); + ArraySpan output(ty.get(), length); + output.buffers[0].data = validity_bitmap2; + output.buffers[0].size = 0; + + ExecSpan span(ExecBatch({input}, length)); + PropagateNullsSpans(span, &output); + ASSERT_EQ(kUnknownNullCount, output.null_count); + ASSERT_EQ(0, output.GetNullCount()); +} + +TEST_F(TestPropagateNullsSpans, SetAllNulls) { + const int64_t length = 16; + + auto CheckSetAllNull = [&](std::vector values) { + // Make fresh bitmap with all 1's + uint8_t bitmap_data[2] = {255, 255}; + auto buf = std::make_shared(bitmap_data, 2); + + auto ty = boolean(); + ArraySpan output(ty.get(), length); + output.SetBuffer(0, buf); + + ExecSpan span(ExecBatch(values, length)); + PropagateNullsSpans(span, &output); + + uint8_t expected[2] = {0, 0}; + ASSERT_EQ(0, std::memcmp(output.buffers[0].data, expected, output.buffers[0].size)); + }; + + // There is a null scalar + std::shared_ptr i32_val = std::make_shared(3); + std::vector vals = {i32_val, MakeNullScalar(boolean())}; + CheckSetAllNull(vals); + + const double true_prob = 0.5; + vals[0] = rng_->Boolean(length, true_prob); + CheckSetAllNull(vals); + + auto arr_all_nulls = rng_->Boolean(length, true_prob, /*null_probability=*/1); + + // One value is all null + vals = {rng_->Boolean(length, true_prob, /*null_probability=*/0.5), arr_all_nulls}; + CheckSetAllNull(vals); + + // A value is NullType + std::shared_ptr null_arr = std::make_shared(length); + vals = {rng_->Boolean(length, true_prob), null_arr}; + CheckSetAllNull(vals); +} + +TEST_F(TestPropagateNullsSpans, SingleValueWithNulls) { + // Input offset is non-zero (0 mod 8 and nonzero mod 8 cases) + const int64_t length = 100; + auto arr = rng_->Boolean(length, 0.5, /*null_probability=*/0.5); + + auto CheckSliced = [&](int64_t offset, int64_t out_offset = 0) { + // Unaligned bitmap, zero copy not possible + auto sliced = arr->Slice(offset); + std::vector vals = {sliced}; + + auto ty = boolean(); + ArraySpan output(ty.get(), vals[0].length()); + output.offset = out_offset; + + std::shared_ptr preallocated_bitmap; + ASSERT_OK_AND_ASSIGN( + preallocated_bitmap, + AllocateBuffer(bit_util::BytesForBits(sliced->length() + out_offset))); + 
std::memset(preallocated_bitmap->mutable_data(), 0, preallocated_bitmap->size()); + output.SetBuffer(0, preallocated_bitmap); + + ExecBatch batch(vals, vals[0].length()); + PropagateNullsSpans(ExecSpan(batch), &output); + ASSERT_EQ(arr->Slice(offset)->null_count(), output.GetNullCount()); + ASSERT_TRUE(BitmapEquals(output.buffers[0].data, output.offset, + sliced->null_bitmap_data(), sliced->offset(), + output.length)); + AssertValidityZeroExtraBits(output); + }; + + CheckSliced(8); + CheckSliced(7); + CheckSliced(8, /*offset=*/4); + CheckSliced(7, /*offset=*/4); +} + +TEST_F(TestPropagateNullsSpans, CasesThatUsedToBeZeroCopy) { + // ARROW-16576: testing behaviors that used to be zero copy but are + // not anymore + const int64_t length = 16; + + auto ty = boolean(); + constexpr uint8_t validity_bitmap[8] = {254, 0, 0, 0, 0, 0, 0, 0}; + auto nulls = std::make_shared(validity_bitmap, 8); + + ArraySpan some_nulls(ty.get(), length); + some_nulls.SetBuffer(0, nulls); + some_nulls.null_count = 9; + + ArraySpan no_nulls(ty.get(), length); + no_nulls.null_count = 0; + + { + uint8_t bitmap_data[2] = {0, 0}; + auto preallocated_mem = std::make_shared(bitmap_data, 2); + + ArraySpan output(ty.get(), length); + output.SetBuffer(0, preallocated_mem); + PropagateNullsSpans(ExecSpan({some_nulls, no_nulls}, length), &output); + ASSERT_EQ( + 0, std::memcmp(output.buffers[0].data, validity_bitmap, output.buffers[0].size)); + ASSERT_EQ(output.buffers[0].owner, &preallocated_mem); + ASSERT_EQ(9, output.GetNullCount()); + } + + // Flip order of args + { + uint8_t bitmap_data[2] = {0, 0}; + auto preallocated_mem = std::make_shared(bitmap_data, 2); + + ArraySpan output(ty.get(), length); + output.SetBuffer(0, preallocated_mem); + PropagateNullsSpans(ExecSpan({no_nulls, no_nulls, some_nulls}, length), &output); + ASSERT_EQ( + 0, std::memcmp(output.buffers[0].data, validity_bitmap, output.buffers[0].size)); + ASSERT_EQ(output.buffers[0].owner, &preallocated_mem); + ASSERT_EQ(9, output.GetNullCount()); + } +} + +TEST_F(TestPropagateNullsSpans, IntersectsNulls) { + const int64_t length = 16; + + // 0b01111111 0b11001111 + constexpr uint8_t bitmap1[8] = {127, 207, 0, 0, 0, 0, 0, 0}; + auto buffer1 = std::make_shared(bitmap1, 8); + + // 0b11111110 0b01111111 + constexpr uint8_t bitmap2[8] = {254, 127, 0, 0, 0, 0, 0, 0}; + auto buffer2 = std::make_shared(bitmap2, 8); + + // 0b11101111 0b11111110 + constexpr uint8_t bitmap3[8] = {239, 254, 0, 0, 0, 0, 0, 0}; + auto buffer3 = std::make_shared(bitmap3, 8); + + auto ty = boolean(); + + ArraySpan arr1(ty.get(), length); + arr1.SetBuffer(0, buffer1); + + ArraySpan arr2(ty.get(), length); + arr2.SetBuffer(0, buffer2); + + ArraySpan arr3(ty.get(), length); + arr3.SetBuffer(0, buffer3); + + auto CheckCase = [&](std::vector values, int64_t ex_null_count, + const uint8_t* ex_bitmap, int64_t output_offset = 0) { + ExecSpan batch(values, length); + + std::shared_ptr nulls; + // Make the buffer one byte bigger so we can have non-zero offsets + ASSERT_OK_AND_ASSIGN(nulls, AllocateBuffer(3)); + std::memset(nulls->mutable_data(), 0, nulls->size()); + + ArraySpan output(ty.get(), length); + output.SetBuffer(0, nulls); + output.offset = output_offset; + + PropagateNullsSpans(batch, &output); + ASSERT_EQ(&nulls, output.buffers[0].owner); + EXPECT_EQ(kUnknownNullCount, output.null_count); + EXPECT_EQ(ex_null_count, output.GetNullCount()); + ASSERT_TRUE(BitmapEquals(output.buffers[0].data, output_offset, ex_bitmap, + /*ex_offset=*/0, length)); + + // Now check that the rest of the bits in 
out_buffer are still 0 + AssertValidityZeroExtraBits(output); + }; + + // 0b01101110 0b01001110 + uint8_t expected1[2] = {110, 78}; + CheckCase({arr1, arr2, arr3}, 7, expected1); + CheckCase({arr1, arr2, arr3}, 7, expected1, /*output_offset=*/4); + + // 0b01111110 0b01001111 + uint8_t expected2[2] = {126, 79}; + CheckCase({arr1, arr2}, 5, expected2, /*output_offset=*/4); +} + +TEST_F(TestPropagateNullsSpans, NullOutputTypeNoop) { + // Ensure we leave the buffers alone when the output type is null() + // TODO(wesm): is this test useful? Can probably delete + const int64_t length = 100; + ExecBatch batch({rng_->Boolean(100, 0.5, 0.5)}, length); + + auto ty = null(); + ArraySpan result(ty.get(), length); + PropagateNullsSpans(ExecSpan(batch), &result); + ASSERT_EQ(nullptr, result.buffers[0].data); +} + // ---------------------------------------------------------------------- // ExecBatchIterator class TestExecBatchIterator : public TestComputeInternals { public: - void SetupIterator(std::vector args, + void SetupIterator(const std::vector& args, int64_t max_chunksize = kDefaultMaxChunksize) { - ASSERT_OK_AND_ASSIGN(iterator_, - ExecBatchIterator::Make(std::move(args), max_chunksize)); + ASSERT_OK_AND_ASSIGN(iterator_, ExecBatchIterator::Make(args, max_chunksize)); } void CheckIteration(const std::vector& args, int chunksize, const std::vector& ex_batch_sizes) { @@ -540,59 +788,192 @@ TEST_F(TestExecBatchIterator, ZeroLengthInputs) { CheckArgs(args); } +// ---------------------------------------------------------------------- +// ExecSpanIterator tests + +class TestExecSpanIterator : public TestComputeInternals { + public: + void SetupIterator(const std::vector& args, + int64_t max_chunksize = kDefaultMaxChunksize) { + ASSERT_OK_AND_ASSIGN(iterator_, ExecSpanIterator::Make(args, max_chunksize)); + } + void CheckIteration(const std::vector& args, int chunksize, + const std::vector& ex_batch_sizes) { + SetupIterator(args, chunksize); + ExecSpan batch; + int64_t position = 0; + for (size_t i = 0; i < ex_batch_sizes.size(); ++i) { + ASSERT_EQ(position, iterator_->position()); + ASSERT_TRUE(iterator_->Next(&batch)); + ASSERT_EQ(ex_batch_sizes[i], batch.length); + + for (size_t j = 0; j < args.size(); ++j) { + switch (args[j].kind()) { + case Datum::SCALAR: + ASSERT_TRUE(args[j].scalar()->Equals(*batch[j].scalar)); + break; + case Datum::ARRAY: + AssertArraysEqual(*args[j].make_array()->Slice(position, batch.length), + *batch[j].array.ToArray()); + break; + case Datum::CHUNKED_ARRAY: { + const ChunkedArray& carr = *args[j].chunked_array(); + if (batch.length == 0) { + ASSERT_EQ(0, carr.length()); + } else { + auto arg_slice = carr.Slice(position, batch.length); + // The sliced ChunkedArrays should only ever be 1 chunk + ASSERT_EQ(1, arg_slice->num_chunks()); + AssertArraysEqual(*arg_slice->chunk(0), *batch[j].array.ToArray()); + } + } break; + default: + break; + } + } + position += ex_batch_sizes[i]; + } + // Ensure that the iterator is exhausted + ASSERT_FALSE(iterator_->Next(&batch)); + + ASSERT_EQ(iterator_->length(), iterator_->position()); + } + + protected: + std::unique_ptr iterator_; +}; + +TEST_F(TestExecSpanIterator, Basics) { + const int64_t length = 100; + + // Simple case with a single chunk + std::vector args = {Datum(GetInt32Array(length)), Datum(GetFloat64Array(length)), + Datum(std::make_shared(3))}; + SetupIterator(args); + + ExecSpan batch; + ASSERT_TRUE(iterator_->Next(&batch)); + ASSERT_EQ(3, batch.values.size()); + ASSERT_EQ(3, batch.num_values()); + ASSERT_EQ(length, 
batch.length); + + AssertArraysEqual(*args[0].make_array(), *batch[0].array.ToArray()); + AssertArraysEqual(*args[1].make_array(), *batch[1].array.ToArray()); + ASSERT_TRUE(args[2].scalar()->Equals(*batch[2].scalar)); + + ASSERT_EQ(length, iterator_->position()); + ASSERT_FALSE(iterator_->Next(&batch)); + + // Split into chunks of size 16 + CheckIteration(args, /*chunksize=*/16, {16, 16, 16, 16, 16, 16, 4}); +} + +TEST_F(TestExecSpanIterator, InputValidation) { + std::vector args = {Datum(GetInt32Array(10)), Datum(GetInt32Array(9))}; + ASSERT_RAISES(Invalid, ExecSpanIterator::Make(args)); + + args = {Datum(GetInt32Array(9)), Datum(GetInt32Array(10))}; + ASSERT_RAISES(Invalid, ExecSpanIterator::Make(args)); + + args = {Datum(GetInt32Array(10))}; + ASSERT_OK_AND_ASSIGN(auto iterator, ExecSpanIterator::Make(args)); +} + +TEST_F(TestExecSpanIterator, ChunkedArrays) { + std::vector args = {Datum(GetInt32Chunked({0, 20, 10})), + Datum(GetInt32Chunked({15, 15})), Datum(GetInt32Array(30)), + Datum(std::make_shared(5)), + Datum(MakeNullScalar(boolean()))}; + + CheckIteration(args, /*chunksize=*/10, {10, 5, 5, 10}); + CheckIteration(args, /*chunksize=*/20, {15, 5, 10}); + CheckIteration(args, /*chunksize=*/30, {15, 5, 10}); +} + +TEST_F(TestExecSpanIterator, ZeroLengthInputs) { + auto carr = std::shared_ptr(new ChunkedArray({}, int32())); + + auto CheckArgs = [&](const std::vector& args) { + auto iterator = ExecSpanIterator::Make(args).ValueOrDie(); + ExecSpan batch; + ASSERT_FALSE(iterator->Next(&batch)); + }; + + // Zero-length ChunkedArray with zero chunks + std::vector args = {Datum(carr)}; + CheckArgs(args); + + // Zero-length array + args = {Datum(GetInt32Array(0))}; + CheckArgs(args); + + // ChunkedArray with single empty chunk + args = {Datum(GetInt32Chunked({0}))}; + CheckArgs(args); +} + // ---------------------------------------------------------------------- // Scalar function execution -Status ExecCopy(KernelContext*, const ExecBatch& batch, Datum* out) { +Status ExecCopyArrayData(KernelContext*, const ExecSpan& batch, ExecResult* out) { DCHECK_EQ(1, batch.num_values()); - const auto& type = checked_cast(*batch[0].type()); - int value_size = type.bit_width() / 8; + int value_size = batch[0].type()->byte_width(); - const ArrayData& arg0 = *batch[0].array(); - ArrayData* out_arr = out->mutable_array(); + const ArraySpan& arg0 = batch[0].array; + ArrayData* out_arr = out->array_data().get(); uint8_t* dst = out_arr->buffers[1]->mutable_data() + out_arr->offset * value_size; - const uint8_t* src = arg0.buffers[1]->data() + arg0.offset * value_size; + const uint8_t* src = arg0.buffers[1].data + arg0.offset * value_size; std::memcpy(dst, src, batch.length * value_size); return Status::OK(); } -Status ExecComputedBitmap(KernelContext* ctx, const ExecBatch& batch, Datum* out) { +Status ExecCopyArraySpan(KernelContext*, const ExecSpan& batch, ExecResult* out) { + DCHECK_EQ(1, batch.num_values()); + int value_size = batch[0].type()->byte_width(); + const ArraySpan& arg0 = batch[0].array; + ArraySpan* out_arr = out->array_span(); + uint8_t* dst = out_arr->buffers[1].data + out_arr->offset * value_size; + const uint8_t* src = arg0.buffers[1].data + arg0.offset * value_size; + std::memcpy(dst, src, batch.length * value_size); + return Status::OK(); +} + +Status ExecComputedBitmap(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { // Propagate nulls not used. 
Check that the out bitmap isn't the same already // as the input bitmap - const ArrayData& arg0 = *batch[0].array(); - ArrayData* out_arr = out->mutable_array(); - - if (CountSetBits(arg0.buffers[0]->data(), arg0.offset, batch.length) > 0) { + const ArraySpan& arg0 = batch[0].array; + ArraySpan* out_arr = out->array_span(); + if (CountSetBits(arg0.buffers[0].data, arg0.offset, batch.length) > 0) { // Check that the bitmap has not been already copied over - DCHECK(!BitmapEquals(arg0.buffers[0]->data(), arg0.offset, - out_arr->buffers[0]->data(), out_arr->offset, batch.length)); + DCHECK(!BitmapEquals(arg0.buffers[0].data, arg0.offset, out_arr->buffers[0].data, + out_arr->offset, batch.length)); } - CopyBitmap(arg0.buffers[0]->data(), arg0.offset, batch.length, - out_arr->buffers[0]->mutable_data(), out_arr->offset); - return ExecCopy(ctx, batch, out); + CopyBitmap(arg0.buffers[0].data, arg0.offset, batch.length, out_arr->buffers[0].data, + out_arr->offset); + return ExecCopyArraySpan(ctx, batch, out); } -Status ExecNoPreallocatedData(KernelContext* ctx, const ExecBatch& batch, Datum* out) { +Status ExecNoPreallocatedData(KernelContext* ctx, const ExecSpan& batch, + ExecResult* out) { // Validity preallocated, but not the data - ArrayData* out_arr = out->mutable_array(); + ArrayData* out_arr = out->array_data().get(); DCHECK_EQ(0, out_arr->offset); - const auto& type = checked_cast(*batch[0].type()); - int value_size = type.bit_width() / 8; + int value_size = batch[0].type()->byte_width(); Status s = (ctx->Allocate(out_arr->length * value_size).Value(&out_arr->buffers[1])); DCHECK_OK(s); - return ExecCopy(ctx, batch, out); + return ExecCopyArrayData(ctx, batch, out); } -Status ExecNoPreallocatedAnything(KernelContext* ctx, const ExecBatch& batch, - Datum* out) { +Status ExecNoPreallocatedAnything(KernelContext* ctx, const ExecSpan& batch, + ExecResult* out) { // Neither validity nor data preallocated - ArrayData* out_arr = out->mutable_array(); + ArrayData* out_arr = out->array_data().get(); DCHECK_EQ(0, out_arr->offset); Status s = (ctx->AllocateBitmap(out_arr->length).Value(&out_arr->buffers[0])); DCHECK_OK(s); - const ArrayData& arg0 = *batch[0].array(); - CopyBitmap(arg0.buffers[0]->data(), arg0.offset, batch.length, + const ArraySpan& arg0 = batch[0].array; + CopyBitmap(arg0.buffers[0].data, arg0.offset, batch.length, out_arr->buffers[0]->mutable_data(), /*offset=*/0); // Reuse the kernel that allocates the data @@ -638,22 +1019,23 @@ Result> InitStateful(KernelContext*, return std::unique_ptr(new ExampleState{func_options->value}); } -Status ExecStateful(KernelContext* ctx, const ExecBatch& batch, Datum* out) { +Status ExecStateful(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { // We take the value from the state and multiply the data in batch[0] with it ExampleState* state = static_cast(ctx->state()); int32_t multiplier = checked_cast(*state->value).value; - const ArrayData& arg0 = *batch[0].array(); - ArrayData* out_arr = out->mutable_array(); + const ArraySpan& arg0 = batch[0].array; + ArraySpan* out_arr = out->array_span(); const int32_t* arg0_data = arg0.GetValues(1); - int32_t* dst = out_arr->GetMutableValues(1); + int32_t* dst = out_arr->GetValues(1); for (int64_t i = 0; i < arg0.length; ++i) { dst[i] = arg0_data[i] * multiplier; } return Status::OK(); } -Status ExecAddInt32(KernelContext* ctx, const ExecBatch& batch, Datum* out) { +// TODO: remove this / refactor it in ARROW-16577 +Status ExecAddInt32(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { 
const Int32Scalar& arg0 = batch[0].scalar_as(); const Int32Scalar& arg1 = batch[1].scalar_as(); out->value = std::make_shared(arg0.value + arg1.value); @@ -685,9 +1067,10 @@ class TestCallScalarFunction : public TestComputeInternals { /*doc=*/FunctionDoc::Empty()); // Add a few kernels. Our implementation only accepts arrays - ASSERT_OK(func->AddKernel({InputType::Array(uint8())}, uint8(), ExecCopy)); - ASSERT_OK(func->AddKernel({InputType::Array(int32())}, int32(), ExecCopy)); - ASSERT_OK(func->AddKernel({InputType::Array(float64())}, float64(), ExecCopy)); + ASSERT_OK(func->AddKernel({InputType::Array(uint8())}, uint8(), ExecCopyArraySpan)); + ASSERT_OK(func->AddKernel({InputType::Array(int32())}, int32(), ExecCopyArraySpan)); + ASSERT_OK( + func->AddKernel({InputType::Array(float64())}, float64(), ExecCopyArraySpan)); ASSERT_OK(registry->AddFunction(func)); // A version which doesn't want the executor to call PropagateNulls @@ -767,7 +1150,7 @@ TEST_F(TestCallScalarFunction, ArgumentValidation) { TEST_F(TestCallScalarFunction, PreallocationCases) { double null_prob = 0.2; - auto arr = GetUInt8Array(1000, null_prob); + auto arr = GetUInt8Array(100, null_prob); auto CheckFunction = [&](std::string func_name) { ResetContexts(); @@ -792,7 +1175,7 @@ TEST_F(TestCallScalarFunction, PreallocationCases) { { // Chunksize not multiple of 8 std::vector args = {Datum(arr)}; - exec_ctx_->set_exec_chunksize(111); + exec_ctx_->set_exec_chunksize(11); ASSERT_OK_AND_ASSIGN(Datum result, CallFunction(func_name, args, exec_ctx_.get())); AssertArraysEqual(*arr, *result.make_array()); } @@ -800,7 +1183,7 @@ TEST_F(TestCallScalarFunction, PreallocationCases) { // Input is chunked, output has one big chunk { auto carr = std::shared_ptr( - new ChunkedArray({arr->Slice(0, 100), arr->Slice(100)})); + new ChunkedArray({arr->Slice(0, 10), arr->Slice(10)})); std::vector args = {Datum(carr)}; ASSERT_OK_AND_ASSIGN(Datum result, CallFunction(func_name, args, exec_ctx_.get())); std::shared_ptr actual = result.chunked_array(); @@ -812,14 +1195,14 @@ TEST_F(TestCallScalarFunction, PreallocationCases) { { std::vector args = {Datum(arr)}; exec_ctx_->set_preallocate_contiguous(false); - exec_ctx_->set_exec_chunksize(400); + exec_ctx_->set_exec_chunksize(40); ASSERT_OK_AND_ASSIGN(Datum result, CallFunction(func_name, args, exec_ctx_.get())); ASSERT_EQ(Datum::CHUNKED_ARRAY, result.kind()); const ChunkedArray& carr = *result.chunked_array(); ASSERT_EQ(3, carr.num_chunks()); - AssertArraysEqual(*arr->Slice(0, 400), *carr.chunk(0)); - AssertArraysEqual(*arr->Slice(400, 400), *carr.chunk(1)); - AssertArraysEqual(*arr->Slice(800), *carr.chunk(2)); + AssertArraysEqual(*arr->Slice(0, 40), *carr.chunk(0)); + AssertArraysEqual(*arr->Slice(40, 40), *carr.chunk(1)); + AssertArraysEqual(*arr->Slice(80), *carr.chunk(2)); } }; diff --git a/cpp/src/arrow/compute/function.cc b/cpp/src/arrow/compute/function.cc index 2b3d4e6feb9..d2b36f0080d 100644 --- a/cpp/src/arrow/compute/function.cc +++ b/cpp/src/arrow/compute/function.cc @@ -314,7 +314,7 @@ Status Function::Validate() const { } Status ScalarFunction::AddKernel(std::vector in_types, OutputType out_type, - ArrayKernelExec exec, KernelInit init) { + ScalarKernel::ExecFunc exec, KernelInit init) { RETURN_NOT_OK(CheckArity(in_types)); if (arity_.is_varargs && in_types.size() != 1) { @@ -336,7 +336,7 @@ Status ScalarFunction::AddKernel(ScalarKernel kernel) { } Status VectorFunction::AddKernel(std::vector in_types, OutputType out_type, - ArrayKernelExec exec, KernelInit init) { + 
KernelBatchExec exec, KernelInit init) { RETURN_NOT_OK(CheckArity(in_types)); if (arity_.is_varargs && in_types.size() != 1) { diff --git a/cpp/src/arrow/compute/function.h b/cpp/src/arrow/compute/function.h index face491690f..91696b84fa2 100644 --- a/cpp/src/arrow/compute/function.h +++ b/cpp/src/arrow/compute/function.h @@ -314,7 +314,7 @@ class ARROW_EXPORT ScalarFunction : public detail::FunctionImpl { /// initialization, preallocation for fixed-width types, and default null /// handling (intersect validity bitmaps of inputs). Status AddKernel(std::vector in_types, OutputType out_type, - ArrayKernelExec exec, KernelInit init = NULLPTR); + ScalarKernel::ExecFunc exec, KernelInit init = NULLPTR); /// \brief Add a kernel (function implementation). Returns error if the /// kernel's signature does not match the function's arity. @@ -338,7 +338,7 @@ class ARROW_EXPORT VectorFunction : public detail::FunctionImpl { /// state initialization, no data preallocation, and no preallocation of the /// validity bitmap. Status AddKernel(std::vector in_types, OutputType out_type, - ArrayKernelExec exec, KernelInit init = NULLPTR); + KernelBatchExec exec, KernelInit init = NULLPTR); /// \brief Add a kernel (function implementation). Returns error if the /// kernel's signature does not match the function's arity. diff --git a/cpp/src/arrow/compute/function_benchmark.cc b/cpp/src/arrow/compute/function_benchmark.cc index a29a766be79..b508ad047fb 100644 --- a/cpp/src/arrow/compute/function_benchmark.cc +++ b/cpp/src/arrow/compute/function_benchmark.cc @@ -85,13 +85,15 @@ void BM_CastDispatchBaseline(benchmark::State& state) { .ValueOrDie(); kernel_context.SetState(cast_state.get()); + ExecSpan input; + input.length = 1; for (auto _ : state) { - Datum timestamp_scalar = MakeNullScalar(double_type); - for (Datum int_scalar : int_scalars) { - ABORT_NOT_OK( - exec(&kernel_context, {{std::move(int_scalar)}, 1}, ×tamp_scalar)); + ExecResult result; + result.value = MakeNullScalar(double_type); + for (const std::shared_ptr& int_scalar : int_scalars) { + input.values = {ExecValue(int_scalar.get())}; + ABORT_NOT_OK(exec(&kernel_context, input, &result)); } - benchmark::DoNotOptimize(timestamp_scalar); } state.SetItemsProcessed(state.iterations() * kScalarCount); @@ -162,11 +164,15 @@ void BM_ExecuteScalarKernelOnScalar(benchmark::State& state) { ExecContext exec_context; KernelContext kernel_context(&exec_context); + ExecSpan input; + input.length = 1; for (auto _ : state) { int64_t total = 0; - for (const auto& scalar : scalars) { - Datum result{MakeNullScalar(int64())}; - ABORT_NOT_OK(exec(&kernel_context, ExecBatch{{scalar}, /*length=*/1}, &result)); + for (const std::shared_ptr& scalar : scalars) { + ExecResult result; + result.value = MakeNullScalar(int64()); + input.values = {scalar.get()}; + ABORT_NOT_OK(exec(&kernel_context, input, &result)); total += result.scalar()->is_valid; } benchmark::DoNotOptimize(total); diff --git a/cpp/src/arrow/compute/function_test.cc b/cpp/src/arrow/compute/function_test.cc index 94e86c7bd57..ec5f3bc170c 100644 --- a/cpp/src/arrow/compute/function_test.cc +++ b/cpp/src/arrow/compute/function_test.cc @@ -210,12 +210,16 @@ TEST(VectorFunction, Basics) { ASSERT_EQ(Function::VECTOR, varargs_func.kind()); } -auto ExecNYI = [](KernelContext* ctx, const ExecBatch& args, Datum* out) { +auto ExecNYI = [](KernelContext* ctx, const ExecSpan& args, ExecResult* out) { return Status::NotImplemented("NYI"); }; -template -void CheckAddDispatch(FunctionType* func) { +auto ExecNYIOld = 
[](KernelContext* ctx, const ExecBatch& args, Datum* out) { + return Status::NotImplemented("NYI"); +}; + +template +void CheckAddDispatch(FunctionType* func, ExecType exec) { using KernelType = typename FunctionType::KernelType; ASSERT_EQ(0, func->num_kernels()); @@ -224,29 +228,29 @@ void CheckAddDispatch(FunctionType* func) { std::vector in_types1 = {int32(), int32()}; OutputType out_type1 = int32(); - ASSERT_OK(func->AddKernel(in_types1, out_type1, ExecNYI)); - ASSERT_OK(func->AddKernel({int32(), int8()}, int32(), ExecNYI)); + ASSERT_OK(func->AddKernel(in_types1, out_type1, exec)); + ASSERT_OK(func->AddKernel({int32(), int8()}, int32(), exec)); // Duplicate sig is okay - ASSERT_OK(func->AddKernel(in_types1, out_type1, ExecNYI)); + ASSERT_OK(func->AddKernel(in_types1, out_type1, exec)); // Add given a descr - KernelType descr({float64(), float64()}, float64(), ExecNYI); + KernelType descr({float64(), float64()}, float64(), exec); ASSERT_OK(func->AddKernel(descr)); ASSERT_EQ(4, func->num_kernels()); ASSERT_EQ(4, func->kernels().size()); // Try adding some invalid kernels - ASSERT_RAISES(Invalid, func->AddKernel({}, int32(), ExecNYI)); - ASSERT_RAISES(Invalid, func->AddKernel({int32()}, int32(), ExecNYI)); - ASSERT_RAISES(Invalid, func->AddKernel({int8(), int8(), int8()}, int32(), ExecNYI)); + ASSERT_RAISES(Invalid, func->AddKernel({}, int32(), exec)); + ASSERT_RAISES(Invalid, func->AddKernel({int32()}, int32(), exec)); + ASSERT_RAISES(Invalid, func->AddKernel({int8(), int8(), int8()}, int32(), exec)); // Add valid and invalid kernel using kernel struct directly - KernelType valid_kernel({boolean(), boolean()}, boolean(), ExecNYI); + KernelType valid_kernel({boolean(), boolean()}, boolean(), exec); ASSERT_OK(func->AddKernel(valid_kernel)); - KernelType invalid_kernel({boolean()}, boolean(), ExecNYI); + KernelType invalid_kernel({boolean()}, boolean(), exec); ASSERT_RAISES(Invalid, func->AddKernel(invalid_kernel)); ASSERT_OK_AND_ASSIGN(const Kernel* kernel, func->DispatchExact({int32(), int32()})); @@ -265,8 +269,10 @@ TEST(ScalarVectorFunction, DispatchExact) { ScalarFunction func1("scalar_test", Arity::Binary(), /*doc=*/FunctionDoc::Empty()); VectorFunction func2("vector_test", Arity::Binary(), /*doc=*/FunctionDoc::Empty()); - CheckAddDispatch(&func1); - CheckAddDispatch(&func2); + CheckAddDispatch(&func1, ExecNYI); + + // ARROW-16576: will migrate later to new span-based kernel exec API + CheckAddDispatch(&func2, ExecNYIOld); } TEST(ArrayFunction, VarArgs) { diff --git a/cpp/src/arrow/compute/kernel.h b/cpp/src/arrow/compute/kernel.h index 6b1f23e78df..3a0fc2ccd64 100644 --- a/cpp/src/arrow/compute/kernel.h +++ b/cpp/src/arrow/compute/kernel.h @@ -83,16 +83,6 @@ class ARROW_EXPORT KernelContext { KernelState* state_ = NULLPTR; }; -/// \brief The standard kernel execution API that must be implemented for -/// SCALAR and VECTOR kernel types. This includes both stateless and stateful -/// kernels. Kernels depending on some execution state access that state via -/// subclasses of KernelState set on the KernelContext object. May be used for -/// SCALAR and VECTOR kernel kinds. Implementations should endeavor to write -/// into pre-allocated memory if they are able, though for some kernels -/// (e.g. in cases when a builder like StringBuilder) must be employed this may -/// not be possible. -using ArrayKernelExec = std::function; - /// \brief An type-checking interface to permit customizable validation rules /// for use with InputType and KernelSignature. 
This is for scenarios where the /// acceptance is not an exact type instance, such as a TIMESTAMP type for a @@ -486,7 +476,7 @@ struct MemAllocation { struct Kernel; -/// \brief Arguments to pass to a KernelInit function. A struct is used to help +/// \brief Arguments to pass to an KernelInit function. A struct is used to help /// avoid API breakage should the arguments passed need to be expanded. struct KernelInitArgs { /// \brief A pointer to the kernel being initialized. The init function may @@ -548,19 +538,26 @@ struct Kernel { SimdLevel::type simd_level = SimdLevel::NONE; }; -/// \brief Common kernel base data structure for ScalarKernel and -/// VectorKernel. It is called "ArrayKernel" in that the functions generally -/// output array values (as opposed to scalar values in the case of aggregate -/// functions). -struct ArrayKernel : public Kernel { - ArrayKernel() = default; - - ArrayKernel(std::shared_ptr sig, ArrayKernelExec exec, - KernelInit init = NULLPTR) +/// \brief Kernel data structure for implementations of ScalarFunction. In +/// addition to the members found in Kernel, contains the null handling +/// and memory pre-allocation preferences. +struct ScalarKernel : public Kernel { + /// \brief The scalar kernel execution API that must be implemented for SCALAR + /// kernel types. This includes both stateless and stateful kernels. Kernels + /// depending on some execution state access that state via subclasses of + /// KernelState set on the KernelContext object. Implementations should + /// endeavor to write into pre-allocated memory if they are able, though for + /// some kernels (e.g. in cases when a builder like StringBuilder) must be + /// employed this may not be possible. + using ExecFunc = std::function; + ScalarKernel() = default; + + ScalarKernel(std::shared_ptr sig, ExecFunc exec, + KernelInit init = NULLPTR) : Kernel(std::move(sig), init), exec(std::move(exec)) {} - ArrayKernel(std::vector in_types, OutputType out_type, ArrayKernelExec exec, - KernelInit init = NULLPTR) + ScalarKernel(std::vector in_types, OutputType out_type, ExecFunc exec, + KernelInit init = NULLPTR) : Kernel(std::move(in_types), std::move(out_type), std::move(init)), exec(std::move(exec)) {} @@ -568,7 +565,7 @@ struct ArrayKernel : public Kernel { /// implementation, it may only write into preallocated memory, while in some /// cases it will allocate its own memory. Any required state is managed /// through the KernelContext. - ArrayKernelExec exec; + ExecFunc exec; /// \brief Writing execution results into larger contiguous allocations /// requires that the kernel be able to write into sliced output ArrayData*, @@ -576,13 +573,6 @@ struct ArrayKernel : public Kernel { /// not be able to do this, so setting this to false disables this /// functionality. bool can_write_into_slices = true; -}; - -/// \brief Kernel data structure for implementations of ScalarFunction. In -/// addition to the members found in ArrayKernel, contains the null handling -/// and memory pre-allocation preferences. 
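// Illustration (not part of the patch) of the new ScalarKernel::ExecFunc
// signature documented above: kernels receive a non-owning ExecSpan and
// write through an ExecResult instead of the old ExecBatch/Datum pair. The
// identity-copy body is hypothetical and assumes the executor preallocated
// the output data buffer (the MemAllocation::PREALLOCATE default).
inline ScalarKernel MakeIdentityInt32Kernel() {
  ScalarKernel::ExecFunc exec = [](KernelContext*, const ExecSpan& batch,
                                   ExecResult* out) {
    const int32_t* in = batch[0].array.GetValues<int32_t>(1);
    int32_t* out_values = out->array_span()->GetValues<int32_t>(1);
    for (int64_t i = 0; i < batch.length; ++i) {
      out_values[i] = in[i];
    }
    return Status::OK();
  };
  return ScalarKernel({InputType::Array(int32())}, int32(), std::move(exec));
}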
-struct ScalarKernel : public ArrayKernel { - using ArrayKernel::ArrayKernel; // For scalar functions preallocated data and intersecting arg validity // bitmaps is a reasonable default @@ -593,34 +583,44 @@ struct ScalarKernel : public ArrayKernel { // ---------------------------------------------------------------------- // VectorKernel (for VectorFunction) -/// \brief See VectorKernel::finalize member for usage -using VectorFinalize = std::function*)>; +/// \brief scalar kernel execution API that must be implemented for VECTOR +/// kernel types. This includes both stateless and stateful kernels. Kernels +/// depending on some execution state access that state via subclasses of +/// KernelState set on the KernelContext object. +using KernelBatchExec = std::function; /// \brief Kernel data structure for implementations of VectorFunction. In -/// addition to the members found in ArrayKernel, contains an optional -/// finalizer function, the null handling and memory pre-allocation preferences -/// (which have different defaults from ScalarKernel), and some other -/// execution-related options. -struct VectorKernel : public ArrayKernel { +/// contains an optional finalizer function, the null handling and memory +/// pre-allocation preferences (which have different defaults from +/// ScalarKernel), and some other execution-related options. +struct VectorKernel : public Kernel { + /// \brief See VectorKernel::finalize member for usage + using FinalizeFunc = std::function*)>; + VectorKernel() = default; - VectorKernel(std::vector in_types, OutputType out_type, ArrayKernelExec exec, - KernelInit init = NULLPTR, VectorFinalize finalize = NULLPTR) - : ArrayKernel(std::move(in_types), std::move(out_type), std::move(exec), - std::move(init)), + VectorKernel(std::vector in_types, OutputType out_type, KernelBatchExec exec, + KernelInit init = NULLPTR, FinalizeFunc finalize = NULLPTR) + : Kernel(std::move(in_types), std::move(out_type), std::move(init)), + exec(std::move(exec)), finalize(std::move(finalize)) {} - VectorKernel(std::shared_ptr sig, ArrayKernelExec exec, - KernelInit init = NULLPTR, VectorFinalize finalize = NULLPTR) - : ArrayKernel(std::move(sig), std::move(exec), std::move(init)), + VectorKernel(std::shared_ptr sig, KernelBatchExec exec, + KernelInit init = NULLPTR, FinalizeFunc finalize = NULLPTR) + : Kernel(std::move(sig), std::move(init)), + exec(std::move(exec)), finalize(std::move(finalize)) {} + /// \brief Perform a single invocation of this kernel. Any required state is + /// managed through the KernelContext. + KernelBatchExec exec; + /// \brief For VectorKernel, convert intermediate results into finalized /// results. Mutates input argument. Some kernels may accumulate state /// (example: hashing-related functions) through processing chunked inputs, and /// then need to attach some accumulated state to each of the outputs of /// processing each chunk of data. - VectorFinalize finalize; + FinalizeFunc finalize; /// Since vector kernels generally are implemented rather differently from /// scalar/elementwise kernels (and they may not even yield arrays of the same @@ -629,6 +629,13 @@ struct VectorKernel : public ArrayKernel { NullHandling::type null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; MemAllocation::type mem_allocation = MemAllocation::NO_PREALLOCATE; + /// \brief Writing execution results into larger contiguous allocations + /// requires that the kernel be able to write into sliced output ArrayData*, + /// including sliced output validity bitmaps. 
Some kernel implementations may + /// not be able to do this, so setting this to false disables this + /// functionality. + bool can_write_into_slices = true; + /// Some vector kernels can do chunkwise execution using ExecBatchIterator, /// in some cases accumulating some state. Other kernels (like Take) need to /// be passed whole arrays and don't work on ChunkedArray inputs diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 16495bc8030..8acdce323ed 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -297,7 +297,7 @@ struct ProductImpl : public ScalarAggregator { } internal::VisitArrayValuesInline( - *data, + ArraySpan(*data), [&](typename TypeTraits::CType value) { this->product = MultiplyTraits::Multiply(*out_type, this->product, value); @@ -630,7 +630,7 @@ struct IndexImpl : public ScalarAggregator { int64_t i = 0; ARROW_UNUSED(internal::VisitArrayValuesInline( - *input, + ArraySpan(*input), [&](ArgValue v) -> Status { if (v == desired) { index = i; diff --git a/cpp/src/arrow/compute/kernels/aggregate_mode.cc b/cpp/src/arrow/compute/kernels/aggregate_mode.cc index 7d3440cbef3..d54ed12a1f7 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_mode.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_mode.cc @@ -389,7 +389,7 @@ Result ModeType(KernelContext*, const std::vector& descr } VectorKernel NewModeKernel(const std::shared_ptr& in_type, - ArrayKernelExec exec) { + KernelBatchExec exec) { VectorKernel kernel; kernel.init = ModeState::Init; kernel.can_execute_chunkwise = false; @@ -433,8 +433,9 @@ void RegisterScalarAggregateMode(FunctionRegistry* registry) { DCHECK_OK(func->AddKernel( NewModeKernel(boolean(), ModeExecutor::Exec))); for (const auto& type : NumericTypes()) { + // TODO(wesm): DCHECK_OK(func->AddKernel( - NewModeKernel(type, GenerateNumeric(*type)))); + NewModeKernel(type, GenerateNumericOld(*type)))); } // Type parameters are ignored DCHECK_OK(func->AddKernel( diff --git a/cpp/src/arrow/compute/kernels/aggregate_quantile.cc b/cpp/src/arrow/compute/kernels/aggregate_quantile.cc index 810fb539913..d18d8425946 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_quantile.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_quantile.cc @@ -500,7 +500,7 @@ void AddQuantileKernels(VectorFunction* func) { for (const auto& ty : NumericTypes()) { base.signature = KernelSignature::Make({InputType(ty)}, OutputType(ResolveOutput)); // output type is determined at runtime, set template argument to nulltype - base.exec = GenerateNumeric(*ty); + base.exec = GenerateNumericOld(*ty); DCHECK_OK(func->AddKernel(base)); } { diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.cc b/cpp/src/arrow/compute/kernels/codegen_internal.cc index b31ef408b10..cf2c2b9c195 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.cc +++ b/cpp/src/arrow/compute/kernels/codegen_internal.cc @@ -29,15 +29,19 @@ namespace arrow { namespace compute { namespace internal { -Status ExecFail(KernelContext* ctx, const ExecBatch& batch, Datum* out) { +Status ExecFail(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { return Status::NotImplemented("This kernel is malformed"); } -ArrayKernelExec MakeFlippedBinaryExec(ArrayKernelExec exec) { - return [exec](KernelContext* ctx, const ExecBatch& batch, Datum* out) { - ExecBatch flipped_batch = batch; - std::swap(flipped_batch.values[0], flipped_batch.values[1]); - return exec(ctx, flipped_batch, out); +Status 
ExecFailOld(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + return Status::NotImplemented("This kernel is malformed"); +} + +ScalarKernel::ExecFunc MakeFlippedBinaryExec(ScalarKernel::ExecFunc exec) { + return [exec](KernelContext* ctx, const ExecSpan& span, ExecResult* out) { + ExecSpan flipped_span = span; + std::swap(flipped_span.values[0], flipped_span.values[1]); + return exec(ctx, flipped_span, out); }; } diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h b/cpp/src/arrow/compute/kernels/codegen_internal.h index 6d31c1fe246..b3d989ec781 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.h +++ b/cpp/src/arrow/compute/kernels/codegen_internal.h @@ -58,6 +58,7 @@ using internal::BitmapReader; using internal::checked_cast; using internal::FirstTimeBitmapWriter; using internal::GenerateBitsUnrolled; +using internal::VisitBitBlocks; using internal::VisitBitBlocksVoid; using internal::VisitTwoBitBlocksVoid; @@ -237,7 +238,7 @@ struct ArrayIterator> { using T = typename TypeTraits::ScalarType::ValueType; const T* values; - explicit ArrayIterator(const ArrayData& data) : values(data.GetValues(1)) {} + explicit ArrayIterator(const ArraySpan& arr) : values(arr.GetValues(1)) {} T operator()() { return *values++; } }; @@ -245,8 +246,8 @@ template struct ArrayIterator> { BitmapReader reader; - explicit ArrayIterator(const ArrayData& data) - : reader(data.buffers[1]->data(), data.offset, data.length) {} + explicit ArrayIterator(const ArraySpan& arr) + : reader(arr.buffers[1].data, arr.offset, arr.length) {} bool operator()() { bool out = reader.IsSet(); reader.Next(); @@ -257,18 +258,17 @@ struct ArrayIterator> { template struct ArrayIterator> { using offset_type = typename Type::offset_type; - const ArrayData& arr; + const ArraySpan& arr; const offset_type* offsets; offset_type cur_offset; const char* data; int64_t position; - explicit ArrayIterator(const ArrayData& arr) + explicit ArrayIterator(const ArraySpan& arr) : arr(arr), - offsets(reinterpret_cast(arr.buffers[1]->data()) + - arr.offset), + offsets(reinterpret_cast(arr.buffers[1].data) + arr.offset), cur_offset(offsets[0]), - data(reinterpret_cast(arr.buffers[2]->data())), + data(reinterpret_cast(arr.buffers[2].data)), position(0) {} util::string_view operator()() { @@ -281,15 +281,15 @@ struct ArrayIterator> { template <> struct ArrayIterator { - const ArrayData& arr; + const ArraySpan& arr; const char* data; const int32_t width; int64_t position; - explicit ArrayIterator(const ArrayData& arr) + explicit ArrayIterator(const ArraySpan& arr) : arr(arr), - data(reinterpret_cast(arr.buffers[1]->data())), - width(checked_cast(*arr.type).byte_width()), + data(reinterpret_cast(arr.buffers[1].data)), + width(arr.type->byte_width()), position(arr.offset) {} util::string_view operator()() { @@ -309,7 +309,7 @@ struct OutputArrayWriter> { using T = typename TypeTraits::ScalarType::ValueType; T* values; - explicit OutputArrayWriter(ArrayData* data) : values(data->GetMutableValues(1)) {} + explicit OutputArrayWriter(ArraySpan* data) : values(data->GetValues(1)) {} void Write(T value) { *values++ = value; } @@ -340,7 +340,8 @@ struct UnboxScalar> { template struct UnboxScalar> { - static util::string_view Unbox(const Scalar& val) { + using T = util::string_view; + static T Unbox(const Scalar& val) { if (!val.is_valid) return util::string_view(); return util::string_view(*checked_cast(val).value); } @@ -348,14 +349,16 @@ struct UnboxScalar> { template <> struct UnboxScalar { - static const Decimal128& Unbox(const Scalar& 
val) { + using T = Decimal128; + static const T& Unbox(const Scalar& val) { return checked_cast(val).value; } }; template <> struct UnboxScalar { - static const Decimal256& Unbox(const Scalar& val) { + using T = Decimal256; + static const T& Unbox(const Scalar& val) { return checked_cast(val).value; } }; @@ -397,14 +400,198 @@ struct BoxScalar { static void Box(T val, Scalar* out) { checked_cast(out)->value = val; } }; -// A VisitArrayDataInline variant that calls its visitor function with logical +// ---------------------------------------------------------------------- +// Like VisitArrayDataInline, but for ArraySpans + +template +struct ArraySpanInlineVisitor {}; + +// Numeric and primitive C-compatible types +template +struct ArraySpanInlineVisitor> { + using c_type = typename T::c_type; + + template + static Status VisitStatus(const ArraySpan& arr, ValidFunc&& valid_func, + NullFunc&& null_func) { + const c_type* data = arr.GetValues(1); + auto visit_valid = [&](int64_t i) { return valid_func(data[i]); }; + return VisitBitBlocks(arr.buffers[0].data, arr.offset, arr.length, + std::move(visit_valid), std::forward(null_func)); + } + + template + static void VisitVoid(const ArraySpan& arr, ValidFunc&& valid_func, + NullFunc&& null_func) { + using c_type = typename T::c_type; + const c_type* data = arr.GetValues(1); + auto visit_valid = [&](int64_t i) { valid_func(data[i]); }; + VisitBitBlocksVoid(arr.buffers[0].data, arr.offset, arr.length, + std::move(visit_valid), std::forward(null_func)); + } +}; + +// Boolean +template <> +struct ArraySpanInlineVisitor { + using c_type = bool; + + template + static Status VisitStatus(const ArraySpan& arr, ValidFunc&& valid_func, + NullFunc&& null_func) { + int64_t offset = arr.offset; + const uint8_t* data = arr.buffers[1].data; + return VisitBitBlocks( + arr.buffers[0].data, offset, arr.length, + [&](int64_t i) { return valid_func(bit_util::GetBit(data, offset + i)); }, + std::forward(null_func)); + } + + template + static void VisitVoid(const ArraySpan& arr, ValidFunc&& valid_func, + NullFunc&& null_func) { + int64_t offset = arr.offset; + const uint8_t* data = arr.buffers[1].data; + VisitBitBlocksVoid( + arr.buffers[0].data, offset, arr.length, + [&](int64_t i) { valid_func(bit_util::GetBit(data, offset + i)); }, + std::forward(null_func)); + } +}; + +// Binary, String... +template +struct ArraySpanInlineVisitor> { + using c_type = util::string_view; + + template + static Status VisitStatus(const ArraySpan& arr, ValidFunc&& valid_func, + NullFunc&& null_func) { + using offset_type = typename T::offset_type; + constexpr char empty_value = 0; + + if (arr.length == 0) { + return Status::OK(); + } + const offset_type* offsets = arr.GetValues(1); + const char* data; + if (arr.buffers[2].data == NULLPTR) { + data = &empty_value; + } else { + // Do not apply the array offset to the values array; the value_offsets + // index the non-sliced values array. 
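+      // (The offsets obtained from GetValues(1) above are already adjusted by
+      // arr.offset and hold absolute positions into the unsliced data buffer,
+      // which is why the data pointer below is taken with absolute_offset=0.)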
+ data = arr.GetValues(2, /*absolute_offset=*/0); + } + offset_type cur_offset = *offsets++; + return VisitBitBlocks( + arr.buffers[0].data, arr.offset, arr.length, + [&](int64_t i) { + ARROW_UNUSED(i); + auto value = util::string_view(data + cur_offset, *offsets - cur_offset); + cur_offset = *offsets++; + return valid_func(value); + }, + [&]() { + cur_offset = *offsets++; + return null_func(); + }); + } + + template + static void VisitVoid(const ArraySpan& arr, ValidFunc&& valid_func, + NullFunc&& null_func) { + using offset_type = typename T::offset_type; + constexpr uint8_t empty_value = 0; + + if (arr.length == 0) { + return; + } + const offset_type* offsets = arr.GetValues(1); + const uint8_t* data; + if (arr.buffers[2].data == NULLPTR) { + data = &empty_value; + } else { + // Do not apply the array offset to the values array; the value_offsets + // index the non-sliced values array. + data = arr.GetValues(2, /*absolute_offset=*/0); + } + + VisitBitBlocksVoid( + arr.buffers[0].data, arr.offset, arr.length, + [&](int64_t i) { + auto value = util::string_view(reinterpret_cast(data + offsets[i]), + offsets[i + 1] - offsets[i]); + valid_func(value); + }, + std::forward(null_func)); + } +}; + +// FixedSizeBinary, Decimal128 +template +struct ArraySpanInlineVisitor> { + using c_type = util::string_view; + + template + static Status VisitStatus(const ArraySpan& arr, ValidFunc&& valid_func, + NullFunc&& null_func) { + const int32_t byte_width = arr.type->byte_width(); + const char* data = arr.GetValues(1, + /*absolute_offset=*/arr.offset * byte_width); + return VisitBitBlocks( + arr.buffers[0].data, arr.offset, arr.length, + [&](int64_t i) { + auto value = util::string_view(data, byte_width); + data += byte_width; + return valid_func(value); + }, + [&]() { + data += byte_width; + return null_func(); + }); + } + + template + static void VisitVoid(const ArraySpan& arr, ValidFunc&& valid_func, + NullFunc&& null_func) { + const int32_t byte_width = arr.type->byte_width(); + const char* data = arr.GetValues(1, + /*absolute_offset=*/arr.offset * byte_width); + VisitBitBlocksVoid( + arr.buffers[0].data, arr.offset, arr.length, + [&](int64_t i) { + valid_func(util::string_view(data, byte_width)); + data += byte_width; + }, + [&]() { + data += byte_width; + null_func(); + }); + } +}; + +template +typename ::arrow::internal::call_traits::enable_if_return::type +VisitArraySpanInline(const ArraySpan& arr, ValidFunc&& valid_func, NullFunc&& null_func) { + return internal::ArraySpanInlineVisitor::VisitStatus( + arr, std::forward(valid_func), std::forward(null_func)); +} + +template +typename ::arrow::internal::call_traits::enable_if_return::type +VisitArraySpanInline(const ArraySpan& arr, ValidFunc&& valid_func, NullFunc&& null_func) { + return internal::ArraySpanInlineVisitor::VisitVoid( + arr, std::forward(valid_func), std::forward(null_func)); +} + +// A VisitArraySpanInline variant that calls its visitor function with logical // values, such as Decimal128 rather than util::string_view. 
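//
// A minimal usage sketch, not part of this patch ("SumDecimals" is a
// hypothetical helper): accumulating the logical Decimal128 values of an
// ArraySpan while skipping nulls.
//
//   Status SumDecimals(const ArraySpan& span, Decimal128* total) {
//     VisitArrayValuesInline<Decimal128Type>(
//         span, [&](Decimal128 v) { *total += v; }, [] {});
//     return Status::OK();
//   }
//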
template static typename arrow::internal::call_traits::enable_if_return::type -VisitArrayValuesInline(const ArrayData& arr, VisitFunc&& valid_func, +VisitArrayValuesInline(const ArraySpan& arr, VisitFunc&& valid_func, NullFunc&& null_func) { - VisitArrayDataInline( + VisitArraySpanInline( arr, [&](typename GetViewType::PhysicalType v) { valid_func(GetViewType::LogicalValue(std::move(v))); @@ -414,9 +601,9 @@ VisitArrayValuesInline(const ArrayData& arr, VisitFunc&& valid_func, template static typename arrow::internal::call_traits::enable_if_return::type -VisitArrayValuesInline(const ArrayData& arr, VisitFunc&& valid_func, +VisitArrayValuesInline(const ArraySpan& arr, VisitFunc&& valid_func, NullFunc&& null_func) { - return VisitArrayDataInline( + return VisitArraySpanInline( arr, [&](typename GetViewType::PhysicalType v) { return valid_func(GetViewType::LogicalValue(std::move(v))); @@ -427,7 +614,7 @@ VisitArrayValuesInline(const ArrayData& arr, VisitFunc&& valid_func, // Like VisitArrayValuesInline, but for binary functions. template -static void VisitTwoArrayValuesInline(const ArrayData& arr0, const ArrayData& arr1, +static void VisitTwoArrayValuesInline(const ArraySpan& arr0, const ArraySpan& arr1, VisitFunc&& valid_func, NullFunc&& null_func) { ArrayIterator arr0_it(arr0); ArrayIterator arr1_it(arr1); @@ -441,9 +628,24 @@ static void VisitTwoArrayValuesInline(const ArrayData& arr0, const ArrayData& ar arr1_it(); null_func(); }; - VisitTwoBitBlocksVoid(arr0.buffers[0], arr0.offset, arr1.buffers[0], arr1.offset, - arr0.length, std::move(visit_valid), std::move(visit_null)); + VisitTwoBitBlocksVoid(arr0.buffers[0].data, arr0.offset, arr1.buffers[0].data, + arr1.offset, arr0.length, std::move(visit_valid), + std::move(visit_null)); } +// Like ArrayDataVisitor (see visit_data_inline.h), but for ArraySpans + +template +struct ArraySpanVisitor { + using InlineVisitorType = ArraySpanInlineVisitor; + using c_type = typename InlineVisitorType::c_type; + + template + static Status Visit(const ArraySpan& arr, Visitor* visitor) { + return InlineVisitorType::VisitStatus( + arr, [visitor](c_type v) { return visitor->VisitValue(v); }, + [visitor]() { return visitor->VisitNull(); }); + } +}; // ---------------------------------------------------------------------- // Reusable type resolvers @@ -455,9 +657,10 @@ Result ListValuesType(KernelContext*, const std::vector& // ---------------------------------------------------------------------- // Generate an array kernel given template classes -Status ExecFail(KernelContext* ctx, const ExecBatch& batch, Datum* out); +Status ExecFail(KernelContext* ctx, const ExecSpan& batch, ExecResult* out); +Status ExecFailOld(KernelContext* ctx, const ExecBatch& batch, Datum* out); -ArrayKernelExec MakeFlippedBinaryExec(ArrayKernelExec exec); +ScalarKernel::ExecFunc MakeFlippedBinaryExec(ScalarKernel::ExecFunc exec); // ---------------------------------------------------------------------- // Helpers for iterating over common DataType instances for adding kernels to @@ -483,60 +686,56 @@ const std::vector>& ExampleParametricTypes(); // ---------------------------------------------------------------------- // "Applicators" take an operator definition (which may be scalar-valued or -// array-valued) and creates an ArrayKernelExec which can be used to add an +// array-valued) and creates an ScalarKernel::ExecFunc which can be used to add an // ArrayKernel to a Function. 
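//
// A rough sketch, not part of this patch ("Negate" and the Int32Type choice
// are illustrative): an elementwise Op functor and the exec function an
// applicator derives from it.
//
//   struct Negate {
//     template <typename OutValue, typename Arg0Value>
//     static OutValue Call(KernelContext*, Arg0Value v, Status*) {
//       return static_cast<OutValue>(-v);
//     }
//   };
//
//   // ScalarUnary<OutType, Arg0Type, Op>::Exec has the
//   // Status(KernelContext*, const ExecSpan&, ExecResult*) shape expected by
//   // ScalarKernel::ExecFunc.
//   ScalarKernel::ExecFunc exec =
//       applicator::ScalarUnary<Int32Type, Int32Type, Negate>::Exec;
//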
namespace applicator { -// Generate an ArrayKernelExec given a functor that handles all of its own +// Generate an ScalarKernel::ExecFunc given a functor that handles all of its own // iteration, etc. // // Operator must implement // -// static Status Call(KernelContext*, const ArrayData& in, ArrayData* out) -// static Status Call(KernelContext*, const Scalar& in, Scalar* out) +// static Status Call(KernelContext*, const ArraySpan& in, ExecResult* out) +// static Status Call(KernelContext*, const Scalar& in, ExecResult* out) template -static Status SimpleUnary(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - if (batch[0].kind() == Datum::SCALAR) { - return Operator::Call(ctx, *batch[0].scalar(), out->scalar().get()); +static Status SimpleUnary(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + if (batch[0].is_scalar()) { + return Operator::Call(ctx, *batch[0].scalar, out); } else if (batch.length > 0) { - return Operator::Call(ctx, *batch[0].array(), out->mutable_array()); + return Operator::Call(ctx, batch[0].array, out); } return Status::OK(); } -// Generate an ArrayKernelExec given a functor that handles all of its own +// Generate an ScalarKernel::ExecFunc given a functor that handles all of its own // iteration, etc. // // Operator must implement // -// static Status Call(KernelContext*, const ArrayData& arg0, const ArrayData& arg1, -// ArrayData* out) -// static Status Call(KernelContext*, const ArrayData& arg0, const Scalar& arg1, -// ArrayData* out) -// static Status Call(KernelContext*, const Scalar& arg0, const ArrayData& arg1, -// ArrayData* out) +// static Status Call(KernelContext*, const ArraySpan& arg0, const ArraySpan& arg1, +// * out) +// static Status Call(KernelContext*, const ArraySpan& arg0, const Scalar& arg1, +// ExecResult* out) +// static Status Call(KernelContext*, const Scalar& arg0, const ArraySpan& arg1, +// ExecResult* out) // static Status Call(KernelContext*, const Scalar& arg0, const Scalar& arg1, -// Scalar* out) +// ExecResult* out) template -static Status SimpleBinary(KernelContext* ctx, const ExecBatch& batch, Datum* out) { +static Status SimpleBinary(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { if (batch.length == 0) return Status::OK(); - if (batch[0].kind() == Datum::ARRAY) { - if (batch[1].kind() == Datum::ARRAY) { - return Operator::Call(ctx, *batch[0].array(), *batch[1].array(), - out->mutable_array()); + if (batch[0].is_array()) { + if (batch[1].is_array()) { + return Operator::Call(ctx, batch[0].array, batch[1].array, out); } else { - return Operator::Call(ctx, *batch[0].array(), *batch[1].scalar(), - out->mutable_array()); + return Operator::Call(ctx, batch[0].array, *batch[1].scalar, out); } } else { - if (batch[1].kind() == Datum::ARRAY) { - return Operator::Call(ctx, *batch[0].scalar(), *batch[1].array(), - out->mutable_array()); + if (batch[1].is_array()) { + return Operator::Call(ctx, *batch[0].scalar, batch[1].array, out); } else { - return Operator::Call(ctx, *batch[0].scalar(), *batch[1].scalar(), - out->scalar().get()); + return Operator::Call(ctx, *batch[0].scalar, *batch[1].scalar, out); } } } @@ -551,10 +750,8 @@ struct OutputAdapter; template struct OutputAdapter> { template - static Status Write(KernelContext*, Datum* out, Generator&& generator) { - ArrayData* out_arr = out->mutable_array(); - auto out_bitmap = out_arr->buffers[1]->mutable_data(); - GenerateBitsUnrolled(out_bitmap, out_arr->offset, out_arr->length, + static Status Write(KernelContext*, ArraySpan* out, Generator&& generator) 
{ + GenerateBitsUnrolled(out->buffers[1].data, out->offset, out->length, std::forward(generator)); return Status::OK(); } @@ -565,11 +762,10 @@ struct OutputAdapter> { using T = typename TypeTraits::ScalarType::ValueType; template - static Status Write(KernelContext*, Datum* out, Generator&& generator) { - ArrayData* out_arr = out->mutable_array(); - auto out_data = out_arr->GetMutableValues(1); + static Status Write(KernelContext*, ArraySpan* out, Generator&& generator) { + T* out_data = out->GetValues(1); // TODO: Is this as fast as a more explicitly inlined function? - for (int64_t i = 0; i < out_arr->length; ++i) { + for (int64_t i = 0; i < out->length; ++i) { *out_data++ = generator(); } return Status::OK(); @@ -579,7 +775,7 @@ struct OutputAdapter> { template struct OutputAdapter> { template - static Status Write(KernelContext* ctx, Datum* out, Generator&& generator) { + static Status Write(KernelContext* ctx, ArraySpan* out, Generator&& generator) { return Status::NotImplemented("NYI"); } }; @@ -607,16 +803,17 @@ struct ScalarUnary { using OutValue = typename GetOutputType::T; using Arg0Value = typename GetViewType::T; - static Status ExecArray(KernelContext* ctx, const ArrayData& arg0, Datum* out) { + static Status ExecArray(KernelContext* ctx, const ArraySpan& arg0, ExecResult* out) { Status st = Status::OK(); ArrayIterator arg0_it(arg0); - RETURN_NOT_OK(OutputAdapter::Write(ctx, out, [&]() -> OutValue { - return Op::template Call(ctx, arg0_it(), &st); - })); + RETURN_NOT_OK( + OutputAdapter::Write(ctx, out->array_span(), [&]() -> OutValue { + return Op::template Call(ctx, arg0_it(), &st); + })); return st; } - static Status ExecScalar(KernelContext* ctx, const Scalar& arg0, Datum* out) { + static Status ExecScalar(KernelContext* ctx, const Scalar& arg0, ExecResult* out) { Status st = Status::OK(); Scalar* out_scalar = out->scalar().get(); if (arg0.is_valid) { @@ -630,11 +827,11 @@ struct ScalarUnary { return st; } - static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - if (batch[0].kind() == Datum::ARRAY) { - return ExecArray(ctx, *batch[0].array(), out); + static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + if (batch[0].is_array()) { + return ExecArray(ctx, batch[0].array, out); } else { - return ExecScalar(ctx, *batch[0].scalar(), out); + return ExecScalar(ctx, *batch[0].scalar, out); } } }; @@ -654,8 +851,8 @@ struct ScalarUnaryNotNullStateful { template struct ArrayExec { - static Status Exec(const ThisType& functor, KernelContext* ctx, - const ExecBatch& batch, Datum* out) { + static Status Exec(const ThisType& functor, KernelContext* ctx, const ExecSpan& batch, + ExecResult* out) { ARROW_LOG(FATAL) << "Missing ArrayExec specialization for output type " << out->type(); return Status::NotImplemented("NYI"); @@ -664,11 +861,10 @@ struct ScalarUnaryNotNullStateful { template struct ArrayExec> { - static Status Exec(const ThisType& functor, KernelContext* ctx, const ArrayData& arg0, - Datum* out) { + static Status Exec(const ThisType& functor, KernelContext* ctx, const ArraySpan& arg0, + ExecResult* out) { Status st = Status::OK(); - ArrayData* out_arr = out->mutable_array(); - auto out_data = out_arr->GetMutableValues(1); + auto out_data = out->array_span()->GetValues(1); VisitArrayValuesInline( arg0, [&](Arg0Value v) { @@ -684,8 +880,8 @@ struct ScalarUnaryNotNullStateful { template struct ArrayExec> { - static Status Exec(const ThisType& functor, KernelContext* ctx, const ArrayData& arg0, - Datum* out) { + static Status 
Exec(const ThisType& functor, KernelContext* ctx, const ArraySpan& arg0, + ExecResult* out) { // NOTE: This code is not currently used by any kernels and has // suboptimal performance because it's recomputing the validity bitmap // that is already computed by the kernel execution layer. Consider @@ -706,12 +902,12 @@ struct ScalarUnaryNotNullStateful { template struct ArrayExec::value>> { - static Status Exec(const ThisType& functor, KernelContext* ctx, const ArrayData& arg0, - Datum* out) { + static Status Exec(const ThisType& functor, KernelContext* ctx, const ArraySpan& arg0, + ExecResult* out) { Status st = Status::OK(); - ArrayData* out_arr = out->mutable_array(); - FirstTimeBitmapWriter out_writer(out_arr->buffers[1]->mutable_data(), - out_arr->offset, out_arr->length); + ArraySpan* out_arr = out->array_span(); + FirstTimeBitmapWriter out_writer(out_arr->buffers[1].data, out_arr->offset, + out_arr->length); VisitArrayValuesInline( arg0, [&](Arg0Value v) { @@ -730,7 +926,7 @@ struct ScalarUnaryNotNullStateful { } }; - Status Scalar(KernelContext* ctx, const Scalar& arg0, Datum* out) { + Status Scalar(KernelContext* ctx, const Scalar& arg0, ExecResult* out) { Status st = Status::OK(); if (arg0.is_valid) { Arg0Value arg0_val = UnboxScalar::Unbox(arg0); @@ -741,11 +937,11 @@ struct ScalarUnaryNotNullStateful { return st; } - Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - if (batch[0].kind() == Datum::ARRAY) { - return ArrayExec::Exec(*this, ctx, *batch[0].array(), out); + Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + if (batch[0].is_array()) { + return ArrayExec::Exec(*this, ctx, batch[0].array, out); } else { - return Scalar(ctx, *batch[0].scalar(), out); + return Scalar(ctx, *batch[0].scalar, out); } } }; @@ -758,7 +954,7 @@ struct ScalarUnaryNotNull { using OutValue = typename GetOutputType::T; using Arg0Value = typename GetViewType::T; - static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { // Seed kernel with dummy state ScalarUnaryNotNullStateful kernel({}); return kernel.Exec(ctx, batch, out); @@ -790,44 +986,47 @@ struct ScalarBinary { using Arg0Value = typename GetViewType::T; using Arg1Value = typename GetViewType::T; - static Status ArrayArray(KernelContext* ctx, const ArrayData& arg0, - const ArrayData& arg1, Datum* out) { + static Status ArrayArray(KernelContext* ctx, const ArraySpan& arg0, + const ArraySpan& arg1, ExecResult* out) { Status st = Status::OK(); ArrayIterator arg0_it(arg0); ArrayIterator arg1_it(arg1); - RETURN_NOT_OK(OutputAdapter::Write(ctx, out, [&]() -> OutValue { - return Op::template Call(ctx, arg0_it(), arg1_it(), - &st); - })); + RETURN_NOT_OK( + OutputAdapter::Write(ctx, out->array_span(), [&]() -> OutValue { + return Op::template Call(ctx, arg0_it(), + arg1_it(), &st); + })); return st; } - static Status ArrayScalar(KernelContext* ctx, const ArrayData& arg0, const Scalar& arg1, - Datum* out) { + static Status ArrayScalar(KernelContext* ctx, const ArraySpan& arg0, const Scalar& arg1, + ExecResult* out) { Status st = Status::OK(); ArrayIterator arg0_it(arg0); auto arg1_val = UnboxScalar::Unbox(arg1); - RETURN_NOT_OK(OutputAdapter::Write(ctx, out, [&]() -> OutValue { - return Op::template Call(ctx, arg0_it(), arg1_val, - &st); - })); + RETURN_NOT_OK( + OutputAdapter::Write(ctx, out->array_span(), [&]() -> OutValue { + return Op::template Call(ctx, arg0_it(), + arg1_val, &st); + })); return st; } 
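  // In the mixed Array/Scalar paths the scalar operand is unboxed exactly once,
  // outside the loop; the generator handed to OutputAdapter then only advances
  // the ArrayIterator over the array operand.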
- static Status ScalarArray(KernelContext* ctx, const Scalar& arg0, const ArrayData& arg1, - Datum* out) { + static Status ScalarArray(KernelContext* ctx, const Scalar& arg0, const ArraySpan& arg1, + ExecResult* out) { Status st = Status::OK(); auto arg0_val = UnboxScalar::Unbox(arg0); ArrayIterator arg1_it(arg1); - RETURN_NOT_OK(OutputAdapter::Write(ctx, out, [&]() -> OutValue { - return Op::template Call(ctx, arg0_val, arg1_it(), - &st); - })); + RETURN_NOT_OK( + OutputAdapter::Write(ctx, out->array_span(), [&]() -> OutValue { + return Op::template Call(ctx, arg0_val, + arg1_it(), &st); + })); return st; } static Status ScalarScalar(KernelContext* ctx, const Scalar& arg0, const Scalar& arg1, - Datum* out) { + ExecResult* out) { Status st = Status::OK(); if (out->scalar()->is_valid) { auto arg0_val = UnboxScalar::Unbox(arg0); @@ -839,18 +1038,18 @@ struct ScalarBinary { return st; } - static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - if (batch[0].kind() == Datum::ARRAY) { - if (batch[1].kind() == Datum::ARRAY) { - return ArrayArray(ctx, *batch[0].array(), *batch[1].array(), out); + static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + if (batch[0].is_array()) { + if (batch[1].is_array()) { + return ArrayArray(ctx, batch[0].array, batch[1].array, out); } else { - return ArrayScalar(ctx, *batch[0].array(), *batch[1].scalar(), out); + return ArrayScalar(ctx, batch[0].array, *batch[1].scalar, out); } } else { - if (batch[1].kind() == Datum::ARRAY) { - return ScalarArray(ctx, *batch[0].scalar(), *batch[1].array(), out); + if (batch[1].is_array()) { + return ScalarArray(ctx, *batch[0].scalar, batch[1].array, out); } else { - return ScalarScalar(ctx, *batch[0].scalar(), *batch[1].scalar(), out); + return ScalarScalar(ctx, *batch[0].scalar, *batch[1].scalar, out); } } } @@ -870,10 +1069,10 @@ struct ScalarBinaryNotNullStateful { // NOTE: In ArrayExec, Type is really OutputType - Status ArrayArray(KernelContext* ctx, const ArrayData& arg0, const ArrayData& arg1, - Datum* out) { + Status ArrayArray(KernelContext* ctx, const ArraySpan& arg0, const ArraySpan& arg1, + ExecResult* out) { Status st = Status::OK(); - OutputArrayWriter writer(out->mutable_array()); + OutputArrayWriter writer(out->array_span()); VisitTwoArrayValuesInline( arg0, arg1, [&](Arg0Value u, Arg1Value v) { @@ -883,10 +1082,11 @@ struct ScalarBinaryNotNullStateful { return st; } - Status ArrayScalar(KernelContext* ctx, const ArrayData& arg0, const Scalar& arg1, - Datum* out) { + Status ArrayScalar(KernelContext* ctx, const ArraySpan& arg0, const Scalar& arg1, + ExecResult* out) { Status st = Status::OK(); - OutputArrayWriter writer(out->mutable_array()); + ArraySpan* out_span = out->array_span(); + OutputArrayWriter writer(out_span); if (arg1.is_valid) { const auto arg1_val = UnboxScalar::Unbox(arg1); VisitArrayValuesInline( @@ -897,15 +1097,16 @@ struct ScalarBinaryNotNullStateful { }, [&]() { writer.WriteNull(); }); } else { - writer.WriteAllNull(out->mutable_array()->length); + writer.WriteAllNull(out_span->length); } return st; } - Status ScalarArray(KernelContext* ctx, const Scalar& arg0, const ArrayData& arg1, - Datum* out) { + Status ScalarArray(KernelContext* ctx, const Scalar& arg0, const ArraySpan& arg1, + ExecResult* out) { Status st = Status::OK(); - OutputArrayWriter writer(out->mutable_array()); + ArraySpan* out_span = out->array_span(); + OutputArrayWriter writer(out_span); if (arg0.is_valid) { const auto arg0_val = UnboxScalar::Unbox(arg0); 
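      // Only value slots are written below; the output validity bitmap is the
      // executor's job (scalar kernels default to NullHandling::INTERSECTION),
      // so the null callback merely keeps the writer's position in step and
      // never touches a bitmap.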
VisitArrayValuesInline( @@ -916,13 +1117,13 @@ struct ScalarBinaryNotNullStateful { }, [&]() { writer.WriteNull(); }); } else { - writer.WriteAllNull(out->mutable_array()->length); + writer.WriteAllNull(out_span->length); } return st; } Status ScalarScalar(KernelContext* ctx, const Scalar& arg0, const Scalar& arg1, - Datum* out) { + ExecResult* out) { Status st = Status::OK(); if (arg0.is_valid && arg1.is_valid) { const auto arg0_val = UnboxScalar::Unbox(arg0); @@ -934,18 +1135,18 @@ struct ScalarBinaryNotNullStateful { return st; } - Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - if (batch[0].kind() == Datum::ARRAY) { - if (batch[1].kind() == Datum::ARRAY) { - return ArrayArray(ctx, *batch[0].array(), *batch[1].array(), out); + Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + if (batch[0].is_array()) { + if (batch[1].is_array()) { + return ArrayArray(ctx, batch[0].array, batch[1].array, out); } else { - return ArrayScalar(ctx, *batch[0].array(), *batch[1].scalar(), out); + return ArrayScalar(ctx, batch[0].array, *batch[1].scalar, out); } } else { - if (batch[1].kind() == Datum::ARRAY) { - return ScalarArray(ctx, *batch[0].scalar(), *batch[1].array(), out); + if (batch[1].is_array()) { + return ScalarArray(ctx, *batch[0].scalar, batch[1].array, out); } else { - return ScalarScalar(ctx, *batch[0].scalar(), *batch[1].scalar(), out); + return ScalarScalar(ctx, *batch[0].scalar, *batch[1].scalar, out); } } } @@ -961,7 +1162,7 @@ struct ScalarBinaryNotNull { using Arg0Value = typename GetViewType::T; using Arg1Value = typename GetViewType::T; - static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { // Seed kernel with dummy state ScalarBinaryNotNullStateful kernel({}); return kernel.Exec(ctx, batch, out); @@ -997,7 +1198,7 @@ using ScalarBinaryNotNullStatefulEqualTypes = // // template // struct FUNCTOR { -// static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { +// static void Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { // // IMPLEMENTATION // } // }; @@ -1031,7 +1232,7 @@ struct GetTypeId { // GD for numeric types (integer and floating point) template