diff --git a/cpp/examples/arrow/compute_register_example.cc b/cpp/examples/arrow/compute_register_example.cc index f089b910ec4..13d80b29631 100644 --- a/cpp/examples/arrow/compute_register_example.cc +++ b/cpp/examples/arrow/compute_register_example.cc @@ -69,9 +69,9 @@ std::unique_ptr ExampleFunctionOptionsType::Copy( return std::unique_ptr(new ExampleFunctionOptions()); } -arrow::Status ExampleFunctionImpl(cp::KernelContext* ctx, const cp::ExecBatch& batch, - arrow::Datum* out) { - *out->mutable_array() = *batch[0].array(); +arrow::Status ExampleFunctionImpl(cp::KernelContext* ctx, const cp::ExecSpan& batch, + cp::ExecResult* out) { + out->value = batch[0].array.ToArrayData(); return arrow::Status::OK(); } diff --git a/cpp/examples/arrow/udf_example.cc b/cpp/examples/arrow/udf_example.cc index f45e2c644d0..47c45411477 100644 --- a/cpp/examples/arrow/udf_example.cc +++ b/cpp/examples/arrow/udf_example.cc @@ -59,11 +59,17 @@ const cp::FunctionDoc func_doc{ {"x", "y", "z"}, "UDFOptions"}; -arrow::Status SampleFunction(cp::KernelContext* ctx, const cp::ExecBatch& batch, - arrow::Datum* out) { - // temp = x + y; return temp + z - ARROW_ASSIGN_OR_RAISE(auto temp, cp::CallFunction("add", {batch[0], batch[1]})); - return cp::CallFunction("add", {temp, batch[2]}).Value(out); +arrow::Status SampleFunction(cp::KernelContext* ctx, const cp::ExecSpan& batch, + cp::ExecResult* out) { + // return x + y + z + const int64_t* x = batch[0].array.GetValues(1); + const int64_t* y = batch[1].array.GetValues(1); + const int64_t* z = batch[2].array.GetValues(1); + int64_t* out_values = out->array_span()->GetValues(1); + for (int64_t i = 0; i < batch.length; ++i) { + *out_values++ = *x++ + *y++ + *z++; + } + return arrow::Status::OK(); } arrow::Status Execute() { @@ -74,8 +80,8 @@ arrow::Status Execute() { cp::InputType::Array(arrow::int64())}, arrow::int64(), SampleFunction); - kernel.mem_allocation = cp::MemAllocation::NO_PREALLOCATE; - kernel.null_handling = cp::NullHandling::COMPUTED_NO_PREALLOCATE; + kernel.mem_allocation = cp::MemAllocation::PREALLOCATE; + kernel.null_handling = cp::NullHandling::INTERSECTION; ARROW_RETURN_NOT_OK(func->AddKernel(std::move(kernel))); diff --git a/cpp/src/arrow/adapters/orc/util.cc b/cpp/src/arrow/adapters/orc/util.cc index 5a8ae93532b..dbdb110fb46 100644 --- a/cpp/src/arrow/adapters/orc/util.cc +++ b/cpp/src/arrow/adapters/orc/util.cc @@ -586,8 +586,8 @@ Status WriteGenericBatch(const Array& array, int64_t orc_offset, batch->hasNulls = true; } Appender appender{array_, batch, orc_offset, 0}; - ArrayDataVisitor visitor; - RETURN_NOT_OK(visitor.Visit(*(array_.data()), &appender)); + ArraySpanVisitor visitor; + RETURN_NOT_OK(visitor.Visit(*array_.data(), &appender)); return Status::OK(); } @@ -608,8 +608,8 @@ Status WriteTimestampBatch(const Array& array, int64_t orc_offset, 0, conversion_factor_from_second, conversion_factor_to_nano}; - ArrayDataVisitor visitor; - RETURN_NOT_OK(visitor.Visit(*(array_.data()), &appender)); + ArraySpanVisitor visitor; + RETURN_NOT_OK(visitor.Visit(*array_.data(), &appender)); return Status::OK(); } @@ -621,8 +621,8 @@ Status WriteFixedSizeBinaryBatch(const Array& array, int64_t orc_offset, batch->hasNulls = true; } FixedSizeBinaryAppender appender{array_, batch, orc_offset, 0, array_.byte_width()}; - ArrayDataVisitor visitor; - RETURN_NOT_OK(visitor.Visit(*(array_.data()), &appender)); + ArraySpanVisitor visitor; + RETURN_NOT_OK(visitor.Visit(*array_.data(), &appender)); return Status::OK(); } diff --git 
a/cpp/src/arrow/array/array_binary_test.cc b/cpp/src/arrow/array/array_binary_test.cc index da1469ebea0..b7225eb8b7d 100644 --- a/cpp/src/arrow/array/array_binary_test.cc +++ b/cpp/src/arrow/array/array_binary_test.cc @@ -859,7 +859,7 @@ TEST(TestChunkedStringBuilder, BasicOperation) { } // ---------------------------------------------------------------------- -// ArrayDataVisitor tests +// ArraySpanVisitor tests struct BinaryAppender { Status VisitNull() { @@ -885,7 +885,7 @@ class TestBaseBinaryDataVisitor : public ::testing::Test { void TestBasics() { auto array = ArrayFromJSON(type_, R"(["foo", null, "bar"])"); BinaryAppender appender; - ArrayDataVisitor visitor; + ArraySpanVisitor visitor; ASSERT_OK(visitor.Visit(*array->data(), &appender)); ASSERT_THAT(appender.data, ::testing::ElementsAreArray({"foo", "(null)", "bar"})); ARROW_UNUSED(visitor); // Workaround weird MSVC warning @@ -894,7 +894,7 @@ class TestBaseBinaryDataVisitor : public ::testing::Test { void TestSliced() { auto array = ArrayFromJSON(type_, R"(["ab", null, "cd", "ef"])")->Slice(1, 2); BinaryAppender appender; - ArrayDataVisitor visitor; + ArraySpanVisitor visitor; ASSERT_OK(visitor.Visit(*array->data(), &appender)); ASSERT_THAT(appender.data, ::testing::ElementsAreArray({"(null)", "cd"})); ARROW_UNUSED(visitor); // Workaround weird MSVC warning diff --git a/cpp/src/arrow/array/array_dict.cc b/cpp/src/arrow/array/array_dict.cc index dbfc7bd7586..8fbe9f69d78 100644 --- a/cpp/src/arrow/array/array_dict.cc +++ b/cpp/src/arrow/array/array_dict.cc @@ -290,8 +290,8 @@ class DictionaryUnifierImpl : public DictionaryUnifier { Status GetResultWithIndexType(const std::shared_ptr& index_type, std::shared_ptr* out_dict) override { - int64_t dict_length = memo_table_.size(); - if (!internal::IntegersCanFit(Datum(dict_length), *index_type).ok()) { + Int64Scalar dict_length(memo_table_.size()); + if (!internal::IntegersCanFit(dict_length, *index_type).ok()) { return Status::Invalid( "These dictionaries cannot be combined. The unified dictionary requires a " "larger index type."); diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index 757ab8e9c3f..64edec0c7aa 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -569,7 +569,7 @@ const ArrayVector& StructArray::fields() const { return boxed_fields_; } -std::shared_ptr StructArray::field(int i) const { +const std::shared_ptr& StructArray::field(int i) const { std::shared_ptr result = internal::atomic_load(&boxed_fields_[i]); if (!result) { std::shared_ptr field_data; @@ -578,10 +578,11 @@ std::shared_ptr StructArray::field(int i) const { } else { field_data = data_->child_data[i]; } - result = MakeArray(field_data); + std::shared_ptr result = MakeArray(field_data); internal::atomic_store(&boxed_fields_[i], result); + return boxed_fields_[i]; } - return result; + return boxed_fields_[i]; } std::shared_ptr StructArray::GetFieldByName(const std::string& name) const { diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h index be6371941df..5d04bef4f9e 100644 --- a/cpp/src/arrow/array/array_nested.h +++ b/cpp/src/arrow/array/array_nested.h @@ -378,7 +378,7 @@ class ARROW_EXPORT StructArray : public Array { // Return a shared pointer in case the requestor desires to share ownership // with this array. The returned array has its offset, length and null // count adjusted. 
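// Note: with this change field(i) hands out a const reference to an Array
// cached in boxed_fields_ (see the array_nested.cc hunk above), so repeated
// calls no longer copy a shared_ptr and the returned reference stays valid
// for the lifetime of the StructArray.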
- std::shared_ptr field(int pos) const; + const std::shared_ptr& field(int pos) const; const ArrayVector& fields() const; diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 929c4d3bf4d..0d9afba6ece 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -696,6 +696,7 @@ TEST_F(TestArray, TestMakeEmptyArray) { TEST_F(TestArray, TestAppendArraySlice) { auto scalars = GetScalars(); + ArraySpan span; for (const auto& scalar : scalars) { ARROW_SCOPED_TRACE(*scalar->type); ASSERT_OK_AND_ASSIGN(auto array, MakeArrayFromScalar(*scalar, 16)); @@ -704,31 +705,33 @@ TEST_F(TestArray, TestAppendArraySlice) { std::unique_ptr builder; ASSERT_OK(MakeBuilder(pool_, scalar->type, &builder)); - ASSERT_OK(builder->AppendArraySlice(*array->data(), 0, 4)); + span.SetMembers(*array->data()); + ASSERT_OK(builder->AppendArraySlice(span, 0, 4)); ASSERT_EQ(4, builder->length()); - ASSERT_OK(builder->AppendArraySlice(*array->data(), 0, 0)); + ASSERT_OK(builder->AppendArraySlice(span, 0, 0)); ASSERT_EQ(4, builder->length()); - ASSERT_OK(builder->AppendArraySlice(*array->data(), 1, 0)); + ASSERT_OK(builder->AppendArraySlice(span, 1, 0)); ASSERT_EQ(4, builder->length()); - ASSERT_OK(builder->AppendArraySlice(*array->data(), 1, 4)); + ASSERT_OK(builder->AppendArraySlice(span, 1, 4)); ASSERT_EQ(8, builder->length()); - ASSERT_OK(builder->AppendArraySlice(*nulls->data(), 0, 4)); + span.SetMembers(*nulls->data()); + ASSERT_OK(builder->AppendArraySlice(span, 0, 4)); ASSERT_EQ(12, builder->length()); if (!is_union(scalar->type->id())) { ASSERT_EQ(4, builder->null_count()); } - ASSERT_OK(builder->AppendArraySlice(*nulls->data(), 0, 0)); + ASSERT_OK(builder->AppendArraySlice(span, 0, 0)); ASSERT_EQ(12, builder->length()); if (!is_union(scalar->type->id())) { ASSERT_EQ(4, builder->null_count()); } - ASSERT_OK(builder->AppendArraySlice(*nulls->data(), 1, 0)); + ASSERT_OK(builder->AppendArraySlice(span, 1, 0)); ASSERT_EQ(12, builder->length()); if (!is_union(scalar->type->id())) { ASSERT_EQ(4, builder->null_count()); } - ASSERT_OK(builder->AppendArraySlice(*nulls->data(), 1, 4)); + ASSERT_OK(builder->AppendArraySlice(span, 1, 4)); ASSERT_EQ(16, builder->length()); if (!is_union(scalar->type->id())) { ASSERT_EQ(8, builder->null_count()); @@ -746,13 +749,15 @@ TEST_F(TestArray, TestAppendArraySlice) { { ASSERT_OK_AND_ASSIGN(auto array, MakeArrayOfNull(null(), 16)); NullBuilder builder(pool_); - ASSERT_OK(builder.AppendArraySlice(*array->data(), 0, 4)); + + span.SetMembers(*array->data()); + ASSERT_OK(builder.AppendArraySlice(span, 0, 4)); ASSERT_EQ(4, builder.length()); - ASSERT_OK(builder.AppendArraySlice(*array->data(), 0, 0)); + ASSERT_OK(builder.AppendArraySlice(span, 0, 0)); ASSERT_EQ(4, builder.length()); - ASSERT_OK(builder.AppendArraySlice(*array->data(), 1, 0)); + ASSERT_OK(builder.AppendArraySlice(span, 1, 0)); ASSERT_EQ(4, builder.length()); - ASSERT_OK(builder.AppendArraySlice(*array->data(), 1, 4)); + ASSERT_OK(builder.AppendArraySlice(span, 1, 4)); ASSERT_EQ(8, builder.length()); std::shared_ptr result; ASSERT_OK(builder.Finish(&result)); @@ -2257,23 +2262,23 @@ struct FWBinaryAppender { std::vector data; }; -TEST_F(TestFWBinaryArray, ArrayDataVisitor) { +TEST_F(TestFWBinaryArray, ArraySpanVisitor) { auto type = fixed_size_binary(3); auto array = ArrayFromJSON(type, R"(["abc", null, "def"])"); FWBinaryAppender appender; - ArrayDataVisitor visitor; + ArraySpanVisitor visitor; ASSERT_OK(visitor.Visit(*array->data(), &appender)); 
ASSERT_THAT(appender.data, ::testing::ElementsAreArray({"abc", "(null)", "def"})); ARROW_UNUSED(visitor); // Workaround weird MSVC warning } -TEST_F(TestFWBinaryArray, ArrayDataVisitorSliced) { +TEST_F(TestFWBinaryArray, ArraySpanVisitorSliced) { auto type = fixed_size_binary(3); auto array = ArrayFromJSON(type, R"(["abc", null, "def", "ghi"])")->Slice(1, 2); FWBinaryAppender appender; - ArrayDataVisitor visitor; + ArraySpanVisitor visitor; ASSERT_OK(visitor.Visit(*array->data(), &appender)); ASSERT_THAT(appender.data, ::testing::ElementsAreArray({"(null)", "def"})); ARROW_UNUSED(visitor); // Workaround weird MSVC warning diff --git a/cpp/src/arrow/array/builder_base.h b/cpp/src/arrow/array/builder_base.h index 4d0b477dcb8..bc4932a4b83 100644 --- a/cpp/src/arrow/array/builder_base.h +++ b/cpp/src/arrow/array/builder_base.h @@ -147,7 +147,7 @@ class ARROW_EXPORT ArrayBuilder { /// \brief Append a range of values from an array. /// /// The given array must be the same type as the builder. - virtual Status AppendArraySlice(const ArrayData& array, int64_t offset, + virtual Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) { return Status::NotImplemented("AppendArraySlice for builder for ", *type()); } diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index 703355bf278..25cec5c1e25 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -278,7 +278,7 @@ class BaseBinaryBuilder : public ArrayBuilder { return Status::OK(); } - Status AppendArraySlice(const ArrayData& array, int64_t offset, + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override { auto bitmap = array.GetValues(0, 0); auto offsets = array.GetValues(1); @@ -516,7 +516,7 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { Status AppendEmptyValue() final; Status AppendEmptyValues(int64_t length) final; - Status AppendArraySlice(const ArrayData& array, int64_t offset, + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override { return AppendValues( array.GetValues(1, 0) + ((array.offset + offset) * byte_width_), length, diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h index 90a7c48d9f0..b720f73d7d2 100644 --- a/cpp/src/arrow/array/builder_dict.h +++ b/cpp/src/arrow/array/builder_dict.h @@ -366,10 +366,11 @@ class DictionaryBuilderBase : public ArrayBuilder { return Status::OK(); } - Status AppendArraySlice(const ArrayData& array, int64_t offset, int64_t length) final { + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) final { // Visit the indices and insert the unpacked values. 
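// Note: ArraySpan carries no shared_ptr members, so the dictionary values are
// rehydrated into an owning array below via dictionary().ToArrayData() before
// the indices are visited.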
const auto& dict_ty = internal::checked_cast(*array.type); - const typename TypeTraits::ArrayType dict(array.dictionary); + // See if possible to avoid using ToArrayData here + const typename TypeTraits::ArrayType dict(array.dictionary().ToArrayData()); ARROW_RETURN_NOT_OK(Reserve(length)); switch (dict_ty.index_type()->id()) { case Type::UINT8: @@ -490,10 +491,10 @@ class DictionaryBuilderBase : public ArrayBuilder { protected: template Status AppendArraySliceImpl(const typename TypeTraits::ArrayType& dict, - const ArrayData& array, int64_t offset, int64_t length) { + const ArraySpan& array, int64_t offset, int64_t length) { const c_type* values = array.GetValues(1) + offset; return VisitBitBlocks( - array.buffers[0], array.offset + offset, length, + array.buffers[0].data, array.offset + offset, length, [&](const int64_t position) { const int64_t index = static_cast(values[position]); if (dict.IsValid(index)) { diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index 4f39ce86294..3d36cb5f65e 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -126,15 +126,15 @@ class BaseListBuilder : public ArrayBuilder { return Status::OK(); } - Status AppendArraySlice(const ArrayData& array, int64_t offset, + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override { const offset_type* offsets = array.GetValues(1); - const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR; + const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR; for (int64_t row = offset; row < offset + length; row++) { if (!validity || bit_util::GetBit(validity, array.offset + row)) { ARROW_RETURN_NOT_OK(Append()); int64_t slot_length = offsets[row + 1] - offsets[row]; - ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice(*array.child_data[0], + ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice(array.child_data[0], offsets[row], slot_length)); } else { ARROW_RETURN_NOT_OK(AppendNull()); @@ -296,18 +296,18 @@ class ARROW_EXPORT MapBuilder : public ArrayBuilder { Status AppendEmptyValues(int64_t length) final; - Status AppendArraySlice(const ArrayData& array, int64_t offset, + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override { const int32_t* offsets = array.GetValues(1); - const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR; + const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR; for (int64_t row = offset; row < offset + length; row++) { if (!validity || bit_util::GetBit(validity, array.offset + row)) { ARROW_RETURN_NOT_OK(Append()); const int64_t slot_length = offsets[row + 1] - offsets[row]; ARROW_RETURN_NOT_OK(key_builder_->AppendArraySlice( - *array.child_data[0]->child_data[0], offsets[row], slot_length)); + array.child_data[0].child_data[0], offsets[row], slot_length)); ARROW_RETURN_NOT_OK(item_builder_->AppendArraySlice( - *array.child_data[0]->child_data[1], offsets[row], slot_length)); + array.child_data[0].child_data[1], offsets[row], slot_length)); } else { ARROW_RETURN_NOT_OK(AppendNull()); } @@ -425,12 +425,12 @@ class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder { Status AppendEmptyValues(int64_t length) final; - Status AppendArraySlice(const ArrayData& array, int64_t offset, int64_t length) final { - const uint8_t* validity = array.MayHaveNulls() ? 
array.buffers[0]->data() : NULLPTR; + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) final { + const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR; for (int64_t row = offset; row < offset + length; row++) { if (!validity || bit_util::GetBit(validity, array.offset + row)) { ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice( - *array.child_data[0], list_size_ * (array.offset + row), list_size_)); + array.child_data[0], list_size_ * (array.offset + row), list_size_)); ARROW_RETURN_NOT_OK(Append()); } else { ARROW_RETURN_NOT_OK(AppendNull()); @@ -532,13 +532,13 @@ class ARROW_EXPORT StructBuilder : public ArrayBuilder { return Status::OK(); } - Status AppendArraySlice(const ArrayData& array, int64_t offset, + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override { for (int i = 0; static_cast(i) < children_.size(); i++) { - ARROW_RETURN_NOT_OK(children_[i]->AppendArraySlice(*array.child_data[i], + ARROW_RETURN_NOT_OK(children_[i]->AppendArraySlice(array.child_data[i], array.offset + offset, length)); } - const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR; + const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR; ARROW_RETURN_NOT_OK(Reserve(length)); UnsafeAppendToBitmap(validity, array.offset + offset, length); return Status::OK(); diff --git a/cpp/src/arrow/array/builder_primitive.h b/cpp/src/arrow/array/builder_primitive.h index c72b48f0b32..8f2dcc8b09b 100644 --- a/cpp/src/arrow/array/builder_primitive.h +++ b/cpp/src/arrow/array/builder_primitive.h @@ -53,7 +53,7 @@ class ARROW_EXPORT NullBuilder : public ArrayBuilder { Status Append(std::nullptr_t) { return AppendNull(); } - Status AppendArraySlice(const ArrayData&, int64_t, int64_t length) override { + Status AppendArraySlice(const ArraySpan&, int64_t, int64_t length) override { return AppendNulls(length); } @@ -279,7 +279,7 @@ class NumericBuilder : public ArrayBuilder { return Status::OK(); } - Status AppendArraySlice(const ArrayData& array, int64_t offset, + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override { return AppendValues(array.GetValues(1) + offset, length, array.GetValues(0, 0), array.offset + offset); @@ -513,7 +513,7 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { Status AppendValues(int64_t length, bool value); - Status AppendArraySlice(const ArrayData& array, int64_t offset, + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override { return AppendValues(array.GetValues(1, 0), length, array.GetValues(0, 0), array.offset + offset); diff --git a/cpp/src/arrow/array/builder_union.cc b/cpp/src/arrow/array/builder_union.cc index 6096b76ff21..883cda3d8b7 100644 --- a/cpp/src/arrow/array/builder_union.cc +++ b/cpp/src/arrow/array/builder_union.cc @@ -45,7 +45,7 @@ Status BasicUnionBuilder::FinishInternal(std::shared_ptr* out) { return Status::OK(); } -Status DenseUnionBuilder::AppendArraySlice(const ArrayData& array, const int64_t offset, +Status DenseUnionBuilder::AppendArraySlice(const ArraySpan& array, const int64_t offset, const int64_t length) { const int8_t* type_codes = array.GetValues(1); const int32_t* offsets = array.GetValues(2); @@ -55,7 +55,7 @@ Status DenseUnionBuilder::AppendArraySlice(const ArrayData& array, const int64_t const int32_t union_offset = offsets[row]; RETURN_NOT_OK(Append(type_code)); RETURN_NOT_OK(type_id_to_children_[type_code]->AppendArraySlice( - 
*array.child_data[child_id], union_offset, /*length=*/1)); + array.child_data[child_id], union_offset, /*length=*/1)); } return Status::OK(); } @@ -137,11 +137,11 @@ int8_t BasicUnionBuilder::NextTypeId() { return dense_type_id_++; } -Status SparseUnionBuilder::AppendArraySlice(const ArrayData& array, const int64_t offset, +Status SparseUnionBuilder::AppendArraySlice(const ArraySpan& array, const int64_t offset, const int64_t length) { for (size_t i = 0; i < type_codes_.size(); i++) { RETURN_NOT_OK(type_id_to_children_[type_codes_[i]]->AppendArraySlice( - *array.child_data[i], array.offset + offset, length)); + array.child_data[i], array.offset + offset, length)); } const int8_t* type_codes = array.GetValues(1); RETURN_NOT_OK(types_builder_.Append(type_codes + offset, length)); diff --git a/cpp/src/arrow/array/builder_union.h b/cpp/src/arrow/array/builder_union.h index f1629939ce3..eb8c5d3af0e 100644 --- a/cpp/src/arrow/array/builder_union.h +++ b/cpp/src/arrow/array/builder_union.h @@ -160,7 +160,7 @@ class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder { return offsets_builder_.Append(offset); } - Status AppendArraySlice(const ArrayData& array, int64_t offset, + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override; Status FinishInternal(std::shared_ptr* out) override; @@ -239,7 +239,7 @@ class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder { /// is called, and all other child builders must have null or empty value appended. Status Append(int8_t next_type) { return types_builder_.Append(next_type); } - Status AppendArraySlice(const ArrayData& array, int64_t offset, + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override; }; diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index 9dfc76c01ce..653d206f015 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -25,9 +25,12 @@ #include #include +#include "arrow/array/util.h" #include "arrow/buffer.h" +#include "arrow/scalar.h" #include "arrow/status.h" #include "arrow/type.h" +#include "arrow/type_traits.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" @@ -128,6 +131,141 @@ int64_t ArrayData::GetNullCount() const { return precomputed; } +// ---------------------------------------------------------------------- +// Methods for ArraySpan + +void ArraySpan::SetMembers(const ArrayData& data) { + this->type = data.type.get(); + this->length = data.length; + this->null_count = data.null_count.load(); + this->offset = data.offset; + + for (int i = 0; i < static_cast(data.buffers.size()); ++i) { + const std::shared_ptr& buffer = data.buffers[i]; + // It is the invoker-of-kernels's responsibility to ensure that + // const buffers are not written to accidentally. 
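// SetBuffer (declared in data.h) copies the buffer's raw data pointer and
// size into the matching BufferSpan and keeps a back-pointer to the owning
// shared_ptr, so the span stays non-owning but can still be turned back into
// ArrayData through GetBuffer()/ToArrayData().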
+ if (buffer) { + SetBuffer(i, buffer); + } else { + ClearBuffer(i); + } + } + + // Makes sure any other buffers are seen as null / non-existent + for (int i = static_cast(data.buffers.size()); i < 3; ++i) { + ClearBuffer(i); + } + + if (this->type->id() == Type::DICTIONARY) { + this->child_data.resize(1); + this->child_data[0].SetMembers(*data.dictionary); + } else { + this->child_data.resize(data.child_data.size()); + for (size_t child_index = 0; child_index < data.child_data.size(); ++child_index) { + this->child_data[child_index].SetMembers(*data.child_data[child_index]); + } + } +} + +void ArraySpan::FillFromScalar(const Scalar& value) { + static const uint8_t kValidByte = 0x01; + static const uint8_t kNullByte = 0x00; + + this->type = value.type.get(); + this->length = 1; + + // Populate null count and validity bitmap + this->null_count = value.is_valid ? 0 : 1; + this->buffers[0].data = const_cast(value.is_valid ? &kValidByte : &kNullByte); + this->buffers[0].size = 1; + + if (is_primitive(value.type->id())) { + const auto& scalar = + internal::checked_cast(value); + const uint8_t* scalar_data = reinterpret_cast(scalar.view().data()); + this->buffers[1].data = const_cast(scalar_data); + this->buffers[1].size = scalar.type->byte_width(); + } else { + // TODO(wesm): implement for other types + DCHECK(false) << "need to implement for other types"; + } +} + +int64_t ArraySpan::GetNullCount() const { + int64_t precomputed = this->null_count; + if (ARROW_PREDICT_FALSE(precomputed == kUnknownNullCount)) { + if (this->buffers[0].data != nullptr) { + precomputed = + this->length - CountSetBits(this->buffers[0].data, this->offset, this->length); + } else { + precomputed = 0; + } + this->null_count = precomputed; + } + return precomputed; +} + +int GetNumBuffers(const DataType& type) { + switch (type.id()) { + case Type::NA: + return 0; + case Type::STRUCT: + case Type::FIXED_SIZE_LIST: + return 1; + case Type::BINARY: + case Type::LARGE_BINARY: + case Type::STRING: + case Type::LARGE_STRING: + case Type::DENSE_UNION: + return 3; + case Type::EXTENSION: + // The number of buffers depends on the storage type + return GetNumBuffers( + *internal::checked_cast(type).storage_type()); + default: + // Everything else has 2 buffers + return 2; + } +} + +int ArraySpan::num_buffers() const { return GetNumBuffers(*this->type); } + +std::shared_ptr ArraySpan::ToArrayData() const { + auto result = std::make_shared(this->type->Copy(), this->length, + kUnknownNullCount, this->offset); + + for (int i = 0; i < this->num_buffers(); ++i) { + if (this->buffers[i].owner) { + result->buffers.emplace_back(this->GetBuffer(i)); + } else { + result->buffers.push_back(nullptr); + } + } + + if (this->type->id() == Type::NA) { + result->null_count = this->length; + } else if (this->buffers[0].data == nullptr) { + // No validity bitmap, so the null count is 0 + result->null_count = 0; + } + + // TODO(wesm): what about extension arrays? 
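// Note: a dictionary-encoded span stores its dictionary as the single entry
// in child_data (see SetMembers above), so it is converted back here through
// dictionary().ToArrayData() instead of being emitted as a regular child.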
+ + if (this->type->id() == Type::DICTIONARY) { + result->dictionary = this->dictionary().ToArrayData(); + } else { + // Emit children, too + for (size_t i = 0; i < this->child_data.size(); ++i) { + result->child_data.push_back(this->child_data[i].ToArrayData()); + } + } + return result; +} + +std::shared_ptr ArraySpan::ToArray() const { + return MakeArray(this->ToArrayData()); +} + // ---------------------------------------------------------------------- // Implement ArrayData::View diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h index 418d09def6b..df547aedfaf 100644 --- a/cpp/src/arrow/array/data.h +++ b/cpp/src/arrow/array/data.h @@ -25,11 +25,14 @@ #include "arrow/buffer.h" #include "arrow/result.h" +#include "arrow/util/bit_util.h" #include "arrow/util/macros.h" #include "arrow/util/visibility.h" namespace arrow { +class Array; + // When slicing, we do not know the null count of the sliced range without // doing some computation. To avoid doing this eagerly, we set the null count // to -1 (any negative number will do). When Array::null_count is called the @@ -242,6 +245,131 @@ struct ARROW_EXPORT ArrayData { std::shared_ptr dictionary; }; +/// \brief A non-owning Buffer reference +struct ARROW_EXPORT BufferSpan { + // It is the user of this class's responsibility to ensure that + // buffers that were const originally are not written to + // accidentally. + uint8_t* data = NULLPTR; + int64_t size = 0; + // Pointer back to buffer that owns this memory + const std::shared_ptr* owner = NULLPTR; +}; + +/// \brief EXPERIMENTAL: A non-owning ArrayData reference that is cheaply +/// copyable and does not contain any shared_ptr objects. Do not use in public +/// APIs aside from compute kernels for now +struct ARROW_EXPORT ArraySpan { + const DataType* type = NULLPTR; + int64_t length = 0; + mutable int64_t null_count = kUnknownNullCount; + int64_t offset = 0; + BufferSpan buffers[3]; + + ArraySpan() = default; + + explicit ArraySpan(const DataType* type, int64_t length) : type(type), length(length) {} + + ArraySpan(const ArrayData& data) { // NOLINT implicit conversion + SetMembers(data); + } + ArraySpan(const Scalar& data) { // NOLINT implicit converstion + FillFromScalar(data); + } + + /// If dictionary-encoded, put dictionary in the first entry + std::vector child_data; + + /// \brief Populate ArraySpan to look like an array of length 1 pointing at + /// the data members of a Scalar value + void FillFromScalar(const Scalar& value); + + void SetMembers(const ArrayData& data); + + void SetBuffer(int index, const std::shared_ptr& buffer) { + this->buffers[index].data = const_cast(buffer->data()); + this->buffers[index].size = buffer->size(); + this->buffers[index].owner = &buffer; + } + + void ClearBuffer(int index) { + this->buffers[index].data = NULLPTR; + this->buffers[index].size = 0; + this->buffers[index].owner = NULLPTR; + } + + const ArraySpan& dictionary() const { return child_data[0]; } + + /// \brief Return the number of buffers (out of 3) that are used to + /// constitute this array + int num_buffers() const; + + // Access a buffer's data as a typed C pointer + template + inline T* GetValues(int i, int64_t absolute_offset) { + return reinterpret_cast(buffers[i].data) + absolute_offset; + } + + template + inline T* GetValues(int i) { + return GetValues(i, this->offset); + } + + // Access a buffer's data as a typed C pointer + template + inline const T* GetValues(int i, int64_t absolute_offset) const { + return reinterpret_cast(buffers[i].data) + absolute_offset; 
+ } + + template + inline const T* GetValues(int i) const { + return GetValues(i, this->offset); + } + + bool IsNull(int64_t i) const { + return ((this->buffers[0].data != NULLPTR) + ? !bit_util::GetBit(this->buffers[0].data, i + this->offset) + : this->null_count == this->length); + } + + bool IsValid(int64_t i) const { + return ((this->buffers[0].data != NULLPTR) + ? bit_util::GetBit(this->buffers[0].data, i + this->offset) + : this->null_count != this->length); + } + + std::shared_ptr ToArrayData() const; + + std::shared_ptr ToArray() const; + + std::shared_ptr GetBuffer(int index) const { + if (this->buffers[index].owner == NULLPTR) { + return NULLPTR; + } else { + return *this->buffers[index].owner; + } + } + + void AddOffset(int64_t offset) { + this->offset += offset; + this->null_count = kUnknownNullCount; + } + + void SetOffset(int64_t offset) { + this->offset = offset; + this->null_count = kUnknownNullCount; + } + + /// \brief Return null count, or compute and set it if it's not known + int64_t GetNullCount() const; + + bool MayHaveNulls() const { + // If an ArrayData is slightly malformed it may have kUnknownNullCount set + // but no buffer + return null_count != 0 && buffers[0].data != NULLPTR; + } +}; + namespace internal { /// Construct a zero-copy view of this ArrayData with the given type. diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 413182de0df..e5b4ab39493 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -432,7 +432,9 @@ class NullArrayFactory { RETURN_NOT_OK(CreateBuffer()); } std::vector> child_data(type_->num_fields()); - out_ = ArrayData::Make(type_, length_, {buffer_}, child_data, length_, 0); + out_ = ArrayData::Make(type_, length_, + {SliceBuffer(buffer_, 0, bit_util::BytesForBits(length_))}, + child_data, length_, 0); RETURN_NOT_OK(VisitTypeInline(*type_, this)); return out_; } diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 96f8cdef07a..05155d64b6a 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -52,7 +52,7 @@ struct UTF8DataValidator { util::InitializeUTF8(); int64_t i = 0; - return VisitArrayDataInline( + return VisitArraySpanInline( data, [&](util::string_view v) { if (ARROW_PREDICT_FALSE(!util::ValidateUTF8(v))) { @@ -83,7 +83,7 @@ struct BoundsChecker { using c_type = typename IntegerType::c_type; int64_t i = 0; - return VisitArrayDataInline( + return VisitArraySpanInline( data, [&](c_type value) { const auto v = static_cast(value); @@ -173,7 +173,7 @@ struct ValidateArrayImpl { if (full_validation) { using c_type = typename Date64Type::c_type; - return VisitArrayDataInline( + return VisitArraySpanInline( data, [&](c_type date) { constexpr c_type kFullDayMillis = 1000 * 60 * 60 * 24; @@ -193,7 +193,7 @@ struct ValidateArrayImpl { if (full_validation) { using c_type = typename Time32Type::c_type; - return VisitArrayDataInline( + return VisitArraySpanInline( data, [&](c_type time) { constexpr c_type kFullDaySeconds = 60 * 60 * 24; @@ -221,7 +221,7 @@ struct ValidateArrayImpl { if (full_validation) { using c_type = typename Time64Type::c_type; - return VisitArrayDataInline( + return VisitArraySpanInline( data, [&](c_type time) { constexpr c_type kFullDayMicro = 1000000LL * 60 * 60 * 24; @@ -673,7 +673,7 @@ struct ValidateArrayImpl { using CType = typename TypeTraits::CType; if (full_validation) { const int32_t precision = type.precision(); - return VisitArrayDataInline( + return VisitArraySpanInline( data, [&](util::string_view 
bytes) { DCHECK_EQ(bytes.size(), DecimalType::kByteWidth); diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index f3d12dbece9..7c027c9b1e8 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -119,7 +119,7 @@ class ARROW_EXPORT TypeErasedIntBuilder : public ArrayBuilder { Status AppendScalars(const ScalarVector& scalars) override { return builder_->AppendScalars(scalars); } - Status AppendArraySlice(const ArrayData& array, int64_t offset, + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override { return builder_->AppendArraySlice(array, offset, length); } diff --git a/cpp/src/arrow/chunked_array.h b/cpp/src/arrow/chunked_array.h index 956595b117b..6ec7d11ac83 100644 --- a/cpp/src/arrow/chunked_array.h +++ b/cpp/src/arrow/chunked_array.h @@ -111,7 +111,7 @@ class ARROW_EXPORT ChunkedArray { int num_chunks() const { return static_cast(chunks_.size()); } /// \return chunk a particular chunk from the chunked array - std::shared_ptr chunk(int i) const { return chunks_[i]; } + const std::shared_ptr& chunk(int i) const { return chunks_[i]; } /// \return an ArrayVector of chunks const ArrayVector& chunks() const { return chunks_; } diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index e6b0ade315f..3bdff691778 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -369,7 +369,6 @@ static auto kWeekOptionsType = GetFunctionOptionsType( DataMember("count_from_zero", &WeekOptions::count_from_zero), DataMember("first_week_is_fully_in_year", &WeekOptions::first_week_is_fully_in_year)); static auto kRandomOptionsType = GetFunctionOptionsType( - DataMember("length", &RandomOptions::length), DataMember("initializer", &RandomOptions::initializer), DataMember("seed", &RandomOptions::seed)); @@ -583,12 +582,11 @@ WeekOptions::WeekOptions(bool week_starts_monday, bool count_from_zero, first_week_is_fully_in_year(first_week_is_fully_in_year) {} constexpr char WeekOptions::kTypeName[]; -RandomOptions::RandomOptions(int64_t length, Initializer initializer, uint64_t seed) +RandomOptions::RandomOptions(Initializer initializer, uint64_t seed) : FunctionOptions(internal::kRandomOptionsType), - length(length), initializer(initializer), seed(seed) {} -RandomOptions::RandomOptions() : RandomOptions(0, SystemRandom, 0) {} +RandomOptions::RandomOptions() : RandomOptions(SystemRandom, 0) {} constexpr char RandomOptions::kTypeName[]; namespace internal { diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 7f7b23c9371..7d86a555ec8 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -478,20 +478,14 @@ class ARROW_EXPORT RandomOptions : public FunctionOptions { public: enum Initializer { SystemRandom, Seed }; - static RandomOptions FromSystemRandom(int64_t length) { - return RandomOptions{length, SystemRandom, 0}; - } - static RandomOptions FromSeed(int64_t length, uint64_t seed) { - return RandomOptions{length, Seed, seed}; - } + static RandomOptions FromSystemRandom() { return RandomOptions{SystemRandom, 0}; } + static RandomOptions FromSeed(uint64_t seed) { return RandomOptions{Seed, seed}; } - RandomOptions(int64_t length, Initializer initializer, uint64_t seed); + RandomOptions(Initializer initializer, uint64_t seed); RandomOptions(); static constexpr char const kTypeName[] = "RandomOptions"; static RandomOptions Defaults() { return RandomOptions(); } - /// The length of the array returned. 
Negative is invalid. - int64_t length; /// The type of initialization for random number generation - system or provided seed. Initializer initializer; /// The seed value used to initialize the random number generation. diff --git a/cpp/src/arrow/compute/exec.cc b/cpp/src/arrow/compute/exec.cc index 186a3cdf3c1..da226a062d0 100644 --- a/cpp/src/arrow/compute/exec.cc +++ b/cpp/src/arrow/compute/exec.cc @@ -325,35 +325,164 @@ bool ExecBatchIterator::Next(ExecBatch* batch) { return true; } +// ---------------------------------------------------------------------- +// ExecSpanIterator; to eventually replace ExecBatchIterator + +Status ExecSpanIterator::Init(const ExecBatch& batch, ValueDescr::Shape output_shape, + int64_t max_chunksize) { + if (batch.num_values() > 0) { + // Validate arguments + ARROW_ASSIGN_OR_RAISE(int64_t inferred_length, InferBatchLength(batch.values)); + if (inferred_length != batch.length) { + return Status::Invalid("Value lengths differed from ExecBatch length"); + } + } + args_ = &batch.values; + initialized_ = have_chunked_arrays_ = false; + position_ = 0; + length_ = output_shape == ValueDescr::SCALAR ? 1 : batch.length; + chunk_indexes_.clear(); + chunk_indexes_.resize(args_->size(), 0); + value_positions_.clear(); + value_positions_.resize(args_->size(), 0); + value_offsets_.clear(); + value_offsets_.resize(args_->size(), 0); + max_chunksize_ = std::min(length_, max_chunksize); + return Status::OK(); +} + +int64_t ExecSpanIterator::GetNextChunkSpan(int64_t iteration_size, ExecSpan* span) { + for (size_t i = 0; i < args_->size() && iteration_size > 0; ++i) { + // If the argument is not a chunked array, it's either a Scalar or Array, + // in which case it doesn't influence the size of this span. Note that if + // the args are all scalars the span length is 1 + if (!args_->at(i).is_chunked_array()) { + continue; + } + const ChunkedArray* arg = args_->at(i).chunked_array().get(); + const Array* current_chunk; + while (true) { + current_chunk = arg->chunk(chunk_indexes_[i]).get(); + if (value_positions_[i] == current_chunk->length()) { + // Chunk is zero-length, or was exhausted in the previous + // iteration. Move to the next chunk + ++chunk_indexes_[i]; + current_chunk = arg->chunk(chunk_indexes_[i]).get(); + span->values[i].SetArray(*current_chunk->data()); + value_positions_[i] = 0; + value_offsets_[i] = current_chunk->offset(); + continue; + } + break; + } + iteration_size = + std::min(current_chunk->length() - value_positions_[i], iteration_size); + } + return iteration_size; +} + +bool ExecSpanIterator::Next(ExecSpan* span) { + if (position_ == length_) { + // This also protects from degenerate cases like ChunkedArrays + // without any chunks + return false; + } + + if (!initialized_) { + span->length = 0; + + // The first time this is called, we populate the output span with any + // Scalar or Array arguments in the ExecValue struct, and then just + // increment array offsets below. 
If any arguments are ChunkedArray, then + the internal ArraySpans will see their members updated during the + iteration + span->values.resize(args_->size()); + for (size_t i = 0; i < args_->size(); ++i) { + if (args_->at(i).is_scalar()) { + span->values[i].SetScalar(args_->at(i).scalar().get()); + } else if (args_->at(i).is_array()) { + const ArrayData& arr = *args_->at(i).array(); + span->values[i].SetArray(arr); + value_offsets_[i] = arr.offset; + } else { + // Populate members from the first chunk + const Array* first_chunk = args_->at(i).chunked_array()->chunk(0).get(); + const ArrayData& arr = *first_chunk->data(); + span->values[i].SetArray(arr); + value_offsets_[i] = arr.offset; + have_chunked_arrays_ = true; + } + } + initialized_ = true; + } + + if (position_ == length_) { + return false; + } + + // Determine how large the common contiguous "slice" of all the arguments is + int64_t iteration_size = std::min(length_ - position_, max_chunksize_); + if (have_chunked_arrays_) { + iteration_size = GetNextChunkSpan(iteration_size, span); + } + + // Now, adjust the span + span->length = iteration_size; + for (size_t i = 0; i < args_->size(); ++i) { + const Datum& arg = args_->at(i); + if (!arg.is_scalar()) { + ArraySpan* arr = &span->values[i].array; + arr->length = iteration_size; + arr->SetOffset(value_positions_[i] + value_offsets_[i]); + value_positions_[i] += iteration_size; + } + } + position_ += iteration_size; + DCHECK_LE(position_, length_); + return true; +} + namespace { struct NullGeneralization { enum type { PERHAPS_NULL, ALL_VALID, ALL_NULL }; - static type Get(const Datum& datum) { - const auto dtype_id = datum.type()->id(); + static type Get(const ExecValue& value) { + const auto dtype_id = value.type()->id(); if (dtype_id == Type::NA) { return ALL_NULL; } if (!arrow::internal::HasValidityBitmap(dtype_id)) { return ALL_VALID; } - if (datum.is_scalar()) { - return datum.scalar()->is_valid ?
ALL_VALID : ALL_NULL; + } else { + const ArraySpan& arr = value.array; // Do not count the bits if they haven't been counted already - const int64_t known_null_count = arr.null_count.load(); - if ((known_null_count == 0) || (arr.buffers[0] == NULLPTR)) { + if ((arr.null_count == 0) || (arr.buffers[0].data == nullptr)) { return ALL_VALID; } - if (known_null_count == arr.length) { + if (arr.null_count == arr.length) { return ALL_NULL; } } return PERHAPS_NULL; } + + static type Get(const Datum& datum) { + // Temporary workaround to help with ARROW-16756 + ExecValue value; + if (datum.is_array()) { + value.SetArray(*datum.array()); + } else if (datum.is_scalar()) { + value.SetScalar(datum.scalar().get()); + } else { + // TODO(wesm): ChunkedArray, I think + return PERHAPS_NULL; + } + return Get(value); + } }; // Null propagation implementation that deals both with preallocated bitmaps @@ -369,35 +498,29 @@ struct NullGeneralization { // * Otherwise, we allocate the bitmap and populate it class NullPropagator { public: - NullPropagator(KernelContext* ctx, const ExecBatch& batch, ArrayData* output) + NullPropagator(KernelContext* ctx, const ExecSpan& batch, ArrayData* output) : ctx_(ctx), batch_(batch), output_(output) { - for (const Datum& datum : batch_.values) { - auto null_generalization = NullGeneralization::Get(datum); - + for (const ExecValue& value : batch_.values) { + auto null_generalization = NullGeneralization::Get(value); if (null_generalization == NullGeneralization::ALL_NULL) { is_all_null_ = true; } - - if (null_generalization != NullGeneralization::ALL_VALID && - datum.kind() == Datum::ARRAY) { - arrays_with_nulls_.push_back(datum.array().get()); + if (null_generalization != NullGeneralization::ALL_VALID && value.is_array()) { + arrays_with_nulls_.push_back(&value.array); } } - if (output->buffers[0] != nullptr) { bitmap_preallocated_ = true; - SetBitmap(output_->buffers[0].get()); + bitmap_ = output_->buffers[0]->mutable_data(); } } - void SetBitmap(Buffer* bitmap) { bitmap_ = bitmap->mutable_data(); } - Status EnsureAllocated() { if (bitmap_preallocated_) { return Status::OK(); } ARROW_ASSIGN_OR_RAISE(output_->buffers[0], ctx_->AllocateBitmap(output_->length)); - SetBitmap(output_->buffers[0].get()); + bitmap_ = output_->buffers[0]->mutable_data(); return Status::OK(); } @@ -412,10 +535,10 @@ class NullPropagator { // Walk all the values with nulls instead of breaking on the first in case // we find a bitmap that can be reused in the non-preallocated case - for (const ArrayData* arr : arrays_with_nulls_) { - if (arr->null_count.load() == arr->length && arr->buffers[0] != nullptr) { + for (const ArraySpan* arr : arrays_with_nulls_) { + if (arr->null_count == arr->length && arr->buffers[0].owner != nullptr) { // Reuse this all null bitmap - output_->buffers[0] = arr->buffers[0]; + output_->buffers[0] = arr->GetBuffer(0); return Status::OK(); } } @@ -427,14 +550,14 @@ class NullPropagator { Status PropagateSingle() { // One array - const ArrayData& arr = *arrays_with_nulls_[0]; - const std::shared_ptr& arr_bitmap = arr.buffers[0]; + const ArraySpan& arr = *arrays_with_nulls_[0]; + const uint8_t* arr_bitmap = arr.buffers[0].data; // Reuse the null count if it's known - output_->null_count = arr.null_count.load(); + output_->null_count = arr.null_count; if (bitmap_preallocated_) { - CopyBitmap(arr_bitmap->data(), arr.offset, arr.length, bitmap_, output_->offset); + CopyBitmap(arr_bitmap, arr.offset, arr.length, bitmap_, output_->offset); return Status::OK(); } @@ -448,14 +571,13 @@ 
class NullPropagator { // the bitmap is not preallocated, and that precondition is asserted // higher in the call stack. if (arr.offset == 0) { - output_->buffers[0] = arr_bitmap; + output_->buffers[0] = arr.GetBuffer(0); } else if (arr.offset % 8 == 0) { - output_->buffers[0] = - SliceBuffer(arr_bitmap, arr.offset / 8, bit_util::BytesForBits(arr.length)); + output_->buffers[0] = SliceBuffer(arr.GetBuffer(0), arr.offset / 8, + bit_util::BytesForBits(arr.length)); } else { RETURN_NOT_OK(EnsureAllocated()); - CopyBitmap(arr_bitmap->data(), arr.offset, arr.length, bitmap_, - /*dst_offset=*/0); + CopyBitmap(arr_bitmap, arr.offset, arr.length, bitmap_, /*dst_offset=*/0); } return Status::OK(); } @@ -466,22 +588,22 @@ class NullPropagator { // Do not compute the intersection null count until it's needed RETURN_NOT_OK(EnsureAllocated()); - auto Accumulate = [&](const ArrayData& left, const ArrayData& right) { - DCHECK(left.buffers[0]); - DCHECK(right.buffers[0]); - BitmapAnd(left.buffers[0]->data(), left.offset, right.buffers[0]->data(), - right.offset, output_->length, output_->offset, - output_->buffers[0]->mutable_data()); + auto Accumulate = [&](const uint8_t* left_data, int64_t left_offset, + const uint8_t* right_data, int64_t right_offset) { + BitmapAnd(left_data, left_offset, right_data, right_offset, output_->length, + output_->offset, bitmap_); }; DCHECK_GT(arrays_with_nulls_.size(), 1); // Seed the output bitmap with the & of the first two bitmaps - Accumulate(*arrays_with_nulls_[0], *arrays_with_nulls_[1]); + Accumulate(arrays_with_nulls_[0]->buffers[0].data, arrays_with_nulls_[0]->offset, + arrays_with_nulls_[1]->buffers[0].data, arrays_with_nulls_[1]->offset); // Accumulate the rest for (size_t i = 2; i < arrays_with_nulls_.size(); ++i) { - Accumulate(*output_, *arrays_with_nulls_[i]); + Accumulate(bitmap_, output_->offset, arrays_with_nulls_[i]->buffers[0].data, + arrays_with_nulls_[i]->offset); } return Status::OK(); } @@ -527,8 +649,8 @@ class NullPropagator { private: KernelContext* ctx_; - const ExecBatch& batch_; - std::vector arrays_with_nulls_; + const ExecSpan& batch_; + std::vector arrays_with_nulls_; bool is_all_null_ = false; ArrayData* output_; uint8_t* bitmap_; @@ -573,13 +695,9 @@ class KernelExecutorImpl : public KernelExecutor { } protected: - // This is overridden by the VectorExecutor - virtual Status SetupArgIteration(const std::vector& args) { - ARROW_ASSIGN_OR_RAISE( - batch_iterator_, ExecBatchIterator::Make(args, exec_context()->exec_chunksize())); - return Status::OK(); - } - + // Prepare an output ArrayData to be written to. 
If + // Kernel::mem_allocation is not MemAllocation::PREALLOCATE, then no + // data buffers will be set Result> PrepareOutput(int64_t length) { auto out = std::make_shared(output_descr_.type, length); out->buffers.resize(output_num_buffers_); @@ -619,7 +737,6 @@ class KernelExecutorImpl : public KernelExecutor { KernelContext* kernel_ctx_; const KernelType* kernel_; - std::unique_ptr batch_iterator_; ValueDescr output_descr_; int output_num_buffers_; @@ -635,23 +752,36 @@ class KernelExecutorImpl : public KernelExecutor { class ScalarExecutor : public KernelExecutorImpl { public: - Status Execute(const std::vector& args, ExecListener* listener) override { - RETURN_NOT_OK(PrepareExecute(args)); - ExecBatch batch; - while (batch_iterator_->Next(&batch)) { - RETURN_NOT_OK(ExecuteBatch(batch, listener)); + Status Execute(const ExecBatch& batch, ExecListener* listener) override { + RETURN_NOT_OK(span_iterator_.Init(batch, output_descr_.shape, + exec_context()->exec_chunksize())); + + // TODO(wesm): remove if with ARROW-16757 + if (output_descr_.shape != ValueDescr::SCALAR) { + // If the executor is configured to produce a single large Array output for + // kernels supporting preallocation, then we do so up front and then + // iterate over slices of that large array. Otherwise, we preallocate prior + // to processing each span emitted from the ExecSpanIterator + RETURN_NOT_OK(SetupPreallocation(span_iterator_.length(), batch.values)); } - if (preallocate_contiguous_) { - // If we preallocated one big chunk, since the kernel execution is - // completed, we can now emit it - RETURN_NOT_OK(listener->OnResult(std::move(preallocated_))); + + // ARROW-16756: Here we have to accommodate the distinct cases + // + // * Fully-preallocated contiguous output + // * Fully-preallocated, non-contiguous kernel output + // * Not-fully-preallocated kernel output: we pass an empty or + // partially-filled ArrayData to the kernel + if (preallocating_all_buffers_) { + return ExecuteSpans(listener); + } else { + return ExecuteNonSpans(listener); } - return Status::OK(); } Datum WrapResults(const std::vector& inputs, const std::vector& outputs) override { if (output_descr_.shape == ValueDescr::SCALAR) { + // TODO(wesm): to remove, see ARROW-16757 DCHECK_EQ(outputs.size(), 1); // Return as SCALAR return outputs[0]; @@ -674,101 +804,118 @@ class ScalarExecutor : public KernelExecutorImpl { } protected: - Status ExecuteBatch(const ExecBatch& batch, ExecListener* listener) { - Datum out; - RETURN_NOT_OK(PrepareNextOutput(batch, &out)); - - if (output_descr_.shape == ValueDescr::ARRAY) { - ArrayData* out_arr = out.mutable_array(); - if (output_descr_.type->id() == Type::NA) { - out_arr->null_count = out_arr->length; - } else if (kernel_->null_handling == NullHandling::INTERSECTION) { - RETURN_NOT_OK(PropagateNulls(kernel_ctx_, batch, out_arr)); - } else if (kernel_->null_handling == NullHandling::OUTPUT_NOT_NULL) { - out_arr->null_count = 0; - } - } else { - if (kernel_->null_handling == NullHandling::INTERSECTION) { - // set scalar validity - out.scalar()->is_valid = - std::all_of(batch.values.begin(), batch.values.end(), - [](const Datum& input) { return input.scalar()->is_valid; }); - } else if (kernel_->null_handling == NullHandling::OUTPUT_NOT_NULL) { - out.scalar()->is_valid = true; - } - } - - RETURN_NOT_OK(kernel_->exec(kernel_ctx_, batch, &out)); + Status ExecuteSpans(ExecListener* listener) { + // We put the preallocation in an ArraySpan to be passed to the + // kernel which is expecting to receive that. 
More + // performance-critical code (e.g. expression evaluation) should + // eventually skip the creation of ArrayData altogether + std::shared_ptr preallocation; + ExecSpan input; + ExecResult output; + ArraySpan* output_span = output.array_span(); if (preallocate_contiguous_) { - // Some kernels may like to simply nullify the validity bitmap when - // they know the output will have 0 nulls. However, this is not compatible - // with writing into slices. - if (output_descr_.shape == ValueDescr::ARRAY) { - DCHECK(out.array()->buffers[0]) - << "Null bitmap deleted by kernel but can_write_into_slices = true"; + // Make one big output allocation + ARROW_ASSIGN_OR_RAISE(preallocation, PrepareOutput(span_iterator_.length())); + + // Populate and then reuse the ArraySpan inside + output_span->SetMembers(*preallocation); + output_span->offset = 0; + while (span_iterator_.Next(&input)) { + // Set absolute output span position and length + output_span->length = input.length; + RETURN_NOT_OK(ExecuteSingleSpan(input, &output)); + output_span->SetOffset(span_iterator_.position()); } + + // Kernel execution is complete; emit result + RETURN_NOT_OK(listener->OnResult(std::move(preallocation))); } else { - // If we are producing chunked output rather than one big array, then - // emit each chunk as soon as it's available - RETURN_NOT_OK(listener->OnResult(std::move(out))); + // Fully preallocating, but not contiguously + // We preallocate (maybe) only for the output of processing the current + // chunk + while (span_iterator_.Next(&input)) { + ARROW_ASSIGN_OR_RAISE(preallocation, PrepareOutput(input.length)); + output_span->SetMembers(*preallocation); + RETURN_NOT_OK(ExecuteSingleSpan(input, &output)); + // Emit the result for this chunk + RETURN_NOT_OK(listener->OnResult(std::move(preallocation))); + } } return Status::OK(); } - Status PrepareExecute(const std::vector& args) { - RETURN_NOT_OK(this->SetupArgIteration(args)); - - if (output_descr_.shape == ValueDescr::ARRAY) { - // If the executor is configured to produce a single large Array output for - // kernels supporting preallocation, then we do so up front and then - // iterate over slices of that large array. Otherwise, we preallocate prior - // to processing each batch emitted from the ExecBatchIterator - RETURN_NOT_OK(SetupPreallocation(batch_iterator_->length(), args)); + Status ExecuteSingleSpan(const ExecSpan& input, ExecResult* out) { + ArraySpan* result_span = out->array_span(); + if (output_descr_.type->id() == Type::NA) { + result_span->null_count = result_span->length; + } else if (kernel_->null_handling == NullHandling::INTERSECTION) { + if (!elide_validity_bitmap_) { + PropagateNullsSpans(input, result_span); + } + } else if (kernel_->null_handling == NullHandling::OUTPUT_NOT_NULL) { + result_span->null_count = 0; } - return Status::OK(); + return kernel_->exec(kernel_ctx_, input, out); } - // We must accommodate two different modes of execution for preallocated - // execution - // - // * A single large ("contiguous") allocation that we populate with results - // on a chunkwise basis according to the ExecBatchIterator. This permits - // parallelization even if the objective is to obtain a single Array or - // ChunkedArray at the end - // * A standalone buffer preallocation for each chunk emitted from the - // ExecBatchIterator - // - // When data buffer preallocation is not possible (e.g. with BINARY / STRING - // outputs), then contiguous results are only possible if the input is - // contiguous. 
- - Status PrepareNextOutput(const ExecBatch& batch, Datum* out) { - if (output_descr_.shape == ValueDescr::ARRAY) { - if (preallocate_contiguous_) { - // The output is already fully preallocated - const int64_t batch_start_position = batch_iterator_->position() - batch.length; - - if (batch.length < batch_iterator_->length()) { - // If this is a partial execution, then we write into a slice of - // preallocated_ - out->value = preallocated_->Slice(batch_start_position, batch.length); - } else { - // Otherwise write directly into preallocated_. The main difference - // computationally (versus the Slice approach) is that the null_count - // may not need to be recomputed in the result - out->value = preallocated_; + Status ExecuteNonSpans(ExecListener* listener) { + // ARROW-16756: Kernel is going to allocate some memory and so + // for the time being we pass in an empty or partially-filled + // shared_ptr or shared_ptr to be populated + // by the kernel. + // + // We will eventually delete the Scalar output path per + // ARROW-16757. + ExecSpan input; + ExecResult output; + while (span_iterator_.Next(&input)) { + if (output_descr_.shape == ValueDescr::ARRAY) { + ARROW_ASSIGN_OR_RAISE(output.value, PrepareOutput(input.length)); + DCHECK(output.is_array_data()); + } else { + // For scalar outputs, we set a null scalar of the correct type to + // communicate the output type to the kernel if needed + // + // XXX: Is there some way to avoid this step? + // TODO: Remove this path in ARROW-16757 + output.value = MakeNullScalar(output_descr_.type); + } + + if (output_descr_.shape == ValueDescr::ARRAY) { + ArrayData* out_arr = output.array_data().get(); + if (output_descr_.type->id() == Type::NA) { + out_arr->null_count = out_arr->length; + } else if (kernel_->null_handling == NullHandling::INTERSECTION) { + RETURN_NOT_OK(PropagateNulls(kernel_ctx_, input, out_arr)); + } else if (kernel_->null_handling == NullHandling::OUTPUT_NOT_NULL) { + out_arr->null_count = 0; } } else { - // We preallocate (maybe) only for the output of processing the current - // batch - ARROW_ASSIGN_OR_RAISE(out->value, PrepareOutput(batch.length)); + // TODO(wesm): to remove, see ARROW-16757 + if (kernel_->null_handling == NullHandling::INTERSECTION) { + // set scalar validity + output.scalar()->is_valid = + std::all_of(input.values.begin(), input.values.end(), + [](const ExecValue& input) { return input.scalar->is_valid; }); + } else if (kernel_->null_handling == NullHandling::OUTPUT_NOT_NULL) { + output.scalar()->is_valid = true; + } + } + + RETURN_NOT_OK(kernel_->exec(kernel_ctx_, input, &output)); + + // Assert that the kernel did not alter the shape of the output + // type. After ARROW-16577 delete this since ValueDescr::SCALAR will not + // exist anymore + DCHECK(((output_descr_.shape == ValueDescr::ARRAY) && output.is_array_data()) || + ((output_descr_.shape == ValueDescr::SCALAR) && output.is_scalar())); + + // Emit a result for each chunk + if (output_descr_.shape == ValueDescr::ARRAY) { + RETURN_NOT_OK(listener->OnResult(output.array_data())); + } else { + RETURN_NOT_OK(listener->OnResult(output.scalar())); } - } else { - // For scalar outputs, we set a null scalar of the correct type to - // communicate the output type to the kernel if needed - // - // XXX: Is there some way to avoid this step? 
- out->value = MakeNullScalar(output_descr_.type); } return Status::OK(); } @@ -780,23 +927,42 @@ class ScalarExecutor : public KernelExecutorImpl { // - Output Array is NullArray // - kernel_->null_handling is COMPUTED_NO_PREALLOCATE or OUTPUT_NOT_NULL validity_preallocated_ = false; + if (out_type_id != Type::NA) { if (kernel_->null_handling == NullHandling::COMPUTED_PREALLOCATE) { // Override the flag if kernel asks for pre-allocation validity_preallocated_ = true; } else if (kernel_->null_handling == NullHandling::INTERSECTION) { - bool are_all_inputs_valid = true; + elide_validity_bitmap_ = true; for (const auto& arg : args) { auto null_gen = NullGeneralization::Get(arg) == NullGeneralization::ALL_VALID; - are_all_inputs_valid = are_all_inputs_valid && null_gen; + + // If not all valid, this becomes false + elide_validity_bitmap_ = elide_validity_bitmap_ && null_gen; } - validity_preallocated_ = !are_all_inputs_valid; + validity_preallocated_ = !elide_validity_bitmap_; + } else if (kernel_->null_handling == NullHandling::OUTPUT_NOT_NULL) { + elide_validity_bitmap_ = true; } } if (kernel_->mem_allocation == MemAllocation::PREALLOCATE) { ComputeDataPreallocate(*output_descr_.type, &data_preallocated_); } + // Validity bitmap either preallocated or elided, and all data + // buffers allocated. This is basically only true for primitive + // types that are not dictionary-encoded + preallocating_all_buffers_ = + ((validity_preallocated_ || elide_validity_bitmap_) && + data_preallocated_.size() == static_cast(output_num_buffers_ - 1) && + !is_nested(out_type_id) && !is_dictionary(out_type_id)); + + // TODO(wesm): why was this check ever here? Fixed width binary + // can be 0-width but anything else? + DCHECK(std::all_of( + data_preallocated_.begin(), data_preallocated_.end(), + [](const BufferPreallocation& prealloc) { return prealloc.bit_width >= 0; })); + // Contiguous preallocation only possible on non-nested types if all // buffers are preallocated. Otherwise, we must go chunk-by-chunk. // @@ -804,26 +970,25 @@ class ScalarExecutor : public KernelExecutorImpl { // kernel's attributes. preallocate_contiguous_ = (exec_context()->preallocate_contiguous() && kernel_->can_write_into_slices && - validity_preallocated_ && !is_nested(out_type_id) && - !is_dictionary(out_type_id) && - data_preallocated_.size() == static_cast(output_num_buffers_ - 1) && - std::all_of(data_preallocated_.begin(), data_preallocated_.end(), - [](const BufferPreallocation& prealloc) { - return prealloc.bit_width >= 0; - })); - if (preallocate_contiguous_) { - ARROW_ASSIGN_OR_RAISE(preallocated_, PrepareOutput(total_length)); - } + preallocating_all_buffers_); return Status::OK(); } + // Used to account for the case where we do not preallocate a + // validity bitmap because the inputs are all non-null and we're + // using NullHandling::INTERSECTION to compute the validity bitmap + bool elide_validity_bitmap_ = false; + + // All memory is preallocated for output, contiguous and + // non-contiguous + bool preallocating_all_buffers_ = false; + // If true, and the kernel and output type supports preallocation (for both // the validity and data buffers), then we allocate one big array and then // iterate through it while executing the kernel in chunks bool preallocate_contiguous_ = false; - // For storing a contiguous preallocation per above. 
Unused otherwise - std::shared_ptr preallocated_; + ExecSpanIterator span_iterator_; }; Status PackBatchNoChunks(const std::vector& args, ExecBatch* out) { @@ -847,7 +1012,7 @@ Status PackBatchNoChunks(const std::vector& args, ExecBatch* out) { class VectorExecutor : public KernelExecutorImpl { public: - Status Execute(const std::vector& args, ExecListener* listener) override { + Status ExecuteImpl(const std::vector& args, ExecListener* listener) { RETURN_NOT_OK(PrepareExecute(args)); ExecBatch batch; if (kernel_->can_execute_chunkwise) { @@ -861,6 +1026,10 @@ class VectorExecutor : public KernelExecutorImpl { return Finalize(listener); } + Status Execute(const ExecBatch& batch, ExecListener* listener) override { + return ExecuteImpl(batch.values, listener); + } + Datum WrapResults(const std::vector& inputs, const std::vector& outputs) override { // If execution yielded multiple chunks (because large arrays were split @@ -888,7 +1057,7 @@ class VectorExecutor : public KernelExecutorImpl { if (kernel_->null_handling == NullHandling::INTERSECTION && output_descr_.shape == ValueDescr::ARRAY) { - RETURN_NOT_OK(PropagateNulls(kernel_ctx_, batch, out.mutable_array())); + RETURN_NOT_OK(PropagateNulls(kernel_ctx_, ExecSpan(batch), out.mutable_array())); } RETURN_NOT_OK(kernel_->exec(kernel_ctx_, batch, &out)); if (!kernel_->finalize) { @@ -913,16 +1082,11 @@ class VectorExecutor : public KernelExecutorImpl { return Status::OK(); } - Status SetupArgIteration(const std::vector& args) override { + Status PrepareExecute(const std::vector& args) { if (kernel_->can_execute_chunkwise) { ARROW_ASSIGN_OR_RAISE(batch_iterator_, ExecBatchIterator::Make( args, exec_context()->exec_chunksize())); } - return Status::OK(); - } - - Status PrepareExecute(const std::vector& args) { - RETURN_NOT_OK(this->SetupArgIteration(args)); output_num_buffers_ = static_cast(output_descr_.type->layout().buffers.size()); // Decide if we need to preallocate memory for this kernel @@ -935,6 +1099,7 @@ class VectorExecutor : public KernelExecutorImpl { return Status::OK(); } + std::unique_ptr batch_iterator_; std::vector results_; }; @@ -946,8 +1111,13 @@ class ScalarAggExecutor : public KernelExecutorImpl { return KernelExecutorImpl::Init(ctx, args); } - Status Execute(const std::vector& args, ExecListener* listener) override { - RETURN_NOT_OK(this->SetupArgIteration(args)); + Status Execute(const ExecBatch& args, ExecListener* listener) override { + return ExecuteImpl(args.values, listener); + } + + Status ExecuteImpl(const std::vector& args, ExecListener* listener) { + ARROW_ASSIGN_OR_RAISE( + batch_iterator_, ExecBatchIterator::Make(args, exec_context()->exec_chunksize())); ExecBatch batch; while (batch_iterator_->Next(&batch)) { @@ -988,6 +1158,7 @@ class ScalarAggExecutor : public KernelExecutorImpl { return Status::OK(); } + std::unique_ptr batch_iterator_; const std::vector* input_descrs_; const FunctionOptions* options_; }; @@ -1004,7 +1175,7 @@ Result> MakeExecutor(ExecContext* ctx, } // namespace -Status PropagateNulls(KernelContext* ctx, const ExecBatch& batch, ArrayData* output) { +Status PropagateNulls(KernelContext* ctx, const ExecSpan& batch, ArrayData* output) { DCHECK_NE(nullptr, output); DCHECK_GT(output->buffers.size(), 0); @@ -1026,6 +1197,67 @@ Status PropagateNulls(KernelContext* ctx, const ExecBatch& batch, ArrayData* out return propagator.Execute(); } +void PropagateNullsSpans(const ExecSpan& batch, ArraySpan* out) { + if (out->type->id() == Type::NA) { + // Null output type is a no-op (rare when this would 
happen but we at least + // will test for it) + return; + } + + std::vector arrays_with_nulls; + bool is_all_null = false; + for (const ExecValue& value : batch.values) { + auto null_generalization = NullGeneralization::Get(value); + if (null_generalization == NullGeneralization::ALL_NULL) { + is_all_null = true; + } + if (null_generalization != NullGeneralization::ALL_VALID && value.is_array()) { + arrays_with_nulls.push_back(&value.array); + } + } + uint8_t* out_bitmap = out->buffers[0].data; + if (is_all_null) { + // An all-null value (scalar null or all-null array) gives us a short + // circuit opportunity + // OK, the output should be all null + out->null_count = out->length; + bit_util::SetBitsTo(out_bitmap, out->offset, out->length, false); + return; + } + + out->null_count = kUnknownNullCount; + if (arrays_with_nulls.empty()) { + // No arrays with nulls case + out->null_count = 0; + if (out_bitmap != nullptr) { + // An output buffer was allocated, so we fill it with all valid + bit_util::SetBitsTo(out_bitmap, out->offset, out->length, true); + } + } else if (arrays_with_nulls.size() == 1) { + // One array + const ArraySpan& arr = *arrays_with_nulls[0]; + + // Reuse the null count if it's known + out->null_count = arr.null_count; + CopyBitmap(arr.buffers[0].data, arr.offset, arr.length, out_bitmap, out->offset); + } else { + // More than one array. We use BitmapAnd to intersect their bitmaps + auto Accumulate = [&](const ArraySpan& left, const ArraySpan& right) { + DCHECK(left.buffers[0].data != nullptr); + DCHECK(right.buffers[0].data != nullptr); + BitmapAnd(left.buffers[0].data, left.offset, right.buffers[0].data, right.offset, + out->length, out->offset, out_bitmap); + }; + // Seed the output bitmap with the & of the first two bitmaps + Accumulate(*arrays_with_nulls[0], *arrays_with_nulls[1]); + + // Accumulate the rest + for (size_t i = 2; i < arrays_with_nulls.size(); ++i) { + Accumulate(*out, *arrays_with_nulls[i]); + } + } +} + std::unique_ptr KernelExecutor::MakeScalar() { return ::arrow::internal::make_unique(); } @@ -1038,6 +1270,41 @@ std::unique_ptr KernelExecutor::MakeScalarAggregate() { return ::arrow::internal::make_unique(); } +Result InferBatchLength(const std::vector& values) { + int64_t length = -1; + bool are_all_scalar = true; + for (const Datum& arg : values) { + if (arg.is_array()) { + int64_t arg_length = arg.array()->length; + if (length < 0) { + length = arg_length; + } else { + if (length != arg_length) { + return Status::Invalid("Array arguments must all be the same length"); + } + } + are_all_scalar = false; + } else if (arg.is_chunked_array()) { + int64_t arg_length = arg.chunked_array()->length(); + if (length < 0) { + length = arg_length; + } else { + if (length != arg_length) { + return Status::Invalid("Array arguments must all be the same length"); + } + } + are_all_scalar = false; + } + } + + if (are_all_scalar && values.size() > 0) { + length = 1; + } else if (length < 0) { + length = 0; + } + return length; +} + } // namespace detail ExecContext::ExecContext(MemoryPool* pool, ::arrow::internal::Executor* executor, @@ -1083,5 +1350,21 @@ Result CallFunction(const std::string& func_name, const std::vector CallFunction(const std::string& func_name, const ExecBatch& batch, + const FunctionOptions* options, ExecContext* ctx) { + if (ctx == nullptr) { + ExecContext default_ctx; + return CallFunction(func_name, batch, options, &default_ctx); + } + ARROW_ASSIGN_OR_RAISE(std::shared_ptr func, + ctx->func_registry()->GetFunction(func_name)); + return 
func->Execute(batch, options, ctx); +} + +Result CallFunction(const std::string& func_name, const ExecBatch& batch, + ExecContext* ctx) { + return CallFunction(func_name, batch, /*options=*/nullptr, ctx); +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/exec.h b/cpp/src/arrow/compute/exec.h index 742c3794416..ba41bfb5b6e 100644 --- a/cpp/src/arrow/compute/exec.h +++ b/cpp/src/arrow/compute/exec.h @@ -20,6 +20,8 @@ #pragma once +#include +#include #include #include #include @@ -210,7 +212,7 @@ struct ARROW_EXPORT ExecBatch { /// If the array values are of length 0 then the length is 0 regardless of /// whether any values are Scalar. In general ExecBatch objects are produced /// by ExecBatchIterator which by design does not yield length-0 batches. - int64_t length; + int64_t length = 0; /// \brief The sum of bytes in each buffer referenced by the batch /// @@ -252,6 +254,192 @@ struct ARROW_EXPORT ExecBatch { inline bool operator==(const ExecBatch& l, const ExecBatch& r) { return l.Equals(r); } inline bool operator!=(const ExecBatch& l, const ExecBatch& r) { return !l.Equals(r); } +struct ExecValue { + enum Kind { ARRAY, SCALAR }; + Kind kind = ARRAY; + ArraySpan array; + const Scalar* scalar; + + ExecValue(Scalar* scalar) // NOLINT implicit conversion + : kind(SCALAR), scalar(scalar) {} + + ExecValue(ArraySpan array) // NOLINT implicit conversion + : kind(ARRAY), array(std::move(array)) {} + + ExecValue(const ArrayData& array) // NOLINT implicit conversion + : kind(ARRAY) { + this->array.SetMembers(array); + } + + ExecValue() = default; + ExecValue(const ExecValue& other) = default; + ExecValue& operator=(const ExecValue& other) = default; + ExecValue(ExecValue&& other) = default; + ExecValue& operator=(ExecValue&& other) = default; + + int64_t length() const { return this->is_array() ? this->array.length : 1; } + + bool is_array() const { return this->kind == ARRAY; } + bool is_scalar() const { return this->kind == SCALAR; } + + void SetArray(const ArrayData& array) { + this->kind = ARRAY; + this->array.SetMembers(array); + } + + void SetScalar(const Scalar* scalar) { + this->kind = SCALAR; + this->scalar = scalar; + } + + template + const ExactType& scalar_as() const { + return ::arrow::internal::checked_cast(*this->scalar); + } + + /// XXX: here only temporarily until type resolution can be cleaned + /// up to not use ValueDescr + ValueDescr descr() const { + ValueDescr::Shape shape = this->is_array() ? ValueDescr::ARRAY : ValueDescr::SCALAR; + return ValueDescr(const_cast(this->type())->shared_from_this(), shape); + } + + /// XXX: here temporarily for compatibility with datum, see + /// e.g. MakeStructExec in scalar_nested.cc + int64_t null_count() const { + if (this->is_array()) { + return this->array.GetNullCount(); + } else { + return this->scalar->is_valid ? 
0 : 1; + } + } + + const DataType* type() const { + if (this->kind == ARRAY) { + return array.type; + } else { + return scalar->type.get(); + } + } +}; + +struct ARROW_EXPORT ExecResult { + // The default value of the variant is ArraySpan + // TODO(wesm): remove Scalar output modality in ARROW-16577 + util::Variant, std::shared_ptr> value; + + int64_t length() const { + if (this->is_array_span()) { + return this->array_span()->length; + } else if (this->is_array_data()) { + return this->array_data()->length; + } else { + // Should not reach here + return 1; + } + } + + const DataType* type() const { + switch (this->value.index()) { + case 0: + return this->array_span()->type; + case 1: + return this->array_data()->type.get(); + default: + // scalar + return this->scalar()->type.get(); + } + } + + ArraySpan* array_span() const { + return const_cast(&util::get(this->value)); + } + bool is_array_span() const { return this->value.index() == 0; } + + const std::shared_ptr& array_data() const { + return util::get>(this->value); + } + + bool is_array_data() const { return this->value.index() == 1; } + + const std::shared_ptr& scalar() const { + return util::get>(this->value); + } + + bool is_scalar() const { return this->value.index() == 2; } +}; + +/// \brief A "lightweight" column batch object which contains no +/// std::shared_ptr objects and does not have any memory ownership +/// semantics. Can represent a view onto an "owning" ExecBatch. +struct ARROW_EXPORT ExecSpan { + ExecSpan() = default; + ExecSpan(const ExecSpan& other) = default; + ExecSpan& operator=(const ExecSpan& other) = default; + ExecSpan(ExecSpan&& other) = default; + ExecSpan& operator=(ExecSpan&& other) = default; + + explicit ExecSpan(std::vector values, int64_t length) + : length(length), values(std::move(values)) {} + + explicit ExecSpan(const ExecBatch& batch) { + this->length = batch.length; + this->values.resize(batch.values.size()); + for (size_t i = 0; i < batch.values.size(); ++i) { + const Datum& in_value = batch[i]; + ExecValue* out_value = &this->values[i]; + if (in_value.is_array()) { + out_value->SetArray(*in_value.array()); + } else { + out_value->SetScalar(in_value.scalar().get()); + } + } + } + + bool is_all_scalar() const { + return std::all_of(this->values.begin(), this->values.end(), + [](const ExecValue& v) { return v.is_scalar(); }); + } + + /// \brief Return the value at the i-th index + template + inline const ExecValue& operator[](index_type i) const { + return values[i]; + } + + void AddOffset(int64_t offset) { + for (ExecValue& value : values) { + if (value.kind == ExecValue::ARRAY) { + value.array.AddOffset(offset); + } + } + } + + void SetOffset(int64_t offset) { + for (ExecValue& value : values) { + if (value.kind == ExecValue::ARRAY) { + value.array.SetOffset(offset); + } + } + } + + /// \brief A convenience for the number of values / arguments. + int num_values() const { return static_cast(values.size()); } + + // XXX: eliminate the need for ValueDescr; copied temporarily from + // ExecBatch + std::vector GetDescriptors() const { + std::vector result; + for (const auto& value : this->values) { + result.emplace_back(value.descr()); + } + return result; + } + + int64_t length = 0; + std::vector values; +}; + /// \defgroup compute-call-function One-shot calls to compute functions /// /// @{ @@ -271,6 +459,21 @@ ARROW_EXPORT Result CallFunction(const std::string& func_name, const std::vector& args, ExecContext* ctx = NULLPTR); +/// \brief One-shot invoker for all types of functions. 
+/// +/// Does kernel dispatch, argument checking, iteration of ChunkedArray inputs, +/// and wrapping of outputs. +ARROW_EXPORT +Result CallFunction(const std::string& func_name, const ExecBatch& batch, + const FunctionOptions* options, ExecContext* ctx = NULLPTR); + +/// \brief Variant of CallFunction which uses a function's default options. +/// +/// NB: Some functions require FunctionOptions be provided. +ARROW_EXPORT +Result CallFunction(const std::string& func_name, const ExecBatch& batch, + ExecContext* ctx = NULLPTR); + /// @} } // namespace compute diff --git a/cpp/src/arrow/compute/exec/expression.cc b/cpp/src/arrow/compute/exec/expression.cc index a65379460dd..2e0fe6ff34b 100644 --- a/cpp/src/arrow/compute/exec/expression.cc +++ b/cpp/src/arrow/compute/exec/expression.cc @@ -581,9 +581,14 @@ Result ExecuteScalarExpression(const Expression& expr, const ExecBatch& i auto call = CallNotNull(expr); std::vector arguments(call->arguments.size()); + + bool all_scalar = true; for (size_t i = 0; i < arguments.size(); ++i) { ARROW_ASSIGN_OR_RAISE( arguments[i], ExecuteScalarExpression(call->arguments[i], input, exec_context)); + if (arguments[i].is_array()) { + all_scalar = false; + } } auto executor = compute::detail::KernelExecutor::MakeScalar(); @@ -597,7 +602,8 @@ Result ExecuteScalarExpression(const Expression& expr, const ExecBatch& i RETURN_NOT_OK(executor->Init(&kernel_context, {kernel, descrs, options})); compute::detail::DatumAccumulator listener; - RETURN_NOT_OK(executor->Execute(arguments, &listener)); + RETURN_NOT_OK(executor->Execute( + ExecBatch(std::move(arguments), all_scalar ? 1 : input.length), &listener)); const auto out = executor->WrapResults(arguments, listener.values()); #ifndef NDEBUG DCHECK_OK(executor->CheckResultType(out, call->function_name.c_str())); @@ -664,7 +670,7 @@ Result FoldConstants(Expression expr) { if (std::all_of(call->arguments.begin(), call->arguments.end(), [](const Expression& argument) { return argument.literal(); })) { // all arguments are literal; we can evaluate this subexpression *now* - static const ExecBatch ignored_input = ExecBatch{}; + static const ExecBatch ignored_input = ExecBatch({}, 1); ARROW_ASSIGN_OR_RAISE(Datum constant, ExecuteScalarExpression(expr, ignored_input)); diff --git a/cpp/src/arrow/compute/exec_internal.h b/cpp/src/arrow/compute/exec_internal.h index 74124f02267..fac78da6db1 100644 --- a/cpp/src/arrow/compute/exec_internal.h +++ b/cpp/src/arrow/compute/exec_internal.h @@ -72,6 +72,56 @@ class ARROW_EXPORT ExecBatchIterator { int64_t max_chunksize_; }; +/// \brief Break std::vector into a sequence of non-owning +/// ExecSpan for kernel execution. The lifetime of the Datum vector +/// must be longer than the lifetime of this object +class ARROW_EXPORT ExecSpanIterator { + public: + ExecSpanIterator() = default; + + /// \brief Initialize itertor iterator and do basic argument validation + /// + /// \param[in] batch the input ExecBatch + /// \param[in] max_chunksize the maximum length of each ExecSpan. Depending + /// on the chunk layout of ChunkedArray. + Status Init(const ExecBatch& batch, ValueDescr::Shape output_shape = ValueDescr::ARRAY, + int64_t max_chunksize = kDefaultMaxChunksize); + + /// \brief Compute the next span by updating the state of the + /// previous span object. You must keep passing in the previous + /// value for the results to be consistent. If you need to process + /// in parallel, make a copy of the in-use ExecSpan while it's being + /// used by another thread and pass it into Next. 
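To make the Next() contract documented here concrete, a minimal usage sketch follows (assuming access to the internal arrow/compute/exec_internal.h header; VisitSpans is an illustrative name):

#include "arrow/compute/exec.h"           // ExecBatch, ExecSpan
#include "arrow/compute/exec_internal.h"  // ExecSpanIterator (internal API)
#include "arrow/status.h"

namespace cp = arrow::compute;

arrow::Status VisitSpans(const cp::ExecBatch& batch) {
  cp::detail::ExecSpanIterator iterator;
  // Defaults: ValueDescr::ARRAY output shape, kDefaultMaxChunksize
  ARROW_RETURN_NOT_OK(iterator.Init(batch));
  cp::ExecSpan span;  // the same span object must be reused across Next() calls
  while (iterator.Next(&span)) {
    // span holds non-owning views; `batch` must outlive this loop
    // ... process span.length rows of span.values here ...
  }
  return arrow::Status::OK();
}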
This function + /// always populates at least one span. If you call this function + /// with a blank ExecSpan after the first iteration, it will not + /// work correctly (maybe we will change this later). Return false + /// if the iteration is exhausted + bool Next(ExecSpan* span); + + int64_t length() const { return length_; } + int64_t position() const { return position_; } + + private: + ExecSpanIterator(const std::vector& args, int64_t length, int64_t max_chunksize); + + int64_t GetNextChunkSpan(int64_t iteration_size, ExecSpan* span); + + bool initialized_ = false; + bool have_chunked_arrays_ = false; + const std::vector* args_; + std::vector chunk_indexes_; + std::vector value_positions_; + + // Keep track of the array offset in the "active" array (e.g. the + // array or the particular chunk of an array) in each slot, separate + // from the relative position within each chunk (which is in + // value_positions_) + std::vector value_offsets_; + int64_t position_; + int64_t length_; + int64_t max_chunksize_; +}; + // "Push" / listener API like IPC reader so that consumers can receive // processed chunks as soon as they're available. @@ -113,9 +163,10 @@ class ARROW_EXPORT KernelExecutor { /// for all scanned batches in a dataset filter. virtual Status Init(KernelContext*, KernelInitArgs) = 0; - /// XXX: Better configurability for listener - /// Not thread-safe - virtual Status Execute(const std::vector& args, ExecListener* listener) = 0; + // TODO(wesm): per ARROW-16819, adding ExecBatch variant so that a batch + // length can be passed in for scalar functions; will have to return and + // clean a bunch of things up + virtual Status Execute(const ExecBatch& batch, ExecListener* listener) = 0; virtual Datum WrapResults(const std::vector& args, const std::vector& outputs) = 0; @@ -128,6 +179,8 @@ class ARROW_EXPORT KernelExecutor { static std::unique_ptr MakeScalarAggregate(); }; +Result InferBatchLength(const std::vector& values); + /// \brief Populate validity bitmap with the intersection of the nullity of the /// arguments. 
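Alongside the new ExecBatch-based entry points (KernelExecutor::Execute and the CallFunction overloads added in exec.h), here is a minimal caller-side sketch; the "add" call and the helper name AddColumns are illustrative, and the explicit batch length is checked against the inferred argument lengths per ARROW-16819:

#include "arrow/api.h"
#include "arrow/compute/api.h"

namespace cp = arrow::compute;

arrow::Result<arrow::Datum> AddColumns(const std::shared_ptr<arrow::Array>& x,
                                       const std::shared_ptr<arrow::Array>& y) {
  // ExecBatch carries an explicit length in addition to the argument Datums
  cp::ExecBatch batch({arrow::Datum(x), arrow::Datum(y)}, x->length());
  return cp::CallFunction("add", batch);
}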
If a preallocated bitmap is not provided, then one will be /// allocated if needed (in some cases a bitmap can be zero-copied from the @@ -138,7 +191,10 @@ class ARROW_EXPORT KernelExecutor { /// \param[in] batch the data batch /// \param[in] out the output ArrayData, must not be null ARROW_EXPORT -Status PropagateNulls(KernelContext* ctx, const ExecBatch& batch, ArrayData* out); +Status PropagateNulls(KernelContext* ctx, const ExecSpan& batch, ArrayData* out); + +ARROW_EXPORT +void PropagateNullsSpans(const ExecSpan& batch, ArraySpan* out); } // namespace detail } // namespace compute diff --git a/cpp/src/arrow/compute/exec_test.cc b/cpp/src/arrow/compute/exec_test.cc index b00bcb319c3..bd344fb2297 100644 --- a/cpp/src/arrow/compute/exec_test.cc +++ b/cpp/src/arrow/compute/exec_test.cc @@ -90,13 +90,20 @@ TEST(SelectionVector, Basics) { ASSERT_EQ(3, sel_vector->indices()[1]); } +void AssertValidityZeroExtraBits(const uint8_t* data, int64_t length, int64_t offset) { + const int64_t bit_extent = ((offset + length + 7) / 8) * 8; + for (int64_t i = offset + length; i < bit_extent; ++i) { + EXPECT_FALSE(bit_util::GetBit(data, i)) << i; + } +} + +void AssertValidityZeroExtraBits(const ArraySpan& arr) { + return AssertValidityZeroExtraBits(arr.buffers[0].data, arr.length, arr.offset); +} + void AssertValidityZeroExtraBits(const ArrayData& arr) { const Buffer& buf = *arr.buffers[0]; - - const int64_t bit_extent = ((arr.offset + arr.length + 7) / 8) * 8; - for (int64_t i = arr.offset + arr.length; i < bit_extent; ++i) { - EXPECT_FALSE(bit_util::GetBit(buf.data(), i)) << i; - } + return AssertValidityZeroExtraBits(buf.data(), arr.length, arr.offset); } class TestComputeInternals : public ::testing::Test { @@ -137,6 +144,9 @@ class TestComputeInternals : public ::testing::Test { std::unique_ptr rng_; }; +// ---------------------------------------------------------------------- +// Test PropagateNulls + class TestPropagateNulls : public TestComputeInternals {}; TEST_F(TestPropagateNulls, UnknownNullCountWithNullsZeroCopies) { @@ -149,7 +159,7 @@ TEST_F(TestPropagateNulls, UnknownNullCountWithNullsZeroCopies) { ArrayData input(boolean(), length, {nulls, nullptr}, kUnknownNullCount); ExecBatch batch({input}, length); - ASSERT_OK(PropagateNulls(ctx_.get(), batch, &output)); + ASSERT_OK(PropagateNulls(ctx_.get(), ExecSpan(batch), &output)); ASSERT_EQ(nulls.get(), output.buffers[0].get()); ASSERT_EQ(kUnknownNullCount, output.null_count); ASSERT_EQ(9, output.GetNullCount()); @@ -164,7 +174,7 @@ TEST_F(TestPropagateNulls, UnknownNullCountWithoutNulls) { ArrayData input(boolean(), length, {nulls, nullptr}, kUnknownNullCount); ExecBatch batch({input}, length); - ASSERT_OK(PropagateNulls(ctx_.get(), batch, &output)); + ASSERT_OK(PropagateNulls(ctx_.get(), ExecSpan(batch), &output)); EXPECT_EQ(-1, output.null_count); EXPECT_EQ(nulls.get(), output.buffers[0].get()); } @@ -185,7 +195,7 @@ TEST_F(TestPropagateNulls, SetAllNulls) { ArrayData output(boolean(), length, buffers); ExecBatch batch(values, length); - ASSERT_OK(PropagateNulls(ctx_.get(), batch, &output)); + ASSERT_OK(PropagateNulls(ctx_.get(), ExecSpan(batch), &output)); if (preallocate) { // Ensure that buffer object the same when we pass in preallocated memory @@ -228,7 +238,7 @@ TEST_F(TestPropagateNulls, SetAllNulls) { // null-scalar earlier in the batch ArrayData output(boolean(), length, {nullptr, nullptr}); ExecBatch batch({MakeNullScalar(boolean()), arr_all_nulls}, length); - ASSERT_OK(PropagateNulls(ctx_.get(), batch, &output)); + 
ASSERT_OK(PropagateNulls(ctx_.get(), ExecSpan(batch), &output)); ASSERT_EQ(arr_all_nulls->data()->buffers[0].get(), output.buffers[0].get()); } } @@ -260,7 +270,7 @@ TEST_F(TestPropagateNulls, SingleValueWithNulls) { ASSERT_EQ(0, output.offset); } - ASSERT_OK(PropagateNulls(ctx_.get(), batch, &output)); + ASSERT_OK(PropagateNulls(ctx_.get(), ExecSpan(batch), &output)); if (!preallocate) { const Buffer* parent_buf = arr->data()->buffers[0].get(); @@ -308,14 +318,14 @@ TEST_F(TestPropagateNulls, ZeroCopyWhenZeroNullsOnOneInput) { ArrayData output(boolean(), length, {nullptr, nullptr}); ExecBatch batch({some_nulls, no_nulls}, length); - ASSERT_OK(PropagateNulls(ctx_.get(), batch, &output)); + ASSERT_OK(PropagateNulls(ctx_.get(), ExecSpan(batch), &output)); ASSERT_EQ(nulls.get(), output.buffers[0].get()); ASSERT_EQ(9, output.null_count); // Flip order of args output = ArrayData(boolean(), length, {nullptr, nullptr}); batch.values = {no_nulls, no_nulls, some_nulls}; - ASSERT_OK(PropagateNulls(ctx_.get(), batch, &output)); + ASSERT_OK(PropagateNulls(ctx_.get(), ExecSpan(batch), &output)); ASSERT_EQ(nulls.get(), output.buffers[0].get()); ASSERT_EQ(9, output.null_count); @@ -324,7 +334,7 @@ TEST_F(TestPropagateNulls, ZeroCopyWhenZeroNullsOnOneInput) { auto preallocated_mem = std::make_shared(bitmap_data, 2); output.null_count = kUnknownNullCount; output.buffers[0] = preallocated_mem; - ASSERT_OK(PropagateNulls(ctx_.get(), batch, &output)); + ASSERT_OK(PropagateNulls(ctx_.get(), ExecSpan(batch), &output)); ASSERT_EQ(preallocated_mem.get(), output.buffers[0].get()); ASSERT_EQ(9, output.null_count); @@ -366,7 +376,7 @@ TEST_F(TestPropagateNulls, IntersectsNulls) { ArrayData output(boolean(), length, {nulls, nullptr}); output.offset = output_offset; - ASSERT_OK(PropagateNulls(ctx_.get(), batch, &output)); + ASSERT_OK(PropagateNulls(ctx_.get(), ExecSpan(batch), &output)); // Preallocated memory used if (preallocate) { @@ -405,10 +415,249 @@ TEST_F(TestPropagateNulls, NullOutputTypeNoop) { ExecBatch batch({rng_->Boolean(100, 0.5, 0.5)}, length); ArrayData output(null(), length, {nullptr}); - ASSERT_OK(PropagateNulls(ctx_.get(), batch, &output)); + ASSERT_OK(PropagateNulls(ctx_.get(), ExecSpan(batch), &output)); ASSERT_EQ(nullptr, output.buffers[0]); } +// ---------------------------------------------------------------------- +// Test PropagateNullsSpans (new span-based implementation). 
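For reference alongside these tests, a minimal sketch of the intended call pattern: unlike the ArrayData-based PropagateNulls, PropagateNullsSpans always writes into a caller-provided validity bitmap (no zero-copy), filling it with the intersection of the input validities. The helper name IntersectValidity and the out_type parameter are illustrative.

#include "arrow/array/data.h"             // ArraySpan
#include "arrow/buffer.h"                 // AllocateBuffer
#include "arrow/compute/exec.h"           // ExecBatch, ExecSpan
#include "arrow/compute/exec_internal.h"  // PropagateNullsSpans (internal API)
#include "arrow/result.h"
#include "arrow/util/bit_util.h"

namespace cp = arrow::compute;

arrow::Result<std::shared_ptr<arrow::Buffer>> IntersectValidity(
    const cp::ExecBatch& batch, const std::shared_ptr<arrow::DataType>& out_type) {
  // Caller-provided preallocation, mirroring the tests in this section
  ARROW_ASSIGN_OR_RAISE(
      std::unique_ptr<arrow::Buffer> bitmap,
      arrow::AllocateBuffer(arrow::bit_util::BytesForBits(batch.length)));
  std::shared_ptr<arrow::Buffer> validity = std::move(bitmap);
  arrow::ArraySpan output(out_type.get(), batch.length);
  output.SetBuffer(0, validity);
  cp::detail::PropagateNullsSpans(cp::ExecSpan(batch), &output);
  return validity;
}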
Some of +// the tests above had to be rewritten because the span-based +// implementation does not deal with zero-copy optimizations right now + +class TestPropagateNullsSpans : public TestComputeInternals {}; + +TEST_F(TestPropagateNullsSpans, UnknownNullCountWithNullsZeroCopies) { + const int64_t length = 16; + + const uint8_t validity_bitmap[8] = {254, 0, 0, 0, 0, 0, 0, 0}; + auto nulls = std::make_shared(validity_bitmap, 8); + auto ty = boolean(); + ArrayData input(ty, length, {nulls, nullptr}, kUnknownNullCount); + + uint8_t validity_bitmap2[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + auto nulls2 = std::make_shared(validity_bitmap2, 8); + ArraySpan output(ty.get(), length); + output.buffers[0].data = validity_bitmap2; + output.buffers[0].size = 0; + + ExecSpan span(ExecBatch({input}, length)); + PropagateNullsSpans(span, &output); + ASSERT_EQ(kUnknownNullCount, output.null_count); + ASSERT_EQ(9, output.GetNullCount()); +} + +TEST_F(TestPropagateNullsSpans, UnknownNullCountWithoutNulls) { + const int64_t length = 16; + constexpr uint8_t validity_bitmap[8] = {255, 255, 0, 0, 0, 0, 0, 0}; + auto nulls = std::make_shared(validity_bitmap, 8); + + auto ty = boolean(); + ArrayData input(ty, length, {nulls, nullptr}, kUnknownNullCount); + + uint8_t validity_bitmap2[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + auto nulls2 = std::make_shared(validity_bitmap2, 8); + ArraySpan output(ty.get(), length); + output.buffers[0].data = validity_bitmap2; + output.buffers[0].size = 0; + + ExecSpan span(ExecBatch({input}, length)); + PropagateNullsSpans(span, &output); + ASSERT_EQ(kUnknownNullCount, output.null_count); + ASSERT_EQ(0, output.GetNullCount()); +} + +TEST_F(TestPropagateNullsSpans, SetAllNulls) { + const int64_t length = 16; + + auto CheckSetAllNull = [&](std::vector values) { + // Make fresh bitmap with all 1's + uint8_t bitmap_data[2] = {255, 255}; + auto buf = std::make_shared(bitmap_data, 2); + + auto ty = boolean(); + ArraySpan output(ty.get(), length); + output.SetBuffer(0, buf); + + ExecSpan span(ExecBatch(values, length)); + PropagateNullsSpans(span, &output); + + uint8_t expected[2] = {0, 0}; + ASSERT_EQ(0, std::memcmp(output.buffers[0].data, expected, output.buffers[0].size)); + }; + + // There is a null scalar + std::shared_ptr i32_val = std::make_shared(3); + std::vector vals = {i32_val, MakeNullScalar(boolean())}; + CheckSetAllNull(vals); + + const double true_prob = 0.5; + vals[0] = rng_->Boolean(length, true_prob); + CheckSetAllNull(vals); + + auto arr_all_nulls = rng_->Boolean(length, true_prob, /*null_probability=*/1); + + // One value is all null + vals = {rng_->Boolean(length, true_prob, /*null_probability=*/0.5), arr_all_nulls}; + CheckSetAllNull(vals); + + // A value is NullType + std::shared_ptr null_arr = std::make_shared(length); + vals = {rng_->Boolean(length, true_prob), null_arr}; + CheckSetAllNull(vals); +} + +TEST_F(TestPropagateNullsSpans, SingleValueWithNulls) { + // Input offset is non-zero (0 mod 8 and nonzero mod 8 cases) + const int64_t length = 100; + auto arr = rng_->Boolean(length, 0.5, /*null_probability=*/0.5); + + auto CheckSliced = [&](int64_t offset, int64_t out_offset = 0) { + // Unaligned bitmap, zero copy not possible + auto sliced = arr->Slice(offset); + std::vector vals = {sliced}; + + auto ty = boolean(); + ArraySpan output(ty.get(), vals[0].length()); + output.offset = out_offset; + + std::shared_ptr preallocated_bitmap; + ASSERT_OK_AND_ASSIGN( + preallocated_bitmap, + AllocateBuffer(bit_util::BytesForBits(sliced->length() + out_offset))); + 
std::memset(preallocated_bitmap->mutable_data(), 0, preallocated_bitmap->size()); + output.SetBuffer(0, preallocated_bitmap); + + ExecBatch batch(vals, vals[0].length()); + PropagateNullsSpans(ExecSpan(batch), &output); + ASSERT_EQ(arr->Slice(offset)->null_count(), output.GetNullCount()); + ASSERT_TRUE(BitmapEquals(output.buffers[0].data, output.offset, + sliced->null_bitmap_data(), sliced->offset(), + output.length)); + AssertValidityZeroExtraBits(output); + }; + + CheckSliced(8); + CheckSliced(7); + CheckSliced(8, /*offset=*/4); + CheckSliced(7, /*offset=*/4); +} + +TEST_F(TestPropagateNullsSpans, CasesThatUsedToBeZeroCopy) { + // ARROW-16576: testing behaviors that used to be zero copy but are + // not anymore + const int64_t length = 16; + + auto ty = boolean(); + constexpr uint8_t validity_bitmap[8] = {254, 0, 0, 0, 0, 0, 0, 0}; + auto nulls = std::make_shared(validity_bitmap, 8); + + ArraySpan some_nulls(ty.get(), length); + some_nulls.SetBuffer(0, nulls); + some_nulls.null_count = 9; + + ArraySpan no_nulls(ty.get(), length); + no_nulls.null_count = 0; + + { + uint8_t bitmap_data[2] = {0, 0}; + auto preallocated_mem = std::make_shared(bitmap_data, 2); + + ArraySpan output(ty.get(), length); + output.SetBuffer(0, preallocated_mem); + PropagateNullsSpans(ExecSpan({some_nulls, no_nulls}, length), &output); + ASSERT_EQ( + 0, std::memcmp(output.buffers[0].data, validity_bitmap, output.buffers[0].size)); + ASSERT_EQ(output.buffers[0].owner, &preallocated_mem); + ASSERT_EQ(9, output.GetNullCount()); + } + + // Flip order of args + { + uint8_t bitmap_data[2] = {0, 0}; + auto preallocated_mem = std::make_shared(bitmap_data, 2); + + ArraySpan output(ty.get(), length); + output.SetBuffer(0, preallocated_mem); + PropagateNullsSpans(ExecSpan({no_nulls, no_nulls, some_nulls}, length), &output); + ASSERT_EQ( + 0, std::memcmp(output.buffers[0].data, validity_bitmap, output.buffers[0].size)); + ASSERT_EQ(output.buffers[0].owner, &preallocated_mem); + ASSERT_EQ(9, output.GetNullCount()); + } +} + +TEST_F(TestPropagateNullsSpans, IntersectsNulls) { + const int64_t length = 16; + + // 0b01111111 0b11001111 + constexpr uint8_t bitmap1[8] = {127, 207, 0, 0, 0, 0, 0, 0}; + auto buffer1 = std::make_shared(bitmap1, 8); + + // 0b11111110 0b01111111 + constexpr uint8_t bitmap2[8] = {254, 127, 0, 0, 0, 0, 0, 0}; + auto buffer2 = std::make_shared(bitmap2, 8); + + // 0b11101111 0b11111110 + constexpr uint8_t bitmap3[8] = {239, 254, 0, 0, 0, 0, 0, 0}; + auto buffer3 = std::make_shared(bitmap3, 8); + + auto ty = boolean(); + + ArraySpan arr1(ty.get(), length); + arr1.SetBuffer(0, buffer1); + + ArraySpan arr2(ty.get(), length); + arr2.SetBuffer(0, buffer2); + + ArraySpan arr3(ty.get(), length); + arr3.SetBuffer(0, buffer3); + + auto CheckCase = [&](std::vector values, int64_t ex_null_count, + const uint8_t* ex_bitmap, int64_t output_offset = 0) { + ExecSpan batch(values, length); + + std::shared_ptr nulls; + // Make the buffer one byte bigger so we can have non-zero offsets + ASSERT_OK_AND_ASSIGN(nulls, AllocateBuffer(3)); + std::memset(nulls->mutable_data(), 0, nulls->size()); + + ArraySpan output(ty.get(), length); + output.SetBuffer(0, nulls); + output.offset = output_offset; + + PropagateNullsSpans(batch, &output); + ASSERT_EQ(&nulls, output.buffers[0].owner); + EXPECT_EQ(kUnknownNullCount, output.null_count); + EXPECT_EQ(ex_null_count, output.GetNullCount()); + ASSERT_TRUE(BitmapEquals(output.buffers[0].data, output_offset, ex_bitmap, + /*ex_offset=*/0, length)); + + // Now check that the rest of the bits in 
out_buffer are still 0 + AssertValidityZeroExtraBits(output); + }; + + // 0b01101110 0b01001110 + uint8_t expected1[2] = {110, 78}; + CheckCase({arr1, arr2, arr3}, 7, expected1); + CheckCase({arr1, arr2, arr3}, 7, expected1, /*output_offset=*/4); + + // 0b01111110 0b01001111 + uint8_t expected2[2] = {126, 79}; + CheckCase({arr1, arr2}, 5, expected2, /*output_offset=*/4); +} + +TEST_F(TestPropagateNullsSpans, NullOutputTypeNoop) { + // Ensure we leave the buffers alone when the output type is null() + // TODO(wesm): is this test useful? Can probably delete + const int64_t length = 100; + ExecBatch batch({rng_->Boolean(100, 0.5, 0.5)}, length); + + auto ty = null(); + ArraySpan result(ty.get(), length); + PropagateNullsSpans(ExecSpan(batch), &result); + ASSERT_EQ(nullptr, result.buffers[0].data); +} + // ---------------------------------------------------------------------- // ExecBatchIterator @@ -540,59 +789,202 @@ TEST_F(TestExecBatchIterator, ZeroLengthInputs) { CheckArgs(args); } +// ---------------------------------------------------------------------- +// ExecSpanIterator tests + +class TestExecSpanIterator : public TestComputeInternals { + public: + void SetupIterator(const ExecBatch& batch, + ValueDescr::Shape output_shape = ValueDescr::ARRAY, + int64_t max_chunksize = kDefaultMaxChunksize) { + ASSERT_OK(iterator_.Init(batch, output_shape, max_chunksize)); + } + void CheckIteration(const ExecBatch& input, int chunksize, + const std::vector& ex_batch_sizes) { + SetupIterator(input, ValueDescr::ARRAY, chunksize); + ExecSpan batch; + int64_t position = 0; + for (size_t i = 0; i < ex_batch_sizes.size(); ++i) { + ASSERT_EQ(position, iterator_.position()); + ASSERT_TRUE(iterator_.Next(&batch)); + ASSERT_EQ(ex_batch_sizes[i], batch.length); + + for (size_t j = 0; j < input.values.size(); ++j) { + switch (input[j].kind()) { + case Datum::SCALAR: + ASSERT_TRUE(input[j].scalar()->Equals(*batch[j].scalar)); + break; + case Datum::ARRAY: + AssertArraysEqual(*input[j].make_array()->Slice(position, batch.length), + *batch[j].array.ToArray()); + break; + case Datum::CHUNKED_ARRAY: { + const ChunkedArray& carr = *input[j].chunked_array(); + if (batch.length == 0) { + ASSERT_EQ(0, carr.length()); + } else { + auto arg_slice = carr.Slice(position, batch.length); + // The sliced ChunkedArrays should only ever be 1 chunk + ASSERT_EQ(1, arg_slice->num_chunks()); + AssertArraysEqual(*arg_slice->chunk(0), *batch[j].array.ToArray()); + } + } break; + default: + break; + } + } + position += ex_batch_sizes[i]; + } + // Ensure that the iterator is exhausted + ASSERT_FALSE(iterator_.Next(&batch)); + + ASSERT_EQ(iterator_.length(), iterator_.position()); + } + + protected: + ExecSpanIterator iterator_; +}; + +TEST_F(TestExecSpanIterator, Basics) { + const int64_t length = 100; + + ExecBatch input; + input.length = 100; + + // Simple case with a single chunk + input.values = {Datum(GetInt32Array(length)), Datum(GetFloat64Array(length)), + Datum(std::make_shared(3))}; + SetupIterator(input); + + ExecSpan batch; + ASSERT_TRUE(iterator_.Next(&batch)); + ASSERT_EQ(3, batch.values.size()); + ASSERT_EQ(3, batch.num_values()); + ASSERT_EQ(length, batch.length); + + AssertArraysEqual(*input[0].make_array(), *batch[0].array.ToArray()); + AssertArraysEqual(*input[1].make_array(), *batch[1].array.ToArray()); + ASSERT_TRUE(input[2].scalar()->Equals(*batch[2].scalar)); + + ASSERT_EQ(length, iterator_.position()); + ASSERT_FALSE(iterator_.Next(&batch)); + + // Split into chunks of size 16 + CheckIteration(input, 
/*chunksize=*/16, {16, 16, 16, 16, 16, 16, 4}); +} + +TEST_F(TestExecSpanIterator, InputValidation) { + ExecSpanIterator iterator; + + ExecBatch batch({Datum(GetInt32Array(10)), Datum(GetInt32Array(9))}, 10); + ASSERT_RAISES(Invalid, iterator.Init(batch)); + + batch.values = {Datum(GetInt32Array(9)), Datum(GetInt32Array(10))}; + ASSERT_RAISES(Invalid, iterator.Init(batch)); + + batch.values = {Datum(GetInt32Array(10))}; + ASSERT_OK(iterator.Init(batch)); +} + +TEST_F(TestExecSpanIterator, ChunkedArrays) { + ExecBatch batch({Datum(GetInt32Chunked({0, 20, 10})), Datum(GetInt32Chunked({15, 15})), + Datum(GetInt32Array(30)), Datum(std::make_shared(5)), + Datum(MakeNullScalar(boolean()))}, + 30); + + CheckIteration(batch, /*chunksize=*/10, {10, 5, 5, 10}); + CheckIteration(batch, /*chunksize=*/20, {15, 5, 10}); + CheckIteration(batch, /*chunksize=*/30, {15, 5, 10}); +} + +TEST_F(TestExecSpanIterator, ZeroLengthInputs) { + auto carr = std::shared_ptr(new ChunkedArray({}, int32())); + + auto CheckArgs = [&](const ExecBatch& batch) { + ExecSpanIterator iterator; + ASSERT_OK(iterator.Init(batch, ValueDescr::ARRAY)); + ExecSpan iter_span; + ASSERT_FALSE(iterator.Next(&iter_span)); + }; + + ExecBatch input; + input.length = 0; + + // Zero-length ChunkedArray with zero chunks + input.values = {Datum(carr)}; + CheckArgs(input); + + // Zero-length array + input.values = {Datum(GetInt32Array(0))}; + CheckArgs(input); + + // ChunkedArray with single empty chunk + input.values = {Datum(GetInt32Chunked({0}))}; + CheckArgs(input); +} + // ---------------------------------------------------------------------- // Scalar function execution -Status ExecCopy(KernelContext*, const ExecBatch& batch, Datum* out) { +Status ExecCopyArrayData(KernelContext*, const ExecSpan& batch, ExecResult* out) { DCHECK_EQ(1, batch.num_values()); - const auto& type = checked_cast(*batch[0].type()); - int value_size = type.bit_width() / 8; + int value_size = batch[0].type()->byte_width(); - const ArrayData& arg0 = *batch[0].array(); - ArrayData* out_arr = out->mutable_array(); + const ArraySpan& arg0 = batch[0].array; + ArrayData* out_arr = out->array_data().get(); uint8_t* dst = out_arr->buffers[1]->mutable_data() + out_arr->offset * value_size; - const uint8_t* src = arg0.buffers[1]->data() + arg0.offset * value_size; + const uint8_t* src = arg0.buffers[1].data + arg0.offset * value_size; + std::memcpy(dst, src, batch.length * value_size); + return Status::OK(); +} + +Status ExecCopyArraySpan(KernelContext*, const ExecSpan& batch, ExecResult* out) { + DCHECK_EQ(1, batch.num_values()); + int value_size = batch[0].type()->byte_width(); + const ArraySpan& arg0 = batch[0].array; + ArraySpan* out_arr = out->array_span(); + uint8_t* dst = out_arr->buffers[1].data + out_arr->offset * value_size; + const uint8_t* src = arg0.buffers[1].data + arg0.offset * value_size; std::memcpy(dst, src, batch.length * value_size); return Status::OK(); } -Status ExecComputedBitmap(KernelContext* ctx, const ExecBatch& batch, Datum* out) { +Status ExecComputedBitmap(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { // Propagate nulls not used. 
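The copy kernels in this section only read array inputs; when a span-based kernel can also receive Scalar arguments, it can branch on ExecValue::is_scalar() and use scalar_as(), as in this illustrative sketch (the name AddInt32Exec and the assumption of a preallocated int32 output span are not part of the patch):

#include "arrow/array/data.h"
#include "arrow/compute/exec.h"
#include "arrow/compute/kernel.h"
#include "arrow/scalar.h"
#include "arrow/status.h"

namespace cp = arrow::compute;

arrow::Status AddInt32Exec(cp::KernelContext*, const cp::ExecSpan& batch,
                           cp::ExecResult* out) {
  const int32_t* left = batch[0].array.GetValues<int32_t>(1);  // array argument
  int32_t* out_values = out->array_span()->GetValues<int32_t>(1);
  if (batch[1].is_scalar()) {
    // Scalar arguments arrive as a non-owning const Scalar*
    const int32_t addend = batch[1].scalar_as<arrow::Int32Scalar>().value;
    for (int64_t i = 0; i < batch.length; ++i) {
      out_values[i] = left[i] + addend;
    }
  } else {
    const int32_t* right = batch[1].array.GetValues<int32_t>(1);
    for (int64_t i = 0; i < batch.length; ++i) {
      out_values[i] = left[i] + right[i];
    }
  }
  return arrow::Status::OK();
}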
Check that the out bitmap isn't the same already // as the input bitmap - const ArrayData& arg0 = *batch[0].array(); - ArrayData* out_arr = out->mutable_array(); - - if (CountSetBits(arg0.buffers[0]->data(), arg0.offset, batch.length) > 0) { + const ArraySpan& arg0 = batch[0].array; + ArraySpan* out_arr = out->array_span(); + if (CountSetBits(arg0.buffers[0].data, arg0.offset, batch.length) > 0) { // Check that the bitmap has not been already copied over - DCHECK(!BitmapEquals(arg0.buffers[0]->data(), arg0.offset, - out_arr->buffers[0]->data(), out_arr->offset, batch.length)); + DCHECK(!BitmapEquals(arg0.buffers[0].data, arg0.offset, out_arr->buffers[0].data, + out_arr->offset, batch.length)); } - CopyBitmap(arg0.buffers[0]->data(), arg0.offset, batch.length, - out_arr->buffers[0]->mutable_data(), out_arr->offset); - return ExecCopy(ctx, batch, out); + CopyBitmap(arg0.buffers[0].data, arg0.offset, batch.length, out_arr->buffers[0].data, + out_arr->offset); + return ExecCopyArraySpan(ctx, batch, out); } -Status ExecNoPreallocatedData(KernelContext* ctx, const ExecBatch& batch, Datum* out) { +Status ExecNoPreallocatedData(KernelContext* ctx, const ExecSpan& batch, + ExecResult* out) { // Validity preallocated, but not the data - ArrayData* out_arr = out->mutable_array(); + ArrayData* out_arr = out->array_data().get(); DCHECK_EQ(0, out_arr->offset); - const auto& type = checked_cast(*batch[0].type()); - int value_size = type.bit_width() / 8; + int value_size = batch[0].type()->byte_width(); Status s = (ctx->Allocate(out_arr->length * value_size).Value(&out_arr->buffers[1])); DCHECK_OK(s); - return ExecCopy(ctx, batch, out); + return ExecCopyArrayData(ctx, batch, out); } -Status ExecNoPreallocatedAnything(KernelContext* ctx, const ExecBatch& batch, - Datum* out) { +Status ExecNoPreallocatedAnything(KernelContext* ctx, const ExecSpan& batch, + ExecResult* out) { // Neither validity nor data preallocated - ArrayData* out_arr = out->mutable_array(); + ArrayData* out_arr = out->array_data().get(); DCHECK_EQ(0, out_arr->offset); Status s = (ctx->AllocateBitmap(out_arr->length).Value(&out_arr->buffers[0])); DCHECK_OK(s); - const ArrayData& arg0 = *batch[0].array(); - CopyBitmap(arg0.buffers[0]->data(), arg0.offset, batch.length, + const ArraySpan& arg0 = batch[0].array; + CopyBitmap(arg0.buffers[0].data, arg0.offset, batch.length, out_arr->buffers[0]->mutable_data(), /*offset=*/0); // Reuse the kernel that allocates the data @@ -638,22 +1030,23 @@ Result> InitStateful(KernelContext*, return std::unique_ptr(new ExampleState{func_options->value}); } -Status ExecStateful(KernelContext* ctx, const ExecBatch& batch, Datum* out) { +Status ExecStateful(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { // We take the value from the state and multiply the data in batch[0] with it ExampleState* state = static_cast(ctx->state()); int32_t multiplier = checked_cast(*state->value).value; - const ArrayData& arg0 = *batch[0].array(); - ArrayData* out_arr = out->mutable_array(); + const ArraySpan& arg0 = batch[0].array; + ArraySpan* out_arr = out->array_span(); const int32_t* arg0_data = arg0.GetValues(1); - int32_t* dst = out_arr->GetMutableValues(1); + int32_t* dst = out_arr->GetValues(1); for (int64_t i = 0; i < arg0.length; ++i) { dst[i] = arg0_data[i] * multiplier; } return Status::OK(); } -Status ExecAddInt32(KernelContext* ctx, const ExecBatch& batch, Datum* out) { +// TODO: remove this / refactor it in ARROW-16577 +Status ExecAddInt32(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { 
const Int32Scalar& arg0 = batch[0].scalar_as(); const Int32Scalar& arg1 = batch[1].scalar_as(); out->value = std::make_shared(arg0.value + arg1.value); @@ -685,9 +1078,10 @@ class TestCallScalarFunction : public TestComputeInternals { /*doc=*/FunctionDoc::Empty()); // Add a few kernels. Our implementation only accepts arrays - ASSERT_OK(func->AddKernel({InputType::Array(uint8())}, uint8(), ExecCopy)); - ASSERT_OK(func->AddKernel({InputType::Array(int32())}, int32(), ExecCopy)); - ASSERT_OK(func->AddKernel({InputType::Array(float64())}, float64(), ExecCopy)); + ASSERT_OK(func->AddKernel({InputType::Array(uint8())}, uint8(), ExecCopyArraySpan)); + ASSERT_OK(func->AddKernel({InputType::Array(int32())}, int32(), ExecCopyArraySpan)); + ASSERT_OK( + func->AddKernel({InputType::Array(float64())}, float64(), ExecCopyArraySpan)); ASSERT_OK(registry->AddFunction(func)); // A version which doesn't want the executor to call PropagateNulls @@ -767,7 +1161,7 @@ TEST_F(TestCallScalarFunction, ArgumentValidation) { TEST_F(TestCallScalarFunction, PreallocationCases) { double null_prob = 0.2; - auto arr = GetUInt8Array(1000, null_prob); + auto arr = GetUInt8Array(100, null_prob); auto CheckFunction = [&](std::string func_name) { ResetContexts(); @@ -792,7 +1186,7 @@ TEST_F(TestCallScalarFunction, PreallocationCases) { { // Chunksize not multiple of 8 std::vector args = {Datum(arr)}; - exec_ctx_->set_exec_chunksize(111); + exec_ctx_->set_exec_chunksize(11); ASSERT_OK_AND_ASSIGN(Datum result, CallFunction(func_name, args, exec_ctx_.get())); AssertArraysEqual(*arr, *result.make_array()); } @@ -800,7 +1194,7 @@ TEST_F(TestCallScalarFunction, PreallocationCases) { // Input is chunked, output has one big chunk { auto carr = std::shared_ptr( - new ChunkedArray({arr->Slice(0, 100), arr->Slice(100)})); + new ChunkedArray({arr->Slice(0, 10), arr->Slice(10)})); std::vector args = {Datum(carr)}; ASSERT_OK_AND_ASSIGN(Datum result, CallFunction(func_name, args, exec_ctx_.get())); std::shared_ptr actual = result.chunked_array(); @@ -812,14 +1206,14 @@ TEST_F(TestCallScalarFunction, PreallocationCases) { { std::vector args = {Datum(arr)}; exec_ctx_->set_preallocate_contiguous(false); - exec_ctx_->set_exec_chunksize(400); + exec_ctx_->set_exec_chunksize(40); ASSERT_OK_AND_ASSIGN(Datum result, CallFunction(func_name, args, exec_ctx_.get())); ASSERT_EQ(Datum::CHUNKED_ARRAY, result.kind()); const ChunkedArray& carr = *result.chunked_array(); ASSERT_EQ(3, carr.num_chunks()); - AssertArraysEqual(*arr->Slice(0, 400), *carr.chunk(0)); - AssertArraysEqual(*arr->Slice(400, 400), *carr.chunk(1)); - AssertArraysEqual(*arr->Slice(800), *carr.chunk(2)); + AssertArraysEqual(*arr->Slice(0, 40), *carr.chunk(0)); + AssertArraysEqual(*arr->Slice(40, 40), *carr.chunk(1)); + AssertArraysEqual(*arr->Slice(80), *carr.chunk(2)); } }; diff --git a/cpp/src/arrow/compute/function.cc b/cpp/src/arrow/compute/function.cc index 2b3d4e6feb9..e4be8b5a356 100644 --- a/cpp/src/arrow/compute/function.cc +++ b/cpp/src/arrow/compute/function.cc @@ -205,13 +205,25 @@ Result Function::DispatchBest(std::vector* values) co Result Function::Execute(const std::vector& args, const FunctionOptions* options, ExecContext* ctx) const { + return ExecuteInternal(args, /*passed_length=*/-1, options, ctx); +} + +Result Function::Execute(const ExecBatch& batch, const FunctionOptions* options, + ExecContext* ctx) const { + return ExecuteInternal(batch.values, batch.length, options, ctx); +} + +Result Function::ExecuteInternal(const std::vector& args, + int64_t passed_length, + 
const FunctionOptions* options, + ExecContext* ctx) const { if (options == nullptr) { RETURN_NOT_OK(CheckOptions(*this, options)); options = default_options(); } if (ctx == nullptr) { ExecContext default_ctx; - return Execute(args, options, &default_ctx); + return ExecuteInternal(args, passed_length, options, &default_ctx); } util::tracing::Span span; @@ -228,17 +240,6 @@ Result Function::Execute(const std::vector& args, inputs[i] = args[i].descr(); } - ARROW_ASSIGN_OR_RAISE(auto kernel, DispatchBest(&inputs)); - ARROW_ASSIGN_OR_RAISE(auto implicitly_cast_args, Cast(args, inputs, ctx)); - - std::unique_ptr state; - - KernelContext kernel_ctx{ctx}; - if (kernel->init) { - ARROW_ASSIGN_OR_RAISE(state, kernel->init(&kernel_ctx, {kernel, inputs, options})); - kernel_ctx.SetState(state.get()); - } - std::unique_ptr executor; if (kind() == Function::SCALAR) { executor = detail::KernelExecutor::MakeScalar(); @@ -249,11 +250,39 @@ Result Function::Execute(const std::vector& args, } else { return Status::NotImplemented("Direct execution of HASH_AGGREGATE functions"); } + + ARROW_ASSIGN_OR_RAISE(auto kernel, DispatchBest(&inputs)); + ARROW_ASSIGN_OR_RAISE(std::vector args_with_casts, Cast(args, inputs, ctx)); + + std::unique_ptr state; + KernelContext kernel_ctx{ctx}; + if (kernel->init) { + ARROW_ASSIGN_OR_RAISE(state, kernel->init(&kernel_ctx, {kernel, inputs, options})); + kernel_ctx.SetState(state.get()); + } + RETURN_NOT_OK(executor->Init(&kernel_ctx, {kernel, inputs, options})); detail::DatumAccumulator listener; - RETURN_NOT_OK(executor->Execute(implicitly_cast_args, &listener)); - const auto out = executor->WrapResults(implicitly_cast_args, listener.values()); + + // Set length to 0 unless it's a scalar function (vector functions don't use + // it). + ExecBatch input(std::move(args_with_casts), 0); + if (kind() == Function::SCALAR) { + ARROW_ASSIGN_OR_RAISE(int64_t inferred_length, + detail::InferBatchLength(input.values)); + if (passed_length == -1) { + input.length = inferred_length; + } else { + // ARROW-16819: will clean up more later + if (input.num_values() > 0 && passed_length != inferred_length) { + return Status::Invalid("Passed batch length did not equal actual array lengths"); + } + input.length = passed_length; + } + } + RETURN_NOT_OK(executor->Execute(input, &listener)); + const auto out = executor->WrapResults(input.values, listener.values()); #ifndef NDEBUG DCHECK_OK(executor->CheckResultType(out, name_.c_str())); #endif @@ -336,7 +365,7 @@ Status ScalarFunction::AddKernel(ScalarKernel kernel) { } Status VectorFunction::AddKernel(std::vector in_types, OutputType out_type, - ArrayKernelExec exec, KernelInit init) { + ArrayKernelExecOld exec, KernelInit init) { RETURN_NOT_OK(CheckArity(in_types)); if (arity_.is_varargs && in_types.size() != 1) { @@ -388,5 +417,11 @@ Result MetaFunction::Execute(const std::vector& args, return ExecuteImpl(args, options, ctx); } +Result MetaFunction::Execute(const ExecBatch& batch, + const FunctionOptions* options, + ExecContext* ctx) const { + return Execute(batch.values, options, ctx); +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/function.h b/cpp/src/arrow/compute/function.h index face491690f..6e3c8374335 100644 --- a/cpp/src/arrow/compute/function.h +++ b/cpp/src/arrow/compute/function.h @@ -235,6 +235,9 @@ class ARROW_EXPORT Function { virtual Result Execute(const std::vector& args, const FunctionOptions* options, ExecContext* ctx) const; + virtual Result Execute(const ExecBatch& batch, const 
FunctionOptions* options, + ExecContext* ctx) const; + /// \brief Returns the default options for this function. /// /// Whatever option semantics a Function has, implementations must guarantee @@ -252,6 +255,9 @@ class ARROW_EXPORT Function { doc_(std::move(doc)), default_options_(default_options) {} + Result ExecuteInternal(const std::vector& args, int64_t passed_length, + const FunctionOptions* options, ExecContext* ctx) const; + Status CheckArity(const std::vector&) const; Status CheckArity(const std::vector&) const; @@ -338,7 +344,7 @@ class ARROW_EXPORT VectorFunction : public detail::FunctionImpl { /// state initialization, no data preallocation, and no preallocation of the /// validity bitmap. Status AddKernel(std::vector in_types, OutputType out_type, - ArrayKernelExec exec, KernelInit init = NULLPTR); + ArrayKernelExecOld exec, KernelInit init = NULLPTR); /// \brief Add a kernel (function implementation). Returns error if the /// kernel's signature does not match the function's arity. @@ -389,6 +395,9 @@ class ARROW_EXPORT MetaFunction : public Function { Result Execute(const std::vector& args, const FunctionOptions* options, ExecContext* ctx) const override; + Result Execute(const ExecBatch& batch, const FunctionOptions* options, + ExecContext* ctx) const override; + protected: virtual Result ExecuteImpl(const std::vector& args, const FunctionOptions* options, diff --git a/cpp/src/arrow/compute/function_benchmark.cc b/cpp/src/arrow/compute/function_benchmark.cc index a29a766be79..b508ad047fb 100644 --- a/cpp/src/arrow/compute/function_benchmark.cc +++ b/cpp/src/arrow/compute/function_benchmark.cc @@ -85,13 +85,15 @@ void BM_CastDispatchBaseline(benchmark::State& state) { .ValueOrDie(); kernel_context.SetState(cast_state.get()); + ExecSpan input; + input.length = 1; for (auto _ : state) { - Datum timestamp_scalar = MakeNullScalar(double_type); - for (Datum int_scalar : int_scalars) { - ABORT_NOT_OK( - exec(&kernel_context, {{std::move(int_scalar)}, 1}, ×tamp_scalar)); + ExecResult result; + result.value = MakeNullScalar(double_type); + for (const std::shared_ptr& int_scalar : int_scalars) { + input.values = {ExecValue(int_scalar.get())}; + ABORT_NOT_OK(exec(&kernel_context, input, &result)); } - benchmark::DoNotOptimize(timestamp_scalar); } state.SetItemsProcessed(state.iterations() * kScalarCount); @@ -162,11 +164,15 @@ void BM_ExecuteScalarKernelOnScalar(benchmark::State& state) { ExecContext exec_context; KernelContext kernel_context(&exec_context); + ExecSpan input; + input.length = 1; for (auto _ : state) { int64_t total = 0; - for (const auto& scalar : scalars) { - Datum result{MakeNullScalar(int64())}; - ABORT_NOT_OK(exec(&kernel_context, ExecBatch{{scalar}, /*length=*/1}, &result)); + for (const std::shared_ptr& scalar : scalars) { + ExecResult result; + result.value = MakeNullScalar(int64()); + input.values = {scalar.get()}; + ABORT_NOT_OK(exec(&kernel_context, input, &result)); total += result.scalar()->is_valid; } benchmark::DoNotOptimize(total); diff --git a/cpp/src/arrow/compute/function_test.cc b/cpp/src/arrow/compute/function_test.cc index 94e86c7bd57..ec5f3bc170c 100644 --- a/cpp/src/arrow/compute/function_test.cc +++ b/cpp/src/arrow/compute/function_test.cc @@ -210,12 +210,16 @@ TEST(VectorFunction, Basics) { ASSERT_EQ(Function::VECTOR, varargs_func.kind()); } -auto ExecNYI = [](KernelContext* ctx, const ExecBatch& args, Datum* out) { +auto ExecNYI = [](KernelContext* ctx, const ExecSpan& args, ExecResult* out) { return Status::NotImplemented("NYI"); }; -template 
-void CheckAddDispatch(FunctionType* func) { +auto ExecNYIOld = [](KernelContext* ctx, const ExecBatch& args, Datum* out) { + return Status::NotImplemented("NYI"); +}; + +template +void CheckAddDispatch(FunctionType* func, ExecType exec) { using KernelType = typename FunctionType::KernelType; ASSERT_EQ(0, func->num_kernels()); @@ -224,29 +228,29 @@ void CheckAddDispatch(FunctionType* func) { std::vector in_types1 = {int32(), int32()}; OutputType out_type1 = int32(); - ASSERT_OK(func->AddKernel(in_types1, out_type1, ExecNYI)); - ASSERT_OK(func->AddKernel({int32(), int8()}, int32(), ExecNYI)); + ASSERT_OK(func->AddKernel(in_types1, out_type1, exec)); + ASSERT_OK(func->AddKernel({int32(), int8()}, int32(), exec)); // Duplicate sig is okay - ASSERT_OK(func->AddKernel(in_types1, out_type1, ExecNYI)); + ASSERT_OK(func->AddKernel(in_types1, out_type1, exec)); // Add given a descr - KernelType descr({float64(), float64()}, float64(), ExecNYI); + KernelType descr({float64(), float64()}, float64(), exec); ASSERT_OK(func->AddKernel(descr)); ASSERT_EQ(4, func->num_kernels()); ASSERT_EQ(4, func->kernels().size()); // Try adding some invalid kernels - ASSERT_RAISES(Invalid, func->AddKernel({}, int32(), ExecNYI)); - ASSERT_RAISES(Invalid, func->AddKernel({int32()}, int32(), ExecNYI)); - ASSERT_RAISES(Invalid, func->AddKernel({int8(), int8(), int8()}, int32(), ExecNYI)); + ASSERT_RAISES(Invalid, func->AddKernel({}, int32(), exec)); + ASSERT_RAISES(Invalid, func->AddKernel({int32()}, int32(), exec)); + ASSERT_RAISES(Invalid, func->AddKernel({int8(), int8(), int8()}, int32(), exec)); // Add valid and invalid kernel using kernel struct directly - KernelType valid_kernel({boolean(), boolean()}, boolean(), ExecNYI); + KernelType valid_kernel({boolean(), boolean()}, boolean(), exec); ASSERT_OK(func->AddKernel(valid_kernel)); - KernelType invalid_kernel({boolean()}, boolean(), ExecNYI); + KernelType invalid_kernel({boolean()}, boolean(), exec); ASSERT_RAISES(Invalid, func->AddKernel(invalid_kernel)); ASSERT_OK_AND_ASSIGN(const Kernel* kernel, func->DispatchExact({int32(), int32()})); @@ -265,8 +269,10 @@ TEST(ScalarVectorFunction, DispatchExact) { ScalarFunction func1("scalar_test", Arity::Binary(), /*doc=*/FunctionDoc::Empty()); VectorFunction func2("vector_test", Arity::Binary(), /*doc=*/FunctionDoc::Empty()); - CheckAddDispatch(&func1); - CheckAddDispatch(&func2); + CheckAddDispatch(&func1, ExecNYI); + + // ARROW-16576: will migrate later to new span-based kernel exec API + CheckAddDispatch(&func2, ExecNYIOld); } TEST(ArrayFunction, VarArgs) { diff --git a/cpp/src/arrow/compute/kernel.h b/cpp/src/arrow/compute/kernel.h index 6b1f23e78df..e115c5194bc 100644 --- a/cpp/src/arrow/compute/kernel.h +++ b/cpp/src/arrow/compute/kernel.h @@ -83,16 +83,6 @@ class ARROW_EXPORT KernelContext { KernelState* state_ = NULLPTR; }; -/// \brief The standard kernel execution API that must be implemented for -/// SCALAR and VECTOR kernel types. This includes both stateless and stateful -/// kernels. Kernels depending on some execution state access that state via -/// subclasses of KernelState set on the KernelContext object. May be used for -/// SCALAR and VECTOR kernel kinds. Implementations should endeavor to write -/// into pre-allocated memory if they are able, though for some kernels -/// (e.g. in cases when a builder like StringBuilder) must be employed this may -/// not be possible. 
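To show how the renamed kernel structs below are wired up with the span-based exec signature, a minimal registration sketch follows (the function name "my_negate", the lambda body, and the registry plumbing are illustrative, modeled on the test setup in exec_test.cc):

#include "arrow/api.h"
#include "arrow/compute/exec.h"
#include "arrow/compute/function.h"
#include "arrow/compute/kernel.h"
#include "arrow/compute/registry.h"

namespace cp = arrow::compute;

arrow::Status RegisterMyNegate(cp::FunctionRegistry* registry) {
  auto func = std::make_shared<cp::ScalarFunction>("my_negate", cp::Arity::Unary(),
                                                   cp::FunctionDoc::Empty());
  // Span-based exec: reads the input ArraySpan, writes the preallocated output
  auto exec = [](cp::KernelContext*, const cp::ExecSpan& batch,
                 cp::ExecResult* out) -> arrow::Status {
    const int32_t* in = batch[0].array.GetValues<int32_t>(1);
    int32_t* out_values = out->array_span()->GetValues<int32_t>(1);
    for (int64_t i = 0; i < batch.length; ++i) {
      out_values[i] = -in[i];
    }
    return arrow::Status::OK();
  };
  cp::ScalarKernel kernel({cp::InputType::Array(arrow::int32())}, arrow::int32(), exec);
  kernel.null_handling = cp::NullHandling::INTERSECTION;
  kernel.mem_allocation = cp::MemAllocation::PREALLOCATE;
  ARROW_RETURN_NOT_OK(func->AddKernel(std::move(kernel)));
  return registry->AddFunction(std::move(func));
}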
-using ArrayKernelExec = std::function; - /// \brief An type-checking interface to permit customizable validation rules /// for use with InputType and KernelSignature. This is for scenarios where the /// acceptance is not an exact type instance, such as a TIMESTAMP type for a @@ -486,7 +476,7 @@ struct MemAllocation { struct Kernel; -/// \brief Arguments to pass to a KernelInit function. A struct is used to help +/// \brief Arguments to pass to an KernelInit function. A struct is used to help /// avoid API breakage should the arguments passed need to be expanded. struct KernelInitArgs { /// \brief A pointer to the kernel being initialized. The init function may @@ -548,19 +538,32 @@ struct Kernel { SimdLevel::type simd_level = SimdLevel::NONE; }; -/// \brief Common kernel base data structure for ScalarKernel and -/// VectorKernel. It is called "ArrayKernel" in that the functions generally -/// output array values (as opposed to scalar values in the case of aggregate -/// functions). -struct ArrayKernel : public Kernel { - ArrayKernel() = default; +/// \brief The scalar kernel execution API that must be implemented for SCALAR +/// kernel types. This includes both stateless and stateful kernels. Kernels +/// depending on some execution state access that state via subclasses of +/// KernelState set on the KernelContext object. Implementations should +/// endeavor to write into pre-allocated memory if they are able, though for +/// some kernels (e.g. in cases when a builder like StringBuilder) must be +/// employed this may not be possible. +using ArrayKernelExec = + std::function; + +/// \brief Kernel execution API being phased out per ARROW-16756 +using ArrayKernelExecOld = + std::function; + +/// \brief Kernel data structure for implementations of ScalarFunction. In +/// addition to the members found in Kernel, contains the null handling +/// and memory pre-allocation preferences. +struct ScalarKernel : public Kernel { + ScalarKernel() = default; - ArrayKernel(std::shared_ptr sig, ArrayKernelExec exec, - KernelInit init = NULLPTR) + ScalarKernel(std::shared_ptr sig, ArrayKernelExec exec, + KernelInit init = NULLPTR) : Kernel(std::move(sig), init), exec(std::move(exec)) {} - ArrayKernel(std::vector in_types, OutputType out_type, ArrayKernelExec exec, - KernelInit init = NULLPTR) + ScalarKernel(std::vector in_types, OutputType out_type, ArrayKernelExec exec, + KernelInit init = NULLPTR) : Kernel(std::move(in_types), std::move(out_type), std::move(init)), exec(std::move(exec)) {} @@ -576,13 +579,6 @@ struct ArrayKernel : public Kernel { /// not be able to do this, so setting this to false disables this /// functionality. bool can_write_into_slices = true; -}; - -/// \brief Kernel data structure for implementations of ScalarFunction. In -/// addition to the members found in ArrayKernel, contains the null handling -/// and memory pre-allocation preferences. -struct ScalarKernel : public ArrayKernel { - using ArrayKernel::ArrayKernel; // For scalar functions preallocated data and intersecting arg validity // bitmaps is a reasonable default @@ -593,34 +589,39 @@ struct ScalarKernel : public ArrayKernel { // ---------------------------------------------------------------------- // VectorKernel (for VectorFunction) -/// \brief See VectorKernel::finalize member for usage -using VectorFinalize = std::function*)>; - /// \brief Kernel data structure for implementations of VectorFunction. 
In -/// addition to the members found in ArrayKernel, contains an optional -/// finalizer function, the null handling and memory pre-allocation preferences -/// (which have different defaults from ScalarKernel), and some other -/// execution-related options. -struct VectorKernel : public ArrayKernel { +/// contains an optional finalizer function, the null handling and memory +/// pre-allocation preferences (which have different defaults from +/// ScalarKernel), and some other execution-related options. +struct VectorKernel : public Kernel { + /// \brief See VectorKernel::finalize member for usage + using FinalizeFunc = std::function*)>; + VectorKernel() = default; - VectorKernel(std::vector in_types, OutputType out_type, ArrayKernelExec exec, - KernelInit init = NULLPTR, VectorFinalize finalize = NULLPTR) - : ArrayKernel(std::move(in_types), std::move(out_type), std::move(exec), - std::move(init)), + VectorKernel(std::vector in_types, OutputType out_type, + ArrayKernelExecOld exec, KernelInit init = NULLPTR, + FinalizeFunc finalize = NULLPTR) + : Kernel(std::move(in_types), std::move(out_type), std::move(init)), + exec(std::move(exec)), finalize(std::move(finalize)) {} - VectorKernel(std::shared_ptr sig, ArrayKernelExec exec, - KernelInit init = NULLPTR, VectorFinalize finalize = NULLPTR) - : ArrayKernel(std::move(sig), std::move(exec), std::move(init)), + VectorKernel(std::shared_ptr sig, ArrayKernelExecOld exec, + KernelInit init = NULLPTR, FinalizeFunc finalize = NULLPTR) + : Kernel(std::move(sig), std::move(init)), + exec(std::move(exec)), finalize(std::move(finalize)) {} + /// \brief Perform a single invocation of this kernel. Any required state is + /// managed through the KernelContext. + ArrayKernelExecOld exec; + /// \brief For VectorKernel, convert intermediate results into finalized /// results. Mutates input argument. Some kernels may accumulate state /// (example: hashing-related functions) through processing chunked inputs, and /// then need to attach some accumulated state to each of the outputs of /// processing each chunk of data. - VectorFinalize finalize; + FinalizeFunc finalize; /// Since vector kernels generally are implemented rather differently from /// scalar/elementwise kernels (and they may not even yield arrays of the same @@ -629,6 +630,13 @@ struct VectorKernel : public ArrayKernel { NullHandling::type null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; MemAllocation::type mem_allocation = MemAllocation::NO_PREALLOCATE; + /// \brief Writing execution results into larger contiguous allocations + /// requires that the kernel be able to write into sliced output ArrayData*, + /// including sliced output validity bitmaps. Some kernel implementations may + /// not be able to do this, so setting this to false disables this + /// functionality. + bool can_write_into_slices = true; + /// Some vector kernels can do chunkwise execution using ExecBatchIterator, /// in some cases accumulating some state. 
Other kernels (like Take) need to /// be passed whole arrays and don't work on ChunkedArray inputs diff --git a/cpp/src/arrow/compute/kernel_test.cc b/cpp/src/arrow/compute/kernel_test.cc index a63c42d4fde..2d427374426 100644 --- a/cpp/src/arrow/compute/kernel_test.cc +++ b/cpp/src/arrow/compute/kernel_test.cc @@ -266,7 +266,7 @@ TEST(OutputType, Constructors) { ASSERT_EQ(OutputType::COMPUTED, ty2.kind()); ASSERT_OK_AND_ASSIGN(ValueDescr out_descr2, ty2.Resolve(nullptr, {})); - ASSERT_EQ(ValueDescr::Scalar(int32()), out_descr2); + ASSERT_EQ(ValueDescr::Array(int32()), out_descr2); // Copy constructor OutputType ty3 = ty1; @@ -276,7 +276,7 @@ TEST(OutputType, Constructors) { OutputType ty4 = ty2; ASSERT_EQ(OutputType::COMPUTED, ty4.kind()); ASSERT_OK_AND_ASSIGN(ValueDescr out_descr4, ty4.Resolve(nullptr, {})); - ASSERT_EQ(ValueDescr::Scalar(int32()), out_descr4); + ASSERT_EQ(ValueDescr::Array(int32()), out_descr4); // Move constructor OutputType ty5 = std::move(ty1); @@ -286,7 +286,7 @@ TEST(OutputType, Constructors) { OutputType ty6 = std::move(ty4); ASSERT_EQ(OutputType::COMPUTED, ty6.kind()); ASSERT_OK_AND_ASSIGN(ValueDescr out_descr6, ty6.Resolve(nullptr, {})); - ASSERT_EQ(ValueDescr::Scalar(int32()), out_descr6); + ASSERT_EQ(ValueDescr::Array(int32()), out_descr6); // ToString @@ -300,7 +300,7 @@ TEST(OutputType, Resolve) { OutputType ty1(int32()); ASSERT_OK_AND_ASSIGN(ValueDescr descr, ty1.Resolve(nullptr, {})); - ASSERT_EQ(ValueDescr::Scalar(int32()), descr); + ASSERT_EQ(ValueDescr::Array(int32()), descr); ASSERT_OK_AND_ASSIGN(descr, ty1.Resolve(nullptr, {ValueDescr(int8(), ValueDescr::SCALAR)})); diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 16495bc8030..661b6a4edb1 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -141,7 +141,7 @@ struct CountDistinctImpl : public ScalarAggregator { int y; return memo_table_->GetOrInsert(arg, &y); }; - RETURN_NOT_OK(VisitArrayDataInline(arr, visit_value, visit_null)); + RETURN_NOT_OK(VisitArraySpanInline(arr, visit_value, visit_null)); this->non_nulls += memo_table_->size(); this->has_nulls = arr.GetNullCount() > 0; } else { diff --git a/cpp/src/arrow/compute/kernels/aggregate_mode.cc b/cpp/src/arrow/compute/kernels/aggregate_mode.cc index 7d3440cbef3..f8c56b2a220 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_mode.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_mode.cc @@ -389,7 +389,7 @@ Result ModeType(KernelContext*, const std::vector& descr } VectorKernel NewModeKernel(const std::shared_ptr& in_type, - ArrayKernelExec exec) { + ArrayKernelExecOld exec) { VectorKernel kernel; kernel.init = ModeState::Init; kernel.can_execute_chunkwise = false; @@ -433,8 +433,9 @@ void RegisterScalarAggregateMode(FunctionRegistry* registry) { DCHECK_OK(func->AddKernel( NewModeKernel(boolean(), ModeExecutor::Exec))); for (const auto& type : NumericTypes()) { + // TODO(wesm): DCHECK_OK(func->AddKernel( - NewModeKernel(type, GenerateNumeric(*type)))); + NewModeKernel(type, GenerateNumericOld(*type)))); } // Type parameters are ignored DCHECK_OK(func->AddKernel( diff --git a/cpp/src/arrow/compute/kernels/aggregate_quantile.cc b/cpp/src/arrow/compute/kernels/aggregate_quantile.cc index 810fb539913..d18d8425946 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_quantile.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_quantile.cc @@ -500,7 +500,7 @@ void AddQuantileKernels(VectorFunction* func) { for (const auto& ty : 
NumericTypes()) { base.signature = KernelSignature::Make({InputType(ty)}, OutputType(ResolveOutput)); // output type is determined at runtime, set template argument to nulltype - base.exec = GenerateNumeric(*ty); + base.exec = GenerateNumericOld(*ty); DCHECK_OK(func->AddKernel(base)); } { diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index faae139c130..79b01b477af 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -1063,7 +1063,7 @@ TEST_F(TestCountDistinctKernel, Random) { }; auto rand = random::RandomArrayGenerator(0x1205643); auto arr = rand.Numeric(1024, 0, 100, 0.0)->data(); - auto r = VisitArrayDataInline(*arr, visit_value, visit_null); + auto r = VisitArraySpanInline(*arr, visit_value, visit_null); auto input = builder.Finish().ValueOrDie(); Check(input, memo.size(), false); } diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.cc b/cpp/src/arrow/compute/kernels/codegen_internal.cc index b31ef408b10..c696e6376f7 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.cc +++ b/cpp/src/arrow/compute/kernels/codegen_internal.cc @@ -29,15 +29,19 @@ namespace arrow { namespace compute { namespace internal { -Status ExecFail(KernelContext* ctx, const ExecBatch& batch, Datum* out) { +Status ExecFail(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + return Status::NotImplemented("This kernel is malformed"); +} + +Status ExecFailOld(KernelContext* ctx, const ExecBatch& batch, Datum* out) { return Status::NotImplemented("This kernel is malformed"); } ArrayKernelExec MakeFlippedBinaryExec(ArrayKernelExec exec) { - return [exec](KernelContext* ctx, const ExecBatch& batch, Datum* out) { - ExecBatch flipped_batch = batch; - std::swap(flipped_batch.values[0], flipped_batch.values[1]); - return exec(ctx, flipped_batch, out); + return [exec](KernelContext* ctx, const ExecSpan& span, ExecResult* out) { + ExecSpan flipped_span = span; + std::swap(flipped_span.values[0], flipped_span.values[1]); + return exec(ctx, flipped_span, out); }; } diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h b/cpp/src/arrow/compute/kernels/codegen_internal.h index 6d31c1fe246..8c3c7e3d423 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.h +++ b/cpp/src/arrow/compute/kernels/codegen_internal.h @@ -58,6 +58,7 @@ using internal::BitmapReader; using internal::checked_cast; using internal::FirstTimeBitmapWriter; using internal::GenerateBitsUnrolled; +using internal::VisitBitBlocks; using internal::VisitBitBlocksVoid; using internal::VisitTwoBitBlocksVoid; @@ -237,7 +238,7 @@ struct ArrayIterator> { using T = typename TypeTraits::ScalarType::ValueType; const T* values; - explicit ArrayIterator(const ArrayData& data) : values(data.GetValues(1)) {} + explicit ArrayIterator(const ArraySpan& arr) : values(arr.GetValues(1)) {} T operator()() { return *values++; } }; @@ -245,8 +246,8 @@ template struct ArrayIterator> { BitmapReader reader; - explicit ArrayIterator(const ArrayData& data) - : reader(data.buffers[1]->data(), data.offset, data.length) {} + explicit ArrayIterator(const ArraySpan& arr) + : reader(arr.buffers[1].data, arr.offset, arr.length) {} bool operator()() { bool out = reader.IsSet(); reader.Next(); @@ -257,18 +258,17 @@ struct ArrayIterator> { template struct ArrayIterator> { using offset_type = typename Type::offset_type; - const ArrayData& arr; + const ArraySpan& arr; const offset_type* offsets; offset_type cur_offset; const char* data; 
int64_t position; - explicit ArrayIterator(const ArrayData& arr) + explicit ArrayIterator(const ArraySpan& arr) : arr(arr), - offsets(reinterpret_cast(arr.buffers[1]->data()) + - arr.offset), + offsets(reinterpret_cast(arr.buffers[1].data) + arr.offset), cur_offset(offsets[0]), - data(reinterpret_cast(arr.buffers[2]->data())), + data(reinterpret_cast(arr.buffers[2].data)), position(0) {} util::string_view operator()() { @@ -281,15 +281,15 @@ struct ArrayIterator> { template <> struct ArrayIterator { - const ArrayData& arr; + const ArraySpan& arr; const char* data; const int32_t width; int64_t position; - explicit ArrayIterator(const ArrayData& arr) + explicit ArrayIterator(const ArraySpan& arr) : arr(arr), - data(reinterpret_cast(arr.buffers[1]->data())), - width(checked_cast(*arr.type).byte_width()), + data(reinterpret_cast(arr.buffers[1].data)), + width(arr.type->byte_width()), position(arr.offset) {} util::string_view operator()() { @@ -309,7 +309,7 @@ struct OutputArrayWriter> { using T = typename TypeTraits::ScalarType::ValueType; T* values; - explicit OutputArrayWriter(ArrayData* data) : values(data->GetMutableValues(1)) {} + explicit OutputArrayWriter(ArraySpan* data) : values(data->GetValues(1)) {} void Write(T value) { *values++ = value; } @@ -340,7 +340,8 @@ struct UnboxScalar> { template struct UnboxScalar> { - static util::string_view Unbox(const Scalar& val) { + using T = util::string_view; + static T Unbox(const Scalar& val) { if (!val.is_valid) return util::string_view(); return util::string_view(*checked_cast(val).value); } @@ -348,14 +349,16 @@ struct UnboxScalar> { template <> struct UnboxScalar { - static const Decimal128& Unbox(const Scalar& val) { + using T = Decimal128; + static const T& Unbox(const Scalar& val) { return checked_cast(val).value; } }; template <> struct UnboxScalar { - static const Decimal256& Unbox(const Scalar& val) { + using T = Decimal256; + static const T& Unbox(const Scalar& val) { return checked_cast(val).value; } }; @@ -397,14 +400,14 @@ struct BoxScalar { static void Box(T val, Scalar* out) { checked_cast(out)->value = val; } }; -// A VisitArrayDataInline variant that calls its visitor function with logical +// A VisitArraySpanInline variant that calls its visitor function with logical // values, such as Decimal128 rather than util::string_view. template -static typename arrow::internal::call_traits::enable_if_return::type -VisitArrayValuesInline(const ArrayData& arr, VisitFunc&& valid_func, +static typename ::arrow::internal::call_traits::enable_if_return::type +VisitArrayValuesInline(const ArraySpan& arr, VisitFunc&& valid_func, NullFunc&& null_func) { - VisitArrayDataInline( + VisitArraySpanInline( arr, [&](typename GetViewType::PhysicalType v) { valid_func(GetViewType::LogicalValue(std::move(v))); @@ -413,10 +416,10 @@ VisitArrayValuesInline(const ArrayData& arr, VisitFunc&& valid_func, } template -static typename arrow::internal::call_traits::enable_if_return::type -VisitArrayValuesInline(const ArrayData& arr, VisitFunc&& valid_func, +static typename ::arrow::internal::call_traits::enable_if_return::type +VisitArrayValuesInline(const ArraySpan& arr, VisitFunc&& valid_func, NullFunc&& null_func) { - return VisitArrayDataInline( + return VisitArraySpanInline( arr, [&](typename GetViewType::PhysicalType v) { return valid_func(GetViewType::LogicalValue(std::move(v))); @@ -427,7 +430,7 @@ VisitArrayValuesInline(const ArrayData& arr, VisitFunc&& valid_func, // Like VisitArrayValuesInline, but for binary functions. 
template -static void VisitTwoArrayValuesInline(const ArrayData& arr0, const ArrayData& arr1, +static void VisitTwoArrayValuesInline(const ArraySpan& arr0, const ArraySpan& arr1, VisitFunc&& valid_func, NullFunc&& null_func) { ArrayIterator arr0_it(arr0); ArrayIterator arr1_it(arr1); @@ -441,8 +444,9 @@ static void VisitTwoArrayValuesInline(const ArrayData& arr0, const ArrayData& ar arr1_it(); null_func(); }; - VisitTwoBitBlocksVoid(arr0.buffers[0], arr0.offset, arr1.buffers[0], arr1.offset, - arr0.length, std::move(visit_valid), std::move(visit_null)); + VisitTwoBitBlocksVoid(arr0.buffers[0].data, arr0.offset, arr1.buffers[0].data, + arr1.offset, arr0.length, std::move(visit_valid), + std::move(visit_null)); } // ---------------------------------------------------------------------- @@ -455,7 +459,8 @@ Result ListValuesType(KernelContext*, const std::vector& // ---------------------------------------------------------------------- // Generate an array kernel given template classes -Status ExecFail(KernelContext* ctx, const ExecBatch& batch, Datum* out); +Status ExecFail(KernelContext* ctx, const ExecSpan& batch, ExecResult* out); +Status ExecFailOld(KernelContext* ctx, const ExecBatch& batch, Datum* out); ArrayKernelExec MakeFlippedBinaryExec(ArrayKernelExec exec); @@ -493,14 +498,14 @@ namespace applicator { // // Operator must implement // -// static Status Call(KernelContext*, const ArrayData& in, ArrayData* out) -// static Status Call(KernelContext*, const Scalar& in, Scalar* out) +// static Status Call(KernelContext*, const ArraySpan& in, ExecResult* out) +// static Status Call(KernelContext*, const Scalar& in, ExecResult* out) template -static Status SimpleUnary(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - if (batch[0].kind() == Datum::SCALAR) { - return Operator::Call(ctx, *batch[0].scalar(), out->scalar().get()); +static Status SimpleUnary(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + if (batch[0].is_scalar()) { + return Operator::Call(ctx, *batch[0].scalar, out); } else if (batch.length > 0) { - return Operator::Call(ctx, *batch[0].array(), out->mutable_array()); + return Operator::Call(ctx, batch[0].array, out); } return Status::OK(); } @@ -510,33 +515,29 @@ static Status SimpleUnary(KernelContext* ctx, const ExecBatch& batch, Datum* out // // Operator must implement // -// static Status Call(KernelContext*, const ArrayData& arg0, const ArrayData& arg1, -// ArrayData* out) -// static Status Call(KernelContext*, const ArrayData& arg0, const Scalar& arg1, -// ArrayData* out) -// static Status Call(KernelContext*, const Scalar& arg0, const ArrayData& arg1, -// ArrayData* out) +// static Status Call(KernelContext*, const ArraySpan& arg0, const ArraySpan& arg1, +// * out) +// static Status Call(KernelContext*, const ArraySpan& arg0, const Scalar& arg1, +// ExecResult* out) +// static Status Call(KernelContext*, const Scalar& arg0, const ArraySpan& arg1, +// ExecResult* out) // static Status Call(KernelContext*, const Scalar& arg0, const Scalar& arg1, -// Scalar* out) +// ExecResult* out) template -static Status SimpleBinary(KernelContext* ctx, const ExecBatch& batch, Datum* out) { +static Status SimpleBinary(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { if (batch.length == 0) return Status::OK(); - if (batch[0].kind() == Datum::ARRAY) { - if (batch[1].kind() == Datum::ARRAY) { - return Operator::Call(ctx, *batch[0].array(), *batch[1].array(), - out->mutable_array()); + if (batch[0].is_array()) { + if (batch[1].is_array()) { + return 
Operator::Call(ctx, batch[0].array, batch[1].array, out); } else { - return Operator::Call(ctx, *batch[0].array(), *batch[1].scalar(), - out->mutable_array()); + return Operator::Call(ctx, batch[0].array, *batch[1].scalar, out); } } else { - if (batch[1].kind() == Datum::ARRAY) { - return Operator::Call(ctx, *batch[0].scalar(), *batch[1].array(), - out->mutable_array()); + if (batch[1].is_array()) { + return Operator::Call(ctx, *batch[0].scalar, batch[1].array, out); } else { - return Operator::Call(ctx, *batch[0].scalar(), *batch[1].scalar(), - out->scalar().get()); + return Operator::Call(ctx, *batch[0].scalar, *batch[1].scalar, out); } } } @@ -551,10 +552,8 @@ struct OutputAdapter; template struct OutputAdapter> { template - static Status Write(KernelContext*, Datum* out, Generator&& generator) { - ArrayData* out_arr = out->mutable_array(); - auto out_bitmap = out_arr->buffers[1]->mutable_data(); - GenerateBitsUnrolled(out_bitmap, out_arr->offset, out_arr->length, + static Status Write(KernelContext*, ArraySpan* out, Generator&& generator) { + GenerateBitsUnrolled(out->buffers[1].data, out->offset, out->length, std::forward(generator)); return Status::OK(); } @@ -565,11 +564,10 @@ struct OutputAdapter> { using T = typename TypeTraits::ScalarType::ValueType; template - static Status Write(KernelContext*, Datum* out, Generator&& generator) { - ArrayData* out_arr = out->mutable_array(); - auto out_data = out_arr->GetMutableValues(1); + static Status Write(KernelContext*, ArraySpan* out, Generator&& generator) { + T* out_data = out->GetValues(1); // TODO: Is this as fast as a more explicitly inlined function? - for (int64_t i = 0; i < out_arr->length; ++i) { + for (int64_t i = 0; i < out->length; ++i) { *out_data++ = generator(); } return Status::OK(); @@ -579,7 +577,7 @@ struct OutputAdapter> { template struct OutputAdapter> { template - static Status Write(KernelContext* ctx, Datum* out, Generator&& generator) { + static Status Write(KernelContext* ctx, ArraySpan* out, Generator&& generator) { return Status::NotImplemented("NYI"); } }; @@ -607,16 +605,17 @@ struct ScalarUnary { using OutValue = typename GetOutputType::T; using Arg0Value = typename GetViewType::T; - static Status ExecArray(KernelContext* ctx, const ArrayData& arg0, Datum* out) { + static Status ExecArray(KernelContext* ctx, const ArraySpan& arg0, ExecResult* out) { Status st = Status::OK(); ArrayIterator arg0_it(arg0); - RETURN_NOT_OK(OutputAdapter::Write(ctx, out, [&]() -> OutValue { - return Op::template Call(ctx, arg0_it(), &st); - })); + RETURN_NOT_OK( + OutputAdapter::Write(ctx, out->array_span(), [&]() -> OutValue { + return Op::template Call(ctx, arg0_it(), &st); + })); return st; } - static Status ExecScalar(KernelContext* ctx, const Scalar& arg0, Datum* out) { + static Status ExecScalar(KernelContext* ctx, const Scalar& arg0, ExecResult* out) { Status st = Status::OK(); Scalar* out_scalar = out->scalar().get(); if (arg0.is_valid) { @@ -630,11 +629,11 @@ struct ScalarUnary { return st; } - static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - if (batch[0].kind() == Datum::ARRAY) { - return ExecArray(ctx, *batch[0].array(), out); + static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + if (batch[0].is_array()) { + return ExecArray(ctx, batch[0].array, out); } else { - return ExecScalar(ctx, *batch[0].scalar(), out); + return ExecScalar(ctx, *batch[0].scalar, out); } } }; @@ -654,8 +653,8 @@ struct ScalarUnaryNotNullStateful { template struct ArrayExec { - static 
Status Exec(const ThisType& functor, KernelContext* ctx, - const ExecBatch& batch, Datum* out) { + static Status Exec(const ThisType& functor, KernelContext* ctx, const ExecSpan& batch, + ExecResult* out) { ARROW_LOG(FATAL) << "Missing ArrayExec specialization for output type " << out->type(); return Status::NotImplemented("NYI"); @@ -664,11 +663,10 @@ struct ScalarUnaryNotNullStateful { template struct ArrayExec> { - static Status Exec(const ThisType& functor, KernelContext* ctx, const ArrayData& arg0, - Datum* out) { + static Status Exec(const ThisType& functor, KernelContext* ctx, const ArraySpan& arg0, + ExecResult* out) { Status st = Status::OK(); - ArrayData* out_arr = out->mutable_array(); - auto out_data = out_arr->GetMutableValues(1); + auto out_data = out->array_span()->GetValues(1); VisitArrayValuesInline( arg0, [&](Arg0Value v) { @@ -684,8 +682,8 @@ struct ScalarUnaryNotNullStateful { template struct ArrayExec> { - static Status Exec(const ThisType& functor, KernelContext* ctx, const ArrayData& arg0, - Datum* out) { + static Status Exec(const ThisType& functor, KernelContext* ctx, const ArraySpan& arg0, + ExecResult* out) { // NOTE: This code is not currently used by any kernels and has // suboptimal performance because it's recomputing the validity bitmap // that is already computed by the kernel execution layer. Consider @@ -706,12 +704,12 @@ struct ScalarUnaryNotNullStateful { template struct ArrayExec::value>> { - static Status Exec(const ThisType& functor, KernelContext* ctx, const ArrayData& arg0, - Datum* out) { + static Status Exec(const ThisType& functor, KernelContext* ctx, const ArraySpan& arg0, + ExecResult* out) { Status st = Status::OK(); - ArrayData* out_arr = out->mutable_array(); - FirstTimeBitmapWriter out_writer(out_arr->buffers[1]->mutable_data(), - out_arr->offset, out_arr->length); + ArraySpan* out_arr = out->array_span(); + FirstTimeBitmapWriter out_writer(out_arr->buffers[1].data, out_arr->offset, + out_arr->length); VisitArrayValuesInline( arg0, [&](Arg0Value v) { @@ -730,7 +728,7 @@ struct ScalarUnaryNotNullStateful { } }; - Status Scalar(KernelContext* ctx, const Scalar& arg0, Datum* out) { + Status Scalar(KernelContext* ctx, const Scalar& arg0, ExecResult* out) { Status st = Status::OK(); if (arg0.is_valid) { Arg0Value arg0_val = UnboxScalar::Unbox(arg0); @@ -741,11 +739,11 @@ struct ScalarUnaryNotNullStateful { return st; } - Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - if (batch[0].kind() == Datum::ARRAY) { - return ArrayExec::Exec(*this, ctx, *batch[0].array(), out); + Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + if (batch[0].is_array()) { + return ArrayExec::Exec(*this, ctx, batch[0].array, out); } else { - return Scalar(ctx, *batch[0].scalar(), out); + return Scalar(ctx, *batch[0].scalar, out); } } }; @@ -758,7 +756,7 @@ struct ScalarUnaryNotNull { using OutValue = typename GetOutputType::T; using Arg0Value = typename GetViewType::T; - static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { // Seed kernel with dummy state ScalarUnaryNotNullStateful kernel({}); return kernel.Exec(ctx, batch, out); @@ -790,44 +788,47 @@ struct ScalarBinary { using Arg0Value = typename GetViewType::T; using Arg1Value = typename GetViewType::T; - static Status ArrayArray(KernelContext* ctx, const ArrayData& arg0, - const ArrayData& arg1, Datum* out) { + static Status ArrayArray(KernelContext* ctx, const 
ArraySpan& arg0, + const ArraySpan& arg1, ExecResult* out) { Status st = Status::OK(); ArrayIterator arg0_it(arg0); ArrayIterator arg1_it(arg1); - RETURN_NOT_OK(OutputAdapter::Write(ctx, out, [&]() -> OutValue { - return Op::template Call(ctx, arg0_it(), arg1_it(), - &st); - })); + RETURN_NOT_OK( + OutputAdapter::Write(ctx, out->array_span(), [&]() -> OutValue { + return Op::template Call(ctx, arg0_it(), + arg1_it(), &st); + })); return st; } - static Status ArrayScalar(KernelContext* ctx, const ArrayData& arg0, const Scalar& arg1, - Datum* out) { + static Status ArrayScalar(KernelContext* ctx, const ArraySpan& arg0, const Scalar& arg1, + ExecResult* out) { Status st = Status::OK(); ArrayIterator arg0_it(arg0); auto arg1_val = UnboxScalar::Unbox(arg1); - RETURN_NOT_OK(OutputAdapter::Write(ctx, out, [&]() -> OutValue { - return Op::template Call(ctx, arg0_it(), arg1_val, - &st); - })); + RETURN_NOT_OK( + OutputAdapter::Write(ctx, out->array_span(), [&]() -> OutValue { + return Op::template Call(ctx, arg0_it(), + arg1_val, &st); + })); return st; } - static Status ScalarArray(KernelContext* ctx, const Scalar& arg0, const ArrayData& arg1, - Datum* out) { + static Status ScalarArray(KernelContext* ctx, const Scalar& arg0, const ArraySpan& arg1, + ExecResult* out) { Status st = Status::OK(); auto arg0_val = UnboxScalar::Unbox(arg0); ArrayIterator arg1_it(arg1); - RETURN_NOT_OK(OutputAdapter::Write(ctx, out, [&]() -> OutValue { - return Op::template Call(ctx, arg0_val, arg1_it(), - &st); - })); + RETURN_NOT_OK( + OutputAdapter::Write(ctx, out->array_span(), [&]() -> OutValue { + return Op::template Call(ctx, arg0_val, + arg1_it(), &st); + })); return st; } static Status ScalarScalar(KernelContext* ctx, const Scalar& arg0, const Scalar& arg1, - Datum* out) { + ExecResult* out) { Status st = Status::OK(); if (out->scalar()->is_valid) { auto arg0_val = UnboxScalar::Unbox(arg0); @@ -839,18 +840,18 @@ struct ScalarBinary { return st; } - static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - if (batch[0].kind() == Datum::ARRAY) { - if (batch[1].kind() == Datum::ARRAY) { - return ArrayArray(ctx, *batch[0].array(), *batch[1].array(), out); + static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + if (batch[0].is_array()) { + if (batch[1].is_array()) { + return ArrayArray(ctx, batch[0].array, batch[1].array, out); } else { - return ArrayScalar(ctx, *batch[0].array(), *batch[1].scalar(), out); + return ArrayScalar(ctx, batch[0].array, *batch[1].scalar, out); } } else { - if (batch[1].kind() == Datum::ARRAY) { - return ScalarArray(ctx, *batch[0].scalar(), *batch[1].array(), out); + if (batch[1].is_array()) { + return ScalarArray(ctx, *batch[0].scalar, batch[1].array, out); } else { - return ScalarScalar(ctx, *batch[0].scalar(), *batch[1].scalar(), out); + return ScalarScalar(ctx, *batch[0].scalar, *batch[1].scalar, out); } } } @@ -870,10 +871,10 @@ struct ScalarBinaryNotNullStateful { // NOTE: In ArrayExec, Type is really OutputType - Status ArrayArray(KernelContext* ctx, const ArrayData& arg0, const ArrayData& arg1, - Datum* out) { + Status ArrayArray(KernelContext* ctx, const ArraySpan& arg0, const ArraySpan& arg1, + ExecResult* out) { Status st = Status::OK(); - OutputArrayWriter writer(out->mutable_array()); + OutputArrayWriter writer(out->array_span()); VisitTwoArrayValuesInline( arg0, arg1, [&](Arg0Value u, Arg1Value v) { @@ -883,10 +884,11 @@ struct ScalarBinaryNotNullStateful { return st; } - Status ArrayScalar(KernelContext* ctx, const ArrayData& 
arg0, const Scalar& arg1, - Datum* out) { + Status ArrayScalar(KernelContext* ctx, const ArraySpan& arg0, const Scalar& arg1, + ExecResult* out) { Status st = Status::OK(); - OutputArrayWriter writer(out->mutable_array()); + ArraySpan* out_span = out->array_span(); + OutputArrayWriter writer(out_span); if (arg1.is_valid) { const auto arg1_val = UnboxScalar::Unbox(arg1); VisitArrayValuesInline( @@ -897,15 +899,16 @@ struct ScalarBinaryNotNullStateful { }, [&]() { writer.WriteNull(); }); } else { - writer.WriteAllNull(out->mutable_array()->length); + writer.WriteAllNull(out_span->length); } return st; } - Status ScalarArray(KernelContext* ctx, const Scalar& arg0, const ArrayData& arg1, - Datum* out) { + Status ScalarArray(KernelContext* ctx, const Scalar& arg0, const ArraySpan& arg1, + ExecResult* out) { Status st = Status::OK(); - OutputArrayWriter writer(out->mutable_array()); + ArraySpan* out_span = out->array_span(); + OutputArrayWriter writer(out_span); if (arg0.is_valid) { const auto arg0_val = UnboxScalar::Unbox(arg0); VisitArrayValuesInline( @@ -916,13 +919,13 @@ struct ScalarBinaryNotNullStateful { }, [&]() { writer.WriteNull(); }); } else { - writer.WriteAllNull(out->mutable_array()->length); + writer.WriteAllNull(out_span->length); } return st; } Status ScalarScalar(KernelContext* ctx, const Scalar& arg0, const Scalar& arg1, - Datum* out) { + ExecResult* out) { Status st = Status::OK(); if (arg0.is_valid && arg1.is_valid) { const auto arg0_val = UnboxScalar::Unbox(arg0); @@ -934,18 +937,18 @@ struct ScalarBinaryNotNullStateful { return st; } - Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - if (batch[0].kind() == Datum::ARRAY) { - if (batch[1].kind() == Datum::ARRAY) { - return ArrayArray(ctx, *batch[0].array(), *batch[1].array(), out); + Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + if (batch[0].is_array()) { + if (batch[1].is_array()) { + return ArrayArray(ctx, batch[0].array, batch[1].array, out); } else { - return ArrayScalar(ctx, *batch[0].array(), *batch[1].scalar(), out); + return ArrayScalar(ctx, batch[0].array, *batch[1].scalar, out); } } else { - if (batch[1].kind() == Datum::ARRAY) { - return ScalarArray(ctx, *batch[0].scalar(), *batch[1].array(), out); + if (batch[1].is_array()) { + return ScalarArray(ctx, *batch[0].scalar, batch[1].array, out); } else { - return ScalarScalar(ctx, *batch[0].scalar(), *batch[1].scalar(), out); + return ScalarScalar(ctx, *batch[0].scalar, *batch[1].scalar, out); } } } @@ -961,7 +964,7 @@ struct ScalarBinaryNotNull { using Arg0Value = typename GetViewType::T; using Arg1Value = typename GetViewType::T; - static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { // Seed kernel with dummy state ScalarBinaryNotNullStateful kernel({}); return kernel.Exec(ctx, batch, out); @@ -997,7 +1000,7 @@ using ScalarBinaryNotNullStatefulEqualTypes = // // template // struct FUNCTOR { -// static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { +// static void Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { // // IMPLEMENTATION // } // }; @@ -1059,6 +1062,38 @@ ArrayKernelExec GenerateNumeric(detail::GetTypeId get_id) { } } +// TODO(wesm): for ARROW-16756, while in transition to a new kernel +// API I duplicated this generator dispatcher to be able to create old +// kernel types +template
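Editor's sketch (not part of the patch): a minimal scalar kernel written against the new span-based ArrayKernelExec signature defined above, assuming the default ScalarKernel settings noted in kernel.h (preallocated output data, intersected validity bitmaps) so the kernel can write straight into the output span. The names NegateInt64Exec and "negate_i64_example" are hypothetical; only array inputs are handled, for brevity.

namespace cp = arrow::compute;

arrow::Status NegateInt64Exec(cp::KernelContext* ctx, const cp::ExecSpan& batch,
                              cp::ExecResult* out) {
  // The executor preallocated the output, so write directly into the span view.
  const int64_t* in_values = batch[0].array.GetValues<int64_t>(1);
  int64_t* out_values = out->array_span()->GetValues<int64_t>(1);
  for (int64_t i = 0; i < batch.length; ++i) {
    out_values[i] = -in_values[i];
  }
  return arrow::Status::OK();
}

// Hypothetical registration, using the same AddKernel overload exercised in
// function_test.cc above:
//   cp::ScalarFunction func("negate_i64_example", cp::Arity::Unary(),
//                           cp::FunctionDoc::Empty());
//   ARROW_RETURN_NOT_OK(func.AddKernel({arrow::int64()}, arrow::int64(),
//                                      NegateInt64Exec));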
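Editor's sketch (not part of the patch): the Op contract consumed by the ScalarUnary / ScalarUnaryNotNull applicators above, which invoke Op::template Call<OutValue, Arg0Value>(ctx, value, &st) once per element. SquareOp is a hypothetical operation; the wiring in the trailing comment assumes the applicator namespace used by codegen_internal.h.

struct SquareOp {
  template <typename OutValue, typename Arg0Value>
  static OutValue Call(arrow::compute::KernelContext*, Arg0Value v,
                       arrow::Status* st) {
    // st is only written on failure; squaring cannot fail, so it is left alone.
    return static_cast<OutValue>(v) * static_cast<OutValue>(v);
  }
};

// A span-based exec could then be generated as, for example:
//   ArrayKernelExec exec =
//       applicator::ScalarUnary<Int64Type, Int64Type, SquareOp>::Exec;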
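Editor's sketch (not part of the patch): the valid/null callback protocol of VisitArraySpanInline, the ArraySpan counterpart of VisitArrayDataInline adopted by the aggregate kernels above. It assumes the same namespace context as those files, where the helper is callable unqualified; SumNonNull is a hypothetical helper.

int64_t SumNonNull(const ArraySpan& span) {
  int64_t sum = 0;
  VisitArraySpanInline<Int64Type>(
      span,
      [&](int64_t v) { sum += v; },  // invoked once per valid slot
      [&]() {});                     // invoked once per null slot
  return sum;
}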