GH-41974: [C++][Compute] Support more precise pre-allocation and more pre-allocated types for ScalarExecutor and VectorExecutor #41975
@@ -297,11 +297,23 @@ void ComputeDataPreallocate(const DataType& type,
     case Type::MAP:
       widths->emplace_back(32, /*added_length=*/1);
       return;
+    case Type::LIST_VIEW: {
+      // add offsets and sizes
+      widths->emplace_back(32);
+      widths->emplace_back(32);
+      return;
+    }
     case Type::LARGE_BINARY:
     case Type::LARGE_STRING:
     case Type::LARGE_LIST:
       widths->emplace_back(64, /*added_length=*/1);
       return;
+    case Type::LARGE_LIST_VIEW: {
+      // add offsets and sizes
+      widths->emplace_back(64);
+      widths->emplace_back(64);
+      return;
+    }
     default:
       break;
   }
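As an aside for readers of this hunk: list views pre-allocate two equal-width buffers (offsets and sizes) with no extra slot, while offset-based lists pre-allocate a single offsets buffer with one extra slot. Below is a minimal standalone sketch of that difference; `BufferPreallocation` here is a simplified stand-in for the Arrow struct of the same name, with field meanings assumed from how `ComputeDataPreallocate` uses them, and `DescribeListLike` is a hypothetical helper.

```cpp
#include <iostream>
#include <vector>

// Simplified stand-in for Arrow's BufferPreallocation; field meanings are
// assumed from the emplace_back calls in the hunk above.
struct BufferPreallocation {
  int bit_width;     // element width in bits
  int added_length;  // extra elements beyond the batch length (the "+1" slot)
};

// Hypothetical helper mirroring the shape of the switch above.
std::vector<BufferPreallocation> DescribeListLike(bool is_view, bool is_large) {
  const int width = is_large ? 64 : 32;
  if (is_view) {
    // List views: an offsets buffer and a sizes buffer, no extra slot.
    return {{width, 0}, {width, 0}};
  }
  // Offset-based lists: a single offsets buffer with length + 1 entries.
  return {{width, 1}};
}

int main() {
  for (bool large : {false, true}) {
    for (bool view : {false, true}) {
      const auto descr = DescribeListLike(view, large);
      std::cout << (large ? "large_" : "") << (view ? "list_view" : "list")
                << ": " << descr.size() << " data buffer(s)\n";
    }
  }
  return 0;
}
```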
@@ -410,7 +422,7 @@ bool ExecSpanIterator::Next(ExecSpan* span) {
   // The first time this is called, we populate the output span with any
   // Scalar or Array arguments in the ExecValue struct, and then just
   // increment array offsets below. If any arguments are ChunkedArray, then
-  // the internal ArraySpans will see their members updated during hte
+  // the internal ArraySpans will see their members updated during the
   // iteration
   span->values.resize(args_->size());
   for (size_t i = 0; i < args_->size(); ++i) {
@@ -473,7 +485,7 @@ bool ExecSpanIterator::Next(ExecSpan* span) {
 namespace {

 struct NullGeneralization {
-  enum type { PERHAPS_NULL, ALL_VALID, ALL_NULL };
+  enum type { PERHAPS_NULL = 0, ALL_VALID = 1, ALL_NULL = 2 };

   static type Get(const ExecValue& value) {
     const auto dtype_id = value.type()->id();
@@ -498,15 +510,34 @@ struct NullGeneralization {
     return PERHAPS_NULL;
   }

+  static type Get(const ChunkedArray& chunk_array) {
+    std::optional<type> current_gen;
+    for (const auto& chunk : chunk_array.chunks()) {
+      if (chunk->length() == 0) {
+        continue;
+      }
+
+      const auto& chunk_gen = Get(chunk);
+      if (current_gen.has_value() && chunk_gen != *current_gen) {
+        return PERHAPS_NULL;
+      }
+      current_gen = chunk_gen;
+    }
+    return current_gen.value_or(ALL_VALID);
+  }
+
   static type Get(const Datum& datum) {
+    if (datum.is_chunked_array()) {
+      return Get(*datum.chunked_array());
+    }
+
     // Temporary workaround to help with ARROW-16756
     ExecValue value;
     if (datum.is_array()) {
       value.SetArray(*datum.array());
     } else if (datum.is_scalar()) {
       value.SetScalar(datum.scalar().get());
     } else {
       // TODO(wesm): ChunkedArray, I think
       return PERHAPS_NULL;
     }
     return Get(value);
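The chunk-combining rule added in `Get(const ChunkedArray&)` can be exercised standalone. A minimal sketch, with `Chunk` as a stand-in rather than an Arrow type: empty chunks are skipped, agreeing chunks keep their shared classification, any disagreement degrades to PERHAPS_NULL, and an input with no non-empty chunks defaults to ALL_VALID.

```cpp
#include <cassert>
#include <optional>
#include <vector>

enum NullGen { PERHAPS_NULL = 0, ALL_VALID = 1, ALL_NULL = 2 };

// Stand-in for a chunk: its length plus the classification that
// Get(const ExecValue&) would report for it.
struct Chunk {
  long length;
  NullGen gen;
};

NullGen CombineChunks(const std::vector<Chunk>& chunks) {
  std::optional<NullGen> current;
  for (const auto& chunk : chunks) {
    if (chunk.length == 0) continue;  // empty chunks carry no information
    if (current.has_value() && chunk.gen != *current) {
      return PERHAPS_NULL;  // chunks disagree: no stronger guarantee holds
    }
    current = chunk.gen;
  }
  return current.value_or(ALL_VALID);  // no non-empty chunks at all
}

int main() {
  assert(CombineChunks({{3, ALL_VALID}, {0, ALL_NULL}, {2, ALL_VALID}}) == ALL_VALID);
  assert(CombineChunks({{3, ALL_NULL}, {2, ALL_NULL}}) == ALL_NULL);
  assert(CombineChunks({{3, ALL_VALID}, {2, ALL_NULL}}) == PERHAPS_NULL);
  assert(CombineChunks({{0, ALL_NULL}}) == ALL_VALID);  // only empty chunks
  return 0;
}
```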
@@ -738,12 +769,14 @@ class KernelExecutorImpl : public KernelExecutor {
     }
     for (size_t i = 0; i < data_preallocated_.size(); ++i) {
       const auto& prealloc = data_preallocated_[i];
-      if (prealloc.bit_width >= 0) {
-        ARROW_ASSIGN_OR_RAISE(
-            out->buffers[i + 1],
-            AllocateDataBuffer(kernel_ctx_, length + prealloc.added_length,
-                               prealloc.bit_width));
-      }
+
+      // ComputeDataPreallocate ensures that every element in
+      // data_preallocated_ satisfies the DCHECK below
+      DCHECK_GE(prealloc.bit_width, 0);
+      ARROW_ASSIGN_OR_RAISE(
+          out->buffers[i + 1],
+          AllocateDataBuffer(kernel_ctx_, length + prealloc.added_length,
+                             prealloc.bit_width));
     }
     return out;
   }
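Why the runtime guard can become a DCHECK: a width of zero (fixed_size_binary<0>) simply produces an empty buffer, so only negative widths would be invalid, and ComputeDataPreallocate never emits those. A sketch with assumed sizing semantics follows; this is not Arrow's actual AllocateDataBuffer, which also handles padding and alignment.

```cpp
#include <cassert>
#include <cstdint>

// Assumed sizing rule: 1-bit buffers round up to whole bytes, wider
// elements take bit_width / 8 bytes each. Padding/alignment is omitted.
int64_t DataBufferBytes(int64_t length, int bit_width) {
  assert(bit_width >= 0);  // the invariant the new DCHECK_GE asserts
  if (bit_width == 1) {
    return (length + 7) / 8;  // bitmap
  }
  return length * (bit_width / 8);
}

int main() {
  assert(DataBufferBytes(10, 0) == 0);    // fixed_size_binary<0>: empty buffer
  assert(DataBufferBytes(10, 1) == 2);    // boolean bitmap
  assert(DataBufferBytes(10, 32) == 40);  // e.g. int32 offsets
  return 0;
}
```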
@@ -796,7 +829,7 @@ class ScalarExecutor : public KernelExecutorImpl<ScalarKernel> {
   // kernels supporting preallocation, then we do so up front and then
   // iterate over slices of that large array. Otherwise, we preallocate prior
   // to processing each span emitted from the ExecSpanIterator
-  RETURN_NOT_OK(SetupPreallocation(span_iterator_.length(), batch.values));
+  RETURN_NOT_OK(SetupPreallocation(batch.values));

   // ARROW-16756: Here we have to accommodate the distinct cases
   //
@@ -928,7 +961,7 @@ class ScalarExecutor : public KernelExecutorImpl<ScalarKernel> {
     return Status::OK();
   }

-  Status SetupPreallocation(int64_t total_length, const std::vector<Datum>& args) {
+  Status SetupPreallocation(const std::vector<Datum>& args) {
     output_num_buffers_ = static_cast<int>(output_type_.type->layout().buffers.size());
     auto out_type_id = output_type_.type->id();
     // Default to no validity pre-allocation for following cases:
@@ -966,12 +999,6 @@ class ScalarExecutor : public KernelExecutorImpl<ScalarKernel> {
       data_preallocated_.size() == static_cast<size_t>(output_num_buffers_ - 1) &&
       !is_nested(out_type_id) && !is_dictionary(out_type_id));

-  // TODO(wesm): why was this check ever here? Fixed width binary
-  // can be 0-width but anything else?
-  DCHECK(std::all_of(
-      data_preallocated_.begin(), data_preallocated_.end(),
-      [](const BufferPreallocation& prealloc) { return prealloc.bit_width >= 0; }));
-
   // Contiguous preallocation only possible on non-nested types if all
   // buffers are preallocated. Otherwise, we must go chunk-by-chunk.
   //
@@ -1022,15 +1049,6 @@ Status CheckCanExecuteChunked(const VectorKernel* kernel) {
 class VectorExecutor : public KernelExecutorImpl<VectorKernel> {
  public:
   Status Execute(const ExecBatch& batch, ExecListener* listener) override {
-    // Some vector kernels have a separate code path for handling
-    // chunked arrays (VectorKernel::exec_chunked) so we check if we
-    // have any chunked arrays. If we do and an exec_chunked function
-    // is defined then we call that.
-    bool have_chunked_arrays = false;
-    for (const Datum& arg : batch.values) {
-      if (arg.is_chunked_array()) have_chunked_arrays = true;
-    }
-
     output_num_buffers_ = static_cast<int>(output_type_.type->layout().buffers.size());

     // Decide if we need to preallocate memory for this kernel
@@ -1049,10 +1067,19 @@ class VectorExecutor : public KernelExecutorImpl<VectorKernel> {
         RETURN_NOT_OK(Exec(span, listener));
       }
     } else {
-      // Kernel cannot execute chunkwise. If we have any chunked
-      // arrays, then VectorKernel::exec_chunked must be defined
-      // otherwise we raise an error
+      // Some vector kernels have a separate code path for handling
+      // chunked arrays (VectorKernel::exec_chunked) so we check if we
+      // have any chunked arrays. If we do and an exec_chunked function
+      // is defined then we call that.
+      bool have_chunked_arrays = false;
+      for (const Datum& arg : batch.values) {
+        if (arg.is_chunked_array()) have_chunked_arrays = true;
+      }
+
       if (have_chunked_arrays) {
+        // Kernel cannot execute chunkwise. If we have any chunked
+        // arrays, then VectorKernel::exec_chunked must be defined
+        // otherwise we raise an error
         RETURN_NOT_OK(ExecChunked(batch, listener));
       } else {
         // No chunked arrays. We pack the args into an ExecSpan and
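Condensing the control flow after this hunk, here is a sketch with illustrative names (not the Arrow classes): the scan for chunked arguments now happens only on the branch where the kernel cannot execute span by span, since chunkwise kernels never need the exec_chunked path.

```cpp
#include <iostream>
#include <vector>

struct Arg {
  bool chunked;  // stand-in for Datum::is_chunked_array()
};

// Mirrors the branch structure of VectorExecutor::Execute after the change.
void Execute(bool can_execute_chunkwise, const std::vector<Arg>& args) {
  if (can_execute_chunkwise) {
    std::cout << "iterate spans\n";  // spans work whether or not inputs are chunked
    return;
  }
  // Only non-chunkwise kernels need to know whether any input is chunked.
  bool have_chunked_arrays = false;
  for (const Arg& arg : args) {
    if (arg.chunked) have_chunked_arrays = true;
  }
  if (have_chunked_arrays) {
    std::cout << "exec_chunked\n";  // whole-ChunkedArray code path
  } else {
    std::cout << "single ExecSpan\n";  // pack all args into one span
  }
}

int main() {
  Execute(true, {{true}});    // iterate spans
  Execute(false, {{true}});   // exec_chunked
  Execute(false, {{false}});  // single ExecSpan
  return 0;
}
```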
Review discussion:

Reviewer: What would the NA type do here?

Author: data_preallocated_ will be filled in ComputeDataPreallocate, and the NA type will not be added to data_preallocated_.

Reviewer: Yes, seems this could be a DCHECK?

Reviewer: fixed_size_binary<0> can have bit_width == 0.

Author: Yes, fixed_size_binary<0> can be added to data_preallocated_ normally in ComputeDataPreallocate, and that call ensures every element of data_preallocated_ satisfies bit_width >= 0. So we just add a DCHECK here.
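To make the fixed_size_binary<0> point concrete, a small check, assuming the standard Arrow C++ factory and type APIs:

```cpp
#include <arrow/api.h>

#include <iostream>
#include <memory>

int main() {
  // fixed_size_binary(0) is a valid type whose values occupy zero bytes.
  auto ty = std::static_pointer_cast<arrow::FixedSizeBinaryType>(
      arrow::fixed_size_binary(0));
  // Reports 0, which still satisfies the new DCHECK_GE(bit_width, 0).
  std::cout << ty->ToString() << " bit width: " << ty->bit_width() << std::endl;
  return 0;
}
```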