-
Notifications
You must be signed in to change notification settings - Fork 4k
ARROW-14970: [C++] Make ExecNodes able to generate/consume tasks #11923
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -175,18 +175,21 @@ class ScalarAggregateNode : public ExecNode { | |
| return Status::OK(); | ||
| } | ||
|
|
||
| void InputReceived(ExecNode* input, ExecBatch batch) override { | ||
| void InputReceived(ExecNode* input, std::function<Result<ExecBatch>()> task) override { | ||
| DCHECK_EQ(input, inputs_[0]); | ||
|
|
||
| auto thread_index = get_thread_index_(); | ||
|
|
||
| if (ErrorIfNotOk(DoConsume(std::move(batch), thread_index))) return; | ||
| auto prev = task(); | ||
| if (!prev.ok()) { | ||
| ErrorIfNotOk(prev.status()); | ||
| return; | ||
| } | ||
| if (ErrorIfNotOk(DoConsume(prev.MoveValueUnsafe(), thread_index))) return; | ||
|
|
||
| if (input_counter_.Increment()) { | ||
| ErrorIfNotOk(Finish()); | ||
| } | ||
| } | ||
|
|
||
| void ErrorReceived(ExecNode* input, Status error) override { | ||
| DCHECK_EQ(input, inputs_[0]); | ||
| outputs_[0]->ErrorReceived(this, std::move(error)); | ||
|
|
@@ -235,17 +238,18 @@ class ScalarAggregateNode : public ExecNode { | |
|
|
||
| private: | ||
| Status Finish() { | ||
| ExecBatch batch{{}, 1}; | ||
| batch.values.resize(kernels_.size()); | ||
|
|
||
| for (size_t i = 0; i < kernels_.size(); ++i) { | ||
| KernelContext ctx{plan()->exec_context()}; | ||
| ARROW_ASSIGN_OR_RAISE(auto merged, ScalarAggregateKernel::MergeAll( | ||
| kernels_[i], &ctx, std::move(states_[i]))); | ||
| RETURN_NOT_OK(kernels_[i]->finalize(&ctx, &batch.values[i])); | ||
| } | ||
|
|
||
| outputs_[0]->InputReceived(this, std::move(batch)); | ||
| auto task = [this]() -> Result<ExecBatch> { | ||
| ExecBatch batch{{}, 1}; | ||
| batch.values.resize(kernels_.size()); | ||
| for (size_t i = 0; i < kernels_.size(); ++i) { | ||
| KernelContext ctx{plan()->exec_context()}; | ||
| ARROW_ASSIGN_OR_RAISE(auto merged, ScalarAggregateKernel::MergeAll( | ||
| kernels_[i], &ctx, std::move(states_[i]))); | ||
| RETURN_NOT_OK(kernels_[i]->finalize(&ctx, &batch.values[i])); | ||
| } | ||
| return batch; | ||
| }; | ||
| outputs_[0]->InputReceived(this, std::move(task)); | ||
|
||
| finished_.MarkFinished(); | ||
| return Status::OK(); | ||
| } | ||
|
|
@@ -452,8 +456,12 @@ class GroupByNode : public ExecNode { | |
| // bail if StopProducing was called | ||
| if (finished_.is_finished()) return; | ||
|
|
||
| int64_t batch_size = output_batch_size(); | ||
| outputs_[0]->InputReceived(this, out_data_.Slice(batch_size * n, batch_size)); | ||
| auto task = [n, this]() -> Result<ExecBatch> { | ||
| int64_t batch_size = output_batch_size(); | ||
| return out_data_.Slice(batch_size * n, batch_size); | ||
| }; | ||
|
|
||
| outputs_[0]->InputReceived(this, std::move(task)); | ||
|
|
||
| if (output_counter_.Increment()) { | ||
| finished_.MarkFinished(); | ||
|
|
@@ -483,13 +491,18 @@ class GroupByNode : public ExecNode { | |
| return Status::OK(); | ||
| } | ||
|
|
||
| void InputReceived(ExecNode* input, ExecBatch batch) override { | ||
| void InputReceived(ExecNode* input, std::function<Result<ExecBatch>()> task) override { | ||
| // bail if StopProducing was called | ||
| if (finished_.is_finished()) return; | ||
|
|
||
| DCHECK_EQ(input, inputs_[0]); | ||
|
|
||
| if (ErrorIfNotOk(Consume(std::move(batch)))) return; | ||
| auto prev = task(); | ||
| if (!prev.ok()) { | ||
| ErrorIfNotOk(prev.status()); | ||
| return; | ||
| } | ||
| if (ErrorIfNotOk(Consume(prev.MoveValueUnsafe()))) return; | ||
westonpace marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| if (input_counter_.Increment()) { | ||
| ErrorIfNotOk(OutputResult()); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -61,7 +61,10 @@ struct SourceNode : ExecNode { | |
| [[noreturn]] static void NoInputs() { | ||
| Unreachable("no inputs; this should never be called"); | ||
| } | ||
| [[noreturn]] void InputReceived(ExecNode*, ExecBatch) override { NoInputs(); } | ||
| [[noreturn]] void InputReceived(ExecNode*, | ||
| std::function<Result<ExecBatch>()>) override { | ||
| NoInputs(); | ||
| } | ||
| [[noreturn]] void ErrorReceived(ExecNode*, Status) override { NoInputs(); } | ||
| [[noreturn]] void InputFinished(ExecNode*, int) override { NoInputs(); } | ||
|
|
||
|
|
@@ -107,19 +110,19 @@ struct SourceNode : ExecNode { | |
| ExecBatch batch = std::move(*maybe_batch); | ||
|
|
||
| if (executor) { | ||
| auto status = | ||
| task_group_.AddTask([this, executor, batch]() -> Result<Future<>> { | ||
| return executor->Submit([=]() { | ||
| outputs_[0]->InputReceived(this, std::move(batch)); | ||
| return Status::OK(); | ||
| }); | ||
| }); | ||
| auto status = task_group_.AddTask([this, executor, | ||
| batch]() -> Result<Future<>> { | ||
| return executor->Submit([=]() { | ||
| outputs_[0]->InputReceived(this, IdentityTask(std::move(batch))); | ||
| return Status::OK(); | ||
| }); | ||
| }); | ||
| if (!status.ok()) { | ||
| outputs_[0]->ErrorReceived(this, std::move(status)); | ||
| return Break(total_batches); | ||
| } | ||
| } else { | ||
| outputs_[0]->InputReceived(this, std::move(batch)); | ||
| outputs_[0]->InputReceived(this, IdentityTask(std::move(batch))); | ||
|
||
| } | ||
| return Continue(); | ||
| }, | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is what I think pipeline breakers would look like.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, that is the idea, but this PR only enables that construction for later; it is not going to define any scheduler or submission logic.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we aren't going to address this now, let's make another JIRA (taskify 3?), something like "Fix logic in existing nodes so that pipeline breakers submit and non-breakers forward", and then add a comment in all of these spots along the lines of:
// This node should be forwarding the task downstream but that will be addressed in ARROW-XYZ