From cb43d75b9669f5ff835929f67dbe4c263e5baee3 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 4 Mar 2022 18:57:55 +0530 Subject: [PATCH 01/32] updating submodule --- testing | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testing b/testing index 53b49804710..634739c6644 160000 --- a/testing +++ b/testing @@ -1 +1 @@ -Subproject commit 53b498047109d9940fcfab388bd9d6aeb8c57425 +Subproject commit 634739c664433cec366b4b9a81d1e1044a8c5eda From b1b25b7c053260bfbe97c749959606208599630a Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Sat, 19 Mar 2022 19:14:39 +0530 Subject: [PATCH 02/32] temp commit to remove files in submodule --- .gitmodules | 3 --- testing | 1 - 2 files changed, 4 deletions(-) delete mode 160000 testing diff --git a/.gitmodules b/.gitmodules index 6efc4871542..71722b21777 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,3 @@ [submodule "cpp/submodules/parquet-testing"] path = cpp/submodules/parquet-testing url = https://github.com/apache/parquet-testing.git -[submodule "testing"] - path = testing - url = https://github.com/apache/arrow-testing diff --git a/testing b/testing deleted file mode 160000 index 634739c6644..00000000000 --- a/testing +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 634739c664433cec366b4b9a81d1e1044a8c5eda From 394fe82f704aaa44aa65b0224c7756b13d8d47e3 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Sat, 19 Mar 2022 19:15:36 +0530 Subject: [PATCH 03/32] adding submodule --- .gitmodules | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitmodules b/.gitmodules index 71722b21777..6efc4871542 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "cpp/submodules/parquet-testing"] path = cpp/submodules/parquet-testing url = https://github.com/apache/parquet-testing.git +[submodule "testing"] + path = testing + url = https://github.com/apache/arrow-testing From 3c93575589c5bb96d23bd7e4134ed5702a16b90f Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Sun, 20 Mar 2022 23:09:21 +0530 Subject: [PATCH 04/32] updating testing submodule --- testing | 1 + 1 file changed, 1 insertion(+) create mode 160000 testing diff --git a/testing b/testing new file mode 160000 index 00000000000..d315f798520 --- /dev/null +++ b/testing @@ -0,0 +1 @@ +Subproject commit d315f7985207d2d67fc2c8e41053e9d97d573f4b From b926c5546ab3b74e6a369740d28428e2dd7c7c64 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Sun, 20 Mar 2022 23:11:24 +0530 Subject: [PATCH 05/32] revert to upstream version --- testing | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testing b/testing index d315f798520..53b49804710 160000 --- a/testing +++ b/testing @@ -1 +1 @@ -Subproject commit d315f7985207d2d67fc2c8e41053e9d97d573f4b +Subproject commit 53b498047109d9940fcfab388bd9d6aeb8c57425 From f63db74a68fdc2f2ff439a059396124d146233e4 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 13 May 2022 18:37:24 +0530 Subject: [PATCH 06/32] initial commit for agg option refactor --- .../execution_plan_documentation_examples.cc | 9 +- cpp/src/arrow/compute/api_aggregate.h | 3 + cpp/src/arrow/compute/exec/aggregate_node.cc | 9 +- cpp/src/arrow/compute/exec/ir_consumer.cc | 5 +- cpp/src/arrow/compute/exec/ir_test.cc | 10 +- cpp/src/arrow/compute/exec/options.h | 6 +- cpp/src/arrow/compute/exec/plan_test.cc | 99 ++-- cpp/src/arrow/compute/exec/test_util.cc | 12 +- cpp/src/arrow/compute/exec/tpch_benchmark.cc | 17 +- .../compute/kernels/hash_aggregate_test.cc | 495 +++++++++--------- cpp/src/arrow/dataset/scanner.cc | 3 +-
cpp/src/arrow/dataset/scanner_test.cc | 21 +- 12 files changed, 325 insertions(+), 364 deletions(-) diff --git a/cpp/examples/arrow/execution_plan_documentation_examples.cc b/cpp/examples/arrow/execution_plan_documentation_examples.cc index 5f80119bbba..3063431b5f9 100644 --- a/cpp/examples/arrow/execution_plan_documentation_examples.cc +++ b/cpp/examples/arrow/execution_plan_documentation_examples.cc @@ -502,9 +502,9 @@ arrow::Status SourceScalarAggregateSinkExample(cp::ExecContext& exec_context) { ARROW_ASSIGN_OR_RAISE(cp::ExecNode * source, cp::MakeExecNode("source", plan.get(), {}, source_node_options)); - auto aggregate_options = cp::AggregateNodeOptions{/*aggregates=*/{{"sum", nullptr}}, - /*targets=*/{"a"}, - /*names=*/{"sum(a)"}}; + auto aggregate_options = + cp::AggregateNodeOptions{/*aggregates=*/{{"sum", nullptr, "a"}}, + /*names=*/{"sum(a)"}}; ARROW_ASSIGN_OR_RAISE( cp::ExecNode * aggregate, cp::MakeExecNode("aggregate", plan.get(), {source}, std::move(aggregate_options))); @@ -541,8 +541,7 @@ arrow::Status SourceGroupAggregateSinkExample(cp::ExecContext& exec_context) { cp::MakeExecNode("source", plan.get(), {}, source_node_options)); auto options = std::make_shared(cp::CountOptions::ONLY_VALID); auto aggregate_options = - cp::AggregateNodeOptions{/*aggregates=*/{{"hash_count", options}}, - /*targets=*/{"a"}, + cp::AggregateNodeOptions{/*aggregates=*/{{"hash_count", &options, "a"}}, /*names=*/{"count(a)"}, /*keys=*/{"b"}}; ARROW_ASSIGN_OR_RAISE( diff --git a/cpp/src/arrow/compute/api_aggregate.h b/cpp/src/arrow/compute/api_aggregate.h index becd5a74149..ff994178fc1 100644 --- a/cpp/src/arrow/compute/api_aggregate.h +++ b/cpp/src/arrow/compute/api_aggregate.h @@ -402,6 +402,9 @@ struct ARROW_EXPORT Aggregate { /// options for the aggregation function std::shared_ptr options; + + // fields to which aggregations will be applied + FieldRef target; }; } // namespace internal diff --git a/cpp/src/arrow/compute/exec/aggregate_node.cc b/cpp/src/arrow/compute/exec/aggregate_node.cc index c5c5d3efcfc..ace6d9aa0ee 100644 --- a/cpp/src/arrow/compute/exec/aggregate_node.cc +++ b/cpp/src/arrow/compute/exec/aggregate_node.cc @@ -93,8 +93,9 @@ class ScalarAggregateNode : public ExecNode { std::vector target_field_ids(kernels.size()); for (size_t i = 0; i < kernels.size(); ++i) { - ARROW_ASSIGN_OR_RAISE(auto match, - FieldRef(aggregate_options.targets[i]).FindOne(input_schema)); + ARROW_ASSIGN_OR_RAISE( + auto match, + FieldRef(aggregate_options.aggregates[i].target).FindOne(input_schema)); target_field_ids[i] = match[0]; ARROW_ASSIGN_OR_RAISE( @@ -310,13 +311,11 @@ class GroupByNode : public ExecNode { // Find input field indices for aggregates std::vector agg_src_field_ids(aggs.size()); for (size_t i = 0; i < aggs.size(); ++i) { - ARROW_ASSIGN_OR_RAISE(auto match, - aggregate_options.targets[i].FindOne(*input_schema)); + ARROW_ASSIGN_OR_RAISE(auto match, aggs[i].target.FindOne(*input_schema)); agg_src_field_ids[i] = match[0]; } // Build vector of aggregate source field data types - DCHECK_EQ(aggregate_options.targets.size(), aggs.size()); std::vector agg_src_descrs(aggs.size()); for (size_t i = 0; i < aggs.size(); ++i) { auto agg_src_field_id = agg_src_field_ids[i]; diff --git a/cpp/src/arrow/compute/exec/ir_consumer.cc b/cpp/src/arrow/compute/exec/ir_consumer.cc index 0aafa2c2819..4f539c91fca 100644 --- a/cpp/src/arrow/compute/exec/ir_consumer.cc +++ b/cpp/src/arrow/compute/exec/ir_consumer.cc @@ -531,7 +531,7 @@ Result Convert(const ir::Relation& rel) { ARROW_ASSIGN_OR_RAISE(auto arg, 
Convert(*aggregate->rel()).As()); - AggregateNodeOptions opts{{}, {}, {}}; + AggregateNodeOptions opts{{}, {}}; if (!aggregate->measures()) return UnexpectedNullField("Aggregate.measures"); for (const ir::Expression* m : *aggregate->measures()) { @@ -550,8 +550,7 @@ Result Convert(const ir::Relation& rel) { "Support for non-FieldRef arguments to Aggregate.measures"); } - opts.aggregates.push_back({call->function_name, nullptr}); - opts.targets.push_back(*target); + opts.aggregates.push_back({call->function_name, nullptr, *target}); opts.names.push_back(call->function_name + " " + target->ToString()); } diff --git a/cpp/src/arrow/compute/exec/ir_test.cc b/cpp/src/arrow/compute/exec/ir_test.cc index 847f555c69a..9f3e3098cc0 100644 --- a/cpp/src/arrow/compute/exec/ir_test.cc +++ b/cpp/src/arrow/compute/exec/ir_test.cc @@ -357,10 +357,9 @@ TEST(Relation, AggregateSimple) { "0"}, {"aggregate", AggregateNodeOptions{/*aggregates=*/{ - {"sum", nullptr}, - {"mean", nullptr}, + {"sum", nullptr, 1}, + {"mean", nullptr, 2}, }, - /*targets=*/{1, 2}, /*names=*/ { "sum FieldRef.FieldPath(1)", @@ -564,10 +563,9 @@ TEST(Relation, AggregateWithHaving) { {"filter", FilterNodeOptions{less(field_ref(0), literal(3))}, "1"}, {"aggregate", AggregateNodeOptions{/*aggregates=*/{ - {"sum", nullptr}, - {"mean", nullptr}, + {"sum", nullptr, 1}, + {"mean", nullptr, 2}, }, - /*targets=*/{1, 2}, /*names=*/ { "sum FieldRef.FieldPath(1)", diff --git a/cpp/src/arrow/compute/exec/options.h b/cpp/src/arrow/compute/exec/options.h index 4691ad65a96..7132bd64784 100644 --- a/cpp/src/arrow/compute/exec/options.h +++ b/cpp/src/arrow/compute/exec/options.h @@ -112,17 +112,13 @@ class ARROW_EXPORT ProjectNodeOptions : public ExecNodeOptions { class ARROW_EXPORT AggregateNodeOptions : public ExecNodeOptions { public: AggregateNodeOptions(std::vector aggregates, - std::vector targets, std::vector names, - std::vector keys = {}) + std::vector names, std::vector keys = {}) : aggregates(std::move(aggregates)), - targets(std::move(targets)), names(std::move(names)), keys(std::move(keys)) {} // aggregations which will be applied to the targetted fields std::vector aggregates; - // fields to which aggregations will be applied - std::vector targets; // output field names for aggregations std::vector names; // keys by which aggregations will be grouped diff --git a/cpp/src/arrow/compute/exec/plan_test.cc b/cpp/src/arrow/compute/exec/plan_test.cc index 2df3c5e915e..4d1905fc8e7 100644 --- a/cpp/src/arrow/compute/exec/plan_test.cc +++ b/cpp/src/arrow/compute/exec/plan_test.cc @@ -391,8 +391,8 @@ TEST(ExecPlan, ToString) { }}}, {"aggregate", AggregateNodeOptions{ - /*aggregates=*/{{"hash_sum", nullptr}, {"hash_count", options}}, - /*targets=*/{"multiply(i32, 2)", "multiply(i32, 2)"}, + /*aggregates=*/{{"hash_sum", nullptr, "multiply(i32, 2)"}, + {"hash_count", options, "multiply(i32, 2)"}}, /*names=*/{"sum(multiply(i32, 2))", "count(multiply(i32, 2))"}, /*keys=*/{"bool"}}}, {"filter", FilterNodeOptions{greater(field_ref("sum(multiply(i32, 2))"), @@ -433,8 +433,7 @@ custom_sink_label:OrderBySinkNode{by={sort_keys=[FieldRef.Name(sum(multiply(i32, { union_node, {"aggregate", - AggregateNodeOptions{/*aggregates=*/{{"count", std::move(options)}}, - /*targets=*/{"i32"}, + AggregateNodeOptions{/*aggregates=*/{{"count", options, "i32"}}, /*names=*/{"count(i32)"}, /*keys=*/{}}}, {"sink", SinkNodeOptions{&sink_gen}}, @@ -770,8 +769,8 @@ TEST(ExecPlanExecution, StressSourceGroupedSumStop) { {"source", SourceNodeOptions{random_data.schema, 
random_data.gen(parallel, slow)}}, {"aggregate", - AggregateNodeOptions{/*aggregates=*/{{"hash_sum", nullptr}}, - /*targets=*/{"a"}, /*names=*/{"sum(a)"}, + AggregateNodeOptions{/*aggregates=*/{{"hash_sum", nullptr, "a"}}, + /*names=*/{"sum(a)"}, /*keys=*/{"b"}}}, {"sink", SinkNodeOptions{&sink_gen}}, }) @@ -918,8 +917,8 @@ TEST(ExecPlanExecution, SourceGroupedSum) { {"source", SourceNodeOptions{input.schema, input.gen(parallel, /*slow=*/false)}}, {"aggregate", - AggregateNodeOptions{/*aggregates=*/{{"hash_sum", nullptr}}, - /*targets=*/{"i32"}, /*names=*/{"sum(i32)"}, + AggregateNodeOptions{/*aggregates=*/{{"hash_sum", nullptr, "i32"}}, + /*names=*/{"sum(i32)"}, /*keys=*/{"str"}}}, {"sink", SinkNodeOptions{&sink_gen}}, }) @@ -987,10 +986,10 @@ TEST(ExecPlanExecution, NestedSourceProjectGroupedSum) { field_ref(FieldRef("struct", "bool")), }, {"i32", "bool"}}}, - {"aggregate", AggregateNodeOptions{/*aggregates=*/{{"hash_sum", nullptr}}, - /*targets=*/{"i32"}, - /*names=*/{"sum(i32)"}, - /*keys=*/{"bool"}}}, + {"aggregate", + AggregateNodeOptions{/*aggregates=*/{{"hash_sum", nullptr, "i32"}}, + /*names=*/{"sum(i32)"}, + /*keys=*/{"bool"}}}, {"sink", SinkNodeOptions{&sink_gen}}, }) .AddToPlan(plan.get())); @@ -1021,8 +1020,8 @@ TEST(ExecPlanExecution, SourceFilterProjectGroupedSumFilter) { field_ref("str"), call("multiply", {field_ref("i32"), literal(2)}), }}}, - {"aggregate", AggregateNodeOptions{/*aggregates=*/{{"hash_sum", nullptr}}, - /*targets=*/{"multiply(i32, 2)"}, + {"aggregate", AggregateNodeOptions{/*aggregates=*/{{"hash_sum", nullptr, + "multiply(i32, 2)"}}, /*names=*/{"sum(multiply(i32, 2))"}, /*keys=*/{"str"}}}, {"filter", FilterNodeOptions{greater(field_ref("sum(multiply(i32, 2))"), @@ -1060,8 +1059,8 @@ TEST(ExecPlanExecution, SourceFilterProjectGroupedSumOrderBy) { field_ref("str"), call("multiply", {field_ref("i32"), literal(2)}), }}}, - {"aggregate", AggregateNodeOptions{/*aggregates=*/{{"hash_sum", nullptr}}, - /*targets=*/{"multiply(i32, 2)"}, + {"aggregate", AggregateNodeOptions{/*aggregates=*/{{"hash_sum", nullptr, + "multiply(i32, 2)"}}, /*names=*/{"sum(multiply(i32, 2))"}, /*keys=*/{"str"}}}, {"filter", FilterNodeOptions{greater(field_ref("sum(multiply(i32, 2))"), @@ -1097,8 +1096,8 @@ TEST(ExecPlanExecution, SourceFilterProjectGroupedSumTopK) { field_ref("str"), call("multiply", {field_ref("i32"), literal(2)}), }}}, - {"aggregate", AggregateNodeOptions{/*aggregates=*/{{"hash_sum", nullptr}}, - /*targets=*/{"multiply(i32, 2)"}, + {"aggregate", AggregateNodeOptions{/*aggregates=*/{{"hash_sum", nullptr, + "multiply(i32, 2)"}}, /*names=*/{"sum(multiply(i32, 2))"}, /*keys=*/{"str"}}}, {"select_k_sink", SelectKSinkNodeOptions{options, &sink_gen}}, @@ -1123,10 +1122,10 @@ TEST(ExecPlanExecution, SourceScalarAggSink) { {"source", SourceNodeOptions{basic_data.schema, basic_data.gen(/*parallel=*/false, /*slow=*/false)}}, - {"aggregate", AggregateNodeOptions{ - /*aggregates=*/{{"sum", nullptr}, {"any", nullptr}}, - /*targets=*/{"i32", "bool"}, - /*names=*/{"sum(i32)", "any(bool)"}}}, + {"aggregate", + AggregateNodeOptions{/*aggregates=*/{{"sum", nullptr, "i32"}, + {"any", nullptr, "bool"}}, + /*names=*/{"sum(i32)", "any(bool)"}}}, {"sink", SinkNodeOptions{&sink_gen}}, }) .AddToPlan(plan.get())); @@ -1150,18 +1149,18 @@ TEST(ExecPlanExecution, AggregationPreservesOptions) { { auto options = std::make_shared(TDigestOptions::Defaults()); - ASSERT_OK(Declaration::Sequence( - { - {"source", SourceNodeOptions{basic_data.schema, - basic_data.gen(/*parallel=*/false, - /*slow=*/false)}}, 
- {"aggregate", - AggregateNodeOptions{/*aggregates=*/{{"tdigest", options}}, - /*targets=*/{"i32"}, - /*names=*/{"tdigest(i32)"}}}, - {"sink", SinkNodeOptions{&sink_gen}}, - }) - .AddToPlan(plan.get())); + ASSERT_OK( + Declaration::Sequence( + { + {"source", + SourceNodeOptions{basic_data.schema, basic_data.gen(/*parallel=*/false, + /*slow=*/false)}}, + {"aggregate", AggregateNodeOptions{ + /*aggregates=*/{{"tdigest", options, "i32"}}, + /*names=*/{"tdigest(i32)"}}}, + {"sink", SinkNodeOptions{&sink_gen}}, + }) + .AddToPlan(plan.get())); } ASSERT_THAT(StartAndCollect(plan.get(), sink_gen), @@ -1182,11 +1181,10 @@ TEST(ExecPlanExecution, AggregationPreservesOptions) { { {"source", SourceNodeOptions{data.schema, data.gen(/*parallel=*/false, /*slow=*/false)}}, - {"aggregate", - AggregateNodeOptions{/*aggregates=*/{{"hash_count", options}}, - /*targets=*/{"i32"}, - /*names=*/{"count(i32)"}, - /*keys=*/{"str"}}}, + {"aggregate", AggregateNodeOptions{ + /*aggregates=*/{{"hash_count", options, "i32"}}, + /*names=*/{"count(i32)"}, + /*keys=*/{"str"}}}, {"sink", SinkNodeOptions{&sink_gen}}, }) .AddToPlan(plan.get())); @@ -1222,16 +1220,15 @@ TEST(ExecPlanExecution, ScalarSourceScalarAggSink) { SourceNodeOptions{scalar_data.schema, scalar_data.gen(/*parallel=*/false, /*slow=*/false)}}, {"aggregate", AggregateNodeOptions{ - /*aggregates=*/{{"all", nullptr}, - {"any", nullptr}, - {"count", nullptr}, - {"mean", nullptr}, - {"product", nullptr}, - {"stddev", nullptr}, - {"sum", nullptr}, - {"tdigest", nullptr}, - {"variance", nullptr}}, - /*targets=*/{"b", "b", "a", "a", "a", "a", "a", "a", "a"}, + /*aggregates=*/{{"all", nullptr, "b"}, + {"any", nullptr, "b"}, + {"count", nullptr, "a"}, + {"mean", nullptr, "a"}, + {"product", nullptr, "a"}, + {"stddev", nullptr, "a"}, + {"sum", nullptr, "a"}, + {"tdigest", nullptr, "a"}, + {"variance", nullptr, "a"}}, /*names=*/ {"all(b)", "any(b)", "count(a)", "mean(a)", "product(a)", "stddev(a)", "sum(a)", "tdigest(a)", "variance(a)"}}}, @@ -1273,8 +1270,8 @@ TEST(ExecPlanExecution, ScalarSourceGroupedSum) { scalar_data.gen(/*parallel=*/false, /*slow=*/false)}}, {"aggregate", - AggregateNodeOptions{/*aggregates=*/{{"hash_sum", nullptr}}, - /*targets=*/{"a"}, /*names=*/{"hash_sum(a)"}, + AggregateNodeOptions{/*aggregates=*/{{"hash_sum", nullptr, "a"}}, + /*names=*/{"hash_sum(a)"}, /*keys=*/{"b"}}}, {"order_by_sink", OrderBySinkNodeOptions{options, &sink_gen}}, }) diff --git a/cpp/src/arrow/compute/exec/test_util.cc b/cpp/src/arrow/compute/exec/test_util.cc index 3f5d094774c..e336b245bb7 100644 --- a/cpp/src/arrow/compute/exec/test_util.cc +++ b/cpp/src/arrow/compute/exec/test_util.cc @@ -313,10 +313,11 @@ bool operator==(const Declaration& l, const Declaration& r) { if (l_agg->options == nullptr || r_agg->options == nullptr) return false; if (!l_agg->options->Equals(*r_agg->options)) return false; + + if (l_agg->target != r_agg->target) return false; } - return l_opts->targets == r_opts->targets && l_opts->names == r_opts->names && - l_opts->keys == r_opts->keys; + return l_opts->names == r_opts->names && l_opts->keys == r_opts->keys; } if (l.factory_name == "order_by_sink") { @@ -378,16 +379,11 @@ static inline void PrintToImpl(const std::string& factory_name, for (const auto& agg : o->aggregates) { *os << agg.function << "<"; if (agg.options) PrintTo(*agg.options, os); + *os << agg.target.ToString() << "<"; *os << ">,"; } *os << "},"; - *os << "targets={"; - for (const auto& target : o->targets) { - *os << target.ToString() << ","; - } - *os << "},"; - *os << 
"names={"; for (const auto& name : o->names) { *os << name << ","; diff --git a/cpp/src/arrow/compute/exec/tpch_benchmark.cc b/cpp/src/arrow/compute/exec/tpch_benchmark.cc index 82584f58e93..f348eaf3eb1 100644 --- a/cpp/src/arrow/compute/exec/tpch_benchmark.cc +++ b/cpp/src/arrow/compute/exec/tpch_benchmark.cc @@ -78,20 +78,21 @@ std::shared_ptr Plan_Q1(AsyncGenerator>* sin std::make_shared(ScalarAggregateOptions::Defaults()); auto count_opts = std::make_shared(CountOptions::CountMode::ALL); std::vector aggs = { - {"hash_sum", sum_opts}, {"hash_sum", sum_opts}, {"hash_sum", sum_opts}, - {"hash_sum", sum_opts}, {"hash_mean", sum_opts}, {"hash_mean", sum_opts}, - {"hash_mean", sum_opts}, {"hash_count", count_opts}}; - - std::vector to_aggregate = {"sum_qty", "sum_base_price", "sum_disc_price", - "sum_charge", "avg_qty", "avg_price", - "avg_disc", "sum_qty"}; + {"hash_sum", sum_opts, "sum_qty"}, + {"hash_sum", sum_opts, "sum_base_price"}, + {"hash_sum", sum_opts, "sum_disc_price"}, + {"hash_sum", sum_opts, "sum_charge"}, + {"hash_mean", sum_opts, "avg_qty"}, + {"hash_mean", sum_opts, "avg_price"}, + {"hash_mean", sum_opts, "avg_disc"}, + {"hash_count", count_opts, "sum_qty"}}; std::vector names = {"sum_qty", "sum_base_price", "sum_disc_price", "sum_charge", "avg_qty", "avg_price", "avg_disc", "count_order"}; std::vector keys = {"l_returnflag", "l_linestatus"}; - AggregateNodeOptions agg_opts(aggs, to_aggregate, names, keys); + AggregateNodeOptions agg_opts(aggs, names, keys); SortKey l_returnflag_key("l_returnflag"); SortKey l_linestatus_key("l_linestatus"); diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc b/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc index 7b47845f234..51c7aea1f37 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc @@ -123,15 +123,12 @@ Result NaiveGroupBy(std::vector arguments, std::vector keys Result GroupByUsingExecPlan(const BatchesWithSchema& input, const std::vector& key_names, - const std::vector& arg_names, const std::vector& aggregates, bool use_threads, ExecContext* ctx) { std::vector keys(key_names.size()); - std::vector targets(aggregates.size()); std::vector names(aggregates.size()); for (size_t i = 0; i < aggregates.size(); ++i) { names[i] = aggregates[i].function; - targets[i] = FieldRef(arg_names[i]); } for (size_t i = 0; i < key_names.size(); ++i) { keys[i] = FieldRef(key_names[i]); @@ -144,9 +141,8 @@ Result GroupByUsingExecPlan(const BatchesWithSchema& input, { {"source", SourceNodeOptions{input.schema, input.gen(use_threads, /*slow=*/false)}}, - {"aggregate", - AggregateNodeOptions{std::move(aggregates), std::move(targets), - std::move(names), std::move(keys)}}, + {"aggregate", AggregateNodeOptions{std::move(aggregates), std::move(names), + std::move(keys)}}, {"sink", SinkNodeOptions{&sink_gen}}, }) .AddToPlan(plan.get())); @@ -197,11 +193,9 @@ Result GroupByUsingExecPlan(const std::vector& arguments, FieldVector scan_fields(arguments.size() + keys.size()); std::vector key_names(keys.size()); - std::vector arg_names(arguments.size()); for (size_t i = 0; i < arguments.size(); ++i) { auto name = std::string("agg_") + std::to_string(i); scan_fields[i] = field(name, arguments[i].type()); - arg_names[i] = std::move(name); } for (size_t i = 0; i < keys.size(); ++i) { auto name = std::string("key_") + std::to_string(i); @@ -223,7 +217,7 @@ Result GroupByUsingExecPlan(const std::vector& arguments, input.batches.push_back(std::move(batch)); } - return 
GroupByUsingExecPlan(input, key_names, arg_names, aggregates, use_threads, ctx); + return GroupByUsingExecPlan(input, key_names, aggregates, use_threads, ctx); } void ValidateGroupBy(const std::vector& aggregates, @@ -756,7 +750,7 @@ TEST(GroupBy, NoBatches) { Datum aggregated_and_grouped, GroupByTest({table->GetColumnByName("argument")}, {table->GetColumnByName("key")}, { - {"hash_count", nullptr}, + {"hash_count", nullptr, {"agg_0"}}, }, /*use_threads=*/true, /*use_exec_plan=*/true)); AssertDatumsEqual(ArrayFromJSON(struct_({ @@ -821,7 +815,7 @@ TEST(GroupBy, CountOnly) { GroupByTest({table->GetColumnByName("argument")}, {table->GetColumnByName("key")}, { - {"hash_count", nullptr}, + {"hash_count", nullptr, "agg_0"}, }, use_threads, use_exec_plan)); SortBy({"key_0"}, &aggregated_and_grouped); @@ -858,15 +852,14 @@ TEST(GroupBy, CountScalar) { auto count_all = std::make_shared(CountOptions::ALL); for (bool use_threads : {true, false}) { SCOPED_TRACE(use_threads ? "parallel/merged" : "serial"); - ASSERT_OK_AND_ASSIGN( - Datum actual, - GroupByUsingExecPlan(input, {"key"}, {"argument", "argument", "argument"}, - { - {"hash_count", skip_nulls}, - {"hash_count", keep_nulls}, - {"hash_count", count_all}, - }, - use_threads, default_exec_context())); + ASSERT_OK_AND_ASSIGN(Datum actual, + GroupByUsingExecPlan(input, {"key"}, + { + {"hash_count", skip_nulls, "argument"}, + {"hash_count", keep_nulls, "argument"}, + {"hash_count", count_all, "argument"}, + }, + use_threads, default_exec_context())); Datum expected = ArrayFromJSON(struct_({ field("hash_count", int64()), field("hash_count", int64()), @@ -909,7 +902,7 @@ TEST(GroupBy, SumOnly) { GroupByTest({table->GetColumnByName("argument")}, {table->GetColumnByName("key")}, { - {"hash_sum", nullptr}, + {"hash_sum", nullptr, "agg_0"}, }, use_threads, use_exec_plan)); SortBy({"key_0"}, &aggregated_and_grouped); @@ -974,12 +967,12 @@ TEST(GroupBy, SumMeanProductDecimal) { }, {table->GetColumnByName("key")}, { - {"hash_sum", nullptr}, - {"hash_sum", nullptr}, - {"hash_mean", nullptr}, - {"hash_mean", nullptr}, - {"hash_product", nullptr}, - {"hash_product", nullptr}, + {"hash_sum", nullptr, "agg_0"}, + {"hash_sum", nullptr, "agg_1"}, + {"hash_mean", nullptr, "agg_2"}, + {"hash_mean", nullptr, "agg_3"}, + {"hash_product", nullptr, "agg_4"}, + {"hash_product", nullptr, "agg_5"}, }, use_threads, use_exec_plan)); SortBy({"key_0"}, &aggregated_and_grouped); @@ -1035,8 +1028,8 @@ TEST(GroupBy, MeanOnly) { table->GetColumnByName("argument")}, {table->GetColumnByName("key")}, { - {"hash_mean", nullptr}, - {"hash_mean", min_count}, + {"hash_mean", nullptr, "agg_0"}, + {"hash_mean", min_count, "agg_1"}, }, use_threads)); SortBy({"key_0"}, &aggregated_and_grouped); @@ -1070,15 +1063,14 @@ TEST(GroupBy, SumMeanProductScalar) { for (bool use_threads : {true, false}) { SCOPED_TRACE(use_threads ? 
"parallel/merged" : "serial"); - ASSERT_OK_AND_ASSIGN( - Datum actual, - GroupByUsingExecPlan(input, {"key"}, {"argument", "argument", "argument"}, - { - {"hash_sum", nullptr}, - {"hash_mean", nullptr}, - {"hash_product", nullptr}, - }, - use_threads, default_exec_context())); + ASSERT_OK_AND_ASSIGN(Datum actual, + GroupByUsingExecPlan(input, {"key"}, + { + {"hash_sum", nullptr, "argument"}, + {"hash_mean", nullptr, "argument"}, + {"hash_product", nullptr, "argument"}, + }, + use_threads, default_exec_context())); Datum expected = ArrayFromJSON(struct_({ field("hash_sum", int64()), field("hash_mean", float64()), @@ -1119,8 +1111,8 @@ TEST(GroupBy, VarianceAndStddev) { batch->GetColumnByName("key"), }, { - {"hash_variance", nullptr}, - {"hash_stddev", nullptr}, + {"hash_variance", nullptr, "agg_0"}, + {"hash_stddev", nullptr, "agg_1"}, })); AssertDatumsApproxEqual(ArrayFromJSON(struct_({ @@ -1151,18 +1143,19 @@ TEST(GroupBy, VarianceAndStddev) { [null, 3] ])"); - ASSERT_OK_AND_ASSIGN(aggregated_and_grouped, internal::GroupBy( - { - batch->GetColumnByName("argument"), - batch->GetColumnByName("argument"), - }, - { - batch->GetColumnByName("key"), - }, - { - {"hash_variance", nullptr}, - {"hash_stddev", nullptr}, - })); + ASSERT_OK_AND_ASSIGN(aggregated_and_grouped, + internal::GroupBy( + { + batch->GetColumnByName("argument"), + batch->GetColumnByName("argument"), + }, + { + batch->GetColumnByName("key"), + }, + { + {"hash_variance", nullptr, "agg_0"}, + {"hash_stddev", nullptr, "agg_1"}, + })); AssertDatumsApproxEqual(ArrayFromJSON(struct_({ field("hash_variance", float64()), @@ -1190,8 +1183,8 @@ TEST(GroupBy, VarianceAndStddev) { batch->GetColumnByName("key"), }, { - {"hash_variance", variance_options}, - {"hash_stddev", variance_options}, + {"hash_variance", variance_options, "agg_0"}, + {"hash_stddev", variance_options, "agg_1"}, })); AssertDatumsApproxEqual(ArrayFromJSON(struct_({ @@ -1236,10 +1229,10 @@ TEST(GroupBy, VarianceAndStddevDecimal) { batch->GetColumnByName("key"), }, { - {"hash_variance", nullptr}, - {"hash_stddev", nullptr}, - {"hash_variance", nullptr}, - {"hash_stddev", nullptr}, + {"hash_variance", nullptr, "agg_0"}, + {"hash_stddev", nullptr, "agg_1"}, + {"hash_variance", nullptr, "agg_2"}, + {"hash_stddev", nullptr, "agg_3"}, })); AssertDatumsApproxEqual(ArrayFromJSON(struct_({ @@ -1304,12 +1297,12 @@ TEST(GroupBy, TDigest) { batch->GetColumnByName("key"), }, { - {"hash_tdigest", nullptr}, - {"hash_tdigest", options1}, - {"hash_tdigest", options2}, - {"hash_tdigest", keep_nulls}, - {"hash_tdigest", min_count}, - {"hash_tdigest", keep_nulls_min_count}, + {"hash_tdigest", nullptr, "agg_0"}, + {"hash_tdigest", options1, "agg_1"}, + {"hash_tdigest", options2, "agg_2"}, + {"hash_tdigest", keep_nulls, "agg_3"}, + {"hash_tdigest", min_count, "agg_4"}, + {"hash_tdigest", keep_nulls_min_count, "agg_5"}, })); AssertDatumsApproxEqual( @@ -1356,8 +1349,8 @@ TEST(GroupBy, TDigestDecimal) { }, {batch->GetColumnByName("key")}, { - {"hash_tdigest", nullptr}, - {"hash_tdigest", nullptr}, + {"hash_tdigest", nullptr, "agg_0"}, + {"hash_tdigest", nullptr, "agg_1"}, })); AssertDatumsApproxEqual( @@ -1402,23 +1395,24 @@ TEST(GroupBy, ApproximateMedian) { /*skip_nulls=*/true, /*min_count=*/3); auto keep_nulls_min_count = std::make_shared( /*skip_nulls=*/false, /*min_count=*/3); - ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped, - internal::GroupBy( - { - batch->GetColumnByName("argument"), - batch->GetColumnByName("argument"), - batch->GetColumnByName("argument"), - 
batch->GetColumnByName("argument"), - }, - { - batch->GetColumnByName("key"), - }, - { - {"hash_approximate_median", options}, - {"hash_approximate_median", keep_nulls}, - {"hash_approximate_median", min_count}, - {"hash_approximate_median", keep_nulls_min_count}, - })); + ASSERT_OK_AND_ASSIGN( + Datum aggregated_and_grouped, + internal::GroupBy( + { + batch->GetColumnByName("argument"), + batch->GetColumnByName("argument"), + batch->GetColumnByName("argument"), + batch->GetColumnByName("argument"), + }, + { + batch->GetColumnByName("key"), + }, + { + {"hash_approximate_median", options, "agg_0"}, + {"hash_approximate_median", keep_nulls, "agg_1"}, + {"hash_approximate_median", min_count, "agg_2"}, + {"hash_approximate_median", keep_nulls_min_count, "agg_3"}, + })); AssertDatumsApproxEqual(ArrayFromJSON(struct_({ field("hash_approximate_median", float64()), @@ -1458,15 +1452,13 @@ TEST(GroupBy, StddevVarianceTDigestScalar) { SCOPED_TRACE(use_threads ? "parallel/merged" : "serial"); ASSERT_OK_AND_ASSIGN(Datum actual, GroupByUsingExecPlan(input, {"key"}, - {"argument", "argument", "argument", - "argument1", "argument1", "argument1"}, { - {"hash_stddev", nullptr}, - {"hash_variance", nullptr}, - {"hash_tdigest", nullptr}, - {"hash_stddev", nullptr}, - {"hash_variance", nullptr}, - {"hash_tdigest", nullptr}, + {"hash_stddev", nullptr, "argument"}, + {"hash_variance", nullptr, "argument"}, + {"hash_tdigest", nullptr, "argument"}, + {"hash_stddev", nullptr, "argument1"}, + {"hash_variance", nullptr, "argument1"}, + {"hash_tdigest", nullptr, "argument1"}, }, use_threads, default_exec_context())); Datum expected = @@ -1516,25 +1508,18 @@ TEST(GroupBy, VarianceOptions) { for (bool use_threads : {false}) { SCOPED_TRACE(use_threads ? "parallel/merged" : "serial"); - ASSERT_OK_AND_ASSIGN(Datum actual, - GroupByUsingExecPlan(input, {"key"}, - { - "argument", - "argument", - "argument", - "argument", - "argument", - "argument", - }, - { - {"hash_stddev", keep_nulls}, - {"hash_stddev", min_count}, - {"hash_stddev", keep_nulls_min_count}, - {"hash_variance", keep_nulls}, - {"hash_variance", min_count}, - {"hash_variance", keep_nulls_min_count}, - }, - use_threads, default_exec_context())); + ASSERT_OK_AND_ASSIGN( + Datum actual, + GroupByUsingExecPlan(input, {"key"}, + { + {"hash_stddev", keep_nulls, "argument"}, + {"hash_stddev", min_count, "argument"}, + {"hash_stddev", keep_nulls_min_count, "argument"}, + {"hash_variance", keep_nulls, "argument"}, + {"hash_variance", min_count, "argument"}, + {"hash_variance", keep_nulls_min_count, "argument"}, + }, + use_threads, default_exec_context())); Datum expected = ArrayFromJSON(struct_({ field("hash_stddev", float64()), field("hash_stddev", float64()), @@ -1553,25 +1538,18 @@ TEST(GroupBy, VarianceOptions) { ValidateOutput(expected); AssertDatumsApproxEqual(expected, actual, /*verbose=*/true); - ASSERT_OK_AND_ASSIGN(actual, - GroupByUsingExecPlan(input, {"key"}, - { - "argument1", - "argument1", - "argument1", - "argument1", - "argument1", - "argument1", - }, - { - {"hash_stddev", keep_nulls}, - {"hash_stddev", min_count}, - {"hash_stddev", keep_nulls_min_count}, - {"hash_variance", keep_nulls}, - {"hash_variance", min_count}, - {"hash_variance", keep_nulls_min_count}, - }, - use_threads, default_exec_context())); + ASSERT_OK_AND_ASSIGN( + actual, + GroupByUsingExecPlan(input, {"key"}, + { + {"hash_stddev", keep_nulls, "argument1"}, + {"hash_stddev", min_count, "argument1"}, + {"hash_stddev", keep_nulls_min_count, "argument1"}, + {"hash_variance", keep_nulls, 
"argument1"}, + {"hash_variance", min_count, "argument1"}, + {"hash_variance", keep_nulls_min_count, "argument1"}, + }, + use_threads, default_exec_context())); expected = ArrayFromJSON(struct_({ field("hash_stddev", float64()), field("hash_stddev", float64()), @@ -1629,9 +1607,9 @@ TEST(GroupBy, MinMaxOnly) { }, {table->GetColumnByName("key")}, { - {"hash_min_max", nullptr}, - {"hash_min_max", nullptr}, - {"hash_min_max", nullptr}, + {"hash_min_max", nullptr, "agg_0"}, + {"hash_min_max", nullptr, "agg_1"}, + {"hash_min_max", nullptr, "agg_2"}, }, use_threads, use_exec_plan)); ValidateOutput(aggregated_and_grouped); @@ -1736,7 +1714,7 @@ TEST(GroupBy, MinMaxTypes) { ASSERT_OK_AND_ASSIGN( Datum aggregated_and_grouped, GroupByTest({table->GetColumnByName("argument0")}, - {table->GetColumnByName("key")}, {{"hash_min_max", nullptr}}, + {table->GetColumnByName("key")}, {{"hash_min_max", nullptr, "agg_0"}}, /*use_threads=*/true, /*use_exec_plan=*/true)); ValidateOutput(aggregated_and_grouped); SortBy({"key_0"}, &aggregated_and_grouped); @@ -1790,8 +1768,8 @@ TEST(GroupBy, MinMaxDecimal) { }, {table->GetColumnByName("key")}, { - {"hash_min_max", nullptr}, - {"hash_min_max", nullptr}, + {"hash_min_max", nullptr, "agg_0"}, + {"hash_min_max", nullptr, "agg_1"}, }, use_threads, use_exec_plan)); ValidateOutput(aggregated_and_grouped); @@ -1849,11 +1827,11 @@ TEST(GroupBy, MinMaxBinary) { [null, 3] ])"}); - ASSERT_OK_AND_ASSIGN( - Datum aggregated_and_grouped, - GroupByTest({table->GetColumnByName("argument0")}, - {table->GetColumnByName("key")}, {{"hash_min_max", nullptr}}, - use_threads, use_exec_plan)); + ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped, + GroupByTest({table->GetColumnByName("argument0")}, + {table->GetColumnByName("key")}, + {{"hash_min_max", nullptr, "agg_0"}}, + use_threads, use_exec_plan)); ValidateOutput(aggregated_and_grouped); SortBy({"key_0"}, &aggregated_and_grouped); @@ -1906,8 +1884,8 @@ TEST(GroupBy, MinMaxFixedSizeBinary) { ASSERT_OK_AND_ASSIGN( Datum aggregated_and_grouped, GroupByTest({table->GetColumnByName("argument0")}, - {table->GetColumnByName("key")}, {{"hash_min_max", nullptr}}, - use_threads, use_exec_plan)); + {table->GetColumnByName("key")}, + {{"hash_min_max", nullptr, "agg_0"}}, use_threads, use_exec_plan)); ValidateOutput(aggregated_and_grouped); SortBy({"key_0"}, &aggregated_and_grouped); @@ -1960,8 +1938,8 @@ TEST(GroupBy, MinOrMax) { table->GetColumnByName("argument")}, {table->GetColumnByName("key")}, { - {"hash_min", nullptr}, - {"hash_max", nullptr}, + {"hash_min", nullptr, "agg_0"}, + {"hash_max", nullptr, "agg_1"}, }, /*use_threads=*/true, /*use_exec_plan=*/true)); SortBy({"key_0"}, &aggregated_and_grouped); @@ -1997,9 +1975,8 @@ TEST(GroupBy, MinMaxScalar) { SCOPED_TRACE(use_threads ? 
"parallel/merged" : "serial"); ASSERT_OK_AND_ASSIGN( Datum actual, - GroupByUsingExecPlan(input, {"key"}, {"argument", "argument", "argument"}, - {{"hash_min_max", nullptr}}, use_threads, - default_exec_context())); + GroupByUsingExecPlan(input, {"key"}, {{"hash_min_max", nullptr, "argument"}}, + use_threads, default_exec_context())); Datum expected = ArrayFromJSON(struct_({ field("hash_min_max", @@ -2062,14 +2039,14 @@ TEST(GroupBy, AnyAndAll) { }, {table->GetColumnByName("key")}, { - {"hash_any", no_min}, - {"hash_any", min_count}, - {"hash_any", keep_nulls}, - {"hash_any", keep_nulls_min_count}, - {"hash_all", no_min}, - {"hash_all", min_count}, - {"hash_all", keep_nulls}, - {"hash_all", keep_nulls_min_count}, + {"hash_any", no_min, "agg_0"}, + {"hash_any", min_count, "agg_1"}, + {"hash_any", keep_nulls, "agg_2"}, + {"hash_any", keep_nulls_min_count, "agg_3"}, + {"hash_all", no_min, "agg_4"}, + {"hash_all", min_count, "agg_5"}, + {"hash_all", keep_nulls, "agg_6"}, + {"hash_all", keep_nulls_min_count, "agg_7"}, }, use_threads)); SortBy({"key_0"}, &aggregated_and_grouped); @@ -2119,17 +2096,15 @@ TEST(GroupBy, AnyAllScalar) { std::make_shared(/*skip_nulls=*/false, /*min_count=*/0); for (bool use_threads : {true, false}) { SCOPED_TRACE(use_threads ? "parallel/merged" : "serial"); - ASSERT_OK_AND_ASSIGN( - Datum actual, - GroupByUsingExecPlan(input, {"key"}, - {"argument", "argument", "argument", "argument"}, - { - {"hash_any", nullptr}, - {"hash_all", nullptr}, - {"hash_any", keep_nulls}, - {"hash_all", keep_nulls}, - }, - use_threads, default_exec_context())); + ASSERT_OK_AND_ASSIGN(Datum actual, + GroupByUsingExecPlan(input, {"key"}, + { + {"hash_any", nullptr, "argument"}, + {"hash_all", nullptr, "argument"}, + {"hash_any", keep_nulls, "argument"}, + {"hash_all", keep_nulls, "argument"}, + }, + use_threads, default_exec_context())); Datum expected = ArrayFromJSON(struct_({ field("hash_any", boolean()), field("hash_all", boolean()), @@ -2195,9 +2170,9 @@ TEST(GroupBy, CountDistinct) { table->GetColumnByName("key"), }, { - {"hash_count_distinct", all}, - {"hash_count_distinct", only_valid}, - {"hash_count_distinct", only_null}, + {"hash_count_distinct", all, "agg_0"}, + {"hash_count_distinct", only_valid, "agg_1"}, + {"hash_count_distinct", only_null, "agg_2"}, }, use_threads)); SortBy({"key_0"}, &aggregated_and_grouped); @@ -2261,9 +2236,9 @@ TEST(GroupBy, CountDistinct) { table->GetColumnByName("key"), }, { - {"hash_count_distinct", all}, - {"hash_count_distinct", only_valid}, - {"hash_count_distinct", only_null}, + {"hash_count_distinct", all, "agg_0"}, + {"hash_count_distinct", only_valid, "agg_1"}, + {"hash_count_distinct", only_null, "agg_2"}, }, use_threads)); ValidateOutput(aggregated_and_grouped); @@ -2307,9 +2282,9 @@ TEST(GroupBy, CountDistinct) { table->GetColumnByName("key"), }, { - {"hash_count_distinct", all}, - {"hash_count_distinct", only_valid}, - {"hash_count_distinct", only_null}, + {"hash_count_distinct", all, "agg_0"}, + {"hash_count_distinct", only_valid, "agg_1"}, + {"hash_count_distinct", only_null, "agg_2"}, }, use_threads)); ValidateOutput(aggregated_and_grouped); @@ -2379,9 +2354,9 @@ TEST(GroupBy, Distinct) { table->GetColumnByName("key"), }, { - {"hash_distinct", all}, - {"hash_distinct", only_valid}, - {"hash_distinct", only_null}, + {"hash_distinct", all, "agg_0"}, + {"hash_distinct", only_valid, "agg_1"}, + {"hash_distinct", only_null, "agg_2"}, }, use_threads)); ValidateOutput(aggregated_and_grouped); @@ -2452,9 +2427,9 @@ TEST(GroupBy, Distinct) { 
table->GetColumnByName("key"), }, { - {"hash_distinct", all}, - {"hash_distinct", only_valid}, - {"hash_distinct", only_null}, + {"hash_distinct", all, "agg_0"}, + {"hash_distinct", only_valid, "agg_1"}, + {"hash_distinct", only_null, "agg_2"}, }, use_threads)); ValidateOutput(aggregated_and_grouped); @@ -2516,12 +2491,12 @@ TEST(GroupBy, OneMiscTypes) { }, {table->GetColumnByName("key")}, { - {"hash_one", nullptr}, - {"hash_one", nullptr}, - {"hash_one", nullptr}, - {"hash_one", nullptr}, - {"hash_one", nullptr}, - {"hash_one", nullptr}, + {"hash_one", nullptr, "agg_0"}, + {"hash_one", nullptr, "agg_1"}, + {"hash_one", nullptr, "agg_2"}, + {"hash_one", nullptr, "agg_3"}, + {"hash_one", nullptr, "agg_4"}, + {"hash_one", nullptr, "agg_5"}, }, use_threads, use_exec_plan)); ValidateOutput(aggregated_and_grouped); @@ -2647,7 +2622,7 @@ TEST(GroupBy, OneNumericTypes) { ASSERT_OK_AND_ASSIGN( Datum aggregated_and_grouped, GroupByTest({table->GetColumnByName("argument0")}, - {table->GetColumnByName("key")}, {{"hash_one", nullptr}}, + {table->GetColumnByName("key")}, {{"hash_one", nullptr, "agg_0"}}, use_threads, use_exec_plan)); ValidateOutput(aggregated_and_grouped); SortBy({"key_0"}, &aggregated_and_grouped); @@ -2710,7 +2685,7 @@ TEST(GroupBy, OneBinaryTypes) { ASSERT_OK_AND_ASSIGN( Datum aggregated_and_grouped, GroupByTest({table->GetColumnByName("argument0")}, - {table->GetColumnByName("key")}, {{"hash_one", nullptr}}, + {table->GetColumnByName("key")}, {{"hash_one", nullptr, "agg_0"}}, use_threads, use_exec_plan)); ValidateOutput(aggregated_and_grouped); SortBy({"key_0"}, &aggregated_and_grouped); @@ -2743,9 +2718,9 @@ TEST(GroupBy, OneScalar) { for (bool use_threads : {true, false}) { SCOPED_TRACE(use_threads ? "parallel/merged" : "serial"); ASSERT_OK_AND_ASSIGN( - Datum actual, GroupByUsingExecPlan( - input, {"key"}, {"argument", "argument", "argument"}, - {{"hash_one", nullptr}}, use_threads, default_exec_context())); + Datum actual, + GroupByUsingExecPlan(input, {"key"}, {{"hash_one", nullptr, "argument"}}, + use_threads, default_exec_context())); const auto& struct_arr = actual.array_as(); // Check the key column @@ -2805,7 +2780,7 @@ TEST(GroupBy, ListNumeric) { table->GetColumnByName("key"), }, { - {"hash_list", nullptr}, + {"hash_list", nullptr, "agg_0"}, }, use_threads)); ValidateOutput(aggregated_and_grouped); @@ -2876,7 +2851,7 @@ TEST(GroupBy, ListNumeric) { table->GetColumnByName("key"), }, { - {"hash_list", nullptr}, + {"hash_list", nullptr, "agg_0"}, }, use_threads)); ValidateOutput(aggregated_and_grouped); @@ -2945,7 +2920,7 @@ TEST(GroupBy, ListBinaryTypes) { table->GetColumnByName("key"), }, { - {"hash_list", nullptr}, + {"hash_list", nullptr, "agg_0"}, }, use_threads)); ValidateOutput(aggregated_and_grouped); @@ -3007,7 +2982,7 @@ TEST(GroupBy, ListBinaryTypes) { table->GetColumnByName("key"), }, { - {"hash_list", nullptr}, + {"hash_list", nullptr, "agg_0"}, }, use_threads)); ValidateOutput(aggregated_and_grouped); @@ -3085,12 +3060,12 @@ TEST(GroupBy, ListMiscTypes) { }, {table->GetColumnByName("key")}, { - {"hash_list", nullptr}, - {"hash_list", nullptr}, - {"hash_list", nullptr}, - {"hash_list", nullptr}, - {"hash_list", nullptr}, - {"hash_list", nullptr}, + {"hash_list", nullptr, "agg_0"}, + {"hash_list", nullptr, "agg_1"}, + {"hash_list", nullptr, "agg_2"}, + {"hash_list", nullptr, "agg_3"}, + {"hash_list", nullptr, "agg_4"}, + {"hash_list", nullptr, "agg_5"}, }, use_threads, use_exec_plan)); ValidateOutput(aggregated_and_grouped); @@ -3238,12 +3213,12 @@ 
TEST(GroupBy, CountAndSum) { batch->GetColumnByName("key"), }, { - {"hash_count", count_options}, - {"hash_count", count_nulls}, - {"hash_count", count_all}, - {"hash_sum", nullptr}, - {"hash_sum", min_count}, - {"hash_sum", nullptr}, + {"hash_count", count_options, "agg_0"}, + {"hash_count", count_nulls, "agg_1"}, + {"hash_count", count_all, "agg_2"}, + {"hash_sum", nullptr, "agg_3"}, + {"hash_sum", min_count, "agg_4"}, + {"hash_sum", nullptr, "agg_5"}, })); AssertDatumsEqual( @@ -3295,9 +3270,9 @@ TEST(GroupBy, Product) { batch->GetColumnByName("key"), }, { - {"hash_product", nullptr}, - {"hash_product", nullptr}, - {"hash_product", min_count}, + {"hash_product", nullptr, "agg_0"}, + {"hash_product", nullptr, "agg_1"}, + {"hash_product", min_count, "agg_2"}, })); AssertDatumsApproxEqual(ArrayFromJSON(struct_({ @@ -3330,7 +3305,7 @@ TEST(GroupBy, Product) { batch->GetColumnByName("key"), }, { - {"hash_product", nullptr}, + {"hash_product", nullptr, "agg_0"}, })); AssertDatumsApproxEqual(ArrayFromJSON(struct_({ @@ -3374,12 +3349,12 @@ TEST(GroupBy, SumMeanProductKeepNulls) { batch->GetColumnByName("key"), }, { - {"hash_sum", keep_nulls}, - {"hash_sum", min_count}, - {"hash_mean", keep_nulls}, - {"hash_mean", min_count}, - {"hash_product", keep_nulls}, - {"hash_product", min_count}, + {"hash_sum", keep_nulls, "agg_0"}, + {"hash_sum", min_count, "agg_1"}, + {"hash_mean", keep_nulls, "agg_2"}, + {"hash_mean", min_count, "agg_3"}, + {"hash_product", keep_nulls, "agg_4"}, + {"hash_product", min_count, "agg_5"}, })); AssertDatumsApproxEqual(ArrayFromJSON(struct_({ @@ -3423,7 +3398,7 @@ TEST(GroupBy, SumOnlyStringAndDictKeys) { internal::GroupBy({batch->GetColumnByName("argument")}, {batch->GetColumnByName("key")}, { - {"hash_sum", nullptr}, + {"hash_sum", nullptr, "agg_0"}, })); SortBy({"key_0"}, &aggregated_and_grouped); @@ -3466,11 +3441,11 @@ TEST(GroupBy, ConcreteCaseWithValidateGroupBy) { using internal::Aggregate; for (auto agg : { - Aggregate{"hash_sum", nullptr}, - Aggregate{"hash_count", non_null}, - Aggregate{"hash_count", nulls}, - Aggregate{"hash_min_max", nullptr}, - Aggregate{"hash_min_max", keepna}, + Aggregate{"hash_sum", nullptr, "agg_0"}, + Aggregate{"hash_count", non_null, "agg_1"}, + Aggregate{"hash_count", nulls, "agg_2"}, + Aggregate{"hash_min_max", nullptr, "agg_3"}, + Aggregate{"hash_min_max", keepna, "agg_4"}, }) { SCOPED_TRACE(agg.function); ValidateGroupBy({agg}, {batch->GetColumnByName("argument")}, @@ -3494,8 +3469,8 @@ TEST(GroupBy, CountNull) { using internal::Aggregate; for (auto agg : { - Aggregate{"hash_count", keepna}, - Aggregate{"hash_count", skipna}, + Aggregate{"hash_count", keepna, "agg_0"}, + Aggregate{"hash_count", skipna, "agg_1"}, }) { SCOPED_TRACE(agg.function); ValidateGroupBy({agg}, {batch->GetColumnByName("argument")}, @@ -3519,7 +3494,7 @@ TEST(GroupBy, RandomArraySum) { ValidateGroupBy( { - {"hash_sum", options}, + {"hash_sum", options, "agg_0"}, }, {batch->GetColumnByName("argument")}, {batch->GetColumnByName("key")}); } @@ -3552,9 +3527,9 @@ TEST(GroupBy, WithChunkedArray) { table->GetColumnByName("key"), }, { - {"hash_count", nullptr}, - {"hash_sum", nullptr}, - {"hash_min_max", nullptr}, + {"hash_count", nullptr, "agg_0"}, + {"hash_sum", nullptr, "agg_1"}, + {"hash_min_max", nullptr, "agg_2"}, })); AssertDatumsEqual(ArrayFromJSON(struct_({ @@ -3590,7 +3565,7 @@ TEST(GroupBy, MinMaxWithNewGroupsInChunkedArray) { table->GetColumnByName("key"), }, { - {"hash_min_max", nullptr}, + {"hash_min_max", nullptr, "agg_1"}, })); 
AssertDatumsEqual(ArrayFromJSON(struct_({ @@ -3626,7 +3601,7 @@ TEST(GroupBy, SmallChunkSizeSumOnly) { internal::GroupBy({batch->GetColumnByName("argument")}, {batch->GetColumnByName("key")}, { - {"hash_sum", nullptr}, + {"hash_sum", nullptr, "agg_0"}, }, small_chunksize_context())); AssertDatumsEqual(ArrayFromJSON(struct_({ @@ -3678,9 +3653,9 @@ TEST(GroupBy, CountWithNullType) { }, {table->GetColumnByName("key")}, { - {"hash_count", all}, - {"hash_count", only_valid}, - {"hash_count", only_null}, + {"hash_count", all, "agg_0"}, + {"hash_count", only_valid, "agg_1"}, + {"hash_count", only_null, "agg_2"}, }, use_threads, use_exec_plan)); SortBy({"key_0"}, &aggregated_and_grouped); @@ -3723,9 +3698,9 @@ TEST(GroupBy, CountWithNullTypeEmptyTable) { }, {table->GetColumnByName("key")}, { - {"hash_count", all}, - {"hash_count", only_valid}, - {"hash_count", only_null}, + {"hash_count", all, "agg_0"}, + {"hash_count", only_valid, "agg_1"}, + {"hash_count", only_null, "agg_2"}, }, use_threads, use_exec_plan)); auto struct_arr = aggregated_and_grouped.array_as(); @@ -3768,10 +3743,10 @@ TEST(GroupBy, SingleNullTypeKey) { }, {table->GetColumnByName("key")}, { - {"hash_count", nullptr}, - {"hash_sum", nullptr}, - {"hash_mean", nullptr}, - {"hash_min_max", nullptr}, + {"hash_count", nullptr, "agg_0"}, + {"hash_sum", nullptr, "agg_1"}, + {"hash_mean", nullptr, "agg_2"}, + {"hash_min_max", nullptr, "agg_3"}, }, use_threads, use_exec_plan)); SortBy({"key_0"}, &aggregated_and_grouped); @@ -3828,9 +3803,9 @@ TEST(GroupBy, MultipleKeysIncludesNullType) { }, {table->GetColumnByName("key_0"), table->GetColumnByName("key_1")}, { - {"hash_count", nullptr}, - {"hash_sum", nullptr}, - {"hash_min_max", nullptr}, + {"hash_count", nullptr, "agg_0"}, + {"hash_sum", nullptr, "agg_1"}, + {"hash_min_max", nullptr, "agg_2"}, }, use_threads, use_exec_plan)); SortBy({"key_0"}, &aggregated_and_grouped); @@ -3899,10 +3874,10 @@ TEST(GroupBy, SumNullType) { }, {table->GetColumnByName("key")}, { - {"hash_sum", no_min}, - {"hash_sum", keep_nulls}, - {"hash_sum", min_count}, - {"hash_sum", keep_nulls_min_count}, + {"hash_sum", no_min, "agg_0"}, + {"hash_sum", keep_nulls, "agg_1"}, + {"hash_sum", min_count, "agg_2"}, + {"hash_sum", keep_nulls_min_count, "agg_3"}, }, use_threads, use_exec_plan)); SortBy({"key_0"}, &aggregated_and_grouped); @@ -3967,10 +3942,10 @@ TEST(GroupBy, ProductNullType) { }, {table->GetColumnByName("key")}, { - {"hash_product", no_min}, - {"hash_product", keep_nulls}, - {"hash_product", min_count}, - {"hash_product", keep_nulls_min_count}, + {"hash_product", no_min, "agg_0"}, + {"hash_product", keep_nulls, "agg_1"}, + {"hash_product", min_count, "agg_2"}, + {"hash_product", keep_nulls_min_count, "agg_3"}, }, use_threads, use_exec_plan)); SortBy({"key_0"}, &aggregated_and_grouped); @@ -4035,10 +4010,10 @@ TEST(GroupBy, MeanNullType) { }, {table->GetColumnByName("key")}, { - {"hash_mean", no_min}, - {"hash_mean", keep_nulls}, - {"hash_mean", min_count}, - {"hash_mean", keep_nulls_min_count}, + {"hash_mean", no_min, "agg_0"}, + {"hash_mean", keep_nulls, "agg_1"}, + {"hash_mean", min_count, "agg_2"}, + {"hash_mean", keep_nulls_min_count, "agg_3"}, }, use_threads, use_exec_plan)); SortBy({"key_0"}, &aggregated_and_grouped); @@ -4087,9 +4062,9 @@ TEST(GroupBy, NullTypeEmptyTable) { }, {table->GetColumnByName("key")}, { - {"hash_sum", no_min}, - {"hash_product", min_count}, - {"hash_mean", keep_nulls}, + {"hash_sum", no_min, "agg_0"}, + {"hash_product", min_count, "agg_1"}, + {"hash_mean", keep_nulls, "agg_2"}, 
}, use_threads, use_exec_plan)); auto struct_arr = aggregated_and_grouped.array_as(); diff --git a/cpp/src/arrow/dataset/scanner.cc b/cpp/src/arrow/dataset/scanner.cc index 02f658181c0..b19e787f94d 100644 --- a/cpp/src/arrow/dataset/scanner.cc +++ b/cpp/src/arrow/dataset/scanner.cc @@ -676,8 +676,7 @@ Result AsyncScanner::CountRows() { options}}, {"project", compute::ProjectNodeOptions{{options->filter}, {"mask"}}}, {"aggregate", compute::AggregateNodeOptions{{compute::internal::Aggregate{ - "sum", nullptr}}, - /*targets=*/{"mask"}, + "sum", nullptr, "mask"}}, /*names=*/{"selected_count"}}}, {"sink", compute::SinkNodeOptions{&sink_gen}}, }) diff --git a/cpp/src/arrow/dataset/scanner_test.cc b/cpp/src/arrow/dataset/scanner_test.cc index b7dcb8b18d2..1d3e0b403ef 100644 --- a/cpp/src/arrow/dataset/scanner_test.cc +++ b/cpp/src/arrow/dataset/scanner_test.cc @@ -1837,11 +1837,10 @@ TEST(ScanNode, MinimalScalarAggEndToEnd) { // pipe the projection into a scalar aggregate node ASSERT_OK_AND_ASSIGN( compute::ExecNode * aggregate, - compute::MakeExecNode( - "aggregate", plan.get(), {project}, - compute::AggregateNodeOptions{{compute::internal::Aggregate{"sum", nullptr}}, - /*targets=*/{"a * 2"}, - /*names=*/{"sum(a * 2)"}})); + compute::MakeExecNode("aggregate", plan.get(), {project}, + compute::AggregateNodeOptions{ + {compute::internal::Aggregate{"sum", nullptr, "a * 2"}}, + /*names=*/{"sum(a * 2)"}})); // finally, pipe the aggregate node into a sink node AsyncGenerator> sink_gen; @@ -1927,12 +1926,12 @@ TEST(ScanNode, MinimalGroupedAggEndToEnd) { // pipe the projection into a grouped aggregate node ASSERT_OK_AND_ASSIGN( compute::ExecNode * aggregate, - compute::MakeExecNode("aggregate", plan.get(), {project}, - compute::AggregateNodeOptions{ - {compute::internal::Aggregate{"hash_sum", nullptr}}, - /*targets=*/{"a * 2"}, - /*names=*/{"sum(a * 2)"}, - /*keys=*/{"b"}})); + compute::MakeExecNode( + "aggregate", plan.get(), {project}, + compute::AggregateNodeOptions{ + {compute::internal::Aggregate{"hash_sum", nullptr, "a * 2"}}, + /*names=*/{"sum(a * 2)"}, + /*keys=*/{"b"}})); // finally, pipe the aggregate node into a sink node AsyncGenerator> sink_gen; From 97b0848f29fc279bb78812fa45d122e3bc82ef34 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 19 May 2022 12:43:27 +0530 Subject: [PATCH 07/32] adding modifications to R bindings (wip-test) --- r/R/arrowExports.R | 5 +++-- r/R/query-engine.R | 17 ++++++++++------- r/src/arrowExports.cpp | 9 ++++----- r/src/compute-exec.cpp | 27 +++++++++++++++++++-------- 4 files changed, 36 insertions(+), 22 deletions(-) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 4c579840e49..53c0f8ac86b 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -432,8 +432,8 @@ ExecNode_Project <- function(input, exprs, names) { .Call(`_arrow_ExecNode_Project`, input, exprs, names) } -ExecNode_Aggregate <- function(input, options, target_names, out_field_names, key_names) { - .Call(`_arrow_ExecNode_Aggregate`, input, options, target_names, out_field_names, key_names) +ExecNode_Aggregate <- function(input, options, out_field_names, key_names) { + .Call(`_arrow_ExecNode_Aggregate`, input, options, out_field_names, key_names) } ExecNode_Join <- function(input, type, right_data, left_keys, right_keys, left_output, right_output, output_suffix_for_left, output_suffix_for_right) { @@ -2007,3 +2007,4 @@ SetIOThreadPoolCapacity <- function(threads) { Array__infer_type <- function(x) { .Call(`_arrow_Array__infer_type`, x) } + diff --git a/r/R/query-engine.R 
b/r/R/query-engine.R index c40c61e98a7..2c2d96063bd 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -74,7 +74,8 @@ ExecPlan <- R6Class("ExecPlan", grouped <- length(group_vars) > 0 # Collect the target names first because we have to add back the group vars - target_names <- names(.data) + # TODO : remove upon discussion + # target_names <- names(.data) .data <- ensure_group_vars(.data) .data <- ensure_arrange_vars(.data) # this sets .data$temp_columns @@ -114,10 +115,13 @@ ExecPlan <- R6Class("ExecPlan", x }) } + target_names <- names(.data$aggregations); + for (i in seq_len(length(target_names))) { + .data$aggregations[[i]][["target"]] <- target_names[i] + } node <- node$Aggregate( - options = map(.data$aggregations, ~ .[c("fun", "options")]), - target_names = names(.data$aggregations), + options = map(.data$aggregations, ~ .[c("fun", "options", "target")]), out_field_names = names(.data$aggregations), key_names = group_vars ) @@ -179,7 +183,6 @@ ExecPlan <- R6Class("ExecPlan", temp_columns = names(.data$temp_columns) ) } - # This is only safe because we are going to evaluate queries that end # with head/tail first, then evaluate any subsequent query as a new query if (!is.null(.data$head)) { @@ -304,9 +307,9 @@ ExecNode <- R6Class("ExecNode", assert_is(expr, "Expression") self$preserve_extras(ExecNode_Filter(self, expr)) }, - Aggregate = function(options, target_names, out_field_names, key_names) { - out <- self$preserve_extras( - ExecNode_Aggregate(self, options, target_names, out_field_names, key_names) + Aggregate = function(options, key_names) { + self$preserve_sort( + ExecNode_Aggregate(self, options, key_names) ) # dplyr drops top-level attributes when you call summarize() out$extras$source_schema$metadata[["r"]]$attributes <- NULL diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 887327d48f9..3bfed5f0af7 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -964,15 +964,14 @@ BEGIN_CPP11 END_CPP11 } // compute-exec.cpp -std::shared_ptr ExecNode_Aggregate(const std::shared_ptr& input, cpp11::list options, std::vector target_names, std::vector out_field_names, std::vector key_names); -extern "C" SEXP _arrow_ExecNode_Aggregate(SEXP input_sexp, SEXP options_sexp, SEXP target_names_sexp, SEXP out_field_names_sexp, SEXP key_names_sexp){ +std::shared_ptr ExecNode_Aggregate(const std::shared_ptr& input, cpp11::list options, std::vector out_field_names, std::vector key_names); +extern "C" SEXP _arrow_ExecNode_Aggregate(SEXP input_sexp, SEXP options_sexp, SEXP out_field_names_sexp, SEXP key_names_sexp){ BEGIN_CPP11 arrow::r::Input&>::type input(input_sexp); arrow::r::Input::type options(options_sexp); - arrow::r::Input>::type target_names(target_names_sexp); arrow::r::Input>::type out_field_names(out_field_names_sexp); arrow::r::Input>::type key_names(key_names_sexp); - return cpp11::as_sexp(ExecNode_Aggregate(input, options, target_names, out_field_names, key_names)); + return cpp11::as_sexp(ExecNode_Aggregate(input, options, out_field_names, key_names)); END_CPP11 } // compute-exec.cpp @@ -5222,7 +5221,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_ExecPlan_Write", (DL_FUNC) &_arrow_ExecPlan_Write, 14}, { "_arrow_ExecNode_Filter", (DL_FUNC) &_arrow_ExecNode_Filter, 2}, { "_arrow_ExecNode_Project", (DL_FUNC) &_arrow_ExecNode_Project, 3}, - { "_arrow_ExecNode_Aggregate", (DL_FUNC) &_arrow_ExecNode_Aggregate, 5}, + { "_arrow_ExecNode_Aggregate", (DL_FUNC) &_arrow_ExecNode_Aggregate, 4}, { "_arrow_ExecNode_Join", (DL_FUNC) 
&_arrow_ExecNode_Join, 9}, { "_arrow_ExecNode_Union", (DL_FUNC) &_arrow_ExecNode_Union, 2}, { "_arrow_ExecNode_SourceNode", (DL_FUNC) &_arrow_ExecNode_SourceNode, 2}, diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index 089d1e71eba..addea6efa54 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -228,28 +228,39 @@ std::shared_ptr ExecNode_Project( // [[arrow::export]] std::shared_ptr ExecNode_Aggregate( const std::shared_ptr& input, cpp11::list options, - std::vector target_names, std::vector out_field_names, + std::vector out_field_names, std::vector key_names) { std::vector aggregates; + // aggregates.push_back( + // arrow::compute::internal::Aggregate{std::move(name), opts.get()}); + // keep_alives.push_back(std::move(opts)); + // } for (cpp11::list name_opts : options) { auto name = cpp11::as_cpp(name_opts[0]); auto opts = make_compute_options(name, name_opts[1]); - + auto target = cpp11::as_cpp(name_opts[2]); + aggregates.push_back( - arrow::compute::internal::Aggregate{std::move(name), std::move(opts)}); + arrow::compute::internal::Aggregate{std::move(name), opts, std::move(target)}); + keep_alives.push_back(std::move(opts)); } - std::vector targets, keys; - for (auto&& name : target_names) { - targets.emplace_back(std::move(name)); - } + // std::vector targets, keys; + // for (auto&& name : target_names) { + // targets.emplace_back(std::move(name)); + // } + std::vector keys; for (auto&& name : key_names) { keys.emplace_back(std::move(name)); } + // return MakeExecNodeOrStop( + // "aggregate", input->plan(), {input.get()}, + // compute::AggregateNodeOptions{std::move(aggregates), std::move(targets), + // std::move(out_field_names), std::move(keys)}); return MakeExecNodeOrStop( "aggregate", input->plan(), {input.get()}, - compute::AggregateNodeOptions{std::move(aggregates), std::move(targets), + compute::AggregateNodeOptions{std::move(aggregates), std::move(out_field_names), std::move(keys)}); } From 93196d23982a6bab41c649cc41cc61514b8449af Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 19 May 2022 20:46:17 +0530 Subject: [PATCH 08/32] reformat file and clean up --- r/src/compute-exec.cpp | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index addea6efa54..e4438d08d28 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -228,14 +228,9 @@ std::shared_ptr ExecNode_Project( // [[arrow::export]] std::shared_ptr ExecNode_Aggregate( const std::shared_ptr& input, cpp11::list options, - std::vector out_field_names, - std::vector key_names) { + std::vector out_field_names, std::vector key_names) { std::vector aggregates; - // aggregates.push_back( - // arrow::compute::internal::Aggregate{std::move(name), opts.get()}); - // keep_alives.push_back(std::move(opts)); - // } for (cpp11::list name_opts : options) { auto name = cpp11::as_cpp(name_opts[0]); auto opts = make_compute_options(name, name_opts[1]); @@ -246,22 +241,14 @@ std::shared_ptr ExecNode_Aggregate( keep_alives.push_back(std::move(opts)); } - // std::vector targets, keys; - // for (auto&& name : target_names) { - // targets.emplace_back(std::move(name)); - // } std::vector keys; for (auto&& name : key_names) { keys.emplace_back(std::move(name)); } - // return MakeExecNodeOrStop( - // "aggregate", input->plan(), {input.get()}, - // compute::AggregateNodeOptions{std::move(aggregates), std::move(targets), - // std::move(out_field_names), std::move(keys)}); return MakeExecNodeOrStop( "aggregate", 
input->plan(), {input.get()}, - compute::AggregateNodeOptions{std::move(aggregates), - std::move(out_field_names), std::move(keys)}); + compute::AggregateNodeOptions{std::move(aggregates), std::move(out_field_names), + std::move(keys)}); } // [[arrow::export]] From 7bd02343afd77cf613e5e5a271ba0c5523ec5857 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 19 May 2022 23:08:53 +0530 Subject: [PATCH 09/32] [c++] inclusion of name within aggregation --- .../execution_plan_documentation_examples.cc | 6 +- cpp/src/arrow/compute/api_aggregate.h | 3 + cpp/src/arrow/compute/exec/aggregate_node.cc | 7 +- cpp/src/arrow/compute/exec/ir_consumer.cc | 4 +- cpp/src/arrow/compute/exec/ir_test.cc | 49 +- cpp/src/arrow/compute/exec/options.h | 8 +- cpp/src/arrow/compute/exec/plan_test.cc | 232 +++--- cpp/src/arrow/compute/exec/test_util.cc | 10 +- .../compute/kernels/hash_aggregate_test.cc | 754 +++++++++--------- cpp/src/arrow/dataset/scanner.cc | 3 +- cpp/src/arrow/dataset/scanner_test.cc | 8 +- 11 files changed, 538 insertions(+), 546 deletions(-) diff --git a/cpp/examples/arrow/execution_plan_documentation_examples.cc b/cpp/examples/arrow/execution_plan_documentation_examples.cc index 3063431b5f9..44bff56e00c 100644 --- a/cpp/examples/arrow/execution_plan_documentation_examples.cc +++ b/cpp/examples/arrow/execution_plan_documentation_examples.cc @@ -503,8 +503,7 @@ arrow::Status SourceScalarAggregateSinkExample(cp::ExecContext& exec_context) { ARROW_ASSIGN_OR_RAISE(cp::ExecNode * source, cp::MakeExecNode("source", plan.get(), {}, source_node_options)); auto aggregate_options = - cp::AggregateNodeOptions{/*aggregates=*/{{"sum", nullptr, "a"}}, - /*names=*/{"sum(a)"}}; + cp::AggregateNodeOptions{/*aggregates=*/{{"sum", nullptr, "a", "sum(a)"}}}; ARROW_ASSIGN_OR_RAISE( cp::ExecNode * aggregate, cp::MakeExecNode("aggregate", plan.get(), {source}, std::move(aggregate_options))); @@ -541,8 +540,7 @@ arrow::Status SourceGroupAggregateSinkExample(cp::ExecContext& exec_context) { cp::MakeExecNode("source", plan.get(), {}, source_node_options)); auto options = std::make_shared(cp::CountOptions::ONLY_VALID); auto aggregate_options = - cp::AggregateNodeOptions{/*aggregates=*/{{"hash_count", &options, "a"}}, - /*names=*/{"count(a)"}, + cp::AggregateNodeOptions{/*aggregates=*/{{"hash_count", &options, "a", "count(a)"}}, /*keys=*/{"b"}}; ARROW_ASSIGN_OR_RAISE( cp::ExecNode * aggregate, diff --git a/cpp/src/arrow/compute/api_aggregate.h b/cpp/src/arrow/compute/api_aggregate.h index ff994178fc1..9cd06424299 100644 --- a/cpp/src/arrow/compute/api_aggregate.h +++ b/cpp/src/arrow/compute/api_aggregate.h @@ -405,6 +405,9 @@ struct ARROW_EXPORT Aggregate { // fields to which aggregations will be applied FieldRef target; + + // output field name for aggregations + std::string name; }; } // namespace internal diff --git a/cpp/src/arrow/compute/exec/aggregate_node.cc b/cpp/src/arrow/compute/exec/aggregate_node.cc index ace6d9aa0ee..3632f2da0b6 100644 --- a/cpp/src/arrow/compute/exec/aggregate_node.cc +++ b/cpp/src/arrow/compute/exec/aggregate_node.cc @@ -89,7 +89,6 @@ class ScalarAggregateNode : public ExecNode { std::vector kernels(aggregates.size()); std::vector>> states(kernels.size()); FieldVector fields(kernels.size()); - const auto& field_names = aggregate_options.names; std::vector target_field_ids(kernels.size()); for (size_t i = 0; i < kernels.size(); ++i) { @@ -130,7 +129,7 @@ class ScalarAggregateNode : public ExecNode { ARROW_ASSIGN_OR_RAISE( auto descr, 
kernels[i]->signature->out_type().Resolve(&kernel_ctx, {in_type})); - fields[i] = field(field_names[i], std::move(descr.type)); + fields[i] = field(aggregate_options.aggregates[i].name, std::move(descr.type)); } return plan->EmplaceNode( @@ -296,7 +295,6 @@ class GroupByNode : public ExecNode { const auto& keys = aggregate_options.keys; // Copy (need to modify options pointer below) auto aggs = aggregate_options.aggregates; - const auto& field_names = aggregate_options.names; // Get input schema auto input_schema = input->output_schema(); @@ -341,7 +339,8 @@ class GroupByNode : public ExecNode { // Aggregate fields come before key fields to match the behavior of GroupBy function for (size_t i = 0; i < aggs.size(); ++i) { - output_fields[i] = agg_result_fields[i]->WithName(field_names[i]); + output_fields[i] = + agg_result_fields[i]->WithName(aggregate_options.aggregates[i].name); } size_t base = aggs.size(); for (size_t i = 0; i < keys.size(); ++i) { diff --git a/cpp/src/arrow/compute/exec/ir_consumer.cc b/cpp/src/arrow/compute/exec/ir_consumer.cc index 4f539c91fca..f17dbf1ed79 100644 --- a/cpp/src/arrow/compute/exec/ir_consumer.cc +++ b/cpp/src/arrow/compute/exec/ir_consumer.cc @@ -550,8 +550,8 @@ Result Convert(const ir::Relation& rel) { "Support for non-FieldRef arguments to Aggregate.measures"); } - opts.aggregates.push_back({call->function_name, nullptr, *target}); - opts.names.push_back(call->function_name + " " + target->ToString()); + opts.aggregates.push_back({call->function_name, nullptr, *target, + call->function_name + " " + target->ToString()}); } if (!aggregate->groupings()) return UnexpectedNullField("Aggregate.groupings"); diff --git a/cpp/src/arrow/compute/exec/ir_test.cc b/cpp/src/arrow/compute/exec/ir_test.cc index 9f3e3098cc0..d7eb37c185e 100644 --- a/cpp/src/arrow/compute/exec/ir_test.cc +++ b/cpp/src/arrow/compute/exec/ir_test.cc @@ -249,7 +249,8 @@ TEST(Relation, Filter) { } TEST(Relation, AggregateSimple) { - ASSERT_THAT(ConvertJSON(R"({ + ASSERT_THAT( + ConvertJSON(R"({ "impl": { id: {id: 1}, "groupings": [ @@ -347,27 +348,22 @@ TEST(Relation, AggregateSimple) { }, "impl_type": "Aggregate" })"), - ResultWith(Eq(Declaration::Sequence({ - {"catalog_source", - CatalogSourceNodeOptions{"tbl", schema({ - field("foo", int32()), - field("bar", int64()), - field("baz", float64()), - })}, - "0"}, - {"aggregate", - AggregateNodeOptions{/*aggregates=*/{ - {"sum", nullptr, 1}, - {"mean", nullptr, 2}, - }, - /*names=*/ - { - "sum FieldRef.FieldPath(1)", - "mean FieldRef.FieldPath(2)", - }, - /*keys=*/{0}}, - "1"}, - })))); + ResultWith(Eq(Declaration::Sequence({ + {"catalog_source", + CatalogSourceNodeOptions{"tbl", schema({ + field("foo", int32()), + field("bar", int64()), + field("baz", float64()), + })}, + "0"}, + {"aggregate", + AggregateNodeOptions{/*aggregates=*/{ + {"sum", nullptr, 1, "sum FieldRef.FieldPath(1)"}, + {"mean", nullptr, 2, "mean FieldRef.FieldPath(2)"}, + }, + /*keys=*/{0}}, + "1"}, + })))); } TEST(Relation, AggregateWithHaving) { @@ -563,13 +559,8 @@ TEST(Relation, AggregateWithHaving) { {"filter", FilterNodeOptions{less(field_ref(0), literal(3))}, "1"}, {"aggregate", AggregateNodeOptions{/*aggregates=*/{ - {"sum", nullptr, 1}, - {"mean", nullptr, 2}, - }, - /*names=*/ - { - "sum FieldRef.FieldPath(1)", - "mean FieldRef.FieldPath(2)", + {"sum", nullptr, 1, "sum FieldRef.FieldPath(1)"}, + {"mean", nullptr, 2, "mean FieldRef.FieldPath(2)"}, }, /*keys=*/{0}}, "2"}, diff --git a/cpp/src/arrow/compute/exec/options.h b/cpp/src/arrow/compute/exec/options.h index 
7132bd64784..84254ed5ad9 100644 --- a/cpp/src/arrow/compute/exec/options.h +++ b/cpp/src/arrow/compute/exec/options.h @@ -112,15 +112,11 @@ class ARROW_EXPORT ProjectNodeOptions : public ExecNodeOptions { class ARROW_EXPORT AggregateNodeOptions : public ExecNodeOptions { public: AggregateNodeOptions(std::vector aggregates, - std::vector names, std::vector keys = {}) - : aggregates(std::move(aggregates)), - names(std::move(names)), - keys(std::move(keys)) {} + std::vector keys = {}) + : aggregates(std::move(aggregates)), keys(std::move(keys)) {} // aggregations which will be applied to the targetted fields std::vector aggregates; - // output field names for aggregations - std::vector names; // keys by which aggregations will be grouped std::vector keys; }; diff --git a/cpp/src/arrow/compute/exec/plan_test.cc b/cpp/src/arrow/compute/exec/plan_test.cc index 4d1905fc8e7..4ffee8bdc13 100644 --- a/cpp/src/arrow/compute/exec/plan_test.cc +++ b/cpp/src/arrow/compute/exec/plan_test.cc @@ -391,9 +391,10 @@ TEST(ExecPlan, ToString) { }}}, {"aggregate", AggregateNodeOptions{ - /*aggregates=*/{{"hash_sum", nullptr, "multiply(i32, 2)"}, - {"hash_count", options, "multiply(i32, 2)"}}, - /*names=*/{"sum(multiply(i32, 2))", "count(multiply(i32, 2))"}, + /*aggregates=*/{ + {"hash_sum", nullptr, "multiply(i32, 2)", "sum(multiply(i32, 2))"}, + {"hash_count", options, "multiply(i32, 2)", + "count(multiply(i32, 2))"}}, /*keys=*/{"bool"}}}, {"filter", FilterNodeOptions{greater(field_ref("sum(multiply(i32, 2))"), literal(10))}}, @@ -429,16 +430,16 @@ custom_sink_label:OrderBySinkNode{by={sort_keys=[FieldRef.Name(sum(multiply(i32, rhs.label = "rhs"; union_node.inputs.emplace_back(lhs); union_node.inputs.emplace_back(rhs); - ASSERT_OK(Declaration::Sequence( - { - union_node, - {"aggregate", - AggregateNodeOptions{/*aggregates=*/{{"count", options, "i32"}}, - /*names=*/{"count(i32)"}, - /*keys=*/{}}}, - {"sink", SinkNodeOptions{&sink_gen}}, - }) - .AddToPlan(plan.get())); + ASSERT_OK( + Declaration::Sequence( + { + union_node, + {"aggregate", AggregateNodeOptions{ + /*aggregates=*/{{"count", options, "i32", "count(i32)"}}, + /*keys=*/{}}}, + {"sink", SinkNodeOptions{&sink_gen}}, + }) + .AddToPlan(plan.get())); EXPECT_EQ(plan->ToString(), R"a(ExecPlan with 5 nodes: :SinkNode{} :ScalarAggregateNode{aggregates=[ @@ -764,17 +765,17 @@ TEST(ExecPlanExecution, StressSourceGroupedSumStop) { auto random_data = MakeRandomBatches(input_schema, num_batches); SortOptions options({SortKey("a", SortOrder::Ascending)}); - ASSERT_OK(Declaration::Sequence( - { - {"source", SourceNodeOptions{random_data.schema, - random_data.gen(parallel, slow)}}, - {"aggregate", - AggregateNodeOptions{/*aggregates=*/{{"hash_sum", nullptr, "a"}}, - /*names=*/{"sum(a)"}, - /*keys=*/{"b"}}}, - {"sink", SinkNodeOptions{&sink_gen}}, - }) - .AddToPlan(plan.get())); + ASSERT_OK( + Declaration::Sequence( + { + {"source", SourceNodeOptions{random_data.schema, + random_data.gen(parallel, slow)}}, + {"aggregate", AggregateNodeOptions{ + /*aggregates=*/{{"hash_sum", nullptr, "a", "sum(a)"}}, + /*keys=*/{"b"}}}, + {"sink", SinkNodeOptions{&sink_gen}}, + }) + .AddToPlan(plan.get())); ASSERT_OK(plan->Validate()); ASSERT_OK(plan->StartProducing()); @@ -912,17 +913,17 @@ TEST(ExecPlanExecution, SourceGroupedSum) { ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); AsyncGenerator> sink_gen; - ASSERT_OK(Declaration::Sequence( - { - {"source", SourceNodeOptions{input.schema, - input.gen(parallel, /*slow=*/false)}}, - {"aggregate", - 
AggregateNodeOptions{/*aggregates=*/{{"hash_sum", nullptr, "i32"}}, - /*names=*/{"sum(i32)"}, - /*keys=*/{"str"}}}, - {"sink", SinkNodeOptions{&sink_gen}}, - }) - .AddToPlan(plan.get())); + ASSERT_OK( + Declaration::Sequence( + { + {"source", + SourceNodeOptions{input.schema, input.gen(parallel, /*slow=*/false)}}, + {"aggregate", AggregateNodeOptions{/*aggregates=*/{{"hash_sum", nullptr, + "i32", "sum(i32)"}}, + /*keys=*/{"str"}}}, + {"sink", SinkNodeOptions{&sink_gen}}, + }) + .AddToPlan(plan.get())); ASSERT_THAT(StartAndCollect(plan.get(), sink_gen), Finishes(ResultWith(UnorderedElementsAreArray({ExecBatchFromJSON( @@ -986,10 +987,9 @@ TEST(ExecPlanExecution, NestedSourceProjectGroupedSum) { field_ref(FieldRef("struct", "bool")), }, {"i32", "bool"}}}, - {"aggregate", - AggregateNodeOptions{/*aggregates=*/{{"hash_sum", nullptr, "i32"}}, - /*names=*/{"sum(i32)"}, - /*keys=*/{"bool"}}}, + {"aggregate", AggregateNodeOptions{/*aggregates=*/{{"hash_sum", nullptr, + "i32", "sum(i32)"}}, + /*keys=*/{"bool"}}}, {"sink", SinkNodeOptions{&sink_gen}}, }) .AddToPlan(plan.get())); @@ -1020,10 +1020,11 @@ TEST(ExecPlanExecution, SourceFilterProjectGroupedSumFilter) { field_ref("str"), call("multiply", {field_ref("i32"), literal(2)}), }}}, - {"aggregate", AggregateNodeOptions{/*aggregates=*/{{"hash_sum", nullptr, - "multiply(i32, 2)"}}, - /*names=*/{"sum(multiply(i32, 2))"}, - /*keys=*/{"str"}}}, + {"aggregate", + AggregateNodeOptions{ + /*aggregates=*/{{"hash_sum", nullptr, "multiply(i32, 2)", + "sum(multiply(i32, 2))"}}, + /*keys=*/{"str"}}}, {"filter", FilterNodeOptions{greater(field_ref("sum(multiply(i32, 2))"), literal(10 * batch_multiplicity))}}, {"sink", SinkNodeOptions{&sink_gen}}, @@ -1059,10 +1060,11 @@ TEST(ExecPlanExecution, SourceFilterProjectGroupedSumOrderBy) { field_ref("str"), call("multiply", {field_ref("i32"), literal(2)}), }}}, - {"aggregate", AggregateNodeOptions{/*aggregates=*/{{"hash_sum", nullptr, - "multiply(i32, 2)"}}, - /*names=*/{"sum(multiply(i32, 2))"}, - /*keys=*/{"str"}}}, + {"aggregate", + AggregateNodeOptions{ + /*aggregates=*/{{"hash_sum", nullptr, "multiply(i32, 2)", + "sum(multiply(i32, 2))"}}, + /*keys=*/{"str"}}}, {"filter", FilterNodeOptions{greater(field_ref("sum(multiply(i32, 2))"), literal(10 * batch_multiplicity))}}, {"order_by_sink", OrderBySinkNodeOptions{options, &sink_gen}}, @@ -1087,22 +1089,22 @@ TEST(ExecPlanExecution, SourceFilterProjectGroupedSumTopK) { AsyncGenerator> sink_gen; SelectKOptions options = SelectKOptions::TopKDefault(/*k=*/1, {"str"}); - ASSERT_OK( - Declaration::Sequence( - { - {"source", - SourceNodeOptions{input.schema, input.gen(parallel, /*slow=*/false)}}, - {"project", ProjectNodeOptions{{ - field_ref("str"), - call("multiply", {field_ref("i32"), literal(2)}), - }}}, - {"aggregate", AggregateNodeOptions{/*aggregates=*/{{"hash_sum", nullptr, - "multiply(i32, 2)"}}, - /*names=*/{"sum(multiply(i32, 2))"}, - /*keys=*/{"str"}}}, - {"select_k_sink", SelectKSinkNodeOptions{options, &sink_gen}}, - }) - .AddToPlan(plan.get())); + ASSERT_OK(Declaration::Sequence( + { + {"source", SourceNodeOptions{input.schema, + input.gen(parallel, /*slow=*/false)}}, + {"project", ProjectNodeOptions{{ + field_ref("str"), + call("multiply", {field_ref("i32"), literal(2)}), + }}}, + {"aggregate", + AggregateNodeOptions{ + /*aggregates=*/{{"hash_sum", nullptr, "multiply(i32, 2)", + "sum(multiply(i32, 2))"}}, + /*keys=*/{"str"}}}, + {"select_k_sink", SelectKSinkNodeOptions{options, &sink_gen}}, + }) + .AddToPlan(plan.get())); ASSERT_THAT( 
StartAndCollect(plan.get(), sink_gen), @@ -1117,18 +1119,19 @@ TEST(ExecPlanExecution, SourceScalarAggSink) { auto basic_data = MakeBasicBatches(); - ASSERT_OK(Declaration::Sequence( - { - {"source", SourceNodeOptions{basic_data.schema, - basic_data.gen(/*parallel=*/false, - /*slow=*/false)}}, - {"aggregate", - AggregateNodeOptions{/*aggregates=*/{{"sum", nullptr, "i32"}, - {"any", nullptr, "bool"}}, - /*names=*/{"sum(i32)", "any(bool)"}}}, - {"sink", SinkNodeOptions{&sink_gen}}, - }) - .AddToPlan(plan.get())); + ASSERT_OK( + Declaration::Sequence( + { + {"source", + SourceNodeOptions{basic_data.schema, basic_data.gen(/*parallel=*/false, + /*slow=*/false)}}, + {"aggregate", AggregateNodeOptions{ + /*aggregates=*/{{"sum", nullptr, "i32", "sum(i32)"}, + {"any", nullptr, "bool", "any(bool)"}}, + }}, + {"sink", SinkNodeOptions{&sink_gen}}, + }) + .AddToPlan(plan.get())); ASSERT_THAT( StartAndCollect(plan.get(), sink_gen), @@ -1155,9 +1158,10 @@ TEST(ExecPlanExecution, AggregationPreservesOptions) { {"source", SourceNodeOptions{basic_data.schema, basic_data.gen(/*parallel=*/false, /*slow=*/false)}}, - {"aggregate", AggregateNodeOptions{ - /*aggregates=*/{{"tdigest", options, "i32"}}, - /*names=*/{"tdigest(i32)"}}}, + {"aggregate", + AggregateNodeOptions{ + /*aggregates=*/{{"tdigest", options, "i32", "tdigest(i32)"}}, + }}, {"sink", SinkNodeOptions{&sink_gen}}, }) .AddToPlan(plan.get())); @@ -1181,10 +1185,10 @@ TEST(ExecPlanExecution, AggregationPreservesOptions) { { {"source", SourceNodeOptions{data.schema, data.gen(/*parallel=*/false, /*slow=*/false)}}, - {"aggregate", AggregateNodeOptions{ - /*aggregates=*/{{"hash_count", options, "i32"}}, - /*names=*/{"count(i32)"}, - /*keys=*/{"str"}}}, + {"aggregate", + AggregateNodeOptions{/*aggregates=*/{{"hash_count", options, + "i32", "count(i32)"}}, + /*keys=*/{"str"}}}, {"sink", SinkNodeOptions{&sink_gen}}, }) .AddToPlan(plan.get())); @@ -1213,28 +1217,24 @@ TEST(ExecPlanExecution, ScalarSourceScalarAggSink) { // index can't be tested as it's order-dependent // mode/quantile can't be tested as they're technically vector kernels - ASSERT_OK( - Declaration::Sequence( - { - {"source", - SourceNodeOptions{scalar_data.schema, scalar_data.gen(/*parallel=*/false, - /*slow=*/false)}}, - {"aggregate", AggregateNodeOptions{ - /*aggregates=*/{{"all", nullptr, "b"}, - {"any", nullptr, "b"}, - {"count", nullptr, "a"}, - {"mean", nullptr, "a"}, - {"product", nullptr, "a"}, - {"stddev", nullptr, "a"}, - {"sum", nullptr, "a"}, - {"tdigest", nullptr, "a"}, - {"variance", nullptr, "a"}}, - /*names=*/ - {"all(b)", "any(b)", "count(a)", "mean(a)", "product(a)", - "stddev(a)", "sum(a)", "tdigest(a)", "variance(a)"}}}, - {"sink", SinkNodeOptions{&sink_gen}}, - }) - .AddToPlan(plan.get())); + ASSERT_OK(Declaration::Sequence( + { + {"source", SourceNodeOptions{scalar_data.schema, + scalar_data.gen(/*parallel=*/false, + /*slow=*/false)}}, + {"aggregate", AggregateNodeOptions{/*aggregates=*/{ + {"all", nullptr, "b", "all(b)"}, + {"any", nullptr, "b", "any(b)"}, + {"count", nullptr, "a", "count(a)"}, + {"mean", nullptr, "a", "mean(a)"}, + {"product", nullptr, "a", "product(a)"}, + {"stddev", nullptr, "a", "stddev(a)"}, + {"sum", nullptr, "a", "sum(a)"}, + {"tdigest", nullptr, "a", "tdigest(a)"}, + {"variance", nullptr, "a", "variance(a)"}}}}, + {"sink", SinkNodeOptions{&sink_gen}}, + }) + .AddToPlan(plan.get())); ASSERT_THAT( StartAndCollect(plan.get(), sink_gen), @@ -1264,18 +1264,18 @@ TEST(ExecPlanExecution, ScalarSourceGroupedSum) { scalar_data.schema = schema({field("a", 
int32()), field("b", boolean())}); SortOptions options({SortKey("b", SortOrder::Descending)}); - ASSERT_OK(Declaration::Sequence( - { - {"source", SourceNodeOptions{scalar_data.schema, - scalar_data.gen(/*parallel=*/false, - /*slow=*/false)}}, - {"aggregate", - AggregateNodeOptions{/*aggregates=*/{{"hash_sum", nullptr, "a"}}, - /*names=*/{"hash_sum(a)"}, - /*keys=*/{"b"}}}, - {"order_by_sink", OrderBySinkNodeOptions{options, &sink_gen}}, - }) - .AddToPlan(plan.get())); + ASSERT_OK( + Declaration::Sequence( + { + {"source", + SourceNodeOptions{scalar_data.schema, scalar_data.gen(/*parallel=*/false, + /*slow=*/false)}}, + {"aggregate", AggregateNodeOptions{/*aggregates=*/{{"hash_sum", nullptr, + "a", "hash_sum(a)"}}, + /*keys=*/{"b"}}}, + {"order_by_sink", OrderBySinkNodeOptions{options, &sink_gen}}, + }) + .AddToPlan(plan.get())); ASSERT_THAT(StartAndCollect(plan.get(), sink_gen), Finishes(ResultWith(UnorderedElementsAreArray({ diff --git a/cpp/src/arrow/compute/exec/test_util.cc b/cpp/src/arrow/compute/exec/test_util.cc index e336b245bb7..762bf320f03 100644 --- a/cpp/src/arrow/compute/exec/test_util.cc +++ b/cpp/src/arrow/compute/exec/test_util.cc @@ -315,9 +315,10 @@ bool operator==(const Declaration& l, const Declaration& r) { if (!l_agg->options->Equals(*r_agg->options)) return false; if (l_agg->target != r_agg->target) return false; + if (l_agg->name != r_agg->name) return false; } - return l_opts->names == r_opts->names && l_opts->keys == r_opts->keys; + return l_opts->keys == r_opts->keys; } if (l.factory_name == "order_by_sink") { @@ -380,16 +381,11 @@ static inline void PrintToImpl(const std::string& factory_name, *os << agg.function << "<"; if (agg.options) PrintTo(*agg.options, os); *os << agg.target.ToString() << "<"; + *os << agg.name << "<"; *os << ">,"; } *os << "},"; - *os << "names={"; - for (const auto& name : o->names) { - *os << name << ","; - } - *os << "}"; - if (!o->keys.empty()) { *os << ",keys={"; for (const auto& key : o->keys) { diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc b/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc index 51c7aea1f37..0c3e219d439 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc @@ -126,10 +126,6 @@ Result GroupByUsingExecPlan(const BatchesWithSchema& input, const std::vector& aggregates, bool use_threads, ExecContext* ctx) { std::vector keys(key_names.size()); - std::vector names(aggregates.size()); - for (size_t i = 0; i < aggregates.size(); ++i) { - names[i] = aggregates[i].function; - } for (size_t i = 0; i < key_names.size(); ++i) { keys[i] = FieldRef(key_names[i]); } @@ -141,8 +137,7 @@ Result GroupByUsingExecPlan(const BatchesWithSchema& input, { {"source", SourceNodeOptions{input.schema, input.gen(use_threads, /*slow=*/false)}}, - {"aggregate", AggregateNodeOptions{std::move(aggregates), std::move(names), - std::move(keys)}}, + {"aggregate", AggregateNodeOptions{std::move(aggregates), std::move(keys)}}, {"sink", SinkNodeOptions{&sink_gen}}, }) .AddToPlan(plan.get())); @@ -750,7 +745,7 @@ TEST(GroupBy, NoBatches) { Datum aggregated_and_grouped, GroupByTest({table->GetColumnByName("argument")}, {table->GetColumnByName("key")}, { - {"hash_count", nullptr, {"agg_0"}}, + {"hash_count", nullptr, "agg_0", "hash_count"}, }, /*use_threads=*/true, /*use_exec_plan=*/true)); AssertDatumsEqual(ArrayFromJSON(struct_({ @@ -815,7 +810,7 @@ TEST(GroupBy, CountOnly) { GroupByTest({table->GetColumnByName("argument")}, {table->GetColumnByName("key")}, { 
- {"hash_count", nullptr, "agg_0"}, + {"hash_count", nullptr, "agg_0", "hash_count"}, }, use_threads, use_exec_plan)); SortBy({"key_0"}, &aggregated_and_grouped); @@ -852,14 +847,15 @@ TEST(GroupBy, CountScalar) { auto count_all = std::make_shared(CountOptions::ALL); for (bool use_threads : {true, false}) { SCOPED_TRACE(use_threads ? "parallel/merged" : "serial"); - ASSERT_OK_AND_ASSIGN(Datum actual, - GroupByUsingExecPlan(input, {"key"}, - { - {"hash_count", skip_nulls, "argument"}, - {"hash_count", keep_nulls, "argument"}, - {"hash_count", count_all, "argument"}, - }, - use_threads, default_exec_context())); + ASSERT_OK_AND_ASSIGN( + Datum actual, + GroupByUsingExecPlan(input, {"key"}, + { + {"hash_count", skip_nulls, "argument", "hash_count"}, + {"hash_count", keep_nulls, "argument", "hash_count"}, + {"hash_count", count_all, "argument", "hash_count"}, + }, + use_threads, default_exec_context())); Datum expected = ArrayFromJSON(struct_({ field("hash_count", int64()), field("hash_count", int64()), @@ -902,7 +898,7 @@ TEST(GroupBy, SumOnly) { GroupByTest({table->GetColumnByName("argument")}, {table->GetColumnByName("key")}, { - {"hash_sum", nullptr, "agg_0"}, + {"hash_sum", nullptr, "agg_0", "hash_sum"}, }, use_threads, use_exec_plan)); SortBy({"key_0"}, &aggregated_and_grouped); @@ -967,12 +963,12 @@ TEST(GroupBy, SumMeanProductDecimal) { }, {table->GetColumnByName("key")}, { - {"hash_sum", nullptr, "agg_0"}, - {"hash_sum", nullptr, "agg_1"}, - {"hash_mean", nullptr, "agg_2"}, - {"hash_mean", nullptr, "agg_3"}, - {"hash_product", nullptr, "agg_4"}, - {"hash_product", nullptr, "agg_5"}, + {"hash_sum", nullptr, "agg_0", "hash_sum"}, + {"hash_sum", nullptr, "agg_1", "hash_sum"}, + {"hash_mean", nullptr, "agg_2", "hash_mean"}, + {"hash_mean", nullptr, "agg_3", "hash_mean"}, + {"hash_product", nullptr, "agg_4", "hash_product"}, + {"hash_product", nullptr, "agg_5", "hash_product"}, }, use_threads, use_exec_plan)); SortBy({"key_0"}, &aggregated_and_grouped); @@ -1021,17 +1017,17 @@ TEST(GroupBy, MeanOnly) { [null, 3] ])"}); - auto min_count = - std::make_shared(/*skip_nulls=*/true, /*min_count=*/3); - ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped, - internal::GroupBy({table->GetColumnByName("argument"), - table->GetColumnByName("argument")}, - {table->GetColumnByName("key")}, - { - {"hash_mean", nullptr, "agg_0"}, - {"hash_mean", min_count, "agg_1"}, - }, - use_threads)); + ScalarAggregateOptions min_count(/*skip_nulls=*/true, /*min_count=*/3); + ASSERT_OK_AND_ASSIGN( + Datum aggregated_and_grouped, + internal::GroupBy( + {table->GetColumnByName("argument"), table->GetColumnByName("argument")}, + {table->GetColumnByName("key")}, + { + {"hash_mean", nullptr, "agg_0", "hash_mean"}, + {"hash_mean", min_count, "agg_1", "hash_mean"}, + }, + use_threads)); SortBy({"key_0"}, &aggregated_and_grouped); AssertDatumsApproxEqual(ArrayFromJSON(struct_({ @@ -1063,14 +1059,15 @@ TEST(GroupBy, SumMeanProductScalar) { for (bool use_threads : {true, false}) { SCOPED_TRACE(use_threads ? 
"parallel/merged" : "serial"); - ASSERT_OK_AND_ASSIGN(Datum actual, - GroupByUsingExecPlan(input, {"key"}, - { - {"hash_sum", nullptr, "argument"}, - {"hash_mean", nullptr, "argument"}, - {"hash_product", nullptr, "argument"}, - }, - use_threads, default_exec_context())); + ASSERT_OK_AND_ASSIGN( + Datum actual, + GroupByUsingExecPlan(input, {"key"}, + { + {"hash_sum", nullptr, "argument", "hash_sum"}, + {"hash_mean", nullptr, "argument", "hash_mean"}, + {"hash_product", nullptr, "argument", "hash_product"}, + }, + use_threads, default_exec_context())); Datum expected = ArrayFromJSON(struct_({ field("hash_sum", int64()), field("hash_mean", float64()), @@ -1111,8 +1108,8 @@ TEST(GroupBy, VarianceAndStddev) { batch->GetColumnByName("key"), }, { - {"hash_variance", nullptr, "agg_0"}, - {"hash_stddev", nullptr, "agg_1"}, + {"hash_variance", nullptr, "agg_0", "hash_variance_0"}, + {"hash_stddev", nullptr, "agg_1", "hash_stddev_1"}, })); AssertDatumsApproxEqual(ArrayFromJSON(struct_({ @@ -1153,8 +1150,8 @@ TEST(GroupBy, VarianceAndStddev) { batch->GetColumnByName("key"), }, { - {"hash_variance", nullptr, "agg_0"}, - {"hash_stddev", nullptr, "agg_1"}, + {"hash_variance", nullptr, "agg_0", "hash_variance_0"}, + {"hash_stddev", nullptr, "agg_1", "hash_stddev_1"}, })); AssertDatumsApproxEqual(ArrayFromJSON(struct_({ @@ -1172,20 +1169,21 @@ TEST(GroupBy, VarianceAndStddev) { /*verbose=*/true); // Test ddof - auto variance_options = std::make_shared(/*ddof=*/2); - ASSERT_OK_AND_ASSIGN(aggregated_and_grouped, - internal::GroupBy( - { - batch->GetColumnByName("argument"), - batch->GetColumnByName("argument"), - }, - { - batch->GetColumnByName("key"), - }, - { - {"hash_variance", variance_options, "agg_0"}, - {"hash_stddev", variance_options, "agg_1"}, - })); + VarianceOptions variance_options(/*ddof=*/2); + ASSERT_OK_AND_ASSIGN( + aggregated_and_grouped, + internal::GroupBy( + { + batch->GetColumnByName("argument"), + batch->GetColumnByName("argument"), + }, + { + batch->GetColumnByName("key"), + }, + { + {"hash_variance", variance_options, "agg_0", "hash_variance_0"}, + {"hash_stddev", variance_options, "agg_1", "hash_stddev_0"}, + })); AssertDatumsApproxEqual(ArrayFromJSON(struct_({ field("hash_variance", float64()), @@ -1229,10 +1227,10 @@ TEST(GroupBy, VarianceAndStddevDecimal) { batch->GetColumnByName("key"), }, { - {"hash_variance", nullptr, "agg_0"}, - {"hash_stddev", nullptr, "agg_1"}, - {"hash_variance", nullptr, "agg_2"}, - {"hash_stddev", nullptr, "agg_3"}, + {"hash_variance", nullptr, "agg_0", "hash_variance_0"}, + {"hash_stddev", nullptr, "agg_1", "hash_stddev_1"}, + {"hash_variance", nullptr, "agg_2", "hash_variance_2"}, + {"hash_stddev", nullptr, "agg_3", "hash_stddev_3"}, })); AssertDatumsApproxEqual(ArrayFromJSON(struct_({ @@ -1270,40 +1268,37 @@ TEST(GroupBy, TDigest) { [null, 4] ])"); - auto options1 = std::make_shared(std::vector{0.5, 0.9, 0.99}); - auto options2 = - std::make_shared(std::vector{0.5, 0.9, 0.99}, /*delta=*/50, - /*buffer_size=*/1024); - auto keep_nulls = - std::make_shared(/*q=*/0.5, /*delta=*/100, /*buffer_size=*/500, - /*skip_nulls=*/false, /*min_count=*/0); - auto min_count = - std::make_shared(/*q=*/0.5, /*delta=*/100, /*buffer_size=*/500, - /*skip_nulls=*/true, /*min_count=*/3); - auto keep_nulls_min_count = - std::make_shared(/*q=*/0.5, /*delta=*/100, /*buffer_size=*/500, - /*skip_nulls=*/false, /*min_count=*/3); - ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped, - internal::GroupBy( - { - batch->GetColumnByName("argument"), - 
batch->GetColumnByName("argument"), - batch->GetColumnByName("argument"), - batch->GetColumnByName("argument"), - batch->GetColumnByName("argument"), - batch->GetColumnByName("argument"), - }, - { - batch->GetColumnByName("key"), - }, - { - {"hash_tdigest", nullptr, "agg_0"}, - {"hash_tdigest", options1, "agg_1"}, - {"hash_tdigest", options2, "agg_2"}, - {"hash_tdigest", keep_nulls, "agg_3"}, - {"hash_tdigest", min_count, "agg_4"}, - {"hash_tdigest", keep_nulls_min_count, "agg_5"}, - })); + TDigestOptions options1(std::vector{0.5, 0.9, 0.99}); + TDigestOptions options2(std::vector{0.5, 0.9, 0.99}, /*delta=*/50, + /*buffer_size=*/1024); + TDigestOptions keep_nulls(/*q=*/0.5, /*delta=*/100, /*buffer_size=*/500, + /*skip_nulls=*/false, /*min_count=*/0); + TDigestOptions min_count(/*q=*/0.5, /*delta=*/100, /*buffer_size=*/500, + /*skip_nulls=*/true, /*min_count=*/3); + TDigestOptions keep_nulls_min_count(/*q=*/0.5, /*delta=*/100, /*buffer_size=*/500, + /*skip_nulls=*/false, /*min_count=*/3); + ASSERT_OK_AND_ASSIGN( + Datum aggregated_and_grouped, + internal::GroupBy( + { + batch->GetColumnByName("argument"), + batch->GetColumnByName("argument"), + batch->GetColumnByName("argument"), + batch->GetColumnByName("argument"), + batch->GetColumnByName("argument"), + batch->GetColumnByName("argument"), + }, + { + batch->GetColumnByName("key"), + }, + { + {"hash_tdigest", nullptr, "agg_0", "hash_tdigest_0"}, + {"hash_tdigest", options1, "agg_1", "hash_tdigest_1"}, + {"hash_tdigest", options2, "agg_2", "hash_tdigest_2"}, + {"hash_tdigest", keep_nulls, "agg_3", "hash_tdigest_3"}, + {"hash_tdigest", min_count, "agg_4", "hash_tdigest_4"}, + {"hash_tdigest", keep_nulls_min_count, "agg_5", "hash_tdigest_5"}, + })); AssertDatumsApproxEqual( ArrayFromJSON(struct_({ @@ -1349,8 +1344,8 @@ TEST(GroupBy, TDigestDecimal) { }, {batch->GetColumnByName("key")}, { - {"hash_tdigest", nullptr, "agg_0"}, - {"hash_tdigest", nullptr, "agg_1"}, + {"hash_tdigest", nullptr, "agg_0", "hash_tdigest_0"}, + {"hash_tdigest", nullptr, "agg_1", "hash_tdigest_1"}, })); AssertDatumsApproxEqual( @@ -1395,24 +1390,27 @@ TEST(GroupBy, ApproximateMedian) { /*skip_nulls=*/true, /*min_count=*/3); auto keep_nulls_min_count = std::make_shared( /*skip_nulls=*/false, /*min_count=*/3); - ASSERT_OK_AND_ASSIGN( - Datum aggregated_and_grouped, - internal::GroupBy( - { - batch->GetColumnByName("argument"), - batch->GetColumnByName("argument"), - batch->GetColumnByName("argument"), - batch->GetColumnByName("argument"), - }, - { - batch->GetColumnByName("key"), - }, - { - {"hash_approximate_median", options, "agg_0"}, - {"hash_approximate_median", keep_nulls, "agg_1"}, - {"hash_approximate_median", min_count, "agg_2"}, - {"hash_approximate_median", keep_nulls_min_count, "agg_3"}, - })); + ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped, + internal::GroupBy( + { + batch->GetColumnByName("argument"), + batch->GetColumnByName("argument"), + batch->GetColumnByName("argument"), + batch->GetColumnByName("argument"), + }, + { + batch->GetColumnByName("key"), + }, + { + {"hash_approximate_median", options, "agg_0", + "hash_approximate_median_0"}, + {"hash_approximate_median", keep_nulls, "agg_1", + "hash_approximate_median_1"}, + {"hash_approximate_median", min_count, "agg_2", + "hash_approximate_median_2"}, + {"hash_approximate_median", keep_nulls_min_count, + "agg_3", "hash_approximate_median_3"}, + })); AssertDatumsApproxEqual(ArrayFromJSON(struct_({ field("hash_approximate_median", float64()), @@ -1450,17 +1448,18 @@ TEST(GroupBy, 
StddevVarianceTDigestScalar) { for (bool use_threads : {false}) { SCOPED_TRACE(use_threads ? "parallel/merged" : "serial"); - ASSERT_OK_AND_ASSIGN(Datum actual, - GroupByUsingExecPlan(input, {"key"}, - { - {"hash_stddev", nullptr, "argument"}, - {"hash_variance", nullptr, "argument"}, - {"hash_tdigest", nullptr, "argument"}, - {"hash_stddev", nullptr, "argument1"}, - {"hash_variance", nullptr, "argument1"}, - {"hash_tdigest", nullptr, "argument1"}, - }, - use_threads, default_exec_context())); + ASSERT_OK_AND_ASSIGN( + Datum actual, + GroupByUsingExecPlan(input, {"key"}, + { + {"hash_stddev", nullptr, "argument", "hash_stddev"}, + {"hash_variance", nullptr, "argument", "hash_variance"}, + {"hash_tdigest", nullptr, "argument", "hash_tdigest"}, + {"hash_stddev", nullptr, "argument1", "hash_stddev"}, + {"hash_variance", nullptr, "argument1", "hash_variance"}, + {"hash_tdigest", nullptr, "argument1", "hash_tdigest"}, + }, + use_threads, default_exec_context())); Datum expected = ArrayFromJSON(struct_({ field("hash_stddev", float64()), @@ -1510,16 +1509,17 @@ TEST(GroupBy, VarianceOptions) { SCOPED_TRACE(use_threads ? "parallel/merged" : "serial"); ASSERT_OK_AND_ASSIGN( Datum actual, - GroupByUsingExecPlan(input, {"key"}, - { - {"hash_stddev", keep_nulls, "argument"}, - {"hash_stddev", min_count, "argument"}, - {"hash_stddev", keep_nulls_min_count, "argument"}, - {"hash_variance", keep_nulls, "argument"}, - {"hash_variance", min_count, "argument"}, - {"hash_variance", keep_nulls_min_count, "argument"}, - }, - use_threads, default_exec_context())); + GroupByUsingExecPlan( + input, {"key"}, + { + {"hash_stddev", keep_nulls, "argument", "hash_stddev"}, + {"hash_stddev", min_count, "argument", "hash_stddev"}, + {"hash_stddev", keep_nulls_min_count, "argument", "hash_stddev"}, + {"hash_variance", keep_nulls, "argument", "hash_variance"}, + {"hash_variance", min_count, "argument", "hash_variance"}, + {"hash_variance", keep_nulls_min_count, "argument", "hash_variance"}, + }, + use_threads, default_exec_context())); Datum expected = ArrayFromJSON(struct_({ field("hash_stddev", float64()), field("hash_stddev", float64()), @@ -1540,16 +1540,17 @@ TEST(GroupBy, VarianceOptions) { ASSERT_OK_AND_ASSIGN( actual, - GroupByUsingExecPlan(input, {"key"}, - { - {"hash_stddev", keep_nulls, "argument1"}, - {"hash_stddev", min_count, "argument1"}, - {"hash_stddev", keep_nulls_min_count, "argument1"}, - {"hash_variance", keep_nulls, "argument1"}, - {"hash_variance", min_count, "argument1"}, - {"hash_variance", keep_nulls_min_count, "argument1"}, - }, - use_threads, default_exec_context())); + GroupByUsingExecPlan( + input, {"key"}, + { + {"hash_stddev", keep_nulls, "argument1", "hash_stddev"}, + {"hash_stddev", min_count, "argument1", "hash_stddev"}, + {"hash_stddev", keep_nulls_min_count, "argument1", "hash_stddev"}, + {"hash_variance", keep_nulls, "argument1", "hash_variance"}, + {"hash_variance", min_count, "argument1", "hash_variance"}, + {"hash_variance", keep_nulls_min_count, "argument1", "hash_variance"}, + }, + use_threads, default_exec_context())); expected = ArrayFromJSON(struct_({ field("hash_stddev", float64()), field("hash_stddev", float64()), @@ -1607,9 +1608,9 @@ TEST(GroupBy, MinMaxOnly) { }, {table->GetColumnByName("key")}, { - {"hash_min_max", nullptr, "agg_0"}, - {"hash_min_max", nullptr, "agg_1"}, - {"hash_min_max", nullptr, "agg_2"}, + {"hash_min_max", nullptr, "agg_0", "hash_min_max"}, + {"hash_min_max", nullptr, "agg_1", "hash_min_max"}, + {"hash_min_max", nullptr, "agg_2", "hash_min_max"}, }, 
use_threads, use_exec_plan)); ValidateOutput(aggregated_and_grouped); @@ -1711,11 +1712,11 @@ TEST(GroupBy, MinMaxTypes) { auto table = TableFromJSON(in_schema, (ty->name() == "date64") ? date64_table : default_table); - ASSERT_OK_AND_ASSIGN( - Datum aggregated_and_grouped, - GroupByTest({table->GetColumnByName("argument0")}, - {table->GetColumnByName("key")}, {{"hash_min_max", nullptr, "agg_0"}}, - /*use_threads=*/true, /*use_exec_plan=*/true)); + ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped, + GroupByTest({table->GetColumnByName("argument0")}, + {table->GetColumnByName("key")}, + {{"hash_min_max", nullptr, "agg_0", "hash_min_max"}}, + /*use_threads=*/true, /*use_exec_plan=*/true)); ValidateOutput(aggregated_and_grouped); SortBy({"key_0"}, &aggregated_and_grouped); @@ -1768,8 +1769,8 @@ TEST(GroupBy, MinMaxDecimal) { }, {table->GetColumnByName("key")}, { - {"hash_min_max", nullptr, "agg_0"}, - {"hash_min_max", nullptr, "agg_1"}, + {"hash_min_max", nullptr, "agg_0", "hash_min_max"}, + {"hash_min_max", nullptr, "agg_1", "hash_min_max"}, }, use_threads, use_exec_plan)); ValidateOutput(aggregated_and_grouped); @@ -1827,11 +1828,12 @@ TEST(GroupBy, MinMaxBinary) { [null, 3] ])"}); - ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped, - GroupByTest({table->GetColumnByName("argument0")}, - {table->GetColumnByName("key")}, - {{"hash_min_max", nullptr, "agg_0"}}, - use_threads, use_exec_plan)); + ASSERT_OK_AND_ASSIGN( + Datum aggregated_and_grouped, + GroupByTest({table->GetColumnByName("argument0")}, + {table->GetColumnByName("key")}, + {{"hash_min_max", nullptr, "agg_0", "hash_min_max"}}, use_threads, + use_exec_plan)); ValidateOutput(aggregated_and_grouped); SortBy({"key_0"}, &aggregated_and_grouped); @@ -1885,7 +1887,8 @@ TEST(GroupBy, MinMaxFixedSizeBinary) { Datum aggregated_and_grouped, GroupByTest({table->GetColumnByName("argument0")}, {table->GetColumnByName("key")}, - {{"hash_min_max", nullptr, "agg_0"}}, use_threads, use_exec_plan)); + {{"hash_min_max", nullptr, "agg_0", "hash_min_max"}}, use_threads, + use_exec_plan)); ValidateOutput(aggregated_and_grouped); SortBy({"key_0"}, &aggregated_and_grouped); @@ -1938,8 +1941,8 @@ TEST(GroupBy, MinOrMax) { table->GetColumnByName("argument")}, {table->GetColumnByName("key")}, { - {"hash_min", nullptr, "agg_0"}, - {"hash_max", nullptr, "agg_1"}, + {"hash_min", nullptr, "agg_0", "hash_min"}, + {"hash_max", nullptr, "agg_1", "hash_max"}, }, /*use_threads=*/true, /*use_exec_plan=*/true)); SortBy({"key_0"}, &aggregated_and_grouped); @@ -1975,7 +1978,8 @@ TEST(GroupBy, MinMaxScalar) { SCOPED_TRACE(use_threads ? 
"parallel/merged" : "serial"); ASSERT_OK_AND_ASSIGN( Datum actual, - GroupByUsingExecPlan(input, {"key"}, {{"hash_min_max", nullptr, "argument"}}, + GroupByUsingExecPlan(input, {"key"}, + {{"hash_min_max", nullptr, "argument", "hash_min_max"}}, use_threads, default_exec_context())); Datum expected = ArrayFromJSON(struct_({ @@ -2039,14 +2043,14 @@ TEST(GroupBy, AnyAndAll) { }, {table->GetColumnByName("key")}, { - {"hash_any", no_min, "agg_0"}, - {"hash_any", min_count, "agg_1"}, - {"hash_any", keep_nulls, "agg_2"}, - {"hash_any", keep_nulls_min_count, "agg_3"}, - {"hash_all", no_min, "agg_4"}, - {"hash_all", min_count, "agg_5"}, - {"hash_all", keep_nulls, "agg_6"}, - {"hash_all", keep_nulls_min_count, "agg_7"}, + {"hash_any", no_min, "agg_0", "hash_any"}, + {"hash_any", min_count, "agg_1", "hash_any"}, + {"hash_any", keep_nulls, "agg_2", "hash_any"}, + {"hash_any", keep_nulls_min_count, "agg_3", "hash_any"}, + {"hash_all", no_min, "agg_4", "hash_all"}, + {"hash_all", min_count, "agg_5", "hash_all"}, + {"hash_all", keep_nulls, "agg_6", "hash_all"}, + {"hash_all", keep_nulls_min_count, "agg_7", "hash_all"}, }, use_threads)); SortBy({"key_0"}, &aggregated_and_grouped); @@ -2096,15 +2100,16 @@ TEST(GroupBy, AnyAllScalar) { std::make_shared(/*skip_nulls=*/false, /*min_count=*/0); for (bool use_threads : {true, false}) { SCOPED_TRACE(use_threads ? "parallel/merged" : "serial"); - ASSERT_OK_AND_ASSIGN(Datum actual, - GroupByUsingExecPlan(input, {"key"}, - { - {"hash_any", nullptr, "argument"}, - {"hash_all", nullptr, "argument"}, - {"hash_any", keep_nulls, "argument"}, - {"hash_all", keep_nulls, "argument"}, - }, - use_threads, default_exec_context())); + ASSERT_OK_AND_ASSIGN( + Datum actual, + GroupByUsingExecPlan(input, {"key"}, + { + {"hash_any", nullptr, "argument", "hash_any"}, + {"hash_all", nullptr, "argument", "hash_all"}, + {"hash_any", keep_nulls, "argument", "hash_any"}, + {"hash_all", keep_nulls, "argument", "hash_all"}, + }, + use_threads, default_exec_context())); Datum expected = ArrayFromJSON(struct_({ field("hash_any", boolean()), field("hash_all", boolean()), @@ -2159,22 +2164,23 @@ TEST(GroupBy, CountDistinct) { [3, null] ])"}); - ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped, - internal::GroupBy( - { - table->GetColumnByName("argument"), - table->GetColumnByName("argument"), - table->GetColumnByName("argument"), - }, - { - table->GetColumnByName("key"), - }, - { - {"hash_count_distinct", all, "agg_0"}, - {"hash_count_distinct", only_valid, "agg_1"}, - {"hash_count_distinct", only_null, "agg_2"}, - }, - use_threads)); + ASSERT_OK_AND_ASSIGN( + Datum aggregated_and_grouped, + internal::GroupBy( + { + table->GetColumnByName("argument"), + table->GetColumnByName("argument"), + table->GetColumnByName("argument"), + }, + { + table->GetColumnByName("key"), + }, + { + {"hash_count_distinct", all, "agg_0", "hash_count_distinct"}, + {"hash_count_distinct", only_valid, "agg_1", "hash_count_distinct"}, + {"hash_count_distinct", only_null, "agg_2", "hash_count_distinct"}, + }, + use_threads)); SortBy({"key_0"}, &aggregated_and_grouped); ValidateOutput(aggregated_and_grouped); @@ -2225,22 +2231,23 @@ TEST(GroupBy, CountDistinct) { ["b", null] ])"}); - ASSERT_OK_AND_ASSIGN(aggregated_and_grouped, - internal::GroupBy( - { - table->GetColumnByName("argument"), - table->GetColumnByName("argument"), - table->GetColumnByName("argument"), - }, - { - table->GetColumnByName("key"), - }, - { - {"hash_count_distinct", all, "agg_0"}, - {"hash_count_distinct", only_valid, "agg_1"}, - 
{"hash_count_distinct", only_null, "agg_2"}, - }, - use_threads)); + ASSERT_OK_AND_ASSIGN( + aggregated_and_grouped, + internal::GroupBy( + { + table->GetColumnByName("argument"), + table->GetColumnByName("argument"), + table->GetColumnByName("argument"), + }, + { + table->GetColumnByName("key"), + }, + { + {"hash_count_distinct", all, "agg_0", "hash_count_distinct"}, + {"hash_count_distinct", only_valid, "agg_1", "hash_count_distinct"}, + {"hash_count_distinct", only_null, "agg_2", "hash_count_distinct"}, + }, + use_threads)); ValidateOutput(aggregated_and_grouped); SortBy({"key_0"}, &aggregated_and_grouped); @@ -2271,22 +2278,23 @@ TEST(GroupBy, CountDistinct) { ])", }); - ASSERT_OK_AND_ASSIGN(aggregated_and_grouped, - internal::GroupBy( - { - table->GetColumnByName("argument"), - table->GetColumnByName("argument"), - table->GetColumnByName("argument"), - }, - { - table->GetColumnByName("key"), - }, - { - {"hash_count_distinct", all, "agg_0"}, - {"hash_count_distinct", only_valid, "agg_1"}, - {"hash_count_distinct", only_null, "agg_2"}, - }, - use_threads)); + ASSERT_OK_AND_ASSIGN( + aggregated_and_grouped, + internal::GroupBy( + { + table->GetColumnByName("argument"), + table->GetColumnByName("argument"), + table->GetColumnByName("argument"), + }, + { + table->GetColumnByName("key"), + }, + { + {"hash_count_distinct", all, "agg_0", "hash_count_distinct"}, + {"hash_count_distinct", only_valid, "agg_1", "hash_count_distinct"}, + {"hash_count_distinct", only_null, "agg_2", "hash_count_distinct"}, + }, + use_threads)); ValidateOutput(aggregated_and_grouped); SortBy({"key_0"}, &aggregated_and_grouped); @@ -2354,9 +2362,9 @@ TEST(GroupBy, Distinct) { table->GetColumnByName("key"), }, { - {"hash_distinct", all, "agg_0"}, - {"hash_distinct", only_valid, "agg_1"}, - {"hash_distinct", only_null, "agg_2"}, + {"hash_distinct", all, "agg_0", "hash_distinct"}, + {"hash_distinct", only_valid, "agg_1", "hash_distinct"}, + {"hash_distinct", only_null, "agg_2", "hash_distinct"}, }, use_threads)); ValidateOutput(aggregated_and_grouped); @@ -2427,9 +2435,9 @@ TEST(GroupBy, Distinct) { table->GetColumnByName("key"), }, { - {"hash_distinct", all, "agg_0"}, - {"hash_distinct", only_valid, "agg_1"}, - {"hash_distinct", only_null, "agg_2"}, + {"hash_distinct", all, "agg_0", "hash_distinct"}, + {"hash_distinct", only_valid, "agg_1", "hash_distinct"}, + {"hash_distinct", only_null, "agg_2", "hash_distinct"}, }, use_threads)); ValidateOutput(aggregated_and_grouped); @@ -2491,12 +2499,12 @@ TEST(GroupBy, OneMiscTypes) { }, {table->GetColumnByName("key")}, { - {"hash_one", nullptr, "agg_0"}, - {"hash_one", nullptr, "agg_1"}, - {"hash_one", nullptr, "agg_2"}, - {"hash_one", nullptr, "agg_3"}, - {"hash_one", nullptr, "agg_4"}, - {"hash_one", nullptr, "agg_5"}, + {"hash_one", nullptr, "agg_0", "hash_one"}, + {"hash_one", nullptr, "agg_1", "hash_one"}, + {"hash_one", nullptr, "agg_2", "hash_one"}, + {"hash_one", nullptr, "agg_3", "hash_one"}, + {"hash_one", nullptr, "agg_4", "hash_one"}, + {"hash_one", nullptr, "agg_5", "hash_one"}, }, use_threads, use_exec_plan)); ValidateOutput(aggregated_and_grouped); @@ -2619,11 +2627,11 @@ TEST(GroupBy, OneNumericTypes) { auto table = TableFromJSON(in_schema, (type->name() == "date64") ? 
temporal_table_json : numeric_table_json); - ASSERT_OK_AND_ASSIGN( - Datum aggregated_and_grouped, - GroupByTest({table->GetColumnByName("argument0")}, - {table->GetColumnByName("key")}, {{"hash_one", nullptr, "agg_0"}}, - use_threads, use_exec_plan)); + ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped, + GroupByTest({table->GetColumnByName("argument0")}, + {table->GetColumnByName("key")}, + {{"hash_one", nullptr, "agg_0", "hash_one"}}, + use_threads, use_exec_plan)); ValidateOutput(aggregated_and_grouped); SortBy({"key_0"}, &aggregated_and_grouped); @@ -2682,11 +2690,11 @@ TEST(GroupBy, OneBinaryTypes) { [null, 3] ])"}); - ASSERT_OK_AND_ASSIGN( - Datum aggregated_and_grouped, - GroupByTest({table->GetColumnByName("argument0")}, - {table->GetColumnByName("key")}, {{"hash_one", nullptr, "agg_0"}}, - use_threads, use_exec_plan)); + ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped, + GroupByTest({table->GetColumnByName("argument0")}, + {table->GetColumnByName("key")}, + {{"hash_one", nullptr, "agg_0", "hash_one"}}, + use_threads, use_exec_plan)); ValidateOutput(aggregated_and_grouped); SortBy({"key_0"}, &aggregated_and_grouped); @@ -2718,9 +2726,9 @@ TEST(GroupBy, OneScalar) { for (bool use_threads : {true, false}) { SCOPED_TRACE(use_threads ? "parallel/merged" : "serial"); ASSERT_OK_AND_ASSIGN( - Datum actual, - GroupByUsingExecPlan(input, {"key"}, {{"hash_one", nullptr, "argument"}}, - use_threads, default_exec_context())); + Datum actual, GroupByUsingExecPlan( + input, {"key"}, {{"hash_one", nullptr, "argument", "hash_one"}}, + use_threads, default_exec_context())); const auto& struct_arr = actual.array_as(); // Check the key column @@ -2780,7 +2788,7 @@ TEST(GroupBy, ListNumeric) { table->GetColumnByName("key"), }, { - {"hash_list", nullptr, "agg_0"}, + {"hash_list", nullptr, "agg_0", "hash_list"}, }, use_threads)); ValidateOutput(aggregated_and_grouped); @@ -2851,7 +2859,7 @@ TEST(GroupBy, ListNumeric) { table->GetColumnByName("key"), }, { - {"hash_list", nullptr, "agg_0"}, + {"hash_list", nullptr, "agg_0", "hash_list"}, }, use_threads)); ValidateOutput(aggregated_and_grouped); @@ -2920,7 +2928,7 @@ TEST(GroupBy, ListBinaryTypes) { table->GetColumnByName("key"), }, { - {"hash_list", nullptr, "agg_0"}, + {"hash_list", nullptr, "agg_0", "hash_list"}, }, use_threads)); ValidateOutput(aggregated_and_grouped); @@ -2982,7 +2990,7 @@ TEST(GroupBy, ListBinaryTypes) { table->GetColumnByName("key"), }, { - {"hash_list", nullptr, "agg_0"}, + {"hash_list", nullptr, "agg_0", "hash_list"}, }, use_threads)); ValidateOutput(aggregated_and_grouped); @@ -3060,12 +3068,12 @@ TEST(GroupBy, ListMiscTypes) { }, {table->GetColumnByName("key")}, { - {"hash_list", nullptr, "agg_0"}, - {"hash_list", nullptr, "agg_1"}, - {"hash_list", nullptr, "agg_2"}, - {"hash_list", nullptr, "agg_3"}, - {"hash_list", nullptr, "agg_4"}, - {"hash_list", nullptr, "agg_5"}, + {"hash_list", nullptr, "agg_0", "hash_list"}, + {"hash_list", nullptr, "agg_1", "hash_list"}, + {"hash_list", nullptr, "agg_2", "hash_list"}, + {"hash_list", nullptr, "agg_3", "hash_list"}, + {"hash_list", nullptr, "agg_4", "hash_list"}, + {"hash_list", nullptr, "agg_5", "hash_list"}, }, use_threads, use_exec_plan)); ValidateOutput(aggregated_and_grouped); @@ -3213,12 +3221,12 @@ TEST(GroupBy, CountAndSum) { batch->GetColumnByName("key"), }, { - {"hash_count", count_options, "agg_0"}, - {"hash_count", count_nulls, "agg_1"}, - {"hash_count", count_all, "agg_2"}, - {"hash_sum", nullptr, "agg_3"}, - {"hash_sum", min_count, "agg_4"}, - {"hash_sum", nullptr, 
"agg_5"}, + {"hash_count", count_options, "agg_0", "hash_count"}, + {"hash_count", count_nulls, "agg_1", "hash_count"}, + {"hash_count", count_all, "agg_2", "hash_count"}, + {"hash_sum", nullptr, "agg_3", "hash_sum"}, + {"hash_sum", min_count, "agg_4", "hash_sum"}, + {"hash_sum", nullptr, "agg_5", "hash_sum"}, })); AssertDatumsEqual( @@ -3270,9 +3278,9 @@ TEST(GroupBy, Product) { batch->GetColumnByName("key"), }, { - {"hash_product", nullptr, "agg_0"}, - {"hash_product", nullptr, "agg_1"}, - {"hash_product", min_count, "agg_2"}, + {"hash_product", nullptr, "agg_0", "hash_product"}, + {"hash_product", nullptr, "agg_1", "hash_product"}, + {"hash_product", min_count, "agg_2", "hash_product"}, })); AssertDatumsApproxEqual(ArrayFromJSON(struct_({ @@ -3297,16 +3305,17 @@ TEST(GroupBy, Product) { [8589934593, 1] ])"); - ASSERT_OK_AND_ASSIGN(aggregated_and_grouped, internal::GroupBy( - { - batch->GetColumnByName("argument"), - }, - { - batch->GetColumnByName("key"), - }, - { - {"hash_product", nullptr, "agg_0"}, - })); + ASSERT_OK_AND_ASSIGN(aggregated_and_grouped, + internal::GroupBy( + { + batch->GetColumnByName("argument"), + }, + { + batch->GetColumnByName("key"), + }, + { + {"hash_product", nullptr, "agg_0", "hash_product"}, + })); AssertDatumsApproxEqual(ArrayFromJSON(struct_({ field("hash_product", int64()), @@ -3349,12 +3358,12 @@ TEST(GroupBy, SumMeanProductKeepNulls) { batch->GetColumnByName("key"), }, { - {"hash_sum", keep_nulls, "agg_0"}, - {"hash_sum", min_count, "agg_1"}, - {"hash_mean", keep_nulls, "agg_2"}, - {"hash_mean", min_count, "agg_3"}, - {"hash_product", keep_nulls, "agg_4"}, - {"hash_product", min_count, "agg_5"}, + {"hash_sum", keep_nulls, "agg_0", "hash_sum"}, + {"hash_sum", min_count, "agg_1", "hash_sum"}, + {"hash_mean", keep_nulls, "agg_2", "hash_mean"}, + {"hash_mean", min_count, "agg_3", "hash_mean"}, + {"hash_product", keep_nulls, "agg_4", "hash_product"}, + {"hash_product", min_count, "agg_5", "hash_product"}, })); AssertDatumsApproxEqual(ArrayFromJSON(struct_({ @@ -3398,7 +3407,7 @@ TEST(GroupBy, SumOnlyStringAndDictKeys) { internal::GroupBy({batch->GetColumnByName("argument")}, {batch->GetColumnByName("key")}, { - {"hash_sum", nullptr, "agg_0"}, + {"hash_sum", nullptr, "agg_0", "hash_sum"}, })); SortBy({"key_0"}, &aggregated_and_grouped); @@ -3441,11 +3450,11 @@ TEST(GroupBy, ConcreteCaseWithValidateGroupBy) { using internal::Aggregate; for (auto agg : { - Aggregate{"hash_sum", nullptr, "agg_0"}, - Aggregate{"hash_count", non_null, "agg_1"}, - Aggregate{"hash_count", nulls, "agg_2"}, - Aggregate{"hash_min_max", nullptr, "agg_3"}, - Aggregate{"hash_min_max", keepna, "agg_4"}, + Aggregate{"hash_sum", nullptr, "agg_0", "hash_sum"}, + Aggregate{"hash_count", non_null, "agg_1", "hash_count"}, + Aggregate{"hash_count", nulls, "agg_2", "hash_count"}, + Aggregate{"hash_min_max", nullptr, "agg_3", "hash_min_max"}, + Aggregate{"hash_min_max", keepna, "agg_4", "hash_min_max"}, }) { SCOPED_TRACE(agg.function); ValidateGroupBy({agg}, {batch->GetColumnByName("argument")}, @@ -3469,8 +3478,8 @@ TEST(GroupBy, CountNull) { using internal::Aggregate; for (auto agg : { - Aggregate{"hash_count", keepna, "agg_0"}, - Aggregate{"hash_count", skipna, "agg_1"}, + Aggregate{"hash_count", keepna, "agg_0", "hash_count"}, + Aggregate{"hash_count", skipna, "agg_1", "hash_count"}, }) { SCOPED_TRACE(agg.function); ValidateGroupBy({agg}, {batch->GetColumnByName("argument")}, @@ -3494,7 +3503,7 @@ TEST(GroupBy, RandomArraySum) { ValidateGroupBy( { - {"hash_sum", options, "agg_0"}, + {"hash_sum", 
options, "agg_0", "hash_sum"}, }, {batch->GetColumnByName("argument")}, {batch->GetColumnByName("key")}); } @@ -3527,9 +3536,9 @@ TEST(GroupBy, WithChunkedArray) { table->GetColumnByName("key"), }, { - {"hash_count", nullptr, "agg_0"}, - {"hash_sum", nullptr, "agg_1"}, - {"hash_min_max", nullptr, "agg_2"}, + {"hash_count", nullptr, "agg_0", "hash_count"}, + {"hash_sum", nullptr, "agg_1", "hash_sum"}, + {"hash_min_max", nullptr, "agg_2", "hash_min_max"}, })); AssertDatumsEqual(ArrayFromJSON(struct_({ @@ -3565,7 +3574,7 @@ TEST(GroupBy, MinMaxWithNewGroupsInChunkedArray) { table->GetColumnByName("key"), }, { - {"hash_min_max", nullptr, "agg_1"}, + {"hash_min_max", nullptr, "agg_1", "hash_min_max"}, })); AssertDatumsEqual(ArrayFromJSON(struct_({ @@ -3601,7 +3610,7 @@ TEST(GroupBy, SmallChunkSizeSumOnly) { internal::GroupBy({batch->GetColumnByName("argument")}, {batch->GetColumnByName("key")}, { - {"hash_sum", nullptr, "agg_0"}, + {"hash_sum", nullptr, "agg_0", "hash_sum"}, }, small_chunksize_context())); AssertDatumsEqual(ArrayFromJSON(struct_({ @@ -3653,9 +3662,9 @@ TEST(GroupBy, CountWithNullType) { }, {table->GetColumnByName("key")}, { - {"hash_count", all, "agg_0"}, - {"hash_count", only_valid, "agg_1"}, - {"hash_count", only_null, "agg_2"}, + {"hash_count", all, "agg_0", "hash_count"}, + {"hash_count", only_valid, "agg_1", "hash_count"}, + {"hash_count", only_null, "agg_2", "hash_count"}, }, use_threads, use_exec_plan)); SortBy({"key_0"}, &aggregated_and_grouped); @@ -3698,9 +3707,9 @@ TEST(GroupBy, CountWithNullTypeEmptyTable) { }, {table->GetColumnByName("key")}, { - {"hash_count", all, "agg_0"}, - {"hash_count", only_valid, "agg_1"}, - {"hash_count", only_null, "agg_2"}, + {"hash_count", all, "agg_0", "hash_count"}, + {"hash_count", only_valid, "agg_1", "hash_count"}, + {"hash_count", only_null, "agg_2", "hash_count"}, }, use_threads, use_exec_plan)); auto struct_arr = aggregated_and_grouped.array_as(); @@ -3743,10 +3752,10 @@ TEST(GroupBy, SingleNullTypeKey) { }, {table->GetColumnByName("key")}, { - {"hash_count", nullptr, "agg_0"}, - {"hash_sum", nullptr, "agg_1"}, - {"hash_mean", nullptr, "agg_2"}, - {"hash_min_max", nullptr, "agg_3"}, + {"hash_count", nullptr, "agg_0", "hash_count"}, + {"hash_sum", nullptr, "agg_1", "hash_sum"}, + {"hash_mean", nullptr, "agg_2", "hash_mean"}, + {"hash_min_max", nullptr, "agg_3", "hash_min_max"}, }, use_threads, use_exec_plan)); SortBy({"key_0"}, &aggregated_and_grouped); @@ -3803,9 +3812,9 @@ TEST(GroupBy, MultipleKeysIncludesNullType) { }, {table->GetColumnByName("key_0"), table->GetColumnByName("key_1")}, { - {"hash_count", nullptr, "agg_0"}, - {"hash_sum", nullptr, "agg_1"}, - {"hash_min_max", nullptr, "agg_2"}, + {"hash_count", nullptr, "agg_0", "hash_count"}, + {"hash_sum", nullptr, "agg_1", "hash_sum"}, + {"hash_min_max", nullptr, "agg_2", "hash_min_max"}, }, use_threads, use_exec_plan)); SortBy({"key_0"}, &aggregated_and_grouped); @@ -3864,22 +3873,23 @@ TEST(GroupBy, SumNullType) { for (bool use_exec_plan : {false, true}) { for (bool use_threads : {true, false}) { SCOPED_TRACE(use_threads ? 
"parallel/merged" : "serial"); - ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped, - GroupByTest( - { - table->GetColumnByName("argument"), - table->GetColumnByName("argument"), - table->GetColumnByName("argument"), - table->GetColumnByName("argument"), - }, - {table->GetColumnByName("key")}, - { - {"hash_sum", no_min, "agg_0"}, - {"hash_sum", keep_nulls, "agg_1"}, - {"hash_sum", min_count, "agg_2"}, - {"hash_sum", keep_nulls_min_count, "agg_3"}, - }, - use_threads, use_exec_plan)); + ASSERT_OK_AND_ASSIGN( + Datum aggregated_and_grouped, + GroupByTest( + { + table->GetColumnByName("argument"), + table->GetColumnByName("argument"), + table->GetColumnByName("argument"), + table->GetColumnByName("argument"), + }, + {table->GetColumnByName("key")}, + { + {"hash_sum", no_min, "agg_0", "hash_sum"}, + {"hash_sum", keep_nulls, "agg_1", "hash_sum"}, + {"hash_sum", min_count, "agg_2", "hash_sum"}, + {"hash_sum", keep_nulls_min_count, "agg_3", "hash_sum"}, + }, + use_threads, use_exec_plan)); SortBy({"key_0"}, &aggregated_and_grouped); AssertDatumsEqual(ArrayFromJSON(struct_({ @@ -3932,22 +3942,23 @@ TEST(GroupBy, ProductNullType) { for (bool use_exec_plan : {false, true}) { for (bool use_threads : {true, false}) { SCOPED_TRACE(use_threads ? "parallel/merged" : "serial"); - ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped, - GroupByTest( - { - table->GetColumnByName("argument"), - table->GetColumnByName("argument"), - table->GetColumnByName("argument"), - table->GetColumnByName("argument"), - }, - {table->GetColumnByName("key")}, - { - {"hash_product", no_min, "agg_0"}, - {"hash_product", keep_nulls, "agg_1"}, - {"hash_product", min_count, "agg_2"}, - {"hash_product", keep_nulls_min_count, "agg_3"}, - }, - use_threads, use_exec_plan)); + ASSERT_OK_AND_ASSIGN( + Datum aggregated_and_grouped, + GroupByTest( + { + table->GetColumnByName("argument"), + table->GetColumnByName("argument"), + table->GetColumnByName("argument"), + table->GetColumnByName("argument"), + }, + {table->GetColumnByName("key")}, + { + {"hash_product", no_min, "agg_0", "hash_product"}, + {"hash_product", keep_nulls, "agg_1", "hash_product"}, + {"hash_product", min_count, "agg_2", "hash_product"}, + {"hash_product", keep_nulls_min_count, "agg_3", "hash_product"}, + }, + use_threads, use_exec_plan)); SortBy({"key_0"}, &aggregated_and_grouped); AssertDatumsEqual(ArrayFromJSON(struct_({ @@ -4000,22 +4011,23 @@ TEST(GroupBy, MeanNullType) { for (bool use_exec_plan : {false, true}) { for (bool use_threads : {true, false}) { SCOPED_TRACE(use_threads ? 
"parallel/merged" : "serial"); - ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped, - GroupByTest( - { - table->GetColumnByName("argument"), - table->GetColumnByName("argument"), - table->GetColumnByName("argument"), - table->GetColumnByName("argument"), - }, - {table->GetColumnByName("key")}, - { - {"hash_mean", no_min, "agg_0"}, - {"hash_mean", keep_nulls, "agg_1"}, - {"hash_mean", min_count, "agg_2"}, - {"hash_mean", keep_nulls_min_count, "agg_3"}, - }, - use_threads, use_exec_plan)); + ASSERT_OK_AND_ASSIGN( + Datum aggregated_and_grouped, + GroupByTest( + { + table->GetColumnByName("argument"), + table->GetColumnByName("argument"), + table->GetColumnByName("argument"), + table->GetColumnByName("argument"), + }, + {table->GetColumnByName("key")}, + { + {"hash_mean", no_min, "agg_0", "hash_mean"}, + {"hash_mean", keep_nulls, "agg_1", "hash_mean"}, + {"hash_mean", min_count, "agg_2", "hash_mean"}, + {"hash_mean", keep_nulls_min_count, "agg_3", "hash_mean"}, + }, + use_threads, use_exec_plan)); SortBy({"key_0"}, &aggregated_and_grouped); AssertDatumsEqual(ArrayFromJSON(struct_({ @@ -4062,9 +4074,9 @@ TEST(GroupBy, NullTypeEmptyTable) { }, {table->GetColumnByName("key")}, { - {"hash_sum", no_min, "agg_0"}, - {"hash_product", min_count, "agg_1"}, - {"hash_mean", keep_nulls, "agg_2"}, + {"hash_sum", no_min, "agg_0", "hash_sum"}, + {"hash_product", min_count, "agg_1", "hash_product"}, + {"hash_mean", keep_nulls, "agg_2", "hash_mean"}, }, use_threads, use_exec_plan)); auto struct_arr = aggregated_and_grouped.array_as(); diff --git a/cpp/src/arrow/dataset/scanner.cc b/cpp/src/arrow/dataset/scanner.cc index b19e787f94d..2295665b161 100644 --- a/cpp/src/arrow/dataset/scanner.cc +++ b/cpp/src/arrow/dataset/scanner.cc @@ -676,8 +676,7 @@ Result AsyncScanner::CountRows() { options}}, {"project", compute::ProjectNodeOptions{{options->filter}, {"mask"}}}, {"aggregate", compute::AggregateNodeOptions{{compute::internal::Aggregate{ - "sum", nullptr, "mask"}}, - /*names=*/{"selected_count"}}}, + "sum", nullptr, "mask", "selected_count"}}}}, {"sink", compute::SinkNodeOptions{&sink_gen}}, }) .AddToPlan(plan.get())); diff --git a/cpp/src/arrow/dataset/scanner_test.cc b/cpp/src/arrow/dataset/scanner_test.cc index 1d3e0b403ef..6dd75db114a 100644 --- a/cpp/src/arrow/dataset/scanner_test.cc +++ b/cpp/src/arrow/dataset/scanner_test.cc @@ -1838,9 +1838,8 @@ TEST(ScanNode, MinimalScalarAggEndToEnd) { ASSERT_OK_AND_ASSIGN( compute::ExecNode * aggregate, compute::MakeExecNode("aggregate", plan.get(), {project}, - compute::AggregateNodeOptions{ - {compute::internal::Aggregate{"sum", nullptr, "a * 2"}}, - /*names=*/{"sum(a * 2)"}})); + compute::AggregateNodeOptions{{compute::internal::Aggregate{ + "sum", nullptr, "a * 2", "sum(a * 2)"}}})); // finally, pipe the aggregate node into a sink node AsyncGenerator> sink_gen; @@ -1929,8 +1928,7 @@ TEST(ScanNode, MinimalGroupedAggEndToEnd) { compute::MakeExecNode( "aggregate", plan.get(), {project}, compute::AggregateNodeOptions{ - {compute::internal::Aggregate{"hash_sum", nullptr, "a * 2"}}, - /*names=*/{"sum(a * 2)"}, + {compute::internal::Aggregate{"hash_sum", nullptr, "a * 2", "sum(a * 2)"}}, /*keys=*/{"b"}})); // finally, pipe the aggregate node into a sink node From fd2e78d18180b065c2f075d1f32759d9bc1aa9e9 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 19 May 2022 23:31:24 +0530 Subject: [PATCH 10/32] [r] fix r bindings --- r/R/arrowExports.R | 5 ++--- r/R/query-engine.R | 9 +++------ r/src/arrowExports.cpp | 9 ++++----- r/src/compute-exec.cpp | 17 
++++++++--------- 4 files changed, 17 insertions(+), 23 deletions(-) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 53c0f8ac86b..3a14730e535 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -432,8 +432,8 @@ ExecNode_Project <- function(input, exprs, names) { .Call(`_arrow_ExecNode_Project`, input, exprs, names) } -ExecNode_Aggregate <- function(input, options, out_field_names, key_names) { - .Call(`_arrow_ExecNode_Aggregate`, input, options, out_field_names, key_names) +ExecNode_Aggregate <- function(input, options, key_names) { + .Call(`_arrow_ExecNode_Aggregate`, input, options, key_names) } ExecNode_Join <- function(input, type, right_data, left_keys, right_keys, left_output, right_output, output_suffix_for_left, output_suffix_for_right) { @@ -2007,4 +2007,3 @@ SetIOThreadPoolCapacity <- function(threads) { Array__infer_type <- function(x) { .Call(`_arrow_Array__infer_type`, x) } - diff --git a/r/R/query-engine.R b/r/R/query-engine.R index 2c2d96063bd..7c7d25b9029 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -74,8 +74,6 @@ ExecPlan <- R6Class("ExecPlan", grouped <- length(group_vars) > 0 # Collect the target names first because we have to add back the group vars - # TODO : remove upon discussion - # target_names <- names(.data) .data <- ensure_group_vars(.data) .data <- ensure_arrange_vars(.data) # this sets .data$temp_columns @@ -115,14 +113,13 @@ ExecPlan <- R6Class("ExecPlan", x }) } - target_names <- names(.data$aggregations); + target_names <- names(.data$aggregations) for (i in seq_len(length(target_names))) { - .data$aggregations[[i]][["target"]] <- target_names[i] + .data$aggregations[[i]][["name"]] <- .data$aggregations[[i]][["target"]] <- target_names[i] } node <- node$Aggregate( - options = map(.data$aggregations, ~ .[c("fun", "options", "target")]), - out_field_names = names(.data$aggregations), + options = map(.data$aggregations, ~ .[c("fun", "options", "target", "name")]), key_names = group_vars ) diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 3bfed5f0af7..6ff771fdf5a 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -964,14 +964,13 @@ BEGIN_CPP11 END_CPP11 } // compute-exec.cpp -std::shared_ptr ExecNode_Aggregate(const std::shared_ptr& input, cpp11::list options, std::vector out_field_names, std::vector key_names); -extern "C" SEXP _arrow_ExecNode_Aggregate(SEXP input_sexp, SEXP options_sexp, SEXP out_field_names_sexp, SEXP key_names_sexp){ +std::shared_ptr ExecNode_Aggregate(const std::shared_ptr& input, cpp11::list options, std::vector key_names); +extern "C" SEXP _arrow_ExecNode_Aggregate(SEXP input_sexp, SEXP options_sexp, SEXP key_names_sexp){ BEGIN_CPP11 arrow::r::Input&>::type input(input_sexp); arrow::r::Input::type options(options_sexp); - arrow::r::Input>::type out_field_names(out_field_names_sexp); arrow::r::Input>::type key_names(key_names_sexp); - return cpp11::as_sexp(ExecNode_Aggregate(input, options, out_field_names, key_names)); + return cpp11::as_sexp(ExecNode_Aggregate(input, options, key_names)); END_CPP11 } // compute-exec.cpp @@ -5221,7 +5220,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_ExecPlan_Write", (DL_FUNC) &_arrow_ExecPlan_Write, 14}, { "_arrow_ExecNode_Filter", (DL_FUNC) &_arrow_ExecNode_Filter, 2}, { "_arrow_ExecNode_Project", (DL_FUNC) &_arrow_ExecNode_Project, 3}, - { "_arrow_ExecNode_Aggregate", (DL_FUNC) &_arrow_ExecNode_Aggregate, 4}, + { "_arrow_ExecNode_Aggregate", (DL_FUNC) &_arrow_ExecNode_Aggregate, 3}, { "_arrow_ExecNode_Join", (DL_FUNC) 
&_arrow_ExecNode_Join, 9}, { "_arrow_ExecNode_Union", (DL_FUNC) &_arrow_ExecNode_Union, 2}, { "_arrow_ExecNode_SourceNode", (DL_FUNC) &_arrow_ExecNode_SourceNode, 2}, diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index e4438d08d28..5359065d8d8 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -228,17 +228,17 @@ std::shared_ptr ExecNode_Project( // [[arrow::export]] std::shared_ptr ExecNode_Aggregate( const std::shared_ptr& input, cpp11::list options, - std::vector out_field_names, std::vector key_names) { + std::vector key_names) { std::vector aggregates; for (cpp11::list name_opts : options) { - auto name = cpp11::as_cpp(name_opts[0]); - auto opts = make_compute_options(name, name_opts[1]); + auto function = cpp11::as_cpp(name_opts[0]); + auto opts = make_compute_options(function, name_opts[1]); auto target = cpp11::as_cpp(name_opts[2]); - - aggregates.push_back( - arrow::compute::internal::Aggregate{std::move(name), opts, std::move(target)}); - keep_alives.push_back(std::move(opts)); + auto name = cpp11::as_cpp(name_opts[3]); + + aggregates.push_back(arrow::compute::internal::Aggregate{ + std::move(function), opts.get(), std::move(target), std::move(name)}); } std::vector keys; @@ -247,8 +247,7 @@ std::shared_ptr ExecNode_Aggregate( } return MakeExecNodeOrStop( "aggregate", input->plan(), {input.get()}, - compute::AggregateNodeOptions{std::move(aggregates), std::move(out_field_names), - std::move(keys)}); + compute::AggregateNodeOptions{std::move(aggregates), std::move(keys)}); } // [[arrow::export]] From 9eecd8c35068572ae2d5458d07debafa7172b3fd Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 20 May 2022 15:36:39 +0530 Subject: [PATCH 11/32] fixing tpch benchmarks for agg op refactor --- cpp/src/arrow/compute/exec/tpch_benchmark.cc | 22 ++++++++------------ 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/cpp/src/arrow/compute/exec/tpch_benchmark.cc b/cpp/src/arrow/compute/exec/tpch_benchmark.cc index f348eaf3eb1..b998da5eded 100644 --- a/cpp/src/arrow/compute/exec/tpch_benchmark.cc +++ b/cpp/src/arrow/compute/exec/tpch_benchmark.cc @@ -78,21 +78,17 @@ std::shared_ptr Plan_Q1(AsyncGenerator>* sin std::make_shared(ScalarAggregateOptions::Defaults()); auto count_opts = std::make_shared(CountOptions::CountMode::ALL); std::vector aggs = { - {"hash_sum", sum_opts, "sum_qty"}, - {"hash_sum", sum_opts, "sum_base_price"}, - {"hash_sum", sum_opts, "sum_disc_price"}, - {"hash_sum", sum_opts, "sum_charge"}, - {"hash_mean", sum_opts, "avg_qty"}, - {"hash_mean", sum_opts, "avg_price"}, - {"hash_mean", sum_opts, "avg_disc"}, - {"hash_count", count_opts, "sum_qty"}}; - - std::vector names = {"sum_qty", "sum_base_price", "sum_disc_price", - "sum_charge", "avg_qty", "avg_price", - "avg_disc", "count_order"}; + {"hash_sum", sum_opts, "sum_qty", "sum_qty"}, + {"hash_sum", sum_opts, "sum_base_price", "sum_base_price"}, + {"hash_sum", sum_opts, "sum_disc_price", "sum_disc_price"}, + {"hash_sum", sum_opts, "sum_charge", "sum_charge"}, + {"hash_mean", sum_opts, "avg_qty", "avg_qty"}, + {"hash_mean", sum_opts, "avg_price", "avg_price"}, + {"hash_mean", sum_opts, "avg_disc", "avg_disc"}, + {"hash_count", count_opts, "sum_qty", "count_order"}}; std::vector keys = {"l_returnflag", "l_linestatus"}; - AggregateNodeOptions agg_opts(aggs, names, keys); + AggregateNodeOptions agg_opts(aggs, keys); SortKey l_returnflag_key("l_returnflag"); SortKey l_linestatus_key("l_linestatus"); From e9c70d7196003d61be842c26ee3f8ca179a884fb Mon Sep 17 00:00:00 2001 From: 
Vibhatha Abeykoon Date: Tue, 24 May 2022 16:12:25 +0530 Subject: [PATCH 12/32] minor changes --- r/R/query-engine.R | 3 +-- r/src/compute-exec.cpp | 8 ++++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/r/R/query-engine.R b/r/R/query-engine.R index 7c7d25b9029..69fa9e742ec 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -73,7 +73,6 @@ ExecPlan <- R6Class("ExecPlan", group_vars <- dplyr::group_vars(.data) grouped <- length(group_vars) > 0 - # Collect the target names first because we have to add back the group vars .data <- ensure_group_vars(.data) .data <- ensure_arrange_vars(.data) # this sets .data$temp_columns @@ -119,7 +118,7 @@ ExecPlan <- R6Class("ExecPlan", } node <- node$Aggregate( - options = map(.data$aggregations, ~ .[c("fun", "options", "target", "name")]), + options = .data$aggregations, key_names = group_vars ) diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index 5359065d8d8..7b28a700c2b 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -232,10 +232,10 @@ std::shared_ptr ExecNode_Aggregate( std::vector aggregates; for (cpp11::list name_opts : options) { - auto function = cpp11::as_cpp(name_opts[0]); - auto opts = make_compute_options(function, name_opts[1]); - auto target = cpp11::as_cpp(name_opts[2]); - auto name = cpp11::as_cpp(name_opts[3]); + auto function = cpp11::as_cpp(name_opts["fun"]); + auto opts = make_compute_options(function, name_opts["options"]); + auto target = cpp11::as_cpp(name_opts["target"]); + auto name = cpp11::as_cpp(name_opts["name"]); aggregates.push_back(arrow::compute::internal::Aggregate{ std::move(function), opts.get(), std::move(target), std::move(name)}); From 7d8b4f7cf419a483a68002e055ed1e37c5cb8cad Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Mon, 30 May 2022 17:03:43 +0530 Subject: [PATCH 13/32] fix print format --- cpp/src/arrow/compute/exec/test_util.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/compute/exec/test_util.cc b/cpp/src/arrow/compute/exec/test_util.cc index 762bf320f03..4e8f57afcf9 100644 --- a/cpp/src/arrow/compute/exec/test_util.cc +++ b/cpp/src/arrow/compute/exec/test_util.cc @@ -378,11 +378,11 @@ static inline void PrintToImpl(const std::string& factory_name, *os << "aggregates={"; for (const auto& agg : o->aggregates) { - *os << agg.function << "<"; + *os << "function=" << agg.function << "<"; if (agg.options) PrintTo(*agg.options, os); - *os << agg.target.ToString() << "<"; - *os << agg.name << "<"; *os << ">,"; + *os << "target=" << agg.target.ToString() << ","; + *os << "name=" << agg.name; } *os << "},"; From 2e2df66711728a169cc45b254988b7a9e69033d0 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Mon, 30 May 2022 17:35:00 +0530 Subject: [PATCH 14/32] adding test aggregate option --- .../compute/kernels/hash_aggregate_test.cc | 134 ++++++++++-------- 1 file changed, 72 insertions(+), 62 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc b/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc index 0c3e219d439..0478cba98a1 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc @@ -235,15 +235,27 @@ ExecContext* small_chunksize_context(bool use_threads = false) { return use_threads ? 
&ctx_with_threads : &ctx; } -Result GroupByTest( - const std::vector& arguments, const std::vector& keys, - const std::vector<::arrow::compute::internal::Aggregate>& aggregates, - bool use_threads, bool use_exec_plan) { +struct TestAggregate { + std::string function; + const FunctionOptions* options; +}; + +Result GroupByTest(const std::vector& arguments, + const std::vector& keys, + const std::vector& aggregates, bool use_threads, + bool use_exec_plan) { + std::vector internal_aggregates; + int idx = 0; + for (auto t_agg : aggregates) { + internal_aggregates.push_back( + {t_agg.function, t_agg.options, "agg_" + std::to_string(idx), t_agg.function}); + idx = idx + 1; + } if (use_exec_plan) { - return GroupByUsingExecPlan(arguments, keys, aggregates, use_threads, + return GroupByUsingExecPlan(arguments, keys, internal_aggregates, use_threads, small_chunksize_context(use_threads)); } else { - return internal::GroupBy(arguments, keys, aggregates, use_threads, + return internal::GroupBy(arguments, keys, internal_aggregates, use_threads, default_exec_context()); } } @@ -745,7 +757,7 @@ TEST(GroupBy, NoBatches) { Datum aggregated_and_grouped, GroupByTest({table->GetColumnByName("argument")}, {table->GetColumnByName("key")}, { - {"hash_count", nullptr, "agg_0", "hash_count"}, + {"hash_count", nullptr}, }, /*use_threads=*/true, /*use_exec_plan=*/true)); AssertDatumsEqual(ArrayFromJSON(struct_({ @@ -810,7 +822,7 @@ TEST(GroupBy, CountOnly) { GroupByTest({table->GetColumnByName("argument")}, {table->GetColumnByName("key")}, { - {"hash_count", nullptr, "agg_0", "hash_count"}, + {"hash_count", nullptr}, }, use_threads, use_exec_plan)); SortBy({"key_0"}, &aggregated_and_grouped); @@ -898,7 +910,7 @@ TEST(GroupBy, SumOnly) { GroupByTest({table->GetColumnByName("argument")}, {table->GetColumnByName("key")}, { - {"hash_sum", nullptr, "agg_0", "hash_sum"}, + {"hash_sum", nullptr}, }, use_threads, use_exec_plan)); SortBy({"key_0"}, &aggregated_and_grouped); @@ -963,12 +975,12 @@ TEST(GroupBy, SumMeanProductDecimal) { }, {table->GetColumnByName("key")}, { - {"hash_sum", nullptr, "agg_0", "hash_sum"}, - {"hash_sum", nullptr, "agg_1", "hash_sum"}, - {"hash_mean", nullptr, "agg_2", "hash_mean"}, - {"hash_mean", nullptr, "agg_3", "hash_mean"}, - {"hash_product", nullptr, "agg_4", "hash_product"}, - {"hash_product", nullptr, "agg_5", "hash_product"}, + {"hash_sum", nullptr}, + {"hash_sum", nullptr}, + {"hash_mean", nullptr}, + {"hash_mean", nullptr}, + {"hash_product", nullptr}, + {"hash_product", nullptr}, }, use_threads, use_exec_plan)); SortBy({"key_0"}, &aggregated_and_grouped); @@ -1608,9 +1620,9 @@ TEST(GroupBy, MinMaxOnly) { }, {table->GetColumnByName("key")}, { - {"hash_min_max", nullptr, "agg_0", "hash_min_max"}, - {"hash_min_max", nullptr, "agg_1", "hash_min_max"}, - {"hash_min_max", nullptr, "agg_2", "hash_min_max"}, + {"hash_min_max", nullptr}, + {"hash_min_max", nullptr}, + {"hash_min_max", nullptr}, }, use_threads, use_exec_plan)); ValidateOutput(aggregated_and_grouped); @@ -1712,11 +1724,11 @@ TEST(GroupBy, MinMaxTypes) { auto table = TableFromJSON(in_schema, (ty->name() == "date64") ? 
date64_table : default_table); - ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped, - GroupByTest({table->GetColumnByName("argument0")}, - {table->GetColumnByName("key")}, - {{"hash_min_max", nullptr, "agg_0", "hash_min_max"}}, - /*use_threads=*/true, /*use_exec_plan=*/true)); + ASSERT_OK_AND_ASSIGN( + Datum aggregated_and_grouped, + GroupByTest({table->GetColumnByName("argument0")}, + {table->GetColumnByName("key")}, {{"hash_min_max", nullptr}}, + /*use_threads=*/true, /*use_exec_plan=*/true)); ValidateOutput(aggregated_and_grouped); SortBy({"key_0"}, &aggregated_and_grouped); @@ -1769,8 +1781,8 @@ TEST(GroupBy, MinMaxDecimal) { }, {table->GetColumnByName("key")}, { - {"hash_min_max", nullptr, "agg_0", "hash_min_max"}, - {"hash_min_max", nullptr, "agg_1", "hash_min_max"}, + {"hash_min_max", nullptr}, + {"hash_min_max", nullptr}, }, use_threads, use_exec_plan)); ValidateOutput(aggregated_and_grouped); @@ -1831,9 +1843,8 @@ TEST(GroupBy, MinMaxBinary) { ASSERT_OK_AND_ASSIGN( Datum aggregated_and_grouped, GroupByTest({table->GetColumnByName("argument0")}, - {table->GetColumnByName("key")}, - {{"hash_min_max", nullptr, "agg_0", "hash_min_max"}}, use_threads, - use_exec_plan)); + {table->GetColumnByName("key")}, {{"hash_min_max", nullptr}}, + use_threads, use_exec_plan)); ValidateOutput(aggregated_and_grouped); SortBy({"key_0"}, &aggregated_and_grouped); @@ -1886,9 +1897,8 @@ TEST(GroupBy, MinMaxFixedSizeBinary) { ASSERT_OK_AND_ASSIGN( Datum aggregated_and_grouped, GroupByTest({table->GetColumnByName("argument0")}, - {table->GetColumnByName("key")}, - {{"hash_min_max", nullptr, "agg_0", "hash_min_max"}}, use_threads, - use_exec_plan)); + {table->GetColumnByName("key")}, {{"hash_min_max", nullptr}}, + use_threads, use_exec_plan)); ValidateOutput(aggregated_and_grouped); SortBy({"key_0"}, &aggregated_and_grouped); @@ -1941,8 +1951,8 @@ TEST(GroupBy, MinOrMax) { table->GetColumnByName("argument")}, {table->GetColumnByName("key")}, { - {"hash_min", nullptr, "agg_0", "hash_min"}, - {"hash_max", nullptr, "agg_1", "hash_max"}, + {"hash_min", nullptr}, + {"hash_max", nullptr}, }, /*use_threads=*/true, /*use_exec_plan=*/true)); SortBy({"key_0"}, &aggregated_and_grouped); @@ -2499,12 +2509,12 @@ TEST(GroupBy, OneMiscTypes) { }, {table->GetColumnByName("key")}, { - {"hash_one", nullptr, "agg_0", "hash_one"}, - {"hash_one", nullptr, "agg_1", "hash_one"}, - {"hash_one", nullptr, "agg_2", "hash_one"}, - {"hash_one", nullptr, "agg_3", "hash_one"}, - {"hash_one", nullptr, "agg_4", "hash_one"}, - {"hash_one", nullptr, "agg_5", "hash_one"}, + {"hash_one", nullptr}, + {"hash_one", nullptr}, + {"hash_one", nullptr}, + {"hash_one", nullptr}, + {"hash_one", nullptr}, + {"hash_one", nullptr}, }, use_threads, use_exec_plan)); ValidateOutput(aggregated_and_grouped); @@ -2627,11 +2637,11 @@ TEST(GroupBy, OneNumericTypes) { auto table = TableFromJSON(in_schema, (type->name() == "date64") ? 
temporal_table_json : numeric_table_json); - ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped, - GroupByTest({table->GetColumnByName("argument0")}, - {table->GetColumnByName("key")}, - {{"hash_one", nullptr, "agg_0", "hash_one"}}, - use_threads, use_exec_plan)); + ASSERT_OK_AND_ASSIGN( + Datum aggregated_and_grouped, + GroupByTest({table->GetColumnByName("argument0")}, + {table->GetColumnByName("key")}, {{"hash_one", nullptr}}, + use_threads, use_exec_plan)); ValidateOutput(aggregated_and_grouped); SortBy({"key_0"}, &aggregated_and_grouped); @@ -2690,11 +2700,11 @@ TEST(GroupBy, OneBinaryTypes) { [null, 3] ])"}); - ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped, - GroupByTest({table->GetColumnByName("argument0")}, - {table->GetColumnByName("key")}, - {{"hash_one", nullptr, "agg_0", "hash_one"}}, - use_threads, use_exec_plan)); + ASSERT_OK_AND_ASSIGN( + Datum aggregated_and_grouped, + GroupByTest({table->GetColumnByName("argument0")}, + {table->GetColumnByName("key")}, {{"hash_one", nullptr}}, + use_threads, use_exec_plan)); ValidateOutput(aggregated_and_grouped); SortBy({"key_0"}, &aggregated_and_grouped); @@ -3068,12 +3078,12 @@ TEST(GroupBy, ListMiscTypes) { }, {table->GetColumnByName("key")}, { - {"hash_list", nullptr, "agg_0", "hash_list"}, - {"hash_list", nullptr, "agg_1", "hash_list"}, - {"hash_list", nullptr, "agg_2", "hash_list"}, - {"hash_list", nullptr, "agg_3", "hash_list"}, - {"hash_list", nullptr, "agg_4", "hash_list"}, - {"hash_list", nullptr, "agg_5", "hash_list"}, + {"hash_list", nullptr}, + {"hash_list", nullptr}, + {"hash_list", nullptr}, + {"hash_list", nullptr}, + {"hash_list", nullptr}, + {"hash_list", nullptr}, }, use_threads, use_exec_plan)); ValidateOutput(aggregated_and_grouped); @@ -3752,10 +3762,10 @@ TEST(GroupBy, SingleNullTypeKey) { }, {table->GetColumnByName("key")}, { - {"hash_count", nullptr, "agg_0", "hash_count"}, - {"hash_sum", nullptr, "agg_1", "hash_sum"}, - {"hash_mean", nullptr, "agg_2", "hash_mean"}, - {"hash_min_max", nullptr, "agg_3", "hash_min_max"}, + {"hash_count", nullptr}, + {"hash_sum", nullptr}, + {"hash_mean", nullptr}, + {"hash_min_max", nullptr}, }, use_threads, use_exec_plan)); SortBy({"key_0"}, &aggregated_and_grouped); @@ -3812,9 +3822,9 @@ TEST(GroupBy, MultipleKeysIncludesNullType) { }, {table->GetColumnByName("key_0"), table->GetColumnByName("key_1")}, { - {"hash_count", nullptr, "agg_0", "hash_count"}, - {"hash_sum", nullptr, "agg_1", "hash_sum"}, - {"hash_min_max", nullptr, "agg_2", "hash_min_max"}, + {"hash_count", nullptr}, + {"hash_sum", nullptr}, + {"hash_min_max", nullptr}, }, use_threads, use_exec_plan)); SortBy({"key_0"}, &aggregated_and_grouped); From 56551acee9ee5c38125700302953937c382059bf Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Tue, 31 May 2022 13:46:31 +0530 Subject: [PATCH 15/32] fixing typo in rebase --- r/R/query-engine.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/query-engine.R b/r/R/query-engine.R index 69fa9e742ec..1a53298fd47 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -304,7 +304,7 @@ ExecNode <- R6Class("ExecNode", self$preserve_extras(ExecNode_Filter(self, expr)) }, Aggregate = function(options, key_names) { - self$preserve_sort( + out <- self$preserve_extras( ExecNode_Aggregate(self, options, key_names) ) # dplyr drops top-level attributes when you call summarize() From 1227a133cd793617c78ee6de9e2b250d128c4928 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Tue, 31 May 2022 18:18:54 +0530 Subject: [PATCH 16/32] adding patch --- 
c_glib/arrow-glib/compute.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/c_glib/arrow-glib/compute.cpp b/c_glib/arrow-glib/compute.cpp index ca5bc1a76f5..1595c745bb9 100644 --- a/c_glib/arrow-glib/compute.cpp +++ b/c_glib/arrow-glib/compute.cpp @@ -1255,8 +1255,6 @@ garrow_aggregate_node_options_new(GList *aggregations, GError **error) { std::vector arrow_aggregates; - std::vector arrow_targets; - std::vector arrow_names; std::vector arrow_keys; for (auto node = aggregations; node; node = node->next) { auto aggregation_priv = GARROW_AGGREGATION_GET_PRIVATE(node->data); @@ -1279,7 +1277,12 @@ garrow_aggregate_node_options_new(GList *aggregations, "[aggregate-node-options][new][input]")) { return NULL; } - arrow_names.emplace_back(aggregation_priv->output); + arrow_aggregates.push_back({ + aggregation_priv->function, + function_options, + arrow_targets[0], + aggregation_priv->output + }); } for (gsize i = 0; i < n_keys; ++i) { if (!garrow_field_refs_add(arrow_keys, @@ -1291,8 +1294,6 @@ garrow_aggregate_node_options_new(GList *aggregations, } auto arrow_options = new arrow::compute::AggregateNodeOptions(std::move(arrow_aggregates), - std::move(arrow_targets), - std::move(arrow_names), std::move(arrow_keys)); auto options = g_object_new(GARROW_TYPE_AGGREGATE_NODE_OPTIONS, "options", arrow_options, From ab9c7d5fe4ec70f99ca83c3bdd77cc24a81e2765 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Wed, 8 Jun 2022 23:15:52 +0530 Subject: [PATCH 17/32] rebase --- cpp/src/arrow/compute/api_aggregate.h | 113 +++++++++++++++--- cpp/src/arrow/compute/exec/aggregate_node.cc | 27 ++++- cpp/src/arrow/compute/exec/options.h | 4 +- .../compute/kernels/aggregate_benchmark.cc | 2 +- .../compute/kernels/hash_aggregate_test.cc | 14 +-- cpp/src/arrow/dataset/scanner.cc | 2 +- cpp/src/arrow/dataset/scanner_test.cc | 4 +- python/pyarrow/includes/libarrow.pxd | 2 +- r/src/compute-exec.cpp | 4 +- 9 files changed, 137 insertions(+), 35 deletions(-) diff --git a/cpp/src/arrow/compute/api_aggregate.h b/cpp/src/arrow/compute/api_aggregate.h index 9cd06424299..dd3bd591a5f 100644 --- a/cpp/src/arrow/compute/api_aggregate.h +++ b/cpp/src/arrow/compute/api_aggregate.h @@ -56,6 +56,21 @@ class ARROW_EXPORT ScalarAggregateOptions : public FunctionOptions { uint32_t min_count; }; +/// \brief Configure a grouped aggregation +struct ARROW_EXPORT Aggregate { + /// the name of the aggregation function + std::string function; + + /// options for the aggregation function + const FunctionOptions* options; + + // fields to which aggregations will be applied + FieldRef target; + + // output field name for aggregations + std::string name; +}; + /// \brief Control count aggregate kernel behavior. /// /// By default, only non-null values are counted. @@ -393,23 +408,93 @@ ARROW_EXPORT Result Index(const Datum& value, const IndexOptions& options, ExecContext* ctx = NULLPTR); -namespace internal { -/// \brief Configure a grouped aggregation -struct ARROW_EXPORT Aggregate { - /// the name of the aggregation function - std::string function; - - /// options for the aggregation function - std::shared_ptr options; - - // fields to which aggregations will be applied - FieldRef target; - - // output field name for aggregations - std::string name; +namespace internal { +/// Internal use only: streaming group identifier. +/// Consumes batches of keys and yields batches of the group ids. 
+class ARROW_EXPORT Grouper {
+ public:
+  virtual ~Grouper() = default;
+
+  /// Construct a Grouper which receives the specified key types
+  static Result<std::unique_ptr<Grouper>> Make(const std::vector<ValueDescr>& descrs,
+                                               ExecContext* ctx = default_exec_context());
+
+  /// Consume a batch of keys, producing the corresponding group ids as an integer array.
+  /// Currently only uint32 indices will be produced, eventually the bit width will only
+  /// be as wide as necessary.
+  virtual Result<Datum> Consume(const ExecBatch& batch) = 0;
+
+  /// Get current unique keys. May be called multiple times.
+  virtual Result<ExecBatch> GetUniques() = 0;
+
+  /// Get the current number of groups.
+  virtual uint32_t num_groups() const = 0;
+
+  /// \brief Assemble lists of indices of identical elements.
+  ///
+  /// \param[in] ids An unsigned, all-valid integral array which will be
+  ///               used as grouping criteria.
+  /// \param[in] num_groups An upper bound for the elements of ids
+  /// \return A num_groups-long ListArray where the slot at i contains a
+  ///     list of indices where i appears in ids.
+  ///
+  /// MakeGroupings([
+  ///   2,
+  ///   2,
+  ///   5,
+  ///   5,
+  ///   2,
+  ///   3
+  /// ], 8) == [
+  ///   [],
+  ///   [],
+  ///   [0, 1, 4],
+  ///   [5],
+  ///   [],
+  ///   [2, 3],
+  ///   [],
+  ///   []
+  /// ]
+  static Result<std::shared_ptr<ListArray>> MakeGroupings(
+      const UInt32Array& ids, uint32_t num_groups,
+      ExecContext* ctx = default_exec_context());
+
+  /// \brief Produce a ListArray whose slots are selections of `array` which correspond to
+  /// the provided groupings.
+  ///
+  /// For example,
+  /// ApplyGroupings([
+  ///   [],
+  ///   [],
+  ///   [0, 1, 4],
+  ///   [5],
+  ///   [],
+  ///   [2, 3],
+  ///   [],
+  ///   []
+  /// ], [2, 2, 5, 5, 2, 3]) == [
+  ///   [],
+  ///   [],
+  ///   [2, 2, 2],
+  ///   [3],
+  ///   [],
+  ///   [5, 5],
+  ///   [],
+  ///   []
+  /// ]
+  static Result<std::shared_ptr<ListArray>> ApplyGroupings(
+      const ListArray& groupings, const Array& array,
+      ExecContext* ctx = default_exec_context());
+};
+
+/// Internal use only: helper function for testing HashAggregateKernels.
+/// This will be replaced by streaming execution operators.
+ARROW_EXPORT
+Result<Datum> GroupBy(const std::vector<Datum>& arguments, const std::vector<Datum>& keys,
+                      const std::vector<Aggregate>& aggregates, bool use_threads = false,
+                      ExecContext* ctx = default_exec_context());
+
 } // namespace internal
 } // namespace compute
 } // namespace arrow
diff --git a/cpp/src/arrow/compute/exec/aggregate_node.cc b/cpp/src/arrow/compute/exec/aggregate_node.cc
index 3632f2da0b6..5d71e23546f 100644
--- a/cpp/src/arrow/compute/exec/aggregate_node.cc
+++ b/cpp/src/arrow/compute/exec/aggregate_node.cc
@@ -41,6 +41,25 @@ using internal::checked_cast;
 
 namespace compute {
 
+namespace internal {
+
+Result<std::vector<const HashAggregateKernel*>> GetKernels(
+    ExecContext* ctx, const std::vector<Aggregate>& aggregates,
+    const std::vector<ValueDescr>& in_descrs);
+
+Result<std::vector<std::unique_ptr<KernelState>>> InitKernels(
+    const std::vector<const HashAggregateKernel*>& kernels, ExecContext* ctx,
+    const std::vector<Aggregate>& aggregates,
+    const std::vector<ValueDescr>& in_descrs);
+
+Result<FieldVector> ResolveKernels(
+    const std::vector<Aggregate>& aggregates,
+    const std::vector<const HashAggregateKernel*>& kernels,
+    const std::vector<std::unique_ptr<KernelState>>& states, ExecContext* ctx,
+    const std::vector<ValueDescr>& descrs);
+
+} // namespace internal
+
 namespace {
 
 void AggregatesToString(std::stringstream* ss, const Schema& input_schema,
-                        const std::vector<internal::Aggregate>& aggs,
+                        const std::vector<Aggregate>& aggs,
                         const std::vector<int>& target_field_ids, int indent = 0) {
   *ss << "aggregates=[" << std::endl;
   for (size_t i = 0; i < aggs.size(); i++) {
     for (int j = 0; j < indent; ++j) *ss << "  ";
@@ -65,7 +84,7 @@ class ScalarAggregateNode : public ExecNode {
  public:
   ScalarAggregateNode(ExecPlan* plan, std::vector<ExecNode*> inputs,
                       std::shared_ptr<Schema> output_schema,
                       std::vector<int> target_field_ids,
-                      std::vector<internal::Aggregate> aggs,
+                      std::vector<Aggregate> aggs,
                       std::vector<const ScalarAggregateKernel*> kernels,
                       std::vector<std::vector<std::unique_ptr<KernelState>>> states)
       : ExecNode(plan, std::move(inputs), {"target"},
@@ -263,7 +282,7 @@ class ScalarAggregateNode : public ExecNode {
   }
 
   const std::vector<int> target_field_ids_;
-  const std::vector<internal::Aggregate> aggs_;
+  const std::vector<Aggregate> aggs_;
   const std::vector<const ScalarAggregateKernel*> kernels_;
   std::vector<std::vector<std::unique_ptr<KernelState>>> states_;
@@ -276,7 +295,7 @@ class GroupByNode : public ExecNode {
  public:
   GroupByNode(ExecNode* input, std::shared_ptr<Schema> output_schema, ExecContext* ctx,
               std::vector<int> key_field_ids, std::vector<int> agg_src_field_ids,
-              std::vector<internal::Aggregate> aggs,
+              std::vector<Aggregate> aggs,
               std::vector<const HashAggregateKernel*> agg_kernels)
       : ExecNode(input->plan(), {input}, {"groupby"}, std::move(output_schema),
                  /*num_outputs=*/1),
@@ -658,7 +677,7 @@ class GroupByNode : public ExecNode {
   const std::vector<int> key_field_ids_;
   const std::vector<int> agg_src_field_ids_;
-  const std::vector<internal::Aggregate> aggs_;
+  const std::vector<Aggregate> aggs_;
   const std::vector<const HashAggregateKernel*> agg_kernels_;
 
   ThreadIndexer get_thread_index_;
diff --git a/cpp/src/arrow/compute/exec/options.h b/cpp/src/arrow/compute/exec/options.h
index 84254ed5ad9..48448b69f0f 100644
--- a/cpp/src/arrow/compute/exec/options.h
+++ b/cpp/src/arrow/compute/exec/options.h
@@ -111,12 +111,12 @@ class ARROW_EXPORT ProjectNodeOptions : public ExecNodeOptions {
 /// \brief Make a node which aggregates input batches, optionally grouped by keys.
class ARROW_EXPORT AggregateNodeOptions : public ExecNodeOptions { public: - AggregateNodeOptions(std::vector aggregates, + AggregateNodeOptions(std::vector aggregates, std::vector keys = {}) : aggregates(std::move(aggregates)), keys(std::move(keys)) {} // aggregations which will be applied to the targetted fields - std::vector aggregates; + std::vector aggregates; // keys by which aggregations will be grouped std::vector keys; }; diff --git a/cpp/src/arrow/compute/kernels/aggregate_benchmark.cc b/cpp/src/arrow/compute/kernels/aggregate_benchmark.cc index c2712854345..dfd008de1e5 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_benchmark.cc @@ -307,7 +307,7 @@ BENCHMARK_TEMPLATE(ReferenceSum, SumBitmapVectorizeUnroll) // static void BenchmarkGroupBy(benchmark::State& state, - std::vector aggregates, + std::vector aggregates, std::vector arguments, std::vector keys) { for (auto _ : state) { ABORT_NOT_OK(GroupBy(arguments, keys, aggregates).status()); diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc b/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc index 0478cba98a1..c6e87e5e368 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc @@ -72,7 +72,7 @@ namespace compute { namespace { Result NaiveGroupBy(std::vector arguments, std::vector keys, - const std::vector& aggregates) { + const std::vector& aggregates) { ARROW_ASSIGN_OR_RAISE(auto key_batch, ExecBatch::Make(std::move(keys))); ARROW_ASSIGN_OR_RAISE(auto grouper, Grouper::Make(key_batch.GetDescriptors())); @@ -123,7 +123,7 @@ Result NaiveGroupBy(std::vector arguments, std::vector keys Result GroupByUsingExecPlan(const BatchesWithSchema& input, const std::vector& key_names, - const std::vector& aggregates, + const std::vector& aggregates, bool use_threads, ExecContext* ctx) { std::vector keys(key_names.size()); for (size_t i = 0; i < key_names.size(); ++i) { @@ -182,7 +182,7 @@ Result GroupByUsingExecPlan(const BatchesWithSchema& input, /// Simpler overload where you can give the columns as datums Result GroupByUsingExecPlan(const std::vector& arguments, const std::vector& keys, - const std::vector& aggregates, + const std::vector& aggregates, bool use_threads, ExecContext* ctx) { using arrow::compute::detail::ExecBatchIterator; @@ -215,11 +215,11 @@ Result GroupByUsingExecPlan(const std::vector& arguments, return GroupByUsingExecPlan(input, key_names, aggregates, use_threads, ctx); } -void ValidateGroupBy(const std::vector& aggregates, +void ValidateGroupBy(const std::vector& aggregates, std::vector arguments, std::vector keys) { ASSERT_OK_AND_ASSIGN(Datum expected, NaiveGroupBy(arguments, keys, aggregates)); - ASSERT_OK_AND_ASSIGN(Datum actual, GroupBy(arguments, keys, aggregates)); + ASSERT_OK_AND_ASSIGN(Datum actual, internal::GroupBy(arguments, keys, aggregates)); ASSERT_OK(expected.make_array()->ValidateFull()); ValidateOutput(actual); @@ -244,7 +244,7 @@ Result GroupByTest(const std::vector& arguments, const std::vector& keys, const std::vector& aggregates, bool use_threads, bool use_exec_plan) { - std::vector internal_aggregates; + std::vector internal_aggregates; int idx = 0; for (auto t_agg : aggregates) { internal_aggregates.push_back( @@ -3458,7 +3458,6 @@ TEST(GroupBy, ConcreteCaseWithValidateGroupBy) { std::shared_ptr non_null = std::make_shared(CountOptions::ONLY_VALID); - using internal::Aggregate; for (auto agg : { Aggregate{"hash_sum", nullptr, "agg_0", "hash_sum"}, 
Aggregate{"hash_count", non_null, "agg_1", "hash_count"}, @@ -3486,7 +3485,6 @@ TEST(GroupBy, CountNull) { std::shared_ptr skipna = std::make_shared(CountOptions::ONLY_VALID); - using internal::Aggregate; for (auto agg : { Aggregate{"hash_count", keepna, "agg_0", "hash_count"}, Aggregate{"hash_count", skipna, "agg_1", "hash_count"}, diff --git a/cpp/src/arrow/dataset/scanner.cc b/cpp/src/arrow/dataset/scanner.cc index 2295665b161..3cd5f1fcc26 100644 --- a/cpp/src/arrow/dataset/scanner.cc +++ b/cpp/src/arrow/dataset/scanner.cc @@ -675,7 +675,7 @@ Result AsyncScanner::CountRows() { std::move(fragment_gen)), options}}, {"project", compute::ProjectNodeOptions{{options->filter}, {"mask"}}}, - {"aggregate", compute::AggregateNodeOptions{{compute::internal::Aggregate{ + {"aggregate", compute::AggregateNodeOptions{{compute::Aggregate{ "sum", nullptr, "mask", "selected_count"}}}}, {"sink", compute::SinkNodeOptions{&sink_gen}}, }) diff --git a/cpp/src/arrow/dataset/scanner_test.cc b/cpp/src/arrow/dataset/scanner_test.cc index 6dd75db114a..5316f63d080 100644 --- a/cpp/src/arrow/dataset/scanner_test.cc +++ b/cpp/src/arrow/dataset/scanner_test.cc @@ -1838,7 +1838,7 @@ TEST(ScanNode, MinimalScalarAggEndToEnd) { ASSERT_OK_AND_ASSIGN( compute::ExecNode * aggregate, compute::MakeExecNode("aggregate", plan.get(), {project}, - compute::AggregateNodeOptions{{compute::internal::Aggregate{ + compute::AggregateNodeOptions{{compute::Aggregate{ "sum", nullptr, "a * 2", "sum(a * 2)"}}})); // finally, pipe the aggregate node into a sink node @@ -1928,7 +1928,7 @@ TEST(ScanNode, MinimalGroupedAggEndToEnd) { compute::MakeExecNode( "aggregate", plan.get(), {project}, compute::AggregateNodeOptions{ - {compute::internal::Aggregate{"hash_sum", nullptr, "a * 2", "sum(a * 2)"}}, + {compute::Aggregate{"hash_sum", nullptr, "a * 2", "sum(a * 2)"}}, /*keys=*/{"b"}})); // finally, pipe the aggregate node into a sink node diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 302ac99c36a..9e43eb4eb9c 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2408,7 +2408,7 @@ cdef extern from * namespace "arrow::compute": cdef extern from "arrow/compute/exec/aggregate.h" namespace \ "arrow::compute::internal" nogil: - cdef cppclass CAggregate "arrow::compute::internal::Aggregate": + cdef cppclass CAggregate "arrow::compute::Aggregate": c_string function shared_ptr[CFunctionOptions] options diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index 7b28a700c2b..fc4a41ab03a 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -229,7 +229,7 @@ std::shared_ptr ExecNode_Project( std::shared_ptr ExecNode_Aggregate( const std::shared_ptr& input, cpp11::list options, std::vector key_names) { - std::vector aggregates; + std::vector aggregates; for (cpp11::list name_opts : options) { auto function = cpp11::as_cpp(name_opts["fun"]); @@ -237,7 +237,7 @@ std::shared_ptr ExecNode_Aggregate( auto target = cpp11::as_cpp(name_opts["target"]); auto name = cpp11::as_cpp(name_opts["name"]); - aggregates.push_back(arrow::compute::internal::Aggregate{ + aggregates.push_back(arrow::compute::Aggregate{ std::move(function), opts.get(), std::move(target), std::move(name)}); } From 3aa44a3ecec0f181cdf0876b86130c3a7fa3e67f Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 10 Jun 2022 13:14:24 +0530 Subject: [PATCH 18/32] test commit on fixing cglib --- c_glib/arrow-glib/compute.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/c_glib/arrow-glib/compute.cpp b/c_glib/arrow-glib/compute.cpp index 1595c745bb9..4044a4955dc 100644 --- a/c_glib/arrow-glib/compute.cpp +++ b/c_glib/arrow-glib/compute.cpp @@ -1254,7 +1254,7 @@ garrow_aggregate_node_options_new(GList *aggregations, gsize n_keys, GError **error) { - std::vector arrow_aggregates; + std::vector arrow_aggregates; std::vector arrow_keys; for (auto node = aggregations; node; node = node->next) { auto aggregation_priv = GARROW_AGGREGATION_GET_PRIVATE(node->data); From c0d72b470b2b983f0a8e521f6c2de92b766308d1 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Sat, 11 Jun 2022 10:56:29 +0530 Subject: [PATCH 19/32] format and updated the aggregate_benchmark script --- cpp/src/arrow/compute/exec/aggregate_node.cc | 13 ++++++------- cpp/src/arrow/compute/exec/options.h | 4 ++-- .../arrow/compute/kernels/aggregate_benchmark.cc | 5 +++-- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/compute/exec/aggregate_node.cc b/cpp/src/arrow/compute/exec/aggregate_node.cc index 5d71e23546f..c36da8c13f8 100644 --- a/cpp/src/arrow/compute/exec/aggregate_node.cc +++ b/cpp/src/arrow/compute/exec/aggregate_node.cc @@ -49,8 +49,7 @@ Result> GetKernels( Result>> InitKernels( const std::vector& kernels, ExecContext* ctx, - const std::vector& aggregates, - const std::vector& in_descrs); + const std::vector& aggregates, const std::vector& in_descrs); Result ResolveKernels( const std::vector& aggregates, @@ -62,9 +61,10 @@ Result ResolveKernels( namespace { -void AggregatesToString(std::stringstream* ss, const Schema& input_schema, - const std::vector& aggs, - const std::vector& target_field_ids, int indent = 0) { +void AggregatesToString( + std::stringstream* ss, const Schema& input_schema, const std::vector& aggs, + const std::vector& target_field_ids, + int indent = 0) { *ss << "aggregates=[" << std::endl; for (size_t i = 0; i < aggs.size(); i++) { for (int j = 0; j < indent; ++j) *ss << " "; @@ -83,8 +83,7 @@ class ScalarAggregateNode : public ExecNode { public: ScalarAggregateNode(ExecPlan* plan, std::vector inputs, std::shared_ptr output_schema, - std::vector target_field_ids, - std::vector aggs, + std::vector target_field_ids, std::vector aggs, std::vector kernels, std::vector>> states) : ExecNode(plan, std::move(inputs), {"target"}, diff --git a/cpp/src/arrow/compute/exec/options.h b/cpp/src/arrow/compute/exec/options.h index 48448b69f0f..0e0314eedb4 100644 --- a/cpp/src/arrow/compute/exec/options.h +++ b/cpp/src/arrow/compute/exec/options.h @@ -111,8 +111,8 @@ class ARROW_EXPORT ProjectNodeOptions : public ExecNodeOptions { /// \brief Make a node which aggregates input batches, optionally grouped by keys. 
class ARROW_EXPORT AggregateNodeOptions : public ExecNodeOptions { public: - AggregateNodeOptions(std::vector aggregates, - std::vector keys = {}) + explicit AggregateNodeOptions(std::vector aggregates, + std::vector keys = {}) : aggregates(std::move(aggregates)), keys(std::move(keys)) {} // aggregations which will be applied to the targetted fields diff --git a/cpp/src/arrow/compute/kernels/aggregate_benchmark.cc b/cpp/src/arrow/compute/kernels/aggregate_benchmark.cc index dfd008de1e5..a8cae5b50ca 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_benchmark.cc @@ -306,8 +306,9 @@ BENCHMARK_TEMPLATE(ReferenceSum, SumBitmapVectorizeUnroll) // GroupBy // -static void BenchmarkGroupBy(benchmark::State& state, - std::vector aggregates, +using arrow::compute::internal::GroupBy; + +static void BenchmarkGroupBy(benchmark::State& state, std::vector aggregates, std::vector arguments, std::vector keys) { for (auto _ : state) { ABORT_NOT_OK(GroupBy(arguments, keys, aggregates).status()); From 2533d42140af16ffc22fbf4d138ff6ba9aa46c03 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 17 Jun 2022 06:28:26 +0530 Subject: [PATCH 20/32] temp commit --- cpp/src/arrow/compute/api_aggregate.h | 88 -------------------------- cpp/src/arrow/compute/exec/aggregate.h | 6 +- 2 files changed, 3 insertions(+), 91 deletions(-) diff --git a/cpp/src/arrow/compute/api_aggregate.h b/cpp/src/arrow/compute/api_aggregate.h index dd3bd591a5f..4d16447b8b2 100644 --- a/cpp/src/arrow/compute/api_aggregate.h +++ b/cpp/src/arrow/compute/api_aggregate.h @@ -408,93 +408,5 @@ ARROW_EXPORT Result Index(const Datum& value, const IndexOptions& options, ExecContext* ctx = NULLPTR); - -namespace internal { -/// Internal use only: streaming group identifier. -/// Consumes batches of keys and yields batches of the group ids. -class ARROW_EXPORT Grouper { - public: - virtual ~Grouper() = default; - - /// Construct a Grouper which receives the specified key types - static Result> Make(const std::vector& descrs, - ExecContext* ctx = default_exec_context()); - - /// Consume a batch of keys, producing the corresponding group ids as an integer array. - /// Currently only uint32 indices will be produced, eventually the bit width will only - /// be as wide as necessary. - virtual Result Consume(const ExecBatch& batch) = 0; - - /// Get current unique keys. May be called multiple times. - virtual Result GetUniques() = 0; - - /// Get the current number of groups. - virtual uint32_t num_groups() const = 0; - - /// \brief Assemble lists of indices of identical elements. - /// - /// \param[in] ids An unsigned, all-valid integral array which will be - /// used as grouping criteria. - /// \param[in] num_groups An upper bound for the elements of ids - /// \return A num_groups-long ListArray where the slot at i contains a - /// list of indices where i appears in ids. - /// - /// MakeGroupings([ - /// 2, - /// 2, - /// 5, - /// 5, - /// 2, - /// 3 - /// ], 8) == [ - /// [], - /// [], - /// [0, 1, 4], - /// [5], - /// [], - /// [2, 3], - /// [], - /// [] - /// ] - static Result> MakeGroupings( - const UInt32Array& ids, uint32_t num_groups, - ExecContext* ctx = default_exec_context()); - - /// \brief Produce a ListArray whose slots are selections of `array` which correspond to - /// the provided groupings. 
- /// - /// For example, - /// ApplyGroupings([ - /// [], - /// [], - /// [0, 1, 4], - /// [5], - /// [], - /// [2, 3], - /// [], - /// [] - /// ], [2, 2, 5, 5, 2, 3]) == [ - /// [], - /// [], - /// [2, 2, 2], - /// [3], - /// [], - /// [5, 5], - /// [], - /// [] - /// ] - static Result> ApplyGroupings( - const ListArray& groupings, const Array& array, - ExecContext* ctx = default_exec_context()); -}; - -/// Internal use only: helper function for testing HashAggregateKernels. -/// This will be replaced by streaming execution operators. -ARROW_EXPORT -Result GroupBy(const std::vector& arguments, const std::vector& keys, - const std::vector& aggregates, bool use_threads = false, - ExecContext* ctx = default_exec_context()); - -} // namespace internal } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/exec/aggregate.h b/cpp/src/arrow/compute/exec/aggregate.h index 2c62acf2319..67d22d0df41 100644 --- a/cpp/src/arrow/compute/exec/aggregate.h +++ b/cpp/src/arrow/compute/exec/aggregate.h @@ -41,16 +41,16 @@ Result GroupBy(const std::vector& arguments, const std::vector> GetKernels( - ExecContext* ctx, const std::vector& aggregates, + ExecContext* ctx, const std::vector& aggregates, const std::vector& in_descrs); Result>> InitKernels( const std::vector& kernels, ExecContext* ctx, - const std::vector& aggregates, + const std::vector& aggregates, const std::vector& in_descrs); Result ResolveKernels( - const std::vector& aggregates, + const std::vector& aggregates, const std::vector& kernels, const std::vector>& states, ExecContext* ctx, const std::vector& descrs); From dda8b851a627bbae8dbe79fd73e91fd5e884c623 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 17 Jun 2022 07:00:16 +0530 Subject: [PATCH 21/32] rebase --- cpp/src/arrow/compute/api_aggregate.h | 30 ++++++++++---------- cpp/src/arrow/compute/exec/aggregate.h | 3 +- cpp/src/arrow/compute/exec/aggregate_node.cc | 18 ------------ 3 files changed, 16 insertions(+), 35 deletions(-) diff --git a/cpp/src/arrow/compute/api_aggregate.h b/cpp/src/arrow/compute/api_aggregate.h index 4d16447b8b2..9b71994e5e4 100644 --- a/cpp/src/arrow/compute/api_aggregate.h +++ b/cpp/src/arrow/compute/api_aggregate.h @@ -56,21 +56,6 @@ class ARROW_EXPORT ScalarAggregateOptions : public FunctionOptions { uint32_t min_count; }; -/// \brief Configure a grouped aggregation -struct ARROW_EXPORT Aggregate { - /// the name of the aggregation function - std::string function; - - /// options for the aggregation function - const FunctionOptions* options; - - // fields to which aggregations will be applied - FieldRef target; - - // output field name for aggregations - std::string name; -}; - /// \brief Control count aggregate kernel behavior. /// /// By default, only non-null values are counted. 
@@ -408,5 +393,20 @@ ARROW_EXPORT Result Index(const Datum& value, const IndexOptions& options, ExecContext* ctx = NULLPTR); +/// \brief Configure a grouped aggregation +struct ARROW_EXPORT Aggregate { + /// the name of the aggregation function + std::string function; + + /// options for the aggregation function + const FunctionOptions* options; + + // fields to which aggregations will be applied + FieldRef target; + + // output field name for aggregations + std::string name; +}; + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/exec/aggregate.h b/cpp/src/arrow/compute/exec/aggregate.h index 67d22d0df41..753b0a8c47e 100644 --- a/cpp/src/arrow/compute/exec/aggregate.h +++ b/cpp/src/arrow/compute/exec/aggregate.h @@ -46,8 +46,7 @@ Result> GetKernels( Result>> InitKernels( const std::vector& kernels, ExecContext* ctx, - const std::vector& aggregates, - const std::vector& in_descrs); + const std::vector& aggregates, const std::vector& in_descrs); Result ResolveKernels( const std::vector& aggregates, diff --git a/cpp/src/arrow/compute/exec/aggregate_node.cc b/cpp/src/arrow/compute/exec/aggregate_node.cc index c36da8c13f8..7ada036fcc4 100644 --- a/cpp/src/arrow/compute/exec/aggregate_node.cc +++ b/cpp/src/arrow/compute/exec/aggregate_node.cc @@ -41,24 +41,6 @@ using internal::checked_cast; namespace compute { -namespace internal { - -Result> GetKernels( - ExecContext* ctx, const std::vector& aggregates, - const std::vector& in_descrs); - -Result>> InitKernels( - const std::vector& kernels, ExecContext* ctx, - const std::vector& aggregates, const std::vector& in_descrs); - -Result ResolveKernels( - const std::vector& aggregates, - const std::vector& kernels, - const std::vector>& states, ExecContext* ctx, - const std::vector& descrs); - -} // namespace internal - namespace { void AggregatesToString( From c7a5e54009bdef8b730e64c45903aa65ad9f146a Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Tue, 21 Jun 2022 13:08:44 +0530 Subject: [PATCH 22/32] tmp check on function refactor --- r/R/query-engine.R | 84 +++++++++++++++++++++++++--------------------- 1 file changed, 45 insertions(+), 39 deletions(-) diff --git a/r/R/query-engine.R b/r/R/query-engine.R index 1a53298fd47..7aaf32ff0cc 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -14,7 +14,6 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - ExecPlan <- R6Class("ExecPlan", inherit = ArrowObject, public = list( @@ -99,44 +98,8 @@ ExecPlan <- R6Class("ExecPlan", } if (!is.null(.data$aggregations)) { - # Project to include just the data required for each aggregation, - # plus group_by_vars (last) - # TODO: validate that none of names(aggregations) are the same as names(group_by_vars) - # dplyr does not error on this but the result it gives isn't great - node <- node$Project(summarize_projection(.data)) - - if (grouped) { - # We need to prefix all of the aggregation function names with "hash_" - .data$aggregations <- lapply(.data$aggregations, function(x) { - x[["fun"]] <- paste0("hash_", x[["fun"]]) - x - }) - } - target_names <- names(.data$aggregations) - for (i in seq_len(length(target_names))) { - .data$aggregations[[i]][["name"]] <- .data$aggregations[[i]][["target"]] <- target_names[i] - } - - node <- node$Aggregate( - options = .data$aggregations, - key_names = group_vars - ) - - if (grouped) { - # The result will have result columns first then the grouping cols. 
- # dplyr orders group cols first, so adapt the result to meet that expectation. - node <- node$Project( - make_field_refs(c(group_vars, names(.data$aggregations))) - ) - if (getOption("arrow.summarise.sort", FALSE)) { - # Add sorting instructions for the rows too to match dplyr - # (see below about why sorting isn't itself a Node) - node$extras$sort <- list( - names = group_vars, - orders = rep(0L, length(group_vars)) - ) - } - } + list[node, data] <- private$.config_aggregation(node, .data, grouped, group_vars) + .data <- data } else { # If any columns are derived, reordered, or renamed we need to Project # If there are aggregations, the projection was already handled above @@ -258,6 +221,49 @@ ExecPlan <- R6Class("ExecPlan", ) }, Stop = function() ExecPlan_StopProducing(self) + ), + private = list( + .config_aggregation = function(node, data, grouped, group_vars) { + # Project to include just the data required for each aggregation, + # plus group_by_vars (last) + # TODO: validate that none of names(aggregations) are the same as names(group_by_vars) + # dplyr does not error on this but the result it gives isn't great + node <- node$Project(summarize_projection(data)) + + if (grouped) { + # We need to prefix all of the aggregation function names with "hash_" + data$aggregations <- lapply(data$aggregations, function(x) { + x[["fun"]] <- paste0("hash_", x[["fun"]]) + x + }) + } + target_names <- names(data$aggregations) + for (i in seq_len(length(target_names))) { + data$aggregations[[i]][["name"]] <- data$aggregations[[i]][["target"]] <- target_names[i] + } + + node <- node$Aggregate( + options = data$aggregations, + key_names = group_vars + ) + + if (grouped) { + # The result will have result columns first then the grouping cols. + # dplyr orders group cols first, so adapt the result to meet that expectation. 
+ node <- node$Project( + make_field_refs(c(group_vars, names(data$aggregations))) + ) + if (getOption("arrow.summarise.sort", FALSE)) { + # Add sorting instructions for the rows too to match dplyr + # (see below about why sorting isn't itself a Node) + node$extras$sort <- list( + names = group_vars, + orders = rep(0L, length(group_vars)) + ) + } + } + return(list(node = node, data = data)) + } ) ) ExecPlan$create <- function(use_threads = option_use_threads()) { From fd443e0fd2739e2adbd68b627b921453bb85dae0 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Wed, 22 Jun 2022 07:26:51 +0530 Subject: [PATCH 23/32] fixed the test issue --- r/R/query-engine.R | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/r/R/query-engine.R b/r/R/query-engine.R index 7aaf32ff0cc..8d87fa1be16 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -98,8 +98,9 @@ ExecPlan <- R6Class("ExecPlan", } if (!is.null(.data$aggregations)) { - list[node, data] <- private$.config_aggregation(node, .data, grouped, group_vars) - .data <- data + config_agg <- private$.set_aggregation(node, .data, grouped, group_vars) + node <- config_agg$node + .data <- config_agg$data } else { # If any columns are derived, reordered, or renamed we need to Project # If there are aggregations, the projection was already handled above @@ -223,7 +224,7 @@ ExecPlan <- R6Class("ExecPlan", Stop = function() ExecPlan_StopProducing(self) ), private = list( - .config_aggregation = function(node, data, grouped, group_vars) { + .set_aggregation = function(node, data, grouped, group_vars) { # Project to include just the data required for each aggregation, # plus group_by_vars (last) # TODO: validate that none of names(aggregations) are the same as names(group_by_vars) From d53c68520001fac74331ce44a8eeb755e3332768 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Wed, 22 Jun 2022 10:52:41 +0530 Subject: [PATCH 24/32] rebase --- .../execution_plan_documentation_examples.cc | 2 +- cpp/src/arrow/compute/api_aggregate.h | 2 +- cpp/src/arrow/compute/exec/aggregate.cc | 5 +- cpp/src/arrow/compute/exec/aggregate_node.cc | 7 +- cpp/src/arrow/compute/exec/plan_test.cc | 31 +- .../compute/kernels/hash_aggregate_test.cc | 313 +++++++++--------- r/src/compute-exec.cpp | 2 +- 7 files changed, 182 insertions(+), 180 deletions(-) diff --git a/cpp/examples/arrow/execution_plan_documentation_examples.cc b/cpp/examples/arrow/execution_plan_documentation_examples.cc index 44bff56e00c..331388a5291 100644 --- a/cpp/examples/arrow/execution_plan_documentation_examples.cc +++ b/cpp/examples/arrow/execution_plan_documentation_examples.cc @@ -540,7 +540,7 @@ arrow::Status SourceGroupAggregateSinkExample(cp::ExecContext& exec_context) { cp::MakeExecNode("source", plan.get(), {}, source_node_options)); auto options = std::make_shared(cp::CountOptions::ONLY_VALID); auto aggregate_options = - cp::AggregateNodeOptions{/*aggregates=*/{{"hash_count", &options, "a", "count(a)"}}, + cp::AggregateNodeOptions{/*aggregates=*/{{"hash_count", options, "a", "count(a)"}}, /*keys=*/{"b"}}; ARROW_ASSIGN_OR_RAISE( cp::ExecNode * aggregate, diff --git a/cpp/src/arrow/compute/api_aggregate.h b/cpp/src/arrow/compute/api_aggregate.h index 9b71994e5e4..55f434c6659 100644 --- a/cpp/src/arrow/compute/api_aggregate.h +++ b/cpp/src/arrow/compute/api_aggregate.h @@ -399,7 +399,7 @@ struct ARROW_EXPORT Aggregate { std::string function; /// options for the aggregation function - const FunctionOptions* options; + std::shared_ptr options; // fields to which aggregations 
will be applied FieldRef target; diff --git a/cpp/src/arrow/compute/exec/aggregate.cc b/cpp/src/arrow/compute/exec/aggregate.cc index 934cdd4d1fd..41b5bb75b66 100644 --- a/cpp/src/arrow/compute/exec/aggregate.cc +++ b/cpp/src/arrow/compute/exec/aggregate.cc @@ -22,6 +22,7 @@ #include "arrow/compute/exec_internal.h" #include "arrow/compute/registry.h" #include "arrow/compute/row/grouper.h" +#include "arrow/util/checked_cast.h" #include "arrow/util/task_group.h" namespace arrow { @@ -55,7 +56,9 @@ Result>> InitKernels( std::vector> states(kernels.size()); for (size_t i = 0; i < aggregates.size(); ++i) { - const FunctionOptions* options = aggregates[i].options.get(); + const FunctionOptions* options = + arrow::internal::checked_cast( + aggregates[i].options.get()); if (options == nullptr) { // use known default options for the named function if possible diff --git a/cpp/src/arrow/compute/exec/aggregate_node.cc b/cpp/src/arrow/compute/exec/aggregate_node.cc index 7ada036fcc4..8c7899c41ec 100644 --- a/cpp/src/arrow/compute/exec/aggregate_node.cc +++ b/cpp/src/arrow/compute/exec/aggregate_node.cc @@ -43,10 +43,9 @@ namespace compute { namespace { -void AggregatesToString( - std::stringstream* ss, const Schema& input_schema, const std::vector& aggs, - const std::vector& target_field_ids, - int indent = 0) { +void AggregatesToString(std::stringstream* ss, const Schema& input_schema, + const std::vector& aggs, + const std::vector& target_field_ids, int indent = 0) { *ss << "aggregates=[" << std::endl; for (size_t i = 0; i < aggs.size(); i++) { for (int j = 0; j < indent; ++j) *ss << " "; diff --git a/cpp/src/arrow/compute/exec/plan_test.cc b/cpp/src/arrow/compute/exec/plan_test.cc index 4ffee8bdc13..9efa6623e5a 100644 --- a/cpp/src/arrow/compute/exec/plan_test.cc +++ b/cpp/src/arrow/compute/exec/plan_test.cc @@ -1152,19 +1152,18 @@ TEST(ExecPlanExecution, AggregationPreservesOptions) { { auto options = std::make_shared(TDigestOptions::Defaults()); - ASSERT_OK( - Declaration::Sequence( - { - {"source", - SourceNodeOptions{basic_data.schema, basic_data.gen(/*parallel=*/false, - /*slow=*/false)}}, - {"aggregate", - AggregateNodeOptions{ - /*aggregates=*/{{"tdigest", options, "i32", "tdigest(i32)"}}, - }}, - {"sink", SinkNodeOptions{&sink_gen}}, - }) - .AddToPlan(plan.get())); + ASSERT_OK(Declaration::Sequence( + { + {"source", SourceNodeOptions{basic_data.schema, + basic_data.gen(/*parallel=*/false, + /*slow=*/false)}}, + {"aggregate", + AggregateNodeOptions{ + /*aggregates=*/{{"tdigest", options, "i32", "tdigest(i32)"}}, + }}, + {"sink", SinkNodeOptions{&sink_gen}}, + }) + .AddToPlan(plan.get())); } ASSERT_THAT(StartAndCollect(plan.get(), sink_gen), @@ -1186,9 +1185,9 @@ TEST(ExecPlanExecution, AggregationPreservesOptions) { {"source", SourceNodeOptions{data.schema, data.gen(/*parallel=*/false, /*slow=*/false)}}, {"aggregate", - AggregateNodeOptions{/*aggregates=*/{{"hash_count", options, - "i32", "count(i32)"}}, - /*keys=*/{"str"}}}, + AggregateNodeOptions{ + /*aggregates=*/{{"hash_count", options, "i32", "count(i32)"}}, + /*keys=*/{"str"}}}, {"sink", SinkNodeOptions{&sink_gen}}, }) .AddToPlan(plan.get())); diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc b/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc index c6e87e5e368..82d40aba948 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc @@ -237,13 +237,13 @@ ExecContext* small_chunksize_context(bool use_threads = false) { struct TestAggregate { std::string 
function; - const FunctionOptions* options; + std::shared_ptr options; }; Result GroupByTest(const std::vector& arguments, const std::vector& keys, const std::vector& aggregates, bool use_threads, - bool use_exec_plan) { + bool use_exec_plan = false) { std::vector internal_aggregates; int idx = 0; for (auto t_agg : aggregates) { @@ -1029,17 +1029,17 @@ TEST(GroupBy, MeanOnly) { [null, 3] ])"}); - ScalarAggregateOptions min_count(/*skip_nulls=*/true, /*min_count=*/3); - ASSERT_OK_AND_ASSIGN( - Datum aggregated_and_grouped, - internal::GroupBy( - {table->GetColumnByName("argument"), table->GetColumnByName("argument")}, - {table->GetColumnByName("key")}, - { - {"hash_mean", nullptr, "agg_0", "hash_mean"}, - {"hash_mean", min_count, "agg_1", "hash_mean"}, - }, - use_threads)); + auto min_count = + std::make_shared(/*skip_nulls=*/true, /*min_count=*/3); + ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped, + GroupByTest({table->GetColumnByName("argument"), + table->GetColumnByName("argument")}, + {table->GetColumnByName("key")}, + { + {"hash_mean", nullptr}, + {"hash_mean", min_count}, + }, + use_threads)); SortBy({"key_0"}, &aggregated_and_grouped); AssertDatumsApproxEqual(ArrayFromJSON(struct_({ @@ -1111,7 +1111,7 @@ TEST(GroupBy, VarianceAndStddev) { ])"); ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped, - internal::GroupBy( + GroupByTest( { batch->GetColumnByName("argument"), batch->GetColumnByName("argument"), @@ -1120,9 +1120,10 @@ TEST(GroupBy, VarianceAndStddev) { batch->GetColumnByName("key"), }, { - {"hash_variance", nullptr, "agg_0", "hash_variance_0"}, - {"hash_stddev", nullptr, "agg_1", "hash_stddev_1"}, - })); + {"hash_variance", nullptr}, + {"hash_stddev", nullptr}, + }, + false)); AssertDatumsApproxEqual(ArrayFromJSON(struct_({ field("hash_variance", float64()), @@ -1152,19 +1153,19 @@ TEST(GroupBy, VarianceAndStddev) { [null, 3] ])"); - ASSERT_OK_AND_ASSIGN(aggregated_and_grouped, - internal::GroupBy( - { - batch->GetColumnByName("argument"), - batch->GetColumnByName("argument"), - }, - { - batch->GetColumnByName("key"), - }, - { - {"hash_variance", nullptr, "agg_0", "hash_variance_0"}, - {"hash_stddev", nullptr, "agg_1", "hash_stddev_1"}, - })); + ASSERT_OK_AND_ASSIGN(aggregated_and_grouped, GroupByTest( + { + batch->GetColumnByName("argument"), + batch->GetColumnByName("argument"), + }, + { + batch->GetColumnByName("key"), + }, + { + {"hash_variance", nullptr}, + {"hash_stddev", nullptr}, + }, + false)); AssertDatumsApproxEqual(ArrayFromJSON(struct_({ field("hash_variance", float64()), @@ -1181,21 +1182,21 @@ TEST(GroupBy, VarianceAndStddev) { /*verbose=*/true); // Test ddof - VarianceOptions variance_options(/*ddof=*/2); - ASSERT_OK_AND_ASSIGN( - aggregated_and_grouped, - internal::GroupBy( - { - batch->GetColumnByName("argument"), - batch->GetColumnByName("argument"), - }, - { - batch->GetColumnByName("key"), - }, - { - {"hash_variance", variance_options, "agg_0", "hash_variance_0"}, - {"hash_stddev", variance_options, "agg_1", "hash_stddev_0"}, - })); + auto variance_options = std::make_shared(/*ddof=*/2); + ASSERT_OK_AND_ASSIGN(aggregated_and_grouped, + GroupByTest( + { + batch->GetColumnByName("argument"), + batch->GetColumnByName("argument"), + }, + { + batch->GetColumnByName("key"), + }, + { + {"hash_variance", variance_options}, + {"hash_stddev", variance_options}, + }, + false)); AssertDatumsApproxEqual(ArrayFromJSON(struct_({ field("hash_variance", float64()), @@ -1228,7 +1229,7 @@ TEST(GroupBy, VarianceAndStddevDecimal) { ])"); ASSERT_OK_AND_ASSIGN(Datum 
aggregated_and_grouped, - internal::GroupBy( + GroupByTest( { batch->GetColumnByName("argument0"), batch->GetColumnByName("argument0"), @@ -1239,11 +1240,12 @@ TEST(GroupBy, VarianceAndStddevDecimal) { batch->GetColumnByName("key"), }, { - {"hash_variance", nullptr, "agg_0", "hash_variance_0"}, - {"hash_stddev", nullptr, "agg_1", "hash_stddev_1"}, - {"hash_variance", nullptr, "agg_2", "hash_variance_2"}, - {"hash_stddev", nullptr, "agg_3", "hash_stddev_3"}, - })); + {"hash_variance", nullptr}, + {"hash_stddev", nullptr}, + {"hash_variance", nullptr}, + {"hash_stddev", nullptr}, + }, + false)); AssertDatumsApproxEqual(ArrayFromJSON(struct_({ field("hash_variance", float64()), @@ -1280,37 +1282,41 @@ TEST(GroupBy, TDigest) { [null, 4] ])"); - TDigestOptions options1(std::vector{0.5, 0.9, 0.99}); - TDigestOptions options2(std::vector{0.5, 0.9, 0.99}, /*delta=*/50, - /*buffer_size=*/1024); - TDigestOptions keep_nulls(/*q=*/0.5, /*delta=*/100, /*buffer_size=*/500, - /*skip_nulls=*/false, /*min_count=*/0); - TDigestOptions min_count(/*q=*/0.5, /*delta=*/100, /*buffer_size=*/500, - /*skip_nulls=*/true, /*min_count=*/3); - TDigestOptions keep_nulls_min_count(/*q=*/0.5, /*delta=*/100, /*buffer_size=*/500, - /*skip_nulls=*/false, /*min_count=*/3); - ASSERT_OK_AND_ASSIGN( - Datum aggregated_and_grouped, - internal::GroupBy( - { - batch->GetColumnByName("argument"), - batch->GetColumnByName("argument"), - batch->GetColumnByName("argument"), - batch->GetColumnByName("argument"), - batch->GetColumnByName("argument"), - batch->GetColumnByName("argument"), - }, - { - batch->GetColumnByName("key"), - }, - { - {"hash_tdigest", nullptr, "agg_0", "hash_tdigest_0"}, - {"hash_tdigest", options1, "agg_1", "hash_tdigest_1"}, - {"hash_tdigest", options2, "agg_2", "hash_tdigest_2"}, - {"hash_tdigest", keep_nulls, "agg_3", "hash_tdigest_3"}, - {"hash_tdigest", min_count, "agg_4", "hash_tdigest_4"}, - {"hash_tdigest", keep_nulls_min_count, "agg_5", "hash_tdigest_5"}, - })); + auto options1 = std::make_shared(std::vector{0.5, 0.9, 0.99}); + auto options2 = + std::make_shared(std::vector{0.5, 0.9, 0.99}, /*delta=*/50, + /*buffer_size=*/1024); + auto keep_nulls = + std::make_shared(/*q=*/0.5, /*delta=*/100, /*buffer_size=*/500, + /*skip_nulls=*/false, /*min_count=*/0); + auto min_count = + std::make_shared(/*q=*/0.5, /*delta=*/100, /*buffer_size=*/500, + /*skip_nulls=*/true, /*min_count=*/3); + auto keep_nulls_min_count = + std::make_shared(/*q=*/0.5, /*delta=*/100, /*buffer_size=*/500, + /*skip_nulls=*/false, /*min_count=*/3); + ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped, + GroupByTest( + { + batch->GetColumnByName("argument"), + batch->GetColumnByName("argument"), + batch->GetColumnByName("argument"), + batch->GetColumnByName("argument"), + batch->GetColumnByName("argument"), + batch->GetColumnByName("argument"), + }, + { + batch->GetColumnByName("key"), + }, + { + {"hash_tdigest", nullptr}, + {"hash_tdigest", options1}, + {"hash_tdigest", options2}, + {"hash_tdigest", keep_nulls}, + {"hash_tdigest", min_count}, + {"hash_tdigest", keep_nulls_min_count}, + }, + false)); AssertDatumsApproxEqual( ArrayFromJSON(struct_({ @@ -1349,16 +1355,17 @@ TEST(GroupBy, TDigestDecimal) { ])"); ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped, - internal::GroupBy( + GroupByTest( { batch->GetColumnByName("argument0"), batch->GetColumnByName("argument1"), }, {batch->GetColumnByName("key")}, { - {"hash_tdigest", nullptr, "agg_0", "hash_tdigest_0"}, - {"hash_tdigest", nullptr, "agg_1", "hash_tdigest_1"}, - })); + {"hash_tdigest", 
nullptr}, + {"hash_tdigest", nullptr}, + }, + false)); AssertDatumsApproxEqual( ArrayFromJSON(struct_({ @@ -1403,7 +1410,7 @@ TEST(GroupBy, ApproximateMedian) { auto keep_nulls_min_count = std::make_shared( /*skip_nulls=*/false, /*min_count=*/3); ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped, - internal::GroupBy( + GroupByTest( { batch->GetColumnByName("argument"), batch->GetColumnByName("argument"), @@ -1414,15 +1421,12 @@ TEST(GroupBy, ApproximateMedian) { batch->GetColumnByName("key"), }, { - {"hash_approximate_median", options, "agg_0", - "hash_approximate_median_0"}, - {"hash_approximate_median", keep_nulls, "agg_1", - "hash_approximate_median_1"}, - {"hash_approximate_median", min_count, "agg_2", - "hash_approximate_median_2"}, - {"hash_approximate_median", keep_nulls_min_count, - "agg_3", "hash_approximate_median_3"}, - })); + {"hash_approximate_median", options}, + {"hash_approximate_median", keep_nulls}, + {"hash_approximate_median", min_count}, + {"hash_approximate_median", keep_nulls_min_count}, + }, + false)); AssertDatumsApproxEqual(ArrayFromJSON(struct_({ field("hash_approximate_median", float64()), @@ -3670,9 +3674,9 @@ TEST(GroupBy, CountWithNullType) { }, {table->GetColumnByName("key")}, { - {"hash_count", all, "agg_0", "hash_count"}, - {"hash_count", only_valid, "agg_1", "hash_count"}, - {"hash_count", only_null, "agg_2", "hash_count"}, + {"hash_count", all}, + {"hash_count", only_valid}, + {"hash_count", only_null}, }, use_threads, use_exec_plan)); SortBy({"key_0"}, &aggregated_and_grouped); @@ -3715,9 +3719,9 @@ TEST(GroupBy, CountWithNullTypeEmptyTable) { }, {table->GetColumnByName("key")}, { - {"hash_count", all, "agg_0", "hash_count"}, - {"hash_count", only_valid, "agg_1", "hash_count"}, - {"hash_count", only_null, "agg_2", "hash_count"}, + {"hash_count", all}, + {"hash_count", only_valid}, + {"hash_count", only_null}, }, use_threads, use_exec_plan)); auto struct_arr = aggregated_and_grouped.array_as(); @@ -3881,23 +3885,22 @@ TEST(GroupBy, SumNullType) { for (bool use_exec_plan : {false, true}) { for (bool use_threads : {true, false}) { SCOPED_TRACE(use_threads ? "parallel/merged" : "serial"); - ASSERT_OK_AND_ASSIGN( - Datum aggregated_and_grouped, - GroupByTest( - { - table->GetColumnByName("argument"), - table->GetColumnByName("argument"), - table->GetColumnByName("argument"), - table->GetColumnByName("argument"), - }, - {table->GetColumnByName("key")}, - { - {"hash_sum", no_min, "agg_0", "hash_sum"}, - {"hash_sum", keep_nulls, "agg_1", "hash_sum"}, - {"hash_sum", min_count, "agg_2", "hash_sum"}, - {"hash_sum", keep_nulls_min_count, "agg_3", "hash_sum"}, - }, - use_threads, use_exec_plan)); + ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped, + GroupByTest( + { + table->GetColumnByName("argument"), + table->GetColumnByName("argument"), + table->GetColumnByName("argument"), + table->GetColumnByName("argument"), + }, + {table->GetColumnByName("key")}, + { + {"hash_sum", no_min}, + {"hash_sum", keep_nulls}, + {"hash_sum", min_count}, + {"hash_sum", keep_nulls_min_count}, + }, + use_threads, use_exec_plan)); SortBy({"key_0"}, &aggregated_and_grouped); AssertDatumsEqual(ArrayFromJSON(struct_({ @@ -3950,23 +3953,22 @@ TEST(GroupBy, ProductNullType) { for (bool use_exec_plan : {false, true}) { for (bool use_threads : {true, false}) { SCOPED_TRACE(use_threads ? 
"parallel/merged" : "serial"); - ASSERT_OK_AND_ASSIGN( - Datum aggregated_and_grouped, - GroupByTest( - { - table->GetColumnByName("argument"), - table->GetColumnByName("argument"), - table->GetColumnByName("argument"), - table->GetColumnByName("argument"), - }, - {table->GetColumnByName("key")}, - { - {"hash_product", no_min, "agg_0", "hash_product"}, - {"hash_product", keep_nulls, "agg_1", "hash_product"}, - {"hash_product", min_count, "agg_2", "hash_product"}, - {"hash_product", keep_nulls_min_count, "agg_3", "hash_product"}, - }, - use_threads, use_exec_plan)); + ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped, + GroupByTest( + { + table->GetColumnByName("argument"), + table->GetColumnByName("argument"), + table->GetColumnByName("argument"), + table->GetColumnByName("argument"), + }, + {table->GetColumnByName("key")}, + { + {"hash_product", no_min}, + {"hash_product", keep_nulls}, + {"hash_product", min_count}, + {"hash_product", keep_nulls_min_count}, + }, + use_threads, use_exec_plan)); SortBy({"key_0"}, &aggregated_and_grouped); AssertDatumsEqual(ArrayFromJSON(struct_({ @@ -4019,23 +4021,22 @@ TEST(GroupBy, MeanNullType) { for (bool use_exec_plan : {false, true}) { for (bool use_threads : {true, false}) { SCOPED_TRACE(use_threads ? "parallel/merged" : "serial"); - ASSERT_OK_AND_ASSIGN( - Datum aggregated_and_grouped, - GroupByTest( - { - table->GetColumnByName("argument"), - table->GetColumnByName("argument"), - table->GetColumnByName("argument"), - table->GetColumnByName("argument"), - }, - {table->GetColumnByName("key")}, - { - {"hash_mean", no_min, "agg_0", "hash_mean"}, - {"hash_mean", keep_nulls, "agg_1", "hash_mean"}, - {"hash_mean", min_count, "agg_2", "hash_mean"}, - {"hash_mean", keep_nulls_min_count, "agg_3", "hash_mean"}, - }, - use_threads, use_exec_plan)); + ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped, + GroupByTest( + { + table->GetColumnByName("argument"), + table->GetColumnByName("argument"), + table->GetColumnByName("argument"), + table->GetColumnByName("argument"), + }, + {table->GetColumnByName("key")}, + { + {"hash_mean", no_min}, + {"hash_mean", keep_nulls}, + {"hash_mean", min_count}, + {"hash_mean", keep_nulls_min_count}, + }, + use_threads, use_exec_plan)); SortBy({"key_0"}, &aggregated_and_grouped); AssertDatumsEqual(ArrayFromJSON(struct_({ @@ -4082,9 +4083,9 @@ TEST(GroupBy, NullTypeEmptyTable) { }, {table->GetColumnByName("key")}, { - {"hash_sum", no_min, "agg_0", "hash_sum"}, - {"hash_product", min_count, "agg_1", "hash_product"}, - {"hash_mean", keep_nulls, "agg_2", "hash_mean"}, + {"hash_sum", no_min}, + {"hash_product", min_count}, + {"hash_mean", keep_nulls}, }, use_threads, use_exec_plan)); auto struct_arr = aggregated_and_grouped.array_as(); diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index fc4a41ab03a..c725c3379a6 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -238,7 +238,7 @@ std::shared_ptr ExecNode_Aggregate( auto name = cpp11::as_cpp(name_opts["name"]); aggregates.push_back(arrow::compute::Aggregate{ - std::move(function), opts.get(), std::move(target), std::move(name)}); + std::move(function), opts, std::move(target), std::move(name)}); } std::vector keys; From 3ebc09a5243ca9c4b98526f682cc3db7444cdf37 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Wed, 22 Jun 2022 11:16:13 +0530 Subject: [PATCH 25/32] fix R format issue --- r/src/compute-exec.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index 
c725c3379a6..76112b4cefd 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -237,8 +237,8 @@ std::shared_ptr ExecNode_Aggregate( auto target = cpp11::as_cpp(name_opts["target"]); auto name = cpp11::as_cpp(name_opts["name"]); - aggregates.push_back(arrow::compute::Aggregate{ - std::move(function), opts, std::move(target), std::move(name)}); + aggregates.push_back(arrow::compute::Aggregate{std::move(function), opts, + std::move(target), std::move(name)}); } std::vector keys; From 5754ebb7e8dce91aff14fd88de2a7ef376e7b08e Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Wed, 22 Jun 2022 13:50:21 +0530 Subject: [PATCH 26/32] test fix for cglib --- c_glib/arrow-glib/compute.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/c_glib/arrow-glib/compute.cpp b/c_glib/arrow-glib/compute.cpp index 4044a4955dc..1fff429b5a2 100644 --- a/c_glib/arrow-glib/compute.cpp +++ b/c_glib/arrow-glib/compute.cpp @@ -1271,7 +1271,7 @@ garrow_aggregate_node_options_new(GList *aggregations, } else { arrow_aggregates.push_back({aggregation_priv->function, nullptr}); }; - if (!garrow_field_refs_add(arrow_targets, + if (!garrow_field_refs_add(aggregation_priv->output, aggregation_priv->input, error, "[aggregate-node-options][new][input]")) { @@ -1280,7 +1280,7 @@ garrow_aggregate_node_options_new(GList *aggregations, arrow_aggregates.push_back({ aggregation_priv->function, function_options, - arrow_targets[0], + aggregation_priv->output, aggregation_priv->output }); } From dfef8eceb1319cc31db764c7a8c50bdfad44609b Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Wed, 22 Jun 2022 19:30:23 +0530 Subject: [PATCH 27/32] typo fix in tpch bench script --- cpp/src/arrow/compute/exec/tpch_benchmark.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/exec/tpch_benchmark.cc b/cpp/src/arrow/compute/exec/tpch_benchmark.cc index b998da5eded..54ac7cbdbf5 100644 --- a/cpp/src/arrow/compute/exec/tpch_benchmark.cc +++ b/cpp/src/arrow/compute/exec/tpch_benchmark.cc @@ -77,7 +77,7 @@ std::shared_ptr Plan_Q1(AsyncGenerator>* sin auto sum_opts = std::make_shared(ScalarAggregateOptions::Defaults()); auto count_opts = std::make_shared(CountOptions::CountMode::ALL); - std::vector aggs = { + std::vector aggs = { {"hash_sum", sum_opts, "sum_qty", "sum_qty"}, {"hash_sum", sum_opts, "sum_base_price", "sum_base_price"}, {"hash_sum", sum_opts, "sum_disc_price", "sum_disc_price"}, From 038595ba499fc2de4338b8000b4dc7d3012ff142 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 23 Jun 2022 12:59:13 +0530 Subject: [PATCH 28/32] partition the function further --- r/R/query-engine.R | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/r/R/query-engine.R b/r/R/query-engine.R index 8d87fa1be16..860d947cd48 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -224,13 +224,7 @@ ExecPlan <- R6Class("ExecPlan", Stop = function() ExecPlan_StopProducing(self) ), private = list( - .set_aggregation = function(node, data, grouped, group_vars) { - # Project to include just the data required for each aggregation, - # plus group_by_vars (last) - # TODO: validate that none of names(aggregations) are the same as names(group_by_vars) - # dplyr does not error on this but the result it gives isn't great - node <- node$Project(summarize_projection(data)) - + .set_aggregate_func_names = function(data, grouped) { if (grouped) { # We need to prefix all of the aggregation function names with "hash_" data$aggregations 
<- lapply(data$aggregations, function(x) { @@ -238,16 +232,9 @@ ExecPlan <- R6Class("ExecPlan", x }) } - target_names <- names(data$aggregations) - for (i in seq_len(length(target_names))) { - data$aggregations[[i]][["name"]] <- data$aggregations[[i]][["target"]] <- target_names[i] - } - - node <- node$Aggregate( - options = data$aggregations, - key_names = group_vars - ) - + data + }, + .set_group_by = function(node, data, group_vars, grouped) { if (grouped) { # The result will have result columns first then the grouping cols. # dplyr orders group cols first, so adapt the result to meet that expectation. @@ -264,6 +251,27 @@ ExecPlan <- R6Class("ExecPlan", } } return(list(node = node, data = data)) + }, + .set_aggregation = function(node, data, grouped, group_vars) { + # Project to include just the data required for each aggregation, + # plus group_by_vars (last) + # TODO: validate that none of names(aggregations) are the same as names(group_by_vars) + # dplyr does not error on this but the result it gives isn't great + node <- node$Project(summarize_projection(data)) + + data <- private$.set_aggregate_func_names(data, grouped) + + target_names <- names(data$aggregations) + for (i in seq_len(length(target_names))) { + data$aggregations[[i]][["name"]] <- data$aggregations[[i]][["target"]] <- target_names[i] + } + + node <- node$Aggregate( + options = data$aggregations, + key_names = group_vars + ) + group_config <- private$.set_group_by(node, data, group_vars, grouped) + return(list(node = group_config$node, data = group_config$data)) } ) ) From 764927a3b58fc7238a8a8cec85971582aca91b8d Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 23 Jun 2022 18:25:49 +0530 Subject: [PATCH 29/32] added the overwritten patch --- c_glib/arrow-glib/compute.cpp | 21 +++++++-------------- c_glib/arrow-glib/compute.h | 2 +- 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/c_glib/arrow-glib/compute.cpp b/c_glib/arrow-glib/compute.cpp index 1fff429b5a2..193ba2837b9 100644 --- a/c_glib/arrow-glib/compute.cpp +++ b/c_glib/arrow-glib/compute.cpp @@ -1263,26 +1263,19 @@ garrow_aggregate_node_options_new(GList *aggregations, function_options = garrow_function_options_get_raw(aggregation_priv->options); }; - if (function_options) { - arrow_aggregates.push_back({ - aggregation_priv->function, - function_options->Copy(), - }); - } else { - arrow_aggregates.push_back({aggregation_priv->function, nullptr}); - }; - if (!garrow_field_refs_add(aggregation_priv->output, + std::vector arrow_targets; + if (!garrow_field_refs_add(arrow_targets, aggregation_priv->input, error, "[aggregate-node-options][new][input]")) { return NULL; } arrow_aggregates.push_back({ - aggregation_priv->function, - function_options, - aggregation_priv->output, - aggregation_priv->output - }); + aggregation_priv->function, + function_options ? 
function_options->Copy() : nullptr, + arrow_targets[0], + aggregation_priv->output, + }); } for (gsize i = 0; i < n_keys; ++i) { if (!garrow_field_refs_add(arrow_keys, diff --git a/c_glib/arrow-glib/compute.h b/c_glib/arrow-glib/compute.h index 818b76ed72c..32db15be8b0 100644 --- a/c_glib/arrow-glib/compute.h +++ b/c_glib/arrow-glib/compute.h @@ -755,7 +755,7 @@ garrow_quantile_options_get_qs(GArrowQuantileOptions *options, GARROW_AVAILABLE_IN_9_0 void garrow_quantile_options_set_q(GArrowQuantileOptions *options, - gdouble quantile); + gdouble q); GARROW_AVAILABLE_IN_9_0 void garrow_quantile_options_set_qs(GArrowQuantileOptions *options, From 4ab7b83dbd2985952585462ababf8d77a60f69ff Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 23 Jun 2022 21:17:49 +0530 Subject: [PATCH 30/32] removing the splitted function --- r/R/query-engine.R | 93 +++++++++++++++++++--------------------------- 1 file changed, 39 insertions(+), 54 deletions(-) diff --git a/r/R/query-engine.R b/r/R/query-engine.R index 860d947cd48..b033d86604e 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -98,9 +98,45 @@ ExecPlan <- R6Class("ExecPlan", } if (!is.null(.data$aggregations)) { - config_agg <- private$.set_aggregation(node, .data, grouped, group_vars) - node <- config_agg$node - .data <- config_agg$data + # Project to include just the data required for each aggregation, + # plus group_by_vars (last) + # TODO: validate that none of names(aggregations) are the same as names(group_by_vars) + # dplyr does not error on this but the result it gives isn't great + node <- node$Project(summarize_projection(.data)) + + if (grouped) { + # We need to prefix all of the aggregation function names with "hash_" + .data$aggregations <- lapply(.data$aggregations, function(x) { + x[["fun"]] <- paste0("hash_", x[["fun"]]) + x + }) + } + + target_names <- names(.data$aggregations) + for (i in seq_len(length(target_names))) { + .data$aggregations[[i]][["name"]] <- .data$aggregations[[i]][["target"]] <- target_names[i] + } + + node <- node$Aggregate( + options = .data$aggregations, + key_names = group_vars + ) + + if (grouped) { + # The result will have result columns first then the grouping cols. + # dplyr orders group cols first, so adapt the result to meet that expectation. + node <- node$Project( + make_field_refs(c(group_vars, names(.data$aggregations))) + ) + if (getOption("arrow.summarise.sort", FALSE)) { + # Add sorting instructions for the rows too to match dplyr + # (see below about why sorting isn't itself a Node) + node$extras$sort <- list( + names = group_vars, + orders = rep(0L, length(group_vars)) + ) + } + } } else { # If any columns are derived, reordered, or renamed we need to Project # If there are aggregations, the projection was already handled above @@ -222,57 +258,6 @@ ExecPlan <- R6Class("ExecPlan", ) }, Stop = function() ExecPlan_StopProducing(self) - ), - private = list( - .set_aggregate_func_names = function(data, grouped) { - if (grouped) { - # We need to prefix all of the aggregation function names with "hash_" - data$aggregations <- lapply(data$aggregations, function(x) { - x[["fun"]] <- paste0("hash_", x[["fun"]]) - x - }) - } - data - }, - .set_group_by = function(node, data, group_vars, grouped) { - if (grouped) { - # The result will have result columns first then the grouping cols. - # dplyr orders group cols first, so adapt the result to meet that expectation. 
- node <- node$Project( - make_field_refs(c(group_vars, names(data$aggregations))) - ) - if (getOption("arrow.summarise.sort", FALSE)) { - # Add sorting instructions for the rows too to match dplyr - # (see below about why sorting isn't itself a Node) - node$extras$sort <- list( - names = group_vars, - orders = rep(0L, length(group_vars)) - ) - } - } - return(list(node = node, data = data)) - }, - .set_aggregation = function(node, data, grouped, group_vars) { - # Project to include just the data required for each aggregation, - # plus group_by_vars (last) - # TODO: validate that none of names(aggregations) are the same as names(group_by_vars) - # dplyr does not error on this but the result it gives isn't great - node <- node$Project(summarize_projection(data)) - - data <- private$.set_aggregate_func_names(data, grouped) - - target_names <- names(data$aggregations) - for (i in seq_len(length(target_names))) { - data$aggregations[[i]][["name"]] <- data$aggregations[[i]][["target"]] <- target_names[i] - } - - node <- node$Aggregate( - options = data$aggregations, - key_names = group_vars - ) - group_config <- private$.set_group_by(node, data, group_vars, grouped) - return(list(node = group_config$node, data = group_config$data)) - } ) ) ExecPlan$create <- function(use_threads = option_use_threads()) { From 5c367b566ba2829c6758bda497454557afdca4d6 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Thu, 23 Jun 2022 14:42:44 -0400 Subject: [PATCH 31/32] Apply suggested imap() patch to reduce (detected) complexity. Update lintr config for 3.0.0 --- r/.lintr | 6 +++--- r/R/query-engine.R | 10 ++++++---- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/r/.lintr b/r/.lintr index 0298fd7f995..619339afca3 100644 --- a/r/.lintr +++ b/r/.lintr @@ -14,7 +14,7 @@ license: # Licensed to the Apache Software Foundation (ASF) under one # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -linters: with_defaults( +linters: linters_with_defaults( line_length_linter = line_length_linter(120), object_name_linter = NULL, # Even with a liberal definition of name styles, some of our names cause issues due to `.`s for s3 classes or NA in the name @@ -22,8 +22,8 @@ linters: with_defaults( # object_name_linter = object_name_linter(styles = c("snake_case", "camelCase", "CamelCase", "symbols", "dotted.case", "UPPERCASE", "SNAKE_CASE")), object_length_linter = object_length_linter(40), object_usage_linter = NULL, # R6 methods are flagged, - cyclocomp_linter = cyclocomp_linter(26), # TODO: reduce to default of 15 - open_curly_linter = NULL # styler and lintr conflict on this (https://github.com/r-lib/styler/issues/549#issuecomment-537191536) + cyclocomp_linter = cyclocomp_linter(26) # TODO: reduce to default of 15 + # See also https://github.com/r-lib/lintr/issues/804 for cyclocomp issues with R6 ) exclusions: list( "R/arrowExports.R", diff --git a/r/R/query-engine.R b/r/R/query-engine.R index b033d86604e..513b861d414 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -112,10 +112,12 @@ ExecPlan <- R6Class("ExecPlan", }) } - target_names <- names(.data$aggregations) - for (i in seq_len(length(target_names))) { - .data$aggregations[[i]][["name"]] <- .data$aggregations[[i]][["target"]] <- target_names[i] - } + .data$aggregations <- imap(.data$aggregations, function(x, name) { + # Embed the name inside the aggregation objects. 
`target` and `name` + # are the same because we just Project()ed the data that way above + x[["name"]] <- x[["target"]] <- name + x + }) node <- node$Aggregate( options = .data$aggregations, From 0a09d22c0d0be40dbf24e3017677b7eef88d7b0b Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Fri, 24 Jun 2022 08:19:05 -0400 Subject: [PATCH 32/32] Back out lintr changes (will PR separately) --- r/.lintr | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/r/.lintr b/r/.lintr index 619339afca3..0298fd7f995 100644 --- a/r/.lintr +++ b/r/.lintr @@ -14,7 +14,7 @@ license: # Licensed to the Apache Software Foundation (ASF) under one # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -linters: linters_with_defaults( +linters: with_defaults( line_length_linter = line_length_linter(120), object_name_linter = NULL, # Even with a liberal definition of name styles, some of our names cause issues due to `.`s for s3 classes or NA in the name @@ -22,8 +22,8 @@ linters: linters_with_defaults( # object_name_linter = object_name_linter(styles = c("snake_case", "camelCase", "CamelCase", "symbols", "dotted.case", "UPPERCASE", "SNAKE_CASE")), object_length_linter = object_length_linter(40), object_usage_linter = NULL, # R6 methods are flagged, - cyclocomp_linter = cyclocomp_linter(26) # TODO: reduce to default of 15 - # See also https://github.com/r-lib/lintr/issues/804 for cyclocomp issues with R6 + cyclocomp_linter = cyclocomp_linter(26), # TODO: reduce to default of 15 + open_curly_linter = NULL # styler and lintr conflict on this (https://github.com/r-lib/styler/issues/549#issuecomment-537191536) ) exclusions: list( "R/arrowExports.R",
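Taken together, the C++, R, and GLib changes in this series converge on a single shape for an aggregation: arrow::compute::Aggregate carries the function name, a shared_ptr to its FunctionOptions, the target FieldRef, and the output name, so AggregateNodeOptions only needs the list of aggregates plus the grouping keys. The short sketch below illustrates that usage; the header paths, the MakeCountByKeyOptions helper, and the exact constructor signatures are illustrative assumptions inferred from the tests touched above (plan_test.cc, tpch_benchmark.cc, compute-exec.cpp), not verbatim excerpts from the patches.

    #include <memory>
    #include <utility>
    #include <vector>

    // Assumed header locations for Aggregate / CountOptions and AggregateNodeOptions.
    #include "arrow/compute/api_aggregate.h"
    #include "arrow/compute/exec/options.h"

    namespace cp = arrow::compute;

    // Build options for a grouped count, mirroring the brace-initializer style
    // used by the refactored tests in this series.
    cp::AggregateNodeOptions MakeCountByKeyOptions() {
      auto count_opts =
          std::make_shared<cp::CountOptions>(cp::CountOptions::CountMode::ALL);
      // Each Aggregate now carries: function name, options, target field, output name.
      std::vector<cp::Aggregate> aggregates = {
          {"hash_count", count_opts, /*target=*/"a", /*name=*/"count(a)"},
      };
      // Grouping keys are passed alongside the aggregates; no separate targets vector.
      return cp::AggregateNodeOptions{std::move(aggregates), /*keys=*/{"b"}};
    }

The returned options object would then be handed to MakeExecNode("aggregate", ...) in the usual way; because the target and output name travel with each Aggregate, callers no longer need to keep parallel targets and names vectors in sync with the aggregate list.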