From 9ae14d8ad293662b245b799e48daa1841860207b Mon Sep 17 00:00:00 2001 From: David Li Date: Thu, 5 Aug 2021 13:40:21 -0400 Subject: [PATCH 01/13] ARROW-13311: [C++][Documentation] Document hash aggregate kernels --- .../arrow/compute/kernels/hash_aggregate.cc | 80 +++++++++++++++---- .../compute/kernels/hash_aggregate_test.cc | 45 ++++++++--- docs/source/cpp/compute.rst | 64 ++++++++++++++- 3 files changed, 157 insertions(+), 32 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/cpp/src/arrow/compute/kernels/hash_aggregate.cc index 4fd6af9b190..4b530d24f81 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc @@ -1740,15 +1740,18 @@ struct GroupedMinMaxFactory { // Any/All implementation struct GroupedAnyImpl : public GroupedAggregator { - Status Init(ExecContext* ctx, const FunctionOptions*) override { + Status Init(ExecContext* ctx, const FunctionOptions* options) override { + options_ = *checked_cast(options); seen_ = TypedBufferBuilder(ctx->memory_pool()); + has_nulls_ = TypedBufferBuilder(ctx->memory_pool()); return Status::OK(); } Status Resize(int64_t new_num_groups) override { auto added_groups = new_num_groups - num_groups_; num_groups_ = new_num_groups; - return seen_.Append(added_groups, false); + RETURN_NOT_OK(seen_.Append(added_groups, false)); + return has_nulls_.Append(added_groups, false); } Status Merge(GroupedAggregator&& raw_other, @@ -1757,29 +1760,48 @@ struct GroupedAnyImpl : public GroupedAggregator { auto seen = seen_.mutable_data(); auto other_seen = other->seen_.data(); + auto has_nulls = has_nulls_.mutable_data(); + auto other_has_nulls = other->has_nulls_.data(); auto g = group_id_mapping.GetValues(1); for (int64_t other_g = 0; other_g < group_id_mapping.length; ++other_g, ++g) { if (BitUtil::GetBit(other_seen, other_g)) BitUtil::SetBitTo(seen, *g, true); + if (BitUtil::GetBit(other_has_nulls, other_g)) { + BitUtil::SetBitTo(has_nulls, *g, true); + } } 
return Status::OK(); } Status Consume(const ExecBatch& batch) override { auto seen = seen_.mutable_data(); + auto has_nulls = has_nulls_.mutable_data(); const auto& input = *batch[0].array(); auto g = batch[1].array()->GetValues(1); - arrow::internal::VisitTwoBitBlocksVoid( - input.buffers[0], input.offset, input.buffers[1], input.offset, input.length, - [&](int64_t) { BitUtil::SetBitTo(seen, *g++, true); }, [&]() { g++; }); + auto values = input.buffers[1]->data(); + arrow::internal::VisitBitBlocksVoid( + input.buffers[0], input.offset, input.length, + [&](int64_t offset) { + BitUtil::SetBitTo(seen, *g++, BitUtil::GetBit(values, input.offset + offset)); + }, + [&]() { BitUtil::SetBitTo(has_nulls, *g++, true); }); return Status::OK(); } Result Finalize() override { ARROW_ASSIGN_OR_RAISE(auto seen, seen_.Finish()); - return std::make_shared(num_groups_, std::move(seen)); + if (options_.skip_nulls) { + return std::make_shared(num_groups_, std::move(seen)); + } + ARROW_ASSIGN_OR_RAISE(auto bitmap, has_nulls_.Finish()); + // null if (~seen & has_nulls) -> not null if (seen | ~has_nulls) + ::arrow::internal::BitmapOrNot(seen->data(), /*left_offset=*/0, bitmap->data(), + /*right_offset=*/0, num_groups_, /*out_offset=*/0, + bitmap->mutable_data()); + return std::make_shared(num_groups_, std::move(seen), + std::move(bitmap)); } std::shared_ptr out_type() const override { return boolean(); } @@ -1787,18 +1809,22 @@ struct GroupedAnyImpl : public GroupedAggregator { int64_t num_groups_ = 0; ScalarAggregateOptions options_; TypedBufferBuilder seen_; + TypedBufferBuilder has_nulls_; }; struct GroupedAllImpl : public GroupedAggregator { - Status Init(ExecContext* ctx, const FunctionOptions*) override { + Status Init(ExecContext* ctx, const FunctionOptions* options) override { + options_ = *checked_cast(options); seen_ = TypedBufferBuilder(ctx->memory_pool()); + has_nulls_ = TypedBufferBuilder(ctx->memory_pool()); return Status::OK(); } Status Resize(int64_t new_num_groups) 
override { auto added_groups = new_num_groups - num_groups_; num_groups_ = new_num_groups; - return seen_.Append(added_groups, true); + RETURN_NOT_OK(seen_.Append(added_groups, true)); + return has_nulls_.Append(added_groups, false); } Status Merge(GroupedAggregator&& raw_other, @@ -1807,17 +1833,23 @@ struct GroupedAllImpl : public GroupedAggregator { auto seen = seen_.mutable_data(); auto other_seen = other->seen_.data(); + auto has_nulls = has_nulls_.mutable_data(); + auto other_has_nulls = other->has_nulls_.data(); auto g = group_id_mapping.GetValues(1); for (int64_t other_g = 0; other_g < group_id_mapping.length; ++other_g, ++g) { BitUtil::SetBitTo( seen, *g, BitUtil::GetBit(seen, *g) && BitUtil::GetBit(other_seen, other_g)); + if (BitUtil::GetBit(other_has_nulls, other_g)) { + BitUtil::SetBitTo(has_nulls, *g, true); + } } return Status::OK(); } Status Consume(const ExecBatch& batch) override { auto seen = seen_.mutable_data(); + auto has_nulls = has_nulls_.mutable_data(); const auto& input = *batch[0].array(); @@ -1832,7 +1864,7 @@ struct GroupedAllImpl : public GroupedAggregator { BitUtil::GetBit(bitmap, input.offset + position)); g++; }, - [&]() { g++; }); + [&]() { BitUtil::SetBitTo(has_nulls, *g++, true); }); } else { arrow::internal::VisitBitBlocksVoid( input.buffers[1], input.offset, input.length, [&](int64_t) { g++; }, @@ -1843,7 +1875,18 @@ struct GroupedAllImpl : public GroupedAggregator { Result Finalize() override { ARROW_ASSIGN_OR_RAISE(auto seen, seen_.Finish()); - return std::make_shared(num_groups_, std::move(seen)); + if (options_.skip_nulls) { + return std::make_shared(num_groups_, std::move(seen)); + } + ARROW_ASSIGN_OR_RAISE(auto bitmap, has_nulls_.Finish()); + // null if (seen & has_nulls) + ::arrow::internal::BitmapAnd(seen->data(), /*left_offset=*/0, bitmap->data(), + /*right_offset=*/0, num_groups_, /*out_offset=*/0, + bitmap->mutable_data()); + ::arrow::internal::InvertBitmap(bitmap->data(), /*offset=*/0, num_groups_, + 
bitmap->mutable_data(), /*dest_offset=*/0); + return std::make_shared(num_groups_, std::move(seen), + std::move(bitmap)); } std::shared_ptr out_type() const override { return boolean(); } @@ -1851,6 +1894,7 @@ struct GroupedAllImpl : public GroupedAggregator { int64_t num_groups_ = 0; ScalarAggregateOptions options_; TypedBufferBuilder seen_; + TypedBufferBuilder has_nulls_; }; } // namespace @@ -2155,7 +2199,8 @@ const FunctionDoc hash_tdigest_doc{ ("By default, the 0.5 quantile (median) is returned.\n" "Nulls and NaNs are ignored.\n" "A null array is returned if there are no valid data points."), - {"array", "group_id_array"}}; + {"array", "group_id_array"}, + "TDigestOptions"}; const FunctionDoc hash_min_max_doc{ "Compute the minimum and maximum values of a numeric array", @@ -2175,6 +2220,9 @@ const FunctionDoc hash_all_doc{"Test whether all elements evaluate to true", void RegisterHashAggregateBasic(FunctionRegistry* registry) { static auto default_scalar_aggregate_options = ScalarAggregateOptions::Defaults(); + static auto default_tdigest_options = TDigestOptions::Defaults(); + static auto default_variance_options = VarianceOptions::Defaults(); + { static auto default_count_options = CountOptions::Defaults(); auto func = std::make_shared( @@ -2222,7 +2270,6 @@ void RegisterHashAggregateBasic(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunction(std::move(func))); } - static auto default_variance_options = VarianceOptions::Defaults(); { auto func = std::make_shared( "hash_stddev", Arity::Binary(), &hash_stddev_doc, &default_variance_options); @@ -2247,7 +2294,6 @@ void RegisterHashAggregateBasic(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunction(std::move(func))); } - static auto default_tdigest_options = TDigestOptions::Defaults(); { auto func = std::make_shared( "hash_tdigest", Arity::Binary(), &hash_tdigest_doc, &default_tdigest_options); @@ -2273,15 +2319,15 @@ void RegisterHashAggregateBasic(FunctionRegistry* registry) { } { - auto func 
= std::make_shared("hash_any", Arity::Binary(), - &hash_any_doc); + auto func = std::make_shared( + "hash_any", Arity::Binary(), &hash_any_doc, &default_scalar_aggregate_options); DCHECK_OK(func->AddKernel(MakeKernel(boolean(), HashAggregateInit))); DCHECK_OK(registry->AddFunction(std::move(func))); } { - auto func = std::make_shared("hash_all", Arity::Binary(), - &hash_all_doc); + auto func = std::make_shared( + "hash_all", Arity::Binary(), &hash_all_doc, &default_scalar_aggregate_options); DCHECK_OK(func->AddKernel(MakeKernel(boolean(), HashAggregateInit))); DCHECK_OK(registry->AddFunction(std::move(func))); } diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc b/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc index c69b51e71fc..6f180b6b3b8 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc @@ -1083,6 +1083,7 @@ TEST(GroupBy, MinMaxDecimal) { } TEST(GroupBy, AnyAndAll) { + ScalarAggregateOptions options(/*skip_nulls=*/false); for (bool use_threads : {true, false}) { SCOPED_TRACE(use_threads ? 
"parallel/merged" : "serial"); @@ -1094,6 +1095,9 @@ TEST(GroupBy, AnyAndAll) { R"([ [false, 2], [null, 3], + [null, 4], + [false, 4], + [true, 5], [false, null], [true, 1], [true, 2] @@ -1105,26 +1109,43 @@ TEST(GroupBy, AnyAndAll) { ])"}); ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped, - internal::GroupBy({table->GetColumnByName("argument"), - table->GetColumnByName("argument")}, - {table->GetColumnByName("key")}, - { - {"hash_any", nullptr}, - {"hash_all", nullptr}, - }, - use_threads)); + internal::GroupBy( + { + table->GetColumnByName("argument"), + table->GetColumnByName("argument"), + table->GetColumnByName("argument"), + table->GetColumnByName("argument"), + }, + {table->GetColumnByName("key")}, + { + {"hash_any", nullptr}, + {"hash_all", nullptr}, + {"hash_any", &options}, + {"hash_all", &options}, + }, + use_threads)); SortBy({"key_0"}, &aggregated_and_grouped); + // Group 1: trues and nulls + // Group 2: trues and falses + // Group 3: nulls + // Group 4: falses and nulls + // Group 5: trues + // Group null: falses AssertDatumsEqual(ArrayFromJSON(struct_({ + field("hash_any", boolean()), + field("hash_all", boolean()), field("hash_any", boolean()), field("hash_all", boolean()), field("key_0", int64()), }), R"([ - [true, true, 1], - [true, false, 2], - [false, true, 3], - [false, false, null] + [true, true, true, null, 1], + [true, false, true, false, 2], + [false, true, null, null, 3], + [false, false, null, false, 4], + [true, true, true, true, 5], + [false, false, false, false, null] ])"), aggregated_and_grouped, /*verbose=*/true); diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 39bbbec3e16..4f47ea41195 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -183,6 +183,9 @@ recommend you try it out. Unsupported input types return a ``TypeError`` Aggregations ------------ +Scalar aggregations operate on a (chunked) array or scalar value and reduce +the input to a single output value. 
+ +---------------+-------+-------------+----------------+----------------------------------+-------+ | Function name | Arity | Input types | Output type | Options class | Notes | +===============+=======+=============+================+==================================+=======+ @@ -208,15 +211,16 @@ Aggregations +---------------+-------+-------------+----------------+----------------------------------+-------+ | sum | Unary | Numeric | Scalar Numeric | :struct:`ScalarAggregateOptions` | \(5) | +---------------+-------+-------------+----------------+----------------------------------+-------+ -| tdigest | Unary | Numeric | Scalar Float64 | :struct:`TDigestOptions` | | +| tdigest | Unary | Numeric | Scalar Float64 | :struct:`TDigestOptions` | \(6) | +---------------+-------+-------------+----------------+----------------------------------+-------+ | variance | Unary | Numeric | Scalar Float64 | :struct:`VarianceOptions` | | +---------------+-------+-------------+----------------+----------------------------------+-------+ Notes: -* \(1) If null values are taken into account by setting ScalarAggregateOptions - parameter skip_nulls = false then `Kleene logic`_ logic is applied. +* \(1) If null values are taken into account, by setting the + ScalarAggregateOptions parameter skip_nulls = false, then `Kleene logic`_ + logic is applied. The min_count option is not respected. * \(2) CountMode controls whether only non-null values are counted (the default), only null values are counted, or all values are counted. @@ -230,10 +234,64 @@ Notes: Note that the output can have less than *N* elements if the input has less than *N* distinct values. + The mode kernel is not a proper aggregate (it is actually a vector + function, see below). + * \(5) Output is Int64, UInt64 or Float64, depending on the input type. * \(6) Output is Float64 or input type, depending on QuantileOptions. + The quantile kernel is not a proper aggregate (it is actually a vector + function, see below). 
+ +* \(6) tdigest/t-digest computes approximate quantiles, and so only needs a + fixed amount of memory. See the `reference implementation + `_ for details. + +Hash Aggregations ("group by") +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Hash aggregations are not directly invokable, but are used as part of a group +by operation. Like scalar aggregations, hash aggregations reduce their input +to a single output value, but do so on subsets of the input, based on a +partitioning of the input values on some set of "key" columns, and emit one +output per input group. + ++---------------+-------+-------------+----------------+----------------------------------+-------+ +| Function name | Arity | Input types | Output type | Options class | Notes | ++===============+=======+=============+================+==================================+=======+ +| hash_all | Unary | Boolean | Scalar Int64 | :struct:`ScalarAggregateOptions` | \(1) | ++---------------+-------+-------------+----------------+----------------------------------+-------+ +| hash_any | Unary | Any | Scalar Int64 | :struct:`ScalarAggregateOptions` | \(1) | ++---------------+-------+-------------+----------------+----------------------------------+-------+ +| hash_count | Unary | Boolean | Scalar Int64 | :struct:`CountOptions` | \(2) | ++---------------+-------+-------------+----------------+----------------------------------+-------+ +| hash_mean | Unary | Numeric | Scalar Float64 | | | ++---------------+-------+-------------+----------------+----------------------------------+-------+ +| hash_min_max | Unary | Numeric | Scalar Struct | :struct:`ScalarAggregateOptions` | \(3) | ++---------------+-------+-------------+----------------+----------------------------------+-------+ +| hash_stddev | Unary | Numeric | Scalar Float64 | :struct:`VarianceOptions` | | ++---------------+-------+-------------+----------------+----------------------------------+-------+ +| hash_sum | Unary | Numeric | Scalar Numeric | | | 
++---------------+-------+-------------+----------------+----------------------------------+-------+ +| hash_tdigest | Unary | Numeric | Scalar Float64 | :struct:`TDigestOptions` | \(4) | ++---------------+-------+-------------+----------------+----------------------------------+-------+ +| hash_variance | Unary | Numeric | Scalar Float64 | :struct:`VarianceOptions` | | ++---------------+-------+-------------+----------------+----------------------------------+-------+ + +* \(1) If null values are taken into account, by setting the + ScalarAggregateOptions parameter skip_nulls = false, then `Kleene logic`_ + logic is applied. The min_count option is not respected. + +* \(2) CountMode controls whether only non-null values are counted (the + default), only null values are counted, or all values are counted. + +* \(3) Output is a ``{"min": input type, "max": input type}`` Struct scalar. + +* \(4) tdigest/t-digest computes approximate quantiles, and so only needs a + fixed amount of memory. See the `reference implementation + `_ for details. 
+ Element-wise ("scalar") functions --------------------------------- From 9ed71cb2c94b9cf0fbf186b17806cb5a57db093a Mon Sep 17 00:00:00 2001 From: David Li Date: Thu, 5 Aug 2021 14:52:25 -0400 Subject: [PATCH 02/13] ARROW-13311: [C++][Documentation] Fix hash_any --- cpp/src/arrow/compute/kernels/hash_aggregate.cc | 5 ++++- cpp/src/arrow/compute/kernels/hash_aggregate_test.cc | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/cpp/src/arrow/compute/kernels/hash_aggregate.cc index 4b530d24f81..17a090ff293 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc @@ -1784,7 +1784,10 @@ struct GroupedAnyImpl : public GroupedAggregator { arrow::internal::VisitBitBlocksVoid( input.buffers[0], input.offset, input.length, [&](int64_t offset) { - BitUtil::SetBitTo(seen, *g++, BitUtil::GetBit(values, input.offset + offset)); + BitUtil::SetBitTo(seen, *g, + BitUtil::GetBit(seen, *g) || + BitUtil::GetBit(values, input.offset + offset)); + g++; }, [&]() { BitUtil::SetBitTo(has_nulls, *g++, true); }); return Status::OK(); diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc b/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc index 6f180b6b3b8..e96fdcd6084 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc @@ -1103,7 +1103,7 @@ TEST(GroupBy, AnyAndAll) { [true, 2] ])", R"([ - [true, 2], + [false, 2], [false, null], [null, 3] ])"}); From 5cf6f33e618f0ebb5b191ebdd836dfe81016ff45 Mon Sep 17 00:00:00 2001 From: David Li Date: Fri, 6 Aug 2021 15:43:40 -0400 Subject: [PATCH 03/13] ARROW-13311: [C++][Documentation] Fix hash_min_max --- cpp/src/arrow/compute/kernels/hash_aggregate.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/cpp/src/arrow/compute/kernels/hash_aggregate.cc index 
17a090ff293..4a77e4f51b3 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc @@ -1718,11 +1718,11 @@ struct GroupedMinMaxFactory { } Status Visit(const HalfFloatType& type) { - return Status::NotImplemented("Summing data of type ", type); + return Status::NotImplemented("Computing min/max of data of type ", type); } Status Visit(const DataType& type) { - return Status::NotImplemented("Summing data of type ", type); + return Status::NotImplemented("Computing min/max of data of type ", type); } static Result Make(const std::shared_ptr& type) { @@ -2313,7 +2313,6 @@ void RegisterHashAggregateBasic(FunctionRegistry* registry) { auto func = std::make_shared( "hash_min_max", Arity::Binary(), &hash_min_max_doc, &default_scalar_aggregate_options); - DCHECK_OK(AddHashAggKernels({boolean()}, GroupedSumFactory::Make, func.get())); DCHECK_OK(AddHashAggKernels(NumericTypes(), GroupedMinMaxFactory::Make, func.get())); // Type parameters are ignored DCHECK_OK(AddHashAggKernels({decimal128(1, 1), decimal256(1, 1)}, From 746cbe3929e92ffb61be1d667c5422793b281491 Mon Sep 17 00:00:00 2001 From: David Li Date: Mon, 16 Aug 2021 10:07:38 -0400 Subject: [PATCH 04/13] ARROW-13311: [C++][Documentation] Add example of grouped aggregation --- docs/source/cpp/compute.rst | 50 +++++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 8 deletions(-) diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 4f47ea41195..b9b69b0a0a8 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -248,14 +248,48 @@ Notes: fixed amount of memory. See the `reference implementation `_ for details. -Hash Aggregations ("group by") -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Hash aggregations are not directly invokable, but are used as part of a group -by operation. 
Like scalar aggregations, hash aggregations reduce their input -to a single output value, but do so on subsets of the input, based on a -partitioning of the input values on some set of "key" columns, and emit one -output per input group. +Grouped Aggregations ("group by") +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Grouped aggregations are not directly invokable, but are used as part of a +group by operation. Like scalar aggregations, grouped aggregations reduce +multiple input values to a single output value. Instead of aggregating all +values of the input, however, grouped aggregations partition of the input +values on some set of "key" columns, then aggregate each group individually, +and emit one output per input group. + +As an example, for the following table: + ++-----------------+--------------+ +| Column "x" | Column "key" | ++=================+==============+ +| 2 | "a" | ++-----------------+--------------+ +| 5 | "a" | ++-----------------+--------------+ +| null | "b" | ++-----------------+--------------+ +| null | "b" | ++-----------------+--------------+ +| null | null | ++-----------------+--------------+ +| 5 | null | ++-----------------+--------------+ + +We compute a sum of column "x", grouped on the key column "key". This gives us +three groups: + ++-----------------+--------------+ +| Column "sum(x)" | Column "key" | ++=================+==============+ +| 7 | "a" | ++-----------------+--------------+ +| null | "b" | ++-----------------+--------------+ +| 5 | null | ++-----------------+--------------+ + +The supported aggregation functions are as follows. 
+---------------+-------+-------------+----------------+----------------------------------+-------+ | Function name | Arity | Input types | Output type | Options class | Notes | From 152b150bfc8ae644625507305b91ec65a2eeb526 Mon Sep 17 00:00:00 2001 From: David Li Date: Tue, 17 Aug 2021 10:04:08 -0400 Subject: [PATCH 05/13] ARROW-13311: [Documentation] Fix footnote numbering --- docs/source/cpp/compute.rst | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index b9b69b0a0a8..405b9b4457e 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -211,7 +211,7 @@ the input to a single output value. +---------------+-------+-------------+----------------+----------------------------------+-------+ | sum | Unary | Numeric | Scalar Numeric | :struct:`ScalarAggregateOptions` | \(5) | +---------------+-------+-------------+----------------+----------------------------------+-------+ -| tdigest | Unary | Numeric | Scalar Float64 | :struct:`TDigestOptions` | \(6) | +| tdigest | Unary | Numeric | Scalar Float64 | :struct:`TDigestOptions` | \(7) | +---------------+-------+-------------+----------------+----------------------------------+-------+ | variance | Unary | Numeric | Scalar Float64 | :struct:`VarianceOptions` | | +---------------+-------+-------------+----------------+----------------------------------+-------+ @@ -234,17 +234,11 @@ Notes: Note that the output can have less than *N* elements if the input has less than *N* distinct values. - The mode kernel is not a proper aggregate (it is actually a vector - function, see below). - * \(5) Output is Int64, UInt64 or Float64, depending on the input type. * \(6) Output is Float64 or input type, depending on QuantileOptions. - The quantile kernel is not a proper aggregate (it is actually a vector - function, see below). 
- -* \(6) tdigest/t-digest computes approximate quantiles, and so only needs a +* \(7) tdigest/t-digest computes approximate quantiles, and so only needs a fixed amount of memory. See the `reference implementation `_ for details. From a536658ad73f2116bb34e06586c6faf7f734189c Mon Sep 17 00:00:00 2001 From: David Li Date: Tue, 17 Aug 2021 11:01:26 -0400 Subject: [PATCH 06/13] ARROW-13311: [Documentation] Fix hash_mean/hash_sum doc structs --- .../arrow/compute/kernels/hash_aggregate.cc | 6 +++-- docs/source/cpp/compute.rst | 22 ++++++++++--------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/cpp/src/arrow/compute/kernels/hash_aggregate.cc index 4a77e4f51b3..9222c5dd18f 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc @@ -2169,7 +2169,8 @@ const FunctionDoc hash_count_doc{"Count the number of null / non-null values", const FunctionDoc hash_sum_doc{"Sum values of a numeric array", ("Null values are ignored."), - {"array", "group_id_array"}}; + {"array", "group_id_array"}, + "ScalarAggregateOptions"}; const FunctionDoc hash_product_doc{ "Compute product of values of a numeric array", @@ -2179,7 +2180,8 @@ const FunctionDoc hash_product_doc{ const FunctionDoc hash_mean_doc{"Average values of a numeric array", ("Null values are ignored."), - {"array", "group_id_array"}}; + {"array", "group_id_array"}, + "ScalarAggregateOptions"}; const FunctionDoc hash_stddev_doc{ "Calculate the standard deviation of a numeric array", diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 405b9b4457e..71c424523ef 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -288,23 +288,23 @@ The supported aggregation functions are as follows. 
+---------------+-------+-------------+----------------+----------------------------------+-------+ | Function name | Arity | Input types | Output type | Options class | Notes | +===============+=======+=============+================+==================================+=======+ -| hash_all | Unary | Boolean | Scalar Int64 | :struct:`ScalarAggregateOptions` | \(1) | +| hash_all | Unary | Boolean | Int64 | :struct:`ScalarAggregateOptions` | \(1) | +---------------+-------+-------------+----------------+----------------------------------+-------+ -| hash_any | Unary | Any | Scalar Int64 | :struct:`ScalarAggregateOptions` | \(1) | +| hash_any | Unary | Any | Int64 | :struct:`ScalarAggregateOptions` | \(1) | +---------------+-------+-------------+----------------+----------------------------------+-------+ -| hash_count | Unary | Boolean | Scalar Int64 | :struct:`CountOptions` | \(2) | +| hash_count | Unary | Boolean | Int64 | :struct:`CountOptions` | \(2) | +---------------+-------+-------------+----------------+----------------------------------+-------+ -| hash_mean | Unary | Numeric | Scalar Float64 | | | +| hash_mean | Unary | Numeric | Float64 | :struct:`ScalarAggregateOptions` | | +---------------+-------+-------------+----------------+----------------------------------+-------+ -| hash_min_max | Unary | Numeric | Scalar Struct | :struct:`ScalarAggregateOptions` | \(3) | +| hash_min_max | Unary | Numeric | Struct | :struct:`ScalarAggregateOptions` | \(3) | +---------------+-------+-------------+----------------+----------------------------------+-------+ -| hash_stddev | Unary | Numeric | Scalar Float64 | :struct:`VarianceOptions` | | +| hash_stddev | Unary | Numeric | Float64 | :struct:`VarianceOptions` | | +---------------+-------+-------------+----------------+----------------------------------+-------+ -| hash_sum | Unary | Numeric | Scalar Numeric | | | +| hash_sum | Unary | Numeric | Numeric | :struct:`ScalarAggregateOptions` | \(4) | 
+---------------+-------+-------------+----------------+----------------------------------+-------+ -| hash_tdigest | Unary | Numeric | Scalar Float64 | :struct:`TDigestOptions` | \(4) | +| hash_tdigest | Unary | Numeric | Float64 | :struct:`TDigestOptions` | \(5) | +---------------+-------+-------------+----------------+----------------------------------+-------+ -| hash_variance | Unary | Numeric | Scalar Float64 | :struct:`VarianceOptions` | | +| hash_variance | Unary | Numeric | Float64 | :struct:`VarianceOptions` | | +---------------+-------+-------------+----------------+----------------------------------+-------+ * \(1) If null values are taken into account, by setting the @@ -316,7 +316,9 @@ The supported aggregation functions are as follows. * \(3) Output is a ``{"min": input type, "max": input type}`` Struct scalar. -* \(4) tdigest/t-digest computes approximate quantiles, and so only needs a +* \(4) Output is Int64, UInt64 or Float64, depending on the input type. + +* \(5) tdigest/t-digest computes approximate quantiles, and so only needs a fixed amount of memory. See the `reference implementation `_ for details. From 7667e41c31e4db7baff113f72e174fcfb2962973 Mon Sep 17 00:00:00 2001 From: David Li Date: Tue, 17 Aug 2021 16:50:00 -0400 Subject: [PATCH 07/13] ARROW-13311: [Documentation] Tweak wording --- docs/source/cpp/compute.rst | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 71c424523ef..d94bc4ce38a 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -246,11 +246,11 @@ Grouped Aggregations ("group by") ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Grouped aggregations are not directly invokable, but are used as part of a -group by operation. Like scalar aggregations, grouped aggregations reduce -multiple input values to a single output value. 
Instead of aggregating all -values of the input, however, grouped aggregations partition of the input +SQL-style "group by" operation. Like scalar aggregations, grouped aggregations +reduce multiple input values to a single output value. Instead of aggregating +all values of the input, however, grouped aggregations partition of the input values on some set of "key" columns, then aggregate each group individually, -and emit one output per input group. +emitting one output per input group. As an example, for the following table: @@ -271,7 +271,8 @@ As an example, for the following table: +-----------------+--------------+ We compute a sum of column "x", grouped on the key column "key". This gives us -three groups: +three groups, with the following results. Note that null is treated as a +distinct key. +-----------------+--------------+ | Column "sum(x)" | Column "key" | @@ -283,7 +284,9 @@ three groups: | 5 | null | +-----------------+--------------+ -The supported aggregation functions are as follows. +The supported aggregation functions are as follows. Note that currently, all +function names are prefixed with "hash\_", which differentiates them from their +scalar equivalents above and reflects how they are implemented internally. 
+---------------+-------+-------------+----------------+----------------------------------+-------+ | Function name | Arity | Input types | Output type | Options class | Notes | From e3b1dd15bbecc64cc960e76fc22643af8ff43936 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 18 Aug 2021 17:02:40 +0200 Subject: [PATCH 08/13] Wording nits --- docs/source/cpp/compute.rst | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index d94bc4ce38a..d9b338ad41c 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -248,9 +248,9 @@ Grouped Aggregations ("group by") Grouped aggregations are not directly invokable, but are used as part of a SQL-style "group by" operation. Like scalar aggregations, grouped aggregations reduce multiple input values to a single output value. Instead of aggregating -all values of the input, however, grouped aggregations partition of the input +all values of the input, however, grouped aggregations partition the input values on some set of "key" columns, then aggregate each group individually, -emitting one output per input group. +emitting one output value per input group. As an example, for the following table: @@ -267,12 +267,12 @@ As an example, for the following table: +-----------------+--------------+ | null | null | +-----------------+--------------+ -| 5 | null | +| 9 | null | +-----------------+--------------+ -We compute a sum of column "x", grouped on the key column "key". This gives us -three groups, with the following results. Note that null is treated as a -distinct key. +we can compute a sum of the column "x", grouped on the column "key". +This gives us three groups, with the following results. Note that null is +treated as a distinct key value. +-----------------+--------------+ | Column "sum(x)" | Column "key" | @@ -281,12 +281,12 @@ distinct key. 
+-----------------+--------------+ | null | "b" | +-----------------+--------------+ -| 5 | null | +| 9 | null | +-----------------+--------------+ -The supported aggregation functions are as follows. Note that currently, all -function names are prefixed with "hash\_", which differentiates them from their -scalar equivalents above and reflects how they are implemented internally. +The supported aggregation functions are as follows. All function names are +prefixed with "hash\_", which differentiates them from their scalar +equivalents above and reflects how they are implemented internally. +---------------+-------+-------------+----------------+----------------------------------+-------+ | Function name | Arity | Input types | Output type | Options class | Notes | @@ -311,7 +311,7 @@ scalar equivalents above and reflects how they are implemented internally. +---------------+-------+-------------+----------------+----------------------------------+-------+ * \(1) If null values are taken into account, by setting the - ScalarAggregateOptions parameter skip_nulls = false, then `Kleene logic`_ + :member:`ScalarAggregateOptions::skip_nulls` to false, then `Kleene logic`_ logic is applied. The min_count option is not respected. * \(2) CountMode controls whether only non-null values are counted (the @@ -321,7 +321,7 @@ scalar equivalents above and reflects how they are implemented internally. * \(4) Output is Int64, UInt64 or Float64, depending on the input type. -* \(5) tdigest/t-digest computes approximate quantiles, and so only needs a +* \(5) T-digest computes approximate quantiles, and so only needs a fixed amount of memory. See the `reference implementation <https://github.com/tdunning/t-digest>`_ for details. 
From bf80780fd3dc079c55d0b04c4f5f3fa2ecd67ead Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 18 Aug 2021 11:19:04 -0400 Subject: [PATCH 09/13] Apply suggestions from code review Co-authored-by: Ian Cook --- docs/source/cpp/compute.rst | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index d9b338ad41c..d928d55d34e 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -255,34 +255,34 @@ emitting one output value per input group. As an example, for the following table: +-----------------+--------------+ -| Column "x" | Column "key" | +| Column ``key`` | Column ``x`` | +=================+==============+ -| 2 | "a" | +| "a" | 2 | +-----------------+--------------+ -| 5 | "a" | +| "a" | 5 | +-----------------+--------------+ -| null | "b" | +| "b" | null | +-----------------+--------------+ -| null | "b" | +| "b" | null | +-----------------+--------------+ | null | null | +-----------------+--------------+ -| 9 | null | +| null | 9 | +-----------------+--------------+ -we can compute a sum of the column "x", grouped on the column "key". +we can compute a sum of the column ``x``, grouped on the column ``key``. This gives us three groups, with the following results. Note that null is treated as a distinct key value. -+-----------------+--------------+ -| Column "sum(x)" | Column "key" | -+=================+==============+ -| 7 | "a" | -+-----------------+--------------+ -| null | "b" | -+-----------------+--------------+ -| 9 | null | -+-----------------+--------------+ ++-----------------+-------------------+ +| Column ``key`` | Column ``sum(x)`` | ++=================+===================+ +| "a" | 7 | ++-----------------+-------------------+ +| "b" | null | ++-----------------+-------------------+ +| null | 9 | ++-----------------+-------------------+ The supported aggregation functions are as follows. 
All function names are prefixed with "hash\_", which differentiates them from their scalar From b107d136b71d583b0472b66cc6b6589e7b0770a7 Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 18 Aug 2021 11:19:31 -0400 Subject: [PATCH 10/13] Update docs/source/cpp/compute.rst Co-authored-by: Ian Cook --- docs/source/cpp/compute.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index d928d55d34e..fcbf148dcac 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -285,7 +285,7 @@ treated as a distinct key value. +-----------------+-------------------+ The supported aggregation functions are as follows. All function names are -prefixed with "hash\_", which differentiates them from their scalar +prefixed with ``hash\_``, which differentiates them from their scalar equivalents above and reflects how they are implemented internally. +---------------+-------+-------------+----------------+----------------------------------+-------+ From 844f45c9a1d3b3c85772c89d0a7453a722ab9a4d Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 18 Aug 2021 11:22:17 -0400 Subject: [PATCH 11/13] ARROW-13311: [Documentation] Fix reST --- docs/source/cpp/compute.rst | 50 ++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index fcbf148dcac..f77a4a86eef 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -254,38 +254,38 @@ emitting one output value per input group. 
As an example, for the following table: -+-----------------+--------------+ -| Column ``key`` | Column ``x`` | -+=================+==============+ -| "a" | 2 | -+-----------------+--------------+ -| "a" | 5 | -+-----------------+--------------+ -| "b" | null | -+-----------------+--------------+ -| "b" | null | -+-----------------+--------------+ -| null | null | -+-----------------+--------------+ -| null | 9 | -+-----------------+--------------+ ++------------------+-----------------+ +| Column ``key`` | Column ``x`` | ++==================+=================+ +| "a" | 2 | ++------------------+-----------------+ +| "a" | 5 | ++------------------+-----------------+ +| "b" | null | ++------------------+-----------------+ +| "b" | null | ++------------------+-----------------+ +| null | null | ++------------------+-----------------+ +| null | 9 | ++------------------+-----------------+ we can compute a sum of the column ``x``, grouped on the column ``key``. This gives us three groups, with the following results. Note that null is treated as a distinct key value. -+-----------------+-------------------+ -| Column ``key`` | Column ``sum(x)`` | -+=================+===================+ -| "a" | 7 | -+-----------------+-------------------+ -| "b" | null | -+-----------------+-------------------+ -| null | 9 | -+-----------------+-------------------+ ++------------------+-------------------+ +| Column ``key`` | Column ``sum(x)`` | ++==================+===================+ +| "a" | 7 | ++------------------+-------------------+ +| "b" | null | ++------------------+-------------------+ +| null | 9 | ++------------------+-------------------+ The supported aggregation functions are as follows. All function names are -prefixed with ``hash\_``, which differentiates them from their scalar +prefixed with ``hash_``, which differentiates them from their scalar equivalents above and reflects how they are implemented internally. 
+---------------+-------+-------------+----------------+----------------------------------+-------+ From 275c29970566d76d52a0fdb8c3d784e94e453713 Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 18 Aug 2021 11:32:52 -0400 Subject: [PATCH 12/13] Update docs/source/cpp/compute.rst Co-authored-by: Ian Cook --- docs/source/cpp/compute.rst | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index f77a4a86eef..4c2278c11e0 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -274,15 +274,15 @@ we can compute a sum of the column ``x``, grouped on the column ``key``. This gives us three groups, with the following results. Note that null is treated as a distinct key value. -+------------------+-------------------+ -| Column ``key`` | Column ``sum(x)`` | -+==================+===================+ -| "a" | 7 | -+------------------+-------------------+ -| "b" | null | -+------------------+-------------------+ -| null | 9 | -+------------------+-------------------+ ++------------------+-----------------------+ +| Column ``key`` | Column ``sum(x)`` | ++==================+=======================+ +| "a" | 7 | ++------------------+-----------------------+ +| "b" | null | ++------------------+-----------------------+ +| null | 9 | ++------------------+-----------------------+ The supported aggregation functions are as follows. 
All function names are prefixed with ``hash_``, which differentiates them from their scalar From 5590a67b37d10b55b8602e0c4e2364bed05b6ed1 Mon Sep 17 00:00:00 2001 From: David Li Date: Thu, 19 Aug 2021 08:10:15 -0400 Subject: [PATCH 13/13] ARROW-13311: [Documentation] Fix return types --- docs/source/cpp/compute.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 4c2278c11e0..25cb7b7822f 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -291,11 +291,11 @@ equivalents above and reflects how they are implemented internally. +---------------+-------+-------------+----------------+----------------------------------+-------+ | Function name | Arity | Input types | Output type | Options class | Notes | +===============+=======+=============+================+==================================+=======+ -| hash_all | Unary | Boolean | Int64 | :struct:`ScalarAggregateOptions` | \(1) | +| hash_all | Unary | Boolean | Boolean | :struct:`ScalarAggregateOptions` | \(1) | +---------------+-------+-------------+----------------+----------------------------------+-------+ -| hash_any | Unary | Any | Int64 | :struct:`ScalarAggregateOptions` | \(1) | +| hash_any | Unary | Boolean | Boolean | :struct:`ScalarAggregateOptions` | \(1) | +---------------+-------+-------------+----------------+----------------------------------+-------+ -| hash_count | Unary | Boolean | Int64 | :struct:`CountOptions` | \(2) | +| hash_count | Unary | Any | Int64 | :struct:`CountOptions` | \(2) | +---------------+-------+-------------+----------------+----------------------------------+-------+ | hash_mean | Unary | Numeric | Float64 | :struct:`ScalarAggregateOptions` | | +---------------+-------+-------------+----------------+----------------------------------+-------+