From 9ae14d8ad293662b245b799e48daa1841860207b Mon Sep 17 00:00:00 2001
From: David Li
Date: Thu, 5 Aug 2021 13:40:21 -0400
Subject: [PATCH 01/13] ARROW-13311: [C++][Documentation] Document hash
aggregate kernels
---
.../arrow/compute/kernels/hash_aggregate.cc | 80 +++++++++++++++----
.../compute/kernels/hash_aggregate_test.cc | 45 ++++++++---
docs/source/cpp/compute.rst | 64 ++++++++++++++-
3 files changed, 157 insertions(+), 32 deletions(-)
diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/cpp/src/arrow/compute/kernels/hash_aggregate.cc
index 4fd6af9b190..4b530d24f81 100644
--- a/cpp/src/arrow/compute/kernels/hash_aggregate.cc
+++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc
@@ -1740,15 +1740,18 @@ struct GroupedMinMaxFactory {
// Any/All implementation
struct GroupedAnyImpl : public GroupedAggregator {
- Status Init(ExecContext* ctx, const FunctionOptions*) override {
+ Status Init(ExecContext* ctx, const FunctionOptions* options) override {
+ options_ = *checked_cast(options);
seen_ = TypedBufferBuilder(ctx->memory_pool());
+ has_nulls_ = TypedBufferBuilder(ctx->memory_pool());
return Status::OK();
}
Status Resize(int64_t new_num_groups) override {
auto added_groups = new_num_groups - num_groups_;
num_groups_ = new_num_groups;
- return seen_.Append(added_groups, false);
+ RETURN_NOT_OK(seen_.Append(added_groups, false));
+ return has_nulls_.Append(added_groups, false);
}
Status Merge(GroupedAggregator&& raw_other,
@@ -1757,29 +1760,48 @@ struct GroupedAnyImpl : public GroupedAggregator {
auto seen = seen_.mutable_data();
auto other_seen = other->seen_.data();
+ auto has_nulls = has_nulls_.mutable_data();
+ auto other_has_nulls = other->has_nulls_.data();
auto g = group_id_mapping.GetValues(1);
for (int64_t other_g = 0; other_g < group_id_mapping.length; ++other_g, ++g) {
if (BitUtil::GetBit(other_seen, other_g)) BitUtil::SetBitTo(seen, *g, true);
+ if (BitUtil::GetBit(other_has_nulls, other_g)) {
+ BitUtil::SetBitTo(has_nulls, *g, true);
+ }
}
return Status::OK();
}
Status Consume(const ExecBatch& batch) override {
auto seen = seen_.mutable_data();
+ auto has_nulls = has_nulls_.mutable_data();
const auto& input = *batch[0].array();
auto g = batch[1].array()->GetValues(1);
- arrow::internal::VisitTwoBitBlocksVoid(
- input.buffers[0], input.offset, input.buffers[1], input.offset, input.length,
- [&](int64_t) { BitUtil::SetBitTo(seen, *g++, true); }, [&]() { g++; });
+ auto values = input.buffers[1]->data();
+ arrow::internal::VisitBitBlocksVoid(
+ input.buffers[0], input.offset, input.length,
+ [&](int64_t offset) {
+ BitUtil::SetBitTo(seen, *g++, BitUtil::GetBit(values, input.offset + offset));
+ },
+ [&]() { BitUtil::SetBitTo(has_nulls, *g++, true); });
return Status::OK();
}
Result Finalize() override {
ARROW_ASSIGN_OR_RAISE(auto seen, seen_.Finish());
- return std::make_shared(num_groups_, std::move(seen));
+ if (options_.skip_nulls) {
+ return std::make_shared(num_groups_, std::move(seen));
+ }
+ ARROW_ASSIGN_OR_RAISE(auto bitmap, has_nulls_.Finish());
+ // null if (~seen & has_nulls) -> not null if (seen | ~has_nulls)
+ ::arrow::internal::BitmapOrNot(seen->data(), /*left_offset=*/0, bitmap->data(),
+ /*right_offset=*/0, num_groups_, /*out_offset=*/0,
+ bitmap->mutable_data());
+ return std::make_shared(num_groups_, std::move(seen),
+ std::move(bitmap));
}
std::shared_ptr out_type() const override { return boolean(); }
@@ -1787,18 +1809,22 @@ struct GroupedAnyImpl : public GroupedAggregator {
int64_t num_groups_ = 0;
ScalarAggregateOptions options_;
TypedBufferBuilder seen_;
+ TypedBufferBuilder has_nulls_;
};
struct GroupedAllImpl : public GroupedAggregator {
- Status Init(ExecContext* ctx, const FunctionOptions*) override {
+ Status Init(ExecContext* ctx, const FunctionOptions* options) override {
+ options_ = *checked_cast(options);
seen_ = TypedBufferBuilder(ctx->memory_pool());
+ has_nulls_ = TypedBufferBuilder(ctx->memory_pool());
return Status::OK();
}
Status Resize(int64_t new_num_groups) override {
auto added_groups = new_num_groups - num_groups_;
num_groups_ = new_num_groups;
- return seen_.Append(added_groups, true);
+ RETURN_NOT_OK(seen_.Append(added_groups, true));
+ return has_nulls_.Append(added_groups, false);
}
Status Merge(GroupedAggregator&& raw_other,
@@ -1807,17 +1833,23 @@ struct GroupedAllImpl : public GroupedAggregator {
auto seen = seen_.mutable_data();
auto other_seen = other->seen_.data();
+ auto has_nulls = has_nulls_.mutable_data();
+ auto other_has_nulls = other->has_nulls_.data();
auto g = group_id_mapping.GetValues(1);
for (int64_t other_g = 0; other_g < group_id_mapping.length; ++other_g, ++g) {
BitUtil::SetBitTo(
seen, *g, BitUtil::GetBit(seen, *g) && BitUtil::GetBit(other_seen, other_g));
+ if (BitUtil::GetBit(other_has_nulls, other_g)) {
+ BitUtil::SetBitTo(has_nulls, *g, true);
+ }
}
return Status::OK();
}
Status Consume(const ExecBatch& batch) override {
auto seen = seen_.mutable_data();
+ auto has_nulls = has_nulls_.mutable_data();
const auto& input = *batch[0].array();
@@ -1832,7 +1864,7 @@ struct GroupedAllImpl : public GroupedAggregator {
BitUtil::GetBit(bitmap, input.offset + position));
g++;
},
- [&]() { g++; });
+ [&]() { BitUtil::SetBitTo(has_nulls, *g++, true); });
} else {
arrow::internal::VisitBitBlocksVoid(
input.buffers[1], input.offset, input.length, [&](int64_t) { g++; },
@@ -1843,7 +1875,18 @@ struct GroupedAllImpl : public GroupedAggregator {
Result Finalize() override {
ARROW_ASSIGN_OR_RAISE(auto seen, seen_.Finish());
- return std::make_shared(num_groups_, std::move(seen));
+ if (options_.skip_nulls) {
+ return std::make_shared(num_groups_, std::move(seen));
+ }
+ ARROW_ASSIGN_OR_RAISE(auto bitmap, has_nulls_.Finish());
+ // null if (seen & has_nulls)
+ ::arrow::internal::BitmapAnd(seen->data(), /*left_offset=*/0, bitmap->data(),
+ /*right_offset=*/0, num_groups_, /*out_offset=*/0,
+ bitmap->mutable_data());
+ ::arrow::internal::InvertBitmap(bitmap->data(), /*offset=*/0, num_groups_,
+ bitmap->mutable_data(), /*dest_offset=*/0);
+ return std::make_shared(num_groups_, std::move(seen),
+ std::move(bitmap));
}
std::shared_ptr out_type() const override { return boolean(); }
@@ -1851,6 +1894,7 @@ struct GroupedAllImpl : public GroupedAggregator {
int64_t num_groups_ = 0;
ScalarAggregateOptions options_;
TypedBufferBuilder seen_;
+ TypedBufferBuilder has_nulls_;
};
} // namespace
@@ -2155,7 +2199,8 @@ const FunctionDoc hash_tdigest_doc{
("By default, the 0.5 quantile (median) is returned.\n"
"Nulls and NaNs are ignored.\n"
"A null array is returned if there are no valid data points."),
- {"array", "group_id_array"}};
+ {"array", "group_id_array"},
+ "TDigestOptions"};
const FunctionDoc hash_min_max_doc{
"Compute the minimum and maximum values of a numeric array",
@@ -2175,6 +2220,9 @@ const FunctionDoc hash_all_doc{"Test whether all elements evaluate to true",
void RegisterHashAggregateBasic(FunctionRegistry* registry) {
static auto default_scalar_aggregate_options = ScalarAggregateOptions::Defaults();
+ static auto default_tdigest_options = TDigestOptions::Defaults();
+ static auto default_variance_options = VarianceOptions::Defaults();
+
{
static auto default_count_options = CountOptions::Defaults();
auto func = std::make_shared(
@@ -2222,7 +2270,6 @@ void RegisterHashAggregateBasic(FunctionRegistry* registry) {
DCHECK_OK(registry->AddFunction(std::move(func)));
}
- static auto default_variance_options = VarianceOptions::Defaults();
{
auto func = std::make_shared(
"hash_stddev", Arity::Binary(), &hash_stddev_doc, &default_variance_options);
@@ -2247,7 +2294,6 @@ void RegisterHashAggregateBasic(FunctionRegistry* registry) {
DCHECK_OK(registry->AddFunction(std::move(func)));
}
- static auto default_tdigest_options = TDigestOptions::Defaults();
{
auto func = std::make_shared(
"hash_tdigest", Arity::Binary(), &hash_tdigest_doc, &default_tdigest_options);
@@ -2273,15 +2319,15 @@ void RegisterHashAggregateBasic(FunctionRegistry* registry) {
}
{
- auto func = std::make_shared("hash_any", Arity::Binary(),
- &hash_any_doc);
+ auto func = std::make_shared(
+ "hash_any", Arity::Binary(), &hash_any_doc, &default_scalar_aggregate_options);
DCHECK_OK(func->AddKernel(MakeKernel(boolean(), HashAggregateInit)));
DCHECK_OK(registry->AddFunction(std::move(func)));
}
{
- auto func = std::make_shared("hash_all", Arity::Binary(),
- &hash_all_doc);
+ auto func = std::make_shared(
+ "hash_all", Arity::Binary(), &hash_all_doc, &default_scalar_aggregate_options);
DCHECK_OK(func->AddKernel(MakeKernel(boolean(), HashAggregateInit)));
DCHECK_OK(registry->AddFunction(std::move(func)));
}
diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc b/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc
index c69b51e71fc..6f180b6b3b8 100644
--- a/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc
+++ b/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc
@@ -1083,6 +1083,7 @@ TEST(GroupBy, MinMaxDecimal) {
}
TEST(GroupBy, AnyAndAll) {
+ ScalarAggregateOptions options(/*skip_nulls=*/false);
for (bool use_threads : {true, false}) {
SCOPED_TRACE(use_threads ? "parallel/merged" : "serial");
@@ -1094,6 +1095,9 @@ TEST(GroupBy, AnyAndAll) {
R"([
[false, 2],
[null, 3],
+ [null, 4],
+ [false, 4],
+ [true, 5],
[false, null],
[true, 1],
[true, 2]
@@ -1105,26 +1109,43 @@ TEST(GroupBy, AnyAndAll) {
])"});
ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped,
- internal::GroupBy({table->GetColumnByName("argument"),
- table->GetColumnByName("argument")},
- {table->GetColumnByName("key")},
- {
- {"hash_any", nullptr},
- {"hash_all", nullptr},
- },
- use_threads));
+ internal::GroupBy(
+ {
+ table->GetColumnByName("argument"),
+ table->GetColumnByName("argument"),
+ table->GetColumnByName("argument"),
+ table->GetColumnByName("argument"),
+ },
+ {table->GetColumnByName("key")},
+ {
+ {"hash_any", nullptr},
+ {"hash_all", nullptr},
+ {"hash_any", &options},
+ {"hash_all", &options},
+ },
+ use_threads));
SortBy({"key_0"}, &aggregated_and_grouped);
+ // Group 1: trues and nulls
+ // Group 2: trues and falses
+ // Group 3: nulls
+ // Group 4: falses and nulls
+ // Group 5: trues
+ // Group null: falses
AssertDatumsEqual(ArrayFromJSON(struct_({
+ field("hash_any", boolean()),
+ field("hash_all", boolean()),
field("hash_any", boolean()),
field("hash_all", boolean()),
field("key_0", int64()),
}),
R"([
- [true, true, 1],
- [true, false, 2],
- [false, true, 3],
- [false, false, null]
+ [true, true, true, null, 1],
+ [true, false, true, false, 2],
+ [false, true, null, null, 3],
+ [false, false, null, false, 4],
+ [true, true, true, true, 5],
+ [false, false, false, false, null]
])"),
aggregated_and_grouped,
/*verbose=*/true);
diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
index 39bbbec3e16..4f47ea41195 100644
--- a/docs/source/cpp/compute.rst
+++ b/docs/source/cpp/compute.rst
@@ -183,6 +183,9 @@ recommend you try it out. Unsupported input types return a ``TypeError``
Aggregations
------------
+Scalar aggregations operate on a (chunked) array or scalar value and reduce
+the input to a single output value.
+
+---------------+-------+-------------+----------------+----------------------------------+-------+
| Function name | Arity | Input types | Output type | Options class | Notes |
+===============+=======+=============+================+==================================+=======+
@@ -208,15 +211,16 @@ Aggregations
+---------------+-------+-------------+----------------+----------------------------------+-------+
| sum | Unary | Numeric | Scalar Numeric | :struct:`ScalarAggregateOptions` | \(5) |
+---------------+-------+-------------+----------------+----------------------------------+-------+
-| tdigest | Unary | Numeric | Scalar Float64 | :struct:`TDigestOptions` | |
+| tdigest | Unary | Numeric | Scalar Float64 | :struct:`TDigestOptions` | \(6) |
+---------------+-------+-------------+----------------+----------------------------------+-------+
| variance | Unary | Numeric | Scalar Float64 | :struct:`VarianceOptions` | |
+---------------+-------+-------------+----------------+----------------------------------+-------+
Notes:
-* \(1) If null values are taken into account by setting ScalarAggregateOptions
- parameter skip_nulls = false then `Kleene logic`_ logic is applied.
+* \(1) If null values are taken into account, by setting the
+ ScalarAggregateOptions parameter skip_nulls = false, then `Kleene logic`_
+ logic is applied. The min_count option is not respected.
* \(2) CountMode controls whether only non-null values are counted (the
default), only null values are counted, or all values are counted.
@@ -230,10 +234,64 @@ Notes:
Note that the output can have less than *N* elements if the input has
less than *N* distinct values.
+ The mode kernel is not a proper aggregate (it is actually a vector
+ function, see below).
+
* \(5) Output is Int64, UInt64 or Float64, depending on the input type.
* \(6) Output is Float64 or input type, depending on QuantileOptions.
+ The quantile kernel is not a proper aggregate (it is actually a vector
+ function, see below).
+
+* \(6) tdigest/t-digest computes approximate quantiles, and so only needs a
+ fixed amount of memory. See the `reference implementation
+ <https://github.com/tdunning/t-digest>`_ for details.
+
+Hash Aggregations ("group by")
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Hash aggregations are not directly invokable, but are used as part of a group
+by operation. Like scalar aggregations, hash aggregations reduce their input
+to a single output value, but do so on subsets of the input, based on a
+partitioning of the input values on some set of "key" columns, and emit one
+output per input group.
+
++---------------+-------+-------------+----------------+----------------------------------+-------+
+| Function name | Arity | Input types | Output type | Options class | Notes |
++===============+=======+=============+================+==================================+=======+
+| hash_all | Unary | Boolean | Scalar Int64 | :struct:`ScalarAggregateOptions` | \(1) |
++---------------+-------+-------------+----------------+----------------------------------+-------+
+| hash_any | Unary | Any | Scalar Int64 | :struct:`ScalarAggregateOptions` | \(1) |
++---------------+-------+-------------+----------------+----------------------------------+-------+
+| hash_count | Unary | Boolean | Scalar Int64 | :struct:`CountOptions` | \(2) |
++---------------+-------+-------------+----------------+----------------------------------+-------+
+| hash_mean | Unary | Numeric | Scalar Float64 | | |
++---------------+-------+-------------+----------------+----------------------------------+-------+
+| hash_min_max | Unary | Numeric | Scalar Struct | :struct:`ScalarAggregateOptions` | \(3) |
++---------------+-------+-------------+----------------+----------------------------------+-------+
+| hash_stddev | Unary | Numeric | Scalar Float64 | :struct:`VarianceOptions` | |
++---------------+-------+-------------+----------------+----------------------------------+-------+
+| hash_sum | Unary | Numeric | Scalar Numeric | | |
++---------------+-------+-------------+----------------+----------------------------------+-------+
+| hash_tdigest | Unary | Numeric | Scalar Float64 | :struct:`TDigestOptions` | \(4) |
++---------------+-------+-------------+----------------+----------------------------------+-------+
+| hash_variance | Unary | Numeric | Scalar Float64 | :struct:`VarianceOptions` | |
++---------------+-------+-------------+----------------+----------------------------------+-------+
+
+* \(1) If null values are taken into account, by setting the
+ ScalarAggregateOptions parameter skip_nulls = false, then `Kleene logic`_
+ logic is applied. The min_count option is not respected.
+
+* \(2) CountMode controls whether only non-null values are counted (the
+ default), only null values are counted, or all values are counted.
+
+* \(3) Output is a ``{"min": input type, "max": input type}`` Struct scalar.
+
+* \(4) tdigest/t-digest computes approximate quantiles, and so only needs a
+ fixed amount of memory. See the `reference implementation
+ <https://github.com/tdunning/t-digest>`_ for details.
+
Element-wise ("scalar") functions
---------------------------------
From 9ed71cb2c94b9cf0fbf186b17806cb5a57db093a Mon Sep 17 00:00:00 2001
From: David Li
Date: Thu, 5 Aug 2021 14:52:25 -0400
Subject: [PATCH 02/13] ARROW-13311: [C++][Documentation] Fix hash_any
---
cpp/src/arrow/compute/kernels/hash_aggregate.cc | 5 ++++-
cpp/src/arrow/compute/kernels/hash_aggregate_test.cc | 2 +-
2 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/cpp/src/arrow/compute/kernels/hash_aggregate.cc
index 4b530d24f81..17a090ff293 100644
--- a/cpp/src/arrow/compute/kernels/hash_aggregate.cc
+++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc
@@ -1784,7 +1784,10 @@ struct GroupedAnyImpl : public GroupedAggregator {
arrow::internal::VisitBitBlocksVoid(
input.buffers[0], input.offset, input.length,
[&](int64_t offset) {
- BitUtil::SetBitTo(seen, *g++, BitUtil::GetBit(values, input.offset + offset));
+ BitUtil::SetBitTo(seen, *g,
+ BitUtil::GetBit(seen, *g) ||
+ BitUtil::GetBit(values, input.offset + offset));
+ g++;
},
[&]() { BitUtil::SetBitTo(has_nulls, *g++, true); });
return Status::OK();
diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc b/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc
index 6f180b6b3b8..e96fdcd6084 100644
--- a/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc
+++ b/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc
@@ -1103,7 +1103,7 @@ TEST(GroupBy, AnyAndAll) {
[true, 2]
])",
R"([
- [true, 2],
+ [false, 2],
[false, null],
[null, 3]
])"});
From 5cf6f33e618f0ebb5b191ebdd836dfe81016ff45 Mon Sep 17 00:00:00 2001
From: David Li
Date: Fri, 6 Aug 2021 15:43:40 -0400
Subject: [PATCH 03/13] ARROW-13311: [C++][Documentation] Fix hash_min_max
---
cpp/src/arrow/compute/kernels/hash_aggregate.cc | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/cpp/src/arrow/compute/kernels/hash_aggregate.cc
index 17a090ff293..4a77e4f51b3 100644
--- a/cpp/src/arrow/compute/kernels/hash_aggregate.cc
+++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc
@@ -1718,11 +1718,11 @@ struct GroupedMinMaxFactory {
}
Status Visit(const HalfFloatType& type) {
- return Status::NotImplemented("Summing data of type ", type);
+ return Status::NotImplemented("Computing min/max of data of type ", type);
}
Status Visit(const DataType& type) {
- return Status::NotImplemented("Summing data of type ", type);
+ return Status::NotImplemented("Computing min/max of data of type ", type);
}
static Result Make(const std::shared_ptr& type) {
@@ -2313,7 +2313,6 @@ void RegisterHashAggregateBasic(FunctionRegistry* registry) {
auto func = std::make_shared(
"hash_min_max", Arity::Binary(), &hash_min_max_doc,
&default_scalar_aggregate_options);
- DCHECK_OK(AddHashAggKernels({boolean()}, GroupedSumFactory::Make, func.get()));
DCHECK_OK(AddHashAggKernels(NumericTypes(), GroupedMinMaxFactory::Make, func.get()));
// Type parameters are ignored
DCHECK_OK(AddHashAggKernels({decimal128(1, 1), decimal256(1, 1)},
From 746cbe3929e92ffb61be1d667c5422793b281491 Mon Sep 17 00:00:00 2001
From: David Li
Date: Mon, 16 Aug 2021 10:07:38 -0400
Subject: [PATCH 04/13] ARROW-13311: [C++][Documentation] Add example of
grouped aggregation
---
docs/source/cpp/compute.rst | 50 +++++++++++++++++++++++++++++++------
1 file changed, 42 insertions(+), 8 deletions(-)
diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
index 4f47ea41195..b9b69b0a0a8 100644
--- a/docs/source/cpp/compute.rst
+++ b/docs/source/cpp/compute.rst
@@ -248,14 +248,48 @@ Notes:
fixed amount of memory. See the `reference implementation
<https://github.com/tdunning/t-digest>`_ for details.
-Hash Aggregations ("group by")
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Hash aggregations are not directly invokable, but are used as part of a group
-by operation. Like scalar aggregations, hash aggregations reduce their input
-to a single output value, but do so on subsets of the input, based on a
-partitioning of the input values on some set of "key" columns, and emit one
-output per input group.
+Grouped Aggregations ("group by")
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Grouped aggregations are not directly invokable, but are used as part of a
+group by operation. Like scalar aggregations, grouped aggregations reduce
+multiple input values to a single output value. Instead of aggregating all
+values of the input, however, grouped aggregations partition of the input
+values on some set of "key" columns, then aggregate each group individually,
+and emit one output per input group.
+
+As an example, for the following table:
+
++-----------------+--------------+
+| Column "x" | Column "key" |
++=================+==============+
+| 2 | "a" |
++-----------------+--------------+
+| 5 | "a" |
++-----------------+--------------+
+| null | "b" |
++-----------------+--------------+
+| null | "b" |
++-----------------+--------------+
+| null | null |
++-----------------+--------------+
+| 5 | null |
++-----------------+--------------+
+
+We compute a sum of column "x", grouped on the key column "key". This gives us
+three groups:
+
++-----------------+--------------+
+| Column "sum(x)" | Column "key" |
++=================+==============+
+| 7 | "a" |
++-----------------+--------------+
+| null | "b" |
++-----------------+--------------+
+| 5 | null |
++-----------------+--------------+
+
+The supported aggregation functions are as follows.
+---------------+-------+-------------+----------------+----------------------------------+-------+
| Function name | Arity | Input types | Output type | Options class | Notes |
From 152b150bfc8ae644625507305b91ec65a2eeb526 Mon Sep 17 00:00:00 2001
From: David Li
Date: Tue, 17 Aug 2021 10:04:08 -0400
Subject: [PATCH 05/13] ARROW-13311: [Documentation] Fix footnote numbering
---
docs/source/cpp/compute.rst | 10 ++--------
1 file changed, 2 insertions(+), 8 deletions(-)
diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
index b9b69b0a0a8..405b9b4457e 100644
--- a/docs/source/cpp/compute.rst
+++ b/docs/source/cpp/compute.rst
@@ -211,7 +211,7 @@ the input to a single output value.
+---------------+-------+-------------+----------------+----------------------------------+-------+
| sum | Unary | Numeric | Scalar Numeric | :struct:`ScalarAggregateOptions` | \(5) |
+---------------+-------+-------------+----------------+----------------------------------+-------+
-| tdigest | Unary | Numeric | Scalar Float64 | :struct:`TDigestOptions` | \(6) |
+| tdigest | Unary | Numeric | Scalar Float64 | :struct:`TDigestOptions` | \(7) |
+---------------+-------+-------------+----------------+----------------------------------+-------+
| variance | Unary | Numeric | Scalar Float64 | :struct:`VarianceOptions` | |
+---------------+-------+-------------+----------------+----------------------------------+-------+
@@ -234,17 +234,11 @@ Notes:
Note that the output can have less than *N* elements if the input has
less than *N* distinct values.
- The mode kernel is not a proper aggregate (it is actually a vector
- function, see below).
-
* \(5) Output is Int64, UInt64 or Float64, depending on the input type.
* \(6) Output is Float64 or input type, depending on QuantileOptions.
- The quantile kernel is not a proper aggregate (it is actually a vector
- function, see below).
-
-* \(6) tdigest/t-digest computes approximate quantiles, and so only needs a
+* \(7) tdigest/t-digest computes approximate quantiles, and so only needs a
fixed amount of memory. See the `reference implementation
<https://github.com/tdunning/t-digest>`_ for details.
From a536658ad73f2116bb34e06586c6faf7f734189c Mon Sep 17 00:00:00 2001
From: David Li
Date: Tue, 17 Aug 2021 11:01:26 -0400
Subject: [PATCH 06/13] ARROW-13311: [Documentation] Fix hash_mean/hash_sum doc
structs
---
.../arrow/compute/kernels/hash_aggregate.cc | 6 +++--
docs/source/cpp/compute.rst | 22 ++++++++++---------
2 files changed, 16 insertions(+), 12 deletions(-)
diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/cpp/src/arrow/compute/kernels/hash_aggregate.cc
index 4a77e4f51b3..9222c5dd18f 100644
--- a/cpp/src/arrow/compute/kernels/hash_aggregate.cc
+++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc
@@ -2169,7 +2169,8 @@ const FunctionDoc hash_count_doc{"Count the number of null / non-null values",
const FunctionDoc hash_sum_doc{"Sum values of a numeric array",
("Null values are ignored."),
- {"array", "group_id_array"}};
+ {"array", "group_id_array"},
+ "ScalarAggregateOptions"};
const FunctionDoc hash_product_doc{
"Compute product of values of a numeric array",
@@ -2179,7 +2180,8 @@ const FunctionDoc hash_product_doc{
const FunctionDoc hash_mean_doc{"Average values of a numeric array",
("Null values are ignored."),
- {"array", "group_id_array"}};
+ {"array", "group_id_array"},
+ "ScalarAggregateOptions"};
const FunctionDoc hash_stddev_doc{
"Calculate the standard deviation of a numeric array",
diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
index 405b9b4457e..71c424523ef 100644
--- a/docs/source/cpp/compute.rst
+++ b/docs/source/cpp/compute.rst
@@ -288,23 +288,23 @@ The supported aggregation functions are as follows.
+---------------+-------+-------------+----------------+----------------------------------+-------+
| Function name | Arity | Input types | Output type | Options class | Notes |
+===============+=======+=============+================+==================================+=======+
-| hash_all | Unary | Boolean | Scalar Int64 | :struct:`ScalarAggregateOptions` | \(1) |
+| hash_all | Unary | Boolean | Int64 | :struct:`ScalarAggregateOptions` | \(1) |
+---------------+-------+-------------+----------------+----------------------------------+-------+
-| hash_any | Unary | Any | Scalar Int64 | :struct:`ScalarAggregateOptions` | \(1) |
+| hash_any | Unary | Any | Int64 | :struct:`ScalarAggregateOptions` | \(1) |
+---------------+-------+-------------+----------------+----------------------------------+-------+
-| hash_count | Unary | Boolean | Scalar Int64 | :struct:`CountOptions` | \(2) |
+| hash_count | Unary | Boolean | Int64 | :struct:`CountOptions` | \(2) |
+---------------+-------+-------------+----------------+----------------------------------+-------+
-| hash_mean | Unary | Numeric | Scalar Float64 | | |
+| hash_mean | Unary | Numeric | Float64 | :struct:`ScalarAggregateOptions` | |
+---------------+-------+-------------+----------------+----------------------------------+-------+
-| hash_min_max | Unary | Numeric | Scalar Struct | :struct:`ScalarAggregateOptions` | \(3) |
+| hash_min_max | Unary | Numeric | Struct | :struct:`ScalarAggregateOptions` | \(3) |
+---------------+-------+-------------+----------------+----------------------------------+-------+
-| hash_stddev | Unary | Numeric | Scalar Float64 | :struct:`VarianceOptions` | |
+| hash_stddev | Unary | Numeric | Float64 | :struct:`VarianceOptions` | |
+---------------+-------+-------------+----------------+----------------------------------+-------+
-| hash_sum | Unary | Numeric | Scalar Numeric | | |
+| hash_sum | Unary | Numeric | Numeric | :struct:`ScalarAggregateOptions` | \(4) |
+---------------+-------+-------------+----------------+----------------------------------+-------+
-| hash_tdigest | Unary | Numeric | Scalar Float64 | :struct:`TDigestOptions` | \(4) |
+| hash_tdigest | Unary | Numeric | Float64 | :struct:`TDigestOptions` | \(5) |
+---------------+-------+-------------+----------------+----------------------------------+-------+
-| hash_variance | Unary | Numeric | Scalar Float64 | :struct:`VarianceOptions` | |
+| hash_variance | Unary | Numeric | Float64 | :struct:`VarianceOptions` | |
+---------------+-------+-------------+----------------+----------------------------------+-------+
* \(1) If null values are taken into account, by setting the
@@ -316,7 +316,9 @@ The supported aggregation functions are as follows.
* \(3) Output is a ``{"min": input type, "max": input type}`` Struct scalar.
-* \(4) tdigest/t-digest computes approximate quantiles, and so only needs a
+* \(4) Output is Int64, UInt64 or Float64, depending on the input type.
+
+* \(5) tdigest/t-digest computes approximate quantiles, and so only needs a
fixed amount of memory. See the `reference implementation
<https://github.com/tdunning/t-digest>`_ for details.
From 7667e41c31e4db7baff113f72e174fcfb2962973 Mon Sep 17 00:00:00 2001
From: David Li
Date: Tue, 17 Aug 2021 16:50:00 -0400
Subject: [PATCH 07/13] ARROW-13311: [Documentation] Tweak wording
---
docs/source/cpp/compute.rst | 15 +++++++++------
1 file changed, 9 insertions(+), 6 deletions(-)
diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
index 71c424523ef..d94bc4ce38a 100644
--- a/docs/source/cpp/compute.rst
+++ b/docs/source/cpp/compute.rst
@@ -246,11 +246,11 @@ Grouped Aggregations ("group by")
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Grouped aggregations are not directly invokable, but are used as part of a
-group by operation. Like scalar aggregations, grouped aggregations reduce
-multiple input values to a single output value. Instead of aggregating all
-values of the input, however, grouped aggregations partition of the input
+SQL-style "group by" operation. Like scalar aggregations, grouped aggregations
+reduce multiple input values to a single output value. Instead of aggregating
+all values of the input, however, grouped aggregations partition of the input
values on some set of "key" columns, then aggregate each group individually,
-and emit one output per input group.
+emitting one output per input group.
As an example, for the following table:
@@ -271,7 +271,8 @@ As an example, for the following table:
+-----------------+--------------+
We compute a sum of column "x", grouped on the key column "key". This gives us
-three groups:
+three groups, with the following results. Note that null is treated as a
+distinct key.
+-----------------+--------------+
| Column "sum(x)" | Column "key" |
@@ -283,7 +284,9 @@ three groups:
| 5 | null |
+-----------------+--------------+
-The supported aggregation functions are as follows.
+The supported aggregation functions are as follows. Note that currently, all
+function names are prefixed with "hash\_", which differentiates them from their
+scalar equivalents above and reflects how they are implemented internally.
+---------------+-------+-------------+----------------+----------------------------------+-------+
| Function name | Arity | Input types | Output type | Options class | Notes |
From e3b1dd15bbecc64cc960e76fc22643af8ff43936 Mon Sep 17 00:00:00 2001
From: Antoine Pitrou
Date: Wed, 18 Aug 2021 17:02:40 +0200
Subject: [PATCH 08/13] Wording nits
---
docs/source/cpp/compute.rst | 24 ++++++++++++------------
1 file changed, 12 insertions(+), 12 deletions(-)
diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
index d94bc4ce38a..d9b338ad41c 100644
--- a/docs/source/cpp/compute.rst
+++ b/docs/source/cpp/compute.rst
@@ -248,9 +248,9 @@ Grouped Aggregations ("group by")
Grouped aggregations are not directly invokable, but are used as part of a
SQL-style "group by" operation. Like scalar aggregations, grouped aggregations
reduce multiple input values to a single output value. Instead of aggregating
-all values of the input, however, grouped aggregations partition of the input
+all values of the input, however, grouped aggregations partition the input
values on some set of "key" columns, then aggregate each group individually,
-emitting one output per input group.
+emitting one output value per input group.
As an example, for the following table:
@@ -267,12 +267,12 @@ As an example, for the following table:
+-----------------+--------------+
| null | null |
+-----------------+--------------+
-| 5 | null |
+| 9 | null |
+-----------------+--------------+
-We compute a sum of column "x", grouped on the key column "key". This gives us
-three groups, with the following results. Note that null is treated as a
-distinct key.
+we can compute a sum of the column "x", grouped on the column "key".
+This gives us three groups, with the following results. Note that null is
+treated as a distinct key value.
+-----------------+--------------+
| Column "sum(x)" | Column "key" |
@@ -281,12 +281,12 @@ distinct key.
+-----------------+--------------+
| null | "b" |
+-----------------+--------------+
-| 5 | null |
+| 9 | null |
+-----------------+--------------+
-The supported aggregation functions are as follows. Note that currently, all
-function names are prefixed with "hash\_", which differentiates them from their
-scalar equivalents above and reflects how they are implemented internally.
+The supported aggregation functions are as follows. All function names are
+prefixed with "hash\_", which differentiates them from their scalar
+equivalents above and reflects how they are implemented internally.
+---------------+-------+-------------+----------------+----------------------------------+-------+
| Function name | Arity | Input types | Output type | Options class | Notes |
@@ -311,7 +311,7 @@ scalar equivalents above and reflects how they are implemented internally.
+---------------+-------+-------------+----------------+----------------------------------+-------+
* \(1) If null values are taken into account, by setting the
- ScalarAggregateOptions parameter skip_nulls = false, then `Kleene logic`_
+ :member:`ScalarAggregateOptions::skip_nulls` to false, then `Kleene logic`_
is applied. The min_count option is not respected.
* \(2) CountMode controls whether only non-null values are counted (the
@@ -321,7 +321,7 @@ scalar equivalents above and reflects how they are implemented internally.
* \(4) Output is Int64, UInt64 or Float64, depending on the input type.
-* \(5) tdigest/t-digest computes approximate quantiles, and so only needs a
+* \(5) T-digest computes approximate quantiles, and so only needs a
fixed amount of memory. See the `reference implementation
<https://github.com/tdunning/t-digest>`_ for details.
From bf80780fd3dc079c55d0b04c4f5f3fa2ecd67ead Mon Sep 17 00:00:00 2001
From: David Li
Date: Wed, 18 Aug 2021 11:19:04 -0400
Subject: [PATCH 09/13] Apply suggestions from code review
Co-authored-by: Ian Cook
---
docs/source/cpp/compute.rst | 32 ++++++++++++++++----------------
1 file changed, 16 insertions(+), 16 deletions(-)
diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
index d9b338ad41c..d928d55d34e 100644
--- a/docs/source/cpp/compute.rst
+++ b/docs/source/cpp/compute.rst
@@ -255,34 +255,34 @@ emitting one output value per input group.
As an example, for the following table:
+-----------------+--------------+
-| Column "x" | Column "key" |
+| Column ``key`` | Column ``x`` |
+=================+==============+
-| 2 | "a" |
+| "a" | 2 |
+-----------------+--------------+
-| 5 | "a" |
+| "a" | 5 |
+-----------------+--------------+
-| null | "b" |
+| "b" | null |
+-----------------+--------------+
-| null | "b" |
+| "b" | null |
+-----------------+--------------+
| null | null |
+-----------------+--------------+
-| 9 | null |
+| null | 9 |
+-----------------+--------------+
-we can compute a sum of the column "x", grouped on the column "key".
+we can compute a sum of the column ``x``, grouped on the column ``key``.
This gives us three groups, with the following results. Note that null is
treated as a distinct key value.
-+-----------------+--------------+
-| Column "sum(x)" | Column "key" |
-+=================+==============+
-| 7 | "a" |
-+-----------------+--------------+
-| null | "b" |
-+-----------------+--------------+
-| 9 | null |
-+-----------------+--------------+
++-----------------+-------------------+
+| Column ``key`` | Column ``sum(x)`` |
++=================+===================+
+| "a" | 7 |
++-----------------+-------------------+
+| "b" | null |
++-----------------+-------------------+
+| null | 9 |
++-----------------+-------------------+
The supported aggregation functions are as follows. All function names are
prefixed with "hash\_", which differentiates them from their scalar
From b107d136b71d583b0472b66cc6b6589e7b0770a7 Mon Sep 17 00:00:00 2001
From: David Li
Date: Wed, 18 Aug 2021 11:19:31 -0400
Subject: [PATCH 10/13] Update docs/source/cpp/compute.rst
Co-authored-by: Ian Cook
---
docs/source/cpp/compute.rst | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
index d928d55d34e..fcbf148dcac 100644
--- a/docs/source/cpp/compute.rst
+++ b/docs/source/cpp/compute.rst
@@ -285,7 +285,7 @@ treated as a distinct key value.
+-----------------+-------------------+
The supported aggregation functions are as follows. All function names are
-prefixed with "hash\_", which differentiates them from their scalar
+prefixed with ``hash\_``, which differentiates them from their scalar
equivalents above and reflects how they are implemented internally.
+---------------+-------+-------------+----------------+----------------------------------+-------+
From 844f45c9a1d3b3c85772c89d0a7453a722ab9a4d Mon Sep 17 00:00:00 2001
From: David Li
Date: Wed, 18 Aug 2021 11:22:17 -0400
Subject: [PATCH 11/13] ARROW-13311: [Documentation] Fix reST
---
docs/source/cpp/compute.rst | 50 ++++++++++++++++++-------------------
1 file changed, 25 insertions(+), 25 deletions(-)
diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
index fcbf148dcac..f77a4a86eef 100644
--- a/docs/source/cpp/compute.rst
+++ b/docs/source/cpp/compute.rst
@@ -254,38 +254,38 @@ emitting one output value per input group.
As an example, for the following table:
-+-----------------+--------------+
-| Column ``key`` | Column ``x`` |
-+=================+==============+
-| "a" | 2 |
-+-----------------+--------------+
-| "a" | 5 |
-+-----------------+--------------+
-| "b" | null |
-+-----------------+--------------+
-| "b" | null |
-+-----------------+--------------+
-| null | null |
-+-----------------+--------------+
-| null | 9 |
-+-----------------+--------------+
++------------------+-----------------+
+| Column ``key`` | Column ``x`` |
++==================+=================+
+| "a" | 2 |
++------------------+-----------------+
+| "a" | 5 |
++------------------+-----------------+
+| "b" | null |
++------------------+-----------------+
+| "b" | null |
++------------------+-----------------+
+| null | null |
++------------------+-----------------+
+| null | 9 |
++------------------+-----------------+
we can compute a sum of the column ``x``, grouped on the column ``key``.
This gives us three groups, with the following results. Note that null is
treated as a distinct key value.
-+-----------------+-------------------+
-| Column ``key`` | Column ``sum(x)`` |
-+=================+===================+
-| "a" | 7 |
-+-----------------+-------------------+
-| "b" | null |
-+-----------------+-------------------+
-| null | 9 |
-+-----------------+-------------------+
++------------------+-------------------+
+| Column ``key`` | Column ``sum(x)`` |
++==================+===================+
+| "a" | 7 |
++------------------+-------------------+
+| "b" | null |
++------------------+-------------------+
+| null | 9 |
++------------------+-------------------+
The supported aggregation functions are as follows. All function names are
-prefixed with ``hash\_``, which differentiates them from their scalar
+prefixed with ``hash_``, which differentiates them from their scalar
equivalents above and reflects how they are implemented internally.
+---------------+-------+-------------+----------------+----------------------------------+-------+
From 275c29970566d76d52a0fdb8c3d784e94e453713 Mon Sep 17 00:00:00 2001
From: David Li
Date: Wed, 18 Aug 2021 11:32:52 -0400
Subject: [PATCH 12/13] Update docs/source/cpp/compute.rst
Co-authored-by: Ian Cook
---
docs/source/cpp/compute.rst | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
index f77a4a86eef..4c2278c11e0 100644
--- a/docs/source/cpp/compute.rst
+++ b/docs/source/cpp/compute.rst
@@ -274,15 +274,15 @@ we can compute a sum of the column ``x``, grouped on the column ``key``.
This gives us three groups, with the following results. Note that null is
treated as a distinct key value.
-+------------------+-------------------+
-| Column ``key`` | Column ``sum(x)`` |
-+==================+===================+
-| "a" | 7 |
-+------------------+-------------------+
-| "b" | null |
-+------------------+-------------------+
-| null | 9 |
-+------------------+-------------------+
++------------------+-----------------------+
+| Column ``key`` | Column ``sum(x)`` |
++==================+=======================+
+| "a" | 7 |
++------------------+-----------------------+
+| "b" | null |
++------------------+-----------------------+
+| null | 9 |
++------------------+-----------------------+
The supported aggregation functions are as follows. All function names are
prefixed with ``hash_``, which differentiates them from their scalar
From 5590a67b37d10b55b8602e0c4e2364bed05b6ed1 Mon Sep 17 00:00:00 2001
From: David Li
Date: Thu, 19 Aug 2021 08:10:15 -0400
Subject: [PATCH 13/13] ARROW-13311: [Documentation] Fix return types
---
docs/source/cpp/compute.rst | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
index 4c2278c11e0..25cb7b7822f 100644
--- a/docs/source/cpp/compute.rst
+++ b/docs/source/cpp/compute.rst
@@ -291,11 +291,11 @@ equivalents above and reflects how they are implemented internally.
+---------------+-------+-------------+----------------+----------------------------------+-------+
| Function name | Arity | Input types | Output type | Options class | Notes |
+===============+=======+=============+================+==================================+=======+
-| hash_all | Unary | Boolean | Int64 | :struct:`ScalarAggregateOptions` | \(1) |
+| hash_all | Unary | Boolean | Boolean | :struct:`ScalarAggregateOptions` | \(1) |
+---------------+-------+-------------+----------------+----------------------------------+-------+
-| hash_any | Unary | Any | Int64 | :struct:`ScalarAggregateOptions` | \(1) |
+| hash_any | Unary | Boolean | Boolean | :struct:`ScalarAggregateOptions` | \(1) |
+---------------+-------+-------------+----------------+----------------------------------+-------+
-| hash_count | Unary | Boolean | Int64 | :struct:`CountOptions` | \(2) |
+| hash_count | Unary | Any | Int64 | :struct:`CountOptions` | \(2) |
+---------------+-------+-------------+----------------+----------------------------------+-------+
| hash_mean | Unary | Numeric | Float64 | :struct:`ScalarAggregateOptions` | |
+---------------+-------+-------------+----------------+----------------------------------+-------+