From f30cf18846fc1dff1dfa36e9e31ddd15fcb70add Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Thu, 10 Aug 2023 09:23:48 +0800 Subject: [PATCH 01/60] dictionary min max init --- .../arrow/compute/kernels/aggregate_basic.cc | 1 + .../kernels/aggregate_basic_internal.h | 107 ++++++++++++++++++ 2 files changed, 108 insertions(+) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index ddd24165246..15a5e95f17a 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -1118,6 +1118,7 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) { AddMinMaxKernels(MinMaxInit, NumericTypes(), func.get()); AddMinMaxKernels(MinMaxInit, TemporalTypes(), func.get()); AddMinMaxKernels(MinMaxInit, BaseBinaryTypes(), func.get()); + AddMinMaxKernel(MinMaxInit, Type::DICTIONARY, func.get()); AddMinMaxKernel(MinMaxInit, Type::FIXED_SIZE_BINARY, func.get()); AddMinMaxKernel(MinMaxInit, Type::INTERVAL_MONTHS, func.get()); AddMinMaxKernel(MinMaxInit, Type::DECIMAL128, func.get()); diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index 3de922531ab..37008d5d13b 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -891,6 +891,108 @@ struct NullMinMaxImpl : public ScalarAggregator { } }; +template +struct DictionaryMinMaxImpl : public ScalarAggregator { + using ThisType = DictionaryMinMaxImpl; + + DictionaryMinMaxImpl(std::shared_ptr out_type, ScalarAggregateOptions options) + : out_type(std::move(out_type)), + options(std::move(options)), + count(0), + min(nullptr), + max(nullptr), + has_nulls(false) { + this->options.min_count = std::max(1, this->options.min_count); + } + + Status Consume(KernelContext*, const ExecSpan& batch) override { + if (batch[0].is_scalar) { + return Status::NotImplemented("No min/max implemented for DictionaryScalar"); + } + + const DictionaryArray& dict_array = + checked_cast(*batch[0].array.ToArray()); + + std::shared_ptr dict_values = dict_array.dictionary(); + std::shared_ptr dict_indices = dict_array.indices(); + has_nulls = dict_indices->null_count() > 0; + count += dict_indices->length() - dict_indices->null_count() + + Datum dict_values_(*dict_values); + ARROW_ASSIGN_OR_RAISE(Datum result, MinMax(std::move(dict_values_))); + const StructScalar& struct_result = + checked_cast(*result.scalar()); + ARROW_ASSIGN_OR_RAISE(auto min_, struct_result.field(FieldRef("min"))); + ARROW_ASSIGN_OR_RAISE(auto max_, struct_result.field(FieldRef("max"))); + CompareMinMax(std::move(min_), std::move(max_)); + return Status::OK(); + } + + Status MergeFrom(KernelContext*, KernelState&& src) override { + const auto& other = checked_cast(src); + CompareMinMax(other.min, other.max); + has_nulls = has_nulls || other.has_nulls; + this->count += other.count; + return Status::OK(); + } + + Status Finalize(KernelContext*, Datum* out) override { + const auto& struct_type = checked_cast(*out_type); + const auto& child_type = struct_type.field(0)->type(); + + std::vector> values; + // Physical type != result type + if ((has_nulls && !options.skip_nulls) || (this->count < options.min_count) || + min == nullptr || min->type->id() == Type::NA) { + // (null, null) + auto null_scalar = MakeNullScalar(child_type); + values = {null_scalar, null_scalar}; + } else { + ARROW_CHECK_EQ(child_type->id(), min->type->id()); + ARROW_CHECK_EQ(child_type->id(), max->type->id()); + values = {std::move(min), std::move(max)}; + } + out->value = std::make_shared(std::move(values), this->out_type); + return Status::OK(); + } + + std::shared_ptr min; + std::shared_ptr max; + bool has_nulls; + std::shared_ptr out_type; + ScalarAggregateOptions options; + int64_t count; + + private: + Status CompareMinMax(std::shared_ptr min_, + std::shared_ptr max_) override { + if (min == nullptr || min->type->id() == Type::NA) { + min = min_; + } else if (min_ != nullptr && min_->type->id() != Type::NA) { + ARROW_ASSIGN_OR_RAISE(auto min_compare_result, + CallFunction("greater", {min, min_})); + const BooleanScalar& min_compare_result_scalar = + checked_cast(*min_compare_result.scalar()); + if (min_compare_result_scalar.value) { + min = min_; + } + } + + if (max == nullptr || max->type->id() == Type::NA) { + max = max_; + } else if (max_ != nullptr && max_->type->id() != Type::NA) { + ARROW_ASSIGN_OR_RAISE(auto max_compare_result, CallFunction("less", {max, max_})); + const BooleanScalar& max_compare_result_scalar = + checked_cast(*max_compare_result.scalar()); + if (max_compare_result_scalar.value) { + max = max_; + } + } + + return Status::OK(); + } +}; + // First/Last struct FirstLastInitState { @@ -981,6 +1083,11 @@ struct MinMaxInitState { return Status::OK(); } + Status Visit(const DictionaryType&) { + state.reset(new DictionaryMinMaxImpl(out_type, options)); + return Status::OK(); + } + template enable_if_physical_integer Visit(const Type&) { using PhysicalType = typename Type::PhysicalType; From 422a02ff34a62913f8d3d4a07f44490ac3c8c0a2 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Thu, 10 Aug 2023 21:32:21 +0800 Subject: [PATCH 02/60] fix bug --- .../kernels/aggregate_basic_internal.h | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index 37008d5d13b..cffb2629df2 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -898,15 +898,15 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { DictionaryMinMaxImpl(std::shared_ptr out_type, ScalarAggregateOptions options) : out_type(std::move(out_type)), options(std::move(options)), + has_nulls(false), count(0), min(nullptr), - max(nullptr), - has_nulls(false) { + max(nullptr) { this->options.min_count = std::max(1, this->options.min_count); } Status Consume(KernelContext*, const ExecSpan& batch) override { - if (batch[0].is_scalar) { + if (batch[0].is_scalar()) { return Status::NotImplemented("No min/max implemented for DictionaryScalar"); } @@ -916,21 +916,21 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { std::shared_ptr dict_values = dict_array.dictionary(); std::shared_ptr dict_indices = dict_array.indices(); has_nulls = dict_indices->null_count() > 0; - count += dict_indices->length() - dict_indices->null_count() + count += dict_indices->length() - dict_indices->null_count(); - Datum dict_values_(*dict_values); + Datum dict_values_(*dict_values); ARROW_ASSIGN_OR_RAISE(Datum result, MinMax(std::move(dict_values_))); const StructScalar& struct_result = checked_cast(*result.scalar()); ARROW_ASSIGN_OR_RAISE(auto min_, struct_result.field(FieldRef("min"))); ARROW_ASSIGN_OR_RAISE(auto max_, struct_result.field(FieldRef("max"))); - CompareMinMax(std::move(min_), std::move(max_)); + ARROW_RETURN_NOT_OK(CompareMinMax(std::move(min_), std::move(max_))); return Status::OK(); } Status MergeFrom(KernelContext*, KernelState&& src) override { const auto& other = checked_cast(src); - CompareMinMax(other.min, other.max); + ARROW_RETURN_NOT_OK(CompareMinMax(other.min, other.max)); has_nulls = has_nulls || other.has_nulls; this->count += other.count; return Status::OK(); @@ -942,7 +942,7 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { std::vector> values; // Physical type != result type - if ((has_nulls && !options.skip_nulls) || (this->count < options.min_count) || + if ((this->has_nulls && !options.skip_nulls) || (this->count < options.min_count) || min == nullptr || min->type->id() == Type::NA) { // (null, null) auto null_scalar = MakeNullScalar(child_type); @@ -956,16 +956,16 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { return Status::OK(); } - std::shared_ptr min; - std::shared_ptr max; - bool has_nulls; std::shared_ptr out_type; ScalarAggregateOptions options; + bool has_nulls; int64_t count; + std::shared_ptr min; + std::shared_ptr max; private: Status CompareMinMax(std::shared_ptr min_, - std::shared_ptr max_) override { + std::shared_ptr max_) { if (min == nullptr || min->type->id() == Type::NA) { min = min_; } else if (min_ != nullptr && min_->type->id() != Type::NA) { From 1a5da63deb32d7966bd662bd797370078c368dfd Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Thu, 10 Aug 2023 23:35:21 +0800 Subject: [PATCH 03/60] fix output type --- .../kernels/aggregate_basic_internal.h | 68 ++++++++++--------- 1 file changed, 37 insertions(+), 31 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index cffb2629df2..292712d95f9 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -895,9 +895,9 @@ template struct DictionaryMinMaxImpl : public ScalarAggregator { using ThisType = DictionaryMinMaxImpl; - DictionaryMinMaxImpl(std::shared_ptr out_type, ScalarAggregateOptions options) - : out_type(std::move(out_type)), - options(std::move(options)), + DictionaryMinMaxImpl(ScalarAggregateOptions options) + : options(std::move(options)), + out_child_type(nullptr), has_nulls(false), count(0), min(nullptr), @@ -910,82 +910,88 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { return Status::NotImplemented("No min/max implemented for DictionaryScalar"); } - const DictionaryArray& dict_array = - checked_cast(*batch[0].array.ToArray()); + DictionaryArray arr(batch[0].array.ToArrayData()); + std::shared_ptr dict_values = arr.dictionary(); + std::shared_ptr dict_indices = arr.indices(); - std::shared_ptr dict_values = dict_array.dictionary(); - std::shared_ptr dict_indices = dict_array.indices(); - has_nulls = dict_indices->null_count() > 0; - count += dict_indices->length() - dict_indices->null_count(); + this->out_child_type = dict_values->type(); + this->has_nulls = dict_indices->null_count() > 0; + this->count += dict_indices->length() - dict_indices->null_count(); Datum dict_values_(*dict_values); ARROW_ASSIGN_OR_RAISE(Datum result, MinMax(std::move(dict_values_))); const StructScalar& struct_result = checked_cast(*result.scalar()); + ARROW_ASSIGN_OR_RAISE(auto min_, struct_result.field(FieldRef("min"))); ARROW_ASSIGN_OR_RAISE(auto max_, struct_result.field(FieldRef("max"))); + ARROW_RETURN_NOT_OK(CompareMinMax(std::move(min_), std::move(max_))); return Status::OK(); } Status MergeFrom(KernelContext*, KernelState&& src) override { const auto& other = checked_cast(src); + ARROW_RETURN_NOT_OK(CompareMinMax(other.min, other.max)); - has_nulls = has_nulls || other.has_nulls; + if (this->out_child_type == nullptr) { + this->out_child_type = other.out_child_type; + } else if (other.out_child_type != nullptr) { + ARROW_CHECK_EQ(this->out_child_type->id(), other.out_child_type->id()); + } + this->has_nulls = this->has_nulls || other.has_nulls; this->count += other.count; return Status::OK(); } Status Finalize(KernelContext*, Datum* out) override { - const auto& struct_type = checked_cast(*out_type); - const auto& child_type = struct_type.field(0)->type(); - std::vector> values; // Physical type != result type if ((this->has_nulls && !options.skip_nulls) || (this->count < options.min_count) || - min == nullptr || min->type->id() == Type::NA) { + this->min == nullptr || this->min->type->id() == Type::NA) { // (null, null) - auto null_scalar = MakeNullScalar(child_type); + std::shared_ptr null_scalar = MakeNullScalar(this->out_child_type); values = {null_scalar, null_scalar}; } else { - ARROW_CHECK_EQ(child_type->id(), min->type->id()); - ARROW_CHECK_EQ(child_type->id(), max->type->id()); - values = {std::move(min), std::move(max)}; + values = {std::move(this->min), std::move(this->max)}; } - out->value = std::make_shared(std::move(values), this->out_type); + + out->value = std::make_shared( + std::move(values), struct_({field("min", this->out_child_type), + field("max", this->out_child_type)})); return Status::OK(); } - std::shared_ptr out_type; ScalarAggregateOptions options; + std::shared_ptr out_child_type; bool has_nulls; int64_t count; std::shared_ptr min; std::shared_ptr max; private: - Status CompareMinMax(std::shared_ptr min_, - std::shared_ptr max_) { - if (min == nullptr || min->type->id() == Type::NA) { - min = min_; + Status CompareMinMax(std::shared_ptr min_, std::shared_ptr max_) { + if (this->min == nullptr || this->min->type->id() == Type::NA) { + this->min = min_; } else if (min_ != nullptr && min_->type->id() != Type::NA) { - ARROW_ASSIGN_OR_RAISE(auto min_compare_result, + ARROW_ASSIGN_OR_RAISE(Datum min_compare_result, CallFunction("greater", {min, min_})); + const BooleanScalar& min_compare_result_scalar = checked_cast(*min_compare_result.scalar()); if (min_compare_result_scalar.value) { - min = min_; + this->min = min_; } } - if (max == nullptr || max->type->id() == Type::NA) { - max = max_; + if (this->max == nullptr || this->max->type->id() == Type::NA) { + this->max = max_; } else if (max_ != nullptr && max_->type->id() != Type::NA) { - ARROW_ASSIGN_OR_RAISE(auto max_compare_result, CallFunction("less", {max, max_})); + ARROW_ASSIGN_OR_RAISE(Datum max_compare_result, CallFunction("less", {max, max_})); const BooleanScalar& max_compare_result_scalar = checked_cast(*max_compare_result.scalar()); if (max_compare_result_scalar.value) { - max = max_; + this->max = max_; } } @@ -1084,7 +1090,7 @@ struct MinMaxInitState { } Status Visit(const DictionaryType&) { - state.reset(new DictionaryMinMaxImpl(out_type, options)); + state.reset(new DictionaryMinMaxImpl(options)); return Status::OK(); } From 89571ad66d5b81196590d4d06bc312a82af1dbb2 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Fri, 11 Aug 2023 10:07:20 +0800 Subject: [PATCH 04/60] lint --- cpp/src/arrow/compute/kernels/aggregate_basic_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index 292712d95f9..f15eb8a2a39 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -895,7 +895,7 @@ template struct DictionaryMinMaxImpl : public ScalarAggregator { using ThisType = DictionaryMinMaxImpl; - DictionaryMinMaxImpl(ScalarAggregateOptions options) + explicit DictionaryMinMaxImpl(ScalarAggregateOptions options) : options(std::move(options)), out_child_type(nullptr), has_nulls(false), From 24f5d95a6154faf3cb32c12499366f60c56d61b3 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Fri, 11 Aug 2023 21:42:49 +0800 Subject: [PATCH 05/60] add correct dictionary value type --- .../arrow/compute/kernels/aggregate_basic.cc | 34 ++++++++++++++++--- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 15a5e95f17a..70df1b32748 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -22,7 +22,7 @@ #include "arrow/compute/kernels/util_internal.h" #include "arrow/util/cpu_info.h" #include "arrow/util/hashing.h" - +#include #include namespace arrow { @@ -492,11 +492,22 @@ Result> MinMaxInit(KernelContext* ctx, return visitor.Create(); } +namespace{ + +Result DictionaryValueType(KernelContext*, const std::vector& types) { + // T -> T.value_type + auto ty = types.front().GetSharedPtr(); + const DictionaryType& ty_dict = checked_cast(*ty); + return ty_dict.value_type(); +} + +} //namespace + // For "min" and "max" functions: override finalize and return the actual value template void AddMinOrMaxAggKernel(ScalarAggregateFunction* func, ScalarAggregateFunction* min_max_func) { - auto sig = KernelSignature::Make({InputType::Any()}, FirstType); + std::shared_ptr sig = KernelSignature::Make({InputType::Any()}, FirstType); auto init = [min_max_func]( KernelContext* ctx, const KernelInitArgs& args) -> Result> { @@ -516,7 +527,10 @@ void AddMinOrMaxAggKernel(ScalarAggregateFunction* func, // Note SIMD level is always NONE, but the convenience kernel will // dispatch to an appropriate implementation - AddAggKernel(std::move(sig), std::move(init), std::move(finalize), func); + AddAggKernel(sig, init, finalize, func); + + sig = KernelSignature::Make({InputType(Type::DICTIONARY)}, DictionaryValueType); + AddAggKernel(sig, init, finalize, func); } // ---------------------------------------------------------------------- @@ -873,6 +887,13 @@ Result MinMaxType(KernelContext*, const std::vector& typ return struct_({field("min", ty), field("max", ty)}); } +Result DictionaryMinMaxType(KernelContext*, const std::vector& types) { + // T -> struct + auto ty = types.front().GetSharedPtr(); + const DictionaryType& ty_dict = checked_cast(*ty); + return struct_({field("min", ty_dict.value_type()), field("max", ty_dict.value_type())}); +} + } // namespace Result FirstLastType(KernelContext*, const std::vector& types) { @@ -896,7 +917,12 @@ void AddFirstLastKernels(KernelInit init, void AddMinMaxKernel(KernelInit init, internal::detail::GetTypeId get_id, ScalarAggregateFunction* func, SimdLevel::type simd_level) { - auto sig = KernelSignature::Make({InputType(get_id.id)}, MinMaxType); + std::shared_ptr sig; + if (get_id.id == Type::DICTIONARY) { + sig = KernelSignature::Make({InputType(get_id.id)}, DictionaryMinMaxType); + } else { + sig = KernelSignature::Make({InputType(get_id.id)}, MinMaxType); + } AddAggKernel(std::move(sig), init, func, simd_level); } From 5917cd24b30d9f124a9e4a1d739d830b60b177a7 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Fri, 11 Aug 2023 21:46:49 +0800 Subject: [PATCH 06/60] lint --- cpp/src/arrow/compute/kernels/aggregate_basic.cc | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 70df1b32748..801d2dfc645 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -505,9 +505,10 @@ Result DictionaryValueType(KernelContext*, const std::vector -void AddMinOrMaxAggKernel(ScalarAggregateFunction* func, - ScalarAggregateFunction* min_max_func) { - std::shared_ptr sig = KernelSignature::Make({InputType::Any()}, FirstType); +void AddMinOrMaxAggKernels(ScalarAggregateFunction* func, + ScalarAggregateFunction* min_max_func) { + std::shared_ptr sig = + KernelSignature::Make({InputType::Any()}, FirstType); auto init = [min_max_func]( KernelContext* ctx, const KernelInitArgs& args) -> Result> { @@ -1167,12 +1168,12 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) { // Add min/max as convenience functions func = std::make_shared("min", Arity::Unary(), min_or_max_doc, &default_scalar_aggregate_options); - AddMinOrMaxAggKernel(func.get(), min_max_func); + AddMinOrMaxAggKernels(func.get(), min_max_func); DCHECK_OK(registry->AddFunction(std::move(func))); func = std::make_shared("max", Arity::Unary(), min_or_max_doc, &default_scalar_aggregate_options); - AddMinOrMaxAggKernel(func.get(), min_max_func); + AddMinOrMaxAggKernels(func.get(), min_max_func); DCHECK_OK(registry->AddFunction(std::move(func))); func = std::make_shared("product", Arity::Unary(), product_doc, From e9f4e5ade0e1d8bf654b8a9f5199fdec1c22a6fe Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Fri, 11 Aug 2023 21:47:33 +0800 Subject: [PATCH 07/60] delete --- cpp/src/arrow/compute/kernels/aggregate_basic.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 801d2dfc645..0622ef1b13b 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -22,7 +22,7 @@ #include "arrow/compute/kernels/util_internal.h" #include "arrow/util/cpu_info.h" #include "arrow/util/hashing.h" -#include + #include namespace arrow { From 77520113eada5814ef9937fb45477ed1fabfcfe7 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Fri, 11 Aug 2023 21:59:33 +0800 Subject: [PATCH 08/60] unify out_type --- .../kernels/aggregate_basic_internal.h | 23 ++++++++----------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index f15eb8a2a39..483841daeea 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -895,9 +895,9 @@ template struct DictionaryMinMaxImpl : public ScalarAggregator { using ThisType = DictionaryMinMaxImpl; - explicit DictionaryMinMaxImpl(ScalarAggregateOptions options) + explicit DictionaryMinMaxImpl(std::shared_ptr out_type, ScalarAggregateOptions options) : options(std::move(options)), - out_child_type(nullptr), + out_type(std::move(out_type)), has_nulls(false), count(0), min(nullptr), @@ -914,7 +914,6 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { std::shared_ptr dict_values = arr.dictionary(); std::shared_ptr dict_indices = arr.indices(); - this->out_child_type = dict_values->type(); this->has_nulls = dict_indices->null_count() > 0; this->count += dict_indices->length() - dict_indices->null_count(); @@ -934,36 +933,32 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { const auto& other = checked_cast(src); ARROW_RETURN_NOT_OK(CompareMinMax(other.min, other.max)); - if (this->out_child_type == nullptr) { - this->out_child_type = other.out_child_type; - } else if (other.out_child_type != nullptr) { - ARROW_CHECK_EQ(this->out_child_type->id(), other.out_child_type->id()); - } this->has_nulls = this->has_nulls || other.has_nulls; this->count += other.count; return Status::OK(); } Status Finalize(KernelContext*, Datum* out) override { + const auto& struct_type = checked_cast(*out_type); + const auto& child_type = struct_type.field(0)->type(); + std::vector> values; // Physical type != result type if ((this->has_nulls && !options.skip_nulls) || (this->count < options.min_count) || this->min == nullptr || this->min->type->id() == Type::NA) { // (null, null) - std::shared_ptr null_scalar = MakeNullScalar(this->out_child_type); + std::shared_ptr null_scalar = MakeNullScalar(child_type); values = {null_scalar, null_scalar}; } else { values = {std::move(this->min), std::move(this->max)}; } - out->value = std::make_shared( - std::move(values), struct_({field("min", this->out_child_type), - field("max", this->out_child_type)})); + out->value = std::make_shared(std::move(values), this->out_type); return Status::OK(); } ScalarAggregateOptions options; - std::shared_ptr out_child_type; + std::shared_ptr out_type; bool has_nulls; int64_t count; std::shared_ptr min; @@ -1090,7 +1085,7 @@ struct MinMaxInitState { } Status Visit(const DictionaryType&) { - state.reset(new DictionaryMinMaxImpl(options)); + state.reset(new DictionaryMinMaxImpl(out_type, options)); return Status::OK(); } From 640b17c82f4c8a3abacf3e58c252e20f8e3ffb24 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Fri, 11 Aug 2023 22:00:17 +0800 Subject: [PATCH 09/60] lint --- cpp/src/arrow/compute/kernels/aggregate_basic_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index 483841daeea..92d7f4ec1e3 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -895,7 +895,7 @@ template struct DictionaryMinMaxImpl : public ScalarAggregator { using ThisType = DictionaryMinMaxImpl; - explicit DictionaryMinMaxImpl(std::shared_ptr out_type, ScalarAggregateOptions options) + DictionaryMinMaxImpl(std::shared_ptr out_type, ScalarAggregateOptions options) : options(std::move(options)), out_type(std::move(out_type)), has_nulls(false), From d35f3b9873422b50aa4d2fc9568e303b2869cae0 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Fri, 11 Aug 2023 23:26:30 +0800 Subject: [PATCH 10/60] add one test --- cpp/src/arrow/compute/kernels/aggregate_test.cc | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index 2a7ba1a51e4..bc4408c95f7 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -2033,6 +2033,18 @@ TEST(TestDecimalMinMaxKernel, Decimals) { } } +TEST(TestDictionaryMinMaxKernel, DictionaryArray) { + ScalarAggregateOptions options; + for (const auto& index_type : all_dictionary_index_types()) { + ARROW_SCOPED_TRACE("index_type = ", index_type->ToString()); + EXPECT_THAT( + MinMax(DictArrayFromJSON(dictionary(index_type, int64()), + "[0, 4, null, 1, null, 0, 4, 2, 3]", "[5, 4, 2, 0, 7]"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": "0", "max": "7"})"))); + } +} + TEST(TestNullMinMaxKernel, Basics) { auto item_ty = null(); auto ty = struct_({field("min", item_ty), field("max", item_ty)}); From f205ade3b60380ce275ba83be2bad393c58217cc Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Sat, 12 Aug 2023 08:46:50 +0800 Subject: [PATCH 11/60] type --- cpp/src/arrow/compute/kernels/aggregate_basic.cc | 4 ++-- cpp/src/arrow/compute/kernels/aggregate_test.cc | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 0622ef1b13b..c8688a723ed 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -496,7 +496,7 @@ namespace{ Result DictionaryValueType(KernelContext*, const std::vector& types) { // T -> T.value_type - auto ty = types.front().GetSharedPtr(); + auto ty = types.front(); const DictionaryType& ty_dict = checked_cast(*ty); return ty_dict.value_type(); } @@ -890,7 +890,7 @@ Result MinMaxType(KernelContext*, const std::vector& typ Result DictionaryMinMaxType(KernelContext*, const std::vector& types) { // T -> struct - auto ty = types.front().GetSharedPtr(); + auto ty = types.front(); const DictionaryType& ty_dict = checked_cast(*ty); return struct_({field("min", ty_dict.value_type()), field("max", ty_dict.value_type())}); } diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index bc4408c95f7..8996361c379 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -2035,8 +2035,11 @@ TEST(TestDecimalMinMaxKernel, Decimals) { TEST(TestDictionaryMinMaxKernel, DictionaryArray) { ScalarAggregateOptions options; + std::shared_ptr ty; for (const auto& index_type : all_dictionary_index_types()) { ARROW_SCOPED_TRACE("index_type = ", index_type->ToString()); + + ty = struct_({field("min", int64()), field("max", int64())}); EXPECT_THAT( MinMax(DictArrayFromJSON(dictionary(index_type, int64()), "[0, 4, null, 1, null, 0, 4, 2, 3]", "[5, 4, 2, 0, 7]"), From b034d5d265580dedbd2336cc90b84d34baddf1d5 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Sat, 12 Aug 2023 10:42:06 +0800 Subject: [PATCH 12/60] basic test --- cpp/src/arrow/compute/kernels/aggregate_test.cc | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index 8996361c379..f31774c35d8 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -2035,16 +2035,19 @@ TEST(TestDecimalMinMaxKernel, Decimals) { TEST(TestDictionaryMinMaxKernel, DictionaryArray) { ScalarAggregateOptions options; + std::shared_ptr item_ty; + std::shared_ptr dict_ty; std::shared_ptr ty; for (const auto& index_type : all_dictionary_index_types()) { ARROW_SCOPED_TRACE("index_type = ", index_type->ToString()); - ty = struct_({field("min", int64()), field("max", int64())}); + item_ty = decimal128(5, 2); + dict_ty = dictionary(index_type, item_ty); + ty = struct_({field("min", item_ty), field("max", item_ty)}); EXPECT_THAT( - MinMax(DictArrayFromJSON(dictionary(index_type, int64()), - "[0, 4, null, 1, null, 0, 4, 2, 3]", "[5, 4, 2, 0, 7]"), + MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 1, 0])" , R"(["5.10", "-1.23"])"), options), - ResultWith(ScalarFromJSON(ty, R"({"min": "0", "max": "7"})"))); + ResultWith(ScalarFromJSON(ty, R"({"min": "-1.23", "max": "5.10"})"))); } } From 649d69ff566facfba01bee6af909352d9811ffbf Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Sat, 12 Aug 2023 11:43:31 +0800 Subject: [PATCH 13/60] add decimal test --- .../arrow/compute/kernels/aggregate_test.cc | 70 ++++++++++++++++--- 1 file changed, 61 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index f31774c35d8..3634ef3e690 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -2037,17 +2037,69 @@ TEST(TestDictionaryMinMaxKernel, DictionaryArray) { ScalarAggregateOptions options; std::shared_ptr item_ty; std::shared_ptr dict_ty; - std::shared_ptr ty; + std::shared_ptr ty; for (const auto& index_type : all_dictionary_index_types()) { ARROW_SCOPED_TRACE("index_type = ", index_type->ToString()); - - item_ty = decimal128(5, 2); - dict_ty = dictionary(index_type, item_ty); - ty = struct_({field("min", item_ty), field("max", item_ty)}); - EXPECT_THAT( - MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 1, 0])" , R"(["5.10", "-1.23"])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": "-1.23", "max": "5.10"})"))); + + for (const auto& item_ty_ : {decimal128(5, 2), decimal256(5, 2)}) { + dict_ty = dictionary(index_type, item_ty_); + ty = struct_({field("min", item_ty_), field("max", item_ty_)}); + + options = ScalarAggregateOptions(/*skip_nulls=*/true); + EXPECT_THAT( + MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 1, 0])", R"(["5.10", "-1.23"])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": "-1.23", "max": "5.10"})"))); + EXPECT_THAT( + MinMax(DictArrayFromJSON(dict_ty, R"([3, 1, 1, 4, 0, 2, null])", + R"(["5.10", "-1.23", "2.00", "3.45", "4.56"])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": "-1.23", "max": "5.10"})"))); + EXPECT_THAT( + MinMax(DictArrayFromJSON(dict_ty, R"([null, 3, null, 1, 4, 3, 0, 2, null])", + R"(["-5.10", "-1.23", "-2.00", "-3.45", "-4.56"])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": "-5.10", "max": "-1.23"})"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([null, null])", R"([])"), options), + ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0])", R"(["1.00"])"), options), + ResultWith(ScalarFromJSON(ty, R"({"min": "1.00", "max": "1.00"})"))); + + options = ScalarAggregateOptions(/*skip_nulls=*/false); + EXPECT_THAT( + MinMax(DictArrayFromJSON(dict_ty, R"([null, 3, 1, 1, 4, 0, 2, null])", + R"(["5.10", "-1.23", "2.00", "3.45", "4.56"])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); + EXPECT_THAT( + MinMax(DictArrayFromJSON(dict_ty, R"([3, 1, 1, 4, 0, 2])", + R"(["5.10", "-1.23", "2.00", "3.45", "4.56"])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": "-1.23", "max": "5.10"})"))); + + options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/0); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([null, null, null])", R"([])"), options), + ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0])", R"(["1.00"])"), options), + ResultWith(ScalarFromJSON(ty, R"({"min": "1.00", "max": "1.00"})"))); + + options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/5); + EXPECT_THAT( + MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 1, 0])", R"(["5.10", "-1.23"])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); + EXPECT_THAT( + MinMax(DictArrayFromJSON(dict_ty, R"([null, 3, 1, 1, 4, 0, 2, null, null])", + R"(["5.10", "-1.23", "2.00", "3.45", "4.56"])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": "-1.23", "max": "5.10"})"))); + + options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/1); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([null, null, null])", R"([])"), options), + ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0])", R"(["1.00"])"), options), + ResultWith(ScalarFromJSON(ty, R"({"min": "1.00", "max": "1.00"})"))); + } } } From aa20a10d0f9c6ed777387c514cdc653be6c91200 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Sat, 12 Aug 2023 12:03:20 +0800 Subject: [PATCH 14/60] lint --- cpp/src/arrow/compute/kernels/aggregate_basic.cc | 13 ++++++++----- cpp/src/arrow/compute/kernels/aggregate_test.cc | 11 ++++++----- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index c8688a723ed..365786ec7a1 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -492,16 +492,17 @@ Result> MinMaxInit(KernelContext* ctx, return visitor.Create(); } -namespace{ +namespace { -Result DictionaryValueType(KernelContext*, const std::vector& types) { +Result DictionaryValueType(KernelContext*, + const std::vector& types) { // T -> T.value_type auto ty = types.front(); const DictionaryType& ty_dict = checked_cast(*ty); return ty_dict.value_type(); } -} //namespace +} // namespace // For "min" and "max" functions: override finalize and return the actual value template @@ -888,11 +889,13 @@ Result MinMaxType(KernelContext*, const std::vector& typ return struct_({field("min", ty), field("max", ty)}); } -Result DictionaryMinMaxType(KernelContext*, const std::vector& types) { +Result DictionaryMinMaxType(KernelContext*, + const std::vector& types) { // T -> struct auto ty = types.front(); const DictionaryType& ty_dict = checked_cast(*ty); - return struct_({field("min", ty_dict.value_type()), field("max", ty_dict.value_type())}); + return struct_( + {field("min", ty_dict.value_type()), field("max", ty_dict.value_type())}); } } // namespace diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index 3634ef3e690..1deee32edc8 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -2035,7 +2035,6 @@ TEST(TestDecimalMinMaxKernel, Decimals) { TEST(TestDictionaryMinMaxKernel, DictionaryArray) { ScalarAggregateOptions options; - std::shared_ptr item_ty; std::shared_ptr dict_ty; std::shared_ptr ty; for (const auto& index_type : all_dictionary_index_types()) { @@ -2078,8 +2077,9 @@ TEST(TestDictionaryMinMaxKernel, DictionaryArray) { ResultWith(ScalarFromJSON(ty, R"({"min": "-1.23", "max": "5.10"})"))); options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/0); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([null, null, null])", R"([])"), options), - ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); + EXPECT_THAT( + MinMax(DictArrayFromJSON(dict_ty, R"([null, null, null])", R"([])"), options), + ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0])", R"(["1.00"])"), options), ResultWith(ScalarFromJSON(ty, R"({"min": "1.00", "max": "1.00"})"))); @@ -2095,8 +2095,9 @@ TEST(TestDictionaryMinMaxKernel, DictionaryArray) { ResultWith(ScalarFromJSON(ty, R"({"min": "-1.23", "max": "5.10"})"))); options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/1); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([null, null, null])", R"([])"), options), - ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); + EXPECT_THAT( + MinMax(DictArrayFromJSON(dict_ty, R"([null, null, null])", R"([])"), options), + ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0])", R"(["1.00"])"), options), ResultWith(ScalarFromJSON(ty, R"({"min": "1.00", "max": "1.00"})"))); } From 0703113ee622cde5001c0f19b55efbbc101385c0 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Sun, 13 Aug 2023 17:14:17 +0800 Subject: [PATCH 15/60] optimize code --- .../arrow/compute/kernels/aggregate_basic_internal.h | 12 +++++------- cpp/src/arrow/compute/kernels/aggregate_test.cc | 12 ++++++++++++ 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index 92d7f4ec1e3..676a6b11ed6 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -911,16 +911,14 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { } DictionaryArray arr(batch[0].array.ToArrayData()); - std::shared_ptr dict_values = arr.dictionary(); - std::shared_ptr dict_indices = arr.indices(); - - this->has_nulls = dict_indices->null_count() > 0; - this->count += dict_indices->length() - dict_indices->null_count(); + this->has_nulls = arr.null_count() > 0; + this->count += arr.length() - arr.null_count(); - Datum dict_values_(*dict_values); + std::shared_ptr dict_values = arr.dictionary(); + Datum dict_values_(*std::move(dict_values)); ARROW_ASSIGN_OR_RAISE(Datum result, MinMax(std::move(dict_values_))); const StructScalar& struct_result = - checked_cast(*result.scalar()); + checked_cast(*std::move(result.scalar())); ARROW_ASSIGN_OR_RAISE(auto min_, struct_result.field(FieldRef("min"))); ARROW_ASSIGN_OR_RAISE(auto max_, struct_result.field(FieldRef("max"))); diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index 1deee32edc8..f1705f9db9f 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -2101,6 +2101,18 @@ TEST(TestDictionaryMinMaxKernel, DictionaryArray) { EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0])", R"(["1.00"])"), options), ResultWith(ScalarFromJSON(ty, R"({"min": "1.00", "max": "1.00"})"))); } + + std::shared_ptr item_ty = null(); + dict_ty = dictionary(index_type, item_ty); + ty = struct_({field("min", item_ty), field("max", item_ty)}); + Datum result = ScalarFromJSON(ty, "[null, null]"); + EXPECT_THAT( + MinMax(DictArrayFromJSON(dict_ty, R"([null, null])", R"([null])"), options), + ResultWith(result)); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([])", R"([])"), options), + ResultWith(result)); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 0])", R"([null])"), options), + ResultWith(result)); } } From 5d2f51a50214a3b6d261039d0197ef8b0f299780 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Sun, 13 Aug 2023 17:18:22 +0800 Subject: [PATCH 16/60] simplify --- cpp/src/arrow/compute/kernels/aggregate_basic_internal.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index 676a6b11ed6..43d712d7a11 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -914,16 +914,14 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { this->has_nulls = arr.null_count() > 0; this->count += arr.length() - arr.null_count(); - std::shared_ptr dict_values = arr.dictionary(); - Datum dict_values_(*std::move(dict_values)); - ARROW_ASSIGN_OR_RAISE(Datum result, MinMax(std::move(dict_values_))); + Datum dict_values(arr.dictionary()); + ARROW_ASSIGN_OR_RAISE(Datum result, MinMax(std::move(dict_values))); const StructScalar& struct_result = checked_cast(*std::move(result.scalar())); - ARROW_ASSIGN_OR_RAISE(auto min_, struct_result.field(FieldRef("min"))); ARROW_ASSIGN_OR_RAISE(auto max_, struct_result.field(FieldRef("max"))); - ARROW_RETURN_NOT_OK(CompareMinMax(std::move(min_), std::move(max_))); + return Status::OK(); } From 2176153fa49289bdfdcc4e840da4a7c6f14b7653 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Sun, 13 Aug 2023 17:18:57 +0800 Subject: [PATCH 17/60] lint --- cpp/src/arrow/compute/kernels/aggregate_basic_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index 43d712d7a11..6b27453fdcf 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -921,7 +921,7 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { ARROW_ASSIGN_OR_RAISE(auto min_, struct_result.field(FieldRef("min"))); ARROW_ASSIGN_OR_RAISE(auto max_, struct_result.field(FieldRef("max"))); ARROW_RETURN_NOT_OK(CompareMinMax(std::move(min_), std::move(max_))); - + return Status::OK(); } From 8452aa611fe97641a241dc98062420e5779596e5 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Sun, 13 Aug 2023 17:26:04 +0800 Subject: [PATCH 18/60] format --- .../kernels/aggregate_basic_internal.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index 6b27453fdcf..941775a924d 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -965,12 +965,12 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { if (this->min == nullptr || this->min->type->id() == Type::NA) { this->min = min_; } else if (min_ != nullptr && min_->type->id() != Type::NA) { - ARROW_ASSIGN_OR_RAISE(Datum min_compare_result, - CallFunction("greater", {min, min_})); + ARROW_ASSIGN_OR_RAISE(Datum greater_result, + CallFunction("greater", {this->min, min_})); + const BooleanScalar& greater_scalar = + checked_cast(*greater_result.scalar()); - const BooleanScalar& min_compare_result_scalar = - checked_cast(*min_compare_result.scalar()); - if (min_compare_result_scalar.value) { + if (greater_scalar.value) { this->min = min_; } } @@ -978,10 +978,11 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { if (this->max == nullptr || this->max->type->id() == Type::NA) { this->max = max_; } else if (max_ != nullptr && max_->type->id() != Type::NA) { - ARROW_ASSIGN_OR_RAISE(Datum max_compare_result, CallFunction("less", {max, max_})); - const BooleanScalar& max_compare_result_scalar = - checked_cast(*max_compare_result.scalar()); - if (max_compare_result_scalar.value) { + ARROW_ASSIGN_OR_RAISE(Datum less_result, CallFunction("less", {this->max, max_})); + const BooleanScalar& less_scalar = + checked_cast(*less_result.scalar()); + + if (less_scalar.value) { this->max = max_; } } From 20af8593e04aba529ee746d90b0a48010d6a7f20 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Sun, 13 Aug 2023 17:27:27 +0800 Subject: [PATCH 19/60] space --- cpp/src/arrow/compute/kernels/aggregate_basic_internal.h | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index 941775a924d..31d20d2564b 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -921,7 +921,6 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { ARROW_ASSIGN_OR_RAISE(auto min_, struct_result.field(FieldRef("min"))); ARROW_ASSIGN_OR_RAISE(auto max_, struct_result.field(FieldRef("max"))); ARROW_RETURN_NOT_OK(CompareMinMax(std::move(min_), std::move(max_))); - return Status::OK(); } From 2a21b62d1f6f5d2419320dba264ca572bc05f997 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Sun, 13 Aug 2023 17:40:29 +0800 Subject: [PATCH 20/60] add boolean test --- .../arrow/compute/kernels/aggregate_test.cc | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index f1705f9db9f..7556939e7c0 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -2106,13 +2106,23 @@ TEST(TestDictionaryMinMaxKernel, DictionaryArray) { dict_ty = dictionary(index_type, item_ty); ty = struct_({field("min", item_ty), field("max", item_ty)}); Datum result = ScalarFromJSON(ty, "[null, null]"); - EXPECT_THAT( - MinMax(DictArrayFromJSON(dict_ty, R"([null, null])", R"([null])"), options), - ResultWith(result)); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([])", R"([])"), options), + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([null, null])", R"([])"), options), ResultWith(result)); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 0])", R"([null])"), options), + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([])", R"([])"), options), ResultWith(result)); + + item_ty = boolean(); + dict_ty = dictionary(index_type, item_ty); + ty = struct_({field("min", item_ty), field("max", item_ty)}); + EXPECT_THAT( + MinMax(DictArrayFromJSON(dict_ty, R"([0, 0, 1])", R"([false, true])"), options), + ResultWith(ScalarFromJSON(ty, "[false, true]"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 0, 0])", R"([false])"), options), + ResultWith(ScalarFromJSON(ty, "[false, false]"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 0, 0])", R"([true])"), options), + ResultWith(ScalarFromJSON(ty, "[true, true]"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([null, null])", R"([])"), options), + ResultWith(ScalarFromJSON(ty, "[null, null]"))); } } From 459609b81d387652334a0a7e3df4898b1a72b79b Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Sun, 20 Aug 2023 23:41:00 +0800 Subject: [PATCH 21/60] update --- .../arrow/compute/kernels/aggregate_basic_internal.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index 31d20d2564b..72b9089a068 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -914,10 +914,15 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { this->has_nulls = arr.null_count() > 0; this->count += arr.length() - arr.null_count(); - Datum dict_values(arr.dictionary()); + const std::shared_ptr& dict = arr.dictionary(); + if (dict->length == 0) { + return Status::OK(); + } + + dict_values(std::move(dict)); ARROW_ASSIGN_OR_RAISE(Datum result, MinMax(std::move(dict_values))); const StructScalar& struct_result = - checked_cast(*std::move(result.scalar())); + checked_cast(*result.scalar()); ARROW_ASSIGN_OR_RAISE(auto min_, struct_result.field(FieldRef("min"))); ARROW_ASSIGN_OR_RAISE(auto max_, struct_result.field(FieldRef("max"))); ARROW_RETURN_NOT_OK(CompareMinMax(std::move(min_), std::move(max_))); @@ -960,7 +965,8 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { std::shared_ptr max; private: - Status CompareMinMax(std::shared_ptr min_, std::shared_ptr max_) { + Status CompareMinMax(const std::shared_ptr& min_, + const std::shared_ptr& max_) { if (this->min == nullptr || this->min->type->id() == Type::NA) { this->min = min_; } else if (min_ != nullptr && min_->type->id() != Type::NA) { From e30d33db80b76f3d48311f1ed459720bc85db231 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Sun, 20 Aug 2023 23:48:00 +0800 Subject: [PATCH 22/60] add a empty test --- cpp/src/arrow/compute/kernels/aggregate_basic_internal.h | 4 ++-- cpp/src/arrow/compute/kernels/aggregate_test.cc | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index 72b9089a068..e3a03bc797e 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -915,11 +915,11 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { this->count += arr.length() - arr.null_count(); const std::shared_ptr& dict = arr.dictionary(); - if (dict->length == 0) { + if (dict->length() == 0) { return Status::OK(); } - dict_values(std::move(dict)); + Datum dict_values(std::move(dict)); ARROW_ASSIGN_OR_RAISE(Datum result, MinMax(std::move(dict_values))); const StructScalar& struct_result = checked_cast(*result.scalar()); diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index 7556939e7c0..0fbf80f4242 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -2061,6 +2061,8 @@ TEST(TestDictionaryMinMaxKernel, DictionaryArray) { ResultWith(ScalarFromJSON(ty, R"({"min": "-5.10", "max": "-1.23"})"))); EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([null, null])", R"([])"), options), ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([])", R"([])"), options), + ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0])", R"(["1.00"])"), options), ResultWith(ScalarFromJSON(ty, R"({"min": "1.00", "max": "1.00"})"))); From 5cdb5b6e100674c1d0330b89c679055752c7afdd Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Tue, 22 Aug 2023 08:13:29 +0800 Subject: [PATCH 23/60] move and rename --- Testing/Temporary/CTestCostData.txt | 1 + Testing/Temporary/LastTest.log | 3 +++ .../arrow/compute/kernels/aggregate_basic.cc | 12 --------- .../kernels/aggregate_basic_internal.h | 27 ++++++++++--------- .../arrow/compute/kernels/codegen_internal.cc | 8 ++++++ .../arrow/compute/kernels/codegen_internal.h | 2 ++ 6 files changed, 28 insertions(+), 25 deletions(-) create mode 100644 Testing/Temporary/CTestCostData.txt create mode 100644 Testing/Temporary/LastTest.log diff --git a/Testing/Temporary/CTestCostData.txt b/Testing/Temporary/CTestCostData.txt new file mode 100644 index 00000000000..ed97d539c09 --- /dev/null +++ b/Testing/Temporary/CTestCostData.txt @@ -0,0 +1 @@ +--- diff --git a/Testing/Temporary/LastTest.log b/Testing/Temporary/LastTest.log new file mode 100644 index 00000000000..0c369218269 --- /dev/null +++ b/Testing/Temporary/LastTest.log @@ -0,0 +1,3 @@ +Start testing: Aug 22 08:06 CST +---------------------------------------------------------- +End testing: Aug 22 08:06 CST diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 365786ec7a1..848c50523ad 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -492,18 +492,6 @@ Result> MinMaxInit(KernelContext* ctx, return visitor.Create(); } -namespace { - -Result DictionaryValueType(KernelContext*, - const std::vector& types) { - // T -> T.value_type - auto ty = types.front(); - const DictionaryType& ty_dict = checked_cast(*ty); - return ty_dict.value_type(); -} - -} // namespace - // For "min" and "max" functions: override finalize and return the actual value template void AddMinOrMaxAggKernels(ScalarAggregateFunction* func, diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index e3a03bc797e..7e30c076bc3 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -923,9 +923,9 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { ARROW_ASSIGN_OR_RAISE(Datum result, MinMax(std::move(dict_values))); const StructScalar& struct_result = checked_cast(*result.scalar()); - ARROW_ASSIGN_OR_RAISE(auto min_, struct_result.field(FieldRef("min"))); - ARROW_ASSIGN_OR_RAISE(auto max_, struct_result.field(FieldRef("max"))); - ARROW_RETURN_NOT_OK(CompareMinMax(std::move(min_), std::move(max_))); + ARROW_ASSIGN_OR_RAISE(auto dict_min, struct_result.field(FieldRef("min"))); + ARROW_ASSIGN_OR_RAISE(auto dict_max, struct_result.field(FieldRef("max"))); + ARROW_RETURN_NOT_OK(CompareMinMax(std::move(dict_min), std::move(dict_max))); return Status::OK(); } @@ -965,30 +965,31 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { std::shared_ptr max; private: - Status CompareMinMax(const std::shared_ptr& min_, - const std::shared_ptr& max_) { + Status CompareMinMax(const std::shared_ptr& other_min, + const std::shared_ptr& other_max) { if (this->min == nullptr || this->min->type->id() == Type::NA) { - this->min = min_; - } else if (min_ != nullptr && min_->type->id() != Type::NA) { + this->min = other_min; + } else if (other_min != nullptr && other_min->type->id() != Type::NA) { ARROW_ASSIGN_OR_RAISE(Datum greater_result, - CallFunction("greater", {this->min, min_})); + CallFunction("greater", {this->min, other_min})); const BooleanScalar& greater_scalar = checked_cast(*greater_result.scalar()); if (greater_scalar.value) { - this->min = min_; + this->min = other_min; } } if (this->max == nullptr || this->max->type->id() == Type::NA) { - this->max = max_; - } else if (max_ != nullptr && max_->type->id() != Type::NA) { - ARROW_ASSIGN_OR_RAISE(Datum less_result, CallFunction("less", {this->max, max_})); + this->max = other_max; + } else if (other_max != nullptr && other_max->type->id() != Type::NA) { + ARROW_ASSIGN_OR_RAISE(Datum less_result, + CallFunction("less", {this->max, other_max})); const BooleanScalar& less_scalar = checked_cast(*less_result.scalar()); if (less_scalar.value) { - this->max = max_; + this->max = other_max; } } diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.cc b/cpp/src/arrow/compute/kernels/codegen_internal.cc index 8e2669bd3df..04cd02e6743 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.cc +++ b/cpp/src/arrow/compute/kernels/codegen_internal.cc @@ -61,6 +61,14 @@ Result ListValuesType(KernelContext*, const std::vector& return list_type.value_type().get(); } +Result DictionaryValueType(KernelContext*, + const std::vector& types) { + // T -> T.value_type + auto ty = types.front(); + const DictionaryType& ty_dict = checked_cast(*ty); + return ty_dict.value_type(); +} + void EnsureDictionaryDecoded(std::vector* types) { EnsureDictionaryDecoded(types->data(), types->size()); } diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h b/cpp/src/arrow/compute/kernels/codegen_internal.h index 72b29057b82..ec005234ec7 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.h +++ b/cpp/src/arrow/compute/kernels/codegen_internal.h @@ -461,6 +461,8 @@ static void VisitTwoArrayValuesInline(const ArraySpan& arr0, const ArraySpan& ar Result FirstType(KernelContext*, const std::vector& types); Result LastType(KernelContext*, const std::vector& types); Result ListValuesType(KernelContext*, const std::vector& types); +Result DictionaryValueType(KernelContext*, + const std::vector& types); // ---------------------------------------------------------------------- // Helpers for iterating over common DataType instances for adding kernels to From 84dd9ea92a5a9ebbeb67511ea9bf48bc557fc520 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Tue, 22 Aug 2023 08:14:43 +0800 Subject: [PATCH 24/60] delete log --- Testing/Temporary/CTestCostData.txt | 1 - Testing/Temporary/LastTest.log | 3 --- 2 files changed, 4 deletions(-) delete mode 100644 Testing/Temporary/CTestCostData.txt delete mode 100644 Testing/Temporary/LastTest.log diff --git a/Testing/Temporary/CTestCostData.txt b/Testing/Temporary/CTestCostData.txt deleted file mode 100644 index ed97d539c09..00000000000 --- a/Testing/Temporary/CTestCostData.txt +++ /dev/null @@ -1 +0,0 @@ ---- diff --git a/Testing/Temporary/LastTest.log b/Testing/Temporary/LastTest.log deleted file mode 100644 index 0c369218269..00000000000 --- a/Testing/Temporary/LastTest.log +++ /dev/null @@ -1,3 +0,0 @@ -Start testing: Aug 22 08:06 CST ----------------------------------------------------------- -End testing: Aug 22 08:06 CST From f458569e88f68c9466075259b9c0155c3283cbb0 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Wed, 23 Aug 2023 08:48:08 +0800 Subject: [PATCH 25/60] matcher --- .../arrow/compute/kernels/aggregate_basic.cc | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 848c50523ad..795e3c7a094 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -492,12 +492,32 @@ Result> MinMaxInit(KernelContext* ctx, return visitor.Create(); } +struct AnyExceptDictionaryMatcher : TypeMatcher { + public: + AnyExceptDictionaryMatcher() {} + + bool Matches(const DataType& type) const override { + return type.id() != Type::DICTIONARY; + } + + std::string ToString() const override { return "Any Type except the Dictionary Type"; } + + bool Equals(const TypeMatcher& other) const override { + if (this == &other) { + return true; + } + auto casted = dynamic_cast(&other); + return casted != nullptr; + } +}; + // For "min" and "max" functions: override finalize and return the actual value template void AddMinOrMaxAggKernels(ScalarAggregateFunction* func, ScalarAggregateFunction* min_max_func) { + auto any_except_dict_matcher = std::make_shared(); std::shared_ptr sig = - KernelSignature::Make({InputType::Any()}, FirstType); + KernelSignature::Make({InputType(any_except_dict_matcher)}, FirstType); auto init = [min_max_func]( KernelContext* ctx, const KernelInitArgs& args) -> Result> { From 6a108a3a1937855e73592e8be00237664321d0c0 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Fri, 13 Oct 2023 09:21:21 +0800 Subject: [PATCH 26/60] dictionary comaction --- .../kernels/aggregate_basic_internal.h | 35 +++++++++++-------- .../arrow/compute/kernels/aggregate_test.cc | 16 +++++++++ 2 files changed, 37 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index 9f999ac78fc..12d35b264e2 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -926,34 +926,39 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { this->options.min_count = std::max(1, this->options.min_count); } - Status Consume(KernelContext*, const ExecSpan& batch) override { + Status Consume(KernelContext* ctx, const ExecSpan& batch) override { if (batch[0].is_scalar()) { return Status::NotImplemented("No min/max implemented for DictionaryScalar"); } - DictionaryArray arr(batch[0].array.ToArrayData()); - this->has_nulls = arr.null_count() > 0; - this->count += arr.length() - arr.null_count(); + DictionaryArray dict_arr(batch[0].array.ToArrayData()); + ARROW_ASSIGN_OR_RAISE(auto compacted_arr, dict_arr.Compact(ctx->memory_pool())); + const DictionaryArray& compacted_dict_arr = + checked_cast(*compacted_arr); + this->has_nulls = compacted_dict_arr.null_count() > 0; + this->count += compacted_dict_arr.length() - compacted_dict_arr.null_count(); - const std::shared_ptr& dict = arr.dictionary(); + const std::shared_ptr& dict = compacted_dict_arr.dictionary(); if (dict->length() == 0) { return Status::OK(); } Datum dict_values(std::move(dict)); - ARROW_ASSIGN_OR_RAISE(Datum result, MinMax(std::move(dict_values))); + ARROW_ASSIGN_OR_RAISE( + Datum result, MinMax(std::move(dict_values), ScalarAggregateOptions::Defaults(), + ctx->exec_context())); const StructScalar& struct_result = checked_cast(*result.scalar()); ARROW_ASSIGN_OR_RAISE(auto dict_min, struct_result.field(FieldRef("min"))); ARROW_ASSIGN_OR_RAISE(auto dict_max, struct_result.field(FieldRef("max"))); - ARROW_RETURN_NOT_OK(CompareMinMax(std::move(dict_min), std::move(dict_max))); + ARROW_RETURN_NOT_OK(CompareMinMax(std::move(dict_min), std::move(dict_max), ctx)); return Status::OK(); } - Status MergeFrom(KernelContext*, KernelState&& src) override { + Status MergeFrom(KernelContext* ctx, KernelState&& src) override { const auto& other = checked_cast(src); - ARROW_RETURN_NOT_OK(CompareMinMax(other.min, other.max)); + ARROW_RETURN_NOT_OK(CompareMinMax(other.min, other.max, ctx)); this->has_nulls = this->has_nulls || other.has_nulls; this->count += other.count; return Status::OK(); @@ -987,12 +992,13 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { private: Status CompareMinMax(const std::shared_ptr& other_min, - const std::shared_ptr& other_max) { + const std::shared_ptr& other_max, KernelContext* ctx) { if (this->min == nullptr || this->min->type->id() == Type::NA) { this->min = other_min; } else if (other_min != nullptr && other_min->type->id() != Type::NA) { - ARROW_ASSIGN_OR_RAISE(Datum greater_result, - CallFunction("greater", {this->min, other_min})); + ARROW_ASSIGN_OR_RAISE( + Datum greater_result, + CallFunction("greater", {this->min, other_min}, ctx->exec_context())); const BooleanScalar& greater_scalar = checked_cast(*greater_result.scalar()); @@ -1004,8 +1010,9 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { if (this->max == nullptr || this->max->type->id() == Type::NA) { this->max = other_max; } else if (other_max != nullptr && other_max->type->id() != Type::NA) { - ARROW_ASSIGN_OR_RAISE(Datum less_result, - CallFunction("less", {this->max, other_max})); + ARROW_ASSIGN_OR_RAISE( + Datum less_result, + CallFunction("less", {this->max, other_max}, ctx->exec_context())); const BooleanScalar& less_scalar = checked_cast(*less_result.scalar()); diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index 7d55545ea39..dcec8a4ea3a 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -2116,6 +2116,22 @@ TEST(TestDictionaryMinMaxKernel, DictionaryArray) { ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0])", R"(["1.00"])"), options), ResultWith(ScalarFromJSON(ty, R"({"min": "1.00", "max": "1.00"})"))); + + //compact dictionary + EXPECT_THAT( + MinMax( + DictArrayFromJSON( + dict_ty, R"([3, 1, 1, 4, 0, 2])", + R"(["5.10", "-1.23", "2.00", "3.45", "4.56", "8.20", "9.20", "10.20"])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": "-1.23", "max": "5.10"})"))); + EXPECT_THAT( + MinMax( + DictArrayFromJSON( + dict_ty, R"([5, 1, 1, 6, 0, 2])", + R"(["5.10", "-1.23", "2.00", "3.45", "4.56", "8.20", "9.20", "10.20"])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": "-1.23", "max": "9.20"})"))); } std::shared_ptr item_ty = null(); From 5859bc958be2910394fe98fc433ab8f3a7281c3e Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Fri, 13 Oct 2023 09:52:19 +0800 Subject: [PATCH 27/60] lint --- cpp/src/arrow/compute/kernels/aggregate_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index dcec8a4ea3a..dc1824031a9 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -2116,8 +2116,8 @@ TEST(TestDictionaryMinMaxKernel, DictionaryArray) { ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0])", R"(["1.00"])"), options), ResultWith(ScalarFromJSON(ty, R"({"min": "1.00", "max": "1.00"})"))); - - //compact dictionary + + // compact dictionary EXPECT_THAT( MinMax( DictArrayFromJSON( From 0b23b35452581dc87ed36ad87078cdc4db1233ae Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Sun, 15 Oct 2023 11:05:21 +0800 Subject: [PATCH 28/60] Update cpp/src/arrow/compute/kernels/aggregate_basic_internal.h Co-authored-by: Jin Shang --- cpp/src/arrow/compute/kernels/aggregate_basic_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index 12d35b264e2..c1bbe9172f9 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -935,7 +935,7 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { ARROW_ASSIGN_OR_RAISE(auto compacted_arr, dict_arr.Compact(ctx->memory_pool())); const DictionaryArray& compacted_dict_arr = checked_cast(*compacted_arr); - this->has_nulls = compacted_dict_arr.null_count() > 0; + this->has_nulls |= compacted_dict_arr.null_count() > 0; this->count += compacted_dict_arr.length() - compacted_dict_arr.null_count(); const std::shared_ptr& dict = compacted_dict_arr.dictionary(); From 2020d483453e8aba141dd2752ef3822bd16156f2 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Sun, 15 Oct 2023 12:24:50 +0800 Subject: [PATCH 29/60] fix comment --- cpp/src/arrow/compute/kernel.cc | 31 +++++++++++++++++++ cpp/src/arrow/compute/kernel.h | 5 +++ .../arrow/compute/kernels/aggregate_basic.cc | 25 +++------------ .../kernels/aggregate_basic_internal.h | 19 +++++------- 4 files changed, 48 insertions(+), 32 deletions(-) diff --git a/cpp/src/arrow/compute/kernel.cc b/cpp/src/arrow/compute/kernel.cc index fd554ba3d83..37cce33d99e 100644 --- a/cpp/src/arrow/compute/kernel.cc +++ b/cpp/src/arrow/compute/kernel.cc @@ -349,6 +349,37 @@ std::shared_ptr RunEndEncoded( std::move(value_type_matcher)); } +class NotMatcher : public TypeMatcher { + public: + explicit NotMatcher(std::shared_ptr base_matcher) + : base_matcher{std::move(base_matcher)} {} + + ~NotMatcher() override = default; + + bool Matches(const DataType& type) const override { + return !base_matcher->Matches(type); + } + + bool Equals(const TypeMatcher& other) const override { + if (this == &other) { + return true; + } + const auto* casted = dynamic_cast(&other); + return casted != nullptr && base_matcher->Equals(*casted->base_matcher); + } + + std::string ToString() const override { + return "not(" + base_matcher->ToString() + ")"; + }; + + private: + std::shared_ptr base_matcher; +}; + +std::shared_ptr Not(std::shared_ptr base_matcher) { + return std::make_shared(std::move(base_matcher)); +} + } // namespace match // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/compute/kernel.h b/cpp/src/arrow/compute/kernel.h index 5b5b5718e19..3516390ff8c 100644 --- a/cpp/src/arrow/compute/kernel.h +++ b/cpp/src/arrow/compute/kernel.h @@ -166,6 +166,11 @@ ARROW_EXPORT std::shared_ptr RunEndEncoded( std::shared_ptr run_end_type_matcher, std::shared_ptr value_type_matcher); +/// \brief Match types that the base_matcher doesn't match +/// +/// @param[in] base_matcher a matcher used to negation match +ARROW_EXPORT std::shared_ptr Not(std::shared_ptr base_matcher); + } // namespace match /// \brief An object used for type-checking arguments to be passed to a kernel diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 795e3c7a094..0512e17e21d 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -482,6 +482,9 @@ void AddFirstOrLastAggKernel(ScalarAggregateFunction* func, // ---------------------------------------------------------------------- // MinMax implementation +using arrow::compute::match::SameTypeId; +using arrow::compute::match::Not; + Result> MinMaxInit(KernelContext* ctx, const KernelInitArgs& args) { ARROW_ASSIGN_OR_RAISE(TypeHolder out_type, @@ -492,32 +495,12 @@ Result> MinMaxInit(KernelContext* ctx, return visitor.Create(); } -struct AnyExceptDictionaryMatcher : TypeMatcher { - public: - AnyExceptDictionaryMatcher() {} - - bool Matches(const DataType& type) const override { - return type.id() != Type::DICTIONARY; - } - - std::string ToString() const override { return "Any Type except the Dictionary Type"; } - - bool Equals(const TypeMatcher& other) const override { - if (this == &other) { - return true; - } - auto casted = dynamic_cast(&other); - return casted != nullptr; - } -}; - // For "min" and "max" functions: override finalize and return the actual value template void AddMinOrMaxAggKernels(ScalarAggregateFunction* func, ScalarAggregateFunction* min_max_func) { - auto any_except_dict_matcher = std::make_shared(); std::shared_ptr sig = - KernelSignature::Make({InputType(any_except_dict_matcher)}, FirstType); + KernelSignature::Make({InputType(Not(SameTypeId(Type::DICTIONARY)))}, FirstType); auto init = [min_max_func]( KernelContext* ctx, const KernelInitArgs& args) -> Result> { diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index 12d35b264e2..35f083f00b1 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -935,15 +935,14 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { ARROW_ASSIGN_OR_RAISE(auto compacted_arr, dict_arr.Compact(ctx->memory_pool())); const DictionaryArray& compacted_dict_arr = checked_cast(*compacted_arr); - this->has_nulls = compacted_dict_arr.null_count() > 0; - this->count += compacted_dict_arr.length() - compacted_dict_arr.null_count(); - const std::shared_ptr& dict = compacted_dict_arr.dictionary(); if (dict->length() == 0) { return Status::OK(); } + this->has_nulls = compacted_dict_arr.null_count() > 0; + this->count += compacted_dict_arr.length() - compacted_dict_arr.null_count(); - Datum dict_values(std::move(dict)); + Datum dict_values(dict); ARROW_ASSIGN_OR_RAISE( Datum result, MinMax(std::move(dict_values), ScalarAggregateOptions::Defaults(), ctx->exec_context())); @@ -951,14 +950,14 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { checked_cast(*result.scalar()); ARROW_ASSIGN_OR_RAISE(auto dict_min, struct_result.field(FieldRef("min"))); ARROW_ASSIGN_OR_RAISE(auto dict_max, struct_result.field(FieldRef("max"))); - ARROW_RETURN_NOT_OK(CompareMinMax(std::move(dict_min), std::move(dict_max), ctx)); + ARROW_RETURN_NOT_OK(UpdateMinMaxState(std::move(dict_min), std::move(dict_max), ctx)); return Status::OK(); } Status MergeFrom(KernelContext* ctx, KernelState&& src) override { const auto& other = checked_cast(src); - ARROW_RETURN_NOT_OK(CompareMinMax(other.min, other.max, ctx)); + ARROW_RETURN_NOT_OK(UpdateMinMaxState(other.min, other.max, ctx)); this->has_nulls = this->has_nulls || other.has_nulls; this->count += other.count; return Status::OK(); @@ -969,9 +968,7 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { const auto& child_type = struct_type.field(0)->type(); std::vector> values; - // Physical type != result type - if ((this->has_nulls && !options.skip_nulls) || (this->count < options.min_count) || - this->min == nullptr || this->min->type->id() == Type::NA) { + if ((this->has_nulls && !options.skip_nulls) || (this->count < options.min_count)) { // (null, null) std::shared_ptr null_scalar = MakeNullScalar(child_type); values = {null_scalar, null_scalar}; @@ -991,8 +988,8 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { std::shared_ptr max; private: - Status CompareMinMax(const std::shared_ptr& other_min, - const std::shared_ptr& other_max, KernelContext* ctx) { + Status UpdateMinMaxState(const std::shared_ptr& other_min, + const std::shared_ptr& other_max, KernelContext* ctx) { if (this->min == nullptr || this->min->type->id() == Type::NA) { this->min = other_min; } else if (other_min != nullptr && other_min->type->id() != Type::NA) { From 1594f4445a8ead3814ae6ca93cb8b9807bf0afc9 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Sun, 15 Oct 2023 16:33:10 +0800 Subject: [PATCH 30/60] fix bug --- cpp/src/arrow/compute/kernels/aggregate_basic_internal.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index 64ef4934f4d..6ccdac95229 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -935,9 +935,6 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { ARROW_ASSIGN_OR_RAISE(auto compacted_arr, dict_arr.Compact(ctx->memory_pool())); const DictionaryArray& compacted_dict_arr = checked_cast(*compacted_arr); - this->has_nulls |= compacted_dict_arr.null_count() > 0; - this->count += compacted_dict_arr.length() - compacted_dict_arr.null_count(); - const std::shared_ptr& dict = compacted_dict_arr.dictionary(); if (dict->length() == 0) { return Status::OK(); @@ -960,9 +957,9 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { Status MergeFrom(KernelContext* ctx, KernelState&& src) override { const auto& other = checked_cast(src); - ARROW_RETURN_NOT_OK(UpdateMinMaxState(other.min, other.max, ctx)); - this->has_nulls = this->has_nulls || other.has_nulls; + this->has_nulls |= other.has_nulls; this->count += other.count; + ARROW_RETURN_NOT_OK(UpdateMinMaxState(other.min, other.max, ctx)); return Status::OK(); } From 7c2ebe064d224aa2e1923f905bed271373152bda Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Sun, 15 Oct 2023 16:34:38 +0800 Subject: [PATCH 31/60] fix bug --- cpp/src/arrow/compute/kernels/aggregate_basic_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index 6ccdac95229..fddc142669b 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -939,7 +939,7 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { if (dict->length() == 0) { return Status::OK(); } - this->has_nulls = compacted_dict_arr.null_count() > 0; + this->has_nulls |= compacted_dict_arr.null_count() > 0; this->count += compacted_dict_arr.length() - compacted_dict_arr.null_count(); Datum dict_values(dict); From bc5a647fc51f95733c49ca56d8b3c286e496fb6b Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Sun, 15 Oct 2023 20:57:47 +0800 Subject: [PATCH 32/60] lint --- .../arrow/compute/kernels/aggregate_basic.cc | 2 +- .../arrow/compute/kernels/aggregate_test.cc | 49 +++++++++++++------ 2 files changed, 35 insertions(+), 16 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 0512e17e21d..31343a9c8f5 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -482,8 +482,8 @@ void AddFirstOrLastAggKernel(ScalarAggregateFunction* func, // ---------------------------------------------------------------------- // MinMax implementation -using arrow::compute::match::SameTypeId; using arrow::compute::match::Not; +using arrow::compute::match::SameTypeId; Result> MinMaxInit(KernelContext* ctx, const KernelInitArgs& args) { diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index dc1824031a9..1c52809d1d7 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -2047,16 +2047,16 @@ TEST(TestDecimalMinMaxKernel, Decimals) { } } -TEST(TestDictionaryMinMaxKernel, DictionaryArray) { +TEST(TestDictionaryMinMaxKernel, DecimalValue) { ScalarAggregateOptions options; std::shared_ptr dict_ty; std::shared_ptr ty; for (const auto& index_type : all_dictionary_index_types()) { ARROW_SCOPED_TRACE("index_type = ", index_type->ToString()); - for (const auto& item_ty_ : {decimal128(5, 2), decimal256(5, 2)}) { - dict_ty = dictionary(index_type, item_ty_); - ty = struct_({field("min", item_ty_), field("max", item_ty_)}); + for (const auto& value_ty_ : {decimal128(5, 2), decimal256(5, 2)}) { + dict_ty = dictionary(index_type, value_ty_); + ty = struct_({field("min", value_ty_), field("max", value_ty_)}); options = ScalarAggregateOptions(/*skip_nulls=*/true); EXPECT_THAT( @@ -2133,19 +2133,19 @@ TEST(TestDictionaryMinMaxKernel, DictionaryArray) { options), ResultWith(ScalarFromJSON(ty, R"({"min": "-1.23", "max": "9.20"})"))); } + } +} - std::shared_ptr item_ty = null(); - dict_ty = dictionary(index_type, item_ty); - ty = struct_({field("min", item_ty), field("max", item_ty)}); - Datum result = ScalarFromJSON(ty, "[null, null]"); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([null, null])", R"([])"), options), - ResultWith(result)); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([])", R"([])"), options), - ResultWith(result)); +TEST(TestDictionaryMinMaxKernel, BooleanValue) { + ScalarAggregateOptions options; + std::shared_ptr value_ty = boolean(); + std::shared_ptr ty = + struct_({field("min", value_ty), field("max", value_ty)}); + std::shared_ptr dict_ty; + + for (const auto& index_type : all_dictionary_index_types()) { + ARROW_SCOPED_TRACE("index_type = ", index_type->ToString()); - item_ty = boolean(); - dict_ty = dictionary(index_type, item_ty); - ty = struct_({field("min", item_ty), field("max", item_ty)}); EXPECT_THAT( MinMax(DictArrayFromJSON(dict_ty, R"([0, 0, 1])", R"([false, true])"), options), ResultWith(ScalarFromJSON(ty, "[false, true]"))); @@ -2158,6 +2158,25 @@ TEST(TestDictionaryMinMaxKernel, DictionaryArray) { } } +TEST(TestDictionaryMinMaxKernel, NullValue) { + ScalarAggregateOptions options; + std::shared_ptr value_ty = null(); + std::shared_ptr ty = + struct_({field("min", value_ty), field("max", value_ty)}); + std::shared_ptr dict_ty; + + for (const auto& index_type : all_dictionary_index_types()) { + ARROW_SCOPED_TRACE("index_type = ", index_type->ToString()); + // null + dict_ty = dictionary(index_type, value_ty); + Datum result = ScalarFromJSON(ty, "[null, null]"); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([null, null])", R"([])"), options), + ResultWith(result)); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([])", R"([])"), options), + ResultWith(result)); + } +} + TEST(TestNullMinMaxKernel, Basics) { auto item_ty = null(); auto ty = struct_({field("min", item_ty), field("max", item_ty)}); From 26234720c32499e5675c596cbde9f870de47eb42 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Sun, 15 Oct 2023 23:05:33 +0800 Subject: [PATCH 33/60] add float test --- .../arrow/compute/kernels/aggregate_test.cc | 85 ++++++++++++++++++- 1 file changed, 81 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index 1c52809d1d7..8bf52efa8c0 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -2047,7 +2047,7 @@ TEST(TestDecimalMinMaxKernel, Decimals) { } } -TEST(TestDictionaryMinMaxKernel, DecimalValue) { +TEST(TestDictionaryMinMaxKernel, DecimalsValue) { ScalarAggregateOptions options; std::shared_ptr dict_ty; std::shared_ptr ty; @@ -2139,13 +2139,14 @@ TEST(TestDictionaryMinMaxKernel, DecimalValue) { TEST(TestDictionaryMinMaxKernel, BooleanValue) { ScalarAggregateOptions options; std::shared_ptr value_ty = boolean(); + std::shared_ptr dict_ty; std::shared_ptr ty = struct_({field("min", value_ty), field("max", value_ty)}); - std::shared_ptr dict_ty; for (const auto& index_type : all_dictionary_index_types()) { ARROW_SCOPED_TRACE("index_type = ", index_type->ToString()); + dict_ty = dictionary(index_type, value_ty); EXPECT_THAT( MinMax(DictArrayFromJSON(dict_ty, R"([0, 0, 1])", R"([false, true])"), options), ResultWith(ScalarFromJSON(ty, "[false, true]"))); @@ -2158,16 +2159,92 @@ TEST(TestDictionaryMinMaxKernel, BooleanValue) { } } +TEST(TestDictionaryMinMaxKernel, FloatsValue) { + ScalarAggregateOptions options; + std::shared_ptr dict_ty; + std::shared_ptr ty; + + for (const auto& index_type : all_dictionary_index_types()) { + ARROW_SCOPED_TRACE("index_type = ", index_type->ToString()); + + for (const auto& value_ty_ : {float32(), float64()}) { + dict_ty = dictionary(index_type, value_ty_); + ty = struct_({field("min", value_ty_), field("max", value_ty_)}); + + options = ScalarAggregateOptions(/*skip_nulls=*/true); + EXPECT_THAT( + MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", R"([5, 1, 2, 3, 4])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": 1, "max": 5})"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", + R"([5, -Inf, 2, 3, 4])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": -Inf, "max": 5})"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, null, 2, 3, 4])", + R"([5, 1, 2, 3, 4])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": 2, "max": 5})"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, null, 3, 4])", + R"([5, -Inf, 2, 3, 4])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": -Inf, "max": 5})"))); + + options = ScalarAggregateOptions(/*skip_nulls=*/false); + EXPECT_THAT( + MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", R"([5, 1, 2, 3, 4])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": 1, "max": 5})"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", + R"([5, -Inf, 2, 3, 4])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": -Inf, "max": 5})"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, null, 2, 3, 4])", + R"([5, 1, 2, 3, 4])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, null, 3, 4])", + R"([5, -Inf, 2, 3, 4])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); + + options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/0); + EXPECT_THAT( + MinMax(DictArrayFromJSON(dict_ty, R"([null, null, null])", R"([])"), options), + ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0])", R"([1])"), options), + ResultWith(ScalarFromJSON(ty, R"({"min": 1, "max": 1})"))); + + options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/5); + EXPECT_THAT( + MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 1, 0])", R"([5, 1])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); + EXPECT_THAT( + MinMax(DictArrayFromJSON(dict_ty, R"([null, 3, 1, 1, 4, 0, 2, null, null])", + R"([5, 1, 2, 3, 4])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": 1, "max": 5})"))); + + options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/1); + EXPECT_THAT( + MinMax(DictArrayFromJSON(dict_ty, R"([null, null, null])", R"([])"), options), + ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0])", R"([1])"), options), + ResultWith(ScalarFromJSON(ty, R"({"min": 1, "max": 1})"))); + } + } +} + TEST(TestDictionaryMinMaxKernel, NullValue) { ScalarAggregateOptions options; std::shared_ptr value_ty = null(); + std::shared_ptr dict_ty; std::shared_ptr ty = struct_({field("min", value_ty), field("max", value_ty)}); - std::shared_ptr dict_ty; for (const auto& index_type : all_dictionary_index_types()) { ARROW_SCOPED_TRACE("index_type = ", index_type->ToString()); - // null + dict_ty = dictionary(index_type, value_ty); Datum result = ScalarFromJSON(ty, "[null, null]"); EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([null, null])", R"([])"), options), From 5599daac95c154f98b40b8b9b951645773b7f9f4 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Sun, 15 Oct 2023 23:26:51 +0800 Subject: [PATCH 34/60] add an interger test --- .../arrow/compute/kernels/aggregate_test.cc | 76 +++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index 8bf52efa8c0..3e519e2ed70 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -2159,6 +2159,82 @@ TEST(TestDictionaryMinMaxKernel, BooleanValue) { } } +TEST(TestDictionaryMinMaxKernel, IntegersValue) { + ScalarAggregateOptions options; + std::shared_ptr dict_ty; + std::shared_ptr ty; + + for (const auto& index_type : all_dictionary_index_types()) { + ARROW_SCOPED_TRACE("index_type = ", index_type->ToString()); + + for (const auto& value_ty_ : + {int8(), uint8(), int16(), uint16(), int32(), uint32(), int64(), uint64()}) { + dict_ty = dictionary(index_type, value_ty_); + ty = struct_({field("min", value_ty_), field("max", value_ty_)}); + + options = ScalarAggregateOptions(/*skip_nulls=*/true); + EXPECT_THAT( + MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", R"([5, 1, 2, 3, 4])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": 1, "max": 5})"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", + R"([5, 9, 2, 3, 4])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": 2, "max": 9})"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, null, 2, 3, 4])", + R"([5, 1, 2, 3, 4])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": 2, "max": 5})"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, null, 3, 4])", + R"([5, 9, 2, 3, 4])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": 3, "max": 9})"))); + + options = ScalarAggregateOptions(/*skip_nulls=*/false); + EXPECT_THAT( + MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", R"([5, 1, 2, 3, 4])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": 1, "max": 5})"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", + R"([5, 1, 2, 3, 4])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": 1, "max": 5})"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, null, 2, 3, 4])", + R"([5, 1, 2, 3, 4])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, null, 3, 4])", + R"([5, 100, 2, 3, 4])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); + + options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/0); + EXPECT_THAT( + MinMax(DictArrayFromJSON(dict_ty, R"([null, null, null])", R"([])"), options), + ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0])", R"([1])"), options), + ResultWith(ScalarFromJSON(ty, R"({"min": 1, "max": 1})"))); + + options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/5); + EXPECT_THAT( + MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 1, 0])", R"([5, 1])"), options), + ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); + EXPECT_THAT( + MinMax(DictArrayFromJSON(dict_ty, R"([null, 3, 1, 1, 4, 0, 2, null, null])", + R"([5, 1, 2, 3, 4])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": 1, "max": 5})"))); + + options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/1); + EXPECT_THAT( + MinMax(DictArrayFromJSON(dict_ty, R"([null, null, null])", R"([])"), options), + ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0])", R"([2])"), options), + ResultWith(ScalarFromJSON(ty, R"({"min": 2, "max": 2})"))); + } + } +} + TEST(TestDictionaryMinMaxKernel, FloatsValue) { ScalarAggregateOptions options; std::shared_ptr dict_ty; From 38bb4d44a7a02f01e88b777bb1de012f563559b3 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Sun, 15 Oct 2023 23:31:35 +0800 Subject: [PATCH 35/60] lint --- .../arrow/compute/kernels/aggregate_test.cc | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index 3e519e2ed70..1b3af3ffb64 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -2177,10 +2177,10 @@ TEST(TestDictionaryMinMaxKernel, IntegersValue) { MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", R"([5, 1, 2, 3, 4])"), options), ResultWith(ScalarFromJSON(ty, R"({"min": 1, "max": 5})"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", - R"([5, 9, 2, 3, 4])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": 2, "max": 9})"))); + EXPECT_THAT( + MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", R"([5, 9, 2, 3, 4])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": 2, "max": 9})"))); EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, null, 2, 3, 4])", R"([5, 1, 2, 3, 4])"), options), @@ -2195,10 +2195,10 @@ TEST(TestDictionaryMinMaxKernel, IntegersValue) { MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", R"([5, 1, 2, 3, 4])"), options), ResultWith(ScalarFromJSON(ty, R"({"min": 1, "max": 5})"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", - R"([5, 1, 2, 3, 4])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": 1, "max": 5})"))); + EXPECT_THAT( + MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", R"([5, 1, 2, 3, 4])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": 1, "max": 5})"))); EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, null, 2, 3, 4])", R"([5, 1, 2, 3, 4])"), options), @@ -2292,8 +2292,7 @@ TEST(TestDictionaryMinMaxKernel, FloatsValue) { options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/5); EXPECT_THAT( - MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 1, 0])", R"([5, 1])"), - options), + MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 1, 0])", R"([5, 1])"), options), ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); EXPECT_THAT( MinMax(DictArrayFromJSON(dict_ty, R"([null, 3, 1, 1, 4, 0, 2, null, null])", From 208e8bdd43b8e44bb3d7043d4c21dec060bb3f07 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Mon, 16 Oct 2023 21:42:06 +0800 Subject: [PATCH 36/60] add chunk test --- .../arrow/compute/kernels/aggregate_test.cc | 298 ++++++++++-------- 1 file changed, 170 insertions(+), 128 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index 1b3af3ffb64..5296928ce05 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -2047,6 +2047,134 @@ TEST(TestDecimalMinMaxKernel, Decimals) { } } +TEST(TestNullMinMaxKernel, Basics) { + auto item_ty = null(); + auto ty = struct_({field("min", item_ty), field("max", item_ty)}); + Datum result = ScalarFromJSON(ty, "[null, null]"); + EXPECT_THAT(MinMax(ScalarFromJSON(item_ty, "null")), ResultWith(result)); + EXPECT_THAT(MinMax(ArrayFromJSON(item_ty, "[]")), ResultWith(result)); + EXPECT_THAT(MinMax(ArrayFromJSON(item_ty, "[null]")), ResultWith(result)); + EXPECT_THAT(MinMax(ChunkedArrayFromJSON(item_ty, {"[null]", "[]", "[null, null]"})), + ResultWith(result)); +} + +template +class TestBaseBinaryMinMaxKernel : public ::testing::Test {}; +TYPED_TEST_SUITE(TestBaseBinaryMinMaxKernel, BaseBinaryArrowTypes); +TYPED_TEST(TestBaseBinaryMinMaxKernel, Basics) { + std::vector chunked_input1 = {R"(["cc", "", "aa", "b", "c"])", + R"(["d", "", null, "b", "c"])"}; + std::vector chunked_input2 = {R"(["cc", null, "aa", "b", "c"])", + R"(["d", "", "aa", "b", "c"])"}; + std::vector chunked_input3 = {R"(["cc", "", "aa", "b", null])", + R"(["d", "", null, "b", "c"])"}; + auto ty = std::make_shared(); + auto res_ty = struct_({field("min", ty), field("max", ty)}); + Datum null = ScalarFromJSON(res_ty, R"([null, null])"); + + // SKIP nulls by default + EXPECT_THAT(MinMax(ArrayFromJSON(ty, R"([])")), ResultWith(null)); + EXPECT_THAT(MinMax(ArrayFromJSON(ty, R"([null, null, null])")), ResultWith(null)); + EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input1[0])), + ResultWith(ScalarFromJSON(res_ty, R"(["", "cc"])"))); + EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input2[0])), + ResultWith(ScalarFromJSON(res_ty, R"(["aa", "cc"])"))); + EXPECT_THAT(MinMax(ChunkedArrayFromJSON(ty, chunked_input1)), + ResultWith(ScalarFromJSON(res_ty, R"(["", "d"])"))); + EXPECT_THAT(MinMax(ChunkedArrayFromJSON(ty, chunked_input2)), + ResultWith(ScalarFromJSON(res_ty, R"(["", "d"])"))); + EXPECT_THAT(MinMax(ChunkedArrayFromJSON(ty, chunked_input3)), + ResultWith(ScalarFromJSON(res_ty, R"(["", "d"])"))); + + EXPECT_THAT(MinMax(MakeNullScalar(ty)), ResultWith(null)); + EXPECT_THAT(MinMax(ScalarFromJSON(ty, R"("one")")), + ResultWith(ScalarFromJSON(res_ty, R"(["one", "one"])"))); + + ScalarAggregateOptions options(/*skip_nulls=*/false); + EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input1[0]), options), + ResultWith(ScalarFromJSON(res_ty, R"(["", "cc"])"))); + EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input2[0]), options), ResultWith(null)); + EXPECT_THAT(MinMax(ChunkedArrayFromJSON(ty, chunked_input1), options), + ResultWith(null)); + EXPECT_THAT(MinMax(MakeNullScalar(ty), options), ResultWith(null)); + EXPECT_THAT(MinMax(ScalarFromJSON(ty, R"("one")"), options), + ResultWith(ScalarFromJSON(res_ty, R"(["one", "one"])"))); + + options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/9); + EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input1[0]), options), ResultWith(null)); + EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input2[0]), options), ResultWith(null)); + EXPECT_THAT(MinMax(ChunkedArrayFromJSON(ty, chunked_input1), options), + ResultWith(ScalarFromJSON(res_ty, R"(["", "d"])"))); + EXPECT_THAT(MinMax(MakeNullScalar(ty), options), ResultWith(null)); + EXPECT_THAT(MinMax(ScalarFromJSON(ty, R"("one")"), options), ResultWith(null)); + + options = ScalarAggregateOptions(/*skip_nulls=*/false, /*min_count=*/4); + EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input1[0]), options), + ResultWith(ScalarFromJSON(res_ty, R"(["", "cc"])"))); + EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input2[0]), options), ResultWith(null)); + EXPECT_THAT(MinMax(ChunkedArrayFromJSON(ty, chunked_input1), options), + ResultWith(null)); + EXPECT_THAT(MinMax(MakeNullScalar(ty), options), ResultWith(null)); + EXPECT_THAT(MinMax(ScalarFromJSON(ty, R"("one")"), options), ResultWith(null)); +} + +TEST(TestFixedSizeBinaryMinMaxKernel, Basics) { + auto ty = fixed_size_binary(2); + std::vector chunked_input1 = {R"(["cd", "aa", "ab", "bb", "cc"])", + R"(["da", "aa", null, "bb", "cc"])"}; + std::vector chunked_input2 = {R"(["cd", null, "ab", "bb", "cc"])", + R"(["da", "aa", "ab", "bb", "cc"])"}; + std::vector chunked_input3 = {R"(["cd", "aa", "ab", "bb", null])", + R"(["da", "aa", null, "bb", "cc"])"}; + auto res_ty = struct_({field("min", ty), field("max", ty)}); + Datum null = ScalarFromJSON(res_ty, R"([null, null])"); + + // SKIP nulls by default + EXPECT_THAT(MinMax(ArrayFromJSON(ty, R"([])")), ResultWith(null)); + EXPECT_THAT(MinMax(ArrayFromJSON(ty, R"([null, null, null])")), ResultWith(null)); + EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input1[0])), + ResultWith(ScalarFromJSON(res_ty, R"(["aa", "cd"])"))); + EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input2[0])), + ResultWith(ScalarFromJSON(res_ty, R"(["ab", "cd"])"))); + EXPECT_THAT(MinMax(ChunkedArrayFromJSON(ty, chunked_input1)), + ResultWith(ScalarFromJSON(res_ty, R"(["aa", "da"])"))); + EXPECT_THAT(MinMax(ChunkedArrayFromJSON(ty, chunked_input2)), + ResultWith(ScalarFromJSON(res_ty, R"(["aa", "da"])"))); + EXPECT_THAT(MinMax(ChunkedArrayFromJSON(ty, chunked_input3)), + ResultWith(ScalarFromJSON(res_ty, R"(["aa", "da"])"))); + + EXPECT_THAT(MinMax(MakeNullScalar(ty)), ResultWith(null)); + EXPECT_THAT(MinMax(ScalarFromJSON(ty, R"("aa")")), + ResultWith(ScalarFromJSON(res_ty, R"(["aa", "aa"])"))); + + ScalarAggregateOptions options(/*skip_nulls=*/false); + EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input1[0]), options), + ResultWith(ScalarFromJSON(res_ty, R"(["aa", "cd"])"))); + EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input2[0]), options), ResultWith(null)); + EXPECT_THAT(MinMax(ChunkedArrayFromJSON(ty, chunked_input1), options), + ResultWith(null)); + EXPECT_THAT(MinMax(MakeNullScalar(ty), options), ResultWith(null)); + EXPECT_THAT(MinMax(ScalarFromJSON(ty, R"("aa")"), options), + ResultWith(ScalarFromJSON(res_ty, R"(["aa", "aa"])"))); + + options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/9); + EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input1[0]), options), ResultWith(null)); + EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input2[0]), options), ResultWith(null)); + EXPECT_THAT(MinMax(ChunkedArrayFromJSON(ty, chunked_input1), options), + ResultWith(ScalarFromJSON(res_ty, R"(["aa", "da"])"))); + EXPECT_THAT(MinMax(MakeNullScalar(ty), options), ResultWith(null)); + EXPECT_THAT(MinMax(ScalarFromJSON(ty, R"("aa")"), options), ResultWith(null)); + + options = ScalarAggregateOptions(/*skip_nulls=*/false, /*min_count=*/4); + EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input1[0]), options), + ResultWith(ScalarFromJSON(res_ty, R"(["aa", "cd"])"))); + EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input2[0]), options), ResultWith(null)); + EXPECT_THAT(MinMax(ChunkedArrayFromJSON(ty, chunked_input1), options), + ResultWith(null)); + EXPECT_THAT(MinMax(MakeNullScalar(ty), options), ResultWith(null)); + EXPECT_THAT(MinMax(ScalarFromJSON(ty, R"("aa")"), options), ResultWith(null)); +} + TEST(TestDictionaryMinMaxKernel, DecimalsValue) { ScalarAggregateOptions options; std::shared_ptr dict_ty; @@ -2058,7 +2186,13 @@ TEST(TestDictionaryMinMaxKernel, DecimalsValue) { dict_ty = dictionary(index_type, value_ty_); ty = struct_({field("min", value_ty_), field("max", value_ty_)}); + auto chunk1 = DictArrayFromJSON(dict_ty, R"([null, 0])", R"(["5.10"])"); + auto chunk2 = DictArrayFromJSON(dict_ty, R"([0, 1, 1])", R"(["3.10", "-1.23"])"); + ASSERT_OK_AND_ASSIGN(auto chunked, ChunkedArray::Make({chunk1, chunk2})); + options = ScalarAggregateOptions(/*skip_nulls=*/true); + EXPECT_THAT(MinMax(chunked, options), + ResultWith(ScalarFromJSON(ty, R"({"min": "-1.23", "max": "5.10"})"))); EXPECT_THAT( MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 1, 0])", R"(["5.10", "-1.23"])"), options), @@ -2081,6 +2215,8 @@ TEST(TestDictionaryMinMaxKernel, DecimalsValue) { ResultWith(ScalarFromJSON(ty, R"({"min": "1.00", "max": "1.00"})"))); options = ScalarAggregateOptions(/*skip_nulls=*/false); + EXPECT_THAT(MinMax(chunked, options), + ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); EXPECT_THAT( MinMax(DictArrayFromJSON(dict_ty, R"([null, 3, 1, 1, 4, 0, 2, null])", R"(["5.10", "-1.23", "2.00", "3.45", "4.56"])"), @@ -2093,6 +2229,8 @@ TEST(TestDictionaryMinMaxKernel, DecimalsValue) { ResultWith(ScalarFromJSON(ty, R"({"min": "-1.23", "max": "5.10"})"))); options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/0); + EXPECT_THAT(MinMax(chunked, options), + ResultWith(ScalarFromJSON(ty, R"({"min": "-1.23", "max": "5.10"})"))); EXPECT_THAT( MinMax(DictArrayFromJSON(dict_ty, R"([null, null, null])", R"([])"), options), ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); @@ -2100,6 +2238,8 @@ TEST(TestDictionaryMinMaxKernel, DecimalsValue) { ResultWith(ScalarFromJSON(ty, R"({"min": "1.00", "max": "1.00"})"))); options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/5); + EXPECT_THAT(MinMax(chunked, options), + ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); EXPECT_THAT( MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 1, 0])", R"(["5.10", "-1.23"])"), options), @@ -2111,6 +2251,8 @@ TEST(TestDictionaryMinMaxKernel, DecimalsValue) { ResultWith(ScalarFromJSON(ty, R"({"min": "-1.23", "max": "5.10"})"))); options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/1); + EXPECT_THAT(MinMax(chunked, options), + ResultWith(ScalarFromJSON(ty, R"({"min": "-1.23", "max": "5.10"})"))); EXPECT_THAT( MinMax(DictArrayFromJSON(dict_ty, R"([null, null, null])", R"([])"), options), ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); @@ -2172,7 +2314,13 @@ TEST(TestDictionaryMinMaxKernel, IntegersValue) { dict_ty = dictionary(index_type, value_ty_); ty = struct_({field("min", value_ty_), field("max", value_ty_)}); + auto chunk1 = DictArrayFromJSON(dict_ty, R"([null, 0])", R"([5])"); + auto chunk2 = DictArrayFromJSON(dict_ty, R"([0, 1, 1])", R"([3, 1])"); + ASSERT_OK_AND_ASSIGN(auto chunked, ChunkedArray::Make({chunk1, chunk2})); + options = ScalarAggregateOptions(/*skip_nulls=*/true); + EXPECT_THAT(MinMax(chunked, options), + ResultWith(ScalarFromJSON(ty, R"({"min": 1, "max": 5})"))); EXPECT_THAT( MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", R"([5, 1, 2, 3, 4])"), options), @@ -2191,6 +2339,8 @@ TEST(TestDictionaryMinMaxKernel, IntegersValue) { ResultWith(ScalarFromJSON(ty, R"({"min": 3, "max": 9})"))); options = ScalarAggregateOptions(/*skip_nulls=*/false); + EXPECT_THAT(MinMax(chunked, options), + ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); EXPECT_THAT( MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", R"([5, 1, 2, 3, 4])"), options), @@ -2209,6 +2359,8 @@ TEST(TestDictionaryMinMaxKernel, IntegersValue) { ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/0); + EXPECT_THAT(MinMax(chunked, options), + ResultWith(ScalarFromJSON(ty, R"({"min": 1, "max": 5})"))); EXPECT_THAT( MinMax(DictArrayFromJSON(dict_ty, R"([null, null, null])", R"([])"), options), ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); @@ -2216,6 +2368,8 @@ TEST(TestDictionaryMinMaxKernel, IntegersValue) { ResultWith(ScalarFromJSON(ty, R"({"min": 1, "max": 1})"))); options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/5); + EXPECT_THAT(MinMax(chunked, options), + ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); EXPECT_THAT( MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 1, 0])", R"([5, 1])"), options), ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); @@ -2226,6 +2380,8 @@ TEST(TestDictionaryMinMaxKernel, IntegersValue) { ResultWith(ScalarFromJSON(ty, R"({"min": 1, "max": 5})"))); options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/1); + EXPECT_THAT(MinMax(chunked, options), + ResultWith(ScalarFromJSON(ty, R"({"min": 1, "max": 5})"))); EXPECT_THAT( MinMax(DictArrayFromJSON(dict_ty, R"([null, null, null])", R"([])"), options), ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); @@ -2247,7 +2403,13 @@ TEST(TestDictionaryMinMaxKernel, FloatsValue) { dict_ty = dictionary(index_type, value_ty_); ty = struct_({field("min", value_ty_), field("max", value_ty_)}); + auto chunk1 = DictArrayFromJSON(dict_ty, R"([null, 0])", R"([5])"); + auto chunk2 = DictArrayFromJSON(dict_ty, R"([0, 1, 1])", R"([-Inf, 1])"); + ASSERT_OK_AND_ASSIGN(auto chunked, ChunkedArray::Make({chunk1, chunk2})); + options = ScalarAggregateOptions(/*skip_nulls=*/true); + EXPECT_THAT(MinMax(chunked, options), + ResultWith(ScalarFromJSON(ty, R"({"min": -Inf, "max": 5})"))); EXPECT_THAT( MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", R"([5, 1, 2, 3, 4])"), options), @@ -2266,6 +2428,8 @@ TEST(TestDictionaryMinMaxKernel, FloatsValue) { ResultWith(ScalarFromJSON(ty, R"({"min": -Inf, "max": 5})"))); options = ScalarAggregateOptions(/*skip_nulls=*/false); + EXPECT_THAT(MinMax(chunked, options), + ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); EXPECT_THAT( MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", R"([5, 1, 2, 3, 4])"), options), @@ -2284,6 +2448,8 @@ TEST(TestDictionaryMinMaxKernel, FloatsValue) { ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/0); + EXPECT_THAT(MinMax(chunked, options), + ResultWith(ScalarFromJSON(ty, R"({"min": -Inf, "max": 5})"))); EXPECT_THAT( MinMax(DictArrayFromJSON(dict_ty, R"([null, null, null])", R"([])"), options), ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); @@ -2291,6 +2457,8 @@ TEST(TestDictionaryMinMaxKernel, FloatsValue) { ResultWith(ScalarFromJSON(ty, R"({"min": 1, "max": 1})"))); options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/5); + EXPECT_THAT(MinMax(chunked, options), + ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); EXPECT_THAT( MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 1, 0])", R"([5, 1])"), options), ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); @@ -2301,6 +2469,8 @@ TEST(TestDictionaryMinMaxKernel, FloatsValue) { ResultWith(ScalarFromJSON(ty, R"({"min": 1, "max": 5})"))); options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/1); + EXPECT_THAT(MinMax(chunked, options), + ResultWith(ScalarFromJSON(ty, R"({"min": -Inf, "max": 5})"))); EXPECT_THAT( MinMax(DictArrayFromJSON(dict_ty, R"([null, null, null])", R"([])"), options), ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); @@ -2329,134 +2499,6 @@ TEST(TestDictionaryMinMaxKernel, NullValue) { } } -TEST(TestNullMinMaxKernel, Basics) { - auto item_ty = null(); - auto ty = struct_({field("min", item_ty), field("max", item_ty)}); - Datum result = ScalarFromJSON(ty, "[null, null]"); - EXPECT_THAT(MinMax(ScalarFromJSON(item_ty, "null")), ResultWith(result)); - EXPECT_THAT(MinMax(ArrayFromJSON(item_ty, "[]")), ResultWith(result)); - EXPECT_THAT(MinMax(ArrayFromJSON(item_ty, "[null]")), ResultWith(result)); - EXPECT_THAT(MinMax(ChunkedArrayFromJSON(item_ty, {"[null]", "[]", "[null, null]"})), - ResultWith(result)); -} - -template -class TestBaseBinaryMinMaxKernel : public ::testing::Test {}; -TYPED_TEST_SUITE(TestBaseBinaryMinMaxKernel, BaseBinaryArrowTypes); -TYPED_TEST(TestBaseBinaryMinMaxKernel, Basics) { - std::vector chunked_input1 = {R"(["cc", "", "aa", "b", "c"])", - R"(["d", "", null, "b", "c"])"}; - std::vector chunked_input2 = {R"(["cc", null, "aa", "b", "c"])", - R"(["d", "", "aa", "b", "c"])"}; - std::vector chunked_input3 = {R"(["cc", "", "aa", "b", null])", - R"(["d", "", null, "b", "c"])"}; - auto ty = std::make_shared(); - auto res_ty = struct_({field("min", ty), field("max", ty)}); - Datum null = ScalarFromJSON(res_ty, R"([null, null])"); - - // SKIP nulls by default - EXPECT_THAT(MinMax(ArrayFromJSON(ty, R"([])")), ResultWith(null)); - EXPECT_THAT(MinMax(ArrayFromJSON(ty, R"([null, null, null])")), ResultWith(null)); - EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input1[0])), - ResultWith(ScalarFromJSON(res_ty, R"(["", "cc"])"))); - EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input2[0])), - ResultWith(ScalarFromJSON(res_ty, R"(["aa", "cc"])"))); - EXPECT_THAT(MinMax(ChunkedArrayFromJSON(ty, chunked_input1)), - ResultWith(ScalarFromJSON(res_ty, R"(["", "d"])"))); - EXPECT_THAT(MinMax(ChunkedArrayFromJSON(ty, chunked_input2)), - ResultWith(ScalarFromJSON(res_ty, R"(["", "d"])"))); - EXPECT_THAT(MinMax(ChunkedArrayFromJSON(ty, chunked_input3)), - ResultWith(ScalarFromJSON(res_ty, R"(["", "d"])"))); - - EXPECT_THAT(MinMax(MakeNullScalar(ty)), ResultWith(null)); - EXPECT_THAT(MinMax(ScalarFromJSON(ty, R"("one")")), - ResultWith(ScalarFromJSON(res_ty, R"(["one", "one"])"))); - - ScalarAggregateOptions options(/*skip_nulls=*/false); - EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input1[0]), options), - ResultWith(ScalarFromJSON(res_ty, R"(["", "cc"])"))); - EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input2[0]), options), ResultWith(null)); - EXPECT_THAT(MinMax(ChunkedArrayFromJSON(ty, chunked_input1), options), - ResultWith(null)); - EXPECT_THAT(MinMax(MakeNullScalar(ty), options), ResultWith(null)); - EXPECT_THAT(MinMax(ScalarFromJSON(ty, R"("one")"), options), - ResultWith(ScalarFromJSON(res_ty, R"(["one", "one"])"))); - - options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/9); - EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input1[0]), options), ResultWith(null)); - EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input2[0]), options), ResultWith(null)); - EXPECT_THAT(MinMax(ChunkedArrayFromJSON(ty, chunked_input1), options), - ResultWith(ScalarFromJSON(res_ty, R"(["", "d"])"))); - EXPECT_THAT(MinMax(MakeNullScalar(ty), options), ResultWith(null)); - EXPECT_THAT(MinMax(ScalarFromJSON(ty, R"("one")"), options), ResultWith(null)); - - options = ScalarAggregateOptions(/*skip_nulls=*/false, /*min_count=*/4); - EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input1[0]), options), - ResultWith(ScalarFromJSON(res_ty, R"(["", "cc"])"))); - EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input2[0]), options), ResultWith(null)); - EXPECT_THAT(MinMax(ChunkedArrayFromJSON(ty, chunked_input1), options), - ResultWith(null)); - EXPECT_THAT(MinMax(MakeNullScalar(ty), options), ResultWith(null)); - EXPECT_THAT(MinMax(ScalarFromJSON(ty, R"("one")"), options), ResultWith(null)); -} - -TEST(TestFixedSizeBinaryMinMaxKernel, Basics) { - auto ty = fixed_size_binary(2); - std::vector chunked_input1 = {R"(["cd", "aa", "ab", "bb", "cc"])", - R"(["da", "aa", null, "bb", "cc"])"}; - std::vector chunked_input2 = {R"(["cd", null, "ab", "bb", "cc"])", - R"(["da", "aa", "ab", "bb", "cc"])"}; - std::vector chunked_input3 = {R"(["cd", "aa", "ab", "bb", null])", - R"(["da", "aa", null, "bb", "cc"])"}; - auto res_ty = struct_({field("min", ty), field("max", ty)}); - Datum null = ScalarFromJSON(res_ty, R"([null, null])"); - - // SKIP nulls by default - EXPECT_THAT(MinMax(ArrayFromJSON(ty, R"([])")), ResultWith(null)); - EXPECT_THAT(MinMax(ArrayFromJSON(ty, R"([null, null, null])")), ResultWith(null)); - EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input1[0])), - ResultWith(ScalarFromJSON(res_ty, R"(["aa", "cd"])"))); - EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input2[0])), - ResultWith(ScalarFromJSON(res_ty, R"(["ab", "cd"])"))); - EXPECT_THAT(MinMax(ChunkedArrayFromJSON(ty, chunked_input1)), - ResultWith(ScalarFromJSON(res_ty, R"(["aa", "da"])"))); - EXPECT_THAT(MinMax(ChunkedArrayFromJSON(ty, chunked_input2)), - ResultWith(ScalarFromJSON(res_ty, R"(["aa", "da"])"))); - EXPECT_THAT(MinMax(ChunkedArrayFromJSON(ty, chunked_input3)), - ResultWith(ScalarFromJSON(res_ty, R"(["aa", "da"])"))); - - EXPECT_THAT(MinMax(MakeNullScalar(ty)), ResultWith(null)); - EXPECT_THAT(MinMax(ScalarFromJSON(ty, R"("aa")")), - ResultWith(ScalarFromJSON(res_ty, R"(["aa", "aa"])"))); - - ScalarAggregateOptions options(/*skip_nulls=*/false); - EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input1[0]), options), - ResultWith(ScalarFromJSON(res_ty, R"(["aa", "cd"])"))); - EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input2[0]), options), ResultWith(null)); - EXPECT_THAT(MinMax(ChunkedArrayFromJSON(ty, chunked_input1), options), - ResultWith(null)); - EXPECT_THAT(MinMax(MakeNullScalar(ty), options), ResultWith(null)); - EXPECT_THAT(MinMax(ScalarFromJSON(ty, R"("aa")"), options), - ResultWith(ScalarFromJSON(res_ty, R"(["aa", "aa"])"))); - - options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/9); - EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input1[0]), options), ResultWith(null)); - EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input2[0]), options), ResultWith(null)); - EXPECT_THAT(MinMax(ChunkedArrayFromJSON(ty, chunked_input1), options), - ResultWith(ScalarFromJSON(res_ty, R"(["aa", "da"])"))); - EXPECT_THAT(MinMax(MakeNullScalar(ty), options), ResultWith(null)); - EXPECT_THAT(MinMax(ScalarFromJSON(ty, R"("aa")"), options), ResultWith(null)); - - options = ScalarAggregateOptions(/*skip_nulls=*/false, /*min_count=*/4); - EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input1[0]), options), - ResultWith(ScalarFromJSON(res_ty, R"(["aa", "cd"])"))); - EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input2[0]), options), ResultWith(null)); - EXPECT_THAT(MinMax(ChunkedArrayFromJSON(ty, chunked_input1), options), - ResultWith(null)); - EXPECT_THAT(MinMax(MakeNullScalar(ty), options), ResultWith(null)); - EXPECT_THAT(MinMax(ScalarFromJSON(ty, R"("aa")"), options), ResultWith(null)); -} - template struct MinMaxResult { using T = typename ArrowType::c_type; From 656d4e2eb870d007b479379551192972b33be914 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Mon, 16 Oct 2023 21:59:30 +0800 Subject: [PATCH 37/60] Update cpp/src/arrow/compute/kernels/aggregate_basic.cc Co-authored-by: mwish <1506118561@qq.com> --- cpp/src/arrow/compute/kernels/aggregate_basic.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 31343a9c8f5..32bd0c3b9f4 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -523,7 +523,7 @@ void AddMinOrMaxAggKernels(ScalarAggregateFunction* func, AddAggKernel(sig, init, finalize, func); sig = KernelSignature::Make({InputType(Type::DICTIONARY)}, DictionaryValueType); - AddAggKernel(sig, init, finalize, func); +AddAggKernel(std::move(sig), std::move(init), std::move(finalize), func); } // ---------------------------------------------------------------------- From dbf3cbe57141107864ed7c8993bd6fff416e805b Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Mon, 16 Oct 2023 22:00:00 +0800 Subject: [PATCH 38/60] Update cpp/src/arrow/compute/kernels/aggregate_basic.cc Co-authored-by: mwish <1506118561@qq.com> --- cpp/src/arrow/compute/kernels/aggregate_basic.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 32bd0c3b9f4..d1cb13ae8ff 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -520,7 +520,7 @@ void AddMinOrMaxAggKernels(ScalarAggregateFunction* func, // Note SIMD level is always NONE, but the convenience kernel will // dispatch to an appropriate implementation - AddAggKernel(sig, init, finalize, func); + AddAggKernel(std::move(sig), init, finalize, func); sig = KernelSignature::Make({InputType(Type::DICTIONARY)}, DictionaryValueType); AddAggKernel(std::move(sig), std::move(init), std::move(finalize), func); From e07f261accad4b65213d8872420ac1859bdad72e Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Mon, 16 Oct 2023 22:30:56 +0800 Subject: [PATCH 39/60] optimization --- .../kernels/aggregate_basic_internal.h | 28 +++++++++++-------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index fddc142669b..7e491ba781a 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -935,22 +935,28 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { ARROW_ASSIGN_OR_RAISE(auto compacted_arr, dict_arr.Compact(ctx->memory_pool())); const DictionaryArray& compacted_dict_arr = checked_cast(*compacted_arr); - const std::shared_ptr& dict = compacted_dict_arr.dictionary(); - if (dict->length() == 0) { + if (compacted_dict_arr.length() - compacted_dict_arr.null_count() == 0) { return Status::OK(); } this->has_nulls |= compacted_dict_arr.null_count() > 0; this->count += compacted_dict_arr.length() - compacted_dict_arr.null_count(); - Datum dict_values(dict); - ARROW_ASSIGN_OR_RAISE( - Datum result, MinMax(std::move(dict_values), ScalarAggregateOptions::Defaults(), - ctx->exec_context())); - const StructScalar& struct_result = - checked_cast(*result.scalar()); - ARROW_ASSIGN_OR_RAISE(auto dict_min, struct_result.field(FieldRef("min"))); - ARROW_ASSIGN_OR_RAISE(auto dict_max, struct_result.field(FieldRef("max"))); - ARROW_RETURN_NOT_OK(UpdateMinMaxState(std::move(dict_min), std::move(dict_max), ctx)); + std::shared_ptr dict_min; + std::shared_ptr dict_max; + if (compacted_dict_arr.length() - compacted_dict_arr.null_count() == 1) { + ARROW_ASSIGN_OR_RAISE(dict_min, compacted_dict_arr.dictionary()->GetScalar(0)); + dict_max = dict_min; + } else { + Datum dict_values(compacted_dict_arr.dictionary()); + ARROW_ASSIGN_OR_RAISE( + Datum result, MinMax(std::move(dict_values), ScalarAggregateOptions::Defaults(), + ctx->exec_context())); + const StructScalar& struct_result = + checked_cast(*result.scalar()); + ARROW_ASSIGN_OR_RAISE(dict_min, struct_result.field(FieldRef("min"))); + ARROW_ASSIGN_OR_RAISE(dict_max, struct_result.field(FieldRef("max"))); + } + ARROW_RETURN_NOT_OK(UpdateMinMaxState(dict_min, dict_max, ctx)); return Status::OK(); } From b8f10db23cca7641ba7795475c76c21a66c82516 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Mon, 16 Oct 2023 22:31:34 +0800 Subject: [PATCH 40/60] lint --- cpp/src/arrow/compute/kernels/aggregate_basic.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index d1cb13ae8ff..aabce72a049 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -523,7 +523,7 @@ void AddMinOrMaxAggKernels(ScalarAggregateFunction* func, AddAggKernel(std::move(sig), init, finalize, func); sig = KernelSignature::Make({InputType(Type::DICTIONARY)}, DictionaryValueType); -AddAggKernel(std::move(sig), std::move(init), std::move(finalize), func); + AddAggKernel(std::move(sig), std::move(init), std::move(finalize), func); } // ---------------------------------------------------------------------- From 0a875068cb236b0df0b9d9dc746a9a4824cd1eb2 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Mon, 16 Oct 2023 23:03:27 +0800 Subject: [PATCH 41/60] add binary test --- .../arrow/compute/kernels/aggregate_test.cc | 89 +++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index 5296928ce05..f0987792ece 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -2480,6 +2480,95 @@ TEST(TestDictionaryMinMaxKernel, FloatsValue) { } } +TEST(TestDictionaryMinMaxKernel, BinarysValue) { + ScalarAggregateOptions options; + std::shared_ptr dict_ty; + std::shared_ptr ty; + + for (const auto& index_type : all_dictionary_index_types()) { + ARROW_SCOPED_TRACE("index_type = ", index_type->ToString()); + + for (const auto& value_ty_ : {fixed_size_binary(2), binary(), large_binary()}) { + dict_ty = dictionary(index_type, value_ty_); + ty = struct_({field("min", value_ty_), field("max", value_ty_)}); + + auto chunk1 = DictArrayFromJSON(dict_ty, R"([null, 0])", R"(["hz"])"); + auto chunk2 = DictArrayFromJSON(dict_ty, R"([0, 1, 1])", R"(["aa", "bb"])"); + ASSERT_OK_AND_ASSIGN(auto chunked, ChunkedArray::Make({chunk1, chunk2})); + + options = ScalarAggregateOptions(/*skip_nulls=*/true); + EXPECT_THAT(MinMax(chunked, options), + ResultWith(ScalarFromJSON(ty, R"({"min": "aa", "max": "hz"})"))); + EXPECT_THAT( + MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", R"(["hz", "bb", "bf", "cc", "fa"])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": "bb", "max": "hz"})"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", + R"(["hz", "aa", "bf", "cc", "fa"])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": "aa", "max": "hz"})"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, null, 2, 3, 4])", + R"(["hz", "bb", "bf", "cc", "fa"])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": "bf", "max": "hz"})"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, null, 3, 4])", + R"(["hz", "aa", "bf", "cc", "fa"])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": "aa", "max": "hz"})"))); + + options = ScalarAggregateOptions(/*skip_nulls=*/false); + EXPECT_THAT(MinMax(chunked, options), + ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); + EXPECT_THAT( + MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", R"(["hz", "bb", "bf", "cc", "fa"])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": "bb", "max": "hz"})"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", + R"(["hz", "aa", "bf", "cc", "fa"])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": "aa", "max": "hz"})"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, null, 2, 3, 4])", + R"(["hz", "bb", "bf", "cc", "fa"])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, null, 3, 4])", + R"(["hz", "aa", "bf", "cc", "fa"])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); + + options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/0); + EXPECT_THAT(MinMax(chunked, options), + ResultWith(ScalarFromJSON(ty, R"({"min": "aa", "max": "hz"})"))); + EXPECT_THAT( + MinMax(DictArrayFromJSON(dict_ty, R"([null, null, null])", R"([])"), options), + ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0])", R"(["bb"])"), options), + ResultWith(ScalarFromJSON(ty, R"({"min": "bb", "max": "bb"})"))); + + options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/5); + EXPECT_THAT(MinMax(chunked, options), + ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); + EXPECT_THAT( + MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 1, 0])", R"(["hz", "bb"])"), options), + ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); + EXPECT_THAT( + MinMax(DictArrayFromJSON(dict_ty, R"([null, 3, 1, 1, 4, 0, 2, null, null])", + R"(["hz", "bb", "bf", "cc", "fa"])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": "bb", "max": "hz"})"))); + + options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/1); + EXPECT_THAT(MinMax(chunked, options), + ResultWith(ScalarFromJSON(ty, R"({"min": "aa", "max": "hz"})"))); + EXPECT_THAT( + MinMax(DictArrayFromJSON(dict_ty, R"([null, null, null])", R"([])"), options), + ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0])", R"(["bb"])"), options), + ResultWith(ScalarFromJSON(ty, R"({"min": "bb", "max": "bb"})"))); + } + } +} + TEST(TestDictionaryMinMaxKernel, NullValue) { ScalarAggregateOptions options; std::shared_ptr value_ty = null(); From 6979e48a81e486aa12a60f18915e60cd1dc30fce Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Mon, 16 Oct 2023 23:04:50 +0800 Subject: [PATCH 42/60] lint --- .../arrow/compute/kernels/aggregate_test.cc | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index f0987792ece..e455c190d5a 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -2499,10 +2499,10 @@ TEST(TestDictionaryMinMaxKernel, BinarysValue) { options = ScalarAggregateOptions(/*skip_nulls=*/true); EXPECT_THAT(MinMax(chunked, options), ResultWith(ScalarFromJSON(ty, R"({"min": "aa", "max": "hz"})"))); - EXPECT_THAT( - MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", R"(["hz", "bb", "bf", "cc", "fa"])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": "bb", "max": "hz"})"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", + R"(["hz", "bb", "bf", "cc", "fa"])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": "bb", "max": "hz"})"))); EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", R"(["hz", "aa", "bf", "cc", "fa"])"), options), @@ -2519,10 +2519,10 @@ TEST(TestDictionaryMinMaxKernel, BinarysValue) { options = ScalarAggregateOptions(/*skip_nulls=*/false); EXPECT_THAT(MinMax(chunked, options), ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); - EXPECT_THAT( - MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", R"(["hz", "bb", "bf", "cc", "fa"])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": "bb", "max": "hz"})"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", + R"(["hz", "bb", "bf", "cc", "fa"])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": "bb", "max": "hz"})"))); EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", R"(["hz", "aa", "bf", "cc", "fa"])"), options), @@ -2548,9 +2548,9 @@ TEST(TestDictionaryMinMaxKernel, BinarysValue) { options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/5); EXPECT_THAT(MinMax(chunked, options), ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); - EXPECT_THAT( - MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 1, 0])", R"(["hz", "bb"])"), options), - ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 1, 0])", R"(["hz", "bb"])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); EXPECT_THAT( MinMax(DictArrayFromJSON(dict_ty, R"([null, 3, 1, 1, 4, 0, 2, null, null])", R"(["hz", "bb", "bf", "cc", "fa"])"), From 5e3f9a45ca032ad8c19fa95a3eb0e1a4c55c07b7 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Mon, 16 Oct 2023 23:15:52 +0800 Subject: [PATCH 43/60] boolean test --- .../arrow/compute/kernels/aggregate_test.cc | 53 ++++++++++++++++++- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index e455c190d5a..2d95efe9e95 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -2278,7 +2278,7 @@ TEST(TestDictionaryMinMaxKernel, DecimalsValue) { } } -TEST(TestDictionaryMinMaxKernel, BooleanValue) { +TEST(TestDictionaryMinMaxKernel, BooleansValue) { ScalarAggregateOptions options; std::shared_ptr value_ty = boolean(); std::shared_ptr dict_ty; @@ -2287,8 +2287,15 @@ TEST(TestDictionaryMinMaxKernel, BooleanValue) { for (const auto& index_type : all_dictionary_index_types()) { ARROW_SCOPED_TRACE("index_type = ", index_type->ToString()); - dict_ty = dictionary(index_type, value_ty); + + auto chunk1 = DictArrayFromJSON(dict_ty, R"([null, 0])", R"([true])"); + auto chunk2 = DictArrayFromJSON(dict_ty, R"([0, 1, 1])", R"([false, true])"); + ASSERT_OK_AND_ASSIGN(auto chunked, ChunkedArray::Make({chunk1, chunk2})); + + options = ScalarAggregateOptions(/*skip_nulls=*/true); + EXPECT_THAT(MinMax(chunked, options), + ResultWith(ScalarFromJSON(ty, "[false, true]"))); EXPECT_THAT( MinMax(DictArrayFromJSON(dict_ty, R"([0, 0, 1])", R"([false, true])"), options), ResultWith(ScalarFromJSON(ty, "[false, true]"))); @@ -2298,6 +2305,48 @@ TEST(TestDictionaryMinMaxKernel, BooleanValue) { ResultWith(ScalarFromJSON(ty, "[true, true]"))); EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([null, null])", R"([])"), options), ResultWith(ScalarFromJSON(ty, "[null, null]"))); + + options = ScalarAggregateOptions(/*skip_nulls=*/false); + EXPECT_THAT(MinMax(chunked, options), ResultWith(ScalarFromJSON(ty, "[null, null]"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, null, 1])", R"([false, true])"), + options), + ResultWith(ScalarFromJSON(ty, "[null, null]"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 0, 0])", R"([false])"), options), + ResultWith(ScalarFromJSON(ty, "[false, false]"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 0, 0])", R"([true])"), options), + ResultWith(ScalarFromJSON(ty, "[true, true]"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([null, null])", R"([])"), options), + ResultWith(ScalarFromJSON(ty, "[null, null]"))); + + options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/0); + EXPECT_THAT(MinMax(chunked, options), + ResultWith(ScalarFromJSON(ty, R"({"min": false, "max": true})"))); + EXPECT_THAT( + MinMax(DictArrayFromJSON(dict_ty, R"([null, null, null])", R"([])"), options), + ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0])", R"([true])"), options), + ResultWith(ScalarFromJSON(ty, R"({"min": true, "max": true})"))); + + options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/5); + EXPECT_THAT(MinMax(chunked, options), + ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 1, 0])", R"([true, false])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); + EXPECT_THAT( + MinMax(DictArrayFromJSON(dict_ty, R"([null, 3, 1, 1, 4, 0, 2, null, null])", + R"([false, true, false, true, false])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": false, "max": true})"))); + + options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/1); + EXPECT_THAT(MinMax(chunked, options), + ResultWith(ScalarFromJSON(ty, R"({"min": false, "max": true})"))); + EXPECT_THAT( + MinMax(DictArrayFromJSON(dict_ty, R"([null, null, null])", R"([])"), options), + ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); + EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0])", R"([false])"), options), + ResultWith(ScalarFromJSON(ty, R"({"min": false, "max": false})"))); } } From 4cb58bceff4c93be84c132cbda370b7d13e7551b Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Tue, 17 Oct 2023 23:03:38 +0800 Subject: [PATCH 44/60] delete test --- .../arrow/compute/kernels/aggregate_test.cc | 224 ------------------ 1 file changed, 224 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index 2d95efe9e95..db05acfc0d9 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -2293,7 +2293,6 @@ TEST(TestDictionaryMinMaxKernel, BooleansValue) { auto chunk2 = DictArrayFromJSON(dict_ty, R"([0, 1, 1])", R"([false, true])"); ASSERT_OK_AND_ASSIGN(auto chunked, ChunkedArray::Make({chunk1, chunk2})); - options = ScalarAggregateOptions(/*skip_nulls=*/true); EXPECT_THAT(MinMax(chunked, options), ResultWith(ScalarFromJSON(ty, "[false, true]"))); EXPECT_THAT( @@ -2301,52 +2300,6 @@ TEST(TestDictionaryMinMaxKernel, BooleansValue) { ResultWith(ScalarFromJSON(ty, "[false, true]"))); EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 0, 0])", R"([false])"), options), ResultWith(ScalarFromJSON(ty, "[false, false]"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 0, 0])", R"([true])"), options), - ResultWith(ScalarFromJSON(ty, "[true, true]"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([null, null])", R"([])"), options), - ResultWith(ScalarFromJSON(ty, "[null, null]"))); - - options = ScalarAggregateOptions(/*skip_nulls=*/false); - EXPECT_THAT(MinMax(chunked, options), ResultWith(ScalarFromJSON(ty, "[null, null]"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, null, 1])", R"([false, true])"), - options), - ResultWith(ScalarFromJSON(ty, "[null, null]"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 0, 0])", R"([false])"), options), - ResultWith(ScalarFromJSON(ty, "[false, false]"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 0, 0])", R"([true])"), options), - ResultWith(ScalarFromJSON(ty, "[true, true]"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([null, null])", R"([])"), options), - ResultWith(ScalarFromJSON(ty, "[null, null]"))); - - options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/0); - EXPECT_THAT(MinMax(chunked, options), - ResultWith(ScalarFromJSON(ty, R"({"min": false, "max": true})"))); - EXPECT_THAT( - MinMax(DictArrayFromJSON(dict_ty, R"([null, null, null])", R"([])"), options), - ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0])", R"([true])"), options), - ResultWith(ScalarFromJSON(ty, R"({"min": true, "max": true})"))); - - options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/5); - EXPECT_THAT(MinMax(chunked, options), - ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 1, 0])", R"([true, false])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); - EXPECT_THAT( - MinMax(DictArrayFromJSON(dict_ty, R"([null, 3, 1, 1, 4, 0, 2, null, null])", - R"([false, true, false, true, false])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": false, "max": true})"))); - - options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/1); - EXPECT_THAT(MinMax(chunked, options), - ResultWith(ScalarFromJSON(ty, R"({"min": false, "max": true})"))); - EXPECT_THAT( - MinMax(DictArrayFromJSON(dict_ty, R"([null, null, null])", R"([])"), options), - ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0])", R"([false])"), options), - ResultWith(ScalarFromJSON(ty, R"({"min": false, "max": false})"))); } } @@ -2367,7 +2320,6 @@ TEST(TestDictionaryMinMaxKernel, IntegersValue) { auto chunk2 = DictArrayFromJSON(dict_ty, R"([0, 1, 1])", R"([3, 1])"); ASSERT_OK_AND_ASSIGN(auto chunked, ChunkedArray::Make({chunk1, chunk2})); - options = ScalarAggregateOptions(/*skip_nulls=*/true); EXPECT_THAT(MinMax(chunked, options), ResultWith(ScalarFromJSON(ty, R"({"min": 1, "max": 5})"))); EXPECT_THAT( @@ -2378,64 +2330,6 @@ TEST(TestDictionaryMinMaxKernel, IntegersValue) { MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", R"([5, 9, 2, 3, 4])"), options), ResultWith(ScalarFromJSON(ty, R"({"min": 2, "max": 9})"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, null, 2, 3, 4])", - R"([5, 1, 2, 3, 4])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": 2, "max": 5})"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, null, 3, 4])", - R"([5, 9, 2, 3, 4])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": 3, "max": 9})"))); - - options = ScalarAggregateOptions(/*skip_nulls=*/false); - EXPECT_THAT(MinMax(chunked, options), - ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); - EXPECT_THAT( - MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", R"([5, 1, 2, 3, 4])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": 1, "max": 5})"))); - EXPECT_THAT( - MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", R"([5, 1, 2, 3, 4])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": 1, "max": 5})"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, null, 2, 3, 4])", - R"([5, 1, 2, 3, 4])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, null, 3, 4])", - R"([5, 100, 2, 3, 4])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); - - options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/0); - EXPECT_THAT(MinMax(chunked, options), - ResultWith(ScalarFromJSON(ty, R"({"min": 1, "max": 5})"))); - EXPECT_THAT( - MinMax(DictArrayFromJSON(dict_ty, R"([null, null, null])", R"([])"), options), - ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0])", R"([1])"), options), - ResultWith(ScalarFromJSON(ty, R"({"min": 1, "max": 1})"))); - - options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/5); - EXPECT_THAT(MinMax(chunked, options), - ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); - EXPECT_THAT( - MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 1, 0])", R"([5, 1])"), options), - ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); - EXPECT_THAT( - MinMax(DictArrayFromJSON(dict_ty, R"([null, 3, 1, 1, 4, 0, 2, null, null])", - R"([5, 1, 2, 3, 4])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": 1, "max": 5})"))); - - options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/1); - EXPECT_THAT(MinMax(chunked, options), - ResultWith(ScalarFromJSON(ty, R"({"min": 1, "max": 5})"))); - EXPECT_THAT( - MinMax(DictArrayFromJSON(dict_ty, R"([null, null, null])", R"([])"), options), - ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0])", R"([2])"), options), - ResultWith(ScalarFromJSON(ty, R"({"min": 2, "max": 2})"))); } } } @@ -2456,7 +2350,6 @@ TEST(TestDictionaryMinMaxKernel, FloatsValue) { auto chunk2 = DictArrayFromJSON(dict_ty, R"([0, 1, 1])", R"([-Inf, 1])"); ASSERT_OK_AND_ASSIGN(auto chunked, ChunkedArray::Make({chunk1, chunk2})); - options = ScalarAggregateOptions(/*skip_nulls=*/true); EXPECT_THAT(MinMax(chunked, options), ResultWith(ScalarFromJSON(ty, R"({"min": -Inf, "max": 5})"))); EXPECT_THAT( @@ -2467,64 +2360,6 @@ TEST(TestDictionaryMinMaxKernel, FloatsValue) { R"([5, -Inf, 2, 3, 4])"), options), ResultWith(ScalarFromJSON(ty, R"({"min": -Inf, "max": 5})"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, null, 2, 3, 4])", - R"([5, 1, 2, 3, 4])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": 2, "max": 5})"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, null, 3, 4])", - R"([5, -Inf, 2, 3, 4])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": -Inf, "max": 5})"))); - - options = ScalarAggregateOptions(/*skip_nulls=*/false); - EXPECT_THAT(MinMax(chunked, options), - ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); - EXPECT_THAT( - MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", R"([5, 1, 2, 3, 4])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": 1, "max": 5})"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", - R"([5, -Inf, 2, 3, 4])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": -Inf, "max": 5})"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, null, 2, 3, 4])", - R"([5, 1, 2, 3, 4])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, null, 3, 4])", - R"([5, -Inf, 2, 3, 4])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); - - options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/0); - EXPECT_THAT(MinMax(chunked, options), - ResultWith(ScalarFromJSON(ty, R"({"min": -Inf, "max": 5})"))); - EXPECT_THAT( - MinMax(DictArrayFromJSON(dict_ty, R"([null, null, null])", R"([])"), options), - ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0])", R"([1])"), options), - ResultWith(ScalarFromJSON(ty, R"({"min": 1, "max": 1})"))); - - options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/5); - EXPECT_THAT(MinMax(chunked, options), - ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); - EXPECT_THAT( - MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 1, 0])", R"([5, 1])"), options), - ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); - EXPECT_THAT( - MinMax(DictArrayFromJSON(dict_ty, R"([null, 3, 1, 1, 4, 0, 2, null, null])", - R"([5, 1, 2, 3, 4])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": 1, "max": 5})"))); - - options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/1); - EXPECT_THAT(MinMax(chunked, options), - ResultWith(ScalarFromJSON(ty, R"({"min": -Inf, "max": 5})"))); - EXPECT_THAT( - MinMax(DictArrayFromJSON(dict_ty, R"([null, null, null])", R"([])"), options), - ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0])", R"([1])"), options), - ResultWith(ScalarFromJSON(ty, R"({"min": 1, "max": 1})"))); } } } @@ -2545,7 +2380,6 @@ TEST(TestDictionaryMinMaxKernel, BinarysValue) { auto chunk2 = DictArrayFromJSON(dict_ty, R"([0, 1, 1])", R"(["aa", "bb"])"); ASSERT_OK_AND_ASSIGN(auto chunked, ChunkedArray::Make({chunk1, chunk2})); - options = ScalarAggregateOptions(/*skip_nulls=*/true); EXPECT_THAT(MinMax(chunked, options), ResultWith(ScalarFromJSON(ty, R"({"min": "aa", "max": "hz"})"))); EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", @@ -2556,64 +2390,6 @@ TEST(TestDictionaryMinMaxKernel, BinarysValue) { R"(["hz", "aa", "bf", "cc", "fa"])"), options), ResultWith(ScalarFromJSON(ty, R"({"min": "aa", "max": "hz"})"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, null, 2, 3, 4])", - R"(["hz", "bb", "bf", "cc", "fa"])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": "bf", "max": "hz"})"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, null, 3, 4])", - R"(["hz", "aa", "bf", "cc", "fa"])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": "aa", "max": "hz"})"))); - - options = ScalarAggregateOptions(/*skip_nulls=*/false); - EXPECT_THAT(MinMax(chunked, options), - ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", - R"(["hz", "bb", "bf", "cc", "fa"])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": "bb", "max": "hz"})"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", - R"(["hz", "aa", "bf", "cc", "fa"])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": "aa", "max": "hz"})"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, null, 2, 3, 4])", - R"(["hz", "bb", "bf", "cc", "fa"])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, null, 3, 4])", - R"(["hz", "aa", "bf", "cc", "fa"])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); - - options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/0); - EXPECT_THAT(MinMax(chunked, options), - ResultWith(ScalarFromJSON(ty, R"({"min": "aa", "max": "hz"})"))); - EXPECT_THAT( - MinMax(DictArrayFromJSON(dict_ty, R"([null, null, null])", R"([])"), options), - ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0])", R"(["bb"])"), options), - ResultWith(ScalarFromJSON(ty, R"({"min": "bb", "max": "bb"})"))); - - options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/5); - EXPECT_THAT(MinMax(chunked, options), - ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 1, 0])", R"(["hz", "bb"])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); - EXPECT_THAT( - MinMax(DictArrayFromJSON(dict_ty, R"([null, 3, 1, 1, 4, 0, 2, null, null])", - R"(["hz", "bb", "bf", "cc", "fa"])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": "bb", "max": "hz"})"))); - - options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/1); - EXPECT_THAT(MinMax(chunked, options), - ResultWith(ScalarFromJSON(ty, R"({"min": "aa", "max": "hz"})"))); - EXPECT_THAT( - MinMax(DictArrayFromJSON(dict_ty, R"([null, null, null])", R"([])"), options), - ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0])", R"(["bb"])"), options), - ResultWith(ScalarFromJSON(ty, R"({"min": "bb", "max": "bb"})"))); } } } From af93cbfb2357d47ccebbd63e6dd8be12c8f42787 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Thu, 19 Oct 2023 08:50:48 +0800 Subject: [PATCH 45/60] Update cpp/src/arrow/compute/kernels/aggregate_basic_internal.h Co-authored-by: Antoine Pitrou --- cpp/src/arrow/compute/kernels/aggregate_basic_internal.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index 7e491ba781a..63672a314d9 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -935,7 +935,8 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { ARROW_ASSIGN_OR_RAISE(auto compacted_arr, dict_arr.Compact(ctx->memory_pool())); const DictionaryArray& compacted_dict_arr = checked_cast(*compacted_arr); - if (compacted_dict_arr.length() - compacted_dict_arr.null_count() == 0) { + const int64_t non_null_count = compacted_dict_arr.length() - compacted_dict_arr.null_count(); + if (non_null_count == 0) { return Status::OK(); } this->has_nulls |= compacted_dict_arr.null_count() > 0; From be9238e31e62a74875a408ea44d13f988c7f217b Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Thu, 19 Oct 2023 08:52:50 +0800 Subject: [PATCH 46/60] Update cpp/src/arrow/compute/kernels/aggregate_basic_internal.h Co-authored-by: Antoine Pitrou --- cpp/src/arrow/compute/kernels/aggregate_basic_internal.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index 63672a314d9..4268b44e988 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -941,7 +941,9 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { } this->has_nulls |= compacted_dict_arr.null_count() > 0; this->count += compacted_dict_arr.length() - compacted_dict_arr.null_count(); - + if (this->has_nulls && !options.skip_nulls) { + return Status::OK(); + } std::shared_ptr dict_min; std::shared_ptr dict_max; if (compacted_dict_arr.length() - compacted_dict_arr.null_count() == 1) { From d2ed61c30f984eec83a9cd02f1a7f89dab3f7645 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Thu, 19 Oct 2023 08:54:57 +0800 Subject: [PATCH 47/60] Update cpp/src/arrow/compute/kernels/aggregate_basic_internal.h Co-authored-by: Antoine Pitrou --- cpp/src/arrow/compute/kernels/aggregate_basic_internal.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index 4268b44e988..2285fad5877 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -921,8 +921,8 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { out_type(std::move(out_type)), has_nulls(false), count(0), - min(nullptr), - max(nullptr) { + min(MakeNullScalar(this->out_type)), + max(this->min) { this->options.min_count = std::max(1, this->options.min_count); } From 9c2174c57f36b051a5c5d82c417ebae9926bc7d2 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Thu, 19 Oct 2023 08:57:15 +0800 Subject: [PATCH 48/60] Update cpp/src/arrow/compute/kernels/aggregate_basic_internal.h Co-authored-by: Antoine Pitrou --- cpp/src/arrow/compute/kernels/aggregate_basic_internal.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index 2285fad5877..3723e1341e7 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -956,8 +956,8 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { ctx->exec_context())); const StructScalar& struct_result = checked_cast(*result.scalar()); - ARROW_ASSIGN_OR_RAISE(dict_min, struct_result.field(FieldRef("min"))); - ARROW_ASSIGN_OR_RAISE(dict_max, struct_result.field(FieldRef("max"))); + dict_min = struct_result.value[MinOrMax::Min]; + dict_max = struct_result.value[MinOrMax::Max]; } ARROW_RETURN_NOT_OK(UpdateMinMaxState(dict_min, dict_max, ctx)); return Status::OK(); From d486422796844d2382f8b8d173a0b5de02589ce9 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Sun, 22 Oct 2023 22:05:16 +0800 Subject: [PATCH 49/60] kernel member init --- .../kernels/aggregate_basic_internal.h | 237 +++++++++--------- 1 file changed, 117 insertions(+), 120 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index 3723e1341e7..87314889df0 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -20,6 +20,8 @@ #include #include #include +#include + #include "arrow/compute/api_aggregate.h" #include "arrow/compute/kernels/aggregate_internal.h" @@ -912,125 +914,6 @@ struct NullMinMaxImpl : public ScalarAggregator { } }; -template -struct DictionaryMinMaxImpl : public ScalarAggregator { - using ThisType = DictionaryMinMaxImpl; - - DictionaryMinMaxImpl(std::shared_ptr out_type, ScalarAggregateOptions options) - : options(std::move(options)), - out_type(std::move(out_type)), - has_nulls(false), - count(0), - min(MakeNullScalar(this->out_type)), - max(this->min) { - this->options.min_count = std::max(1, this->options.min_count); - } - - Status Consume(KernelContext* ctx, const ExecSpan& batch) override { - if (batch[0].is_scalar()) { - return Status::NotImplemented("No min/max implemented for DictionaryScalar"); - } - - DictionaryArray dict_arr(batch[0].array.ToArrayData()); - ARROW_ASSIGN_OR_RAISE(auto compacted_arr, dict_arr.Compact(ctx->memory_pool())); - const DictionaryArray& compacted_dict_arr = - checked_cast(*compacted_arr); - const int64_t non_null_count = compacted_dict_arr.length() - compacted_dict_arr.null_count(); - if (non_null_count == 0) { - return Status::OK(); - } - this->has_nulls |= compacted_dict_arr.null_count() > 0; - this->count += compacted_dict_arr.length() - compacted_dict_arr.null_count(); - if (this->has_nulls && !options.skip_nulls) { - return Status::OK(); - } - std::shared_ptr dict_min; - std::shared_ptr dict_max; - if (compacted_dict_arr.length() - compacted_dict_arr.null_count() == 1) { - ARROW_ASSIGN_OR_RAISE(dict_min, compacted_dict_arr.dictionary()->GetScalar(0)); - dict_max = dict_min; - } else { - Datum dict_values(compacted_dict_arr.dictionary()); - ARROW_ASSIGN_OR_RAISE( - Datum result, MinMax(std::move(dict_values), ScalarAggregateOptions::Defaults(), - ctx->exec_context())); - const StructScalar& struct_result = - checked_cast(*result.scalar()); - dict_min = struct_result.value[MinOrMax::Min]; - dict_max = struct_result.value[MinOrMax::Max]; - } - ARROW_RETURN_NOT_OK(UpdateMinMaxState(dict_min, dict_max, ctx)); - return Status::OK(); - } - - Status MergeFrom(KernelContext* ctx, KernelState&& src) override { - const auto& other = checked_cast(src); - - this->has_nulls |= other.has_nulls; - this->count += other.count; - ARROW_RETURN_NOT_OK(UpdateMinMaxState(other.min, other.max, ctx)); - return Status::OK(); - } - - Status Finalize(KernelContext*, Datum* out) override { - const auto& struct_type = checked_cast(*out_type); - const auto& child_type = struct_type.field(0)->type(); - - std::vector> values; - if ((this->has_nulls && !options.skip_nulls) || (this->count < options.min_count)) { - // (null, null) - std::shared_ptr null_scalar = MakeNullScalar(child_type); - values = {null_scalar, null_scalar}; - } else { - values = {std::move(this->min), std::move(this->max)}; - } - - out->value = std::make_shared(std::move(values), this->out_type); - return Status::OK(); - } - - ScalarAggregateOptions options; - std::shared_ptr out_type; - bool has_nulls; - int64_t count; - std::shared_ptr min; - std::shared_ptr max; - - private: - Status UpdateMinMaxState(const std::shared_ptr& other_min, - const std::shared_ptr& other_max, KernelContext* ctx) { - if (this->min == nullptr || this->min->type->id() == Type::NA) { - this->min = other_min; - } else if (other_min != nullptr && other_min->type->id() != Type::NA) { - ARROW_ASSIGN_OR_RAISE( - Datum greater_result, - CallFunction("greater", {this->min, other_min}, ctx->exec_context())); - const BooleanScalar& greater_scalar = - checked_cast(*greater_result.scalar()); - - if (greater_scalar.value) { - this->min = other_min; - } - } - - if (this->max == nullptr || this->max->type->id() == Type::NA) { - this->max = other_max; - } else if (other_max != nullptr && other_max->type->id() != Type::NA) { - ARROW_ASSIGN_OR_RAISE( - Datum less_result, - CallFunction("less", {this->max, other_max}, ctx->exec_context())); - const BooleanScalar& less_scalar = - checked_cast(*less_result.scalar()); - - if (less_scalar.value) { - this->max = other_max; - } - } - - return Status::OK(); - } -}; - // First/Last struct FirstLastInitState { @@ -1090,6 +973,11 @@ struct FirstLastInitState { } }; +template +std::unique_ptr DictionaryMinMaxImplFunc(const DataType& in_type, + std::shared_ptr out_type, + ScalarAggregateOptions options); + template struct MinMaxInitState { std::unique_ptr state; @@ -1122,7 +1010,7 @@ struct MinMaxInitState { } Status Visit(const DictionaryType&) { - state.reset(new DictionaryMinMaxImpl(out_type, options)); + state = DictionaryMinMaxImplFunc(in_type, out_type, options); return Status::OK(); } @@ -1157,4 +1045,113 @@ struct MinMaxInitState { } }; +template +struct DictionaryMinMaxImpl : public ScalarAggregator { + using ThisType = DictionaryMinMaxImpl; + + DictionaryMinMaxImpl(const DataType& in_type, std::shared_ptr out_type, + ScalarAggregateOptions options) + : options(std::move(options)), + out_type(std::move(out_type)), + has_nulls(false), + count(0), + value_type(checked_cast(in_type).value_type()), + valueState(nullptr) { + this->options.min_count = std::max(1, this->options.min_count); + } + + Status Consume(KernelContext* ctx, const ExecSpan& batch) override { + if (batch[0].is_scalar()) { + return Status::NotImplemented("No min/max implemented for DictionaryScalar"); + } + + std::cout<<1<<"\n"; + + if (this->valueState == nullptr) { + const DataType& value_type_ref = checked_cast(*this->value_type); + MinMaxInitState valueMinMaxInitState( + nullptr, value_type_ref, out_type, ScalarAggregateOptions::Defaults()); + ARROW_ASSIGN_OR_RAISE(this->valueState, valueMinMaxInitState.Create()); + } + + std::cout<<2<<"\n"; + + DictionaryArray dict_arr(batch[0].array.ToArrayData()); + ARROW_ASSIGN_OR_RAISE(auto compacted_arr, dict_arr.Compact(ctx->memory_pool())); + const DictionaryArray& compacted_dict_arr = + checked_cast(*compacted_arr); + const int64_t non_null_count = + compacted_dict_arr.length() - compacted_dict_arr.null_count(); + if (non_null_count == 0) { + return Status::OK(); + } + + std::cout<<3<<"\n"; + + + this->has_nulls |= compacted_dict_arr.null_count() > 0; + this->count += non_null_count; + if (this->has_nulls && !options.skip_nulls) { + return Status::OK(); + } + + std::cout<<4<<"\n"; + + + const ArrayData& dict_data = + checked_cast(*compacted_dict_arr.dictionary()->data()); + RETURN_NOT_OK(checked_cast(this->valueState.get()) + ->Consume(ctx, ExecSpan(std::vector({ExecValue(dict_data)}), 1))); + return Status::OK(); + } + + Status MergeFrom(KernelContext* ctx, KernelState&& src) override { + const auto& other = checked_cast(src); + this->has_nulls |= other.has_nulls; + this->count += other.count; + if (this->has_nulls && !options.skip_nulls) { + return Status::OK(); + } + + KernelState otherValueState = *other.valueState; + RETURN_NOT_OK(checked_cast(this->valueState.get()) + ->MergeFrom(ctx, std::move(otherValueState))); + return Status::OK(); + } + + Status Finalize(KernelContext* ctx, Datum* out) override { + if ((this->has_nulls && !options.skip_nulls) || (this->count < options.min_count)) { + const auto& struct_type = checked_cast(*out_type); + const auto& child_type = struct_type.field(0)->type(); + + std::shared_ptr null_scalar = MakeNullScalar(child_type); + std::vector> values = {null_scalar, null_scalar}; + out->value = std::make_shared(std::move(values), this->out_type); + } else { + Datum temp; + RETURN_NOT_OK( + checked_cast(this->valueState.get())->Finalize(ctx, &temp)); + const auto& result = temp.scalar_as(); + DCHECK(result.is_valid); + out->value = result.GetSharedPtr(); + } + + return Status::OK(); + } + + ScalarAggregateOptions options; + std::shared_ptr out_type; + bool has_nulls; + int64_t count; + std::shared_ptr value_type; + std::unique_ptr valueState; +}; + +template +std::unique_ptr DictionaryMinMaxImplFunc(const DataType& in_type, + std::shared_ptr out_type, + ScalarAggregateOptions options) { + return std::make_unique>(in_type, out_type, options); +} + } // namespace arrow::compute::internal From 3a58ab6eac58ccc4d2abc21b33b51b249c55df75 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Wed, 25 Oct 2023 19:29:49 +0800 Subject: [PATCH 50/60] debug value state --- .../kernels/aggregate_basic_internal.h | 52 +++++++++---------- 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index 87314889df0..182a8b4ae2d 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -18,10 +18,9 @@ #pragma once #include +#include #include #include -#include - #include "arrow/compute/api_aggregate.h" #include "arrow/compute/kernels/aggregate_internal.h" @@ -1064,17 +1063,7 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { if (batch[0].is_scalar()) { return Status::NotImplemented("No min/max implemented for DictionaryScalar"); } - - std::cout<<1<<"\n"; - - if (this->valueState == nullptr) { - const DataType& value_type_ref = checked_cast(*this->value_type); - MinMaxInitState valueMinMaxInitState( - nullptr, value_type_ref, out_type, ScalarAggregateOptions::Defaults()); - ARROW_ASSIGN_OR_RAISE(this->valueState, valueMinMaxInitState.Create()); - } - - std::cout<<2<<"\n"; + RETURN_NOT_OK(this->InitValueState()); DictionaryArray dict_arr(batch[0].array.ToArrayData()); ARROW_ASSIGN_OR_RAISE(auto compacted_arr, dict_arr.Compact(ctx->memory_pool())); @@ -1086,26 +1075,23 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { return Status::OK(); } - std::cout<<3<<"\n"; - - this->has_nulls |= compacted_dict_arr.null_count() > 0; this->count += non_null_count; if (this->has_nulls && !options.skip_nulls) { return Status::OK(); } - std::cout<<4<<"\n"; - - const ArrayData& dict_data = checked_cast(*compacted_dict_arr.dictionary()->data()); - RETURN_NOT_OK(checked_cast(this->valueState.get()) - ->Consume(ctx, ExecSpan(std::vector({ExecValue(dict_data)}), 1))); + RETURN_NOT_OK( + checked_cast(this->valueState.get()) + ->Consume(nullptr, ExecSpan(std::vector({ExecValue(dict_data)}), 1))); return Status::OK(); } - Status MergeFrom(KernelContext* ctx, KernelState&& src) override { + Status MergeFrom(KernelContext*, KernelState&& src) override { + RETURN_NOT_OK(this->InitValueState()); + const auto& other = checked_cast(src); this->has_nulls |= other.has_nulls; this->count += other.count; @@ -1113,14 +1099,13 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { return Status::OK(); } - KernelState otherValueState = *other.valueState; RETURN_NOT_OK(checked_cast(this->valueState.get()) - ->MergeFrom(ctx, std::move(otherValueState))); + ->MergeFrom(nullptr, std::move(*other.valueState))); return Status::OK(); } - Status Finalize(KernelContext* ctx, Datum* out) override { - if ((this->has_nulls && !options.skip_nulls) || (this->count < options.min_count)) { + Status Finalize(KernelContext*, Datum* out) override { + if ((this->has_nulls && !options.skip_nulls) || (this->count < options.min_count) || this->valueState.get() == nullptr) { const auto& struct_type = checked_cast(*out_type); const auto& child_type = struct_type.field(0)->type(); @@ -1129,8 +1114,8 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { out->value = std::make_shared(std::move(values), this->out_type); } else { Datum temp; - RETURN_NOT_OK( - checked_cast(this->valueState.get())->Finalize(ctx, &temp)); + RETURN_NOT_OK(checked_cast(this->valueState.get()) + ->Finalize(nullptr, &temp)); const auto& result = temp.scalar_as(); DCHECK(result.is_valid); out->value = result.GetSharedPtr(); @@ -1145,6 +1130,17 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { int64_t count; std::shared_ptr value_type; std::unique_ptr valueState; + + private: + inline Status InitValueState() { + if (this->valueState == nullptr) { + const DataType& value_type_ref = checked_cast(*this->value_type); + MinMaxInitState valueMinMaxInitState( + nullptr, value_type_ref, out_type, ScalarAggregateOptions::Defaults()); + ARROW_ASSIGN_OR_RAISE(this->valueState, valueMinMaxInitState.Create()); + } + return Status::OK(); + } }; template From a9ff8b3c6ff4d80fbac40c6bb249ca2077739723 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Wed, 25 Oct 2023 19:29:58 +0800 Subject: [PATCH 51/60] format --- .../kernels/aggregate_basic_internal.h | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index 182a8b4ae2d..6f2e03db1f1 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -1105,7 +1105,8 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { } Status Finalize(KernelContext*, Datum* out) override { - if ((this->has_nulls && !options.skip_nulls) || (this->count < options.min_count) || this->valueState.get() == nullptr) { + if ((this->has_nulls && !options.skip_nulls) || (this->count < options.min_count) || + this->valueState.get() == nullptr) { const auto& struct_type = checked_cast(*out_type); const auto& child_type = struct_type.field(0)->type(); @@ -1131,16 +1132,16 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { std::shared_ptr value_type; std::unique_ptr valueState; - private: - inline Status InitValueState() { - if (this->valueState == nullptr) { - const DataType& value_type_ref = checked_cast(*this->value_type); - MinMaxInitState valueMinMaxInitState( - nullptr, value_type_ref, out_type, ScalarAggregateOptions::Defaults()); - ARROW_ASSIGN_OR_RAISE(this->valueState, valueMinMaxInitState.Create()); - } - return Status::OK(); - } + private: + inline Status InitValueState() { + if (this->valueState == nullptr) { + const DataType& value_type_ref = checked_cast(*this->value_type); + MinMaxInitState valueMinMaxInitState( + nullptr, value_type_ref, out_type, ScalarAggregateOptions::Defaults()); + ARROW_ASSIGN_OR_RAISE(this->valueState, valueMinMaxInitState.Create()); + } + return Status::OK(); + } }; template From ccdc4af5f5a4731bbad3c9ecee08c0fe0405838c Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Wed, 25 Oct 2023 19:33:08 +0800 Subject: [PATCH 52/60] delete io --- cpp/src/arrow/compute/kernels/aggregate_basic_internal.h | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index 6f2e03db1f1..060b9ce40c0 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -18,7 +18,6 @@ #pragma once #include -#include #include #include From 901484c038191a2a1e2e5c5ac56acebc261dff81 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Wed, 25 Oct 2023 19:40:34 +0800 Subject: [PATCH 53/60] rename --- .../kernels/aggregate_basic_internal.h | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index 060b9ce40c0..76a8ffd156b 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -1054,7 +1054,7 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { has_nulls(false), count(0), value_type(checked_cast(in_type).value_type()), - valueState(nullptr) { + value_state(nullptr) { this->options.min_count = std::max(1, this->options.min_count); } @@ -1076,14 +1076,14 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { this->has_nulls |= compacted_dict_arr.null_count() > 0; this->count += non_null_count; - if (this->has_nulls && !options.skip_nulls) { + if ((this->has_nulls && !options.skip_nulls) || (this->count < options.min_count)) { return Status::OK(); } const ArrayData& dict_data = checked_cast(*compacted_dict_arr.dictionary()->data()); RETURN_NOT_OK( - checked_cast(this->valueState.get()) + checked_cast(this->value_state.get()) ->Consume(nullptr, ExecSpan(std::vector({ExecValue(dict_data)}), 1))); return Status::OK(); } @@ -1094,18 +1094,18 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { const auto& other = checked_cast(src); this->has_nulls |= other.has_nulls; this->count += other.count; - if (this->has_nulls && !options.skip_nulls) { + if ((this->has_nulls && !options.skip_nulls) || (this->count < options.min_count)) { return Status::OK(); } - RETURN_NOT_OK(checked_cast(this->valueState.get()) - ->MergeFrom(nullptr, std::move(*other.valueState))); + RETURN_NOT_OK(checked_cast(this->value_state.get()) + ->MergeFrom(nullptr, std::move(*other.value_state))); return Status::OK(); } Status Finalize(KernelContext*, Datum* out) override { if ((this->has_nulls && !options.skip_nulls) || (this->count < options.min_count) || - this->valueState.get() == nullptr) { + this->value_state.get() == nullptr) { const auto& struct_type = checked_cast(*out_type); const auto& child_type = struct_type.field(0)->type(); @@ -1114,13 +1114,12 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { out->value = std::make_shared(std::move(values), this->out_type); } else { Datum temp; - RETURN_NOT_OK(checked_cast(this->valueState.get()) + RETURN_NOT_OK(checked_cast(this->value_state.get()) ->Finalize(nullptr, &temp)); const auto& result = temp.scalar_as(); DCHECK(result.is_valid); out->value = result.GetSharedPtr(); } - return Status::OK(); } @@ -1129,15 +1128,15 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { bool has_nulls; int64_t count; std::shared_ptr value_type; - std::unique_ptr valueState; + std::unique_ptr value_state; private: inline Status InitValueState() { - if (this->valueState == nullptr) { + if (this->value_state == nullptr) { const DataType& value_type_ref = checked_cast(*this->value_type); MinMaxInitState valueMinMaxInitState( nullptr, value_type_ref, out_type, ScalarAggregateOptions::Defaults()); - ARROW_ASSIGN_OR_RAISE(this->valueState, valueMinMaxInitState.Create()); + ARROW_ASSIGN_OR_RAISE(this->value_state, valueMinMaxInitState.Create()); } return Status::OK(); } From f9b025fcf74a4675170e17dd2af12c4b8f393d55 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Mon, 6 Nov 2023 09:58:16 +0800 Subject: [PATCH 54/60] reset --- .../arrow/compute/kernels/aggregate_basic_internal.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index 76a8ffd156b..e84dc8a8cb1 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -1089,17 +1089,19 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { } Status MergeFrom(KernelContext*, KernelState&& src) override { - RETURN_NOT_OK(this->InitValueState()); - - const auto& other = checked_cast(src); + auto&& other = checked_cast(src); this->has_nulls |= other.has_nulls; this->count += other.count; if ((this->has_nulls && !options.skip_nulls) || (this->count < options.min_count)) { return Status::OK(); } - RETURN_NOT_OK(checked_cast(this->value_state.get()) - ->MergeFrom(nullptr, std::move(*other.value_state))); + if (this->value_state == nullptr) { + this->value_state.reset(other.value_state.release()); + } else { + RETURN_NOT_OK(checked_cast(this->value_state.get()) + ->MergeFrom(nullptr, std::move(*other.value_state))); + } return Status::OK(); } From 7e7160399b6119bbaf7ca1332aec375ea1dd08b8 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Thu, 30 Nov 2023 00:05:13 +0800 Subject: [PATCH 55/60] CheckDictionaryMinMax --- .../arrow/compute/kernels/aggregate_test.cc | 247 ++++++------------ 1 file changed, 83 insertions(+), 164 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index db05acfc0d9..df3d387f2da 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -2175,6 +2175,55 @@ TEST(TestFixedSizeBinaryMinMaxKernel, Basics) { EXPECT_THAT(MinMax(ScalarFromJSON(ty, R"("aa")"), options), ResultWith(null)); } +void CheckDictionaryMinMax(const std::shared_ptr& value_type, Datum datum, + const std::string& expected_min, + const std::string& expected_max, + const ScalarAggregateOptions& options) { + std::shared_ptr result_type = + struct_({field("min", value_type), field("max", value_type)}); + EXPECT_THAT( + MinMax(datum, options), + ResultWith(ScalarFromJSON(result_type, "{\"min\": " + expected_min + + ", \"max\": " + expected_max + "}"))); +} + +void CheckDictionaryMinMax(const std::shared_ptr& index_type, + const std::shared_ptr& value_type, + const std::string& input_index_json, + const std::string& input_dictionary_json, + const std::string& expected_min, + const std::string& expected_max, + const ScalarAggregateOptions& options) { + auto dict_type = dictionary(index_type, value_type); + auto arr = DictArrayFromJSON(dict_type, input_index_json, input_dictionary_json); + CheckDictionaryMinMax(value_type, arr, expected_min, expected_max, options); +} + +TEST(TestDictionaryMinMaxKernel, IntegersValue) { + ScalarAggregateOptions options; + std::shared_ptr dict_ty; + + for (const auto& index_type : all_dictionary_index_types()) { + ARROW_SCOPED_TRACE("index_type = ", index_type->ToString()); + + for (const auto& value_ty : + {int8(), uint8(), int16(), uint16(), int32(), uint32(), int64(), uint64()}) { + dict_ty = dictionary(index_type, value_ty); + + auto chunk1 = DictArrayFromJSON(dict_ty, R"([null, 0])", R"([5])"); + auto chunk2 = DictArrayFromJSON(dict_ty, R"([0, 1, 1])", R"([3, 1])"); + ASSERT_OK_AND_ASSIGN(auto chunked, ChunkedArray::Make({chunk1, chunk2})); + + options = ScalarAggregateOptions(/*skip_nulls=*/true); + CheckDictionaryMinMax(value_ty, chunked, "1", "5", options); + CheckDictionaryMinMax(index_type, value_ty, R"([0, 1, 2, 3, 4])", + R"([5, 1, 2, 3, 4])", "1", "5", options); + CheckDictionaryMinMax(index_type, value_ty, R"([0, 1, 2, 3, 4])", + R"([5, 9, 2, 3, 4])", "2", "9", options); + } + } +} + TEST(TestDictionaryMinMaxKernel, DecimalsValue) { ScalarAggregateOptions options; std::shared_ptr dict_ty; @@ -2182,98 +2231,21 @@ TEST(TestDictionaryMinMaxKernel, DecimalsValue) { for (const auto& index_type : all_dictionary_index_types()) { ARROW_SCOPED_TRACE("index_type = ", index_type->ToString()); - for (const auto& value_ty_ : {decimal128(5, 2), decimal256(5, 2)}) { - dict_ty = dictionary(index_type, value_ty_); - ty = struct_({field("min", value_ty_), field("max", value_ty_)}); + for (const auto& value_ty : {decimal128(5, 2), decimal256(5, 2)}) { + dict_ty = dictionary(index_type, value_ty); + ty = struct_({field("min", value_ty), field("max", value_ty)}); auto chunk1 = DictArrayFromJSON(dict_ty, R"([null, 0])", R"(["5.10"])"); auto chunk2 = DictArrayFromJSON(dict_ty, R"([0, 1, 1])", R"(["3.10", "-1.23"])"); ASSERT_OK_AND_ASSIGN(auto chunked, ChunkedArray::Make({chunk1, chunk2})); options = ScalarAggregateOptions(/*skip_nulls=*/true); - EXPECT_THAT(MinMax(chunked, options), - ResultWith(ScalarFromJSON(ty, R"({"min": "-1.23", "max": "5.10"})"))); - EXPECT_THAT( - MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 1, 0])", R"(["5.10", "-1.23"])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": "-1.23", "max": "5.10"})"))); - EXPECT_THAT( - MinMax(DictArrayFromJSON(dict_ty, R"([3, 1, 1, 4, 0, 2, null])", - R"(["5.10", "-1.23", "2.00", "3.45", "4.56"])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": "-1.23", "max": "5.10"})"))); - EXPECT_THAT( - MinMax(DictArrayFromJSON(dict_ty, R"([null, 3, null, 1, 4, 3, 0, 2, null])", - R"(["-5.10", "-1.23", "-2.00", "-3.45", "-4.56"])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": "-5.10", "max": "-1.23"})"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([null, null])", R"([])"), options), - ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([])", R"([])"), options), - ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0])", R"(["1.00"])"), options), - ResultWith(ScalarFromJSON(ty, R"({"min": "1.00", "max": "1.00"})"))); - - options = ScalarAggregateOptions(/*skip_nulls=*/false); - EXPECT_THAT(MinMax(chunked, options), - ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); - EXPECT_THAT( - MinMax(DictArrayFromJSON(dict_ty, R"([null, 3, 1, 1, 4, 0, 2, null])", - R"(["5.10", "-1.23", "2.00", "3.45", "4.56"])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); - EXPECT_THAT( - MinMax(DictArrayFromJSON(dict_ty, R"([3, 1, 1, 4, 0, 2])", - R"(["5.10", "-1.23", "2.00", "3.45", "4.56"])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": "-1.23", "max": "5.10"})"))); - - options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/0); - EXPECT_THAT(MinMax(chunked, options), - ResultWith(ScalarFromJSON(ty, R"({"min": "-1.23", "max": "5.10"})"))); - EXPECT_THAT( - MinMax(DictArrayFromJSON(dict_ty, R"([null, null, null])", R"([])"), options), - ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0])", R"(["1.00"])"), options), - ResultWith(ScalarFromJSON(ty, R"({"min": "1.00", "max": "1.00"})"))); - - options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/5); - EXPECT_THAT(MinMax(chunked, options), - ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); - EXPECT_THAT( - MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 1, 0])", R"(["5.10", "-1.23"])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); - EXPECT_THAT( - MinMax(DictArrayFromJSON(dict_ty, R"([null, 3, 1, 1, 4, 0, 2, null, null])", - R"(["5.10", "-1.23", "2.00", "3.45", "4.56"])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": "-1.23", "max": "5.10"})"))); - - options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/1); - EXPECT_THAT(MinMax(chunked, options), - ResultWith(ScalarFromJSON(ty, R"({"min": "-1.23", "max": "5.10"})"))); - EXPECT_THAT( - MinMax(DictArrayFromJSON(dict_ty, R"([null, null, null])", R"([])"), options), - ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0])", R"(["1.00"])"), options), - ResultWith(ScalarFromJSON(ty, R"({"min": "1.00", "max": "1.00"})"))); - - // compact dictionary - EXPECT_THAT( - MinMax( - DictArrayFromJSON( - dict_ty, R"([3, 1, 1, 4, 0, 2])", - R"(["5.10", "-1.23", "2.00", "3.45", "4.56", "8.20", "9.20", "10.20"])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": "-1.23", "max": "5.10"})"))); - EXPECT_THAT( - MinMax( - DictArrayFromJSON( - dict_ty, R"([5, 1, 1, 6, 0, 2])", - R"(["5.10", "-1.23", "2.00", "3.45", "4.56", "8.20", "9.20", "10.20"])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": "-1.23", "max": "9.20"})"))); + CheckDictionaryMinMax(value_ty, chunked, R"("-1.23")", R"("5.10")", options); + CheckDictionaryMinMax(index_type, value_ty, R"([0, 1, 1, 0])", + R"(["5.10", "-1.23"])", R"("-1.23")", R"("5.10")", options); + CheckDictionaryMinMax(index_type, value_ty, R"([3, 1, 1, 4, 0, 2, null])", + R"(["5.10", "-1.23", "2.00", "3.45", "4.56"])", R"("-1.23")", + R"("5.10")", options); } } } @@ -2282,8 +2254,6 @@ TEST(TestDictionaryMinMaxKernel, BooleansValue) { ScalarAggregateOptions options; std::shared_ptr value_ty = boolean(); std::shared_ptr dict_ty; - std::shared_ptr ty = - struct_({field("min", value_ty), field("max", value_ty)}); for (const auto& index_type : all_dictionary_index_types()) { ARROW_SCOPED_TRACE("index_type = ", index_type->ToString()); @@ -2293,73 +2263,33 @@ TEST(TestDictionaryMinMaxKernel, BooleansValue) { auto chunk2 = DictArrayFromJSON(dict_ty, R"([0, 1, 1])", R"([false, true])"); ASSERT_OK_AND_ASSIGN(auto chunked, ChunkedArray::Make({chunk1, chunk2})); - EXPECT_THAT(MinMax(chunked, options), - ResultWith(ScalarFromJSON(ty, "[false, true]"))); - EXPECT_THAT( - MinMax(DictArrayFromJSON(dict_ty, R"([0, 0, 1])", R"([false, true])"), options), - ResultWith(ScalarFromJSON(ty, "[false, true]"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 0, 0])", R"([false])"), options), - ResultWith(ScalarFromJSON(ty, "[false, false]"))); - } -} - -TEST(TestDictionaryMinMaxKernel, IntegersValue) { - ScalarAggregateOptions options; - std::shared_ptr dict_ty; - std::shared_ptr ty; - - for (const auto& index_type : all_dictionary_index_types()) { - ARROW_SCOPED_TRACE("index_type = ", index_type->ToString()); - - for (const auto& value_ty_ : - {int8(), uint8(), int16(), uint16(), int32(), uint32(), int64(), uint64()}) { - dict_ty = dictionary(index_type, value_ty_); - ty = struct_({field("min", value_ty_), field("max", value_ty_)}); - - auto chunk1 = DictArrayFromJSON(dict_ty, R"([null, 0])", R"([5])"); - auto chunk2 = DictArrayFromJSON(dict_ty, R"([0, 1, 1])", R"([3, 1])"); - ASSERT_OK_AND_ASSIGN(auto chunked, ChunkedArray::Make({chunk1, chunk2})); - - EXPECT_THAT(MinMax(chunked, options), - ResultWith(ScalarFromJSON(ty, R"({"min": 1, "max": 5})"))); - EXPECT_THAT( - MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", R"([5, 1, 2, 3, 4])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": 1, "max": 5})"))); - EXPECT_THAT( - MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", R"([5, 9, 2, 3, 4])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": 2, "max": 9})"))); - } + CheckDictionaryMinMax(value_ty, chunked, "false", "true", options); + CheckDictionaryMinMax(index_type, value_ty, R"([0, 0, 1])", R"([false, true])", + "false", "true", options); + CheckDictionaryMinMax(index_type, value_ty, R"([0, 0, 0])", R"([false])", "false", + "false", options); } } TEST(TestDictionaryMinMaxKernel, FloatsValue) { ScalarAggregateOptions options; std::shared_ptr dict_ty; - std::shared_ptr ty; for (const auto& index_type : all_dictionary_index_types()) { ARROW_SCOPED_TRACE("index_type = ", index_type->ToString()); - for (const auto& value_ty_ : {float32(), float64()}) { - dict_ty = dictionary(index_type, value_ty_); - ty = struct_({field("min", value_ty_), field("max", value_ty_)}); + for (const auto& value_ty : {float32(), float64()}) { + dict_ty = dictionary(index_type, value_ty); auto chunk1 = DictArrayFromJSON(dict_ty, R"([null, 0])", R"([5])"); auto chunk2 = DictArrayFromJSON(dict_ty, R"([0, 1, 1])", R"([-Inf, 1])"); ASSERT_OK_AND_ASSIGN(auto chunked, ChunkedArray::Make({chunk1, chunk2})); - EXPECT_THAT(MinMax(chunked, options), - ResultWith(ScalarFromJSON(ty, R"({"min": -Inf, "max": 5})"))); - EXPECT_THAT( - MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", R"([5, 1, 2, 3, 4])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": 1, "max": 5})"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", - R"([5, -Inf, 2, 3, 4])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": -Inf, "max": 5})"))); + CheckDictionaryMinMax(value_ty, chunked, "-Inf", "5", options); + CheckDictionaryMinMax(index_type, value_ty, R"([0, 1, 2, 3, 4])", + R"([5, 1, 2, 3, 4])", "1", "5", options); + CheckDictionaryMinMax(index_type, value_ty, R"([0, 1, 2, 3, 4])", + R"([5, -Inf, 2, 3, 4])", "-Inf", "5", options); } } } @@ -2367,29 +2297,23 @@ TEST(TestDictionaryMinMaxKernel, FloatsValue) { TEST(TestDictionaryMinMaxKernel, BinarysValue) { ScalarAggregateOptions options; std::shared_ptr dict_ty; - std::shared_ptr ty; for (const auto& index_type : all_dictionary_index_types()) { ARROW_SCOPED_TRACE("index_type = ", index_type->ToString()); - for (const auto& value_ty_ : {fixed_size_binary(2), binary(), large_binary()}) { - dict_ty = dictionary(index_type, value_ty_); - ty = struct_({field("min", value_ty_), field("max", value_ty_)}); - + for (const auto& value_ty : {fixed_size_binary(2), binary(), large_binary()}) { + dict_ty = dictionary(index_type, value_ty); auto chunk1 = DictArrayFromJSON(dict_ty, R"([null, 0])", R"(["hz"])"); auto chunk2 = DictArrayFromJSON(dict_ty, R"([0, 1, 1])", R"(["aa", "bb"])"); ASSERT_OK_AND_ASSIGN(auto chunked, ChunkedArray::Make({chunk1, chunk2})); - EXPECT_THAT(MinMax(chunked, options), - ResultWith(ScalarFromJSON(ty, R"({"min": "aa", "max": "hz"})"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", - R"(["hz", "bb", "bf", "cc", "fa"])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": "bb", "max": "hz"})"))); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([0, 1, 2, 3, 4])", - R"(["hz", "aa", "bf", "cc", "fa"])"), - options), - ResultWith(ScalarFromJSON(ty, R"({"min": "aa", "max": "hz"})"))); + CheckDictionaryMinMax(value_ty, chunked, R"("aa")", R"("hz")", options); + CheckDictionaryMinMax(index_type, value_ty, R"([0, 1, 2, 3, 4])", + R"(["hz", "bb", "bf", "cc", "fa"])", R"("bb")", R"("hz")", + options); + CheckDictionaryMinMax(index_type, value_ty, R"([0, 1, 2, 3, 4])", + R"(["hz", "aa", "bf", "cc", "fa"])", R"("aa")", R"("hz")", + options); } } } @@ -2397,19 +2321,14 @@ TEST(TestDictionaryMinMaxKernel, BinarysValue) { TEST(TestDictionaryMinMaxKernel, NullValue) { ScalarAggregateOptions options; std::shared_ptr value_ty = null(); - std::shared_ptr dict_ty; - std::shared_ptr ty = - struct_({field("min", value_ty), field("max", value_ty)}); for (const auto& index_type : all_dictionary_index_types()) { ARROW_SCOPED_TRACE("index_type = ", index_type->ToString()); - dict_ty = dictionary(index_type, value_ty); - Datum result = ScalarFromJSON(ty, "[null, null]"); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([null, null])", R"([])"), options), - ResultWith(result)); - EXPECT_THAT(MinMax(DictArrayFromJSON(dict_ty, R"([])", R"([])"), options), - ResultWith(result)); + CheckDictionaryMinMax(index_type, value_ty, R"([null, null])", R"([])", "null", + "null", options); + CheckDictionaryMinMax(index_type, value_ty, R"([])", R"([])", "null", "null", + options); } } From 765512293db030a4e2cc458f8147ff4f80b40a92 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Thu, 30 Nov 2023 00:09:02 +0800 Subject: [PATCH 56/60] logic null count --- cpp/src/arrow/compute/kernels/aggregate_basic_internal.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index e84dc8a8cb1..837ab2045a9 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -1068,13 +1068,13 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { ARROW_ASSIGN_OR_RAISE(auto compacted_arr, dict_arr.Compact(ctx->memory_pool())); const DictionaryArray& compacted_dict_arr = checked_cast(*compacted_arr); - const int64_t non_null_count = - compacted_dict_arr.length() - compacted_dict_arr.null_count(); + const int64_t null_count = compacted_dict_arr.ComputeLogicalNullCount(); + const int64_t non_null_count = compacted_dict_arr.length() - null_count; if (non_null_count == 0) { return Status::OK(); } - this->has_nulls |= compacted_dict_arr.null_count() > 0; + this->has_nulls |= null_count > 0; this->count += non_null_count; if ((this->has_nulls && !options.skip_nulls) || (this->count < options.min_count)) { return Status::OK(); From 936924ea9fb4f6448e6df6a3b8a10dfc88fd5b77 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Thu, 30 Nov 2023 09:24:08 +0800 Subject: [PATCH 57/60] add test --- .../kernels/aggregate_basic_internal.h | 7 +- .../arrow/compute/kernels/aggregate_test.cc | 99 ++++++++++++++++++- 2 files changed, 97 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index 837ab2045a9..f80bc5cadea 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -1070,13 +1070,10 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { checked_cast(*compacted_arr); const int64_t null_count = compacted_dict_arr.ComputeLogicalNullCount(); const int64_t non_null_count = compacted_dict_arr.length() - null_count; - if (non_null_count == 0) { - return Status::OK(); - } this->has_nulls |= null_count > 0; this->count += non_null_count; - if ((this->has_nulls && !options.skip_nulls) || (this->count < options.min_count)) { + if ((this->has_nulls && !options.skip_nulls) || (non_null_count == 0)) { return Status::OK(); } @@ -1092,7 +1089,7 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { auto&& other = checked_cast(src); this->has_nulls |= other.has_nulls; this->count += other.count; - if ((this->has_nulls && !options.skip_nulls) || (this->count < options.min_count)) { + if ((this->has_nulls && !options.skip_nulls) || other.value_state == nullptr) { return Status::OK(); } diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index df3d387f2da..b40ba2e2c6a 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -2215,11 +2215,102 @@ TEST(TestDictionaryMinMaxKernel, IntegersValue) { ASSERT_OK_AND_ASSIGN(auto chunked, ChunkedArray::Make({chunk1, chunk2})); options = ScalarAggregateOptions(/*skip_nulls=*/true); - CheckDictionaryMinMax(value_ty, chunked, "1", "5", options); - CheckDictionaryMinMax(index_type, value_ty, R"([0, 1, 2, 3, 4])", + CheckDictionaryMinMax(value_ty, chunked, "1", "5", options); // chunked + CheckDictionaryMinMax(index_type, value_ty, R"([0, 1, 2, 3, 4])", // noraml R"([5, 1, 2, 3, 4])", "1", "5", options); - CheckDictionaryMinMax(index_type, value_ty, R"([0, 1, 2, 3, 4])", - R"([5, 9, 2, 3, 4])", "2", "9", options); + CheckDictionaryMinMax(index_type, value_ty, // null in indices + R"([0, 1, 2, 3, null])", R"([5, 9, 2, 3])", "2", "9", + options); + CheckDictionaryMinMax(index_type, value_ty, R"([0, 1, 2, 3, 4])", // null in values + R"([null, null, 2, 3, 4])", "2", "4", options); + CheckDictionaryMinMax(index_type, value_ty, // null in both indices and values + R"([0, 1, 2, 3, null])", R"([null, null, 2, 3])", "2", "3", + options); + CheckDictionaryMinMax(index_type, value_ty, // unreferenced values + R"([0, 1, 2, 3, 5])", R"([5, 1, 2, 3, 4, 100, 101, 102])", "1", + "100", options); + CheckDictionaryMinMax(index_type, value_ty, // multiply referenced values + R"([0, 1, 2, 3, 5, 0, 3, 1])", R"([5, 1, 2, 3, 4, 100, 101])", + "1", "100", options); + + options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/4); + CheckDictionaryMinMax(value_ty, chunked, "1", "5", options); // chunked + CheckDictionaryMinMax(index_type, value_ty, R"([0, 1, 2, 3, 4])", // noraml + R"([5, 1, 2, 3, 4])", "1", "5", options); + CheckDictionaryMinMax(index_type, value_ty, R"([0, 1, 2])", // too short + R"([5, 1, 2])", "null", "null", options); + CheckDictionaryMinMax(index_type, value_ty, // null in indices + R"([0, 1, 2, 3, null])", R"([5, 9, 2, 3])", "2", "9", + options); + CheckDictionaryMinMax(index_type, value_ty, R"([0, 1, 2, 3, 4])", // null in values + R"([null, null, 2, 3, 4])", "null", "null", options); + CheckDictionaryMinMax(index_type, value_ty, // null in both indices and values + R"([0, 1, 2, 3, null])", R"([null, null, 2, 3])", "null", + "null", options); + CheckDictionaryMinMax(index_type, value_ty, // unreferenced values + R"([0, 1, 2, 3, 5])", R"([5, 1, 2, 3, 4, 100, 101, 102])", "1", + "100", options); + CheckDictionaryMinMax(index_type, value_ty, // unreferenced nulls + R"([0, 1, 2, 3, 5])", R"([5, 1, 2, 3, null, 100, null, null])", "1", + "100", options); + CheckDictionaryMinMax(index_type, value_ty, // multiply referenced values + R"([0, 1, 2, 3, 5, 0, 3, 1])", R"([5, 1, 2, 3, 4, 100])", + "1", "100", options); + CheckDictionaryMinMax(index_type, value_ty, // multiply referenced nulls + R"([0, 1, 2, 3, 5, 0, 3, 1])", R"([null, null, 2, null, 4, 100])", + "null", "null", options); + + options = ScalarAggregateOptions(/*skip_nulls=*/false); + CheckDictionaryMinMax(value_ty, chunked, "null", "null", options); // chunked + CheckDictionaryMinMax(index_type, value_ty, R"([0, 1, 2, 3, 4])", // noraml + R"([5, 1, 2, 3, 4])", "1", "5", options); + CheckDictionaryMinMax(index_type, value_ty, // null in indices + R"([0, 1, 2, 3, null])", R"([5, 9, 2, 3])", "null", "null", + options); + CheckDictionaryMinMax(index_type, value_ty, R"([0, 1, 2, 3, 4])", // null in values + R"([null, null, 2, 3, 4])", "null", "null", options); + CheckDictionaryMinMax(index_type, value_ty, // null in both indices and values + R"([0, 1, 2, 3, null])", R"([null, null, 2, 3])", "null", + "null", options); + CheckDictionaryMinMax(index_type, value_ty, // unreferenced values + R"([0, 1, 2, 3, 5])", R"([5, 1, 2, 3, 4, 100, 101, 102])", "1", + "100", options); + CheckDictionaryMinMax(index_type, value_ty, // unreferenced nulls + R"([0, 1, 2, 3, 5])", R"([5, 1, 2, 3, null, 100, null, null])", "1", + "100", options); + CheckDictionaryMinMax(index_type, value_ty, // multiply referenced values + R"([0, 1, 2, 3, 5, 0, 3, 1])", R"([5, 1, 2, 3, 4, 100])", + "1", "100", options); + CheckDictionaryMinMax(index_type, value_ty, // multiply referenced nulls + R"([0, 1, 2, 3, 5, 0, 3, 1])", R"([null, null, 2, null, 4, 100])", + "null", "null", options); + + options = ScalarAggregateOptions(/*skip_nulls=*/false, /*min_count=*/4); + CheckDictionaryMinMax(value_ty, chunked, "null", "null", options); // chunked + CheckDictionaryMinMax(index_type, value_ty, R"([0, 1, 2, 3, 4])", // noraml + R"([5, 1, 2, 3, 4])", "1", "5", options); + CheckDictionaryMinMax(index_type, value_ty, R"([0, 1, 2])", // too short + R"([5, 1, 2])", "null", "null", options); + CheckDictionaryMinMax(index_type, value_ty, // null in indices + R"([0, 1, 2, 3, null])", R"([5, 9, 2, 3])", "null", "null", + options); + CheckDictionaryMinMax(index_type, value_ty, R"([0, 1, 2, 3, 4])", // null in values + R"([null, null, 2, 3, 4])", "null", "null", options); + CheckDictionaryMinMax(index_type, value_ty, // null in both indices and values + R"([0, 1, 2, 3, null])", R"([null, null, 2, 3])", "null", + "null", options); + CheckDictionaryMinMax(index_type, value_ty, // unreferenced values + R"([0, 1, 2, 3, 5])", R"([5, 1, 2, 3, 4, 100, 101, 102])", "1", + "100", options); + CheckDictionaryMinMax(index_type, value_ty, // unreferenced nulls + R"([0, 1, 2, 3, 5])", R"([5, 1, 2, 3, null, 100, null, null])", "1", + "100", options); + CheckDictionaryMinMax(index_type, value_ty, // multiply referenced values + R"([0, 1, 2, 3, 5, 0, 3, 1])", R"([5, 1, 2, 3, 4, 100])", + "1", "100", options); + CheckDictionaryMinMax(index_type, value_ty, // multiply referenced nulls + R"([0, 1, 2, 3, 5, 0, 3, 1])", R"([null, null, 2, null, 4, 100])", + "null", "null", options); } } } From 05c85f1ed4aa003b2f59568ecede8d0635bf23be Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Thu, 30 Nov 2023 09:54:29 +0800 Subject: [PATCH 58/60] lint --- .../arrow/compute/kernels/aggregate_test.cc | 65 ++++++++++--------- 1 file changed, 34 insertions(+), 31 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index b40ba2e2c6a..04252dc63d3 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -2227,8 +2227,8 @@ TEST(TestDictionaryMinMaxKernel, IntegersValue) { R"([0, 1, 2, 3, null])", R"([null, null, 2, 3])", "2", "3", options); CheckDictionaryMinMax(index_type, value_ty, // unreferenced values - R"([0, 1, 2, 3, 5])", R"([5, 1, 2, 3, 4, 100, 101, 102])", "1", - "100", options); + R"([0, 1, 2, 3, 5])", R"([5, 1, 2, 3, 4, 100, 101, 102])", + "1", "100", options); CheckDictionaryMinMax(index_type, value_ty, // multiply referenced values R"([0, 1, 2, 3, 5, 0, 3, 1])", R"([5, 1, 2, 3, 4, 100, 101])", "1", "100", options); @@ -2248,17 +2248,18 @@ TEST(TestDictionaryMinMaxKernel, IntegersValue) { R"([0, 1, 2, 3, null])", R"([null, null, 2, 3])", "null", "null", options); CheckDictionaryMinMax(index_type, value_ty, // unreferenced values - R"([0, 1, 2, 3, 5])", R"([5, 1, 2, 3, 4, 100, 101, 102])", "1", - "100", options); - CheckDictionaryMinMax(index_type, value_ty, // unreferenced nulls - R"([0, 1, 2, 3, 5])", R"([5, 1, 2, 3, null, 100, null, null])", "1", - "100", options); - CheckDictionaryMinMax(index_type, value_ty, // multiply referenced values - R"([0, 1, 2, 3, 5, 0, 3, 1])", R"([5, 1, 2, 3, 4, 100])", + R"([0, 1, 2, 3, 5])", R"([5, 1, 2, 3, 4, 100, 101, 102])", "1", "100", options); + CheckDictionaryMinMax(index_type, value_ty, // unreferenced nulls + R"([0, 1, 2, 3, 5])", + R"([5, 1, 2, 3, null, 100, null, null])", "1", "100", + options); + CheckDictionaryMinMax(index_type, value_ty, // multiply referenced values + R"([0, 1, 2, 3, 5, 0, 3, 1])", R"([5, 1, 2, 3, 4, 100])", "1", + "100", options); CheckDictionaryMinMax(index_type, value_ty, // multiply referenced nulls - R"([0, 1, 2, 3, 5, 0, 3, 1])", R"([null, null, 2, null, 4, 100])", - "null", "null", options); + R"([0, 1, 2, 3, 5, 0, 3, 1])", + R"([null, null, 2, null, 4, 100])", "null", "null", options); options = ScalarAggregateOptions(/*skip_nulls=*/false); CheckDictionaryMinMax(value_ty, chunked, "null", "null", options); // chunked @@ -2273,21 +2274,22 @@ TEST(TestDictionaryMinMaxKernel, IntegersValue) { R"([0, 1, 2, 3, null])", R"([null, null, 2, 3])", "null", "null", options); CheckDictionaryMinMax(index_type, value_ty, // unreferenced values - R"([0, 1, 2, 3, 5])", R"([5, 1, 2, 3, 4, 100, 101, 102])", "1", - "100", options); - CheckDictionaryMinMax(index_type, value_ty, // unreferenced nulls - R"([0, 1, 2, 3, 5])", R"([5, 1, 2, 3, null, 100, null, null])", "1", - "100", options); - CheckDictionaryMinMax(index_type, value_ty, // multiply referenced values - R"([0, 1, 2, 3, 5, 0, 3, 1])", R"([5, 1, 2, 3, 4, 100])", + R"([0, 1, 2, 3, 5])", R"([5, 1, 2, 3, 4, 100, 101, 102])", "1", "100", options); + CheckDictionaryMinMax(index_type, value_ty, // unreferenced nulls + R"([0, 1, 2, 3, 5])", + R"([5, 1, 2, 3, null, 100, null, null])", "1", "100", + options); + CheckDictionaryMinMax(index_type, value_ty, // multiply referenced values + R"([0, 1, 2, 3, 5, 0, 3, 1])", R"([5, 1, 2, 3, 4, 100])", "1", + "100", options); CheckDictionaryMinMax(index_type, value_ty, // multiply referenced nulls - R"([0, 1, 2, 3, 5, 0, 3, 1])", R"([null, null, 2, null, 4, 100])", - "null", "null", options); + R"([0, 1, 2, 3, 5, 0, 3, 1])", + R"([null, null, 2, null, 4, 100])", "null", "null", options); options = ScalarAggregateOptions(/*skip_nulls=*/false, /*min_count=*/4); - CheckDictionaryMinMax(value_ty, chunked, "null", "null", options); // chunked - CheckDictionaryMinMax(index_type, value_ty, R"([0, 1, 2, 3, 4])", // noraml + CheckDictionaryMinMax(value_ty, chunked, "null", "null", options); // chunked + CheckDictionaryMinMax(index_type, value_ty, R"([0, 1, 2, 3, 4])", // noraml R"([5, 1, 2, 3, 4])", "1", "5", options); CheckDictionaryMinMax(index_type, value_ty, R"([0, 1, 2])", // too short R"([5, 1, 2])", "null", "null", options); @@ -2300,17 +2302,18 @@ TEST(TestDictionaryMinMaxKernel, IntegersValue) { R"([0, 1, 2, 3, null])", R"([null, null, 2, 3])", "null", "null", options); CheckDictionaryMinMax(index_type, value_ty, // unreferenced values - R"([0, 1, 2, 3, 5])", R"([5, 1, 2, 3, 4, 100, 101, 102])", "1", - "100", options); - CheckDictionaryMinMax(index_type, value_ty, // unreferenced nulls - R"([0, 1, 2, 3, 5])", R"([5, 1, 2, 3, null, 100, null, null])", "1", - "100", options); - CheckDictionaryMinMax(index_type, value_ty, // multiply referenced values - R"([0, 1, 2, 3, 5, 0, 3, 1])", R"([5, 1, 2, 3, 4, 100])", + R"([0, 1, 2, 3, 5])", R"([5, 1, 2, 3, 4, 100, 101, 102])", "1", "100", options); + CheckDictionaryMinMax(index_type, value_ty, // unreferenced nulls + R"([0, 1, 2, 3, 5])", + R"([5, 1, 2, 3, null, 100, null, null])", "1", "100", + options); + CheckDictionaryMinMax(index_type, value_ty, // multiply referenced values + R"([0, 1, 2, 3, 5, 0, 3, 1])", R"([5, 1, 2, 3, 4, 100])", "1", + "100", options); CheckDictionaryMinMax(index_type, value_ty, // multiply referenced nulls - R"([0, 1, 2, 3, 5, 0, 3, 1])", R"([null, null, 2, null, 4, 100])", - "null", "null", options); + R"([0, 1, 2, 3, 5, 0, 3, 1])", + R"([null, null, 2, null, 4, 100])", "null", "null", options); } } } From 9260e2ff7dd8ece26f9f99537ba986ee2e7da24a Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Sun, 4 Feb 2024 18:33:27 +0800 Subject: [PATCH 59/60] add a comment --- cpp/src/arrow/compute/kernels/aggregate_basic_internal.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index f80bc5cadea..abb3bb34178 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -1064,6 +1064,8 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { } RETURN_NOT_OK(this->InitValueState()); + // The minmax is computed from dictionay values, in case some values are not + // referenced by indices, a compaction needs to be excuted here. DictionaryArray dict_arr(batch[0].array.ToArrayData()); ARROW_ASSIGN_OR_RAISE(auto compacted_arr, dict_arr.Compact(ctx->memory_pool())); const DictionaryArray& compacted_dict_arr = From 0e53afecb9a43cfa2ac55128c257a2818ff8f13e Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Sun, 4 Feb 2024 19:06:44 +0800 Subject: [PATCH 60/60] local var --- cpp/src/arrow/compute/kernels/aggregate_basic_internal.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index abb3bb34178..e74addf2c4c 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -1135,8 +1135,9 @@ struct DictionaryMinMaxImpl : public ScalarAggregator { inline Status InitValueState() { if (this->value_state == nullptr) { const DataType& value_type_ref = checked_cast(*this->value_type); - MinMaxInitState valueMinMaxInitState( - nullptr, value_type_ref, out_type, ScalarAggregateOptions::Defaults()); + ScalarAggregateOptions options = ScalarAggregateOptions::Defaults(); + MinMaxInitState valueMinMaxInitState(nullptr, value_type_ref, + out_type, options); ARROW_ASSIGN_OR_RAISE(this->value_state, valueMinMaxInitState.Create()); } return Status::OK();