From 7317ced67fac74e54eb50564593b7766de6f8212 Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Mon, 8 Dec 2025 17:59:09 +0800 Subject: [PATCH 1/5] Fix the issue that MinMax kernel emits inf for all NaN array --- .../compute/kernels/aggregate_basic.inc.cc | 4 ++-- .../arrow/compute/kernels/aggregate_test.cc | 22 +++++++++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.inc.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.inc.cc index 7f2bce4063d..3733f415a04 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.inc.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.inc.cc @@ -694,8 +694,8 @@ struct MinMaxState> { this->max = std::fmax(this->max, value); } - T min = std::numeric_limits::infinity(); - T max = -std::numeric_limits::infinity(); + T min = std::numeric_limits::quiet_NaN(); + T max = std::numeric_limits::quiet_NaN(); bool has_nulls = false; }; diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index a2bf0b97fd1..cdc62f946a9 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -1841,6 +1841,24 @@ class TestPrimitiveMinMaxKernel : public ::testing::Test { AssertMinMaxIsNull(array, options); } + void AssertMinMaxIsNaN(const Datum& array, const ScalarAggregateOptions& options) { + ASSERT_OK_AND_ASSIGN(Datum out, MinMax(array, options)); + for (const auto& val : out.scalar_as().value) { + ASSERT_TRUE(std::isnan(checked_cast(*val).value)); + } + } + + void AssertMinMaxIsNaN(const std::string& json, const ScalarAggregateOptions& options) { + auto array = ArrayFromJSON(type_singleton(), json); + AssertMinMaxIsNaN(array, options); + } + + void AssertMinMaxIsNaN(const std::vector& json, + const ScalarAggregateOptions& options) { + auto array = ChunkedArrayFromJSON(type_singleton(), json); + AssertMinMaxIsNaN(array, options); + } + std::shared_ptr type_singleton() { return default_type_instance(); } @@ -1963,6 +1981,9 @@ TYPED_TEST(TestFloatingMinMaxKernel, Floats) { this->AssertMinMaxIs("[5, Inf, 2, 3, 4]", 2.0, INFINITY, options); this->AssertMinMaxIs("[5, NaN, 2, 3, 4]", 2, 5, options); this->AssertMinMaxIs("[5, -Inf, 2, 3, 4]", -INFINITY, 5, options); + this->AssertMinMaxIs("[NaN, null, 42]", 42, 42, options); + this->AssertMinMaxIsNaN("[NaN, NaN]", options); + this->AssertMinMaxIsNaN("[NaN, null]", options); this->AssertMinMaxIs(chunked_input1, 1, 9, options); this->AssertMinMaxIs(chunked_input2, 1, 9, options); this->AssertMinMaxIs(chunked_input3, 1, 9, options); @@ -1980,6 +2001,7 @@ TYPED_TEST(TestFloatingMinMaxKernel, Floats) { this->AssertMinMaxIs("[5, -Inf, 2, 3, 4]", -INFINITY, 5, options); this->AssertMinMaxIsNull("[5, null, 2, 3, 4]", options); this->AssertMinMaxIsNull("[5, -Inf, null, 3, 4]", options); + this->AssertMinMaxIsNull("[NaN, null]", options); this->AssertMinMaxIsNull(chunked_input1, options); this->AssertMinMaxIsNull(chunked_input2, options); this->AssertMinMaxIsNull(chunked_input3, options); From 4894506ff33c8d15d80edf0d6eec10f91a6746c9 Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Mon, 5 Jan 2026 19:27:32 +0800 Subject: [PATCH 2/5] Address comment: fix hash aggregate as well --- cpp/src/arrow/acero/hash_aggregate_test.cc | 68 +++++++++++++++++++ .../arrow/compute/kernels/hash_aggregate.cc | 68 ++++++++----------- cpp/src/arrow/type_traits.h | 20 ++++++ 3 files changed, 116 insertions(+), 40 deletions(-) diff --git a/cpp/src/arrow/acero/hash_aggregate_test.cc b/cpp/src/arrow/acero/hash_aggregate_test.cc index de414219eb4..12d24429cb6 100644 --- a/cpp/src/arrow/acero/hash_aggregate_test.cc +++ b/cpp/src/arrow/acero/hash_aggregate_test.cc @@ -2000,6 +2000,74 @@ TEST_P(GroupBy, MinMaxScalar) { } } +TEST_P(GroupBy, MinMaxWithNaN) { + auto in_schema = schema({ + field("argument1", float32()), + field("argument2", float64()), + field("key", int64()), + }); + for (bool use_threads : {true, false}) { + SCOPED_TRACE(use_threads ? "parallel/merged" : "serial"); + + auto table = TableFromJSON(in_schema, {R"([ + [NaN, NaN, 1], + [NaN, NaN, 2], + [NaN, NaN, 3] +])", + R"([ + [NaN, NaN, 1], + [-Inf, -Inf, 2], + [Inf, Inf, 3] +])"}); + + ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped, + GroupByTest( + { + table->GetColumnByName("argument1"), + table->GetColumnByName("argument1"), + table->GetColumnByName("argument1"), + table->GetColumnByName("argument2"), + table->GetColumnByName("argument2"), + table->GetColumnByName("argument2"), + }, + {table->GetColumnByName("key")}, + { + {"hash_min", nullptr}, + {"hash_max", nullptr}, + {"hash_min_max", nullptr}, + {"hash_min", nullptr}, + {"hash_max", nullptr}, + {"hash_min_max", nullptr}, + }, + use_threads)); + ValidateOutput(aggregated_and_grouped); + SortBy({"key_0"}, &aggregated_and_grouped); + + AssertDatumsEqual(ArrayFromJSON(struct_({ + field("key_0", int64()), + field("hash_min", float32()), + field("hash_max", float32()), + field("hash_min_max", struct_({ + field("min", float32()), + field("max", float32()), + })), + field("hash_min", float64()), + field("hash_max", float64()), + field("hash_min_max", struct_({ + field("min", float64()), + field("max", float64()), + })), + }), + R"([ + [1, NaN, NaN, {"min": NaN, "max": NaN}, NaN, NaN, {"min": NaN, "max": NaN}], + [2, -Inf, -Inf, {"min": -Inf, "max": -Inf}, -Inf, -Inf, {"min": -Inf, "max": -Inf}], + [3, Inf, Inf, {"min": Inf, "max": Inf}, Inf, Inf, {"min": Inf, "max": Inf}] + ])"), + aggregated_and_grouped, + /*verbose=*/true); + } +} + TEST_P(GroupBy, AnyAndAll) { for (bool use_threads : {true, false}) { SCOPED_TRACE(use_threads ? "parallel/merged" : "serial"); diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/cpp/src/arrow/compute/kernels/hash_aggregate.cc index 19f7fc2e5b0..6cf3f63d087 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc @@ -276,46 +276,34 @@ struct AntiExtrema { static constexpr CType anti_max() { return std::numeric_limits::min(); } }; -template <> -struct AntiExtrema { - static constexpr bool anti_min() { return true; } - static constexpr bool anti_max() { return false; } +template +struct AntiExtrema { + static constexpr CType anti_min() { return true; } + static constexpr CType anti_max() { return false; } }; -template <> -struct AntiExtrema { - static constexpr float anti_min() { return std::numeric_limits::infinity(); } - static constexpr float anti_max() { return -std::numeric_limits::infinity(); } +template +struct AntiExtrema { + static constexpr CType anti_min() { return std::numeric_limits::quiet_NaN(); } + static constexpr CType anti_max() { return std::numeric_limits::quiet_NaN(); } }; -template <> -struct AntiExtrema { - static constexpr double anti_min() { return std::numeric_limits::infinity(); } - static constexpr double anti_max() { return -std::numeric_limits::infinity(); } +template +struct AntiExtrema { + static constexpr CType anti_min() { return CType::GetMaxSentinel(); } + static constexpr CType anti_max() { return CType::GetMinSentinel(); } }; -template <> -struct AntiExtrema { - static constexpr Decimal32 anti_min() { return BasicDecimal32::GetMaxSentinel(); } - static constexpr Decimal32 anti_max() { return BasicDecimal32::GetMinSentinel(); } -}; - -template <> -struct AntiExtrema { - static constexpr Decimal64 anti_min() { return BasicDecimal64::GetMaxSentinel(); } - static constexpr Decimal64 anti_max() { return BasicDecimal64::GetMinSentinel(); } -}; - -template <> -struct AntiExtrema { - static constexpr Decimal128 anti_min() { return BasicDecimal128::GetMaxSentinel(); } - static constexpr Decimal128 anti_max() { return BasicDecimal128::GetMinSentinel(); } +template +struct MinMaxOp { + static constexpr CType min(CType a, CType b) { return std::min(a, b); } + static constexpr CType max(CType a, CType b) { return std::max(a, b); } }; -template <> -struct AntiExtrema { - static constexpr Decimal256 anti_min() { return BasicDecimal256::GetMaxSentinel(); } - static constexpr Decimal256 anti_max() { return BasicDecimal256::GetMinSentinel(); } +template +struct MinMaxOp { + static constexpr CType min(CType a, CType b) { return std::fmin(a, b); } + static constexpr CType max(CType a, CType b) { return std::fmax(a, b); } }; template @@ -352,8 +340,8 @@ struct GroupedMinMaxImpl final : public GroupedAggregator { VisitGroupedValues( batch, [&](uint32_t g, CType val) { - GetSet::Set(raw_mins, g, std::min(GetSet::Get(raw_mins, g), val)); - GetSet::Set(raw_maxes, g, std::max(GetSet::Get(raw_maxes, g), val)); + GetSet::Set(raw_mins, g, MinMaxOp::min(GetSet::Get(raw_mins, g), val)); + GetSet::Set(raw_maxes, g, MinMaxOp::max(GetSet::Get(raw_maxes, g), val)); bit_util::SetBit(has_values_.mutable_data(), g); }, [&](uint32_t g) { bit_util::SetBit(has_nulls_.mutable_data(), g); }); @@ -373,12 +361,12 @@ struct GroupedMinMaxImpl final : public GroupedAggregator { auto g = group_id_mapping.GetValues(1); for (uint32_t other_g = 0; static_cast(other_g) < group_id_mapping.length; ++other_g, ++g) { - GetSet::Set( - raw_mins, *g, - std::min(GetSet::Get(raw_mins, *g), GetSet::Get(other_raw_mins, other_g))); - GetSet::Set( - raw_maxes, *g, - std::max(GetSet::Get(raw_maxes, *g), GetSet::Get(other_raw_maxes, other_g))); + GetSet::Set(raw_mins, *g, + MinMaxOp::min(GetSet::Get(raw_mins, *g), + GetSet::Get(other_raw_mins, other_g))); + GetSet::Set(raw_maxes, *g, + MinMaxOp::max(GetSet::Get(raw_maxes, *g), + GetSet::Get(other_raw_maxes, other_g))); if (bit_util::GetBit(other->has_values_.data(), other_g)) { bit_util::SetBit(has_values_.mutable_data(), *g); diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 1b7a02e1085..16e43b9c9f9 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -17,6 +17,7 @@ #pragma once +#include #include #include #include @@ -1837,4 +1838,23 @@ constexpr bool is_union(const DataType& type) { return is_union(type.id()); } /// @} +/// \addtogroup c-type-concepts +/// @{ + +// XXX: To be completed with more concepts as needed. + +template +concept CBooleanConcept = std::is_same_v; + +// XXX: Ideally we want to have std::floating_point = true. +template +concept CFloatingPointConcept = + std::floating_point || std::is_same_v; + +template +concept CDecimalConcept = std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v; + +/// @} + } // namespace arrow From 7559543395dfe8d9a37f0138bdf25d701ec8589e Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Mon, 5 Jan 2026 19:33:37 +0800 Subject: [PATCH 3/5] Fix format --- cpp/src/arrow/type_traits.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 16e43b9c9f9..188a7de6f38 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -1853,7 +1853,7 @@ concept CFloatingPointConcept = template concept CDecimalConcept = std::is_same_v || std::is_same_v || - std::is_same_v || std::is_same_v; + std::is_same_v || std::is_same_v; /// @} From 4730f956bb87f00255865d9ce6fc10fe9bc0064a Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Tue, 6 Jan 2026 18:13:13 +0800 Subject: [PATCH 4/5] Address comments: Move type concepts from public header to local cpp --- .../arrow/compute/kernels/hash_aggregate.cc | 15 ++++++++++++++ cpp/src/arrow/type_traits.h | 20 ------------------- 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/cpp/src/arrow/compute/kernels/hash_aggregate.cc index 6cf3f63d087..61c84addc72 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc @@ -16,6 +16,7 @@ // under the License. #include +#include #include #include #include @@ -270,6 +271,20 @@ struct GroupedCountImpl : public GroupedAggregator { // ---------------------------------------------------------------------- // MinMax implementation +// XXX: Consider making these concepts complete and moving to public header. + +template +concept CBooleanConcept = std::is_same_v; + +// XXX: Ideally we want to have std::floating_point = true. +template +concept CFloatingPointConcept = + std::floating_point || std::is_same_v; + +template +concept CDecimalConcept = std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v; + template struct AntiExtrema { static constexpr CType anti_min() { return std::numeric_limits::max(); } diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 188a7de6f38..1b7a02e1085 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -17,7 +17,6 @@ #pragma once -#include #include #include #include @@ -1838,23 +1837,4 @@ constexpr bool is_union(const DataType& type) { return is_union(type.id()); } /// @} -/// \addtogroup c-type-concepts -/// @{ - -// XXX: To be completed with more concepts as needed. - -template -concept CBooleanConcept = std::is_same_v; - -// XXX: Ideally we want to have std::floating_point = true. -template -concept CFloatingPointConcept = - std::floating_point || std::is_same_v; - -template -concept CDecimalConcept = std::is_same_v || std::is_same_v || - std::is_same_v || std::is_same_v; - -/// @} - } // namespace arrow From 461bd5c20c51f4d205281169975bbb79b1e59121 Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Tue, 6 Jan 2026 23:46:14 +0800 Subject: [PATCH 5/5] Address comment: Use std::same_as instead of std::is_same_v --- cpp/src/arrow/compute/kernels/hash_aggregate.cc | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/cpp/src/arrow/compute/kernels/hash_aggregate.cc index 61c84addc72..2ab5e574e22 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc @@ -274,16 +274,15 @@ struct GroupedCountImpl : public GroupedAggregator { // XXX: Consider making these concepts complete and moving to public header. template -concept CBooleanConcept = std::is_same_v; +concept CBooleanConcept = std::same_as; // XXX: Ideally we want to have std::floating_point = true. template -concept CFloatingPointConcept = - std::floating_point || std::is_same_v; +concept CFloatingPointConcept = std::floating_point || std::same_as; template -concept CDecimalConcept = std::is_same_v || std::is_same_v || - std::is_same_v || std::is_same_v; +concept CDecimalConcept = std::same_as || std::same_as || + std::same_as || std::same_as; template struct AntiExtrema {