Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions cpp/src/arrow/acero/hash_aggregate_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2000,6 +2000,74 @@ TEST_P(GroupBy, MinMaxScalar) {
}
}

TEST_P(GroupBy, MinMaxWithNaN) {
auto in_schema = schema({
field("argument1", float32()),
field("argument2", float64()),
field("key", int64()),
});
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we also want a float16 column? :)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The min/max kernel for half-float is not implemented so we are not able to test it.

The current half-float is not intact in terms of: 1) whether the type is in floating point category is inconsistent: e.g. type trait is_floating_type<HalfFloatType> is true but g_floating_types doesn't include it (that's why floating point kernels don't register for half-float). 2) Some std functions don't work with our half-float representation: e.g. std::is_nan/fmin/fmax, it won't compile if we try to add certain half-float kernels.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, I see. Thanks for the clarification.

for (bool use_threads : {true, false}) {
SCOPED_TRACE(use_threads ? "parallel/merged" : "serial");

auto table = TableFromJSON(in_schema, {R"([
[NaN, NaN, 1],
[NaN, NaN, 2],
[NaN, NaN, 3]
])",
R"([
[NaN, NaN, 1],
[-Inf, -Inf, 2],
[Inf, Inf, 3]
])"});

ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped,
GroupByTest(
{
table->GetColumnByName("argument1"),
table->GetColumnByName("argument1"),
table->GetColumnByName("argument1"),
table->GetColumnByName("argument2"),
table->GetColumnByName("argument2"),
table->GetColumnByName("argument2"),
},
{table->GetColumnByName("key")},
{
{"hash_min", nullptr},
{"hash_max", nullptr},
{"hash_min_max", nullptr},
{"hash_min", nullptr},
{"hash_max", nullptr},
{"hash_min_max", nullptr},
},
use_threads));
ValidateOutput(aggregated_and_grouped);
SortBy({"key_0"}, &aggregated_and_grouped);

AssertDatumsEqual(ArrayFromJSON(struct_({
field("key_0", int64()),
field("hash_min", float32()),
field("hash_max", float32()),
field("hash_min_max", struct_({
field("min", float32()),
field("max", float32()),
})),
field("hash_min", float64()),
field("hash_max", float64()),
field("hash_min_max", struct_({
field("min", float64()),
field("max", float64()),
})),
}),
R"([
[1, NaN, NaN, {"min": NaN, "max": NaN}, NaN, NaN, {"min": NaN, "max": NaN}],
[2, -Inf, -Inf, {"min": -Inf, "max": -Inf}, -Inf, -Inf, {"min": -Inf, "max": -Inf}],
[3, Inf, Inf, {"min": Inf, "max": Inf}, Inf, Inf, {"min": Inf, "max": Inf}]
])"),
aggregated_and_grouped,
/*verbose=*/true);
}
}

TEST_P(GroupBy, AnyAndAll) {
for (bool use_threads : {true, false}) {
SCOPED_TRACE(use_threads ? "parallel/merged" : "serial");
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/arrow/compute/kernels/aggregate_basic.inc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -694,8 +694,8 @@ struct MinMaxState<ArrowType, SimdLevel, enable_if_floating_point<ArrowType>> {
this->max = std::fmax(this->max, value);
}

T min = std::numeric_limits<T>::infinity();
T max = -std::numeric_limits<T>::infinity();
T min = std::numeric_limits<T>::quiet_NaN();
T max = std::numeric_limits<T>::quiet_NaN();
bool has_nulls = false;
};

Expand Down
22 changes: 22 additions & 0 deletions cpp/src/arrow/compute/kernels/aggregate_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1841,6 +1841,24 @@ class TestPrimitiveMinMaxKernel : public ::testing::Test {
AssertMinMaxIsNull(array, options);
}

void AssertMinMaxIsNaN(const Datum& array, const ScalarAggregateOptions& options) {
ASSERT_OK_AND_ASSIGN(Datum out, MinMax(array, options));
for (const auto& val : out.scalar_as<StructScalar>().value) {
ASSERT_TRUE(std::isnan(checked_cast<const ScalarType&>(*val).value));
}
}

void AssertMinMaxIsNaN(const std::string& json, const ScalarAggregateOptions& options) {
auto array = ArrayFromJSON(type_singleton(), json);
AssertMinMaxIsNaN(array, options);
}

void AssertMinMaxIsNaN(const std::vector<std::string>& json,
const ScalarAggregateOptions& options) {
auto array = ChunkedArrayFromJSON(type_singleton(), json);
AssertMinMaxIsNaN(array, options);
}

std::shared_ptr<DataType> type_singleton() {
return default_type_instance<ArrowType>();
}
Expand Down Expand Up @@ -1963,6 +1981,9 @@ TYPED_TEST(TestFloatingMinMaxKernel, Floats) {
this->AssertMinMaxIs("[5, Inf, 2, 3, 4]", 2.0, INFINITY, options);
this->AssertMinMaxIs("[5, NaN, 2, 3, 4]", 2, 5, options);
this->AssertMinMaxIs("[5, -Inf, 2, 3, 4]", -INFINITY, 5, options);
this->AssertMinMaxIs("[NaN, null, 42]", 42, 42, options);
this->AssertMinMaxIsNaN("[NaN, NaN]", options);
this->AssertMinMaxIsNaN("[NaN, null]", options);
this->AssertMinMaxIs(chunked_input1, 1, 9, options);
this->AssertMinMaxIs(chunked_input2, 1, 9, options);
this->AssertMinMaxIs(chunked_input3, 1, 9, options);
Expand All @@ -1980,6 +2001,7 @@ TYPED_TEST(TestFloatingMinMaxKernel, Floats) {
this->AssertMinMaxIs("[5, -Inf, 2, 3, 4]", -INFINITY, 5, options);
this->AssertMinMaxIsNull("[5, null, 2, 3, 4]", options);
this->AssertMinMaxIsNull("[5, -Inf, null, 3, 4]", options);
this->AssertMinMaxIsNull("[NaN, null]", options);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are these tests also executed with Float16 or is there a separate test for it?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as my other comment.

this->AssertMinMaxIsNull(chunked_input1, options);
this->AssertMinMaxIsNull(chunked_input2, options);
this->AssertMinMaxIsNull(chunked_input3, options);
Expand Down
82 changes: 42 additions & 40 deletions cpp/src/arrow/compute/kernels/hash_aggregate.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
// under the License.

#include <cmath>
#include <concepts>
#include <functional>
#include <memory>
#include <string>
Expand Down Expand Up @@ -270,52 +271,53 @@ struct GroupedCountImpl : public GroupedAggregator {
// ----------------------------------------------------------------------
// MinMax implementation

// XXX: Consider making these concepts complete and moving to public header.

template <typename T>
concept CBooleanConcept = std::same_as<T, bool>;

// XXX: Ideally we want to have std::floating_point<Float16> = true.
template <typename T>
concept CFloatingPointConcept = std::floating_point<T> || std::same_as<T, util::Float16>;

template <typename T>
concept CDecimalConcept = std::same_as<T, Decimal32> || std::same_as<T, Decimal64> ||
std::same_as<T, Decimal128> || std::same_as<T, Decimal256>;

template <typename CType>
struct AntiExtrema {
static constexpr CType anti_min() { return std::numeric_limits<CType>::max(); }
static constexpr CType anti_max() { return std::numeric_limits<CType>::min(); }
};

template <>
struct AntiExtrema<bool> {
static constexpr bool anti_min() { return true; }
static constexpr bool anti_max() { return false; }
};

template <>
struct AntiExtrema<float> {
static constexpr float anti_min() { return std::numeric_limits<float>::infinity(); }
static constexpr float anti_max() { return -std::numeric_limits<float>::infinity(); }
template <CBooleanConcept CType>
struct AntiExtrema<CType> {
static constexpr CType anti_min() { return true; }
static constexpr CType anti_max() { return false; }
};

template <>
struct AntiExtrema<double> {
static constexpr double anti_min() { return std::numeric_limits<double>::infinity(); }
static constexpr double anti_max() { return -std::numeric_limits<double>::infinity(); }
template <CFloatingPointConcept CType>
struct AntiExtrema<CType> {
static constexpr CType anti_min() { return std::numeric_limits<CType>::quiet_NaN(); }
static constexpr CType anti_max() { return std::numeric_limits<CType>::quiet_NaN(); }
};

template <>
struct AntiExtrema<Decimal32> {
static constexpr Decimal32 anti_min() { return BasicDecimal32::GetMaxSentinel(); }
static constexpr Decimal32 anti_max() { return BasicDecimal32::GetMinSentinel(); }
template <CDecimalConcept CType>
struct AntiExtrema<CType> {
static constexpr CType anti_min() { return CType::GetMaxSentinel(); }
static constexpr CType anti_max() { return CType::GetMinSentinel(); }
};

template <>
struct AntiExtrema<Decimal64> {
static constexpr Decimal64 anti_min() { return BasicDecimal64::GetMaxSentinel(); }
static constexpr Decimal64 anti_max() { return BasicDecimal64::GetMinSentinel(); }
};

template <>
struct AntiExtrema<Decimal128> {
static constexpr Decimal128 anti_min() { return BasicDecimal128::GetMaxSentinel(); }
static constexpr Decimal128 anti_max() { return BasicDecimal128::GetMinSentinel(); }
template <typename CType>
struct MinMaxOp {
static constexpr CType min(CType a, CType b) { return std::min(a, b); }
static constexpr CType max(CType a, CType b) { return std::max(a, b); }
};

template <>
struct AntiExtrema<Decimal256> {
static constexpr Decimal256 anti_min() { return BasicDecimal256::GetMaxSentinel(); }
static constexpr Decimal256 anti_max() { return BasicDecimal256::GetMinSentinel(); }
template <CFloatingPointConcept CType>
struct MinMaxOp<CType> {
static constexpr CType min(CType a, CType b) { return std::fmin(a, b); }
static constexpr CType max(CType a, CType b) { return std::fmax(a, b); }
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do std::fmin/fmax actually accept util::Float16??

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, but we don't either specialize for Float16 so it compiles. Please see my other comment.

};

template <typename Type, typename Enable = void>
Expand Down Expand Up @@ -352,8 +354,8 @@ struct GroupedMinMaxImpl final : public GroupedAggregator {
VisitGroupedValues<Type>(
batch,
[&](uint32_t g, CType val) {
GetSet::Set(raw_mins, g, std::min(GetSet::Get(raw_mins, g), val));
GetSet::Set(raw_maxes, g, std::max(GetSet::Get(raw_maxes, g), val));
GetSet::Set(raw_mins, g, MinMaxOp<CType>::min(GetSet::Get(raw_mins, g), val));
GetSet::Set(raw_maxes, g, MinMaxOp<CType>::max(GetSet::Get(raw_maxes, g), val));
bit_util::SetBit(has_values_.mutable_data(), g);
},
[&](uint32_t g) { bit_util::SetBit(has_nulls_.mutable_data(), g); });
Expand All @@ -373,12 +375,12 @@ struct GroupedMinMaxImpl final : public GroupedAggregator {
auto g = group_id_mapping.GetValues<uint32_t>(1);
for (uint32_t other_g = 0; static_cast<int64_t>(other_g) < group_id_mapping.length;
++other_g, ++g) {
GetSet::Set(
raw_mins, *g,
std::min(GetSet::Get(raw_mins, *g), GetSet::Get(other_raw_mins, other_g)));
GetSet::Set(
raw_maxes, *g,
std::max(GetSet::Get(raw_maxes, *g), GetSet::Get(other_raw_maxes, other_g)));
GetSet::Set(raw_mins, *g,
MinMaxOp<CType>::min(GetSet::Get(raw_mins, *g),
GetSet::Get(other_raw_mins, other_g)));
GetSet::Set(raw_maxes, *g,
MinMaxOp<CType>::max(GetSet::Get(raw_maxes, *g),
GetSet::Get(other_raw_maxes, other_g)));

if (bit_util::GetBit(other->has_values_.data(), other_g)) {
bit_util::SetBit(has_values_.mutable_data(), *g);
Expand Down
Loading