From 3374ac3196770b1db55de657cb162341b93d9bc4 Mon Sep 17 00:00:00 2001 From: Arash Andishgar Date: Tue, 13 May 2025 11:54:46 +0330 Subject: [PATCH 01/11] add Approximate equal --- cpp/src/arrow/array/statistics.cc | 66 ++++++++++++++- cpp/src/arrow/array/statistics.h | 17 ++-- cpp/src/arrow/array/statistics_test.cc | 107 +++++++++++++++++++++---- 3 files changed, 163 insertions(+), 27 deletions(-) diff --git a/cpp/src/arrow/array/statistics.cc b/cpp/src/arrow/array/statistics.cc index b661c9fbaff..2d28d22fd46 100644 --- a/cpp/src/arrow/array/statistics.cc +++ b/cpp/src/arrow/array/statistics.cc @@ -15,7 +15,67 @@ // specific language governing permissions and limitations // under the License. -// This empty .cc file is for embedding not inlined symbols in -// arrow::ArrayStatistics into libarrow. - #include "arrow/array/statistics.h" + +#include +#include + +#include "arrow/compare.h" +namespace arrow { +using ValueType = ArrayStatistics::ValueType; +namespace { +bool DoubleEquals(const double& left, const double& right, const bool is_approximate, + const EqualOptions& options) { + if (left == right) { + return options.signed_zeros_equal() || (std::signbit(left) == std::signbit(right)); + } else if (options.nans_equal() && (std::isnan(left) || std::isnan(right))) { + return true; + } else if (is_approximate) { + return std::fabs(left - right) <= options.atol(); + } else { + return false; + } +} + +bool ValueTypeEquals(const std::optional& left, + const std::optional& right, const EqualOptions& options, + const bool is_approximate) { + if (!left.has_value() && !right.has_value()) { + return true; + } else if (left->index() != right->index()) { + return false; + } else { + auto CheckVisitor = [&](const auto& v1, const auto& v2) { + using type_1 = std::decay_t; + using type_2 = std::decay_t; + if constexpr (std::conjunction_v, + std::is_same>) { + return DoubleEquals(v1, v2, is_approximate, options); + } else if constexpr (std::is_same_v) { + return v1 == v2; + } + // It is unreachable + return false; + }; + return std::visit(CheckVisitor, left.value(), right.value()); + } +} +bool ArrayStatisticsEquals(const ArrayStatistics& left, const ArrayStatistics& right, + const EqualOptions& equal_options, bool is_approximate) { + return left.null_count == right.null_count && + left.distinct_count == right.distinct_count && + left.is_min_exact == right.is_min_exact && + left.is_max_exact == right.is_max_exact && + ValueTypeEquals(left.min, right.min, equal_options, is_approximate) && + ValueTypeEquals(left.max, right.max, equal_options, is_approximate); +} +} // namespace +bool ArrayStatistics::Equals(const ArrayStatistics& other, + const EqualOptions& options) const { + return ArrayStatisticsEquals(*this, other, options, false); +} +bool ArrayStatistics::ApproximateEquals(const ArrayStatistics& other, + const EqualOptions& options) const { + return ArrayStatisticsEquals(*this, other, options, true); +} +} // namespace arrow diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h index 5380debe3b6..4e76122de95 100644 --- a/cpp/src/arrow/array/statistics.h +++ b/cpp/src/arrow/array/statistics.h @@ -22,6 +22,7 @@ #include #include +#include "arrow/compare.h" #include "arrow/type.h" #include "arrow/util/visibility.h" @@ -128,17 +129,13 @@ struct ARROW_EXPORT ArrayStatistics { bool is_max_exact = false; /// \brief Check two statistics for equality - bool Equals(const ArrayStatistics& other) const { - return null_count == other.null_count && distinct_count == other.distinct_count && - min == other.min && is_min_exact == other.is_min_exact && max == other.max && - is_max_exact == other.is_max_exact; - } - - /// \brief Check two statistics for equality - bool operator==(const ArrayStatistics& other) const { return Equals(other); } + bool Equals(const ArrayStatistics& other, + const EqualOptions& = EqualOptions::Defaults()) const; - /// \brief Check two statistics for not equality - bool operator!=(const ArrayStatistics& other) const { return !Equals(other); } + /// Check two statistics for approximate equality + /// epsilon is only used if the ArrayStatistics::ValueType is Double + bool ApproximateEquals(const ArrayStatistics& other, + const EqualOptions& = EqualOptions::Defaults()) const; }; } // namespace arrow diff --git a/cpp/src/arrow/array/statistics_test.cc b/cpp/src/arrow/array/statistics_test.cc index cf15a5d3829..929acab16df 100644 --- a/cpp/src/arrow/array/statistics_test.cc +++ b/cpp/src/arrow/array/statistics_test.cc @@ -15,9 +15,12 @@ // specific language governing permissions and limitations // under the License. +#include + #include #include "arrow/array/statistics.h" +#include "arrow/compare.h" namespace arrow { @@ -61,41 +64,117 @@ TEST(ArrayStatisticsTest, TestMax) { ASSERT_FALSE(statistics.is_max_exact); } -TEST(ArrayStatisticsTest, TestEquality) { +TEST(ArrayStatisticsTest, TestEqualsNonDoulbeValue) { ArrayStatistics statistics1; ArrayStatistics statistics2; - ASSERT_EQ(statistics1, statistics2); + ASSERT_TRUE(statistics1.Equals(statistics2)); statistics1.null_count = 29; - ASSERT_NE(statistics1, statistics2); + ASSERT_FALSE(statistics1.Equals(statistics2)); statistics2.null_count = 29; - ASSERT_EQ(statistics1, statistics2); + ASSERT_TRUE(statistics1.Equals(statistics2)); statistics1.distinct_count = 2929; - ASSERT_NE(statistics1, statistics2); + ASSERT_FALSE(statistics1.Equals(statistics2)); statistics2.distinct_count = 2929; - ASSERT_EQ(statistics1, statistics2); + ASSERT_TRUE(statistics1.Equals(statistics2)); statistics1.min = std::string("world"); - ASSERT_NE(statistics1, statistics2); + ASSERT_FALSE(statistics1.Equals(statistics2)); statistics2.min = std::string("world"); - ASSERT_EQ(statistics1, statistics2); + ASSERT_TRUE(statistics1.Equals(statistics2)); statistics1.is_min_exact = true; - ASSERT_NE(statistics1, statistics2); + ASSERT_FALSE(statistics1.Equals(statistics2)); statistics2.is_min_exact = true; - ASSERT_EQ(statistics1, statistics2); + ASSERT_TRUE(statistics1.Equals(statistics2)); statistics1.max = static_cast(-29); - ASSERT_NE(statistics1, statistics2); + ASSERT_FALSE(statistics1.Equals(statistics2)); statistics2.max = static_cast(-29); - ASSERT_EQ(statistics1, statistics2); + ASSERT_TRUE(statistics1.Equals(statistics2)); statistics1.is_max_exact = true; - ASSERT_NE(statistics1, statistics2); + ASSERT_FALSE(statistics1.Equals(statistics2)); statistics2.is_max_exact = true; - ASSERT_EQ(statistics1, statistics2); + ASSERT_TRUE(statistics1.Equals(statistics2)); + + // Test different index + statistics1.max = static_cast(29); + ASSERT_FALSE(statistics1.Equals(statistics2)); +} + +TEST(ArrayStatisticsTest, TestEqualsDoubleValue) { + ArrayStatistics statistics1; + ArrayStatistics statistics2; + EqualOptions options = EqualOptions::Defaults(); + auto Reset = [&]() { + statistics1.min = std::nullopt; + statistics2.min = std::nullopt; + }; + + ASSERT_TRUE(statistics1.Equals(statistics2)); + statistics1.min = 29.0; + ASSERT_FALSE(statistics1.Equals(statistics2)); + statistics2.min = 29.0; + ASSERT_TRUE(statistics1.Equals(statistics2)); + statistics2.min = 30; + ASSERT_FALSE(statistics1.Equals(statistics2)); + + // Check Signed Zeros + Reset(); + statistics1.min = +0; + statistics2.min = -0; + ASSERT_TRUE(statistics1.Equals(statistics2, options.signed_zeros_equal(true))); + ASSERT_TRUE(statistics1.Equals(statistics2, options.signed_zeros_equal(false))); + + // Check Infinity + Reset(); + auto infinity = std::numeric_limits::infinity(); + statistics1.min = infinity; + statistics2.min = infinity; + ASSERT_TRUE(statistics1.Equals(statistics2, options.signed_zeros_equal(true))); + ASSERT_TRUE(statistics1.Equals(statistics2, options.signed_zeros_equal(false))); + + statistics1.min = -infinity; + ASSERT_FALSE(statistics1.Equals(statistics2, options.signed_zeros_equal(true))); + ASSERT_FALSE(statistics1.Equals(statistics2, options.signed_zeros_equal(false))); + + statistics1.min = 0; + ASSERT_FALSE(statistics1.Equals(statistics2, options.signed_zeros_equal(true))); + ASSERT_FALSE(statistics1.Equals(statistics2, options.signed_zeros_equal(false))); + + // Check NAN + Reset(); + statistics1.min = static_cast(NAN); + statistics2.min = static_cast(NAN); + ASSERT_TRUE(statistics1.Equals(statistics2, options.nans_equal(true))); + ASSERT_FALSE(statistics1.Equals(statistics2, options.nans_equal(false))); + + statistics2.min = 2.0; + ASSERT_FALSE(statistics1.Equals(statistics2)); + + // Check Approximate float is false + Reset(); + statistics1.max = 0.5001f; + statistics2.max = 0.5; + ASSERT_FALSE(statistics1.Equals(statistics2)); +} +TEST(ArrayStatisticsTest, TestApproximateEqualsDoubleValue) { + ArrayStatistics statistics1; + ArrayStatistics statistics2; + EqualOptions options = EqualOptions::Defaults(); + + statistics1.max = 0.5001f; + statistics2.max = 0.5; + + ASSERT_FALSE(statistics1.Equals(statistics2, options.atol(1e-3))); + + ASSERT_TRUE(statistics1.ApproximateEquals(statistics2, options.atol(1e-3))); + ASSERT_TRUE(statistics2.ApproximateEquals(statistics1, options.atol(1e-3))); + ASSERT_FALSE(statistics1.Equals(statistics2, options.atol(1e-5))); + ASSERT_FALSE(statistics2.Equals(statistics1, options.atol(1e-5))); } } // namespace arrow From 168af2c42111c76866dba733562606f3002f7af4 Mon Sep 17 00:00:00 2001 From: Arash Andishgar Date: Tue, 13 May 2025 13:45:38 +0330 Subject: [PATCH 02/11] use correct format for double values --- cpp/src/arrow/array/statistics_test.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/array/statistics_test.cc b/cpp/src/arrow/array/statistics_test.cc index 929acab16df..a856ec9796c 100644 --- a/cpp/src/arrow/array/statistics_test.cc +++ b/cpp/src/arrow/array/statistics_test.cc @@ -119,15 +119,15 @@ TEST(ArrayStatisticsTest, TestEqualsDoubleValue) { ASSERT_FALSE(statistics1.Equals(statistics2)); statistics2.min = 29.0; ASSERT_TRUE(statistics1.Equals(statistics2)); - statistics2.min = 30; + statistics2.min = 30.0; ASSERT_FALSE(statistics1.Equals(statistics2)); // Check Signed Zeros Reset(); - statistics1.min = +0; - statistics2.min = -0; + statistics1.min = +0.0; + statistics2.min = -0.0; ASSERT_TRUE(statistics1.Equals(statistics2, options.signed_zeros_equal(true))); - ASSERT_TRUE(statistics1.Equals(statistics2, options.signed_zeros_equal(false))); + ASSERT_FALSE(statistics1.Equals(statistics2, options.signed_zeros_equal(false))); // Check Infinity Reset(); @@ -141,7 +141,7 @@ TEST(ArrayStatisticsTest, TestEqualsDoubleValue) { ASSERT_FALSE(statistics1.Equals(statistics2, options.signed_zeros_equal(true))); ASSERT_FALSE(statistics1.Equals(statistics2, options.signed_zeros_equal(false))); - statistics1.min = 0; + statistics1.min = 0.0; ASSERT_FALSE(statistics1.Equals(statistics2, options.signed_zeros_equal(true))); ASSERT_FALSE(statistics1.Equals(statistics2, options.signed_zeros_equal(false))); From cd479334268567d477a41edd1b32677506eb880c Mon Sep 17 00:00:00 2001 From: Arash Andishgar Date: Fri, 30 May 2025 15:41:51 +0330 Subject: [PATCH 03/11] remove ApproximateEquals method --- cpp/src/arrow/array/statistics.cc | 36 +++++++++++--------------- cpp/src/arrow/array/statistics.h | 6 +---- cpp/src/arrow/array/statistics_test.cc | 26 +++++-------------- 3 files changed, 23 insertions(+), 45 deletions(-) diff --git a/cpp/src/arrow/array/statistics.cc b/cpp/src/arrow/array/statistics.cc index 2d28d22fd46..4fa88554761 100644 --- a/cpp/src/arrow/array/statistics.cc +++ b/cpp/src/arrow/array/statistics.cc @@ -21,61 +21,55 @@ #include #include "arrow/compare.h" +#include "arrow/util/logging_internal.h" namespace arrow { using ValueType = ArrayStatistics::ValueType; namespace { -bool DoubleEquals(const double& left, const double& right, const bool is_approximate, - const EqualOptions& options) { +bool DoubleEquals(const double& left, const double& right, const EqualOptions& options) { if (left == right) { return options.signed_zeros_equal() || (std::signbit(left) == std::signbit(right)); } else if (options.nans_equal() && (std::isnan(left) || std::isnan(right))) { return true; - } else if (is_approximate) { - return std::fabs(left - right) <= options.atol(); } else { - return false; + return std::fabs(left - right) <= options.atol(); } } bool ValueTypeEquals(const std::optional& left, - const std::optional& right, const EqualOptions& options, - const bool is_approximate) { - if (!left.has_value() && !right.has_value()) { - return true; + const std::optional& right, const EqualOptions& options) { + if (!left.has_value() || !right.has_value()) { + return left.has_value() == right.has_value(); } else if (left->index() != right->index()) { return false; } else { - auto CheckVisitor = [&](const auto& v1, const auto& v2) { + auto EqualsVisitor = [&](const auto& v1, const auto& v2) { using type_1 = std::decay_t; using type_2 = std::decay_t; if constexpr (std::conjunction_v, std::is_same>) { - return DoubleEquals(v1, v2, is_approximate, options); + return DoubleEquals(v1, v2, options); } else if constexpr (std::is_same_v) { return v1 == v2; } // It is unreachable + DCHECK(false); return false; }; - return std::visit(CheckVisitor, left.value(), right.value()); + return std::visit(EqualsVisitor, left.value(), right.value()); } } -bool ArrayStatisticsEquals(const ArrayStatistics& left, const ArrayStatistics& right, - const EqualOptions& equal_options, bool is_approximate) { +bool EqualsImpl(const ArrayStatistics& left, const ArrayStatistics& right, + const EqualOptions& equal_options) { return left.null_count == right.null_count && left.distinct_count == right.distinct_count && left.is_min_exact == right.is_min_exact && left.is_max_exact == right.is_max_exact && - ValueTypeEquals(left.min, right.min, equal_options, is_approximate) && - ValueTypeEquals(left.max, right.max, equal_options, is_approximate); + ValueTypeEquals(left.min, right.min, equal_options) && + ValueTypeEquals(left.max, right.max, equal_options); } } // namespace bool ArrayStatistics::Equals(const ArrayStatistics& other, const EqualOptions& options) const { - return ArrayStatisticsEquals(*this, other, options, false); -} -bool ArrayStatistics::ApproximateEquals(const ArrayStatistics& other, - const EqualOptions& options) const { - return ArrayStatisticsEquals(*this, other, options, true); + return EqualsImpl(*this, other, options); } } // namespace arrow diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h index 4e76122de95..2d252f88919 100644 --- a/cpp/src/arrow/array/statistics.h +++ b/cpp/src/arrow/array/statistics.h @@ -129,13 +129,9 @@ struct ARROW_EXPORT ArrayStatistics { bool is_max_exact = false; /// \brief Check two statistics for equality + /// epsilon is only used if the ArrayStatistics::ValueType is Double bool Equals(const ArrayStatistics& other, const EqualOptions& = EqualOptions::Defaults()) const; - - /// Check two statistics for approximate equality - /// epsilon is only used if the ArrayStatistics::ValueType is Double - bool ApproximateEquals(const ArrayStatistics& other, - const EqualOptions& = EqualOptions::Defaults()) const; }; } // namespace arrow diff --git a/cpp/src/arrow/array/statistics_test.cc b/cpp/src/arrow/array/statistics_test.cc index a856ec9796c..7f342c36e39 100644 --- a/cpp/src/arrow/array/statistics_test.cc +++ b/cpp/src/arrow/array/statistics_test.cc @@ -64,7 +64,7 @@ TEST(ArrayStatisticsTest, TestMax) { ASSERT_FALSE(statistics.is_max_exact); } -TEST(ArrayStatisticsTest, TestEqualsNonDoulbeValue) { +TEST(ArrayStatisticsTest, TestEqualityNonDoulbeValue) { ArrayStatistics statistics1; ArrayStatistics statistics2; @@ -105,7 +105,7 @@ TEST(ArrayStatisticsTest, TestEqualsNonDoulbeValue) { ASSERT_FALSE(statistics1.Equals(statistics2)); } -TEST(ArrayStatisticsTest, TestEqualsDoubleValue) { +TEST(ArrayStatisticsTest, TestEqualityDoubleValue) { ArrayStatistics statistics1; ArrayStatistics statistics2; EqualOptions options = EqualOptions::Defaults(); @@ -115,9 +115,9 @@ TEST(ArrayStatisticsTest, TestEqualsDoubleValue) { }; ASSERT_TRUE(statistics1.Equals(statistics2)); - statistics1.min = 29.0; - ASSERT_FALSE(statistics1.Equals(statistics2)); statistics2.min = 29.0; + ASSERT_FALSE(statistics1.Equals(statistics2)); + statistics1.min = 29.0; ASSERT_TRUE(statistics1.Equals(statistics2)); statistics2.min = 30.0; ASSERT_FALSE(statistics1.Equals(statistics2)); @@ -155,24 +155,12 @@ TEST(ArrayStatisticsTest, TestEqualsDoubleValue) { statistics2.min = 2.0; ASSERT_FALSE(statistics1.Equals(statistics2)); - // Check Approximate float is false + // Check Approximate float Reset(); statistics1.max = 0.5001f; statistics2.max = 0.5; - ASSERT_FALSE(statistics1.Equals(statistics2)); -} -TEST(ArrayStatisticsTest, TestApproximateEqualsDoubleValue) { - ArrayStatistics statistics1; - ArrayStatistics statistics2; - EqualOptions options = EqualOptions::Defaults(); - - statistics1.max = 0.5001f; - statistics2.max = 0.5; - - ASSERT_FALSE(statistics1.Equals(statistics2, options.atol(1e-3))); - - ASSERT_TRUE(statistics1.ApproximateEquals(statistics2, options.atol(1e-3))); - ASSERT_TRUE(statistics2.ApproximateEquals(statistics1, options.atol(1e-3))); + ASSERT_TRUE(statistics1.Equals(statistics2, options.atol(1e-3))); + ASSERT_TRUE(statistics2.Equals(statistics1, options.atol(1e-3))); ASSERT_FALSE(statistics1.Equals(statistics2, options.atol(1e-5))); ASSERT_FALSE(statistics2.Equals(statistics1, options.atol(1e-5))); } From 3cc2f93d7550356115a1a3f730015c77ab260ae8 Mon Sep 17 00:00:00 2001 From: Arash Andishgar Date: Fri, 30 May 2025 18:49:45 +0330 Subject: [PATCH 04/11] Add is_approximate --- cpp/src/arrow/array/statistics.cc | 23 ++++++++++++++--------- cpp/src/arrow/array/statistics.h | 14 +++++++++++--- cpp/src/arrow/array/statistics_test.cc | 2 ++ 3 files changed, 27 insertions(+), 12 deletions(-) diff --git a/cpp/src/arrow/array/statistics.cc b/cpp/src/arrow/array/statistics.cc index 4fa88554761..c02705898f3 100644 --- a/cpp/src/arrow/array/statistics.cc +++ b/cpp/src/arrow/array/statistics.cc @@ -25,18 +25,22 @@ namespace arrow { using ValueType = ArrayStatistics::ValueType; namespace { -bool DoubleEquals(const double& left, const double& right, const EqualOptions& options) { +bool DoubleEquals(const double& left, const double& right, const EqualOptions& options, + bool is_approximate) { if (left == right) { return options.signed_zeros_equal() || (std::signbit(left) == std::signbit(right)); } else if (options.nans_equal() && (std::isnan(left) || std::isnan(right))) { return true; - } else { + } else if (is_approximate) { return std::fabs(left - right) <= options.atol(); + } else { + return false; } } bool ValueTypeEquals(const std::optional& left, - const std::optional& right, const EqualOptions& options) { + const std::optional& right, const EqualOptions& options, + bool is_approximate) { if (!left.has_value() || !right.has_value()) { return left.has_value() == right.has_value(); } else if (left->index() != right->index()) { @@ -47,7 +51,7 @@ bool ValueTypeEquals(const std::optional& left, using type_2 = std::decay_t; if constexpr (std::conjunction_v, std::is_same>) { - return DoubleEquals(v1, v2, options); + return DoubleEquals(v1, v2, options, is_approximate); } else if constexpr (std::is_same_v) { return v1 == v2; } @@ -59,17 +63,18 @@ bool ValueTypeEquals(const std::optional& left, } } bool EqualsImpl(const ArrayStatistics& left, const ArrayStatistics& right, - const EqualOptions& equal_options) { + const EqualOptions& equal_options, bool is_approximate) { return left.null_count == right.null_count && left.distinct_count == right.distinct_count && left.is_min_exact == right.is_min_exact && left.is_max_exact == right.is_max_exact && - ValueTypeEquals(left.min, right.min, equal_options) && - ValueTypeEquals(left.max, right.max, equal_options); + ValueTypeEquals(left.min, right.min, equal_options, is_approximate) && + ValueTypeEquals(left.max, right.max, equal_options, is_approximate); } } // namespace bool ArrayStatistics::Equals(const ArrayStatistics& other, - const EqualOptions& options) const { - return EqualsImpl(*this, other, options); + const EqualOptions& equal_options, + bool is_approximate) const { + return EqualsImpl(*this, other, equal_options, is_approximate); } } // namespace arrow diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h index 2d252f88919..747cf89919d 100644 --- a/cpp/src/arrow/array/statistics.h +++ b/cpp/src/arrow/array/statistics.h @@ -128,10 +128,18 @@ struct ARROW_EXPORT ArrayStatistics { /// \brief Whether the maximum value is exact or not bool is_max_exact = false; - /// \brief Check two statistics for equality - /// epsilon is only used if the ArrayStatistics::ValueType is Double + /// \brief Check if two \ref ArrayStatistics are equal. + /// + /// \param equal_options Options used to compare double values for equality. + /// + /// \param is_approximate If true, \ref arrow::EqualOptions::atol_ is used + /// for comparing double values. + /// + /// \return True if the two \ref arrow::ArrayStatistics instances are equal; otherwise, + /// false. bool Equals(const ArrayStatistics& other, - const EqualOptions& = EqualOptions::Defaults()) const; + const EqualOptions& equal_options = EqualOptions::Defaults(), + bool is_approximate = true) const; }; } // namespace arrow diff --git a/cpp/src/arrow/array/statistics_test.cc b/cpp/src/arrow/array/statistics_test.cc index 7f342c36e39..64479d6b28f 100644 --- a/cpp/src/arrow/array/statistics_test.cc +++ b/cpp/src/arrow/array/statistics_test.cc @@ -159,6 +159,8 @@ TEST(ArrayStatisticsTest, TestEqualityDoubleValue) { Reset(); statistics1.max = 0.5001f; statistics2.max = 0.5; + ASSERT_FALSE(statistics1.Equals(statistics2, options.atol(1e-3), false)); + ASSERT_TRUE(statistics1.Equals(statistics2, options.atol(1e-3))); ASSERT_TRUE(statistics2.Equals(statistics1, options.atol(1e-3))); ASSERT_FALSE(statistics1.Equals(statistics2, options.atol(1e-5))); From 5560be33ee1da83b6d1aca5c5450dd4326f2cdf2 Mon Sep 17 00:00:00 2001 From: Arash Andishgar Date: Fri, 30 May 2025 20:08:41 +0330 Subject: [PATCH 05/11] Correct comments --- cpp/src/arrow/array/statistics.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h index 747cf89919d..72bdd80c631 100644 --- a/cpp/src/arrow/array/statistics.h +++ b/cpp/src/arrow/array/statistics.h @@ -128,7 +128,9 @@ struct ARROW_EXPORT ArrayStatistics { /// \brief Whether the maximum value is exact or not bool is_max_exact = false; - /// \brief Check if two \ref ArrayStatistics are equal. + /// \brief Checks whether this ArrayStatistics instance is equal to another. + /// + /// \param other The \ref ArrayStatistics instance to compare against. /// /// \param equal_options Options used to compare double values for equality. /// From 292d2afd55b2764d00a3977b2ac6d5431522737a Mon Sep 17 00:00:00 2001 From: Arash Andishgar Date: Sun, 1 Jun 2025 15:02:01 +0330 Subject: [PATCH 06/11] Enable allow_atol in Equal options and add operator == and != --- cpp/src/arrow/array/statistics.cc | 21 ++++++-------- cpp/src/arrow/array/statistics.h | 14 +++++---- cpp/src/arrow/array/statistics_test.cc | 40 +++++++++++++------------- cpp/src/arrow/compare.h | 12 +++++++- 4 files changed, 49 insertions(+), 38 deletions(-) diff --git a/cpp/src/arrow/array/statistics.cc b/cpp/src/arrow/array/statistics.cc index c02705898f3..81e34b9a403 100644 --- a/cpp/src/arrow/array/statistics.cc +++ b/cpp/src/arrow/array/statistics.cc @@ -25,13 +25,12 @@ namespace arrow { using ValueType = ArrayStatistics::ValueType; namespace { -bool DoubleEquals(const double& left, const double& right, const EqualOptions& options, - bool is_approximate) { +bool DoubleEquals(const double& left, const double& right, const EqualOptions& options) { if (left == right) { return options.signed_zeros_equal() || (std::signbit(left) == std::signbit(right)); } else if (options.nans_equal() && (std::isnan(left) || std::isnan(right))) { return true; - } else if (is_approximate) { + } else if (options.allow_atol()) { return std::fabs(left - right) <= options.atol(); } else { return false; @@ -39,8 +38,7 @@ bool DoubleEquals(const double& left, const double& right, const EqualOptions& o } bool ValueTypeEquals(const std::optional& left, - const std::optional& right, const EqualOptions& options, - bool is_approximate) { + const std::optional& right, const EqualOptions& options) { if (!left.has_value() || !right.has_value()) { return left.has_value() == right.has_value(); } else if (left->index() != right->index()) { @@ -51,7 +49,7 @@ bool ValueTypeEquals(const std::optional& left, using type_2 = std::decay_t; if constexpr (std::conjunction_v, std::is_same>) { - return DoubleEquals(v1, v2, options, is_approximate); + return DoubleEquals(v1, v2, options); } else if constexpr (std::is_same_v) { return v1 == v2; } @@ -63,18 +61,17 @@ bool ValueTypeEquals(const std::optional& left, } } bool EqualsImpl(const ArrayStatistics& left, const ArrayStatistics& right, - const EqualOptions& equal_options, bool is_approximate) { + const EqualOptions& equal_options) { return left.null_count == right.null_count && left.distinct_count == right.distinct_count && left.is_min_exact == right.is_min_exact && left.is_max_exact == right.is_max_exact && - ValueTypeEquals(left.min, right.min, equal_options, is_approximate) && - ValueTypeEquals(left.max, right.max, equal_options, is_approximate); + ValueTypeEquals(left.min, right.min, equal_options) && + ValueTypeEquals(left.max, right.max, equal_options); } } // namespace bool ArrayStatistics::Equals(const ArrayStatistics& other, - const EqualOptions& equal_options, - bool is_approximate) const { - return EqualsImpl(*this, other, equal_options, is_approximate); + const EqualOptions& equal_options) const { + return EqualsImpl(*this, other, equal_options); } } // namespace arrow diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h index 72bdd80c631..eb0d239b094 100644 --- a/cpp/src/arrow/array/statistics.h +++ b/cpp/src/arrow/array/statistics.h @@ -134,14 +134,18 @@ struct ARROW_EXPORT ArrayStatistics { /// /// \param equal_options Options used to compare double values for equality. /// - /// \param is_approximate If true, \ref arrow::EqualOptions::atol_ is used - /// for comparing double values. - /// /// \return True if the two \ref arrow::ArrayStatistics instances are equal; otherwise, /// false. bool Equals(const ArrayStatistics& other, - const EqualOptions& equal_options = EqualOptions::Defaults(), - bool is_approximate = true) const; + const EqualOptions& equal_options = EqualOptions::Defaults()) const; + + bool operator==(const ArrayStatistics& other) const { + return Equals(other, EqualOptions::Defaults()); + } + + bool operator!=(const ArrayStatistics& other) const { + return !Equals(other, EqualOptions::Defaults()); + } }; } // namespace arrow diff --git a/cpp/src/arrow/array/statistics_test.cc b/cpp/src/arrow/array/statistics_test.cc index 64479d6b28f..0b575f112c7 100644 --- a/cpp/src/arrow/array/statistics_test.cc +++ b/cpp/src/arrow/array/statistics_test.cc @@ -68,41 +68,41 @@ TEST(ArrayStatisticsTest, TestEqualityNonDoulbeValue) { ArrayStatistics statistics1; ArrayStatistics statistics2; - ASSERT_TRUE(statistics1.Equals(statistics2)); + ASSERT_EQ(statistics1, statistics2); statistics1.null_count = 29; - ASSERT_FALSE(statistics1.Equals(statistics2)); + ASSERT_NE(statistics1, statistics2); statistics2.null_count = 29; - ASSERT_TRUE(statistics1.Equals(statistics2)); + ASSERT_EQ(statistics1, statistics2); statistics1.distinct_count = 2929; - ASSERT_FALSE(statistics1.Equals(statistics2)); + ASSERT_NE(statistics1, statistics2); statistics2.distinct_count = 2929; - ASSERT_TRUE(statistics1.Equals(statistics2)); + ASSERT_EQ(statistics1, statistics2); statistics1.min = std::string("world"); - ASSERT_FALSE(statistics1.Equals(statistics2)); + ASSERT_NE(statistics1, statistics2); statistics2.min = std::string("world"); - ASSERT_TRUE(statistics1.Equals(statistics2)); + ASSERT_EQ(statistics1, statistics2); statistics1.is_min_exact = true; - ASSERT_FALSE(statistics1.Equals(statistics2)); + ASSERT_NE(statistics1, statistics2); statistics2.is_min_exact = true; - ASSERT_TRUE(statistics1.Equals(statistics2)); + ASSERT_EQ(statistics1, statistics2); statistics1.max = static_cast(-29); - ASSERT_FALSE(statistics1.Equals(statistics2)); + ASSERT_NE(statistics1, statistics2); statistics2.max = static_cast(-29); - ASSERT_TRUE(statistics1.Equals(statistics2)); + ASSERT_EQ(statistics1, statistics2); statistics1.is_max_exact = true; - ASSERT_FALSE(statistics1.Equals(statistics2)); + ASSERT_NE(statistics1, statistics2); statistics2.is_max_exact = true; - ASSERT_TRUE(statistics1.Equals(statistics2)); + ASSERT_EQ(statistics1, statistics2); // Test different index statistics1.max = static_cast(29); - ASSERT_FALSE(statistics1.Equals(statistics2)); + ASSERT_NE(statistics1, statistics2); } TEST(ArrayStatisticsTest, TestEqualityDoubleValue) { @@ -114,13 +114,13 @@ TEST(ArrayStatisticsTest, TestEqualityDoubleValue) { statistics2.min = std::nullopt; }; - ASSERT_TRUE(statistics1.Equals(statistics2)); + ASSERT_EQ(statistics1, statistics2); statistics2.min = 29.0; - ASSERT_FALSE(statistics1.Equals(statistics2)); + ASSERT_NE(statistics1, statistics2); statistics1.min = 29.0; - ASSERT_TRUE(statistics1.Equals(statistics2)); + ASSERT_EQ(statistics1, statistics2); statistics2.min = 30.0; - ASSERT_FALSE(statistics1.Equals(statistics2)); + ASSERT_NE(statistics1, statistics2); // Check Signed Zeros Reset(); @@ -159,9 +159,9 @@ TEST(ArrayStatisticsTest, TestEqualityDoubleValue) { Reset(); statistics1.max = 0.5001f; statistics2.max = 0.5; - ASSERT_FALSE(statistics1.Equals(statistics2, options.atol(1e-3), false)); + ASSERT_FALSE(statistics1.Equals(statistics2, options.atol(1e-3).allow_atol(false))); - ASSERT_TRUE(statistics1.Equals(statistics2, options.atol(1e-3))); + ASSERT_TRUE(statistics1.Equals(statistics2, options.atol(1e-3).allow_atol(true))); ASSERT_TRUE(statistics2.Equals(statistics1, options.atol(1e-3))); ASSERT_FALSE(statistics1.Equals(statistics2, options.atol(1e-5))); ASSERT_FALSE(statistics2.Equals(statistics1, options.atol(1e-5))); diff --git a/cpp/src/arrow/compare.h b/cpp/src/arrow/compare.h index 6b365c59913..66b86c7036e 100644 --- a/cpp/src/arrow/compare.h +++ b/cpp/src/arrow/compare.h @@ -57,8 +57,18 @@ class EqualOptions { res.signed_zeros_equal_ = v; return res; } + /// Whether the "atol" property is used in the comparison. + bool allow_atol() const { return allow_atol_; } + + /// Return a new EqualOptions object with the "allow_atol" property changed. + EqualOptions allow_atol(bool v) const { + auto res = EqualOptions(*this); + res.allow_atol_ = v; + return res; + } /// The absolute tolerance for approximate comparisons of floating-point values. + /// Note that this option is ignored if "allow_atol" is set to false. double atol() const { return atol_; } /// Return a new EqualOptions object with the "atol" property changed. @@ -87,7 +97,7 @@ class EqualOptions { double atol_ = kDefaultAbsoluteTolerance; bool nans_equal_ = false; bool signed_zeros_equal_ = true; - + bool allow_atol_ = true; std::ostream* diff_sink_ = NULLPTR; }; From 8d26a40c893bbeb4c61febf3f87dbb4fb9e9045c Mon Sep 17 00:00:00 2001 From: Arash Andishgar Date: Mon, 2 Jun 2025 10:35:12 +0330 Subject: [PATCH 07/11] Apply Kou suggestion replace allow_atol to use_atol with relevant changes move type alias declaration for ValueType into anonymous namesapce comment == and != for operator and remove default options clear comments test and comment regarding different ValueType breakdown TestEqualityDoubleValues test case into several test cases --- cpp/src/arrow/array/statistics.cc | 9 +- cpp/src/arrow/array/statistics.h | 12 ++- cpp/src/arrow/array/statistics_test.cc | 113 ++++++++++++------------- cpp/src/arrow/compare.h | 12 +-- 4 files changed, 72 insertions(+), 74 deletions(-) diff --git a/cpp/src/arrow/array/statistics.cc b/cpp/src/arrow/array/statistics.cc index 81e34b9a403..4d0f6f38d5b 100644 --- a/cpp/src/arrow/array/statistics.cc +++ b/cpp/src/arrow/array/statistics.cc @@ -23,14 +23,17 @@ #include "arrow/compare.h" #include "arrow/util/logging_internal.h" namespace arrow { -using ValueType = ArrayStatistics::ValueType; + namespace { + +using ValueType = ArrayStatistics::ValueType; + bool DoubleEquals(const double& left, const double& right, const EqualOptions& options) { if (left == right) { return options.signed_zeros_equal() || (std::signbit(left) == std::signbit(right)); } else if (options.nans_equal() && (std::isnan(left) || std::isnan(right))) { - return true; - } else if (options.allow_atol()) { + return std::isnan(left) && std::isnan(right); + } else if (options.use_atol()) { return std::fabs(left - right) <= options.atol(); } else { return false; diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h index eb0d239b094..d08f0cf2fa5 100644 --- a/cpp/src/arrow/array/statistics.h +++ b/cpp/src/arrow/array/statistics.h @@ -128,7 +128,7 @@ struct ARROW_EXPORT ArrayStatistics { /// \brief Whether the maximum value is exact or not bool is_max_exact = false; - /// \brief Checks whether this ArrayStatistics instance is equal to another. + /// \brief Check two statistics for equality /// /// \param other The \ref ArrayStatistics instance to compare against. /// @@ -139,13 +139,11 @@ struct ARROW_EXPORT ArrayStatistics { bool Equals(const ArrayStatistics& other, const EqualOptions& equal_options = EqualOptions::Defaults()) const; - bool operator==(const ArrayStatistics& other) const { - return Equals(other, EqualOptions::Defaults()); - } + /// \brief Check two statistics for equality + bool operator==(const ArrayStatistics& other) const { return Equals(other); } - bool operator!=(const ArrayStatistics& other) const { - return !Equals(other, EqualOptions::Defaults()); - } + /// \brief Check two statistics for not equality + bool operator!=(const ArrayStatistics& other) const { return !Equals(other); } }; } // namespace arrow diff --git a/cpp/src/arrow/array/statistics_test.cc b/cpp/src/arrow/array/statistics_test.cc index 0b575f112c7..b0bded9a47f 100644 --- a/cpp/src/arrow/array/statistics_test.cc +++ b/cpp/src/arrow/array/statistics_test.cc @@ -100,71 +100,68 @@ TEST(ArrayStatisticsTest, TestEqualityNonDoulbeValue) { statistics2.is_max_exact = true; ASSERT_EQ(statistics1, statistics2); - // Test different index + // Test different ArrayStatistics::ValueType statistics1.max = static_cast(29); + statistics1.max = static_cast(29); ASSERT_NE(statistics1, statistics2); } +class TestEqualityDoubleValue : public ::testing::Test { + protected: + ArrayStatistics statistics1_; + ArrayStatistics statistics2_; + EqualOptions options_ = EqualOptions::Defaults(); +}; + +TEST_F(TestEqualityDoubleValue, ExactValue) { + ASSERT_EQ(statistics1_, statistics2_); + statistics2_.min = 29.0; + ASSERT_NE(statistics1_, statistics2_); + statistics1_.min = 29.0; + ASSERT_EQ(statistics1_, statistics2_); + statistics2_.min = 30.0; + ASSERT_NE(statistics1_, statistics2_); +} -TEST(ArrayStatisticsTest, TestEqualityDoubleValue) { - ArrayStatistics statistics1; - ArrayStatistics statistics2; - EqualOptions options = EqualOptions::Defaults(); - auto Reset = [&]() { - statistics1.min = std::nullopt; - statistics2.min = std::nullopt; - }; +TEST_F(TestEqualityDoubleValue, SignedZero) { + statistics1_.min = +0.0; + statistics2_.min = -0.0; + ASSERT_TRUE(statistics1_.Equals(statistics2_, options_.signed_zeros_equal(true))); + ASSERT_FALSE(statistics1_.Equals(statistics2_, options_.signed_zeros_equal(false))); +} - ASSERT_EQ(statistics1, statistics2); - statistics2.min = 29.0; - ASSERT_NE(statistics1, statistics2); - statistics1.min = 29.0; - ASSERT_EQ(statistics1, statistics2); - statistics2.min = 30.0; - ASSERT_NE(statistics1, statistics2); +TEST_F(TestEqualityDoubleValue, Infinity) { + auto infinity = std::numeric_limits::infinity(); + statistics1_.min = infinity; + statistics2_.min = infinity; + ASSERT_TRUE(statistics1_.Equals(statistics2_, options_.signed_zeros_equal(true))); + ASSERT_TRUE(statistics1_.Equals(statistics2_, options_.signed_zeros_equal(false))); + statistics1_.min = -infinity; + ASSERT_FALSE(statistics1_.Equals(statistics2_, options_.signed_zeros_equal(true))); + ASSERT_FALSE(statistics1_.Equals(statistics2_, options_.signed_zeros_equal(false))); + statistics1_.min = 0.0; + ASSERT_FALSE(statistics1_.Equals(statistics2_, options_.signed_zeros_equal(true))); + ASSERT_FALSE(statistics1_.Equals(statistics2_, options_.signed_zeros_equal(false))); +} - // Check Signed Zeros - Reset(); - statistics1.min = +0.0; - statistics2.min = -0.0; - ASSERT_TRUE(statistics1.Equals(statistics2, options.signed_zeros_equal(true))); - ASSERT_FALSE(statistics1.Equals(statistics2, options.signed_zeros_equal(false))); +TEST_F(TestEqualityDoubleValue, Nan) { + statistics1_.min = static_cast(NAN); + statistics2_.min = static_cast(NAN); + ASSERT_TRUE(statistics1_.Equals(statistics2_, options_.nans_equal(true))); + ASSERT_FALSE(statistics1_.Equals(statistics2_, options_.nans_equal(false))); - // Check Infinity - Reset(); - auto infinity = std::numeric_limits::infinity(); - statistics1.min = infinity; - statistics2.min = infinity; - ASSERT_TRUE(statistics1.Equals(statistics2, options.signed_zeros_equal(true))); - ASSERT_TRUE(statistics1.Equals(statistics2, options.signed_zeros_equal(false))); - - statistics1.min = -infinity; - ASSERT_FALSE(statistics1.Equals(statistics2, options.signed_zeros_equal(true))); - ASSERT_FALSE(statistics1.Equals(statistics2, options.signed_zeros_equal(false))); - - statistics1.min = 0.0; - ASSERT_FALSE(statistics1.Equals(statistics2, options.signed_zeros_equal(true))); - ASSERT_FALSE(statistics1.Equals(statistics2, options.signed_zeros_equal(false))); - - // Check NAN - Reset(); - statistics1.min = static_cast(NAN); - statistics2.min = static_cast(NAN); - ASSERT_TRUE(statistics1.Equals(statistics2, options.nans_equal(true))); - ASSERT_FALSE(statistics1.Equals(statistics2, options.nans_equal(false))); - - statistics2.min = 2.0; - ASSERT_FALSE(statistics1.Equals(statistics2)); - - // Check Approximate float - Reset(); - statistics1.max = 0.5001f; - statistics2.max = 0.5; - ASSERT_FALSE(statistics1.Equals(statistics2, options.atol(1e-3).allow_atol(false))); - - ASSERT_TRUE(statistics1.Equals(statistics2, options.atol(1e-3).allow_atol(true))); - ASSERT_TRUE(statistics2.Equals(statistics1, options.atol(1e-3))); - ASSERT_FALSE(statistics1.Equals(statistics2, options.atol(1e-5))); - ASSERT_FALSE(statistics2.Equals(statistics1, options.atol(1e-5))); + statistics2_.min = 2.0; + ASSERT_FALSE(statistics1_.Equals(statistics2_, options_.nans_equal(true))); + ASSERT_FALSE(statistics1_.Equals(statistics2_, options_.nans_equal(false))); +} +TEST_F(TestEqualityDoubleValue, ApproximateEquals) { + statistics1_.max = 0.5001f; + statistics2_.max = 0.5; + ASSERT_FALSE(statistics1_.Equals(statistics2_, options_.atol(1e-3).use_atol(false))); + + ASSERT_TRUE(statistics1_.Equals(statistics2_, options_.atol(1e-3).use_atol(true))); + ASSERT_TRUE(statistics2_.Equals(statistics1_, options_.atol(1e-3))); + ASSERT_FALSE(statistics1_.Equals(statistics2_, options_.atol(1e-5))); + ASSERT_FALSE(statistics2_.Equals(statistics1_, options_.atol(1e-5))); } } // namespace arrow diff --git a/cpp/src/arrow/compare.h b/cpp/src/arrow/compare.h index 66b86c7036e..a09598844e9 100644 --- a/cpp/src/arrow/compare.h +++ b/cpp/src/arrow/compare.h @@ -58,17 +58,17 @@ class EqualOptions { return res; } /// Whether the "atol" property is used in the comparison. - bool allow_atol() const { return allow_atol_; } + bool use_atol() const { return use_atol_; } - /// Return a new EqualOptions object with the "allow_atol" property changed. - EqualOptions allow_atol(bool v) const { + /// Return a new EqualOptions object with the "use_atol" property changed. + EqualOptions use_atol(bool v) const { auto res = EqualOptions(*this); - res.allow_atol_ = v; + res.use_atol_ = v; return res; } /// The absolute tolerance for approximate comparisons of floating-point values. - /// Note that this option is ignored if "allow_atol" is set to false. + /// Note that this option is ignored if "use_atol" is set to false. double atol() const { return atol_; } /// Return a new EqualOptions object with the "atol" property changed. @@ -97,7 +97,7 @@ class EqualOptions { double atol_ = kDefaultAbsoluteTolerance; bool nans_equal_ = false; bool signed_zeros_equal_ = true; - bool allow_atol_ = true; + bool use_atol_ = true; std::ostream* diff_sink_ = NULLPTR; }; From fc5697c9581edf38ee0fe0859fd4735193cff943 Mon Sep 17 00:00:00 2001 From: Arash Andishgar Date: Mon, 2 Jun 2025 18:51:10 +0330 Subject: [PATCH 08/11] Change the way that equality is implemented --- cpp/src/arrow/array/statistics.cc | 65 ++----------------------------- cpp/src/arrow/array/statistics.h | 8 ++-- cpp/src/arrow/compare.cc | 50 ++++++++++++++++++++++++ cpp/src/arrow/compare.h | 11 +++++- 4 files changed, 68 insertions(+), 66 deletions(-) diff --git a/cpp/src/arrow/array/statistics.cc b/cpp/src/arrow/array/statistics.cc index 4d0f6f38d5b..b661c9fbaff 100644 --- a/cpp/src/arrow/array/statistics.cc +++ b/cpp/src/arrow/array/statistics.cc @@ -15,66 +15,7 @@ // specific language governing permissions and limitations // under the License. -#include "arrow/array/statistics.h" - -#include -#include - -#include "arrow/compare.h" -#include "arrow/util/logging_internal.h" -namespace arrow { - -namespace { +// This empty .cc file is for embedding not inlined symbols in +// arrow::ArrayStatistics into libarrow. -using ValueType = ArrayStatistics::ValueType; - -bool DoubleEquals(const double& left, const double& right, const EqualOptions& options) { - if (left == right) { - return options.signed_zeros_equal() || (std::signbit(left) == std::signbit(right)); - } else if (options.nans_equal() && (std::isnan(left) || std::isnan(right))) { - return std::isnan(left) && std::isnan(right); - } else if (options.use_atol()) { - return std::fabs(left - right) <= options.atol(); - } else { - return false; - } -} - -bool ValueTypeEquals(const std::optional& left, - const std::optional& right, const EqualOptions& options) { - if (!left.has_value() || !right.has_value()) { - return left.has_value() == right.has_value(); - } else if (left->index() != right->index()) { - return false; - } else { - auto EqualsVisitor = [&](const auto& v1, const auto& v2) { - using type_1 = std::decay_t; - using type_2 = std::decay_t; - if constexpr (std::conjunction_v, - std::is_same>) { - return DoubleEquals(v1, v2, options); - } else if constexpr (std::is_same_v) { - return v1 == v2; - } - // It is unreachable - DCHECK(false); - return false; - }; - return std::visit(EqualsVisitor, left.value(), right.value()); - } -} -bool EqualsImpl(const ArrayStatistics& left, const ArrayStatistics& right, - const EqualOptions& equal_options) { - return left.null_count == right.null_count && - left.distinct_count == right.distinct_count && - left.is_min_exact == right.is_min_exact && - left.is_max_exact == right.is_max_exact && - ValueTypeEquals(left.min, right.min, equal_options) && - ValueTypeEquals(left.max, right.max, equal_options); -} -} // namespace -bool ArrayStatistics::Equals(const ArrayStatistics& other, - const EqualOptions& equal_options) const { - return EqualsImpl(*this, other, equal_options); -} -} // namespace arrow +#include "arrow/array/statistics.h" diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h index d08f0cf2fa5..6accd48af77 100644 --- a/cpp/src/arrow/array/statistics.h +++ b/cpp/src/arrow/array/statistics.h @@ -128,16 +128,18 @@ struct ARROW_EXPORT ArrayStatistics { /// \brief Whether the maximum value is exact or not bool is_max_exact = false; - /// \brief Check two statistics for equality + /// \brief Check two \ref arrow::ArrayStatistics for equality /// - /// \param other The \ref ArrayStatistics instance to compare against. + /// \param other The \ref arrow::ArrayStatistics instance to compare against. /// /// \param equal_options Options used to compare double values for equality. /// /// \return True if the two \ref arrow::ArrayStatistics instances are equal; otherwise, /// false. bool Equals(const ArrayStatistics& other, - const EqualOptions& equal_options = EqualOptions::Defaults()) const; + const EqualOptions& equal_options = EqualOptions::Defaults()) const { + return ArrayStatisticsEquals(*this, other, equal_options); + } /// \brief Check two statistics for equality bool operator==(const ArrayStatistics& other) const { return Equals(other); } diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 3b64a8fd09f..1bc83600ca5 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -27,10 +27,12 @@ #include #include #include +#include #include #include "arrow/array.h" #include "arrow/array/diff.h" +#include "arrow/array/statistics.h" #include "arrow/buffer.h" #include "arrow/scalar.h" #include "arrow/sparse_tensor.h" @@ -1522,5 +1524,53 @@ bool TypeEquals(const DataType& left, const DataType& right, bool check_metadata return visitor.result(); } } +namespace { + +using ValueType = ArrayStatistics::ValueType; + +bool DoubleEquals(const double& left, const double& right, const EqualOptions& options) { + bool result; + auto visitor = [&](auto&& compare_func) { result = compare_func(left, right); }; + VisitFloatingEquality(options, options.use_atol(), std::move(visitor)); + return result; +} +bool ValueTypeEquals(const std::optional& left, + const std::optional& right, const EqualOptions& options) { + if (!left.has_value() || !right.has_value()) { + return left.has_value() == right.has_value(); + } else if (left->index() != right->index()) { + return false; + } else { + auto EqualsVisitor = [&](const auto& v1, const auto& v2) { + using type_1 = std::decay_t; + using type_2 = std::decay_t; + if constexpr (std::conjunction_v, + std::is_same>) { + return DoubleEquals(v1, v2, options); + } else if constexpr (std::is_same_v) { + return v1 == v2; + } + // It is unreachable + DCHECK(false); + return false; + }; + return std::visit(EqualsVisitor, left.value(), right.value()); + } +} +bool EqualsImpl(const ArrayStatistics& left, const ArrayStatistics& right, + const EqualOptions& equal_options) { + return left.null_count == right.null_count && + left.distinct_count == right.distinct_count && + left.is_min_exact == right.is_min_exact && + left.is_max_exact == right.is_max_exact && + ValueTypeEquals(left.min, right.min, equal_options) && + ValueTypeEquals(left.max, right.max, equal_options); +} +} // namespace + +bool ArrayStatisticsEquals(const ArrayStatistics& left, const ArrayStatistics& right, + const EqualOptions& options) { + return EqualsImpl(left, right, options); +} } // namespace arrow diff --git a/cpp/src/arrow/compare.h b/cpp/src/arrow/compare.h index a09598844e9..a9db4b49034 100644 --- a/cpp/src/arrow/compare.h +++ b/cpp/src/arrow/compare.h @@ -26,7 +26,7 @@ #include "arrow/util/visibility.h" namespace arrow { - +struct ArrayStatistics; class Array; class DataType; class Tensor; @@ -144,6 +144,15 @@ ARROW_EXPORT bool SparseTensorEquals(const SparseTensor& left, const SparseTenso /// fields ARROW_EXPORT bool TypeEquals(const DataType& left, const DataType& right, bool check_metadata = true); +/// \brief Check two \ref arrow::ArrayStatistics for equality +/// \param[in] left an \ref arrow::ArrayStatistics +/// \param[in] right an \ref arrow::ArrayStatistics +/// \param[in] options Options used to compare double values for equality. +/// \return True if the two \ref arrow::ArrayStatistics instances are equal; otherwise, +/// false. +ARROW_EXPORT bool ArrayStatisticsEquals( + const ArrayStatistics& left, const ArrayStatistics& right, + const EqualOptions& options = EqualOptions::Defaults()); /// Returns true if scalars are equal /// \param[in] left a Scalar From ffde3c56125e9946c21d35968433d13826ea816e Mon Sep 17 00:00:00 2001 From: Arash Andishgar Date: Wed, 4 Jun 2025 12:06:36 +0330 Subject: [PATCH 09/11] Apply Kou suggestion --- cpp/src/arrow/array/statistics_test.cc | 22 +++++++--------------- cpp/src/arrow/compare.cc | 15 ++++++++------- cpp/src/arrow/compare.h | 1 + 3 files changed, 16 insertions(+), 22 deletions(-) diff --git a/cpp/src/arrow/array/statistics_test.cc b/cpp/src/arrow/array/statistics_test.cc index b0bded9a47f..9463fed8cfb 100644 --- a/cpp/src/arrow/array/statistics_test.cc +++ b/cpp/src/arrow/array/statistics_test.cc @@ -113,7 +113,6 @@ class TestEqualityDoubleValue : public ::testing::Test { }; TEST_F(TestEqualityDoubleValue, ExactValue) { - ASSERT_EQ(statistics1_, statistics2_); statistics2_.min = 29.0; ASSERT_NE(statistics1_, statistics2_); statistics1_.min = 29.0; @@ -133,32 +132,25 @@ TEST_F(TestEqualityDoubleValue, Infinity) { auto infinity = std::numeric_limits::infinity(); statistics1_.min = infinity; statistics2_.min = infinity; - ASSERT_TRUE(statistics1_.Equals(statistics2_, options_.signed_zeros_equal(true))); - ASSERT_TRUE(statistics1_.Equals(statistics2_, options_.signed_zeros_equal(false))); + ASSERT_EQ(statistics1_, statistics2_); statistics1_.min = -infinity; - ASSERT_FALSE(statistics1_.Equals(statistics2_, options_.signed_zeros_equal(true))); - ASSERT_FALSE(statistics1_.Equals(statistics2_, options_.signed_zeros_equal(false))); + ASSERT_NE(statistics1_, statistics2_); statistics1_.min = 0.0; - ASSERT_FALSE(statistics1_.Equals(statistics2_, options_.signed_zeros_equal(true))); - ASSERT_FALSE(statistics1_.Equals(statistics2_, options_.signed_zeros_equal(false))); + ASSERT_NE(statistics1_, statistics2_); } -TEST_F(TestEqualityDoubleValue, Nan) { - statistics1_.min = static_cast(NAN); - statistics2_.min = static_cast(NAN); +TEST_F(TestEqualityDoubleValue, NaN) { + statistics1_.min = std::numeric_limits::quiet_NaN(); + statistics2_.min = std::numeric_limits::quiet_NaN(); ASSERT_TRUE(statistics1_.Equals(statistics2_, options_.nans_equal(true))); ASSERT_FALSE(statistics1_.Equals(statistics2_, options_.nans_equal(false))); - - statistics2_.min = 2.0; - ASSERT_FALSE(statistics1_.Equals(statistics2_, options_.nans_equal(true))); - ASSERT_FALSE(statistics1_.Equals(statistics2_, options_.nans_equal(false))); } TEST_F(TestEqualityDoubleValue, ApproximateEquals) { statistics1_.max = 0.5001f; statistics2_.max = 0.5; ASSERT_FALSE(statistics1_.Equals(statistics2_, options_.atol(1e-3).use_atol(false))); - ASSERT_TRUE(statistics1_.Equals(statistics2_, options_.atol(1e-3).use_atol(true))); + ASSERT_TRUE(statistics1_.Equals(statistics2_, options_.atol(1e-3))); ASSERT_TRUE(statistics2_.Equals(statistics1_, options_.atol(1e-3))); ASSERT_FALSE(statistics1_.Equals(statistics2_, options_.atol(1e-5))); ASSERT_FALSE(statistics2_.Equals(statistics1_, options_.atol(1e-5))); diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 1bc83600ca5..5ad0f3e175e 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -1535,8 +1535,9 @@ bool DoubleEquals(const double& left, const double& right, const EqualOptions& o return result; } -bool ValueTypeEquals(const std::optional& left, - const std::optional& right, const EqualOptions& options) { +bool ArrayStatisticsValueTypeEquals(const std::optional& left, + const std::optional& right, + const EqualOptions& options) { if (!left.has_value() || !right.has_value()) { return left.has_value() == right.has_value(); } else if (left->index() != right->index()) { @@ -1558,19 +1559,19 @@ bool ValueTypeEquals(const std::optional& left, return std::visit(EqualsVisitor, left.value(), right.value()); } } -bool EqualsImpl(const ArrayStatistics& left, const ArrayStatistics& right, - const EqualOptions& equal_options) { +bool ArrayStatisticsEqualsImpl(const ArrayStatistics& left, const ArrayStatistics& right, + const EqualOptions& equal_options) { return left.null_count == right.null_count && left.distinct_count == right.distinct_count && left.is_min_exact == right.is_min_exact && left.is_max_exact == right.is_max_exact && - ValueTypeEquals(left.min, right.min, equal_options) && - ValueTypeEquals(left.max, right.max, equal_options); + ArrayStatisticsValueTypeEquals(left.min, right.min, equal_options) && + ArrayStatisticsValueTypeEquals(left.max, right.max, equal_options); } } // namespace bool ArrayStatisticsEquals(const ArrayStatistics& left, const ArrayStatistics& right, const EqualOptions& options) { - return EqualsImpl(left, right, options); + return ArrayStatisticsEqualsImpl(left, right, options); } } // namespace arrow diff --git a/cpp/src/arrow/compare.h b/cpp/src/arrow/compare.h index a9db4b49034..449571a5998 100644 --- a/cpp/src/arrow/compare.h +++ b/cpp/src/arrow/compare.h @@ -98,6 +98,7 @@ class EqualOptions { bool nans_equal_ = false; bool signed_zeros_equal_ = true; bool use_atol_ = true; + std::ostream* diff_sink_ = NULLPTR; }; From 2a0760f46d33a3b3316e10ce4643d47ab448b2ce Mon Sep 17 00:00:00 2001 From: Arash Andishgar Date: Fri, 6 Jun 2025 16:17:21 +0330 Subject: [PATCH 10/11] fix space, apply unified naming for fixture, --- cpp/src/arrow/array/statistics_test.cc | 34 ++++++++++++-------------- cpp/src/arrow/compare.cc | 12 +++++---- cpp/src/arrow/compare.h | 3 +++ 3 files changed, 25 insertions(+), 24 deletions(-) diff --git a/cpp/src/arrow/array/statistics_test.cc b/cpp/src/arrow/array/statistics_test.cc index 9463fed8cfb..95199a9683b 100644 --- a/cpp/src/arrow/array/statistics_test.cc +++ b/cpp/src/arrow/array/statistics_test.cc @@ -15,7 +15,8 @@ // specific language governing permissions and limitations // under the License. -#include +#include +#include #include @@ -24,7 +25,7 @@ namespace arrow { -TEST(ArrayStatisticsTest, TestNullCount) { +TEST(TestArrayStatistics, NullCount) { ArrayStatistics statistics; ASSERT_FALSE(statistics.null_count.has_value()); statistics.null_count = 29; @@ -32,7 +33,7 @@ TEST(ArrayStatisticsTest, TestNullCount) { ASSERT_EQ(29, statistics.null_count.value()); } -TEST(ArrayStatisticsTest, TestDistinctCount) { +TEST(TestArrayStatistics, DistinctCount) { ArrayStatistics statistics; ASSERT_FALSE(statistics.distinct_count.has_value()); statistics.distinct_count = 29; @@ -40,7 +41,7 @@ TEST(ArrayStatisticsTest, TestDistinctCount) { ASSERT_EQ(29, statistics.distinct_count.value()); } -TEST(ArrayStatisticsTest, TestMin) { +TEST(TestArrayStatistics, Min) { ArrayStatistics statistics; ASSERT_FALSE(statistics.min.has_value()); ASSERT_FALSE(statistics.is_min_exact); @@ -52,7 +53,7 @@ TEST(ArrayStatisticsTest, TestMin) { ASSERT_TRUE(statistics.is_min_exact); } -TEST(ArrayStatisticsTest, TestMax) { +TEST(TestArrayStatistics, Max) { ArrayStatistics statistics; ASSERT_FALSE(statistics.max.has_value()); ASSERT_FALSE(statistics.is_max_exact); @@ -64,7 +65,7 @@ TEST(ArrayStatisticsTest, TestMax) { ASSERT_FALSE(statistics.is_max_exact); } -TEST(ArrayStatisticsTest, TestEqualityNonDoulbeValue) { +TEST(TestArrayStatistics, EqualityNonDoulbeValue) { ArrayStatistics statistics1; ArrayStatistics statistics2; @@ -105,55 +106,50 @@ TEST(ArrayStatisticsTest, TestEqualityNonDoulbeValue) { statistics1.max = static_cast(29); ASSERT_NE(statistics1, statistics2); } -class TestEqualityDoubleValue : public ::testing::Test { + +class TestArrayStatisticsEqualityDoubleValue : public ::testing::Test { protected: ArrayStatistics statistics1_; ArrayStatistics statistics2_; EqualOptions options_ = EqualOptions::Defaults(); }; -TEST_F(TestEqualityDoubleValue, ExactValue) { +TEST_F(TestArrayStatisticsEqualityDoubleValue, ExactValue) { statistics2_.min = 29.0; - ASSERT_NE(statistics1_, statistics2_); statistics1_.min = 29.0; ASSERT_EQ(statistics1_, statistics2_); statistics2_.min = 30.0; ASSERT_NE(statistics1_, statistics2_); } -TEST_F(TestEqualityDoubleValue, SignedZero) { +TEST_F(TestArrayStatisticsEqualityDoubleValue, SignedZero) { statistics1_.min = +0.0; statistics2_.min = -0.0; ASSERT_TRUE(statistics1_.Equals(statistics2_, options_.signed_zeros_equal(true))); ASSERT_FALSE(statistics1_.Equals(statistics2_, options_.signed_zeros_equal(false))); } -TEST_F(TestEqualityDoubleValue, Infinity) { +TEST_F(TestArrayStatisticsEqualityDoubleValue, Infinity) { auto infinity = std::numeric_limits::infinity(); statistics1_.min = infinity; statistics2_.min = infinity; ASSERT_EQ(statistics1_, statistics2_); statistics1_.min = -infinity; ASSERT_NE(statistics1_, statistics2_); - statistics1_.min = 0.0; - ASSERT_NE(statistics1_, statistics2_); } -TEST_F(TestEqualityDoubleValue, NaN) { +TEST_F(TestArrayStatisticsEqualityDoubleValue, NaN) { statistics1_.min = std::numeric_limits::quiet_NaN(); statistics2_.min = std::numeric_limits::quiet_NaN(); ASSERT_TRUE(statistics1_.Equals(statistics2_, options_.nans_equal(true))); ASSERT_FALSE(statistics1_.Equals(statistics2_, options_.nans_equal(false))); } -TEST_F(TestEqualityDoubleValue, ApproximateEquals) { + +TEST_F(TestArrayStatisticsEqualityDoubleValue, ApproximateEquals) { statistics1_.max = 0.5001f; statistics2_.max = 0.5; ASSERT_FALSE(statistics1_.Equals(statistics2_, options_.atol(1e-3).use_atol(false))); - ASSERT_TRUE(statistics1_.Equals(statistics2_, options_.atol(1e-3))); - ASSERT_TRUE(statistics2_.Equals(statistics1_, options_.atol(1e-3))); - ASSERT_FALSE(statistics1_.Equals(statistics2_, options_.atol(1e-5))); - ASSERT_FALSE(statistics2_.Equals(statistics1_, options_.atol(1e-5))); } } // namespace arrow diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 5ad0f3e175e..ba44e160c55 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -1526,8 +1527,6 @@ bool TypeEquals(const DataType& left, const DataType& right, bool check_metadata } namespace { -using ValueType = ArrayStatistics::ValueType; - bool DoubleEquals(const double& left, const double& right, const EqualOptions& options) { bool result; auto visitor = [&](auto&& compare_func) { result = compare_func(left, right); }; @@ -1535,9 +1534,9 @@ bool DoubleEquals(const double& left, const double& right, const EqualOptions& o return result; } -bool ArrayStatisticsValueTypeEquals(const std::optional& left, - const std::optional& right, - const EqualOptions& options) { +bool ArrayStatisticsValueTypeEquals( + const std::optional& left, + const std::optional& right, const EqualOptions& options) { if (!left.has_value() || !right.has_value()) { return left.has_value() == right.has_value(); } else if (left->index() != right->index()) { @@ -1559,6 +1558,7 @@ bool ArrayStatisticsValueTypeEquals(const std::optional& left, return std::visit(EqualsVisitor, left.value(), right.value()); } } + bool ArrayStatisticsEqualsImpl(const ArrayStatistics& left, const ArrayStatistics& right, const EqualOptions& equal_options) { return left.null_count == right.null_count && @@ -1568,10 +1568,12 @@ bool ArrayStatisticsEqualsImpl(const ArrayStatistics& left, const ArrayStatistic ArrayStatisticsValueTypeEquals(left.min, right.min, equal_options) && ArrayStatisticsValueTypeEquals(left.max, right.max, equal_options); } + } // namespace bool ArrayStatisticsEquals(const ArrayStatistics& left, const ArrayStatistics& right, const EqualOptions& options) { return ArrayStatisticsEqualsImpl(left, right, options); } + } // namespace arrow diff --git a/cpp/src/arrow/compare.h b/cpp/src/arrow/compare.h index 449571a5998..ec7dc8bda18 100644 --- a/cpp/src/arrow/compare.h +++ b/cpp/src/arrow/compare.h @@ -26,6 +26,7 @@ #include "arrow/util/visibility.h" namespace arrow { + struct ArrayStatistics; class Array; class DataType; @@ -57,6 +58,7 @@ class EqualOptions { res.signed_zeros_equal_ = v; return res; } + /// Whether the "atol" property is used in the comparison. bool use_atol() const { return use_atol_; } @@ -145,6 +147,7 @@ ARROW_EXPORT bool SparseTensorEquals(const SparseTensor& left, const SparseTenso /// fields ARROW_EXPORT bool TypeEquals(const DataType& left, const DataType& right, bool check_metadata = true); + /// \brief Check two \ref arrow::ArrayStatistics for equality /// \param[in] left an \ref arrow::ArrayStatistics /// \param[in] right an \ref arrow::ArrayStatistics From e06599f07b161dad09e717a57836611418725e2c Mon Sep 17 00:00:00 2001 From: Arash Andishgar Date: Sat, 7 Jun 2025 01:45:46 +0330 Subject: [PATCH 11/11] fix minor linting --- cpp/src/arrow/compare.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index ba44e160c55..2460afbf87c 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -1525,6 +1525,7 @@ bool TypeEquals(const DataType& left, const DataType& right, bool check_metadata return visitor.result(); } } + namespace { bool DoubleEquals(const double& left, const double& right, const EqualOptions& options) {