From bed8e3de07601b1087d56d75e389c1dbcfd78468 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 16 Jul 2024 17:10:23 +0900 Subject: [PATCH 01/10] GH-41909: [C++] Add arrow::ArrayStatistics See GH-42133 for how to use this for Apache Parquet statistics. --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/array/statistics.h | 76 ++++++++++++++++++ cpp/src/arrow/array/statistics_test.cc | 103 +++++++++++++++++++++++++ 3 files changed, 180 insertions(+) create mode 100644 cpp/src/arrow/array/statistics.h create mode 100644 cpp/src/arrow/array/statistics_test.cc diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 6dc8358f502..22fe51456a3 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -1168,6 +1168,7 @@ add_arrow_test(array_test array/array_struct_test.cc array/array_union_test.cc array/array_view_test.cc + array/statistics_test.cc PRECOMPILED_HEADERS "$<$:arrow/testing/pch.h>") diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h new file mode 100644 index 00000000000..fa399f92e83 --- /dev/null +++ b/cpp/src/arrow/array/statistics.h @@ -0,0 +1,76 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include + +#include "arrow/util/visibility.h" + +namespace arrow { + +/// \brief Statistics for an Array +/// +/// Apache Arrow format doesn't have statistics but data source such +/// as Apache Parquet may have statistics. Statistics associate with +/// data source can be read unified API via this class. +struct ARROW_EXPORT ArrayStatistics { + public: + using ElementBufferType = std::variant; + + ArrayStatistics() = default; + ~ArrayStatistics() = default; + + /// \brief The number of null values, may not be set + std::optional null_count = std::nullopt; + + /// \brief The number of distinct values, may not be set + std::optional distinct_count = std::nullopt; + + /// \brief The minimum value buffer, may not be set + std::optional min_buffer = std::nullopt; + + /// \brief Whether the minimum value is exact or not, may not be set + std::optional is_min_exact = std::nullopt; + + /// \brief The maximum value buffer, may not be set + std::optional max_buffer = std::nullopt; + + /// \brief Whether the maximum value is exact or not, may not be set + std::optional is_max_exact = std::nullopt; + + /// \brief Check two statistics for equality + bool Equals(const ArrayStatistics& other) const { + return null_count == other.null_count && distinct_count == other.distinct_count && + min_buffer == other.min_buffer && is_min_exact == other.is_min_exact && + max_buffer == other.max_buffer && is_max_exact == other.is_max_exact; + } + + /// \brief Check two statistics for equality + bool operator==(const ArrayStatistics& other) const { + return Equals(other); + } + + /// \brief Check two statistics for not equality + bool operator!=(const ArrayStatistics& other) const { + return !Equals(other); + } +}; + +} // namespace arrow diff --git a/cpp/src/arrow/array/statistics_test.cc b/cpp/src/arrow/array/statistics_test.cc new file mode 100644 index 00000000000..ad1f407f06e --- /dev/null +++ b/cpp/src/arrow/array/statistics_test.cc @@ -0,0 +1,103 @@ +// 
Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "arrow/array/statistics.h" + +namespace arrow { + +TEST(ArrayStatisticsTest, TestNullCount) { + ArrayStatistics statistics; + ASSERT_FALSE(statistics.null_count.has_value()); + statistics.null_count = 29; + ASSERT_TRUE(statistics.null_count.has_value()); + ASSERT_EQ(29, statistics.null_count.value()); +} + +TEST(ArrayStatisticsTest, TestDistinctCount) { + ArrayStatistics statistics; + ASSERT_FALSE(statistics.distinct_count.has_value()); + statistics.distinct_count = 29; + ASSERT_TRUE(statistics.distinct_count.has_value()); + ASSERT_EQ(29, statistics.distinct_count.value()); +} + +TEST(ArrayStatisticsTest, TestMin) { + ArrayStatistics statistics; + ASSERT_FALSE(statistics.min_buffer.has_value()); + ASSERT_FALSE(statistics.is_min_exact.has_value()); + statistics.min_buffer = static_cast(29); + statistics.is_min_exact = true; + ASSERT_TRUE(statistics.min_buffer.has_value()); + ASSERT_TRUE(std::holds_alternative(statistics.min_buffer.value())); + ASSERT_EQ(29, std::get(statistics.min_buffer.value())); + ASSERT_TRUE(statistics.is_min_exact.has_value()); + ASSERT_TRUE(statistics.is_min_exact.value()); +} + +TEST(ArrayStatisticsTest, TestMax) { + 
ArrayStatistics statistics; + ASSERT_FALSE(statistics.max_buffer.has_value()); + ASSERT_FALSE(statistics.is_max_exact.has_value()); + statistics.max_buffer = static_cast(29); + statistics.is_max_exact = false; + ASSERT_TRUE(statistics.max_buffer.has_value()); + ASSERT_TRUE(std::holds_alternative(statistics.max_buffer.value())); + ASSERT_EQ(29, std::get(statistics.max_buffer.value())); + ASSERT_TRUE(statistics.is_max_exact.has_value()); + ASSERT_FALSE(statistics.is_max_exact.value()); +} + +TEST(ArrayStatisticsTest, TestEquality) { + ArrayStatistics statistics1; + ArrayStatistics statistics2; + + ASSERT_EQ(statistics1, statistics2); + + statistics1.null_count = 29; + ASSERT_NE(statistics1, statistics2); + statistics2.null_count = 29; + ASSERT_EQ(statistics1, statistics2); + + statistics1.distinct_count = 2929; + ASSERT_NE(statistics1, statistics2); + statistics2.distinct_count = 2929; + ASSERT_EQ(statistics1, statistics2); + + statistics1.min_buffer = static_cast(255); + ASSERT_NE(statistics1, statistics2); + statistics2.min_buffer = static_cast(255); + ASSERT_EQ(statistics1, statistics2); + + statistics1.is_min_exact = false; + ASSERT_NE(statistics1, statistics2); + statistics2.is_min_exact = false; + ASSERT_EQ(statistics1, statistics2); + + statistics1.max_buffer = static_cast(-255); + ASSERT_NE(statistics1, statistics2); + statistics2.max_buffer = static_cast(-255); + ASSERT_EQ(statistics1, statistics2); + + statistics1.is_max_exact = true; + ASSERT_NE(statistics1, statistics2); + statistics2.is_max_exact = true; + ASSERT_EQ(statistics1, statistics2); +} + +} // namespace arrow From f7857b1aea38a7ef2263ac13fa74792131077708 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 16 Jul 2024 17:29:40 +0900 Subject: [PATCH 02/10] Fix style --- cpp/src/arrow/array/statistics.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h index fa399f92e83..3be67950aa4 100644 --- 
a/cpp/src/arrow/array/statistics.h +++ b/cpp/src/arrow/array/statistics.h @@ -63,14 +63,10 @@ struct ARROW_EXPORT ArrayStatistics { } /// \brief Check two statistics for equality - bool operator==(const ArrayStatistics& other) const { - return Equals(other); - } + bool operator==(const ArrayStatistics& other) const { return Equals(other); } /// \brief Check two statistics for not equality - bool operator!=(const ArrayStatistics& other) const { - return !Equals(other); - } + bool operator!=(const ArrayStatistics& other) const { return !Equals(other); } }; } // namespace arrow From 1ee42712f106211efe61a1a0cdb9b4485cd4fa5b Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 17 Jul 2024 16:18:46 +0900 Subject: [PATCH 03/10] Fix a typo --- cpp/src/arrow/array/statistics.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h index 3be67950aa4..dde5d94e571 100644 --- a/cpp/src/arrow/array/statistics.h +++ b/cpp/src/arrow/array/statistics.h @@ -27,7 +27,7 @@ namespace arrow { /// \brief Statistics for an Array /// /// Apache Arrow format doesn't have statistics but data source such -/// as Apache Parquet may have statistics. Statistics associate with +/// as Apache Parquet may have statistics. Statistics associated with /// data source can be read unified API via this class. struct ARROW_EXPORT ArrayStatistics { public: From 97a013c800cac0e8f0573df59ada6a29a6c501a1 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 17 Jul 2024 16:19:04 +0900 Subject: [PATCH 04/10] Remove redundant public: --- cpp/src/arrow/array/statistics.h | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h index dde5d94e571..3c0939d83fc 100644 --- a/cpp/src/arrow/array/statistics.h +++ b/cpp/src/arrow/array/statistics.h @@ -30,7 +30,6 @@ namespace arrow { /// as Apache Parquet may have statistics. 
Statistics associated with /// data source can be read unified API via this class. struct ARROW_EXPORT ArrayStatistics { - public: using ElementBufferType = std::variant; From 0fbe9ef8492871dc8a3ab88fa2269849a1dc389a Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 17 Jul 2024 16:22:05 +0900 Subject: [PATCH 05/10] Add std::string and std::string_view to element types --- cpp/src/arrow/array/statistics.h | 7 +++++-- cpp/src/arrow/array/statistics_test.cc | 10 +++++----- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h index 3c0939d83fc..29967ce3c15 100644 --- a/cpp/src/arrow/array/statistics.h +++ b/cpp/src/arrow/array/statistics.h @@ -18,6 +18,8 @@ #pragma once #include +#include +#include #include #include "arrow/util/visibility.h" @@ -30,8 +32,9 @@ namespace arrow { /// as Apache Parquet may have statistics. Statistics associated with /// data source can be read unified API via this class. struct ARROW_EXPORT ArrayStatistics { - using ElementBufferType = std::variant; + using ElementBufferType = + std::variant; ArrayStatistics() = default; ~ArrayStatistics() = default; diff --git a/cpp/src/arrow/array/statistics_test.cc b/cpp/src/arrow/array/statistics_test.cc index ad1f407f06e..ae23f63ede4 100644 --- a/cpp/src/arrow/array/statistics_test.cc +++ b/cpp/src/arrow/array/statistics_test.cc @@ -54,11 +54,11 @@ TEST(ArrayStatisticsTest, TestMax) { ArrayStatistics statistics; ASSERT_FALSE(statistics.max_buffer.has_value()); ASSERT_FALSE(statistics.is_max_exact.has_value()); - statistics.max_buffer = static_cast(29); + statistics.max_buffer = std::string("hello"); statistics.is_max_exact = false; ASSERT_TRUE(statistics.max_buffer.has_value()); - ASSERT_TRUE(std::holds_alternative(statistics.max_buffer.value())); - ASSERT_EQ(29, std::get(statistics.max_buffer.value())); + ASSERT_TRUE(std::holds_alternative(statistics.max_buffer.value())); + ASSERT_EQ("hello", 
std::get(statistics.max_buffer.value())); ASSERT_TRUE(statistics.is_max_exact.has_value()); ASSERT_FALSE(statistics.is_max_exact.value()); } @@ -79,9 +79,9 @@ TEST(ArrayStatisticsTest, TestEquality) { statistics2.distinct_count = 2929; ASSERT_EQ(statistics1, statistics2); - statistics1.min_buffer = static_cast(255); + statistics1.min_buffer = std::string_view("world"); ASSERT_NE(statistics1, statistics2); - statistics2.min_buffer = static_cast(255); + statistics2.min_buffer = std::string_view("world"); ASSERT_EQ(statistics1, statistics2); statistics1.is_min_exact = false; From b6b4754a117084bac66f56d2069e55e89db5d3cd Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 17 Jul 2024 16:25:57 +0900 Subject: [PATCH 06/10] Rename ElementBufferType to ValueType and {min,max}_buffer to {min,max} --- cpp/src/arrow/array/statistics.h | 14 ++++++------- cpp/src/arrow/array/statistics_test.cc | 28 +++++++++++++------------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h index 29967ce3c15..9544581b8a3 100644 --- a/cpp/src/arrow/array/statistics.h +++ b/cpp/src/arrow/array/statistics.h @@ -32,7 +32,7 @@ namespace arrow { /// as Apache Parquet may have statistics. Statistics associated with /// data source can be read unified API via this class. 
struct ARROW_EXPORT ArrayStatistics { - using ElementBufferType = + using ValueType = std::variant; @@ -45,14 +45,14 @@ struct ARROW_EXPORT ArrayStatistics { /// \brief The number of distinct values, may not be set std::optional distinct_count = std::nullopt; - /// \brief The minimum value buffer, may not be set - std::optional min_buffer = std::nullopt; + /// \brief The minimum value, may not be set + std::optional min = std::nullopt; /// \brief Whether the minimum value is exact or not, may not be set std::optional is_min_exact = std::nullopt; - /// \brief The maximum value buffer, may not be set - std::optional max_buffer = std::nullopt; + /// \brief The maximum value, may not be set + std::optional max = std::nullopt; /// \brief Whether the maximum value is exact or not, may not be set std::optional is_max_exact = std::nullopt; @@ -60,8 +60,8 @@ struct ARROW_EXPORT ArrayStatistics { /// \brief Check two statistics for equality bool Equals(const ArrayStatistics& other) const { return null_count == other.null_count && distinct_count == other.distinct_count && - min_buffer == other.min_buffer && is_min_exact == other.is_min_exact && - max_buffer == other.max_buffer && is_max_exact == other.is_max_exact; + min == other.min && is_min_exact == other.is_min_exact && max == other.max && + is_max_exact == other.is_max_exact; } /// \brief Check two statistics for equality diff --git a/cpp/src/arrow/array/statistics_test.cc b/cpp/src/arrow/array/statistics_test.cc index ae23f63ede4..33f99266e37 100644 --- a/cpp/src/arrow/array/statistics_test.cc +++ b/cpp/src/arrow/array/statistics_test.cc @@ -39,26 +39,26 @@ TEST(ArrayStatisticsTest, TestDistinctCount) { TEST(ArrayStatisticsTest, TestMin) { ArrayStatistics statistics; - ASSERT_FALSE(statistics.min_buffer.has_value()); + ASSERT_FALSE(statistics.min.has_value()); ASSERT_FALSE(statistics.is_min_exact.has_value()); - statistics.min_buffer = static_cast(29); + statistics.min = static_cast(29); statistics.is_min_exact = true; 
- ASSERT_TRUE(statistics.min_buffer.has_value()); - ASSERT_TRUE(std::holds_alternative(statistics.min_buffer.value())); - ASSERT_EQ(29, std::get(statistics.min_buffer.value())); + ASSERT_TRUE(statistics.min.has_value()); + ASSERT_TRUE(std::holds_alternative(statistics.min.value())); + ASSERT_EQ(29, std::get(statistics.min.value())); ASSERT_TRUE(statistics.is_min_exact.has_value()); ASSERT_TRUE(statistics.is_min_exact.value()); } TEST(ArrayStatisticsTest, TestMax) { ArrayStatistics statistics; - ASSERT_FALSE(statistics.max_buffer.has_value()); + ASSERT_FALSE(statistics.max.has_value()); ASSERT_FALSE(statistics.is_max_exact.has_value()); - statistics.max_buffer = std::string("hello"); + statistics.max = std::string("hello"); statistics.is_max_exact = false; - ASSERT_TRUE(statistics.max_buffer.has_value()); - ASSERT_TRUE(std::holds_alternative(statistics.max_buffer.value())); - ASSERT_EQ("hello", std::get(statistics.max_buffer.value())); + ASSERT_TRUE(statistics.max.has_value()); + ASSERT_TRUE(std::holds_alternative(statistics.max.value())); + ASSERT_EQ("hello", std::get(statistics.max.value())); ASSERT_TRUE(statistics.is_max_exact.has_value()); ASSERT_FALSE(statistics.is_max_exact.value()); } @@ -79,9 +79,9 @@ TEST(ArrayStatisticsTest, TestEquality) { statistics2.distinct_count = 2929; ASSERT_EQ(statistics1, statistics2); - statistics1.min_buffer = std::string_view("world"); + statistics1.min = std::string_view("world"); ASSERT_NE(statistics1, statistics2); - statistics2.min_buffer = std::string_view("world"); + statistics2.min = std::string_view("world"); ASSERT_EQ(statistics1, statistics2); statistics1.is_min_exact = false; @@ -89,9 +89,9 @@ TEST(ArrayStatisticsTest, TestEquality) { statistics2.is_min_exact = false; ASSERT_EQ(statistics1, statistics2); - statistics1.max_buffer = static_cast(-255); + statistics1.max = static_cast(-255); ASSERT_NE(statistics1, statistics2); - statistics2.max_buffer = static_cast(-255); + statistics2.max = static_cast(-255); 
ASSERT_EQ(statistics1, statistics2); statistics1.is_max_exact = true; From 2af686c7ee6b73f829043fe8af6d921784f49b6f Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 17 Jul 2024 16:28:54 +0900 Subject: [PATCH 07/10] Add an empty statistics.cc for Windows --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/array/statistics.cc | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) create mode 100644 cpp/src/arrow/array/statistics.cc diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 22fe51456a3..9c66a58c542 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -412,6 +412,7 @@ arrow_add_object_library(ARROW_ARRAY array/concatenate.cc array/data.cc array/diff.cc + array/statistics.cc array/util.cc array/validate.cc) diff --git a/cpp/src/arrow/array/statistics.cc b/cpp/src/arrow/array/statistics.cc new file mode 100644 index 00000000000..39951b90aac --- /dev/null +++ b/cpp/src/arrow/array/statistics.cc @@ -0,0 +1,18 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/array/statistics.h" From 4639d5807d4b55986ecbc426778457de03f82afa Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 17 Jul 2024 16:45:10 +0900 Subject: [PATCH 08/10] Add missing cstdint --- cpp/src/arrow/array/statistics.h | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h index 9544581b8a3..ab9069e3a74 100644 --- a/cpp/src/arrow/array/statistics.h +++ b/cpp/src/arrow/array/statistics.h @@ -17,6 +17,7 @@ #pragma once +#include #include #include #include From e935d66059adc391d05c78091f3865ef9a221f96 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 17 Jul 2024 17:38:55 +0900 Subject: [PATCH 09/10] Add a comment why statistics.cc is needed --- cpp/src/arrow/array/statistics.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/src/arrow/array/statistics.cc b/cpp/src/arrow/array/statistics.cc index 39951b90aac..b661c9fbaff 100644 --- a/cpp/src/arrow/array/statistics.cc +++ b/cpp/src/arrow/array/statistics.cc @@ -15,4 +15,7 @@ // specific language governing permissions and limitations // under the License. +// This empty .cc file is for embedding not inlined symbols in +// arrow::ArrayStatistics into libarrow. 
+ +#include "arrow/array/statistics.h" From 2c60782ced9b1f75b9089ed36dc2fd326cc96523 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 2 Aug 2024 16:16:55 +0900 Subject: [PATCH 10/10] Add float family types to ValueType --- cpp/src/arrow/array/statistics.h | 3 ++- cpp/src/arrow/array/statistics_test.cc | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h index ab9069e3a74..7357e27f41f 100644 --- a/cpp/src/arrow/array/statistics.h +++ b/cpp/src/arrow/array/statistics.h @@ -23,6 +23,7 @@ #include #include +#include "arrow/util/float16.h" #include "arrow/util/visibility.h" namespace arrow { @@ -35,7 +36,7 @@ namespace arrow { struct ARROW_EXPORT ArrayStatistics { using ValueType = std::variant; + uint64_t, util::Float16, float, double, std::string, std::string_view>; ArrayStatistics() = default; ~ArrayStatistics() = default; diff --git a/cpp/src/arrow/array/statistics_test.cc b/cpp/src/arrow/array/statistics_test.cc index 33f99266e37..a465ac0bc2e 100644 --- a/cpp/src/arrow/array/statistics_test.cc +++ b/cpp/src/arrow/array/statistics_test.cc @@ -89,9 +89,9 @@ TEST(ArrayStatisticsTest, TestEquality) { statistics2.is_min_exact = false; ASSERT_EQ(statistics1, statistics2); - statistics1.max = static_cast(-255); + statistics1.max = arrow::util::Float16(-29); ASSERT_NE(statistics1, statistics2); - statistics2.max = static_cast(-255); + statistics2.max = arrow::util::Float16(-29); ASSERT_EQ(statistics1, statistics2); statistics1.is_max_exact = true;