From 0002140b2274867381b6cc77f74ca23babdfc5b3 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 23 Aug 2024 14:32:07 +0900 Subject: [PATCH 1/5] GH-43797: [C++] Attach `arrow::ArrayStatistics` to `arrow::ArrayData` If we can attach associated statistics to an array via `ArrayData`, we can use it in later processes such as query planning. --- cpp/src/arrow/array/array_base.h | 8 +++ cpp/src/arrow/array/array_test.cc | 102 ++++++++++++++++++++++++++++++ cpp/src/arrow/array/data.h | 12 +++- 3 files changed, 120 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/array/array_base.h b/cpp/src/arrow/array/array_base.h index 716ae072206..c36d4518bdb 100644 --- a/cpp/src/arrow/array/array_base.h +++ b/cpp/src/arrow/array/array_base.h @@ -232,6 +232,14 @@ class ARROW_EXPORT Array { /// \return DeviceAllocationType DeviceAllocationType device_type() const { return data_->device_type(); } + /// \brief Return the statistics of this Array + /// + /// This just delegates to calling statistics on the underlying ArrayData + /// object which backs this Array. + /// + /// \return const ArrayStatistics& + const ArrayStatistics& statistics() const { return data_->statistics; } + protected: Array() = default; ARROW_DEFAULT_MOVE_AND_ASSIGN(Array); diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 32806d9d2ed..7fdf60fbf3d 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -3709,6 +3709,108 @@ TEST(TestSwapEndianArrayData, InvalidLength) { } } +class TestArrayDataStatistics : public ::testing::Test { + public: + void SetUp() { + valids_ = {1, 0, 1, 1}; + null_count_ = std::count(valids_.begin(), valids_.end(), 0); + null_buffer_ = *internal::BytesToBits(valids_); + values_ = {1, 0, 3, -4}; + min_ = *std::min_element(values_.begin(), values_.end()); + max_ = *std::max_element(values_.begin(), values_.end()); + values_buffer_ = Buffer::FromVector(values_); + data_ = ArrayData::Make(int32(), values_.size(), {null_buffer_, values_buffer_}, + null_count_); + data_->statistics.null_count = null_count_; + data_->statistics.min = min_; + data_->statistics.is_min_exact = true; + data_->statistics.max = max_; + data_->statistics.is_max_exact = true; + } + + protected: + std::vector valids_; + size_t null_count_; + std::shared_ptr null_buffer_; + std::vector values_; + int64_t min_; + int64_t max_; + std::shared_ptr values_buffer_; + std::shared_ptr data_; +}; + +TEST_F(TestArrayDataStatistics, MoveConstructor) { + ArrayData copied_data(*data_); + ArrayData moved_data(std::move(copied_data)); + + ASSERT_TRUE(moved_data.statistics.null_count.has_value()); + ASSERT_EQ(null_count_, moved_data.statistics.null_count.value()); + + ASSERT_TRUE(moved_data.statistics.min.has_value()); + ASSERT_TRUE(std::holds_alternative(moved_data.statistics.min.value())); + ASSERT_EQ(min_, std::get(moved_data.statistics.min.value())); + ASSERT_TRUE(moved_data.statistics.is_min_exact); + + ASSERT_TRUE(moved_data.statistics.max.has_value()); + ASSERT_TRUE(std::holds_alternative(moved_data.statistics.max.value())); + ASSERT_EQ(max_, std::get(moved_data.statistics.max.value())); + ASSERT_TRUE(moved_data.statistics.is_max_exact); +} + +TEST_F(TestArrayDataStatistics, CopyConstructor) { + ArrayData copied_data(*data_); + + ASSERT_TRUE(copied_data.statistics.null_count.has_value()); + ASSERT_EQ(null_count_, copied_data.statistics.null_count.value()); + + ASSERT_TRUE(copied_data.statistics.min.has_value()); + ASSERT_TRUE(std::holds_alternative(copied_data.statistics.min.value())); + ASSERT_EQ(min_, std::get(copied_data.statistics.min.value())); + ASSERT_TRUE(copied_data.statistics.is_min_exact); + + ASSERT_TRUE(copied_data.statistics.max.has_value()); + ASSERT_TRUE(std::holds_alternative(copied_data.statistics.max.value())); + ASSERT_EQ(max_, std::get(copied_data.statistics.max.value())); + ASSERT_TRUE(copied_data.statistics.is_max_exact); +} + +TEST_F(TestArrayDataStatistics, MoveAssignment) { + ArrayData copied_data(*data_); + ArrayData moved_data; + moved_data = std::move(copied_data); + + ASSERT_TRUE(moved_data.statistics.null_count.has_value()); + ASSERT_EQ(null_count_, moved_data.statistics.null_count.value()); + + ASSERT_TRUE(moved_data.statistics.min.has_value()); + ASSERT_TRUE(std::holds_alternative(moved_data.statistics.min.value())); + ASSERT_EQ(min_, std::get(moved_data.statistics.min.value())); + ASSERT_TRUE(moved_data.statistics.is_min_exact); + + ASSERT_TRUE(moved_data.statistics.max.has_value()); + ASSERT_TRUE(std::holds_alternative(moved_data.statistics.max.value())); + ASSERT_EQ(max_, std::get(moved_data.statistics.max.value())); + ASSERT_TRUE(moved_data.statistics.is_max_exact); +} + +TEST_F(TestArrayDataStatistics, CopyAssignment) { + ArrayData copied_data; + copied_data = *data_; + + ASSERT_TRUE(copied_data.statistics.null_count.has_value()); + ASSERT_EQ(null_count_, copied_data.statistics.null_count.value()); + + ASSERT_TRUE(copied_data.statistics.min.has_value()); + ASSERT_TRUE(std::holds_alternative(copied_data.statistics.min.value())); + ASSERT_EQ(min_, std::get(copied_data.statistics.min.value())); + ASSERT_TRUE(copied_data.statistics.is_min_exact); + + ASSERT_TRUE(copied_data.statistics.max.has_value()); + ASSERT_TRUE(std::holds_alternative(copied_data.statistics.max.value())); + ASSERT_EQ(max_, std::get(copied_data.statistics.max.value())); + ASSERT_TRUE(copied_data.statistics.is_max_exact); +} + template class TestPrimitiveArray : public ::testing::Test { public: diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h index e0508fe6980..14eaed67e71 100644 --- a/cpp/src/arrow/array/data.h +++ b/cpp/src/arrow/array/data.h @@ -24,6 +24,7 @@ #include #include +#include "arrow/array/statistics.h" #include "arrow/buffer.h" #include "arrow/result.h" #include "arrow/type.h" @@ -152,7 +153,8 @@ struct ARROW_EXPORT ArrayData { offset(other.offset), buffers(std::move(other.buffers)), child_data(std::move(other.child_data)), - dictionary(std::move(other.dictionary)) { + dictionary(std::move(other.dictionary)), + statistics(std::move(other.statistics)) { SetNullCount(other.null_count); } @@ -163,7 +165,8 @@ struct ARROW_EXPORT ArrayData { offset(other.offset), buffers(other.buffers), child_data(other.child_data), - dictionary(other.dictionary) { + dictionary(other.dictionary), + statistics(other.statistics) { SetNullCount(other.null_count); } @@ -176,6 +179,7 @@ struct ARROW_EXPORT ArrayData { buffers = std::move(other.buffers); child_data = std::move(other.child_data); dictionary = std::move(other.dictionary); + statistics = std::move(other.statistics); return *this; } @@ -188,6 +192,7 @@ struct ARROW_EXPORT ArrayData { buffers = other.buffers; child_data = other.child_data; dictionary = other.dictionary; + statistics = other.statistics; return *this; } @@ -390,6 +395,9 @@ struct ARROW_EXPORT ArrayData { // The dictionary for this Array, if any. Only used for dictionary type std::shared_ptr dictionary; + + // The statistics for this Array. + ArrayStatistics statistics{}; }; /// \brief A non-owning Buffer reference From 97c2c7dcc13d296e2444031d8202c09d7b23c4ea Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 28 Aug 2024 09:39:27 +0900 Subject: [PATCH 2/5] Use pointer instead of embedding --- cpp/src/arrow/array/array_base.h | 2 +- cpp/src/arrow/array/array_test.cc | 91 ++++++++++++++++--------------- cpp/src/arrow/array/data.h | 2 +- 3 files changed, 48 insertions(+), 47 deletions(-) diff --git a/cpp/src/arrow/array/array_base.h b/cpp/src/arrow/array/array_base.h index c36d4518bdb..e4af67d7e5f 100644 --- a/cpp/src/arrow/array/array_base.h +++ b/cpp/src/arrow/array/array_base.h @@ -238,7 +238,7 @@ class ARROW_EXPORT Array { /// object which backs this Array. /// /// \return const ArrayStatistics& - const ArrayStatistics& statistics() const { return data_->statistics; } + std::shared_ptr statistics() const { return data_->statistics; } protected: Array() = default; diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 7fdf60fbf3d..da249fb7e87 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -3721,11 +3721,12 @@ class TestArrayDataStatistics : public ::testing::Test { values_buffer_ = Buffer::FromVector(values_); data_ = ArrayData::Make(int32(), values_.size(), {null_buffer_, values_buffer_}, null_count_); - data_->statistics.null_count = null_count_; - data_->statistics.min = min_; - data_->statistics.is_min_exact = true; - data_->statistics.max = max_; - data_->statistics.is_max_exact = true; + data_->statistics = std::make_shared(); + data_->statistics->null_count = null_count_; + data_->statistics->min = min_; + data_->statistics->is_min_exact = true; + data_->statistics->max = max_; + data_->statistics->is_max_exact = true; } protected: @@ -3743,35 +3744,35 @@ TEST_F(TestArrayDataStatistics, MoveConstructor) { ArrayData copied_data(*data_); ArrayData moved_data(std::move(copied_data)); - ASSERT_TRUE(moved_data.statistics.null_count.has_value()); - ASSERT_EQ(null_count_, moved_data.statistics.null_count.value()); + ASSERT_TRUE(moved_data.statistics->null_count.has_value()); + ASSERT_EQ(null_count_, moved_data.statistics->null_count.value()); - ASSERT_TRUE(moved_data.statistics.min.has_value()); - ASSERT_TRUE(std::holds_alternative(moved_data.statistics.min.value())); - ASSERT_EQ(min_, std::get(moved_data.statistics.min.value())); - ASSERT_TRUE(moved_data.statistics.is_min_exact); + ASSERT_TRUE(moved_data.statistics->min.has_value()); + ASSERT_TRUE(std::holds_alternative(moved_data.statistics->min.value())); + ASSERT_EQ(min_, std::get(moved_data.statistics->min.value())); + ASSERT_TRUE(moved_data.statistics->is_min_exact); - ASSERT_TRUE(moved_data.statistics.max.has_value()); - ASSERT_TRUE(std::holds_alternative(moved_data.statistics.max.value())); - ASSERT_EQ(max_, std::get(moved_data.statistics.max.value())); - ASSERT_TRUE(moved_data.statistics.is_max_exact); + ASSERT_TRUE(moved_data.statistics->max.has_value()); + ASSERT_TRUE(std::holds_alternative(moved_data.statistics->max.value())); + ASSERT_EQ(max_, std::get(moved_data.statistics->max.value())); + ASSERT_TRUE(moved_data.statistics->is_max_exact); } TEST_F(TestArrayDataStatistics, CopyConstructor) { ArrayData copied_data(*data_); - ASSERT_TRUE(copied_data.statistics.null_count.has_value()); - ASSERT_EQ(null_count_, copied_data.statistics.null_count.value()); + ASSERT_TRUE(copied_data.statistics->null_count.has_value()); + ASSERT_EQ(null_count_, copied_data.statistics->null_count.value()); - ASSERT_TRUE(copied_data.statistics.min.has_value()); - ASSERT_TRUE(std::holds_alternative(copied_data.statistics.min.value())); - ASSERT_EQ(min_, std::get(copied_data.statistics.min.value())); - ASSERT_TRUE(copied_data.statistics.is_min_exact); + ASSERT_TRUE(copied_data.statistics->min.has_value()); + ASSERT_TRUE(std::holds_alternative(copied_data.statistics->min.value())); + ASSERT_EQ(min_, std::get(copied_data.statistics->min.value())); + ASSERT_TRUE(copied_data.statistics->is_min_exact); - ASSERT_TRUE(copied_data.statistics.max.has_value()); - ASSERT_TRUE(std::holds_alternative(copied_data.statistics.max.value())); - ASSERT_EQ(max_, std::get(copied_data.statistics.max.value())); - ASSERT_TRUE(copied_data.statistics.is_max_exact); + ASSERT_TRUE(copied_data.statistics->max.has_value()); + ASSERT_TRUE(std::holds_alternative(copied_data.statistics->max.value())); + ASSERT_EQ(max_, std::get(copied_data.statistics->max.value())); + ASSERT_TRUE(copied_data.statistics->is_max_exact); } TEST_F(TestArrayDataStatistics, MoveAssignment) { @@ -3779,36 +3780,36 @@ TEST_F(TestArrayDataStatistics, MoveAssignment) { ArrayData moved_data; moved_data = std::move(copied_data); - ASSERT_TRUE(moved_data.statistics.null_count.has_value()); - ASSERT_EQ(null_count_, moved_data.statistics.null_count.value()); + ASSERT_TRUE(moved_data.statistics->null_count.has_value()); + ASSERT_EQ(null_count_, moved_data.statistics->null_count.value()); - ASSERT_TRUE(moved_data.statistics.min.has_value()); - ASSERT_TRUE(std::holds_alternative(moved_data.statistics.min.value())); - ASSERT_EQ(min_, std::get(moved_data.statistics.min.value())); - ASSERT_TRUE(moved_data.statistics.is_min_exact); + ASSERT_TRUE(moved_data.statistics->min.has_value()); + ASSERT_TRUE(std::holds_alternative(moved_data.statistics->min.value())); + ASSERT_EQ(min_, std::get(moved_data.statistics->min.value())); + ASSERT_TRUE(moved_data.statistics->is_min_exact); - ASSERT_TRUE(moved_data.statistics.max.has_value()); - ASSERT_TRUE(std::holds_alternative(moved_data.statistics.max.value())); - ASSERT_EQ(max_, std::get(moved_data.statistics.max.value())); - ASSERT_TRUE(moved_data.statistics.is_max_exact); + ASSERT_TRUE(moved_data.statistics->max.has_value()); + ASSERT_TRUE(std::holds_alternative(moved_data.statistics->max.value())); + ASSERT_EQ(max_, std::get(moved_data.statistics->max.value())); + ASSERT_TRUE(moved_data.statistics->is_max_exact); } TEST_F(TestArrayDataStatistics, CopyAssignment) { ArrayData copied_data; copied_data = *data_; - ASSERT_TRUE(copied_data.statistics.null_count.has_value()); - ASSERT_EQ(null_count_, copied_data.statistics.null_count.value()); + ASSERT_TRUE(copied_data.statistics->null_count.has_value()); + ASSERT_EQ(null_count_, copied_data.statistics->null_count.value()); - ASSERT_TRUE(copied_data.statistics.min.has_value()); - ASSERT_TRUE(std::holds_alternative(copied_data.statistics.min.value())); - ASSERT_EQ(min_, std::get(copied_data.statistics.min.value())); - ASSERT_TRUE(copied_data.statistics.is_min_exact); + ASSERT_TRUE(copied_data.statistics->min.has_value()); + ASSERT_TRUE(std::holds_alternative(copied_data.statistics->min.value())); + ASSERT_EQ(min_, std::get(copied_data.statistics->min.value())); + ASSERT_TRUE(copied_data.statistics->is_min_exact); - ASSERT_TRUE(copied_data.statistics.max.has_value()); - ASSERT_TRUE(std::holds_alternative(copied_data.statistics.max.value())); - ASSERT_EQ(max_, std::get(copied_data.statistics.max.value())); - ASSERT_TRUE(copied_data.statistics.is_max_exact); + ASSERT_TRUE(copied_data.statistics->max.has_value()); + ASSERT_TRUE(std::holds_alternative(copied_data.statistics->max.value())); + ASSERT_EQ(max_, std::get(copied_data.statistics->max.value())); + ASSERT_TRUE(copied_data.statistics->is_max_exact); } template diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h index 14eaed67e71..4915274cb69 100644 --- a/cpp/src/arrow/array/data.h +++ b/cpp/src/arrow/array/data.h @@ -397,7 +397,7 @@ struct ARROW_EXPORT ArrayData { std::shared_ptr dictionary; // The statistics for this Array. - ArrayStatistics statistics{}; + std::shared_ptr statistics; }; /// \brief A non-owning Buffer reference From ea167409edb2ddffa96d7406432da2526ddbcd6a Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 30 Aug 2024 13:29:03 +0900 Subject: [PATCH 3/5] Clear statistics on slice --- cpp/src/arrow/array/array_test.cc | 5 +++++ cpp/src/arrow/array/data.cc | 1 + 2 files changed, 6 insertions(+) diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index da249fb7e87..930095eb67e 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -3812,6 +3812,11 @@ TEST_F(TestArrayDataStatistics, CopyAssignment) { ASSERT_TRUE(copied_data.statistics->is_max_exact); } +TEST_F(TestArrayDataStatistics, Slice) { + auto sliced_data = data_->Slice(0, 1); + ASSERT_FALSE(sliced_data->statistics); +} + template class TestPrimitiveArray : public ::testing::Test { public: diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index 83eeb56c496..ee6b04bfca7 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -195,6 +195,7 @@ std::shared_ptr ArrayData::Slice(int64_t off, int64_t len) const { } else { copy->null_count = null_count != 0 ? kUnknownNullCount : 0; } + copy->statistics = nullptr; return copy; } From 942f757e8bc0aaa7cf788bdf0c4091b63c9af7e5 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 30 Aug 2024 13:37:07 +0900 Subject: [PATCH 4/5] Add support for CopyTo --- cpp/src/arrow/array/array_test.cc | 18 ++++++++++++++++++ cpp/src/arrow/array/data.cc | 2 ++ 2 files changed, 20 insertions(+) diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 930095eb67e..73e0c692432 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -3812,6 +3812,24 @@ TEST_F(TestArrayDataStatistics, CopyAssignment) { ASSERT_TRUE(copied_data.statistics->is_max_exact); } +TEST_F(TestArrayDataStatistics, CopyTo) { + ASSERT_OK_AND_ASSIGN(auto copied_data, + data_->CopyTo(arrow::default_cpu_memory_manager())); + + ASSERT_TRUE(copied_data->statistics->null_count.has_value()); + ASSERT_EQ(null_count_, copied_data->statistics->null_count.value()); + + ASSERT_TRUE(copied_data->statistics->min.has_value()); + ASSERT_TRUE(std::holds_alternative(copied_data->statistics->min.value())); + ASSERT_EQ(min_, std::get(copied_data->statistics->min.value())); + ASSERT_TRUE(copied_data->statistics->is_min_exact); + + ASSERT_TRUE(copied_data->statistics->max.has_value()); + ASSERT_TRUE(std::holds_alternative(copied_data->statistics->max.value())); + ASSERT_EQ(max_, std::get(copied_data->statistics->max.value())); + ASSERT_TRUE(copied_data->statistics->is_max_exact); +} + TEST_F(TestArrayDataStatistics, Slice) { auto sliced_data = data_->Slice(0, 1); ASSERT_FALSE(sliced_data->statistics); diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index ee6b04bfca7..8e29297a8c1 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -165,6 +165,8 @@ Result> CopyToImpl(const ArrayData& data, ARROW_ASSIGN_OR_RAISE(output->dictionary, CopyToImpl(*data.dictionary, to, copy_fn)); } + output->statistics = data.statistics; + return output; } } // namespace From 72f3f2a117d4af03ea509992fb00d8f5de9b6b4b Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sun, 1 Sep 2024 06:12:12 +0900 Subject: [PATCH 5/5] Add a comment for Slice() and statistics --- cpp/src/arrow/array/data.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h index 4915274cb69..1e6ee9a1d32 100644 --- a/cpp/src/arrow/array/data.h +++ b/cpp/src/arrow/array/data.h @@ -279,6 +279,18 @@ struct ARROW_EXPORT ArrayData { } /// \brief Construct a zero-copy slice of the data with the given offset and length + /// + /// The associated `ArrayStatistics` is always discarded in a sliced + /// `ArrayData`. Because `ArrayStatistics` in the original + /// `ArrayData` may be invalid in a sliced `ArrayData`. If you want + /// to reuse statistics in the original `ArrayData`, you need to do + /// it by yourself. + /// + /// If the specified slice range has the same range as the original + /// `ArrayData`, we can reuse statistics in the original + /// `ArrayData`. Because it has the same data as the original + /// `ArrayData`. But the associated `ArrayStatistics` is discarded + /// in this case too. Use `Copy()` instead for the case. std::shared_ptr Slice(int64_t offset, int64_t length) const; /// \brief Input-checking variant of Slice