diff --git a/cpp/src/arrow/array/array_base.h b/cpp/src/arrow/array/array_base.h index 716ae072206..76f29ae2e49 100644 --- a/cpp/src/arrow/array/array_base.h +++ b/cpp/src/arrow/array/array_base.h @@ -24,6 +24,7 @@ #include #include "arrow/array/data.h" +#include "arrow/array/statistics.h" #include "arrow/buffer.h" #include "arrow/compare.h" #include "arrow/result.h" @@ -232,6 +233,11 @@ class ARROW_EXPORT Array { /// \return DeviceAllocationType DeviceAllocationType device_type() const { return data_->device_type(); } + /// \brief Return the statistics of this Array + /// + /// \return const std::shared_ptr& + const std::shared_ptr& statistics() const { return statistics_; } + protected: Array() = default; ARROW_DEFAULT_MOVE_AND_ASSIGN(Array); @@ -240,7 +246,20 @@ class ARROW_EXPORT Array { const uint8_t* null_bitmap_data_ = NULLPTR; /// Protected method for constructors - void SetData(const std::shared_ptr& data) { + void Init(const std::shared_ptr& data, + const std::shared_ptr& statistics) { + ValidateData(data); + SetData(data); + if (statistics) { + SetStatistics(statistics); + } + } + + /// Protected method for constructors + virtual void ValidateData(const std::shared_ptr& data) {} + + /// Protected method for constructors + virtual void SetData(const std::shared_ptr& data) { if (data->buffers.size() > 0) { null_bitmap_data_ = data->GetValuesSafe(0, /*offset=*/0); } else { @@ -249,6 +268,14 @@ class ARROW_EXPORT Array { data_ = data; } + // The statistics for this Array. + std::shared_ptr statistics_; + + /// Protected method for constructors + void SetStatistics(const std::shared_ptr& statistics) { + statistics_ = statistics; + } + private: ARROW_DISALLOW_COPY_AND_ASSIGN(Array); @@ -280,12 +307,15 @@ class ARROW_EXPORT PrimitiveArray : public FlatArray { protected: PrimitiveArray() : raw_values_(NULLPTR) {} - void SetData(const std::shared_ptr& data) { - this->Array::SetData(data); + void SetData(const std::shared_ptr& data) override { + Array::SetData(data); raw_values_ = data->GetValuesSafe(1, /*offset=*/0); } - explicit PrimitiveArray(const std::shared_ptr& data) { SetData(data); } + explicit PrimitiveArray(const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } const uint8_t* raw_values_; }; @@ -295,11 +325,14 @@ class ARROW_EXPORT NullArray : public FlatArray { public: using TypeClass = NullType; - explicit NullArray(const std::shared_ptr& data) { SetData(data); } + explicit NullArray(const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } explicit NullArray(int64_t length); private: - void SetData(const std::shared_ptr& data) { + void SetData(const std::shared_ptr& data) override { null_bitmap_data_ = NULLPTR; data->null_count = data->length; data_ = data; diff --git a/cpp/src/arrow/array/array_binary.cc b/cpp/src/arrow/array/array_binary.cc index d83ba0ca893..5ed8283c885 100644 --- a/cpp/src/arrow/array/array_binary.cc +++ b/cpp/src/arrow/array/array_binary.cc @@ -32,9 +32,8 @@ namespace arrow { using internal::checked_cast; -BinaryArray::BinaryArray(const std::shared_ptr& data) { +void BinaryArray::ValidateData(const std::shared_ptr& data) { ARROW_CHECK(is_binary_like(data->type->id())); - SetData(data); } BinaryArray::BinaryArray(int64_t length, const std::shared_ptr& value_offsets, @@ -45,9 +44,8 @@ BinaryArray::BinaryArray(int64_t length, const std::shared_ptr& value_of null_count, offset)); } -LargeBinaryArray::LargeBinaryArray(const std::shared_ptr& data) { +void LargeBinaryArray::ValidateData(const std::shared_ptr& data) { ARROW_CHECK(is_large_binary_like(data->type->id())); - SetData(data); } LargeBinaryArray::LargeBinaryArray(int64_t length, @@ -59,9 +57,8 @@ LargeBinaryArray::LargeBinaryArray(int64_t length, null_count, offset)); } -StringArray::StringArray(const std::shared_ptr& data) { +void StringArray::ValidateData(const std::shared_ptr& data) { ARROW_CHECK_EQ(data->type->id(), Type::STRING); - SetData(data); } StringArray::StringArray(int64_t length, const std::shared_ptr& value_offsets, @@ -74,9 +71,8 @@ StringArray::StringArray(int64_t length, const std::shared_ptr& value_of Status StringArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); } -LargeStringArray::LargeStringArray(const std::shared_ptr& data) { +void LargeStringArray::ValidateData(const std::shared_ptr& data) { ARROW_CHECK_EQ(data->type->id(), Type::LARGE_STRING); - SetData(data); } LargeStringArray::LargeStringArray(int64_t length, @@ -90,9 +86,8 @@ LargeStringArray::LargeStringArray(int64_t length, Status LargeStringArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); } -BinaryViewArray::BinaryViewArray(std::shared_ptr data) { +void BinaryViewArray::ValidateData(const std::shared_ptr& data) { ARROW_CHECK_EQ(data->type->id(), Type::BINARY_VIEW); - SetData(std::move(data)); } BinaryViewArray::BinaryViewArray(std::shared_ptr type, int64_t length, @@ -110,17 +105,12 @@ std::string_view BinaryViewArray::GetView(int64_t i) const { return util::FromBinaryView(raw_values_[i], data_buffers); } -StringViewArray::StringViewArray(std::shared_ptr data) { +void StringViewArray::ValidateData(const std::shared_ptr& data) { ARROW_CHECK_EQ(data->type->id(), Type::STRING_VIEW); - SetData(std::move(data)); } Status StringViewArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); } -FixedSizeBinaryArray::FixedSizeBinaryArray(const std::shared_ptr& data) { - SetData(data); -} - FixedSizeBinaryArray::FixedSizeBinaryArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& data, diff --git a/cpp/src/arrow/array/array_binary.h b/cpp/src/arrow/array/array_binary.h index 19fdee61243..76bf3d6e23c 100644 --- a/cpp/src/arrow/array/array_binary.h +++ b/cpp/src/arrow/array/array_binary.h @@ -142,8 +142,8 @@ class BaseBinaryArray : public FlatArray { BaseBinaryArray() = default; // Protected method for constructors - void SetData(const std::shared_ptr& data) { - this->Array::SetData(data); + void SetData(const std::shared_ptr& data) override { + Array::SetData(data); raw_value_offsets_ = data->GetValuesSafe(1, /*offset=*/0); raw_data_ = data->GetValuesSafe(2, /*offset=*/0); } @@ -155,7 +155,10 @@ class BaseBinaryArray : public FlatArray { /// Concrete Array class for variable-size binary data class ARROW_EXPORT BinaryArray : public BaseBinaryArray { public: - explicit BinaryArray(const std::shared_ptr& data); + explicit BinaryArray(const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } BinaryArray(int64_t length, const std::shared_ptr& value_offsets, const std::shared_ptr& data, @@ -165,6 +168,8 @@ class ARROW_EXPORT BinaryArray : public BaseBinaryArray { protected: // For subclasses such as StringArray BinaryArray() : BaseBinaryArray() {} + + void ValidateData(const std::shared_ptr& data) override; }; /// Concrete Array class for variable-size string (utf-8) data @@ -172,7 +177,10 @@ class ARROW_EXPORT StringArray : public BinaryArray { public: using TypeClass = StringType; - explicit StringArray(const std::shared_ptr& data); + explicit StringArray(const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } StringArray(int64_t length, const std::shared_ptr& value_offsets, const std::shared_ptr& data, @@ -183,12 +191,19 @@ class ARROW_EXPORT StringArray : public BinaryArray { /// /// This check is also implied by ValidateFull() Status ValidateUTF8() const; + + protected: + void ValidateData(const std::shared_ptr& data) override; }; /// Concrete Array class for large variable-size binary data class ARROW_EXPORT LargeBinaryArray : public BaseBinaryArray { public: - explicit LargeBinaryArray(const std::shared_ptr& data); + explicit LargeBinaryArray( + const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } LargeBinaryArray(int64_t length, const std::shared_ptr& value_offsets, const std::shared_ptr& data, @@ -198,6 +213,7 @@ class ARROW_EXPORT LargeBinaryArray : public BaseBinaryArray { protected: // For subclasses such as LargeStringArray LargeBinaryArray() : BaseBinaryArray() {} + void ValidateData(const std::shared_ptr& data) override; }; /// Concrete Array class for large variable-size string (utf-8) data @@ -205,7 +221,11 @@ class ARROW_EXPORT LargeStringArray : public LargeBinaryArray { public: using TypeClass = LargeStringType; - explicit LargeStringArray(const std::shared_ptr& data); + explicit LargeStringArray( + const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } LargeStringArray(int64_t length, const std::shared_ptr& value_offsets, const std::shared_ptr& data, @@ -216,6 +236,9 @@ class ARROW_EXPORT LargeStringArray : public LargeBinaryArray { /// /// This check is also implied by ValidateFull() Status ValidateUTF8() const; + + protected: + void ValidateData(const std::shared_ptr& data) override; }; // ---------------------------------------------------------------------- @@ -229,7 +252,10 @@ class ARROW_EXPORT BinaryViewArray : public FlatArray { using IteratorType = stl::ArrayIterator; using c_type = BinaryViewType::c_type; - explicit BinaryViewArray(std::shared_ptr data); + explicit BinaryViewArray(const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } BinaryViewArray(std::shared_ptr type, int64_t length, std::shared_ptr views, BufferVector data_buffers, @@ -251,10 +277,13 @@ class ARROW_EXPORT BinaryViewArray : public FlatArray { IteratorType end() const { return IteratorType(*this, length()); } protected: - using FlatArray::FlatArray; + // This constructor defers Init() to a derived array class + BinaryViewArray() = default; - void SetData(std::shared_ptr data) { - FlatArray::SetData(std::move(data)); + void ValidateData(const std::shared_ptr& data) override; + + void SetData(const std::shared_ptr& data) override { + FlatArray::SetData(data); raw_values_ = data_->GetValuesSafe(1); } @@ -267,7 +296,10 @@ class ARROW_EXPORT StringViewArray : public BinaryViewArray { public: using TypeClass = StringViewType; - explicit StringViewArray(std::shared_ptr data); + explicit StringViewArray(const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } using BinaryViewArray::BinaryViewArray; @@ -275,6 +307,9 @@ class ARROW_EXPORT StringViewArray : public BinaryViewArray { /// /// This check is also implied by ValidateFull() Status ValidateUTF8() const; + + protected: + void ValidateData(const std::shared_ptr& data) override; }; // ---------------------------------------------------------------------- @@ -286,7 +321,11 @@ class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray { using TypeClass = FixedSizeBinaryType; using IteratorType = stl::ArrayIterator; - explicit FixedSizeBinaryArray(const std::shared_ptr& data); + explicit FixedSizeBinaryArray( + const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } FixedSizeBinaryArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& data, @@ -315,8 +354,8 @@ class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray { IteratorType end() const { return IteratorType(*this, length()); } protected: - void SetData(const std::shared_ptr& data) { - this->PrimitiveArray::SetData(data); + void SetData(const std::shared_ptr& data) override { + PrimitiveArray::SetData(data); byte_width_ = internal::checked_cast(*type()).byte_width(); } diff --git a/cpp/src/arrow/array/array_dict.cc b/cpp/src/arrow/array/array_dict.cc index 7fd76a1dae8..6241104ce2a 100644 --- a/cpp/src/arrow/array/array_dict.cc +++ b/cpp/src/arrow/array/array_dict.cc @@ -78,11 +78,15 @@ int64_t DictionaryArray::GetValueIndex(int64_t i) const { } } -DictionaryArray::DictionaryArray(const std::shared_ptr& data) +DictionaryArray::DictionaryArray(const std::shared_ptr& data, + const std::shared_ptr& statistics) : dict_type_(checked_cast(data->type.get())) { + Init(data, statistics); +} + +void DictionaryArray::ValidateData(const std::shared_ptr& data) { ARROW_CHECK_EQ(data->type->id(), Type::DICTIONARY); ARROW_CHECK_NE(data->dictionary, nullptr); - SetData(data); } void DictionaryArray::SetData(const std::shared_ptr& data) { diff --git a/cpp/src/arrow/array/array_dict.h b/cpp/src/arrow/array/array_dict.h index bf376b51f8c..00f674b841c 100644 --- a/cpp/src/arrow/array/array_dict.h +++ b/cpp/src/arrow/array/array_dict.h @@ -54,7 +54,8 @@ class ARROW_EXPORT DictionaryArray : public Array { public: using TypeClass = DictionaryType; - explicit DictionaryArray(const std::shared_ptr& data); + explicit DictionaryArray(const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR); DictionaryArray(const std::shared_ptr& type, const std::shared_ptr& indices, @@ -114,7 +115,8 @@ class ARROW_EXPORT DictionaryArray : public Array { const DictionaryType* dict_type() const { return dict_type_; } private: - void SetData(const std::shared_ptr& data); + void ValidateData(const std::shared_ptr& data) override; + void SetData(const std::shared_ptr& data) override; const DictionaryType* dict_type_; std::shared_ptr indices_; diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index 47c0fd35829..20faa4e9499 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -518,10 +518,6 @@ Result> FlattenLogicalListRecursively(const Array& in_arr // ---------------------------------------------------------------------- // ListArray -ListArray::ListArray(std::shared_ptr data) { - ListArray::SetData(std::move(data)); -} - ListArray::ListArray(std::shared_ptr type, int64_t length, std::shared_ptr value_offsets, std::shared_ptr values, std::shared_ptr null_bitmap, int64_t null_count, @@ -575,10 +571,6 @@ std::shared_ptr ListArray::offsets() const { return BoxOffsets(int32(), * // ---------------------------------------------------------------------- // LargeListArray -LargeListArray::LargeListArray(const std::shared_ptr& data) { - LargeListArray::SetData(data); -} - LargeListArray::LargeListArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& value_offsets, const std::shared_ptr& values, @@ -636,21 +628,16 @@ std::shared_ptr LargeListArray::offsets() const { // ---------------------------------------------------------------------- // ListViewArray -ListViewArray::ListViewArray(std::shared_ptr data) { - ListViewArray::SetData(std::move(data)); -} - ListViewArray::ListViewArray(std::shared_ptr type, int64_t length, std::shared_ptr value_offsets, std::shared_ptr value_sizes, std::shared_ptr values, std::shared_ptr null_bitmap, int64_t null_count, - int64_t offset) { - ListViewArray::SetData(ArrayData::Make( - std::move(type), length, - {std::move(null_bitmap), std::move(value_offsets), std::move(value_sizes)}, - /*child_data=*/{values->data()}, null_count, offset)); -} + int64_t offset) + : ListViewArray(ArrayData::Make( + std::move(type), length, + {std::move(null_bitmap), std::move(value_offsets), std::move(value_sizes)}, + /*child_data=*/{values->data()}, null_count, offset)) {} void ListViewArray::SetData(const std::shared_ptr& data) { internal::SetListData(this, data); @@ -711,21 +698,16 @@ std::shared_ptr ListViewArray::sizes() const { return BoxSizes(int32(), * // ---------------------------------------------------------------------- // LargeListViewArray -LargeListViewArray::LargeListViewArray(std::shared_ptr data) { - LargeListViewArray::SetData(std::move(data)); -} - LargeListViewArray::LargeListViewArray(std::shared_ptr type, int64_t length, std::shared_ptr value_offsets, std::shared_ptr value_sizes, std::shared_ptr values, std::shared_ptr null_bitmap, - int64_t null_count, int64_t offset) { - LargeListViewArray::SetData(ArrayData::Make( - type, length, - {std::move(null_bitmap), std::move(value_offsets), std::move(value_sizes)}, - /*child_data=*/{values->data()}, null_count, offset)); -} + int64_t null_count, int64_t offset) + : LargeListViewArray(ArrayData::Make( + type, length, + {std::move(null_bitmap), std::move(value_offsets), std::move(value_sizes)}, + /*child_data=*/{values->data()}, null_count, offset)) {} void LargeListViewArray::SetData(const std::shared_ptr& data) { internal::SetListData(this, data); @@ -774,16 +756,13 @@ std::shared_ptr LargeListViewArray::sizes() const { // ---------------------------------------------------------------------- // MapArray -MapArray::MapArray(const std::shared_ptr& data) { SetData(data); } - MapArray::MapArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& offsets, const std::shared_ptr& values, const std::shared_ptr& null_bitmap, int64_t null_count, - int64_t offset) { - SetData(ArrayData::Make(type, length, {null_bitmap, offsets}, {values->data()}, - null_count, offset)); -} + int64_t offset) + : MapArray(ArrayData::Make(type, length, {null_bitmap, offsets}, {values->data()}, + null_count, offset)) {} MapArray::MapArray(const std::shared_ptr& type, int64_t length, BufferVector buffers, const std::shared_ptr& keys, @@ -908,9 +887,11 @@ Status MapArray::ValidateChildData( return Status::OK(); } -void MapArray::SetData(const std::shared_ptr& data) { +void MapArray::ValidateData(const std::shared_ptr& data) { ARROW_CHECK_OK(ValidateChildData(data->child_data)); +} +void MapArray::SetData(const std::shared_ptr& data) { internal::SetListData(this, data, Type::MAP); map_type_ = checked_cast(data->type.get()); const auto& pair_data = data->child_data[0]; @@ -921,10 +902,6 @@ void MapArray::SetData(const std::shared_ptr& data) { // ---------------------------------------------------------------------- // FixedSizeListArray -FixedSizeListArray::FixedSizeListArray(const std::shared_ptr& data) { - SetData(data); -} - FixedSizeListArray::FixedSizeListArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& values, @@ -935,9 +912,12 @@ FixedSizeListArray::FixedSizeListArray(const std::shared_ptr& type, SetData(internal_data); } -void FixedSizeListArray::SetData(const std::shared_ptr& data) { +void FixedSizeListArray::ValidateData(const std::shared_ptr& data) { ARROW_CHECK_EQ(data->type->id(), Type::FIXED_SIZE_LIST); - this->Array::SetData(data); +} + +void FixedSizeListArray::SetData(const std::shared_ptr& data) { + Array::SetData(data); ARROW_CHECK_EQ(list_type()->value_type()->id(), data->child_data[0]->type->id()); DCHECK(list_type()->value_type()->Equals(data->child_data[0]->type)); @@ -1004,9 +984,12 @@ Result> FixedSizeListArray::Flatten( // ---------------------------------------------------------------------- // Struct -StructArray::StructArray(const std::shared_ptr& data) { +void StructArray::ValidateData(const std::shared_ptr& data) { ARROW_CHECK_EQ(data->type->id(), Type::STRUCT); - SetData(data); +} + +void StructArray::SetData(const std::shared_ptr& data) { + Array::SetData(data); boxed_fields_.resize(data->child_data.size()); } @@ -1014,12 +997,11 @@ StructArray::StructArray(const std::shared_ptr& type, int64_t length, const std::vector>& children, std::shared_ptr null_bitmap, int64_t null_count, int64_t offset) { - ARROW_CHECK_EQ(type->id(), Type::STRUCT); - SetData(ArrayData::Make(type, length, {null_bitmap}, null_count, offset)); + auto data = ArrayData::Make(type, length, {null_bitmap}, null_count, offset); for (const auto& child : children) { - data_->child_data.push_back(child->data()); + data->child_data.push_back(child->data()); } - boxed_fields_.resize(children.size()); + SetData(std::move(data)); } Result> StructArray::Make( @@ -1177,8 +1159,8 @@ Result> StructArray::GetFlattenedField(int index, // ---------------------------------------------------------------------- // UnionArray -void UnionArray::SetData(std::shared_ptr data) { - this->Array::SetData(std::move(data)); +void UnionArray::SetData(const std::shared_ptr& data) { + Array::SetData(data); union_type_ = checked_cast(data_->type.get()); @@ -1187,8 +1169,8 @@ void UnionArray::SetData(std::shared_ptr data) { boxed_fields_.resize(data_->child_data.size()); } -void SparseUnionArray::SetData(std::shared_ptr data) { - this->UnionArray::SetData(std::move(data)); +void SparseUnionArray::SetData(const std::shared_ptr& data) { + UnionArray::SetData(data); ARROW_CHECK_EQ(data_->type->id(), Type::SPARSE_UNION); ARROW_CHECK_EQ(data_->buffers.size(), 2); @@ -1197,7 +1179,7 @@ void SparseUnionArray::SetData(std::shared_ptr data) { } void DenseUnionArray::SetData(const std::shared_ptr& data) { - this->UnionArray::SetData(data); + UnionArray::SetData(data); ARROW_CHECK_EQ(data_->type->id(), Type::DENSE_UNION); ARROW_CHECK_EQ(data_->buffers.size(), 3); @@ -1208,10 +1190,6 @@ void DenseUnionArray::SetData(const std::shared_ptr& data) { raw_value_offsets_ = data->GetValuesSafe(2, /*offset=*/0); } -SparseUnionArray::SparseUnionArray(std::shared_ptr data) { - SetData(std::move(data)); -} - SparseUnionArray::SparseUnionArray(std::shared_ptr type, int64_t length, ArrayVector children, std::shared_ptr type_codes, int64_t offset) { @@ -1261,10 +1239,6 @@ Result> SparseUnionArray::GetFlattenedField( return MakeArray(child_data); } -DenseUnionArray::DenseUnionArray(const std::shared_ptr& data) { - SetData(data); -} - DenseUnionArray::DenseUnionArray(std::shared_ptr type, int64_t length, ArrayVector children, std::shared_ptr type_ids, std::shared_ptr value_offsets, int64_t offset) { diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h index a6d4977839e..95b76f61fd5 100644 --- a/cpp/src/arrow/array/array_nested.h +++ b/cpp/src/arrow/array/array_nested.h @@ -162,7 +162,10 @@ class BaseListArray : public VarLengthListLikeArray { /// Concrete Array class for list data class ARROW_EXPORT ListArray : public BaseListArray { public: - explicit ListArray(std::shared_ptr data); + explicit ListArray(const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } ListArray(std::shared_ptr type, int64_t length, std::shared_ptr value_offsets, std::shared_ptr values, @@ -220,16 +223,19 @@ class ARROW_EXPORT ListArray : public BaseListArray { std::shared_ptr offsets() const; protected: - // This constructor defers SetData to a derived array class + // This constructor defers Init() to a derived array class ListArray() = default; - void SetData(const std::shared_ptr& data); + void SetData(const std::shared_ptr& data) override; }; /// Concrete Array class for large list data (with 64-bit offsets) class ARROW_EXPORT LargeListArray : public BaseListArray { public: - explicit LargeListArray(const std::shared_ptr& data); + explicit LargeListArray(const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } LargeListArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& value_offsets, @@ -284,7 +290,7 @@ class ARROW_EXPORT LargeListArray : public BaseListArray { std::shared_ptr offsets() const; protected: - void SetData(const std::shared_ptr& data); + void SetData(const std::shared_ptr& data) override; }; // ---------------------------------------------------------------------- @@ -324,7 +330,10 @@ class BaseListViewArray : public VarLengthListLikeArray { /// \brief Concrete Array class for list-view data class ARROW_EXPORT ListViewArray : public BaseListViewArray { public: - explicit ListViewArray(std::shared_ptr data); + explicit ListViewArray(const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } ListViewArray(std::shared_ptr type, int64_t length, std::shared_ptr value_offsets, @@ -402,17 +411,20 @@ class ARROW_EXPORT ListViewArray : public BaseListViewArray { std::shared_ptr sizes() const; protected: - // This constructor defers SetData to a derived array class + // This constructor defers Init() to a derived array class ListViewArray() = default; - - void SetData(const std::shared_ptr& data); + void SetData(const std::shared_ptr& data) override; }; /// \brief Concrete Array class for large list-view data (with 64-bit offsets /// and sizes) class ARROW_EXPORT LargeListViewArray : public BaseListViewArray { public: - explicit LargeListViewArray(std::shared_ptr data); + explicit LargeListViewArray( + const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } LargeListViewArray(std::shared_ptr type, int64_t length, std::shared_ptr value_offsets, @@ -486,10 +498,10 @@ class ARROW_EXPORT LargeListViewArray : public BaseListViewArray sizes() const; protected: - // This constructor defers SetData to a derived array class + // This constructor defers Init() to a derived array class LargeListViewArray() = default; - void SetData(const std::shared_ptr& data); + void SetData(const std::shared_ptr& data) override; }; // ---------------------------------------------------------------------- @@ -502,7 +514,10 @@ class ARROW_EXPORT MapArray : public ListArray { public: using TypeClass = MapType; - explicit MapArray(const std::shared_ptr& data); + explicit MapArray(const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } MapArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& value_offsets, @@ -558,7 +573,8 @@ class ARROW_EXPORT MapArray : public ListArray { const std::vector>& child_data); protected: - void SetData(const std::shared_ptr& data); + void ValidateData(const std::shared_ptr& data) override; + void SetData(const std::shared_ptr& data) override; static Result> FromArraysInternal( std::shared_ptr type, const std::shared_ptr& offsets, @@ -579,7 +595,11 @@ class ARROW_EXPORT FixedSizeListArray : public Array { using TypeClass = FixedSizeListType; using offset_type = TypeClass::offset_type; - explicit FixedSizeListArray(const std::shared_ptr& data); + explicit FixedSizeListArray( + const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } FixedSizeListArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& values, @@ -655,7 +675,8 @@ class ARROW_EXPORT FixedSizeListArray : public Array { int64_t null_count = kUnknownNullCount); protected: - void SetData(const std::shared_ptr& data); + void ValidateData(const std::shared_ptr& data) override; + void SetData(const std::shared_ptr& data) override; int32_t list_size_; private: @@ -670,7 +691,10 @@ class ARROW_EXPORT StructArray : public Array { public: using TypeClass = StructType; - explicit StructArray(const std::shared_ptr& data); + explicit StructArray(const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } StructArray(const std::shared_ptr& type, int64_t length, const std::vector>& children, @@ -731,6 +755,9 @@ class ARROW_EXPORT StructArray : public Array { // For caching boxed child data // XXX This is not handled in a thread-safe manner. mutable ArrayVector boxed_fields_; + + void ValidateData(const std::shared_ptr& data) override; + void SetData(const std::shared_ptr& data) override; }; // ---------------------------------------------------------------------- @@ -765,7 +792,7 @@ class ARROW_EXPORT UnionArray : public Array { std::shared_ptr field(int pos) const; protected: - void SetData(std::shared_ptr data); + void SetData(const std::shared_ptr& data) override; const type_code_t* raw_type_codes_; const UnionType* union_type_; @@ -779,7 +806,11 @@ class ARROW_EXPORT SparseUnionArray : public UnionArray { public: using TypeClass = SparseUnionType; - explicit SparseUnionArray(std::shared_ptr data); + explicit SparseUnionArray( + const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } SparseUnionArray(std::shared_ptr type, int64_t length, ArrayVector children, std::shared_ptr type_ids, int64_t offset = 0); @@ -822,7 +853,7 @@ class ARROW_EXPORT SparseUnionArray : public UnionArray { int index, MemoryPool* pool = default_memory_pool()) const; protected: - void SetData(std::shared_ptr data); + void SetData(const std::shared_ptr& data) override; }; /// \brief Concrete Array class for dense union data @@ -832,7 +863,10 @@ class ARROW_EXPORT DenseUnionArray : public UnionArray { public: using TypeClass = DenseUnionType; - explicit DenseUnionArray(const std::shared_ptr& data); + explicit DenseUnionArray(const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } DenseUnionArray(std::shared_ptr type, int64_t length, ArrayVector children, std::shared_ptr type_ids, @@ -890,7 +924,7 @@ class ARROW_EXPORT DenseUnionArray : public UnionArray { protected: const int32_t* raw_value_offsets_; - void SetData(const std::shared_ptr& data); + void SetData(const std::shared_ptr& data) override; }; /// @} diff --git a/cpp/src/arrow/array/array_primitive.cc b/cpp/src/arrow/array/array_primitive.cc index da3810aa392..7d844d14f20 100644 --- a/cpp/src/arrow/array/array_primitive.cc +++ b/cpp/src/arrow/array/array_primitive.cc @@ -41,8 +41,7 @@ PrimitiveArray::PrimitiveArray(const std::shared_ptr& type, int64_t le // ---------------------------------------------------------------------- // BooleanArray -BooleanArray::BooleanArray(const std::shared_ptr& data) - : PrimitiveArray(data) { +void BooleanArray::ValidateData(const std::shared_ptr& data) { ARROW_CHECK_EQ(data->type->id(), Type::BOOL); } @@ -70,10 +69,6 @@ int64_t BooleanArray::true_count() const { // ---------------------------------------------------------------------- // Day time interval -DayTimeIntervalArray::DayTimeIntervalArray(const std::shared_ptr& data) { - SetData(data); -} - DayTimeIntervalArray::DayTimeIntervalArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& data, @@ -97,11 +92,6 @@ DayTimeIntervalType::DayMilliseconds DayTimeIntervalArray::GetValue(int64_t i) c // ---------------------------------------------------------------------- // Month, day and Nanos interval -MonthDayNanoIntervalArray::MonthDayNanoIntervalArray( - const std::shared_ptr& data) { - SetData(data); -} - MonthDayNanoIntervalArray::MonthDayNanoIntervalArray( const std::shared_ptr& type, int64_t length, const std::shared_ptr& data, const std::shared_ptr& null_bitmap, diff --git a/cpp/src/arrow/array/array_primitive.h b/cpp/src/arrow/array/array_primitive.h index e6df92e3b78..1de5362d098 100644 --- a/cpp/src/arrow/array/array_primitive.h +++ b/cpp/src/arrow/array/array_primitive.h @@ -41,7 +41,10 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray { using TypeClass = BooleanType; using IteratorType = stl::ArrayIterator; - explicit BooleanArray(const std::shared_ptr& data); + explicit BooleanArray(const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } BooleanArray(int64_t length, const std::shared_ptr& data, const std::shared_ptr& null_bitmap = NULLPTR, @@ -69,7 +72,7 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray { IteratorType end() const { return IteratorType(*this, length()); } protected: - using PrimitiveArray::PrimitiveArray; + void ValidateData(const std::shared_ptr& data) override; }; /// \addtogroup numeric-arrays @@ -90,7 +93,10 @@ class NumericArray : public PrimitiveArray { using value_type = typename TypeClass::c_type; using IteratorType = stl::ArrayIterator>; - explicit NumericArray(const std::shared_ptr& data) : PrimitiveArray(data) {} + explicit NumericArray(const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } // Only enable this constructor without a type argument for types without additional // metadata @@ -131,7 +137,11 @@ class ARROW_EXPORT DayTimeIntervalArray : public PrimitiveArray { using TypeClass = DayTimeIntervalType; using IteratorType = stl::ArrayIterator; - explicit DayTimeIntervalArray(const std::shared_ptr& data); + explicit DayTimeIntervalArray( + const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } DayTimeIntervalArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& data, @@ -167,7 +177,11 @@ class ARROW_EXPORT MonthDayNanoIntervalArray : public PrimitiveArray { using TypeClass = MonthDayNanoIntervalType; using IteratorType = stl::ArrayIterator; - explicit MonthDayNanoIntervalArray(const std::shared_ptr& data); + explicit MonthDayNanoIntervalArray( + const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } MonthDayNanoIntervalArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& data, diff --git a/cpp/src/arrow/array/array_run_end.cc b/cpp/src/arrow/array/array_run_end.cc index f423bdb33c5..e9ddb23deec 100644 --- a/cpp/src/arrow/array/array_run_end.cc +++ b/cpp/src/arrow/array/array_run_end.cc @@ -26,20 +26,16 @@ namespace arrow { // ---------------------------------------------------------------------- // RunEndEncodedArray -RunEndEncodedArray::RunEndEncodedArray(const std::shared_ptr& data) { - this->SetData(data); -} - RunEndEncodedArray::RunEndEncodedArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& run_ends, const std::shared_ptr& values, - int64_t offset) { - this->SetData(ArrayData::Make(type, length, - /*buffers=*/{NULLPTR}, - /*child_data=*/{run_ends->data(), values->data()}, - /*null_count=*/0, offset)); -} + int64_t offset) + : RunEndEncodedArray( + ArrayData::Make(type, length, + /*buffers=*/{NULLPTR}, + /*child_data=*/{run_ends->data(), values->data()}, + /*null_count=*/0, offset)) {} Result> RunEndEncodedArray::Make( const std::shared_ptr& type, int64_t logical_length, @@ -67,14 +63,16 @@ Result> RunEndEncodedArray::Make( return Make(ree_type, logical_length, run_ends, values, logical_offset); } -void RunEndEncodedArray::SetData(const std::shared_ptr& data) { +void RunEndEncodedArray::ValidateData(const std::shared_ptr& data) { ARROW_CHECK_EQ(data->type->id(), Type::RUN_END_ENCODED); const auto* ree_type = internal::checked_cast(data->type.get()); ARROW_CHECK_EQ(data->child_data.size(), 2); ARROW_CHECK_EQ(ree_type->run_end_type()->id(), data->child_data[0]->type->id()); ARROW_CHECK_EQ(ree_type->value_type()->id(), data->child_data[1]->type->id()); +} +void RunEndEncodedArray::SetData(const std::shared_ptr& data) { Array::SetData(data); run_ends_array_ = MakeArray(this->data()->child_data[0]); values_array_ = MakeArray(this->data()->child_data[1]); diff --git a/cpp/src/arrow/array/array_run_end.h b/cpp/src/arrow/array/array_run_end.h index b46b0855ab3..66a5d8bdab8 100644 --- a/cpp/src/arrow/array/array_run_end.h +++ b/cpp/src/arrow/array/array_run_end.h @@ -53,7 +53,11 @@ class ARROW_EXPORT RunEndEncodedArray : public Array { public: using TypeClass = RunEndEncodedType; - explicit RunEndEncodedArray(const std::shared_ptr& data); + explicit RunEndEncodedArray( + const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } /// \brief Construct a RunEndEncodedArray from all parameters /// @@ -85,7 +89,8 @@ class ARROW_EXPORT RunEndEncodedArray : public Array { const std::shared_ptr& values, int64_t logical_offset = 0); protected: - void SetData(const std::shared_ptr& data); + void ValidateData(const std::shared_ptr& data) override; + void SetData(const std::shared_ptr& data) override; public: /// \brief Returns an array holding the logical indexes of each run-end diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h new file mode 100644 index 00000000000..80ac0e157e2 --- /dev/null +++ b/cpp/src/arrow/array/statistics.h @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/util/visibility.h" + +namespace arrow { + +/// \brief Statistics for an Array +/// +/// Apache Arrow format doesn't have statistics but data source such +/// as Apache Parquet may have statistics. Statistics associate with +/// data source can be read unified API via this class. +struct ARROW_EXPORT ArrayStatistics { + public: + using ElementBufferType = std::variant; + + ArrayStatistics() = default; + ~ArrayStatistics() = default; + + /// \brief The number of null values, may not be set + std::optional null_count = std::nullopt; + + /// \brief The number of distinct values, may not be set + std::optional distinct_count = std::nullopt; + + /// \brief The minimum value buffer, may not be set + std::optional min_buffer = std::nullopt; + + /// \brief Whether the minimum value is exact or not, may not be set + std::optional is_min_exact = std::nullopt; + + /// \brief The maximum value buffer, may not be set + std::optional max_buffer = std::nullopt; + + /// \brief Whether the maximum value is exact or not, may not be set + std::optional is_max_exact = std::nullopt; + + /// \brief Check two Statistics for equality + bool Equals(const ArrayStatistics& other) const { + return null_count == other.null_count && distinct_count == other.distinct_count && + min_buffer == other.min_buffer && is_min_exact == other.is_min_exact && + max_buffer == other.max_buffer && is_max_exact == other.is_max_exact; + } +}; + +} // namespace arrow diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index b56ea25f9e4..662910f7ea4 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -59,22 +59,25 @@ namespace { class ArrayDataWrapper { public: - ArrayDataWrapper(const std::shared_ptr& data, std::shared_ptr* out) - : data_(data), out_(out) {} + ArrayDataWrapper(const std::shared_ptr& data, + const std::shared_ptr& statistics, + std::shared_ptr* out) + : data_(data), statistics_(statistics), out_(out) {} template Status Visit(const T&) { using ArrayType = typename TypeTraits::ArrayType; - *out_ = std::make_shared(data_); + *out_ = std::make_shared(data_, statistics_); return Status::OK(); } Status Visit(const ExtensionType& type) { - *out_ = type.MakeArray(data_); + *out_ = type.MakeArray(data_, statistics_); return Status::OK(); } const std::shared_ptr& data_; + const std::shared_ptr& statistics_; std::shared_ptr* out_; }; @@ -333,9 +336,10 @@ Result> SwapEndianArrayData( } // namespace internal -std::shared_ptr MakeArray(const std::shared_ptr& data) { +std::shared_ptr MakeArray(const std::shared_ptr& data, + const std::shared_ptr& statistics) { std::shared_ptr out; - ArrayDataWrapper wrapper_visitor(data, &out); + ArrayDataWrapper wrapper_visitor(data, statistics, &out); DCHECK_OK(VisitTypeInline(*data->type, &wrapper_visitor)); DCHECK(out); return out; diff --git a/cpp/src/arrow/array/util.h b/cpp/src/arrow/array/util.h index fd8e75ddb86..cc163ebb1c2 100644 --- a/cpp/src/arrow/array/util.h +++ b/cpp/src/arrow/array/util.h @@ -22,6 +22,7 @@ #include #include "arrow/array/data.h" +#include "arrow/array/statistics.h" #include "arrow/compare.h" #include "arrow/result.h" #include "arrow/status.h" @@ -37,9 +38,12 @@ namespace arrow { /// \brief Create a strongly-typed Array instance from generic ArrayData /// \param[in] data the array contents +/// \param[in] statistics the array statistics /// \return the resulting Array instance ARROW_EXPORT -std::shared_ptr MakeArray(const std::shared_ptr& data); +std::shared_ptr MakeArray( + const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR); /// \brief Create a strongly-typed Array instance with all elements null /// \param[in] type the array type diff --git a/cpp/src/arrow/extension_type.h b/cpp/src/arrow/extension_type.h index b3f085198be..9ac3ea0a1d7 100644 --- a/cpp/src/arrow/extension_type.h +++ b/cpp/src/arrow/extension_type.h @@ -72,6 +72,15 @@ class ARROW_EXPORT ExtensionType : public DataType { /// \param[in] data the physical storage for the extension type virtual std::shared_ptr MakeArray(std::shared_ptr data) const = 0; + /// \brief Wrap built-in Array type in a user-defined ExtensionArray instance + /// \param[in] data the physical storage for the extension type + /// \param[in] statistics the physical statistics for the extension type + virtual std::shared_ptr MakeArray( + std::shared_ptr data, + std::shared_ptr statistics) const { + return MakeArray(std::move(data)); + } + /// \brief Create an instance of the ExtensionType given the actual storage /// type and the serialized representation /// \param[in] storage_type the physical storage type of the extension diff --git a/cpp/src/parquet/arrow/arrow_statistics_test.cc b/cpp/src/parquet/arrow/arrow_statistics_test.cc index ad4496933ef..488fdd9543d 100644 --- a/cpp/src/parquet/arrow/arrow_statistics_test.cc +++ b/cpp/src/parquet/arrow/arrow_statistics_test.cc @@ -17,12 +17,14 @@ #include "gtest/gtest.h" +#include "arrow/array.h" #include "arrow/table.h" #include "arrow/testing/gtest_util.h" #include "parquet/api/reader.h" #include "parquet/api/writer.h" +#include "parquet/arrow/reader.h" #include "parquet/arrow/schema.h" #include "parquet/arrow/writer.h" #include "parquet/file_writer.h" @@ -156,4 +158,68 @@ INSTANTIATE_TEST_SUITE_P( /*expected_min=*/"z", /*expected_max=*/"z"})); +namespace { +::arrow::Result> StatisticsReadArray( + std::shared_ptr<::arrow::DataType> data_type, const std::string& json) { + auto schema = ::arrow::schema({::arrow::field("column", data_type)}); + auto array = ::arrow::ArrayFromJSON(data_type, json); + auto record_batch = ::arrow::RecordBatch::Make(schema, array->length(), {array}); + ARROW_ASSIGN_OR_RAISE(auto sink, ::arrow::io::BufferOutputStream::Create()); + ARROW_ASSIGN_OR_RAISE(auto writer, + FileWriter::Open(*schema, ::arrow::default_memory_pool(), sink)); + ARROW_RETURN_NOT_OK(writer->WriteRecordBatch(*record_batch)); + ARROW_RETURN_NOT_OK(writer->Close()); + ARROW_ASSIGN_OR_RAISE(auto buffer, sink->Finish()); + + auto reader = + ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(buffer)); + std::unique_ptr file_reader; + ARROW_RETURN_NOT_OK( + FileReader::Make(::arrow::default_memory_pool(), std::move(reader), &file_reader)); + std::shared_ptr<::arrow::ChunkedArray> chunked_array; + ARROW_RETURN_NOT_OK(file_reader->ReadColumn(0, &chunked_array)); + return chunked_array->chunk(0); +} +} // namespace + +TEST(TestStatisticsRead, Boolean) { + ASSERT_OK_AND_ASSIGN(auto array, + StatisticsReadArray(::arrow::boolean(), R"([true, null, true])")); + auto typed_array = std::static_pointer_cast<::arrow::BooleanArray>(array); + auto statistics = typed_array->statistics(); + ASSERT_EQ(true, statistics->null_count.has_value()); + ASSERT_EQ(1, statistics->null_count.value()); + ASSERT_EQ(false, statistics->distinct_count.has_value()); + ASSERT_EQ(true, statistics->min_buffer.has_value()); + ASSERT_EQ(true, std::holds_alternative(*statistics->min_buffer)); + ASSERT_EQ(true, std::get(*statistics->min_buffer)); + ASSERT_EQ(true, statistics->is_min_exact.has_value()); + ASSERT_EQ(true, statistics->is_min_exact.value()); + ASSERT_EQ(true, statistics->max_buffer.has_value()); + ASSERT_EQ(true, std::holds_alternative(*statistics->max_buffer)); + ASSERT_EQ(true, std::get(*statistics->max_buffer)); + ASSERT_EQ(true, statistics->is_min_exact.has_value()); + ASSERT_EQ(true, statistics->is_min_exact.value()); +} + +TEST(TestStatisticsRead, Int8) { + ASSERT_OK_AND_ASSIGN(auto array, + StatisticsReadArray(::arrow::int8(), R"([1, null, -1, 1])")); + auto typed_array = std::static_pointer_cast<::arrow::Int8Array>(array); + auto statistics = typed_array->statistics(); + ASSERT_EQ(true, statistics->null_count.has_value()); + ASSERT_EQ(1, statistics->null_count.value()); + ASSERT_EQ(false, statistics->distinct_count.has_value()); + ASSERT_EQ(true, statistics->min_buffer.has_value()); + ASSERT_EQ(true, std::holds_alternative(*statistics->min_buffer)); + ASSERT_EQ(-1, std::get(*statistics->min_buffer)); + ASSERT_EQ(true, statistics->is_min_exact.has_value()); + ASSERT_EQ(true, statistics->is_min_exact.value()); + ASSERT_EQ(true, statistics->max_buffer.has_value()); + ASSERT_EQ(true, std::holds_alternative(*statistics->max_buffer)); + ASSERT_EQ(1, std::get(*statistics->max_buffer)); + ASSERT_EQ(true, statistics->is_min_exact.has_value()); + ASSERT_EQ(true, statistics->is_min_exact.value()); +} + } // namespace parquet::arrow diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 285e2a59738..42ce6d4fc5b 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -485,8 +485,9 @@ class LeafReader : public ColumnReaderImpl { NextRowGroup(); } } - RETURN_NOT_OK( - TransferColumnData(record_reader_.get(), field_, descr_, ctx_->pool, &out_)); + RETURN_NOT_OK(TransferColumnData(record_reader_.get(), + input_->column_chunk_metadata(), field_, descr_, + ctx_->pool, &out_)); return Status::OK(); END_PARQUET_CATCH_EXCEPTIONS } diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index e5aef5a45b5..14cd42e638f 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -319,8 +319,9 @@ void ReconstructChunksWithoutNulls(::arrow::ArrayVector* chunks) { } template -Status TransferInt(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr& field, Datum* out) { +Status TransferInt(RecordReader* reader, + std::unique_ptr<::parquet::ColumnChunkMetaData> metadata, + MemoryPool* pool, const std::shared_ptr& field, Datum* out) { using ArrowCType = typename ArrowType::c_type; using ParquetCType = typename ParquetType::c_type; int64_t length = reader->values_written(); @@ -330,15 +331,33 @@ Status TransferInt(RecordReader* reader, MemoryPool* pool, auto values = reinterpret_cast(reader->values()); auto out_ptr = reinterpret_cast(data->mutable_data()); std::copy(values, values + length, out_ptr); + int64_t null_count = 0; + std::vector> buffers = {nullptr, std::move(data)}; if (field->nullable()) { - *out = std::make_shared>(field->type(), length, std::move(data), - reader->ReleaseIsValid(), - reader->null_count()); - } else { - *out = - std::make_shared>(field->type(), length, std::move(data), - /*null_bitmap=*/nullptr, /*null_count=*/0); + null_count = reader->null_count(); + buffers[0] = reader->ReleaseIsValid(); + } + auto array_data = + ::arrow::ArrayData::Make(field->type(), length, std::move(buffers), null_count); + auto array_statistics = std::make_shared<::arrow::ArrayStatistics>(); + array_statistics->null_count = null_count; + auto statistics = metadata->statistics().get(); + if (statistics) { + if (statistics->HasDistinctCount()) { + array_statistics->distinct_count = statistics->distinct_count(); + } + if (statistics->HasMinMax()) { + auto typed_statistics = + static_cast<::parquet::TypedStatistics*>(statistics); + array_statistics->min_buffer = static_cast(typed_statistics->min()); + array_statistics->is_min_exact = true; + array_statistics->max_buffer = static_cast(typed_statistics->max()); + array_statistics->is_max_exact = true; + } } + auto array = std::make_shared>(std::move(array_data), + std::move(array_statistics)); + *out = std::make_shared(std::move(array)); return Status::OK(); } @@ -358,7 +377,9 @@ std::shared_ptr TransferZeroCopy(RecordReader* reader, return ::arrow::MakeArray(data); } -Status TransferBool(RecordReader* reader, bool nullable, MemoryPool* pool, Datum* out) { +Status TransferBool(RecordReader* reader, + std::unique_ptr<::parquet::ColumnChunkMetaData> metadata, + bool nullable, MemoryPool* pool, Datum* out) { int64_t length = reader->values_written(); const int64_t buffer_size = bit_util::BytesForBits(length); @@ -375,13 +396,32 @@ Status TransferBool(RecordReader* reader, bool nullable, MemoryPool* pool, Datum } } + int64_t null_count = 0; + std::vector> buffers = {nullptr, std::move(data)}; if (nullable) { - *out = std::make_shared(length, std::move(data), - reader->ReleaseIsValid(), reader->null_count()); - } else { - *out = std::make_shared(length, std::move(data), - /*null_bitmap=*/nullptr, /*null_count=*/0); + null_count = reader->null_count(); + buffers[0] = reader->ReleaseIsValid(); + } + auto array_data = ::arrow::ArrayData::Make(::arrow::boolean(), length, + std::move(buffers), null_count); + auto array_statistics = std::make_shared<::arrow::ArrayStatistics>(); + array_statistics->null_count = null_count; + auto statistics = metadata->statistics().get(); + if (statistics) { + if (statistics->HasDistinctCount()) { + array_statistics->distinct_count = statistics->distinct_count(); + } + if (statistics->HasMinMax()) { + auto bool_statistics = static_cast<::parquet::BoolStatistics*>(statistics); + array_statistics->min_buffer = bool_statistics->min(); + array_statistics->is_min_exact = true; + array_statistics->max_buffer = bool_statistics->max(); + array_statistics->is_max_exact = true; + } } + auto array = + std::make_shared(std::move(array_data), std::move(array_statistics)); + *out = std::make_shared(std::move(array)); return Status::OK(); } @@ -728,19 +768,23 @@ Status TransferHalfFloat(RecordReader* reader, MemoryPool* pool, } // namespace -#define TRANSFER_INT32(ENUM, ArrowType) \ - case ::arrow::Type::ENUM: { \ - Status s = TransferInt(reader, pool, value_field, &result); \ - RETURN_NOT_OK(s); \ +#define TRANSFER_INT32(ENUM, ArrowType) \ + case ::arrow::Type::ENUM: { \ + Status s = TransferInt(reader, std::move(metadata), pool, \ + value_field, &result); \ + RETURN_NOT_OK(s); \ } break; -#define TRANSFER_INT64(ENUM, ArrowType) \ - case ::arrow::Type::ENUM: { \ - Status s = TransferInt(reader, pool, value_field, &result); \ - RETURN_NOT_OK(s); \ +#define TRANSFER_INT64(ENUM, ArrowType) \ + case ::arrow::Type::ENUM: { \ + Status s = TransferInt(reader, std::move(metadata), pool, \ + value_field, &result); \ + RETURN_NOT_OK(s); \ } break; -Status TransferColumnData(RecordReader* reader, const std::shared_ptr& value_field, +Status TransferColumnData(RecordReader* reader, + std::unique_ptr<::parquet::ColumnChunkMetaData> metadata, + const std::shared_ptr& value_field, const ColumnDescriptor* descr, MemoryPool* pool, std::shared_ptr* out) { Datum result; @@ -762,7 +806,8 @@ Status TransferColumnData(RecordReader* reader, const std::shared_ptr& va result = TransferZeroCopy(reader, value_field); break; case ::arrow::Type::BOOL: - RETURN_NOT_OK(TransferBool(reader, value_field->nullable(), pool, &result)); + RETURN_NOT_OK(TransferBool(reader, std::move(metadata), value_field->nullable(), + pool, &result)); break; TRANSFER_INT32(UINT8, ::arrow::UInt8Type); TRANSFER_INT32(INT8, ::arrow::Int8Type); diff --git a/cpp/src/parquet/arrow/reader_internal.h b/cpp/src/parquet/arrow/reader_internal.h index cf9dbb86577..3efd4d2cd9e 100644 --- a/cpp/src/parquet/arrow/reader_internal.h +++ b/cpp/src/parquet/arrow/reader_internal.h @@ -66,7 +66,8 @@ class FileColumnIterator { : column_index_(column_index), reader_(reader), schema_(reader->metadata()->schema()), - row_groups_(row_groups.begin(), row_groups.end()) {} + row_groups_(row_groups.begin(), row_groups.end()), + row_group_index_(-1) {} virtual ~FileColumnIterator() {} @@ -75,7 +76,8 @@ class FileColumnIterator { return nullptr; } - auto row_group_reader = reader_->RowGroup(row_groups_.front()); + row_group_index_ = row_groups_.front(); + auto row_group_reader = reader_->RowGroup(row_group_index_); row_groups_.pop_front(); return row_group_reader->GetColumnPageReader(column_index_); } @@ -86,19 +88,31 @@ class FileColumnIterator { std::shared_ptr metadata() const { return reader_->metadata(); } + std::unique_ptr row_group_metadata() const { + return metadata()->RowGroup(row_group_index_); + } + + std::unique_ptr column_chunk_metadata() const { + return row_group_metadata()->ColumnChunk(column_index_); + } + int column_index() const { return column_index_; } + int row_group_index() const { return row_group_index_; } + protected: int column_index_; ParquetFileReader* reader_; const SchemaDescriptor* schema_; std::deque row_groups_; + int row_group_index_; }; using FileColumnIteratorFactory = std::function; Status TransferColumnData(::parquet::internal::RecordReader* reader, + std::unique_ptr<::parquet::ColumnChunkMetaData> metadata, const std::shared_ptr<::arrow::Field>& value_field, const ColumnDescriptor* descr, ::arrow::MemoryPool* pool, std::shared_ptr<::arrow::ChunkedArray>* out);