From d5013c98428ae582116b185f8ad0b4b1f5f908a9 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 13 Jun 2024 16:45:42 +0900 Subject: [PATCH 01/10] GH-41909: PoC: [C++] Add arrow::ArrayStatistics --- cpp/src/arrow/array/array_base.h | 8 ++ cpp/src/arrow/array/array_primitive.h | 10 +++ cpp/src/arrow/array/data.h | 12 ++- cpp/src/arrow/array/statistics.h | 84 ++++++++++++++++++ cpp/src/arrow/type_fwd.h | 15 ++-- .../parquet/arrow/arrow_statistics_test.cc | 54 ++++++++++++ cpp/src/parquet/arrow/reader.cc | 5 +- cpp/src/parquet/arrow/reader_internal.cc | 87 +++++++++++++------ cpp/src/parquet/arrow/reader_internal.h | 18 +++- 9 files changed, 257 insertions(+), 36 deletions(-) create mode 100644 cpp/src/arrow/array/statistics.h diff --git a/cpp/src/arrow/array/array_base.h b/cpp/src/arrow/array/array_base.h index 716ae072206..c36d4518bdb 100644 --- a/cpp/src/arrow/array/array_base.h +++ b/cpp/src/arrow/array/array_base.h @@ -232,6 +232,14 @@ class ARROW_EXPORT Array { /// \return DeviceAllocationType DeviceAllocationType device_type() const { return data_->device_type(); } + /// \brief Return the statistics of this Array + /// + /// This just delegates to calling statistics on the underlying ArrayData + /// object which backs this Array. + /// + /// \return const ArrayStatistics& + const ArrayStatistics& statistics() const { return data_->statistics; } + protected: Array() = default; ARROW_DEFAULT_MOVE_AND_ASSIGN(Array); diff --git a/cpp/src/arrow/array/array_primitive.h b/cpp/src/arrow/array/array_primitive.h index e6df92e3b78..900b4bbdf52 100644 --- a/cpp/src/arrow/array/array_primitive.h +++ b/cpp/src/arrow/array/array_primitive.h @@ -68,6 +68,11 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray { IteratorType end() const { return IteratorType(*this, length()); } + /// \brief Return the statistics for boolean. + const BooleanArrayStatistics& statistics() const { + return static_cast(Array::statistics()); + } + protected: using PrimitiveArray::PrimitiveArray; }; @@ -119,6 +124,11 @@ class NumericArray : public PrimitiveArray { IteratorType end() const { return IteratorType(*this, length()); } + /// \brief Return the typed statistics. + const TypedArrayStatistics& statistics() const { + return static_cast&>(Array::statistics()); + } + protected: using PrimitiveArray::PrimitiveArray; }; diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h index e0508fe6980..14eaed67e71 100644 --- a/cpp/src/arrow/array/data.h +++ b/cpp/src/arrow/array/data.h @@ -24,6 +24,7 @@ #include #include +#include "arrow/array/statistics.h" #include "arrow/buffer.h" #include "arrow/result.h" #include "arrow/type.h" @@ -152,7 +153,8 @@ struct ARROW_EXPORT ArrayData { offset(other.offset), buffers(std::move(other.buffers)), child_data(std::move(other.child_data)), - dictionary(std::move(other.dictionary)) { + dictionary(std::move(other.dictionary)), + statistics(std::move(other.statistics)) { SetNullCount(other.null_count); } @@ -163,7 +165,8 @@ struct ARROW_EXPORT ArrayData { offset(other.offset), buffers(other.buffers), child_data(other.child_data), - dictionary(other.dictionary) { + dictionary(other.dictionary), + statistics(other.statistics) { SetNullCount(other.null_count); } @@ -176,6 +179,7 @@ struct ARROW_EXPORT ArrayData { buffers = std::move(other.buffers); child_data = std::move(other.child_data); dictionary = std::move(other.dictionary); + statistics = std::move(other.statistics); return *this; } @@ -188,6 +192,7 @@ struct ARROW_EXPORT ArrayData { buffers = other.buffers; child_data = other.child_data; dictionary = other.dictionary; + statistics = other.statistics; return *this; } @@ -390,6 +395,9 @@ struct ARROW_EXPORT ArrayData { // The dictionary for this Array, if any. Only used for dictionary type std::shared_ptr dictionary; + + // The statistics for this Array. + ArrayStatistics statistics{}; }; /// \brief A non-owning Buffer reference diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h new file mode 100644 index 00000000000..c080087a00b --- /dev/null +++ b/cpp/src/arrow/array/statistics.h @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/util/visibility.h" + +namespace arrow { + +/// \brief Statistics for an Array +/// +/// Apache Arrow format doesn't have statistics but data source such +/// as Apache Parquet may have statistics. Statistics associate with +/// data source can be read unified API via this class. +struct ARROW_EXPORT ArrayStatistics { + public: + using ElementBufferType = std::variant; + + ArrayStatistics() = default; + ~ArrayStatistics() = default; + + /// \brief The number of null values, may not be set + std::optional null_count = std::nullopt; + + /// \brief The number of distinct values, may not be set + std::optional distinct_count = std::nullopt; + + /// \brief The current minimum value buffer, may not be set + std::optional min_buffer = std::nullopt; + + /// \brief The current maximum value buffer, may not be set + std::optional max_buffer = std::nullopt; + + /// \brief Check two Statistics for equality + bool Equals(const ArrayStatistics& other) const { + return null_count == other.null_count && distinct_count == other.distinct_count && + min_buffer == other.min_buffer && max_buffer == other.max_buffer; + } +}; + +/// \brief A typed implementation of ArrayStatistics +template +class TypedArrayStatistics : public ArrayStatistics { + public: + using ElementType = typename TypeClass::c_type; + + /// \brief The current minimum value, may not be set + std::optional min() const { + if (min_buffer && std::holds_alternative(*min_buffer)) { + return std::get(*min_buffer); + } else { + return std::nullopt; + } + } + + /// \brief The current maximum value, may not be set + std::optional max() const { + if (max_buffer && std::holds_alternative(*max_buffer)) { + return std::get(*max_buffer); + } else { + return std::nullopt; + } + } +}; + +} // namespace arrow diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 08777d247ed..54d3e09f01d 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -89,6 +89,9 @@ using ChunkedArrayVector = std::vector>; using RecordBatchVector = std::vector>; using RecordBatchIterator = Iterator>; +template +class TypedArrayStatistics; + class DictionaryType; class DictionaryArray; struct DictionaryScalar; @@ -102,6 +105,7 @@ class FixedWidthType; class BooleanType; class BooleanArray; +using BooleanArrayStatistics = TypedArrayStatistics; class BooleanBuilder; struct BooleanScalar; @@ -215,11 +219,12 @@ class NumericBuilder; template class NumericTensor; -#define _NUMERIC_TYPE_DECL(KLASS) \ - class KLASS##Type; \ - using KLASS##Array = NumericArray; \ - using KLASS##Builder = NumericBuilder; \ - struct KLASS##Scalar; \ +#define _NUMERIC_TYPE_DECL(KLASS) \ + class KLASS##Type; \ + using KLASS##Array = NumericArray; \ + using KLASS##ArrayStatistics = TypedArrayStatistics; \ + using KLASS##Builder = NumericBuilder; \ + struct KLASS##Scalar; \ using KLASS##Tensor = NumericTensor; _NUMERIC_TYPE_DECL(Int8) diff --git a/cpp/src/parquet/arrow/arrow_statistics_test.cc b/cpp/src/parquet/arrow/arrow_statistics_test.cc index ad4496933ef..4b4bf5e7406 100644 --- a/cpp/src/parquet/arrow/arrow_statistics_test.cc +++ b/cpp/src/parquet/arrow/arrow_statistics_test.cc @@ -17,12 +17,14 @@ #include "gtest/gtest.h" +#include "arrow/array.h" #include "arrow/table.h" #include "arrow/testing/gtest_util.h" #include "parquet/api/reader.h" #include "parquet/api/writer.h" +#include "parquet/arrow/reader.h" #include "parquet/arrow/schema.h" #include "parquet/arrow/writer.h" #include "parquet/file_writer.h" @@ -156,4 +158,56 @@ INSTANTIATE_TEST_SUITE_P( /*expected_min=*/"z", /*expected_max=*/"z"})); +namespace { +::arrow::Result> StatisticsReadArray( + std::shared_ptr<::arrow::DataType> data_type, const std::string& json) { + auto schema = ::arrow::schema({::arrow::field("column", data_type)}); + auto array = ::arrow::ArrayFromJSON(data_type, json); + auto record_batch = ::arrow::RecordBatch::Make(schema, array->length(), {array}); + ARROW_ASSIGN_OR_RAISE(auto sink, ::arrow::io::BufferOutputStream::Create()); + ARROW_ASSIGN_OR_RAISE(auto writer, + FileWriter::Open(*schema, ::arrow::default_memory_pool(), sink)); + ARROW_RETURN_NOT_OK(writer->WriteRecordBatch(*record_batch)); + ARROW_RETURN_NOT_OK(writer->Close()); + ARROW_ASSIGN_OR_RAISE(auto buffer, sink->Finish()); + + auto reader = + ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(buffer)); + std::unique_ptr file_reader; + ARROW_RETURN_NOT_OK( + FileReader::Make(::arrow::default_memory_pool(), std::move(reader), &file_reader)); + std::shared_ptr<::arrow::ChunkedArray> chunked_array; + ARROW_RETURN_NOT_OK(file_reader->ReadColumn(0, &chunked_array)); + return chunked_array->chunk(0); +} +} // namespace + +TEST(TestStatisticsRead, Boolean) { + ASSERT_OK_AND_ASSIGN(auto array, + StatisticsReadArray(::arrow::boolean(), R"([true, null, true])")); + auto typed_array = std::static_pointer_cast<::arrow::BooleanArray>(array); + auto statistics = typed_array->statistics(); + ASSERT_EQ(true, statistics.null_count.has_value()); + ASSERT_EQ(1, statistics.null_count.value()); + ASSERT_EQ(false, statistics.distinct_count.has_value()); + ASSERT_EQ(true, statistics.min().has_value()); + ASSERT_EQ(true, statistics.min().value()); + ASSERT_EQ(true, statistics.max().has_value()); + ASSERT_EQ(true, statistics.max().value()); +} + +TEST(TestStatisticsRead, Int8) { + ASSERT_OK_AND_ASSIGN(auto array, + StatisticsReadArray(::arrow::int8(), R"([1, null, -1, 1])")); + auto typed_array = std::static_pointer_cast<::arrow::Int8Array>(array); + auto statistics = typed_array->statistics(); + ASSERT_EQ(true, statistics.null_count.has_value()); + ASSERT_EQ(1, statistics.null_count.value()); + ASSERT_EQ(false, statistics.distinct_count.has_value()); + ASSERT_EQ(true, statistics.min().has_value()); + ASSERT_EQ(-1, statistics.min().value()); + ASSERT_EQ(true, statistics.max().has_value()); + ASSERT_EQ(1, statistics.max().value()); +} + } // namespace parquet::arrow diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 285e2a59738..42ce6d4fc5b 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -485,8 +485,9 @@ class LeafReader : public ColumnReaderImpl { NextRowGroup(); } } - RETURN_NOT_OK( - TransferColumnData(record_reader_.get(), field_, descr_, ctx_->pool, &out_)); + RETURN_NOT_OK(TransferColumnData(record_reader_.get(), + input_->column_chunk_metadata(), field_, descr_, + ctx_->pool, &out_)); return Status::OK(); END_PARQUET_CATCH_EXCEPTIONS } diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index e5aef5a45b5..0bb6b6104b2 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -319,8 +319,9 @@ void ReconstructChunksWithoutNulls(::arrow::ArrayVector* chunks) { } template -Status TransferInt(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr& field, Datum* out) { +Status TransferInt(RecordReader* reader, + std::unique_ptr<::parquet::ColumnChunkMetaData> metadata, + MemoryPool* pool, const std::shared_ptr& field, Datum* out) { using ArrowCType = typename ArrowType::c_type; using ParquetCType = typename ParquetType::c_type; int64_t length = reader->values_written(); @@ -330,15 +331,30 @@ Status TransferInt(RecordReader* reader, MemoryPool* pool, auto values = reinterpret_cast(reader->values()); auto out_ptr = reinterpret_cast(data->mutable_data()); std::copy(values, values + length, out_ptr); + int64_t null_count = 0; + std::vector> buffers = {nullptr, std::move(data)}; if (field->nullable()) { - *out = std::make_shared>(field->type(), length, std::move(data), - reader->ReleaseIsValid(), - reader->null_count()); - } else { - *out = - std::make_shared>(field->type(), length, std::move(data), - /*null_bitmap=*/nullptr, /*null_count=*/0); + null_count = reader->null_count(); + buffers[0] = reader->ReleaseIsValid(); + } + auto array_data = + ::arrow::ArrayData::Make(field->type(), length, std::move(buffers), null_count); + array_data->statistics.null_count = null_count; + auto statistics = metadata->statistics().get(); + if (statistics) { + if (statistics->HasDistinctCount()) { + array_data->statistics.distinct_count = statistics->distinct_count(); + } + if (statistics->HasMinMax()) { + auto typed_statistics = + static_cast<::parquet::TypedStatistics*>(statistics); + array_data->statistics.min_buffer = + static_cast(typed_statistics->min()); + array_data->statistics.max_buffer = + static_cast(typed_statistics->max()); + } } + *out = std::make_shared>(std::move(array_data)); return Status::OK(); } @@ -358,7 +374,9 @@ std::shared_ptr TransferZeroCopy(RecordReader* reader, return ::arrow::MakeArray(data); } -Status TransferBool(RecordReader* reader, bool nullable, MemoryPool* pool, Datum* out) { +Status TransferBool(RecordReader* reader, + std::unique_ptr<::parquet::ColumnChunkMetaData> metadata, + bool nullable, MemoryPool* pool, Datum* out) { int64_t length = reader->values_written(); const int64_t buffer_size = bit_util::BytesForBits(length); @@ -375,13 +393,27 @@ Status TransferBool(RecordReader* reader, bool nullable, MemoryPool* pool, Datum } } + int64_t null_count = 0; + std::vector> buffers = {nullptr, std::move(data)}; if (nullable) { - *out = std::make_shared(length, std::move(data), - reader->ReleaseIsValid(), reader->null_count()); - } else { - *out = std::make_shared(length, std::move(data), - /*null_bitmap=*/nullptr, /*null_count=*/0); + null_count = reader->null_count(); + buffers[0] = reader->ReleaseIsValid(); + } + auto array_data = ::arrow::ArrayData::Make(::arrow::boolean(), length, + std::move(buffers), null_count); + array_data->statistics.null_count = null_count; + auto statistics = metadata->statistics().get(); + if (statistics) { + if (statistics->HasDistinctCount()) { + array_data->statistics.distinct_count = statistics->distinct_count(); + } + if (statistics->HasMinMax()) { + auto bool_statistics = static_cast<::parquet::BoolStatistics*>(statistics); + array_data->statistics.min_buffer = bool_statistics->min(); + array_data->statistics.max_buffer = bool_statistics->max(); + } } + *out = std::make_shared(std::move(array_data)); return Status::OK(); } @@ -728,19 +760,23 @@ Status TransferHalfFloat(RecordReader* reader, MemoryPool* pool, } // namespace -#define TRANSFER_INT32(ENUM, ArrowType) \ - case ::arrow::Type::ENUM: { \ - Status s = TransferInt(reader, pool, value_field, &result); \ - RETURN_NOT_OK(s); \ +#define TRANSFER_INT32(ENUM, ArrowType) \ + case ::arrow::Type::ENUM: { \ + Status s = TransferInt(reader, std::move(metadata), pool, \ + value_field, &result); \ + RETURN_NOT_OK(s); \ } break; -#define TRANSFER_INT64(ENUM, ArrowType) \ - case ::arrow::Type::ENUM: { \ - Status s = TransferInt(reader, pool, value_field, &result); \ - RETURN_NOT_OK(s); \ +#define TRANSFER_INT64(ENUM, ArrowType) \ + case ::arrow::Type::ENUM: { \ + Status s = TransferInt(reader, std::move(metadata), pool, \ + value_field, &result); \ + RETURN_NOT_OK(s); \ } break; -Status TransferColumnData(RecordReader* reader, const std::shared_ptr& value_field, +Status TransferColumnData(RecordReader* reader, + std::unique_ptr<::parquet::ColumnChunkMetaData> metadata, + const std::shared_ptr& value_field, const ColumnDescriptor* descr, MemoryPool* pool, std::shared_ptr* out) { Datum result; @@ -762,7 +798,8 @@ Status TransferColumnData(RecordReader* reader, const std::shared_ptr& va result = TransferZeroCopy(reader, value_field); break; case ::arrow::Type::BOOL: - RETURN_NOT_OK(TransferBool(reader, value_field->nullable(), pool, &result)); + RETURN_NOT_OK(TransferBool(reader, std::move(metadata), value_field->nullable(), + pool, &result)); break; TRANSFER_INT32(UINT8, ::arrow::UInt8Type); TRANSFER_INT32(INT8, ::arrow::Int8Type); diff --git a/cpp/src/parquet/arrow/reader_internal.h b/cpp/src/parquet/arrow/reader_internal.h index cf9dbb86577..3efd4d2cd9e 100644 --- a/cpp/src/parquet/arrow/reader_internal.h +++ b/cpp/src/parquet/arrow/reader_internal.h @@ -66,7 +66,8 @@ class FileColumnIterator { : column_index_(column_index), reader_(reader), schema_(reader->metadata()->schema()), - row_groups_(row_groups.begin(), row_groups.end()) {} + row_groups_(row_groups.begin(), row_groups.end()), + row_group_index_(-1) {} virtual ~FileColumnIterator() {} @@ -75,7 +76,8 @@ class FileColumnIterator { return nullptr; } - auto row_group_reader = reader_->RowGroup(row_groups_.front()); + row_group_index_ = row_groups_.front(); + auto row_group_reader = reader_->RowGroup(row_group_index_); row_groups_.pop_front(); return row_group_reader->GetColumnPageReader(column_index_); } @@ -86,19 +88,31 @@ class FileColumnIterator { std::shared_ptr metadata() const { return reader_->metadata(); } + std::unique_ptr row_group_metadata() const { + return metadata()->RowGroup(row_group_index_); + } + + std::unique_ptr column_chunk_metadata() const { + return row_group_metadata()->ColumnChunk(column_index_); + } + int column_index() const { return column_index_; } + int row_group_index() const { return row_group_index_; } + protected: int column_index_; ParquetFileReader* reader_; const SchemaDescriptor* schema_; std::deque row_groups_; + int row_group_index_; }; using FileColumnIteratorFactory = std::function; Status TransferColumnData(::parquet::internal::RecordReader* reader, + std::unique_ptr<::parquet::ColumnChunkMetaData> metadata, const std::shared_ptr<::arrow::Field>& value_field, const ColumnDescriptor* descr, ::arrow::MemoryPool* pool, std::shared_ptr<::arrow::ChunkedArray>* out); From 355172f9ace66c03017261b50936cd8d2ab36231 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 11 Jul 2024 15:32:25 +0900 Subject: [PATCH 02/10] Move statistics to Array from ArrayData --- cpp/src/arrow/array/array_base.h | 18 ++++++++--- cpp/src/arrow/array/array_primitive.h | 8 ++--- cpp/src/arrow/array/data.h | 12 ++----- .../parquet/arrow/arrow_statistics_test.cc | 32 +++++++++---------- cpp/src/parquet/arrow/reader_internal.cc | 28 +++++++++------- 5 files changed, 51 insertions(+), 47 deletions(-) diff --git a/cpp/src/arrow/array/array_base.h b/cpp/src/arrow/array/array_base.h index c36d4518bdb..a0d0f2fbc61 100644 --- a/cpp/src/arrow/array/array_base.h +++ b/cpp/src/arrow/array/array_base.h @@ -24,6 +24,7 @@ #include #include "arrow/array/data.h" +#include "arrow/array/statistics.h" #include "arrow/buffer.h" #include "arrow/compare.h" #include "arrow/result.h" @@ -232,13 +233,17 @@ class ARROW_EXPORT Array { /// \return DeviceAllocationType DeviceAllocationType device_type() const { return data_->device_type(); } - /// \brief Return the statistics of this Array + /// \brief Set the statistics to this Array /// - /// This just delegates to calling statistics on the underlying ArrayData - /// object which backs this Array. + /// \param[in] statistics the statistics of this Array + void SetStatistics(std::shared_ptr statistics) { + statistics_ = std::move(statistics); + } + + /// \brief Return the statistics of this Array /// - /// \return const ArrayStatistics& - const ArrayStatistics& statistics() const { return data_->statistics; } + /// \return std::shared_ptr + std::shared_ptr GetStatistics() const { return statistics_; } protected: Array() = default; @@ -257,6 +262,9 @@ class ARROW_EXPORT Array { data_ = data; } + // The statistics for this Array. + std::shared_ptr statistics_; + private: ARROW_DISALLOW_COPY_AND_ASSIGN(Array); diff --git a/cpp/src/arrow/array/array_primitive.h b/cpp/src/arrow/array/array_primitive.h index 900b4bbdf52..375c884faeb 100644 --- a/cpp/src/arrow/array/array_primitive.h +++ b/cpp/src/arrow/array/array_primitive.h @@ -69,8 +69,8 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray { IteratorType end() const { return IteratorType(*this, length()); } /// \brief Return the statistics for boolean. - const BooleanArrayStatistics& statistics() const { - return static_cast(Array::statistics()); + std::shared_ptr GetStatistics() const { + return std::static_pointer_cast(Array::GetStatistics()); } protected: @@ -125,8 +125,8 @@ class NumericArray : public PrimitiveArray { IteratorType end() const { return IteratorType(*this, length()); } /// \brief Return the typed statistics. - const TypedArrayStatistics& statistics() const { - return static_cast&>(Array::statistics()); + std::shared_ptr> GetStatistics() const { + return std::static_pointer_cast>(Array::GetStatistics()); } protected: diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h index 14eaed67e71..e0508fe6980 100644 --- a/cpp/src/arrow/array/data.h +++ b/cpp/src/arrow/array/data.h @@ -24,7 +24,6 @@ #include #include -#include "arrow/array/statistics.h" #include "arrow/buffer.h" #include "arrow/result.h" #include "arrow/type.h" @@ -153,8 +152,7 @@ struct ARROW_EXPORT ArrayData { offset(other.offset), buffers(std::move(other.buffers)), child_data(std::move(other.child_data)), - dictionary(std::move(other.dictionary)), - statistics(std::move(other.statistics)) { + dictionary(std::move(other.dictionary)) { SetNullCount(other.null_count); } @@ -165,8 +163,7 @@ struct ARROW_EXPORT ArrayData { offset(other.offset), buffers(other.buffers), child_data(other.child_data), - dictionary(other.dictionary), - statistics(other.statistics) { + dictionary(other.dictionary) { SetNullCount(other.null_count); } @@ -179,7 +176,6 @@ struct ARROW_EXPORT ArrayData { buffers = std::move(other.buffers); child_data = std::move(other.child_data); dictionary = std::move(other.dictionary); - statistics = std::move(other.statistics); return *this; } @@ -192,7 +188,6 @@ struct ARROW_EXPORT ArrayData { buffers = other.buffers; child_data = other.child_data; dictionary = other.dictionary; - statistics = other.statistics; return *this; } @@ -395,9 +390,6 @@ struct ARROW_EXPORT ArrayData { // The dictionary for this Array, if any. Only used for dictionary type std::shared_ptr dictionary; - - // The statistics for this Array. - ArrayStatistics statistics{}; }; /// \brief A non-owning Buffer reference diff --git a/cpp/src/parquet/arrow/arrow_statistics_test.cc b/cpp/src/parquet/arrow/arrow_statistics_test.cc index 4b4bf5e7406..00c708b1d21 100644 --- a/cpp/src/parquet/arrow/arrow_statistics_test.cc +++ b/cpp/src/parquet/arrow/arrow_statistics_test.cc @@ -186,28 +186,28 @@ TEST(TestStatisticsRead, Boolean) { ASSERT_OK_AND_ASSIGN(auto array, StatisticsReadArray(::arrow::boolean(), R"([true, null, true])")); auto typed_array = std::static_pointer_cast<::arrow::BooleanArray>(array); - auto statistics = typed_array->statistics(); - ASSERT_EQ(true, statistics.null_count.has_value()); - ASSERT_EQ(1, statistics.null_count.value()); - ASSERT_EQ(false, statistics.distinct_count.has_value()); - ASSERT_EQ(true, statistics.min().has_value()); - ASSERT_EQ(true, statistics.min().value()); - ASSERT_EQ(true, statistics.max().has_value()); - ASSERT_EQ(true, statistics.max().value()); + auto statistics = typed_array->GetStatistics(); + ASSERT_EQ(true, statistics->null_count.has_value()); + ASSERT_EQ(1, statistics->null_count.value()); + ASSERT_EQ(false, statistics->distinct_count.has_value()); + ASSERT_EQ(true, statistics->min().has_value()); + ASSERT_EQ(true, statistics->min().value()); + ASSERT_EQ(true, statistics->max().has_value()); + ASSERT_EQ(true, statistics->max().value()); } TEST(TestStatisticsRead, Int8) { ASSERT_OK_AND_ASSIGN(auto array, StatisticsReadArray(::arrow::int8(), R"([1, null, -1, 1])")); auto typed_array = std::static_pointer_cast<::arrow::Int8Array>(array); - auto statistics = typed_array->statistics(); - ASSERT_EQ(true, statistics.null_count.has_value()); - ASSERT_EQ(1, statistics.null_count.value()); - ASSERT_EQ(false, statistics.distinct_count.has_value()); - ASSERT_EQ(true, statistics.min().has_value()); - ASSERT_EQ(-1, statistics.min().value()); - ASSERT_EQ(true, statistics.max().has_value()); - ASSERT_EQ(1, statistics.max().value()); + auto statistics = typed_array->GetStatistics(); + ASSERT_EQ(true, statistics->null_count.has_value()); + ASSERT_EQ(1, statistics->null_count.value()); + ASSERT_EQ(false, statistics->distinct_count.has_value()); + ASSERT_EQ(true, statistics->min().has_value()); + ASSERT_EQ(-1, statistics->min().value()); + ASSERT_EQ(true, statistics->max().has_value()); + ASSERT_EQ(1, statistics->max().value()); } } // namespace parquet::arrow diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index 0bb6b6104b2..35ac540dfb9 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -339,22 +339,23 @@ Status TransferInt(RecordReader* reader, } auto array_data = ::arrow::ArrayData::Make(field->type(), length, std::move(buffers), null_count); - array_data->statistics.null_count = null_count; + auto array_statistics = std::make_shared<::arrow::ArrayStatistics>(); + array_statistics->null_count = null_count; auto statistics = metadata->statistics().get(); if (statistics) { if (statistics->HasDistinctCount()) { - array_data->statistics.distinct_count = statistics->distinct_count(); + array_statistics->distinct_count = statistics->distinct_count(); } if (statistics->HasMinMax()) { auto typed_statistics = static_cast<::parquet::TypedStatistics*>(statistics); - array_data->statistics.min_buffer = - static_cast(typed_statistics->min()); - array_data->statistics.max_buffer = - static_cast(typed_statistics->max()); + array_statistics->min_buffer = static_cast(typed_statistics->min()); + array_statistics->max_buffer = static_cast(typed_statistics->max()); } } - *out = std::make_shared>(std::move(array_data)); + auto array = std::make_shared>(std::move(array_data)); + array->SetStatistics(std::move(array_statistics)); + *out = std::move(array); return Status::OK(); } @@ -401,19 +402,22 @@ Status TransferBool(RecordReader* reader, } auto array_data = ::arrow::ArrayData::Make(::arrow::boolean(), length, std::move(buffers), null_count); - array_data->statistics.null_count = null_count; + auto array_statistics = std::make_shared<::arrow::ArrayStatistics>(); + array_statistics->null_count = null_count; auto statistics = metadata->statistics().get(); if (statistics) { if (statistics->HasDistinctCount()) { - array_data->statistics.distinct_count = statistics->distinct_count(); + array_statistics->distinct_count = statistics->distinct_count(); } if (statistics->HasMinMax()) { auto bool_statistics = static_cast<::parquet::BoolStatistics*>(statistics); - array_data->statistics.min_buffer = bool_statistics->min(); - array_data->statistics.max_buffer = bool_statistics->max(); + array_statistics->min_buffer = bool_statistics->min(); + array_statistics->max_buffer = bool_statistics->max(); } } - *out = std::make_shared(std::move(array_data)); + auto array = std::make_shared(std::move(array_data)); + array->SetStatistics(std::move(array_statistics)); + *out = std::move(array); return Status::OK(); } From 49e7b0398fa13048a3056ebff84cb7c78cae5757 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 11 Jul 2024 15:51:54 +0900 Subject: [PATCH 03/10] Add is_min_exact/is_max_exact --- cpp/src/arrow/array/statistics.h | 13 ++++++++++--- cpp/src/parquet/arrow/arrow_statistics_test.cc | 8 ++++++++ cpp/src/parquet/arrow/reader_internal.cc | 4 ++++ 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h index c080087a00b..43dfd656b31 100644 --- a/cpp/src/arrow/array/statistics.h +++ b/cpp/src/arrow/array/statistics.h @@ -43,16 +43,23 @@ struct ARROW_EXPORT ArrayStatistics { /// \brief The number of distinct values, may not be set std::optional distinct_count = std::nullopt; - /// \brief The current minimum value buffer, may not be set + /// \brief The minimum value buffer, may not be set std::optional min_buffer = std::nullopt; - /// \brief The current maximum value buffer, may not be set + /// \brief Whether the minimum value is exact or not, may not be set + std::optional is_min_exact = std::nullopt; + + /// \brief The maximum value buffer, may not be set std::optional max_buffer = std::nullopt; + /// \brief Whether the maximum value is exact or not, may not be set + std::optional is_max_exact = std::nullopt; + /// \brief Check two Statistics for equality bool Equals(const ArrayStatistics& other) const { return null_count == other.null_count && distinct_count == other.distinct_count && - min_buffer == other.min_buffer && max_buffer == other.max_buffer; + min_buffer == other.min_buffer && is_min_exact == other.is_min_exact && + max_buffer == other.max_buffer && is_max_exact == other.is_max_exact; } }; diff --git a/cpp/src/parquet/arrow/arrow_statistics_test.cc b/cpp/src/parquet/arrow/arrow_statistics_test.cc index 00c708b1d21..d6b5c34e935 100644 --- a/cpp/src/parquet/arrow/arrow_statistics_test.cc +++ b/cpp/src/parquet/arrow/arrow_statistics_test.cc @@ -192,8 +192,12 @@ TEST(TestStatisticsRead, Boolean) { ASSERT_EQ(false, statistics->distinct_count.has_value()); ASSERT_EQ(true, statistics->min().has_value()); ASSERT_EQ(true, statistics->min().value()); + ASSERT_EQ(true, statistics->is_min_exact.has_value()); + ASSERT_EQ(true, statistics->is_min_exact.value()); ASSERT_EQ(true, statistics->max().has_value()); ASSERT_EQ(true, statistics->max().value()); + ASSERT_EQ(true, statistics->is_min_exact.has_value()); + ASSERT_EQ(true, statistics->is_min_exact.value()); } TEST(TestStatisticsRead, Int8) { @@ -206,8 +210,12 @@ TEST(TestStatisticsRead, Int8) { ASSERT_EQ(false, statistics->distinct_count.has_value()); ASSERT_EQ(true, statistics->min().has_value()); ASSERT_EQ(-1, statistics->min().value()); + ASSERT_EQ(true, statistics->is_min_exact.has_value()); + ASSERT_EQ(true, statistics->is_min_exact.value()); ASSERT_EQ(true, statistics->max().has_value()); ASSERT_EQ(1, statistics->max().value()); + ASSERT_EQ(true, statistics->is_min_exact.has_value()); + ASSERT_EQ(true, statistics->is_min_exact.value()); } } // namespace parquet::arrow diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index 35ac540dfb9..f42f67e1e90 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -350,7 +350,9 @@ Status TransferInt(RecordReader* reader, auto typed_statistics = static_cast<::parquet::TypedStatistics*>(statistics); array_statistics->min_buffer = static_cast(typed_statistics->min()); + array_statistics->is_min_exact = true; array_statistics->max_buffer = static_cast(typed_statistics->max()); + array_statistics->is_max_exact = true; } } auto array = std::make_shared>(std::move(array_data)); @@ -412,7 +414,9 @@ Status TransferBool(RecordReader* reader, if (statistics->HasMinMax()) { auto bool_statistics = static_cast<::parquet::BoolStatistics*>(statistics); array_statistics->min_buffer = bool_statistics->min(); + array_statistics->is_min_exact = true; array_statistics->max_buffer = bool_statistics->max(); + array_statistics->is_max_exact = true; } } auto array = std::make_shared(std::move(array_data)); From 2bed0ae00b0565034282c51f4b8b7ce7880f65a3 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 11 Jul 2024 16:27:24 +0900 Subject: [PATCH 04/10] Use BooleanArrayStatistics --- cpp/src/parquet/arrow/reader_internal.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index f42f67e1e90..c28982030ba 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -404,7 +404,7 @@ Status TransferBool(RecordReader* reader, } auto array_data = ::arrow::ArrayData::Make(::arrow::boolean(), length, std::move(buffers), null_count); - auto array_statistics = std::make_shared<::arrow::ArrayStatistics>(); + auto array_statistics = std::make_shared<::arrow::BooleanArrayStatistics>(); array_statistics->null_count = null_count; auto statistics = metadata->statistics().get(); if (statistics) { From 0ef48e19c9edb389ea0d9f8f92ed7a1b136f4e9f Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 11 Jul 2024 16:27:31 +0900 Subject: [PATCH 05/10] Use ChunkedArray to keep statistics --- cpp/src/parquet/arrow/reader_internal.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index c28982030ba..e7fdde19f5c 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -357,7 +357,7 @@ Status TransferInt(RecordReader* reader, } auto array = std::make_shared>(std::move(array_data)); array->SetStatistics(std::move(array_statistics)); - *out = std::move(array); + *out = std::make_shared(std::move(array)); return Status::OK(); } @@ -421,7 +421,7 @@ Status TransferBool(RecordReader* reader, } auto array = std::make_shared(std::move(array_data)); array->SetStatistics(std::move(array_statistics)); - *out = std::move(array); + *out = std::make_shared(std::move(array)); return Status::OK(); } From fd7af219d207454c3ff6b8d28a0be52f2bcef013 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 12 Jul 2024 13:52:07 +0900 Subject: [PATCH 06/10] Use Array's ArrayData constructor to assign statistics --- cpp/src/arrow/array/array_base.h | 41 ++++++--- cpp/src/arrow/array/array_binary.cc | 22 ++--- cpp/src/arrow/array/array_binary.h | 52 ++++++++--- cpp/src/arrow/array/array_dict.cc | 9 +- cpp/src/arrow/array/array_dict.h | 6 +- cpp/src/arrow/array/array_nested.cc | 90 +++++++------------ cpp/src/arrow/array/array_nested.h | 68 ++++++++++---- cpp/src/arrow/array/array_primitive.cc | 12 +-- cpp/src/arrow/array/array_primitive.h | 28 ++++-- cpp/src/arrow/array/array_run_end.cc | 20 ++--- cpp/src/arrow/array/array_run_end.h | 8 +- cpp/src/arrow/array/util.cc | 16 ++-- cpp/src/arrow/array/util.h | 6 +- cpp/src/arrow/extension_type.h | 9 ++ .../parquet/arrow/arrow_statistics_test.cc | 4 +- cpp/src/parquet/arrow/reader_internal.cc | 8 +- 16 files changed, 232 insertions(+), 167 deletions(-) diff --git a/cpp/src/arrow/array/array_base.h b/cpp/src/arrow/array/array_base.h index a0d0f2fbc61..6e47584b032 100644 --- a/cpp/src/arrow/array/array_base.h +++ b/cpp/src/arrow/array/array_base.h @@ -233,27 +233,31 @@ class ARROW_EXPORT Array { /// \return DeviceAllocationType DeviceAllocationType device_type() const { return data_->device_type(); } - /// \brief Set the statistics to this Array - /// - /// \param[in] statistics the statistics of this Array - void SetStatistics(std::shared_ptr statistics) { - statistics_ = std::move(statistics); - } - /// \brief Return the statistics of this Array /// - /// \return std::shared_ptr - std::shared_ptr GetStatistics() const { return statistics_; } + /// \return const std::shared_ptr& + const std::shared_ptr& statistics() const { return statistics_; } protected: Array() = default; + explicit Array(const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) { + SetData(data); + if (statistics) { + SetStatistics(statistics); + } + } ARROW_DEFAULT_MOVE_AND_ASSIGN(Array); std::shared_ptr data_; const uint8_t* null_bitmap_data_ = NULLPTR; /// Protected method for constructors - void SetData(const std::shared_ptr& data) { + virtual void ValidateData(const std::shared_ptr& data) {} + + /// Protected method for constructors + virtual void SetData(const std::shared_ptr& data) { + ValidateData(data); if (data->buffers.size() > 0) { null_bitmap_data_ = data->GetValuesSafe(0, /*offset=*/0); } else { @@ -265,6 +269,11 @@ class ARROW_EXPORT Array { // The statistics for this Array. std::shared_ptr statistics_; + /// Protected method for constructors + void SetStatistics(const std::shared_ptr& statistics) { + statistics_ = statistics; + } + private: ARROW_DISALLOW_COPY_AND_ASSIGN(Array); @@ -296,12 +305,14 @@ class ARROW_EXPORT PrimitiveArray : public FlatArray { protected: PrimitiveArray() : raw_values_(NULLPTR) {} - void SetData(const std::shared_ptr& data) { + void SetData(const std::shared_ptr& data) override { this->Array::SetData(data); raw_values_ = data->GetValuesSafe(1, /*offset=*/0); } - explicit PrimitiveArray(const std::shared_ptr& data) { SetData(data); } + explicit PrimitiveArray(const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) + : FlatArray(data, statistics) {} const uint8_t* raw_values_; }; @@ -311,11 +322,13 @@ class ARROW_EXPORT NullArray : public FlatArray { public: using TypeClass = NullType; - explicit NullArray(const std::shared_ptr& data) { SetData(data); } + explicit NullArray(const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) + : FlatArray(data, statistics) {} explicit NullArray(int64_t length); private: - void SetData(const std::shared_ptr& data) { + void SetData(const std::shared_ptr& data) override { null_bitmap_data_ = NULLPTR; data->null_count = data->length; data_ = data; diff --git a/cpp/src/arrow/array/array_binary.cc b/cpp/src/arrow/array/array_binary.cc index d83ba0ca893..5ed8283c885 100644 --- a/cpp/src/arrow/array/array_binary.cc +++ b/cpp/src/arrow/array/array_binary.cc @@ -32,9 +32,8 @@ namespace arrow { using internal::checked_cast; -BinaryArray::BinaryArray(const std::shared_ptr& data) { +void BinaryArray::ValidateData(const std::shared_ptr& data) { ARROW_CHECK(is_binary_like(data->type->id())); - SetData(data); } BinaryArray::BinaryArray(int64_t length, const std::shared_ptr& value_offsets, @@ -45,9 +44,8 @@ BinaryArray::BinaryArray(int64_t length, const std::shared_ptr& value_of null_count, offset)); } -LargeBinaryArray::LargeBinaryArray(const std::shared_ptr& data) { +void LargeBinaryArray::ValidateData(const std::shared_ptr& data) { ARROW_CHECK(is_large_binary_like(data->type->id())); - SetData(data); } LargeBinaryArray::LargeBinaryArray(int64_t length, @@ -59,9 +57,8 @@ LargeBinaryArray::LargeBinaryArray(int64_t length, null_count, offset)); } -StringArray::StringArray(const std::shared_ptr& data) { +void StringArray::ValidateData(const std::shared_ptr& data) { ARROW_CHECK_EQ(data->type->id(), Type::STRING); - SetData(data); } StringArray::StringArray(int64_t length, const std::shared_ptr& value_offsets, @@ -74,9 +71,8 @@ StringArray::StringArray(int64_t length, const std::shared_ptr& value_of Status StringArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); } -LargeStringArray::LargeStringArray(const std::shared_ptr& data) { +void LargeStringArray::ValidateData(const std::shared_ptr& data) { ARROW_CHECK_EQ(data->type->id(), Type::LARGE_STRING); - SetData(data); } LargeStringArray::LargeStringArray(int64_t length, @@ -90,9 +86,8 @@ LargeStringArray::LargeStringArray(int64_t length, Status LargeStringArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); } -BinaryViewArray::BinaryViewArray(std::shared_ptr data) { +void BinaryViewArray::ValidateData(const std::shared_ptr& data) { ARROW_CHECK_EQ(data->type->id(), Type::BINARY_VIEW); - SetData(std::move(data)); } BinaryViewArray::BinaryViewArray(std::shared_ptr type, int64_t length, @@ -110,17 +105,12 @@ std::string_view BinaryViewArray::GetView(int64_t i) const { return util::FromBinaryView(raw_values_[i], data_buffers); } -StringViewArray::StringViewArray(std::shared_ptr data) { +void StringViewArray::ValidateData(const std::shared_ptr& data) { ARROW_CHECK_EQ(data->type->id(), Type::STRING_VIEW); - SetData(std::move(data)); } Status StringViewArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); } -FixedSizeBinaryArray::FixedSizeBinaryArray(const std::shared_ptr& data) { - SetData(data); -} - FixedSizeBinaryArray::FixedSizeBinaryArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& data, diff --git a/cpp/src/arrow/array/array_binary.h b/cpp/src/arrow/array/array_binary.h index 19fdee61243..ff16e4b6c57 100644 --- a/cpp/src/arrow/array/array_binary.h +++ b/cpp/src/arrow/array/array_binary.h @@ -140,9 +140,12 @@ class BaseBinaryArray : public FlatArray { protected: // For subclasses BaseBinaryArray() = default; + explicit BaseBinaryArray(const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) + : FlatArray(data, statistics) {} // Protected method for constructors - void SetData(const std::shared_ptr& data) { + void SetData(const std::shared_ptr& data) override { this->Array::SetData(data); raw_value_offsets_ = data->GetValuesSafe(1, /*offset=*/0); raw_data_ = data->GetValuesSafe(2, /*offset=*/0); @@ -155,7 +158,9 @@ class BaseBinaryArray : public FlatArray { /// Concrete Array class for variable-size binary data class ARROW_EXPORT BinaryArray : public BaseBinaryArray { public: - explicit BinaryArray(const std::shared_ptr& data); + explicit BinaryArray(const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) + : BaseBinaryArray(data, statistics) {} BinaryArray(int64_t length, const std::shared_ptr& value_offsets, const std::shared_ptr& data, @@ -165,6 +170,8 @@ class ARROW_EXPORT BinaryArray : public BaseBinaryArray { protected: // For subclasses such as StringArray BinaryArray() : BaseBinaryArray() {} + + void ValidateData(const std::shared_ptr& data) override; }; /// Concrete Array class for variable-size string (utf-8) data @@ -172,7 +179,9 @@ class ARROW_EXPORT StringArray : public BinaryArray { public: using TypeClass = StringType; - explicit StringArray(const std::shared_ptr& data); + explicit StringArray(const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) + : BinaryArray(data, statistics) {} StringArray(int64_t length, const std::shared_ptr& value_offsets, const std::shared_ptr& data, @@ -183,12 +192,17 @@ class ARROW_EXPORT StringArray : public BinaryArray { /// /// This check is also implied by ValidateFull() Status ValidateUTF8() const; + + protected: + void ValidateData(const std::shared_ptr& data) override; }; /// Concrete Array class for large variable-size binary data class ARROW_EXPORT LargeBinaryArray : public BaseBinaryArray { public: - explicit LargeBinaryArray(const std::shared_ptr& data); + explicit LargeBinaryArray(const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) + : BaseBinaryArray(data, statistics) {} LargeBinaryArray(int64_t length, const std::shared_ptr& value_offsets, const std::shared_ptr& data, @@ -198,6 +212,7 @@ class ARROW_EXPORT LargeBinaryArray : public BaseBinaryArray { protected: // For subclasses such as LargeStringArray LargeBinaryArray() : BaseBinaryArray() {} + void ValidateData(const std::shared_ptr& data) override; }; /// Concrete Array class for large variable-size string (utf-8) data @@ -205,7 +220,9 @@ class ARROW_EXPORT LargeStringArray : public LargeBinaryArray { public: using TypeClass = LargeStringType; - explicit LargeStringArray(const std::shared_ptr& data); + explicit LargeStringArray(const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) + : LargeBinaryArray(data, statistics) {} LargeStringArray(int64_t length, const std::shared_ptr& value_offsets, const std::shared_ptr& data, @@ -216,6 +233,9 @@ class ARROW_EXPORT LargeStringArray : public LargeBinaryArray { /// /// This check is also implied by ValidateFull() Status ValidateUTF8() const; + + protected: + void ValidateData(const std::shared_ptr& data) override; }; // ---------------------------------------------------------------------- @@ -229,7 +249,9 @@ class ARROW_EXPORT BinaryViewArray : public FlatArray { using IteratorType = stl::ArrayIterator; using c_type = BinaryViewType::c_type; - explicit BinaryViewArray(std::shared_ptr data); + explicit BinaryViewArray(const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) + : FlatArray(data, statistics) {} BinaryViewArray(std::shared_ptr type, int64_t length, std::shared_ptr views, BufferVector data_buffers, @@ -253,8 +275,10 @@ class ARROW_EXPORT BinaryViewArray : public FlatArray { protected: using FlatArray::FlatArray; - void SetData(std::shared_ptr data) { - FlatArray::SetData(std::move(data)); + void ValidateData(const std::shared_ptr& data) override; + + void SetData(const std::shared_ptr& data) override { + FlatArray::SetData(data); raw_values_ = data_->GetValuesSafe(1); } @@ -267,7 +291,9 @@ class ARROW_EXPORT StringViewArray : public BinaryViewArray { public: using TypeClass = StringViewType; - explicit StringViewArray(std::shared_ptr data); + explicit StringViewArray(const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) + : BinaryViewArray(data, statistics) {} using BinaryViewArray::BinaryViewArray; @@ -275,6 +301,9 @@ class ARROW_EXPORT StringViewArray : public BinaryViewArray { /// /// This check is also implied by ValidateFull() Status ValidateUTF8() const; + + protected: + void ValidateData(const std::shared_ptr& data) override; }; // ---------------------------------------------------------------------- @@ -286,7 +315,10 @@ class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray { using TypeClass = FixedSizeBinaryType; using IteratorType = stl::ArrayIterator; - explicit FixedSizeBinaryArray(const std::shared_ptr& data); + explicit FixedSizeBinaryArray( + const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) + : PrimitiveArray(data, statistics) {} FixedSizeBinaryArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& data, diff --git a/cpp/src/arrow/array/array_dict.cc b/cpp/src/arrow/array/array_dict.cc index 7fd76a1dae8..9e37fb42d99 100644 --- a/cpp/src/arrow/array/array_dict.cc +++ b/cpp/src/arrow/array/array_dict.cc @@ -78,11 +78,14 @@ int64_t DictionaryArray::GetValueIndex(int64_t i) const { } } -DictionaryArray::DictionaryArray(const std::shared_ptr& data) - : dict_type_(checked_cast(data->type.get())) { +DictionaryArray::DictionaryArray(const std::shared_ptr& data, + const std::shared_ptr& statistics) + : Array(data, statistics), + dict_type_(checked_cast(data->type.get())) {} + +void DictionaryArray::ValidateData(const std::shared_ptr& data) { ARROW_CHECK_EQ(data->type->id(), Type::DICTIONARY); ARROW_CHECK_NE(data->dictionary, nullptr); - SetData(data); } void DictionaryArray::SetData(const std::shared_ptr& data) { diff --git a/cpp/src/arrow/array/array_dict.h b/cpp/src/arrow/array/array_dict.h index bf376b51f8c..00f674b841c 100644 --- a/cpp/src/arrow/array/array_dict.h +++ b/cpp/src/arrow/array/array_dict.h @@ -54,7 +54,8 @@ class ARROW_EXPORT DictionaryArray : public Array { public: using TypeClass = DictionaryType; - explicit DictionaryArray(const std::shared_ptr& data); + explicit DictionaryArray(const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR); DictionaryArray(const std::shared_ptr& type, const std::shared_ptr& indices, @@ -114,7 +115,8 @@ class ARROW_EXPORT DictionaryArray : public Array { const DictionaryType* dict_type() const { return dict_type_; } private: - void SetData(const std::shared_ptr& data); + void ValidateData(const std::shared_ptr& data) override; + void SetData(const std::shared_ptr& data) override; const DictionaryType* dict_type_; std::shared_ptr indices_; diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index 47c0fd35829..639454a096c 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -518,10 +518,6 @@ Result> FlattenLogicalListRecursively(const Array& in_arr // ---------------------------------------------------------------------- // ListArray -ListArray::ListArray(std::shared_ptr data) { - ListArray::SetData(std::move(data)); -} - ListArray::ListArray(std::shared_ptr type, int64_t length, std::shared_ptr value_offsets, std::shared_ptr values, std::shared_ptr null_bitmap, int64_t null_count, @@ -575,10 +571,6 @@ std::shared_ptr ListArray::offsets() const { return BoxOffsets(int32(), * // ---------------------------------------------------------------------- // LargeListArray -LargeListArray::LargeListArray(const std::shared_ptr& data) { - LargeListArray::SetData(data); -} - LargeListArray::LargeListArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& value_offsets, const std::shared_ptr& values, @@ -636,21 +628,16 @@ std::shared_ptr LargeListArray::offsets() const { // ---------------------------------------------------------------------- // ListViewArray -ListViewArray::ListViewArray(std::shared_ptr data) { - ListViewArray::SetData(std::move(data)); -} - ListViewArray::ListViewArray(std::shared_ptr type, int64_t length, std::shared_ptr value_offsets, std::shared_ptr value_sizes, std::shared_ptr values, std::shared_ptr null_bitmap, int64_t null_count, - int64_t offset) { - ListViewArray::SetData(ArrayData::Make( - std::move(type), length, - {std::move(null_bitmap), std::move(value_offsets), std::move(value_sizes)}, - /*child_data=*/{values->data()}, null_count, offset)); -} + int64_t offset) + : ListViewArray(ArrayData::Make( + std::move(type), length, + {std::move(null_bitmap), std::move(value_offsets), std::move(value_sizes)}, + /*child_data=*/{values->data()}, null_count, offset)) {} void ListViewArray::SetData(const std::shared_ptr& data) { internal::SetListData(this, data); @@ -711,21 +698,16 @@ std::shared_ptr ListViewArray::sizes() const { return BoxSizes(int32(), * // ---------------------------------------------------------------------- // LargeListViewArray -LargeListViewArray::LargeListViewArray(std::shared_ptr data) { - LargeListViewArray::SetData(std::move(data)); -} - LargeListViewArray::LargeListViewArray(std::shared_ptr type, int64_t length, std::shared_ptr value_offsets, std::shared_ptr value_sizes, std::shared_ptr values, std::shared_ptr null_bitmap, - int64_t null_count, int64_t offset) { - LargeListViewArray::SetData(ArrayData::Make( - type, length, - {std::move(null_bitmap), std::move(value_offsets), std::move(value_sizes)}, - /*child_data=*/{values->data()}, null_count, offset)); -} + int64_t null_count, int64_t offset) + : LargeListViewArray(ArrayData::Make( + type, length, + {std::move(null_bitmap), std::move(value_offsets), std::move(value_sizes)}, + /*child_data=*/{values->data()}, null_count, offset)) {} void LargeListViewArray::SetData(const std::shared_ptr& data) { internal::SetListData(this, data); @@ -774,16 +756,13 @@ std::shared_ptr LargeListViewArray::sizes() const { // ---------------------------------------------------------------------- // MapArray -MapArray::MapArray(const std::shared_ptr& data) { SetData(data); } - MapArray::MapArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& offsets, const std::shared_ptr& values, const std::shared_ptr& null_bitmap, int64_t null_count, - int64_t offset) { - SetData(ArrayData::Make(type, length, {null_bitmap, offsets}, {values->data()}, - null_count, offset)); -} + int64_t offset) + : MapArray(ArrayData::Make(type, length, {null_bitmap, offsets}, {values->data()}, + null_count, offset)) {} MapArray::MapArray(const std::shared_ptr& type, int64_t length, BufferVector buffers, const std::shared_ptr& keys, @@ -908,9 +887,11 @@ Status MapArray::ValidateChildData( return Status::OK(); } -void MapArray::SetData(const std::shared_ptr& data) { +void MapArray::ValidateData(const std::shared_ptr& data) { ARROW_CHECK_OK(ValidateChildData(data->child_data)); +} +void MapArray::SetData(const std::shared_ptr& data) { internal::SetListData(this, data, Type::MAP); map_type_ = checked_cast(data->type.get()); const auto& pair_data = data->child_data[0]; @@ -921,10 +902,6 @@ void MapArray::SetData(const std::shared_ptr& data) { // ---------------------------------------------------------------------- // FixedSizeListArray -FixedSizeListArray::FixedSizeListArray(const std::shared_ptr& data) { - SetData(data); -} - FixedSizeListArray::FixedSizeListArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& values, @@ -935,8 +912,11 @@ FixedSizeListArray::FixedSizeListArray(const std::shared_ptr& type, SetData(internal_data); } -void FixedSizeListArray::SetData(const std::shared_ptr& data) { +void FixedSizeListArray::ValidateData(const std::shared_ptr& data) { ARROW_CHECK_EQ(data->type->id(), Type::FIXED_SIZE_LIST); +} + +void FixedSizeListArray::SetData(const std::shared_ptr& data) { this->Array::SetData(data); ARROW_CHECK_EQ(list_type()->value_type()->id(), data->child_data[0]->type->id()); @@ -1004,9 +984,12 @@ Result> FixedSizeListArray::Flatten( // ---------------------------------------------------------------------- // Struct -StructArray::StructArray(const std::shared_ptr& data) { +void StructArray::ValidateData(const std::shared_ptr& data) { ARROW_CHECK_EQ(data->type->id(), Type::STRUCT); - SetData(data); +} + +void StructArray::SetData(const std::shared_ptr& data) { + Array::SetData(data); boxed_fields_.resize(data->child_data.size()); } @@ -1014,12 +997,11 @@ StructArray::StructArray(const std::shared_ptr& type, int64_t length, const std::vector>& children, std::shared_ptr null_bitmap, int64_t null_count, int64_t offset) { - ARROW_CHECK_EQ(type->id(), Type::STRUCT); - SetData(ArrayData::Make(type, length, {null_bitmap}, null_count, offset)); + auto data = ArrayData::Make(type, length, {null_bitmap}, null_count, offset); for (const auto& child : children) { - data_->child_data.push_back(child->data()); + data->child_data.push_back(child->data()); } - boxed_fields_.resize(children.size()); + SetData(std::move(data)); } Result> StructArray::Make( @@ -1177,8 +1159,8 @@ Result> StructArray::GetFlattenedField(int index, // ---------------------------------------------------------------------- // UnionArray -void UnionArray::SetData(std::shared_ptr data) { - this->Array::SetData(std::move(data)); +void UnionArray::SetData(const std::shared_ptr& data) { + this->Array::SetData(data); union_type_ = checked_cast(data_->type.get()); @@ -1187,8 +1169,8 @@ void UnionArray::SetData(std::shared_ptr data) { boxed_fields_.resize(data_->child_data.size()); } -void SparseUnionArray::SetData(std::shared_ptr data) { - this->UnionArray::SetData(std::move(data)); +void SparseUnionArray::SetData(const std::shared_ptr& data) { + this->UnionArray::SetData(data); ARROW_CHECK_EQ(data_->type->id(), Type::SPARSE_UNION); ARROW_CHECK_EQ(data_->buffers.size(), 2); @@ -1208,10 +1190,6 @@ void DenseUnionArray::SetData(const std::shared_ptr& data) { raw_value_offsets_ = data->GetValuesSafe(2, /*offset=*/0); } -SparseUnionArray::SparseUnionArray(std::shared_ptr data) { - SetData(std::move(data)); -} - SparseUnionArray::SparseUnionArray(std::shared_ptr type, int64_t length, ArrayVector children, std::shared_ptr type_codes, int64_t offset) { @@ -1261,10 +1239,6 @@ Result> SparseUnionArray::GetFlattenedField( return MakeArray(child_data); } -DenseUnionArray::DenseUnionArray(const std::shared_ptr& data) { - SetData(data); -} - DenseUnionArray::DenseUnionArray(std::shared_ptr type, int64_t length, ArrayVector children, std::shared_ptr type_ids, std::shared_ptr value_offsets, int64_t offset) { diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h index a6d4977839e..750fcb6a309 100644 --- a/cpp/src/arrow/array/array_nested.h +++ b/cpp/src/arrow/array/array_nested.h @@ -127,6 +127,7 @@ class VarLengthListLikeArray : public Array { } protected: + using Array::Array; friend void internal::SetListData(VarLengthListLikeArray* self, const std::shared_ptr& data, Type::type expected_type_id); @@ -157,12 +158,17 @@ class BaseListArray : public VarLengthListLikeArray { i += this->data_->offset; return this->raw_value_offsets_[i + 1] - this->raw_value_offsets_[i]; } + + protected: + using VarLengthListLikeArray::VarLengthListLikeArray; }; /// Concrete Array class for list data class ARROW_EXPORT ListArray : public BaseListArray { public: - explicit ListArray(std::shared_ptr data); + explicit ListArray(const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) + : BaseListArray(data, statistics) {} ListArray(std::shared_ptr type, int64_t length, std::shared_ptr value_offsets, std::shared_ptr values, @@ -223,13 +229,15 @@ class ARROW_EXPORT ListArray : public BaseListArray { // This constructor defers SetData to a derived array class ListArray() = default; - void SetData(const std::shared_ptr& data); + void SetData(const std::shared_ptr& data) override; }; /// Concrete Array class for large list data (with 64-bit offsets) class ARROW_EXPORT LargeListArray : public BaseListArray { public: - explicit LargeListArray(const std::shared_ptr& data); + explicit LargeListArray(const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) + : BaseListArray(data, statistics) {} LargeListArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& value_offsets, @@ -284,7 +292,7 @@ class ARROW_EXPORT LargeListArray : public BaseListArray { std::shared_ptr offsets() const; protected: - void SetData(const std::shared_ptr& data); + void SetData(const std::shared_ptr& data) override; }; // ---------------------------------------------------------------------- @@ -318,13 +326,16 @@ class BaseListViewArray : public VarLengthListLikeArray { } protected: + using VarLengthListLikeArray::VarLengthListLikeArray; const offset_type* raw_value_sizes_ = NULLPTR; }; /// \brief Concrete Array class for list-view data class ARROW_EXPORT ListViewArray : public BaseListViewArray { public: - explicit ListViewArray(std::shared_ptr data); + explicit ListViewArray(const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) + : BaseListViewArray(data, statistics) {} ListViewArray(std::shared_ptr type, int64_t length, std::shared_ptr value_offsets, @@ -404,15 +415,17 @@ class ARROW_EXPORT ListViewArray : public BaseListViewArray { protected: // This constructor defers SetData to a derived array class ListViewArray() = default; - - void SetData(const std::shared_ptr& data); + void SetData(const std::shared_ptr& data) override; }; /// \brief Concrete Array class for large list-view data (with 64-bit offsets /// and sizes) class ARROW_EXPORT LargeListViewArray : public BaseListViewArray { public: - explicit LargeListViewArray(std::shared_ptr data); + explicit LargeListViewArray( + const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) + : BaseListViewArray(data, statistics) {} LargeListViewArray(std::shared_ptr type, int64_t length, std::shared_ptr value_offsets, @@ -489,7 +502,7 @@ class ARROW_EXPORT LargeListViewArray : public BaseListViewArray& data); + void SetData(const std::shared_ptr& data) override; }; // ---------------------------------------------------------------------- @@ -502,7 +515,9 @@ class ARROW_EXPORT MapArray : public ListArray { public: using TypeClass = MapType; - explicit MapArray(const std::shared_ptr& data); + explicit MapArray(const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) + : ListArray(data, statistics) {} MapArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& value_offsets, @@ -558,7 +573,8 @@ class ARROW_EXPORT MapArray : public ListArray { const std::vector>& child_data); protected: - void SetData(const std::shared_ptr& data); + void ValidateData(const std::shared_ptr& data) override; + void SetData(const std::shared_ptr& data) override; static Result> FromArraysInternal( std::shared_ptr type, const std::shared_ptr& offsets, @@ -579,7 +595,10 @@ class ARROW_EXPORT FixedSizeListArray : public Array { using TypeClass = FixedSizeListType; using offset_type = TypeClass::offset_type; - explicit FixedSizeListArray(const std::shared_ptr& data); + explicit FixedSizeListArray( + const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) + : Array(data, statistics) {} FixedSizeListArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& values, @@ -655,7 +674,8 @@ class ARROW_EXPORT FixedSizeListArray : public Array { int64_t null_count = kUnknownNullCount); protected: - void SetData(const std::shared_ptr& data); + void ValidateData(const std::shared_ptr& data) override; + void SetData(const std::shared_ptr& data) override; int32_t list_size_; private: @@ -670,7 +690,9 @@ class ARROW_EXPORT StructArray : public Array { public: using TypeClass = StructType; - explicit StructArray(const std::shared_ptr& data); + explicit StructArray(const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) + : Array(data, statistics) {} StructArray(const std::shared_ptr& type, int64_t length, const std::vector>& children, @@ -731,6 +753,9 @@ class ARROW_EXPORT StructArray : public Array { // For caching boxed child data // XXX This is not handled in a thread-safe manner. mutable ArrayVector boxed_fields_; + + void ValidateData(const std::shared_ptr& data) override; + void SetData(const std::shared_ptr& data) override; }; // ---------------------------------------------------------------------- @@ -765,7 +790,8 @@ class ARROW_EXPORT UnionArray : public Array { std::shared_ptr field(int pos) const; protected: - void SetData(std::shared_ptr data); + using Array::Array; + void SetData(const std::shared_ptr& data) override; const type_code_t* raw_type_codes_; const UnionType* union_type_; @@ -779,7 +805,9 @@ class ARROW_EXPORT SparseUnionArray : public UnionArray { public: using TypeClass = SparseUnionType; - explicit SparseUnionArray(std::shared_ptr data); + explicit SparseUnionArray(const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) + : UnionArray(data, statistics) {} SparseUnionArray(std::shared_ptr type, int64_t length, ArrayVector children, std::shared_ptr type_ids, int64_t offset = 0); @@ -822,7 +850,7 @@ class ARROW_EXPORT SparseUnionArray : public UnionArray { int index, MemoryPool* pool = default_memory_pool()) const; protected: - void SetData(std::shared_ptr data); + void SetData(const std::shared_ptr& data) override; }; /// \brief Concrete Array class for dense union data @@ -832,7 +860,9 @@ class ARROW_EXPORT DenseUnionArray : public UnionArray { public: using TypeClass = DenseUnionType; - explicit DenseUnionArray(const std::shared_ptr& data); + explicit DenseUnionArray(const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) + : UnionArray(data, statistics) {} DenseUnionArray(std::shared_ptr type, int64_t length, ArrayVector children, std::shared_ptr type_ids, @@ -890,7 +920,7 @@ class ARROW_EXPORT DenseUnionArray : public UnionArray { protected: const int32_t* raw_value_offsets_; - void SetData(const std::shared_ptr& data); + void SetData(const std::shared_ptr& data) override; }; /// @} diff --git a/cpp/src/arrow/array/array_primitive.cc b/cpp/src/arrow/array/array_primitive.cc index da3810aa392..7d844d14f20 100644 --- a/cpp/src/arrow/array/array_primitive.cc +++ b/cpp/src/arrow/array/array_primitive.cc @@ -41,8 +41,7 @@ PrimitiveArray::PrimitiveArray(const std::shared_ptr& type, int64_t le // ---------------------------------------------------------------------- // BooleanArray -BooleanArray::BooleanArray(const std::shared_ptr& data) - : PrimitiveArray(data) { +void BooleanArray::ValidateData(const std::shared_ptr& data) { ARROW_CHECK_EQ(data->type->id(), Type::BOOL); } @@ -70,10 +69,6 @@ int64_t BooleanArray::true_count() const { // ---------------------------------------------------------------------- // Day time interval -DayTimeIntervalArray::DayTimeIntervalArray(const std::shared_ptr& data) { - SetData(data); -} - DayTimeIntervalArray::DayTimeIntervalArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& data, @@ -97,11 +92,6 @@ DayTimeIntervalType::DayMilliseconds DayTimeIntervalArray::GetValue(int64_t i) c // ---------------------------------------------------------------------- // Month, day and Nanos interval -MonthDayNanoIntervalArray::MonthDayNanoIntervalArray( - const std::shared_ptr& data) { - SetData(data); -} - MonthDayNanoIntervalArray::MonthDayNanoIntervalArray( const std::shared_ptr& type, int64_t length, const std::shared_ptr& data, const std::shared_ptr& null_bitmap, diff --git a/cpp/src/arrow/array/array_primitive.h b/cpp/src/arrow/array/array_primitive.h index 375c884faeb..19251a88742 100644 --- a/cpp/src/arrow/array/array_primitive.h +++ b/cpp/src/arrow/array/array_primitive.h @@ -41,7 +41,9 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray { using TypeClass = BooleanType; using IteratorType = stl::ArrayIterator; - explicit BooleanArray(const std::shared_ptr& data); + explicit BooleanArray(const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) + : PrimitiveArray(data, statistics) {} BooleanArray(int64_t length, const std::shared_ptr& data, const std::shared_ptr& null_bitmap = NULLPTR, @@ -69,12 +71,14 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray { IteratorType end() const { return IteratorType(*this, length()); } /// \brief Return the statistics for boolean. - std::shared_ptr GetStatistics() const { - return std::static_pointer_cast(Array::GetStatistics()); + std::shared_ptr statistics() const { + return std::static_pointer_cast(statistics_); } protected: using PrimitiveArray::PrimitiveArray; + + void ValidateData(const std::shared_ptr& data) override; }; /// \addtogroup numeric-arrays @@ -95,7 +99,9 @@ class NumericArray : public PrimitiveArray { using value_type = typename TypeClass::c_type; using IteratorType = stl::ArrayIterator>; - explicit NumericArray(const std::shared_ptr& data) : PrimitiveArray(data) {} + explicit NumericArray(const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) + : PrimitiveArray(data, statistics) {} // Only enable this constructor without a type argument for types without additional // metadata @@ -125,8 +131,8 @@ class NumericArray : public PrimitiveArray { IteratorType end() const { return IteratorType(*this, length()); } /// \brief Return the typed statistics. - std::shared_ptr> GetStatistics() const { - return std::static_pointer_cast>(Array::GetStatistics()); + std::shared_ptr> statistics() const { + return std::static_pointer_cast>(Array::statistics()); } protected: @@ -141,7 +147,10 @@ class ARROW_EXPORT DayTimeIntervalArray : public PrimitiveArray { using TypeClass = DayTimeIntervalType; using IteratorType = stl::ArrayIterator; - explicit DayTimeIntervalArray(const std::shared_ptr& data); + explicit DayTimeIntervalArray( + const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) + : PrimitiveArray(data, statistics) {} DayTimeIntervalArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& data, @@ -177,7 +186,10 @@ class ARROW_EXPORT MonthDayNanoIntervalArray : public PrimitiveArray { using TypeClass = MonthDayNanoIntervalType; using IteratorType = stl::ArrayIterator; - explicit MonthDayNanoIntervalArray(const std::shared_ptr& data); + explicit MonthDayNanoIntervalArray( + const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) + : PrimitiveArray(data, statistics) {} MonthDayNanoIntervalArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& data, diff --git a/cpp/src/arrow/array/array_run_end.cc b/cpp/src/arrow/array/array_run_end.cc index f423bdb33c5..e9ddb23deec 100644 --- a/cpp/src/arrow/array/array_run_end.cc +++ b/cpp/src/arrow/array/array_run_end.cc @@ -26,20 +26,16 @@ namespace arrow { // ---------------------------------------------------------------------- // RunEndEncodedArray -RunEndEncodedArray::RunEndEncodedArray(const std::shared_ptr& data) { - this->SetData(data); -} - RunEndEncodedArray::RunEndEncodedArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& run_ends, const std::shared_ptr& values, - int64_t offset) { - this->SetData(ArrayData::Make(type, length, - /*buffers=*/{NULLPTR}, - /*child_data=*/{run_ends->data(), values->data()}, - /*null_count=*/0, offset)); -} + int64_t offset) + : RunEndEncodedArray( + ArrayData::Make(type, length, + /*buffers=*/{NULLPTR}, + /*child_data=*/{run_ends->data(), values->data()}, + /*null_count=*/0, offset)) {} Result> RunEndEncodedArray::Make( const std::shared_ptr& type, int64_t logical_length, @@ -67,14 +63,16 @@ Result> RunEndEncodedArray::Make( return Make(ree_type, logical_length, run_ends, values, logical_offset); } -void RunEndEncodedArray::SetData(const std::shared_ptr& data) { +void RunEndEncodedArray::ValidateData(const std::shared_ptr& data) { ARROW_CHECK_EQ(data->type->id(), Type::RUN_END_ENCODED); const auto* ree_type = internal::checked_cast(data->type.get()); ARROW_CHECK_EQ(data->child_data.size(), 2); ARROW_CHECK_EQ(ree_type->run_end_type()->id(), data->child_data[0]->type->id()); ARROW_CHECK_EQ(ree_type->value_type()->id(), data->child_data[1]->type->id()); +} +void RunEndEncodedArray::SetData(const std::shared_ptr& data) { Array::SetData(data); run_ends_array_ = MakeArray(this->data()->child_data[0]); values_array_ = MakeArray(this->data()->child_data[1]); diff --git a/cpp/src/arrow/array/array_run_end.h b/cpp/src/arrow/array/array_run_end.h index b46b0855ab3..2823077b185 100644 --- a/cpp/src/arrow/array/array_run_end.h +++ b/cpp/src/arrow/array/array_run_end.h @@ -53,7 +53,10 @@ class ARROW_EXPORT RunEndEncodedArray : public Array { public: using TypeClass = RunEndEncodedType; - explicit RunEndEncodedArray(const std::shared_ptr& data); + explicit RunEndEncodedArray( + const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) + : Array(data, statistics) {} /// \brief Construct a RunEndEncodedArray from all parameters /// @@ -85,7 +88,8 @@ class ARROW_EXPORT RunEndEncodedArray : public Array { const std::shared_ptr& values, int64_t logical_offset = 0); protected: - void SetData(const std::shared_ptr& data); + void ValidateData(const std::shared_ptr& data) override; + void SetData(const std::shared_ptr& data) override; public: /// \brief Returns an array holding the logical indexes of each run-end diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index b56ea25f9e4..ddcfde76d5d 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -59,22 +59,25 @@ namespace { class ArrayDataWrapper { public: - ArrayDataWrapper(const std::shared_ptr& data, std::shared_ptr* out) - : data_(data), out_(out) {} + ArrayDataWrapper(const std::shared_ptr& data, + const std::shared_ptr& statistics, + std::shared_ptr* out) + : data_(data), statistics_(statistics), out_(out) {} template Status Visit(const T&) { using ArrayType = typename TypeTraits::ArrayType; - *out_ = std::make_shared(data_); + *out_ = std::make_shared(data_, statistics_); return Status::OK(); } Status Visit(const ExtensionType& type) { - *out_ = type.MakeArray(data_); + *out_ = type.MakeArray(data_, statistics_); return Status::OK(); } const std::shared_ptr& data_; + const std::shared_ptr& statistics_; std::shared_ptr* out_; }; @@ -333,9 +336,10 @@ Result> SwapEndianArrayData( } // namespace internal -std::shared_ptr MakeArray(const std::shared_ptr& data) { +std::shared_ptr MakeArray(const std::shared_ptr& data, + const std::shared_ptr statistics) { std::shared_ptr out; - ArrayDataWrapper wrapper_visitor(data, &out); + ArrayDataWrapper wrapper_visitor(data, statistics, &out); DCHECK_OK(VisitTypeInline(*data->type, &wrapper_visitor)); DCHECK(out); return out; diff --git a/cpp/src/arrow/array/util.h b/cpp/src/arrow/array/util.h index fd8e75ddb86..cc163ebb1c2 100644 --- a/cpp/src/arrow/array/util.h +++ b/cpp/src/arrow/array/util.h @@ -22,6 +22,7 @@ #include #include "arrow/array/data.h" +#include "arrow/array/statistics.h" #include "arrow/compare.h" #include "arrow/result.h" #include "arrow/status.h" @@ -37,9 +38,12 @@ namespace arrow { /// \brief Create a strongly-typed Array instance from generic ArrayData /// \param[in] data the array contents +/// \param[in] statistics the array statistics /// \return the resulting Array instance ARROW_EXPORT -std::shared_ptr MakeArray(const std::shared_ptr& data); +std::shared_ptr MakeArray( + const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR); /// \brief Create a strongly-typed Array instance with all elements null /// \param[in] type the array type diff --git a/cpp/src/arrow/extension_type.h b/cpp/src/arrow/extension_type.h index b3f085198be..9ac3ea0a1d7 100644 --- a/cpp/src/arrow/extension_type.h +++ b/cpp/src/arrow/extension_type.h @@ -72,6 +72,15 @@ class ARROW_EXPORT ExtensionType : public DataType { /// \param[in] data the physical storage for the extension type virtual std::shared_ptr MakeArray(std::shared_ptr data) const = 0; + /// \brief Wrap built-in Array type in a user-defined ExtensionArray instance + /// \param[in] data the physical storage for the extension type + /// \param[in] statistics the physical statistics for the extension type + virtual std::shared_ptr MakeArray( + std::shared_ptr data, + std::shared_ptr statistics) const { + return MakeArray(std::move(data)); + } + /// \brief Create an instance of the ExtensionType given the actual storage /// type and the serialized representation /// \param[in] storage_type the physical storage type of the extension diff --git a/cpp/src/parquet/arrow/arrow_statistics_test.cc b/cpp/src/parquet/arrow/arrow_statistics_test.cc index d6b5c34e935..561385c0770 100644 --- a/cpp/src/parquet/arrow/arrow_statistics_test.cc +++ b/cpp/src/parquet/arrow/arrow_statistics_test.cc @@ -186,7 +186,7 @@ TEST(TestStatisticsRead, Boolean) { ASSERT_OK_AND_ASSIGN(auto array, StatisticsReadArray(::arrow::boolean(), R"([true, null, true])")); auto typed_array = std::static_pointer_cast<::arrow::BooleanArray>(array); - auto statistics = typed_array->GetStatistics(); + auto statistics = typed_array->statistics(); ASSERT_EQ(true, statistics->null_count.has_value()); ASSERT_EQ(1, statistics->null_count.value()); ASSERT_EQ(false, statistics->distinct_count.has_value()); @@ -204,7 +204,7 @@ TEST(TestStatisticsRead, Int8) { ASSERT_OK_AND_ASSIGN(auto array, StatisticsReadArray(::arrow::int8(), R"([1, null, -1, 1])")); auto typed_array = std::static_pointer_cast<::arrow::Int8Array>(array); - auto statistics = typed_array->GetStatistics(); + auto statistics = typed_array->statistics(); ASSERT_EQ(true, statistics->null_count.has_value()); ASSERT_EQ(1, statistics->null_count.value()); ASSERT_EQ(false, statistics->distinct_count.has_value()); diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index e7fdde19f5c..75d0557b386 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -355,8 +355,8 @@ Status TransferInt(RecordReader* reader, array_statistics->is_max_exact = true; } } - auto array = std::make_shared>(std::move(array_data)); - array->SetStatistics(std::move(array_statistics)); + auto array = std::make_shared>(std::move(array_data), + std::move(array_statistics)); *out = std::make_shared(std::move(array)); return Status::OK(); } @@ -419,8 +419,8 @@ Status TransferBool(RecordReader* reader, array_statistics->is_max_exact = true; } } - auto array = std::make_shared(std::move(array_data)); - array->SetStatistics(std::move(array_statistics)); + auto array = + std::make_shared(std::move(array_data), std::move(array_statistics)); *out = std::make_shared(std::move(array)); return Status::OK(); } From 4f609bb3afbc0527493bbba429a40a06c943b978 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 12 Jul 2024 14:50:25 +0900 Subject: [PATCH 07/10] Add a missing & --- cpp/src/arrow/array/util.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index ddcfde76d5d..662910f7ea4 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -337,7 +337,7 @@ Result> SwapEndianArrayData( } // namespace internal std::shared_ptr MakeArray(const std::shared_ptr& data, - const std::shared_ptr statistics) { + const std::shared_ptr& statistics) { std::shared_ptr out; ArrayDataWrapper wrapper_visitor(data, statistics, &out); DCHECK_OK(VisitTypeInline(*data->type, &wrapper_visitor)); From 23844db0406ab1ba2c2a5cdc61802c06ae7986bd Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 12 Jul 2024 15:03:29 +0900 Subject: [PATCH 08/10] Remove TypedArrayStatistics --- cpp/src/arrow/array/array_primitive.h | 10 -------- cpp/src/arrow/array/statistics.h | 25 ------------------- cpp/src/arrow/type_fwd.h | 5 ---- .../parquet/arrow/arrow_statistics_test.cc | 20 +++++++++------ cpp/src/parquet/arrow/reader_internal.cc | 2 +- 5 files changed, 13 insertions(+), 49 deletions(-) diff --git a/cpp/src/arrow/array/array_primitive.h b/cpp/src/arrow/array/array_primitive.h index 19251a88742..ab82fde8777 100644 --- a/cpp/src/arrow/array/array_primitive.h +++ b/cpp/src/arrow/array/array_primitive.h @@ -70,11 +70,6 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray { IteratorType end() const { return IteratorType(*this, length()); } - /// \brief Return the statistics for boolean. - std::shared_ptr statistics() const { - return std::static_pointer_cast(statistics_); - } - protected: using PrimitiveArray::PrimitiveArray; @@ -130,11 +125,6 @@ class NumericArray : public PrimitiveArray { IteratorType end() const { return IteratorType(*this, length()); } - /// \brief Return the typed statistics. - std::shared_ptr> statistics() const { - return std::static_pointer_cast>(Array::statistics()); - } - protected: using PrimitiveArray::PrimitiveArray; }; diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h index 43dfd656b31..80ac0e157e2 100644 --- a/cpp/src/arrow/array/statistics.h +++ b/cpp/src/arrow/array/statistics.h @@ -63,29 +63,4 @@ struct ARROW_EXPORT ArrayStatistics { } }; -/// \brief A typed implementation of ArrayStatistics -template -class TypedArrayStatistics : public ArrayStatistics { - public: - using ElementType = typename TypeClass::c_type; - - /// \brief The current minimum value, may not be set - std::optional min() const { - if (min_buffer && std::holds_alternative(*min_buffer)) { - return std::get(*min_buffer); - } else { - return std::nullopt; - } - } - - /// \brief The current maximum value, may not be set - std::optional max() const { - if (max_buffer && std::holds_alternative(*max_buffer)) { - return std::get(*max_buffer); - } else { - return std::nullopt; - } - } -}; - } // namespace arrow diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 54d3e09f01d..ecaceaf4690 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -89,9 +89,6 @@ using ChunkedArrayVector = std::vector>; using RecordBatchVector = std::vector>; using RecordBatchIterator = Iterator>; -template -class TypedArrayStatistics; - class DictionaryType; class DictionaryArray; struct DictionaryScalar; @@ -105,7 +102,6 @@ class FixedWidthType; class BooleanType; class BooleanArray; -using BooleanArrayStatistics = TypedArrayStatistics; class BooleanBuilder; struct BooleanScalar; @@ -222,7 +218,6 @@ class NumericTensor; #define _NUMERIC_TYPE_DECL(KLASS) \ class KLASS##Type; \ using KLASS##Array = NumericArray; \ - using KLASS##ArrayStatistics = TypedArrayStatistics; \ using KLASS##Builder = NumericBuilder; \ struct KLASS##Scalar; \ using KLASS##Tensor = NumericTensor; diff --git a/cpp/src/parquet/arrow/arrow_statistics_test.cc b/cpp/src/parquet/arrow/arrow_statistics_test.cc index 561385c0770..488fdd9543d 100644 --- a/cpp/src/parquet/arrow/arrow_statistics_test.cc +++ b/cpp/src/parquet/arrow/arrow_statistics_test.cc @@ -190,12 +190,14 @@ TEST(TestStatisticsRead, Boolean) { ASSERT_EQ(true, statistics->null_count.has_value()); ASSERT_EQ(1, statistics->null_count.value()); ASSERT_EQ(false, statistics->distinct_count.has_value()); - ASSERT_EQ(true, statistics->min().has_value()); - ASSERT_EQ(true, statistics->min().value()); + ASSERT_EQ(true, statistics->min_buffer.has_value()); + ASSERT_EQ(true, std::holds_alternative(*statistics->min_buffer)); + ASSERT_EQ(true, std::get(*statistics->min_buffer)); ASSERT_EQ(true, statistics->is_min_exact.has_value()); ASSERT_EQ(true, statistics->is_min_exact.value()); - ASSERT_EQ(true, statistics->max().has_value()); - ASSERT_EQ(true, statistics->max().value()); + ASSERT_EQ(true, statistics->max_buffer.has_value()); + ASSERT_EQ(true, std::holds_alternative(*statistics->max_buffer)); + ASSERT_EQ(true, std::get(*statistics->max_buffer)); ASSERT_EQ(true, statistics->is_min_exact.has_value()); ASSERT_EQ(true, statistics->is_min_exact.value()); } @@ -208,12 +210,14 @@ TEST(TestStatisticsRead, Int8) { ASSERT_EQ(true, statistics->null_count.has_value()); ASSERT_EQ(1, statistics->null_count.value()); ASSERT_EQ(false, statistics->distinct_count.has_value()); - ASSERT_EQ(true, statistics->min().has_value()); - ASSERT_EQ(-1, statistics->min().value()); + ASSERT_EQ(true, statistics->min_buffer.has_value()); + ASSERT_EQ(true, std::holds_alternative(*statistics->min_buffer)); + ASSERT_EQ(-1, std::get(*statistics->min_buffer)); ASSERT_EQ(true, statistics->is_min_exact.has_value()); ASSERT_EQ(true, statistics->is_min_exact.value()); - ASSERT_EQ(true, statistics->max().has_value()); - ASSERT_EQ(1, statistics->max().value()); + ASSERT_EQ(true, statistics->max_buffer.has_value()); + ASSERT_EQ(true, std::holds_alternative(*statistics->max_buffer)); + ASSERT_EQ(1, std::get(*statistics->max_buffer)); ASSERT_EQ(true, statistics->is_min_exact.has_value()); ASSERT_EQ(true, statistics->is_min_exact.value()); } diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index 75d0557b386..14cd42e638f 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -404,7 +404,7 @@ Status TransferBool(RecordReader* reader, } auto array_data = ::arrow::ArrayData::Make(::arrow::boolean(), length, std::move(buffers), null_count); - auto array_statistics = std::make_shared<::arrow::BooleanArrayStatistics>(); + auto array_statistics = std::make_shared<::arrow::ArrayStatistics>(); array_statistics->null_count = null_count; auto statistics = metadata->statistics().get(); if (statistics) { From e58ce084bdea1d2c733c679fa781c2a940ce7aae Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 12 Jul 2024 15:09:58 +0900 Subject: [PATCH 09/10] Fix style --- cpp/src/arrow/type_fwd.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index ecaceaf4690..08777d247ed 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -215,11 +215,11 @@ class NumericBuilder; template class NumericTensor; -#define _NUMERIC_TYPE_DECL(KLASS) \ - class KLASS##Type; \ - using KLASS##Array = NumericArray; \ - using KLASS##Builder = NumericBuilder; \ - struct KLASS##Scalar; \ +#define _NUMERIC_TYPE_DECL(KLASS) \ + class KLASS##Type; \ + using KLASS##Array = NumericArray; \ + using KLASS##Builder = NumericBuilder; \ + struct KLASS##Scalar; \ using KLASS##Tensor = NumericTensor; _NUMERIC_TYPE_DECL(Int8) From 5bf9935a7fe097c8cdb850147618ef88080c7b8b Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sat, 13 Jul 2024 10:32:11 +0900 Subject: [PATCH 10/10] Don't use virtual functions in constructor directly --- cpp/src/arrow/array/array_base.h | 28 +++++++------ cpp/src/arrow/array/array_binary.h | 53 +++++++++++++---------- cpp/src/arrow/array/array_dict.cc | 5 ++- cpp/src/arrow/array/array_nested.cc | 8 ++-- cpp/src/arrow/array/array_nested.h | 60 ++++++++++++++------------- cpp/src/arrow/array/array_primitive.h | 22 +++++----- cpp/src/arrow/array/array_run_end.h | 5 ++- 7 files changed, 100 insertions(+), 81 deletions(-) diff --git a/cpp/src/arrow/array/array_base.h b/cpp/src/arrow/array/array_base.h index 6e47584b032..76f29ae2e49 100644 --- a/cpp/src/arrow/array/array_base.h +++ b/cpp/src/arrow/array/array_base.h @@ -240,24 +240,26 @@ class ARROW_EXPORT Array { protected: Array() = default; - explicit Array(const std::shared_ptr& data, - const std::shared_ptr& statistics = NULLPTR) { + ARROW_DEFAULT_MOVE_AND_ASSIGN(Array); + + std::shared_ptr data_; + const uint8_t* null_bitmap_data_ = NULLPTR; + + /// Protected method for constructors + void Init(const std::shared_ptr& data, + const std::shared_ptr& statistics) { + ValidateData(data); SetData(data); if (statistics) { SetStatistics(statistics); } } - ARROW_DEFAULT_MOVE_AND_ASSIGN(Array); - - std::shared_ptr data_; - const uint8_t* null_bitmap_data_ = NULLPTR; /// Protected method for constructors virtual void ValidateData(const std::shared_ptr& data) {} /// Protected method for constructors virtual void SetData(const std::shared_ptr& data) { - ValidateData(data); if (data->buffers.size() > 0) { null_bitmap_data_ = data->GetValuesSafe(0, /*offset=*/0); } else { @@ -306,13 +308,14 @@ class ARROW_EXPORT PrimitiveArray : public FlatArray { PrimitiveArray() : raw_values_(NULLPTR) {} void SetData(const std::shared_ptr& data) override { - this->Array::SetData(data); + Array::SetData(data); raw_values_ = data->GetValuesSafe(1, /*offset=*/0); } explicit PrimitiveArray(const std::shared_ptr& data, - const std::shared_ptr& statistics = NULLPTR) - : FlatArray(data, statistics) {} + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } const uint8_t* raw_values_; }; @@ -323,8 +326,9 @@ class ARROW_EXPORT NullArray : public FlatArray { using TypeClass = NullType; explicit NullArray(const std::shared_ptr& data, - const std::shared_ptr& statistics = NULLPTR) - : FlatArray(data, statistics) {} + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } explicit NullArray(int64_t length); private: diff --git a/cpp/src/arrow/array/array_binary.h b/cpp/src/arrow/array/array_binary.h index ff16e4b6c57..76bf3d6e23c 100644 --- a/cpp/src/arrow/array/array_binary.h +++ b/cpp/src/arrow/array/array_binary.h @@ -140,13 +140,10 @@ class BaseBinaryArray : public FlatArray { protected: // For subclasses BaseBinaryArray() = default; - explicit BaseBinaryArray(const std::shared_ptr& data, - const std::shared_ptr& statistics = NULLPTR) - : FlatArray(data, statistics) {} // Protected method for constructors void SetData(const std::shared_ptr& data) override { - this->Array::SetData(data); + Array::SetData(data); raw_value_offsets_ = data->GetValuesSafe(1, /*offset=*/0); raw_data_ = data->GetValuesSafe(2, /*offset=*/0); } @@ -159,8 +156,9 @@ class BaseBinaryArray : public FlatArray { class ARROW_EXPORT BinaryArray : public BaseBinaryArray { public: explicit BinaryArray(const std::shared_ptr& data, - const std::shared_ptr& statistics = NULLPTR) - : BaseBinaryArray(data, statistics) {} + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } BinaryArray(int64_t length, const std::shared_ptr& value_offsets, const std::shared_ptr& data, @@ -180,8 +178,9 @@ class ARROW_EXPORT StringArray : public BinaryArray { using TypeClass = StringType; explicit StringArray(const std::shared_ptr& data, - const std::shared_ptr& statistics = NULLPTR) - : BinaryArray(data, statistics) {} + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } StringArray(int64_t length, const std::shared_ptr& value_offsets, const std::shared_ptr& data, @@ -200,9 +199,11 @@ class ARROW_EXPORT StringArray : public BinaryArray { /// Concrete Array class for large variable-size binary data class ARROW_EXPORT LargeBinaryArray : public BaseBinaryArray { public: - explicit LargeBinaryArray(const std::shared_ptr& data, - const std::shared_ptr& statistics = NULLPTR) - : BaseBinaryArray(data, statistics) {} + explicit LargeBinaryArray( + const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } LargeBinaryArray(int64_t length, const std::shared_ptr& value_offsets, const std::shared_ptr& data, @@ -220,9 +221,11 @@ class ARROW_EXPORT LargeStringArray : public LargeBinaryArray { public: using TypeClass = LargeStringType; - explicit LargeStringArray(const std::shared_ptr& data, - const std::shared_ptr& statistics = NULLPTR) - : LargeBinaryArray(data, statistics) {} + explicit LargeStringArray( + const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } LargeStringArray(int64_t length, const std::shared_ptr& value_offsets, const std::shared_ptr& data, @@ -250,8 +253,9 @@ class ARROW_EXPORT BinaryViewArray : public FlatArray { using c_type = BinaryViewType::c_type; explicit BinaryViewArray(const std::shared_ptr& data, - const std::shared_ptr& statistics = NULLPTR) - : FlatArray(data, statistics) {} + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } BinaryViewArray(std::shared_ptr type, int64_t length, std::shared_ptr views, BufferVector data_buffers, @@ -273,7 +277,8 @@ class ARROW_EXPORT BinaryViewArray : public FlatArray { IteratorType end() const { return IteratorType(*this, length()); } protected: - using FlatArray::FlatArray; + // This constructor defers Init() to a derived array class + BinaryViewArray() = default; void ValidateData(const std::shared_ptr& data) override; @@ -292,8 +297,9 @@ class ARROW_EXPORT StringViewArray : public BinaryViewArray { using TypeClass = StringViewType; explicit StringViewArray(const std::shared_ptr& data, - const std::shared_ptr& statistics = NULLPTR) - : BinaryViewArray(data, statistics) {} + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } using BinaryViewArray::BinaryViewArray; @@ -317,8 +323,9 @@ class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray { explicit FixedSizeBinaryArray( const std::shared_ptr& data, - const std::shared_ptr& statistics = NULLPTR) - : PrimitiveArray(data, statistics) {} + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } FixedSizeBinaryArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& data, @@ -347,8 +354,8 @@ class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray { IteratorType end() const { return IteratorType(*this, length()); } protected: - void SetData(const std::shared_ptr& data) { - this->PrimitiveArray::SetData(data); + void SetData(const std::shared_ptr& data) override { + PrimitiveArray::SetData(data); byte_width_ = internal::checked_cast(*type()).byte_width(); } diff --git a/cpp/src/arrow/array/array_dict.cc b/cpp/src/arrow/array/array_dict.cc index 9e37fb42d99..6241104ce2a 100644 --- a/cpp/src/arrow/array/array_dict.cc +++ b/cpp/src/arrow/array/array_dict.cc @@ -80,8 +80,9 @@ int64_t DictionaryArray::GetValueIndex(int64_t i) const { DictionaryArray::DictionaryArray(const std::shared_ptr& data, const std::shared_ptr& statistics) - : Array(data, statistics), - dict_type_(checked_cast(data->type.get())) {} + : dict_type_(checked_cast(data->type.get())) { + Init(data, statistics); +} void DictionaryArray::ValidateData(const std::shared_ptr& data) { ARROW_CHECK_EQ(data->type->id(), Type::DICTIONARY); diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index 639454a096c..20faa4e9499 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -917,7 +917,7 @@ void FixedSizeListArray::ValidateData(const std::shared_ptr& data) { } void FixedSizeListArray::SetData(const std::shared_ptr& data) { - this->Array::SetData(data); + Array::SetData(data); ARROW_CHECK_EQ(list_type()->value_type()->id(), data->child_data[0]->type->id()); DCHECK(list_type()->value_type()->Equals(data->child_data[0]->type)); @@ -1160,7 +1160,7 @@ Result> StructArray::GetFlattenedField(int index, // UnionArray void UnionArray::SetData(const std::shared_ptr& data) { - this->Array::SetData(data); + Array::SetData(data); union_type_ = checked_cast(data_->type.get()); @@ -1170,7 +1170,7 @@ void UnionArray::SetData(const std::shared_ptr& data) { } void SparseUnionArray::SetData(const std::shared_ptr& data) { - this->UnionArray::SetData(data); + UnionArray::SetData(data); ARROW_CHECK_EQ(data_->type->id(), Type::SPARSE_UNION); ARROW_CHECK_EQ(data_->buffers.size(), 2); @@ -1179,7 +1179,7 @@ void SparseUnionArray::SetData(const std::shared_ptr& data) { } void DenseUnionArray::SetData(const std::shared_ptr& data) { - this->UnionArray::SetData(data); + UnionArray::SetData(data); ARROW_CHECK_EQ(data_->type->id(), Type::DENSE_UNION); ARROW_CHECK_EQ(data_->buffers.size(), 3); diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h index 750fcb6a309..95b76f61fd5 100644 --- a/cpp/src/arrow/array/array_nested.h +++ b/cpp/src/arrow/array/array_nested.h @@ -127,7 +127,6 @@ class VarLengthListLikeArray : public Array { } protected: - using Array::Array; friend void internal::SetListData(VarLengthListLikeArray* self, const std::shared_ptr& data, Type::type expected_type_id); @@ -158,17 +157,15 @@ class BaseListArray : public VarLengthListLikeArray { i += this->data_->offset; return this->raw_value_offsets_[i + 1] - this->raw_value_offsets_[i]; } - - protected: - using VarLengthListLikeArray::VarLengthListLikeArray; }; /// Concrete Array class for list data class ARROW_EXPORT ListArray : public BaseListArray { public: explicit ListArray(const std::shared_ptr& data, - const std::shared_ptr& statistics = NULLPTR) - : BaseListArray(data, statistics) {} + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } ListArray(std::shared_ptr type, int64_t length, std::shared_ptr value_offsets, std::shared_ptr values, @@ -226,7 +223,7 @@ class ARROW_EXPORT ListArray : public BaseListArray { std::shared_ptr offsets() const; protected: - // This constructor defers SetData to a derived array class + // This constructor defers Init() to a derived array class ListArray() = default; void SetData(const std::shared_ptr& data) override; @@ -236,8 +233,9 @@ class ARROW_EXPORT ListArray : public BaseListArray { class ARROW_EXPORT LargeListArray : public BaseListArray { public: explicit LargeListArray(const std::shared_ptr& data, - const std::shared_ptr& statistics = NULLPTR) - : BaseListArray(data, statistics) {} + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } LargeListArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& value_offsets, @@ -326,7 +324,6 @@ class BaseListViewArray : public VarLengthListLikeArray { } protected: - using VarLengthListLikeArray::VarLengthListLikeArray; const offset_type* raw_value_sizes_ = NULLPTR; }; @@ -334,8 +331,9 @@ class BaseListViewArray : public VarLengthListLikeArray { class ARROW_EXPORT ListViewArray : public BaseListViewArray { public: explicit ListViewArray(const std::shared_ptr& data, - const std::shared_ptr& statistics = NULLPTR) - : BaseListViewArray(data, statistics) {} + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } ListViewArray(std::shared_ptr type, int64_t length, std::shared_ptr value_offsets, @@ -413,7 +411,7 @@ class ARROW_EXPORT ListViewArray : public BaseListViewArray { std::shared_ptr sizes() const; protected: - // This constructor defers SetData to a derived array class + // This constructor defers Init() to a derived array class ListViewArray() = default; void SetData(const std::shared_ptr& data) override; }; @@ -424,8 +422,9 @@ class ARROW_EXPORT LargeListViewArray : public BaseListViewArray& data, - const std::shared_ptr& statistics = NULLPTR) - : BaseListViewArray(data, statistics) {} + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } LargeListViewArray(std::shared_ptr type, int64_t length, std::shared_ptr value_offsets, @@ -499,7 +498,7 @@ class ARROW_EXPORT LargeListViewArray : public BaseListViewArray sizes() const; protected: - // This constructor defers SetData to a derived array class + // This constructor defers Init() to a derived array class LargeListViewArray() = default; void SetData(const std::shared_ptr& data) override; @@ -516,8 +515,9 @@ class ARROW_EXPORT MapArray : public ListArray { using TypeClass = MapType; explicit MapArray(const std::shared_ptr& data, - const std::shared_ptr& statistics = NULLPTR) - : ListArray(data, statistics) {} + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } MapArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& value_offsets, @@ -597,8 +597,9 @@ class ARROW_EXPORT FixedSizeListArray : public Array { explicit FixedSizeListArray( const std::shared_ptr& data, - const std::shared_ptr& statistics = NULLPTR) - : Array(data, statistics) {} + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } FixedSizeListArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& values, @@ -691,8 +692,9 @@ class ARROW_EXPORT StructArray : public Array { using TypeClass = StructType; explicit StructArray(const std::shared_ptr& data, - const std::shared_ptr& statistics = NULLPTR) - : Array(data, statistics) {} + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } StructArray(const std::shared_ptr& type, int64_t length, const std::vector>& children, @@ -790,7 +792,6 @@ class ARROW_EXPORT UnionArray : public Array { std::shared_ptr field(int pos) const; protected: - using Array::Array; void SetData(const std::shared_ptr& data) override; const type_code_t* raw_type_codes_; @@ -805,9 +806,11 @@ class ARROW_EXPORT SparseUnionArray : public UnionArray { public: using TypeClass = SparseUnionType; - explicit SparseUnionArray(const std::shared_ptr& data, - const std::shared_ptr& statistics = NULLPTR) - : UnionArray(data, statistics) {} + explicit SparseUnionArray( + const std::shared_ptr& data, + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } SparseUnionArray(std::shared_ptr type, int64_t length, ArrayVector children, std::shared_ptr type_ids, int64_t offset = 0); @@ -861,8 +864,9 @@ class ARROW_EXPORT DenseUnionArray : public UnionArray { using TypeClass = DenseUnionType; explicit DenseUnionArray(const std::shared_ptr& data, - const std::shared_ptr& statistics = NULLPTR) - : UnionArray(data, statistics) {} + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } DenseUnionArray(std::shared_ptr type, int64_t length, ArrayVector children, std::shared_ptr type_ids, diff --git a/cpp/src/arrow/array/array_primitive.h b/cpp/src/arrow/array/array_primitive.h index ab82fde8777..1de5362d098 100644 --- a/cpp/src/arrow/array/array_primitive.h +++ b/cpp/src/arrow/array/array_primitive.h @@ -42,8 +42,9 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray { using IteratorType = stl::ArrayIterator; explicit BooleanArray(const std::shared_ptr& data, - const std::shared_ptr& statistics = NULLPTR) - : PrimitiveArray(data, statistics) {} + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } BooleanArray(int64_t length, const std::shared_ptr& data, const std::shared_ptr& null_bitmap = NULLPTR, @@ -71,8 +72,6 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray { IteratorType end() const { return IteratorType(*this, length()); } protected: - using PrimitiveArray::PrimitiveArray; - void ValidateData(const std::shared_ptr& data) override; }; @@ -95,8 +94,9 @@ class NumericArray : public PrimitiveArray { using IteratorType = stl::ArrayIterator>; explicit NumericArray(const std::shared_ptr& data, - const std::shared_ptr& statistics = NULLPTR) - : PrimitiveArray(data, statistics) {} + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } // Only enable this constructor without a type argument for types without additional // metadata @@ -139,8 +139,9 @@ class ARROW_EXPORT DayTimeIntervalArray : public PrimitiveArray { explicit DayTimeIntervalArray( const std::shared_ptr& data, - const std::shared_ptr& statistics = NULLPTR) - : PrimitiveArray(data, statistics) {} + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } DayTimeIntervalArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& data, @@ -178,8 +179,9 @@ class ARROW_EXPORT MonthDayNanoIntervalArray : public PrimitiveArray { explicit MonthDayNanoIntervalArray( const std::shared_ptr& data, - const std::shared_ptr& statistics = NULLPTR) - : PrimitiveArray(data, statistics) {} + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } MonthDayNanoIntervalArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& data, diff --git a/cpp/src/arrow/array/array_run_end.h b/cpp/src/arrow/array/array_run_end.h index 2823077b185..66a5d8bdab8 100644 --- a/cpp/src/arrow/array/array_run_end.h +++ b/cpp/src/arrow/array/array_run_end.h @@ -55,8 +55,9 @@ class ARROW_EXPORT RunEndEncodedArray : public Array { explicit RunEndEncodedArray( const std::shared_ptr& data, - const std::shared_ptr& statistics = NULLPTR) - : Array(data, statistics) {} + const std::shared_ptr& statistics = NULLPTR) { + Init(data, statistics); + } /// \brief Construct a RunEndEncodedArray from all parameters ///