From 87f329bccd79ccd1cf46bb65565b9506ac14c088 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Sun, 27 Nov 2022 15:50:39 +0800 Subject: [PATCH 1/6] ARROW-10158: [C++][Parquet] Expose page index info from ColumnChunkMetaData --- cpp/src/parquet/metadata.cc | 36 ++++++++++++++++++++++++++++++++++++ cpp/src/parquet/metadata.h | 7 +++++++ 2 files changed, 43 insertions(+) diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index c97e1949552..e5720a60886 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -312,6 +312,22 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { } } + inline bool has_column_index() const { + return column_->__isset.column_index_offset && column_->__isset.column_index_length; + } + + inline int64_t column_index_offset() const { return column_->column_index_offset; } + + inline int32_t column_index_length() const { return column_->column_index_length; } + + inline bool has_offset_index() const { + return column_->__isset.offset_index_offset && column_->__isset.offset_index_length; + } + + inline int64_t offset_index_offset() const { return column_->offset_index_offset; } + + inline int32_t offset_index_length() const { return column_->offset_index_length; } + private: mutable std::shared_ptr possible_stats_; std::vector encodings_; @@ -420,6 +436,26 @@ std::unique_ptr ColumnChunkMetaData::crypto_metadata() con return impl_->crypto_metadata(); } +bool ColumnChunkMetaData::has_column_index() const { return impl_->has_column_index(); } + +int64_t ColumnChunkMetaData::column_index_offset() const { + return impl_->column_index_offset(); +} + +int32_t ColumnChunkMetaData::column_index_length() const { + return impl_->column_index_length(); +} + +bool ColumnChunkMetaData::has_offset_index() const { return impl_->has_offset_index(); } + +int64_t ColumnChunkMetaData::offset_index_offset() const { + return impl_->offset_index_offset(); +} + +int32_t ColumnChunkMetaData::offset_index_length() const { + return impl_->offset_index_length(); +} + bool ColumnChunkMetaData::Equals(const ColumnChunkMetaData& other) const { return impl_->Equals(*other.impl_); } diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index bd59c628dc8..665956a257c 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -171,6 +171,13 @@ class PARQUET_EXPORT ColumnChunkMetaData { int64_t total_uncompressed_size() const; std::unique_ptr crypto_metadata() const; + bool has_column_index() const; + int64_t column_index_offset() const; + int32_t column_index_length() const; + bool has_offset_index() const; + int64_t offset_index_offset() const; + int32_t offset_index_length() const; + private: explicit ColumnChunkMetaData( const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal, From 9cfc072daccc795b242e5973dc7ac4f4375bbaa5 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Wed, 30 Nov 2022 10:09:51 +0800 Subject: [PATCH 2/6] simplify interface --- cpp/src/parquet/metadata.cc | 44 ++++++++++++++----------------------- cpp/src/parquet/metadata.h | 16 ++++++++------ 2 files changed, 25 insertions(+), 35 deletions(-) diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index e5720a60886..1ff473b1eba 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -312,22 +312,22 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { } } - inline bool has_column_index() const { - return column_->__isset.column_index_offset && column_->__isset.column_index_length; + inline std::optional GetColumIndexLocation() const { + if (column_->__isset.column_index_offset && column_->__isset.column_index_length) { + return IndexLocation{.index_file_offset_bytes = column_->column_index_offset, + .offset_index_length = column_->column_index_length}; + } + return std::nullopt; } - inline int64_t column_index_offset() const { return column_->column_index_offset; } - - inline int32_t column_index_length() const { return column_->column_index_length; } - - inline bool has_offset_index() const { - return column_->__isset.offset_index_offset && column_->__isset.offset_index_length; + inline std::optional GetOffsetIndexLocation() const { + if (column_->__isset.offset_index_offset && column_->__isset.offset_index_length) { + return IndexLocation{.index_file_offset_bytes = column_->offset_index_offset, + .offset_index_length = column_->offset_index_length}; + } + return std::nullopt; } - inline int64_t offset_index_offset() const { return column_->offset_index_offset; } - - inline int32_t offset_index_length() const { return column_->offset_index_length; } - private: mutable std::shared_ptr possible_stats_; std::vector encodings_; @@ -436,24 +436,12 @@ std::unique_ptr ColumnChunkMetaData::crypto_metadata() con return impl_->crypto_metadata(); } -bool ColumnChunkMetaData::has_column_index() const { return impl_->has_column_index(); } - -int64_t ColumnChunkMetaData::column_index_offset() const { - return impl_->column_index_offset(); -} - -int32_t ColumnChunkMetaData::column_index_length() const { - return impl_->column_index_length(); -} - -bool ColumnChunkMetaData::has_offset_index() const { return impl_->has_offset_index(); } - -int64_t ColumnChunkMetaData::offset_index_offset() const { - return impl_->offset_index_offset(); +std::optional ColumnChunkMetaData::GetColumIndexLocation() const { + return impl_->GetColumIndexLocation(); } -int32_t ColumnChunkMetaData::offset_index_length() const { - return impl_->offset_index_length(); +std::optional ColumnChunkMetaData::GetOffsetIndexLocation() const { + return impl_->GetOffsetIndexLocation(); } bool ColumnChunkMetaData::Equals(const ColumnChunkMetaData& other) const { diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 665956a257c..3564ae4447b 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -118,6 +119,12 @@ struct PageEncodingStats { int32_t count; }; +/// \brief Public struct for location to page index in ColumnChunkMetaData. +struct IndexLocation { + int64_t index_file_offset_bytes; + int32_t offset_index_length; +}; + /// \brief ColumnChunkMetaData is a proxy around format::ColumnChunkMetaData. class PARQUET_EXPORT ColumnChunkMetaData { public: @@ -170,13 +177,8 @@ class PARQUET_EXPORT ColumnChunkMetaData { int64_t total_compressed_size() const; int64_t total_uncompressed_size() const; std::unique_ptr crypto_metadata() const; - - bool has_column_index() const; - int64_t column_index_offset() const; - int32_t column_index_length() const; - bool has_offset_index() const; - int64_t offset_index_offset() const; - int32_t offset_index_length() const; + std::optional GetColumIndexLocation() const; + std::optional GetOffsetIndexLocation() const; private: explicit ColumnChunkMetaData( From 089feee2a6ef80ad12f5521b1cd4d2a0c4d752df Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Wed, 30 Nov 2022 10:37:05 +0800 Subject: [PATCH 3/6] fix MSVC error C7555: use of designated initializers requires at least '/std:c++20' --- cpp/src/parquet/metadata.cc | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 1ff473b1eba..f4fdacff304 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -314,16 +314,14 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { inline std::optional GetColumIndexLocation() const { if (column_->__isset.column_index_offset && column_->__isset.column_index_length) { - return IndexLocation{.index_file_offset_bytes = column_->column_index_offset, - .offset_index_length = column_->column_index_length}; + return IndexLocation{column_->column_index_offset, column_->column_index_length}; } return std::nullopt; } inline std::optional GetOffsetIndexLocation() const { if (column_->__isset.offset_index_offset && column_->__isset.offset_index_length) { - return IndexLocation{.index_file_offset_bytes = column_->offset_index_offset, - .offset_index_length = column_->offset_index_length}; + return IndexLocation{column_->offset_index_offset, column_->offset_index_length}; } return std::nullopt; } From a35a148908677c828594de2d71da396aa0fa595b Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Wed, 30 Nov 2022 23:13:15 +0800 Subject: [PATCH 4/6] added test --- cpp/src/parquet/metadata.h | 6 +++-- cpp/src/parquet/metadata_test.cc | 44 ++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 3564ae4447b..8c619c5c63b 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -121,8 +121,10 @@ struct PageEncodingStats { /// \brief Public struct for location to page index in ColumnChunkMetaData. struct IndexLocation { - int64_t index_file_offset_bytes; - int32_t offset_index_length; + /// File offset of the given index, in bytes + int64_t offset; + /// Length of the given index, in bytes + int32_t length; }; /// \brief ColumnChunkMetaData is a proxy around format::ColumnChunkMetaData. diff --git a/cpp/src/parquet/metadata_test.cc b/cpp/src/parquet/metadata_test.cc index a89d3d97fa9..d08ee797a83 100644 --- a/cpp/src/parquet/metadata_test.cc +++ b/cpp/src/parquet/metadata_test.cc @@ -20,11 +20,15 @@ #include #include "arrow/util/key_value_metadata.h" +#include "parquet/file_reader.h" #include "parquet/schema.h" #include "parquet/statistics.h" +#include "parquet/test_util.h" #include "parquet/thrift_internal.h" #include "parquet/types.h" +#include + namespace parquet { namespace metadata { @@ -292,6 +296,46 @@ TEST(Metadata, TestKeyValueMetadata) { EXPECT_TRUE(f_accessor->key_value_metadata()->Equals(*kvmeta)); } +TEST(Metadata, TestReadPageIndex) { + std::string dir_string(parquet::test::get_data_dir()); + std::string path = dir_string + "/alltypes_tiny_pages.parquet"; + auto reader = ParquetFileReader::OpenFile(path, false); + auto file_metadata = reader->metadata(); + ASSERT_EQ(1, file_metadata->num_row_groups()); + auto row_group_metadata = file_metadata->RowGroup(0); + ASSERT_EQ(13, row_group_metadata->num_columns()); + std::array ci_offsets = {323583, 327502, 328009, 331928, 335847, + 339766, 350345, 354264, 364843, 384342, + -1, 386473, 390392}; + std::array ci_lengths = {3919, 507, 3919, 3919, 3919, 10579, 3919, + 10579, 19499, 2131, -1, 3919, 3919}; + std::array oi_offsets = {394311, 397814, 398637, 401888, 405139, + 408390, 413670, 416921, 422201, 431936, + 435457, 446002, 449253}; + std::array oi_lengths = {3503, 823, 3251, 3251, 3251, 5280, 3251, + 5280, 9735, 3521, 10545, 3251, 3251}; + for (int i = 0; i < row_group_metadata->num_columns(); ++i) { + auto col_chunk_metadata = row_group_metadata->ColumnChunk(i); + auto ci_locaiton = col_chunk_metadata->GetColumIndexLocation(); + if (i == 10) { + // column_id 10 does not have column index + ASSERT_FALSE(ci_locaiton.has_value()); + } else { + ASSERT_TRUE(ci_locaiton.has_value()); + } + if (ci_locaiton.has_value()) { + ASSERT_EQ(ci_offsets.at(i), ci_locaiton->offset); + ASSERT_EQ(ci_lengths.at(i), ci_locaiton->length); + } + auto oi_location = col_chunk_metadata->GetOffsetIndexLocation(); + ASSERT_TRUE(oi_location.has_value()); + if (oi_location.has_value()) { + ASSERT_EQ(oi_offsets.at(i), oi_location->offset); + ASSERT_EQ(oi_lengths.at(i), oi_location->length); + } + } +} + TEST(ApplicationVersion, Basics) { ApplicationVersion version("parquet-mr version 1.7.9"); ApplicationVersion version1("parquet-mr version 1.8.0"); From 71b769667bf9f0e160df0658fde1525987f9d4d2 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Wed, 30 Nov 2022 23:14:40 +0800 Subject: [PATCH 5/6] remove inline --- cpp/src/parquet/metadata.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index f4fdacff304..1e1f96d906a 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -312,14 +312,14 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { } } - inline std::optional GetColumIndexLocation() const { + std::optional GetColumIndexLocation() const { if (column_->__isset.column_index_offset && column_->__isset.column_index_length) { return IndexLocation{column_->column_index_offset, column_->column_index_length}; } return std::nullopt; } - inline std::optional GetOffsetIndexLocation() const { + std::optional GetOffsetIndexLocation() const { if (column_->__isset.offset_index_offset && column_->__isset.offset_index_length) { return IndexLocation{column_->offset_index_offset, column_->offset_index_length}; } From c038b0c7f872952c919fed4f6d662d1254a87feb Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 1 Dec 2022 10:32:46 +0100 Subject: [PATCH 6/6] Some nits --- cpp/src/parquet/metadata_test.cc | 40 ++++++++++++++------------------ 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/cpp/src/parquet/metadata_test.cc b/cpp/src/parquet/metadata_test.cc index d08ee797a83..cabfb8078cf 100644 --- a/cpp/src/parquet/metadata_test.cc +++ b/cpp/src/parquet/metadata_test.cc @@ -27,8 +27,6 @@ #include "parquet/thrift_internal.h" #include "parquet/types.h" -#include - namespace parquet { namespace metadata { @@ -304,35 +302,33 @@ TEST(Metadata, TestReadPageIndex) { ASSERT_EQ(1, file_metadata->num_row_groups()); auto row_group_metadata = file_metadata->RowGroup(0); ASSERT_EQ(13, row_group_metadata->num_columns()); - std::array ci_offsets = {323583, 327502, 328009, 331928, 335847, - 339766, 350345, 354264, 364843, 384342, - -1, 386473, 390392}; - std::array ci_lengths = {3919, 507, 3919, 3919, 3919, 10579, 3919, - 10579, 19499, 2131, -1, 3919, 3919}; - std::array oi_offsets = {394311, 397814, 398637, 401888, 405139, - 408390, 413670, 416921, 422201, 431936, - 435457, 446002, 449253}; - std::array oi_lengths = {3503, 823, 3251, 3251, 3251, 5280, 3251, - 5280, 9735, 3521, 10545, 3251, 3251}; + std::vector ci_offsets = {323583, 327502, 328009, 331928, 335847, + 339766, 350345, 354264, 364843, 384342, + -1, 386473, 390392}; + std::vector ci_lengths = {3919, 507, 3919, 3919, 3919, 10579, 3919, + 10579, 19499, 2131, -1, 3919, 3919}; + std::vector oi_offsets = {394311, 397814, 398637, 401888, 405139, + 408390, 413670, 416921, 422201, 431936, + 435457, 446002, 449253}; + std::vector oi_lengths = {3503, 823, 3251, 3251, 3251, 5280, 3251, + 5280, 9735, 3521, 10545, 3251, 3251}; for (int i = 0; i < row_group_metadata->num_columns(); ++i) { auto col_chunk_metadata = row_group_metadata->ColumnChunk(i); - auto ci_locaiton = col_chunk_metadata->GetColumIndexLocation(); + auto ci_location = col_chunk_metadata->GetColumIndexLocation(); if (i == 10) { // column_id 10 does not have column index - ASSERT_FALSE(ci_locaiton.has_value()); + ASSERT_FALSE(ci_location.has_value()); } else { - ASSERT_TRUE(ci_locaiton.has_value()); + ASSERT_TRUE(ci_location.has_value()); } - if (ci_locaiton.has_value()) { - ASSERT_EQ(ci_offsets.at(i), ci_locaiton->offset); - ASSERT_EQ(ci_lengths.at(i), ci_locaiton->length); + if (ci_location.has_value()) { + ASSERT_EQ(ci_offsets.at(i), ci_location->offset); + ASSERT_EQ(ci_lengths.at(i), ci_location->length); } auto oi_location = col_chunk_metadata->GetOffsetIndexLocation(); ASSERT_TRUE(oi_location.has_value()); - if (oi_location.has_value()) { - ASSERT_EQ(oi_offsets.at(i), oi_location->offset); - ASSERT_EQ(oi_lengths.at(i), oi_location->length); - } + ASSERT_EQ(oi_offsets.at(i), oi_location->offset); + ASSERT_EQ(oi_lengths.at(i), oi_location->length); } }