diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index c97e1949552..1e1f96d906a 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -312,6 +312,20 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { } } + std::optional GetColumIndexLocation() const { + if (column_->__isset.column_index_offset && column_->__isset.column_index_length) { + return IndexLocation{column_->column_index_offset, column_->column_index_length}; + } + return std::nullopt; + } + + std::optional GetOffsetIndexLocation() const { + if (column_->__isset.offset_index_offset && column_->__isset.offset_index_length) { + return IndexLocation{column_->offset_index_offset, column_->offset_index_length}; + } + return std::nullopt; + } + private: mutable std::shared_ptr possible_stats_; std::vector encodings_; @@ -420,6 +434,14 @@ std::unique_ptr ColumnChunkMetaData::crypto_metadata() con return impl_->crypto_metadata(); } +std::optional ColumnChunkMetaData::GetColumIndexLocation() const { + return impl_->GetColumIndexLocation(); +} + +std::optional ColumnChunkMetaData::GetOffsetIndexLocation() const { + return impl_->GetOffsetIndexLocation(); +} + bool ColumnChunkMetaData::Equals(const ColumnChunkMetaData& other) const { return impl_->Equals(*other.impl_); } diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index bd59c628dc8..8c619c5c63b 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -118,6 +119,14 @@ struct PageEncodingStats { int32_t count; }; +/// \brief Public struct for location to page index in ColumnChunkMetaData. +struct IndexLocation { + /// File offset of the given index, in bytes + int64_t offset; + /// Length of the given index, in bytes + int32_t length; +}; + /// \brief ColumnChunkMetaData is a proxy around format::ColumnChunkMetaData. class PARQUET_EXPORT ColumnChunkMetaData { public: @@ -170,6 +179,8 @@ class PARQUET_EXPORT ColumnChunkMetaData { int64_t total_compressed_size() const; int64_t total_uncompressed_size() const; std::unique_ptr crypto_metadata() const; + std::optional GetColumIndexLocation() const; + std::optional GetOffsetIndexLocation() const; private: explicit ColumnChunkMetaData( diff --git a/cpp/src/parquet/metadata_test.cc b/cpp/src/parquet/metadata_test.cc index a89d3d97fa9..cabfb8078cf 100644 --- a/cpp/src/parquet/metadata_test.cc +++ b/cpp/src/parquet/metadata_test.cc @@ -20,8 +20,10 @@ #include #include "arrow/util/key_value_metadata.h" +#include "parquet/file_reader.h" #include "parquet/schema.h" #include "parquet/statistics.h" +#include "parquet/test_util.h" #include "parquet/thrift_internal.h" #include "parquet/types.h" @@ -292,6 +294,44 @@ TEST(Metadata, TestKeyValueMetadata) { EXPECT_TRUE(f_accessor->key_value_metadata()->Equals(*kvmeta)); } +TEST(Metadata, TestReadPageIndex) { + std::string dir_string(parquet::test::get_data_dir()); + std::string path = dir_string + "/alltypes_tiny_pages.parquet"; + auto reader = ParquetFileReader::OpenFile(path, false); + auto file_metadata = reader->metadata(); + ASSERT_EQ(1, file_metadata->num_row_groups()); + auto row_group_metadata = file_metadata->RowGroup(0); + ASSERT_EQ(13, row_group_metadata->num_columns()); + std::vector ci_offsets = {323583, 327502, 328009, 331928, 335847, + 339766, 350345, 354264, 364843, 384342, + -1, 386473, 390392}; + std::vector ci_lengths = {3919, 507, 3919, 3919, 3919, 10579, 3919, + 10579, 19499, 2131, -1, 3919, 3919}; + std::vector oi_offsets = {394311, 397814, 398637, 401888, 405139, + 408390, 413670, 416921, 422201, 431936, + 435457, 446002, 449253}; + std::vector oi_lengths = {3503, 823, 3251, 3251, 3251, 5280, 3251, + 5280, 9735, 3521, 10545, 3251, 3251}; + for (int i = 0; i < row_group_metadata->num_columns(); ++i) { + auto col_chunk_metadata = row_group_metadata->ColumnChunk(i); + auto ci_location = col_chunk_metadata->GetColumIndexLocation(); + if (i == 10) { + // column_id 10 does not have column index + ASSERT_FALSE(ci_location.has_value()); + } else { + ASSERT_TRUE(ci_location.has_value()); + } + if (ci_location.has_value()) { + ASSERT_EQ(ci_offsets.at(i), ci_location->offset); + ASSERT_EQ(ci_lengths.at(i), ci_location->length); + } + auto oi_location = col_chunk_metadata->GetOffsetIndexLocation(); + ASSERT_TRUE(oi_location.has_value()); + ASSERT_EQ(oi_offsets.at(i), oi_location->offset); + ASSERT_EQ(oi_lengths.at(i), oi_location->length); + } +} + TEST(ApplicationVersion, Basics) { ApplicationVersion version("parquet-mr version 1.7.9"); ApplicationVersion version1("parquet-mr version 1.8.0");