Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions cpp/src/parquet/metadata.cc
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,20 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
}
}

// Returns where the column index of this column chunk is stored.
//
// The location is reported only when both the offset and the length fields
// are flagged as set on the underlying column struct; otherwise the result
// is std::nullopt.
std::optional<IndexLocation> GetColumIndexLocation() const {
  const bool has_location =
      column_->__isset.column_index_offset && column_->__isset.column_index_length;
  if (!has_location) {
    return std::nullopt;
  }
  return IndexLocation{column_->column_index_offset, column_->column_index_length};
}

// Returns where the offset index of this column chunk is stored.
//
// The location is reported only when both the offset and the length fields
// are flagged as set on the underlying column struct; otherwise the result
// is std::nullopt.
std::optional<IndexLocation> GetOffsetIndexLocation() const {
  const bool has_location =
      column_->__isset.offset_index_offset && column_->__isset.offset_index_length;
  if (!has_location) {
    return std::nullopt;
  }
  return IndexLocation{column_->offset_index_offset, column_->offset_index_length};
}

private:
mutable std::shared_ptr<Statistics> possible_stats_;
std::vector<Encoding::type> encodings_;
Expand Down Expand Up @@ -420,6 +434,14 @@ std::unique_ptr<ColumnCryptoMetaData> ColumnChunkMetaData::crypto_metadata() con
return impl_->crypto_metadata();
}

std::optional<IndexLocation> ColumnChunkMetaData::GetColumIndexLocation() const {
return impl_->GetColumIndexLocation();
}

std::optional<IndexLocation> ColumnChunkMetaData::GetOffsetIndexLocation() const {
return impl_->GetOffsetIndexLocation();
}

// Compares this column chunk metadata with `other` by delegating to the
// comparison defined on the implementation objects.
bool ColumnChunkMetaData::Equals(const ColumnChunkMetaData& other) const {
  auto& other_impl = *other.impl_;
  return impl_->Equals(other_impl);
}
Expand Down
11 changes: 11 additions & 0 deletions cpp/src/parquet/metadata.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <cstdint>
#include <map>
#include <memory>
#include <optional>
#include <string>
#include <utility>
#include <vector>
Expand Down Expand Up @@ -118,6 +119,14 @@ struct PageEncodingStats {
int32_t count;
};

/// \brief Location of a page index (column index or offset index) that is
/// referenced from ColumnChunkMetaData.
///
/// Both members default to zero so that a default-constructed IndexLocation
/// never carries indeterminate values. The struct remains an aggregate, so
/// `IndexLocation{offset, length}` keeps working.
struct IndexLocation {
  /// File offset of the given index, in bytes
  int64_t offset = 0;
  /// Length of the given index, in bytes
  int32_t length = 0;
};

/// \brief ColumnChunkMetaData is a proxy around format::ColumnChunkMetaData.
class PARQUET_EXPORT ColumnChunkMetaData {
public:
Expand Down Expand Up @@ -170,6 +179,8 @@ class PARQUET_EXPORT ColumnChunkMetaData {
int64_t total_compressed_size() const;
int64_t total_uncompressed_size() const;
std::unique_ptr<ColumnCryptoMetaData> crypto_metadata() const;
/// \brief Location of the column index of this column chunk.
///
/// \return the offset and length of the column index, or std::nullopt when
/// the metadata does not record both of them.
std::optional<IndexLocation> GetColumIndexLocation() const;
/// \brief Location of the offset index of this column chunk.
///
/// \return the offset and length of the offset index, or std::nullopt when
/// the metadata does not record both of them.
std::optional<IndexLocation> GetOffsetIndexLocation() const;

private:
explicit ColumnChunkMetaData(
Expand Down
40 changes: 40 additions & 0 deletions cpp/src/parquet/metadata_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,10 @@
#include <gtest/gtest.h>

#include "arrow/util/key_value_metadata.h"
#include "parquet/file_reader.h"
#include "parquet/schema.h"
#include "parquet/statistics.h"
#include "parquet/test_util.h"
#include "parquet/thrift_internal.h"
#include "parquet/types.h"

Expand Down Expand Up @@ -292,6 +294,44 @@ TEST(Metadata, TestKeyValueMetadata) {
EXPECT_TRUE(f_accessor->key_value_metadata()->Equals(*kvmeta));
}

// Checks that the column index and offset index locations of every column
// chunk in alltypes_tiny_pages.parquet are exposed through
// ColumnChunkMetaData with the expected offsets and lengths.
TEST(Metadata, TestReadPageIndex) {
  std::string data_dir(parquet::test::get_data_dir());
  std::string file_path = data_dir + "/alltypes_tiny_pages.parquet";
  auto file_reader = ParquetFileReader::OpenFile(file_path, false);
  auto file_meta = file_reader->metadata();
  ASSERT_EQ(1, file_meta->num_row_groups());
  auto rg_meta = file_meta->RowGroup(0);
  ASSERT_EQ(13, rg_meta->num_columns());

  // Expected locations, indexed by column. The -1 entries are placeholders
  // for column 10, which has no column index and is never compared.
  std::vector<int64_t> ci_offsets = {323583, 327502, 328009, 331928, 335847,
                                     339766, 350345, 354264, 364843, 384342,
                                     -1,     386473, 390392};
  std::vector<int32_t> ci_lengths = {3919,  507,   3919, 3919, 3919, 10579, 3919,
                                     10579, 19499, 2131, -1,   3919, 3919};
  std::vector<int64_t> oi_offsets = {394311, 397814, 398637, 401888, 405139,
                                     408390, 413670, 416921, 422201, 431936,
                                     435457, 446002, 449253};
  std::vector<int32_t> oi_lengths = {3503, 823,  3251, 3251,  3251, 5280, 3251,
                                     5280, 9735, 3521, 10545, 3251, 3251};

  const int num_columns = rg_meta->num_columns();
  for (int col = 0; col < num_columns; ++col) {
    auto chunk_meta = rg_meta->ColumnChunk(col);

    auto column_index = chunk_meta->GetColumIndexLocation();
    if (col == 10) {
      // column_id 10 does not have column index
      ASSERT_FALSE(column_index.has_value());
    } else {
      ASSERT_TRUE(column_index.has_value());
      ASSERT_EQ(ci_offsets.at(col), column_index->offset);
      ASSERT_EQ(ci_lengths.at(col), column_index->length);
    }

    // Every column is expected to carry an offset index.
    auto offset_index = chunk_meta->GetOffsetIndexLocation();
    ASSERT_TRUE(offset_index.has_value());
    ASSERT_EQ(oi_offsets.at(col), offset_index->offset);
    ASSERT_EQ(oi_lengths.at(col), offset_index->length);
  }
}

TEST(ApplicationVersion, Basics) {
ApplicationVersion version("parquet-mr version 1.7.9");
ApplicationVersion version1("parquet-mr version 1.8.0");
Expand Down