From b9f1278495ad3e8cdb9e678cac0b06f829dd9da1 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Fri, 20 Dec 2024 11:52:54 -0500 Subject: [PATCH 1/4] GH-45092: [C++][Parquet] Add GetReadRanges function to FileReader --- cpp/src/parquet/file_reader.cc | 16 ++++++++++++++++ cpp/src/parquet/file_reader.h | 26 ++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 1c9b2323de50..96ff364f170d 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -29,6 +29,7 @@ #include "arrow/io/caching.h" #include "arrow/io/file.h" #include "arrow/io/memory.h" +#include "arrow/io/util_internal.h" #include "arrow/util/bit_util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/future.h" @@ -400,6 +401,21 @@ class SerializedFile : public ParquetFileReader::Contents { PARQUET_THROW_NOT_OK(cached_source_->Cache(ranges)); } + ::arrow::Result<std::vector<::arrow::io::ReadRange>> GetReadRanges( + const std::vector<int>& row_groups, const std::vector<int>& column_indices, + int64_t hole_size_limit, int64_t range_size_limit) { + std::vector<::arrow::io::ReadRange> ranges; + for (int row : row_groups) { + for (int col : column_indices) { + ranges.push_back( + ComputeColumnChunkRange(file_metadata_.get(), source_size_, row, col)); + } + } + + return ::arrow::io::internal::CoalesceReadRanges(std::move(ranges), hole_size_limit, + range_size_limit); + } + ::arrow::Future<> WhenBuffered(const std::vector<int>& row_groups, const std::vector<int>& column_indices) const { if (!cached_source_) { diff --git a/cpp/src/parquet/file_reader.h b/cpp/src/parquet/file_reader.h index b59b59f95c2d..1c85c10a686e 100644 --- a/cpp/src/parquet/file_reader.h +++ b/cpp/src/parquet/file_reader.h @@ -201,6 +201,32 @@ class PARQUET_EXPORT ParquetFileReader { const ::arrow::io::IOContext& ctx, const ::arrow::io::CacheOptions& options); + // Retrieve the list of byte ranges that would need to be read to retrieve + // the data for the specified row groups 
and column indices. + // + // A reader can optionally call this if they wish to handle their own + // caching and management of file reads (or offload them to other readers). + // Unlike PreBuffer, this method will not perform any actual caching or + // reads, instead just using the file metadata to determine the byte ranges + // that would need to be read if you were to consume the entirety of the column + // chunks for the provided columns in the specified row groups. + // + // If row_groups or column_indices are empty, then the result of this will be empty. + // + // hole_size_limit represents the maximum distance, in bytes, between two + // consecutive ranges; beyond this value, ranges will not be combined. The default + // value is 1MB. + // + // range_size_limit is the maximum size in bytes of a combined range; if combining + // two consecutive ranges would produce a range larger than this, they are not combined. + // The default values is 64MB. This *must* be larger than hole_size_limit. + // + // This will not take into account page indexes or any other predicate push down + // benefits that may be available. + ::arrow::Result<std::vector<::arrow::io::ReadRange>> GetReadRanges( + const std::vector<int>& row_groups, const std::vector<int>& column_indices, + int64_t hole_size_limit = 1024 * 1024, int64_t range_size_limit = 64 * 1024 * 1024); + /// Wait for the specified row groups and column indices to be pre-buffered. 
/// /// After the returned Future completes, reading the specified row From 5fdd78ce8f96456809aa81b00115c36aee5be0b2 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Mon, 23 Dec 2024 17:17:44 -0500 Subject: [PATCH 2/4] fix lint and rename var --- cpp/src/parquet/file_reader.cc | 6 +++--- cpp/src/parquet/file_reader.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 96ff364f170d..f42256540788 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -404,11 +404,11 @@ class SerializedFile : public ParquetFileReader::Contents { ::arrow::Result<std::vector<::arrow::io::ReadRange>> GetReadRanges( const std::vector<int>& row_groups, const std::vector<int>& column_indices, int64_t hole_size_limit, int64_t range_size_limit) { - std::vector<::arrow::io::ReadRange> ranges; - for (int row : row_groups) { + std::vector<::arrow::io::ReadRange> ranges; + for (int row_group : row_groups) { for (int col : column_indices) { ranges.push_back( - ComputeColumnChunkRange(file_metadata_.get(), source_size_, row, col)); + ComputeColumnChunkRange(file_metadata_.get(), source_size_, row_group, col)); } } diff --git a/cpp/src/parquet/file_reader.h b/cpp/src/parquet/file_reader.h index 1c85c10a686e..5b76cc200bda 100644 --- a/cpp/src/parquet/file_reader.h +++ b/cpp/src/parquet/file_reader.h @@ -206,13 +206,13 @@ class PARQUET_EXPORT ParquetFileReader { // // A reader can optionally call this if they wish to handle their own // caching and management of file reads (or offload them to other readers). - // Unlike PreBuffer, this method will not perform any actual caching or + // Unlike PreBuffer, this method will not perform any actual caching or // reads, instead just using the file metadata to determine the byte ranges // that would need to be read if you were to consume the entirety of the column // chunks for the provided columns in the specified row groups. 
// // If row_groups or column_indices are empty, then the result of this will be empty. - // + // // hole_size_limit represents the maximum distance, in bytes, between two // consecutive ranges; beyond this value, ranges will not be combined. The default // value is 1MB. From b5621a4b0b67a155eb04df737f784f4d10a5067e Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Fri, 3 Jan 2025 11:06:56 -0500 Subject: [PATCH 3/4] add third forward slash for doxygen --- cpp/src/parquet/file_reader.h | 44 +++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/cpp/src/parquet/file_reader.h b/cpp/src/parquet/file_reader.h index 5b76cc200bda..8b605f8187fc 100644 --- a/cpp/src/parquet/file_reader.h +++ b/cpp/src/parquet/file_reader.h @@ -201,28 +201,28 @@ class PARQUET_EXPORT ParquetFileReader { const ::arrow::io::IOContext& ctx, const ::arrow::io::CacheOptions& options); - // Retrieve the list of byte ranges that would need to be read to retrieve - // the data for the specified row groups and column indices. - // - // A reader can optionally call this if they wish to handle their own - // caching and management of file reads (or offload them to other readers). - // Unlike PreBuffer, this method will not perform any actual caching or - // reads, instead just using the file metadata to determine the byte ranges - // that would need to be read if you were to consume the entirety of the column - // chunks for the provided columns in the specified row groups. - // - // If row_groups or column_indices are empty, then the result of this will be empty. - // - // hole_size_limit represents the maximum distance, in bytes, between two - // consecutive ranges; beyond this value, ranges will not be combined. The default - // value is 1MB. - // - // range_size_limit is the maximum size in bytes of a combined range; if combining - // two consecutive ranges would produce a range larger than this, they are not combined. - // The default values is 64MB. 
This *must* be larger than hole_size_limit. - // - // This will not take into account page indexes or any other predicate push down - // benefits that may be available. + /// Retrieve the list of byte ranges that would need to be read to retrieve + /// the data for the specified row groups and column indices. + /// + /// A reader can optionally call this if they wish to handle their own + /// caching and management of file reads (or offload them to other readers). + /// Unlike PreBuffer, this method will not perform any actual caching or + /// reads, instead just using the file metadata to determine the byte ranges + /// that would need to be read if you were to consume the entirety of the column + /// chunks for the provided columns in the specified row groups. + /// + /// If row_groups or column_indices are empty, then the result of this will be empty. + /// + /// hole_size_limit represents the maximum distance, in bytes, between two + /// consecutive ranges; beyond this value, ranges will not be combined. The default + /// value is 1MB. + /// + /// range_size_limit is the maximum size in bytes of a combined range; if combining + /// two consecutive ranges would produce a range larger than this, they are not combined. + /// The default values is 64MB. This *must* be larger than hole_size_limit. + /// + /// This will not take into account page indexes or any other predicate push down + /// benefits that may be available. 
::arrow::Result<std::vector<::arrow::io::ReadRange>> GetReadRanges( const std::vector<int>& row_groups, const std::vector<int>& column_indices, int64_t hole_size_limit = 1024 * 1024, int64_t range_size_limit = 64 * 1024 * 1024); From 42002fad1acb1601ab92fa8f0393d63b466a0f2c Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Fri, 3 Jan 2025 11:24:05 -0500 Subject: [PATCH 4/4] run pre-commit lint --- cpp/src/parquet/file_reader.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/file_reader.h b/cpp/src/parquet/file_reader.h index 8b605f8187fc..c42163276cda 100644 --- a/cpp/src/parquet/file_reader.h +++ b/cpp/src/parquet/file_reader.h @@ -218,8 +218,8 @@ class PARQUET_EXPORT ParquetFileReader { /// value is 1MB. /// /// range_size_limit is the maximum size in bytes of a combined range; if combining - /// two consecutive ranges would produce a range larger than this, they are not combined. - /// The default values is 64MB. This *must* be larger than hole_size_limit. + /// two consecutive ranges would produce a range larger than this, they are not + /// combined. The default value is 64MB. This *must* be larger than hole_size_limit. /// /// This will not take into account page indexes or any other predicate push down /// benefits that may be available.