From 2ead01724d7baab88af49755e0c3a9c8bde6528a Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Wed, 22 Nov 2023 21:36:44 +0800 Subject: [PATCH 01/25] happy path pass --- cpp/examples/arrow/parquet_read_write.cc | 297 ++++++++++---------- cpp/src/parquet/arrow/reader.cc | 116 +++++++- cpp/src/parquet/arrow/reader.h | 10 + cpp/src/parquet/arrow/reader_internal.h | 5 + cpp/src/parquet/column_reader.cc | 33 ++- cpp/src/parquet/column_reader.h | 333 +++++++++++++++++++++++ cpp/src/parquet/reader_test.cc | 3 + 7 files changed, 647 insertions(+), 150 deletions(-) diff --git a/cpp/examples/arrow/parquet_read_write.cc b/cpp/examples/arrow/parquet_read_write.cc index 3b8b4c2212b7..cc267f38d73d 100644 --- a/cpp/examples/arrow/parquet_read_write.cc +++ b/cpp/examples/arrow/parquet_read_write.cc @@ -23,168 +23,189 @@ #include "parquet/arrow/writer.h" #include - -arrow::Status ReadFullFile(std::string path_to_file) { - // #include "arrow/io/api.h" - // #include "arrow/parquet/arrow/reader.h" - - arrow::MemoryPool* pool = arrow::default_memory_pool(); - std::shared_ptr input; - ARROW_ASSIGN_OR_RAISE(input, arrow::io::ReadableFile::Open(path_to_file)); - - // Open Parquet file reader - std::unique_ptr arrow_reader; - ARROW_RETURN_NOT_OK(parquet::arrow::OpenFile(input, pool, &arrow_reader)); - - // Read entire file as a single Arrow table - std::shared_ptr table; - ARROW_RETURN_NOT_OK(arrow_reader->ReadTable(&table)); - return arrow::Status::OK(); -} +#include arrow::Status ReadInBatches(std::string path_to_file) { - // #include "arrow/io/api.h" - // #include "arrow/parquet/arrow/reader.h" - - arrow::MemoryPool* pool = arrow::default_memory_pool(); - - // Configure general Parquet reader settings - auto reader_properties = parquet::ReaderProperties(pool); - reader_properties.set_buffer_size(4096 * 4); - reader_properties.enable_buffered_stream(); - - // Configure Arrow-specific Parquet reader settings - auto arrow_reader_props = parquet::ArrowReaderProperties(); - 
arrow_reader_props.set_batch_size(128 * 1024); // default 64 * 1024 - - parquet::arrow::FileReaderBuilder reader_builder; - ARROW_RETURN_NOT_OK( - reader_builder.OpenFile(path_to_file, /*memory_map=*/false, reader_properties)); - reader_builder.memory_pool(pool); - reader_builder.properties(arrow_reader_props); - - std::unique_ptr arrow_reader; - ARROW_ASSIGN_OR_RAISE(arrow_reader, reader_builder.Build()); - - std::shared_ptr<::arrow::RecordBatchReader> rb_reader; - ARROW_RETURN_NOT_OK(arrow_reader->GetRecordBatchReader(&rb_reader)); - - for (arrow::Result> maybe_batch : *rb_reader) { - // Operate on each batch... - } - return arrow::Status::OK(); + // #include "arrow/io/api.h" + // #include "arrow/parquet/arrow/reader.h" + + arrow::MemoryPool* pool = arrow::default_memory_pool(); + + // Configure general Parquet reader settings + auto reader_properties = parquet::ReaderProperties(pool); + reader_properties.set_buffer_size(4096 * 4); + reader_properties.enable_buffered_stream(); + + // Configure Arrow-specific Parquet reader settings + auto arrow_reader_props = parquet::ArrowReaderProperties(); + arrow_reader_props.set_batch_size(10); // default 64 * 1024 + + parquet::arrow::FileReaderBuilder reader_builder; + ARROW_RETURN_NOT_OK( + reader_builder.OpenFile(path_to_file, /*memory_map=*/false, reader_properties)); + reader_builder.memory_pool(pool); + reader_builder.properties(arrow_reader_props); + + std::unique_ptr arrow_reader; + ARROW_ASSIGN_OR_RAISE(arrow_reader, reader_builder.Build()); + + std::shared_ptr<::arrow::RecordBatchReader> rb_reader; + auto row_ranges_map = std::make_shared>(); + + std::vector ranges; + for (int64_t i = 0; i < 50; i++) { + if (i % 2 == 0) + ranges.push_back({i, i}); + } + row_ranges_map->insert({0, std::make_shared(ranges)}); + + + ARROW_RETURN_NOT_OK(arrow_reader->GetRecordBatchReader({0,1},{0,1},row_ranges_map,&rb_reader)); + + size_t total_rows = 0; + size_t total_values = 0; + for (arrow::Result> maybe_batch: *rb_reader) { + // 
Operate on each batch... + auto batch = maybe_batch.ValueOrDie(); + total_rows += batch->num_rows(); + std::cout << "batch size: " << batch->num_rows() << std::endl; + + auto int_array = std::dynamic_pointer_cast(batch->column(1)); + for (auto iter = int_array->begin(); iter != int_array->end(); ++iter) { + total_values += (*iter).value(); + } + } + std::cout << "total rows is : " << total_rows << std::endl; + std::cout << "total value of y is : " << total_values << std::endl; + return arrow::Status::OK(); } arrow::Result> GetTable() { - auto builder = arrow::Int32Builder(); - - std::shared_ptr arr_x; - ARROW_RETURN_NOT_OK(builder.AppendValues({1, 3, 5, 7, 1})); - ARROW_RETURN_NOT_OK(builder.Finish(&arr_x)); - - std::shared_ptr arr_y; - ARROW_RETURN_NOT_OK(builder.AppendValues({2, 4, 6, 8, 10})); - ARROW_RETURN_NOT_OK(builder.Finish(&arr_y)); - - auto schema = arrow::schema( - {arrow::field("x", arrow::int32()), arrow::field("y", arrow::int32())}); - - return arrow::Table::Make(schema, {arr_x, arr_y}); + auto builder = arrow::Int32Builder(); + + std::shared_ptr arr_x; + ARROW_RETURN_NOT_OK(builder.AppendValues(arrow::internal::Iota(0,100))); + ARROW_RETURN_NOT_OK(builder.Finish(&arr_x)); + + std::shared_ptr arr_y; + ARROW_RETURN_NOT_OK(builder.AppendValues(arrow::internal::Iota(0,100))); + ARROW_RETURN_NOT_OK(builder.Finish(&arr_y)); + + std::shared_ptr arr_z_values; + std::shared_ptr arr_z_offsets; + ARROW_RETURN_NOT_OK(builder.AppendValues(arrow::internal::Iota(0,300))); + ARROW_RETURN_NOT_OK(builder.Finish(&arr_z_values)); + std::vector offsets = arrow::internal::Iota(0, 101); + std::transform(offsets.begin(), offsets.end(), offsets.begin(), [](int x) { return x * 3; }); + ARROW_RETURN_NOT_OK(builder.AppendValues(offsets)); + ARROW_RETURN_NOT_OK(builder.Finish(&arr_z_offsets)); + ARROW_ASSIGN_OR_RAISE(auto arr_z, arrow::ListArray::FromArrays(*arr_z_offsets, *arr_z_values)); + + + auto schema = arrow::schema( + { + arrow::field("x", arrow::int32()), + 
arrow::field("y", arrow::int32()), + arrow::field("z", arrow::list(arrow::int32())) + }); + + return arrow::Table::Make(schema, {arr_x, arr_y, arr_z}); } arrow::Result> GetRBR() { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr table, GetTable()); - auto reader = std::make_shared(table); - reader->set_chunksize(10); - return reader; + ARROW_ASSIGN_OR_RAISE(std::shared_ptr table, GetTable()); + auto reader = std::make_shared(table); + reader->set_chunksize(10); + return reader; } arrow::Status WriteFullFile(std::string path_to_file) { - // #include "parquet/arrow/writer.h" - // #include "arrow/util/type_fwd.h" - using parquet::ArrowWriterProperties; - using parquet::WriterProperties; + using parquet::ArrowWriterProperties; + using parquet::WriterProperties; - ARROW_ASSIGN_OR_RAISE(std::shared_ptr table, GetTable()); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr table, GetTable()); - // Choose compression - std::shared_ptr props = - WriterProperties::Builder().compression(arrow::Compression::SNAPPY)->build(); + // Choose compression + std::shared_ptr props = + WriterProperties::Builder().max_row_group_length(50)->enable_write_page_index()->write_batch_size(13) + ->data_pagesize(1) // this will cause every batch creating a page + ->compression(arrow::Compression::SNAPPY)->build(); + std::cout << "hello" << std::endl; - // Opt to store Arrow schema for easier reads back into Arrow - std::shared_ptr arrow_props = - ArrowWriterProperties::Builder().store_schema()->build(); + // Opt to store Arrow schema for easier reads back into Arrow + std::shared_ptr arrow_props = + ArrowWriterProperties::Builder().store_schema()->build(); - std::shared_ptr outfile; - ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open(path_to_file)); + std::shared_ptr outfile; + ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open(path_to_file)); - ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table.get(), - arrow::default_memory_pool(), outfile, - /*chunk_size=*/3, props, arrow_props)); 
- return arrow::Status::OK(); + ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table.get(), + arrow::default_memory_pool(), outfile, + /*chunk_size=*/100, props, arrow_props)); + return arrow::Status::OK(); } arrow::Status WriteInBatches(std::string path_to_file) { - // #include "parquet/arrow/writer.h" - // #include "arrow/util/type_fwd.h" - using parquet::ArrowWriterProperties; - using parquet::WriterProperties; - - // Data is in RBR - std::shared_ptr batch_stream; - ARROW_ASSIGN_OR_RAISE(batch_stream, GetRBR()); - - // Choose compression - std::shared_ptr props = - WriterProperties::Builder().compression(arrow::Compression::SNAPPY)->build(); - - // Opt to store Arrow schema for easier reads back into Arrow - std::shared_ptr arrow_props = - ArrowWriterProperties::Builder().store_schema()->build(); - - // Create a writer - std::shared_ptr outfile; - ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open(path_to_file)); - std::unique_ptr writer; - ARROW_ASSIGN_OR_RAISE( - writer, parquet::arrow::FileWriter::Open(*batch_stream->schema().get(), - arrow::default_memory_pool(), outfile, - props, arrow_props)); - - // Write each batch as a row_group - for (arrow::Result> maybe_batch : *batch_stream) { - ARROW_ASSIGN_OR_RAISE(auto batch, maybe_batch); - ARROW_ASSIGN_OR_RAISE(auto table, - arrow::Table::FromRecordBatches(batch->schema(), {batch})); - ARROW_RETURN_NOT_OK(writer->WriteTable(*table.get(), batch->num_rows())); - } - - // Write file footer and close - ARROW_RETURN_NOT_OK(writer->Close()); - - return arrow::Status::OK(); + // #include "parquet/arrow/writer.h" + // #include "arrow/util/type_fwd.h" + using parquet::ArrowWriterProperties; + using parquet::WriterProperties; + + // Data is in RBR + std::shared_ptr batch_stream; + ARROW_ASSIGN_OR_RAISE(batch_stream, GetRBR()); + + // Choose compression + std::shared_ptr props = + WriterProperties::Builder().compression(arrow::Compression::SNAPPY)->build(); + + // Opt to store Arrow schema for easier reads 
back into Arrow + std::shared_ptr arrow_props = + ArrowWriterProperties::Builder().store_schema()->build(); + + // Create a writer + std::shared_ptr outfile; + ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open(path_to_file)); + std::unique_ptr writer; + ARROW_ASSIGN_OR_RAISE( + writer, parquet::arrow::FileWriter::Open(*batch_stream->schema().get(), + arrow::default_memory_pool(), outfile, + props, arrow_props)); + + // Write each batch as a row_group + for (arrow::Result> maybe_batch: *batch_stream) { + ARROW_ASSIGN_OR_RAISE(auto batch, maybe_batch); + ARROW_ASSIGN_OR_RAISE(auto table, + arrow::Table::FromRecordBatches(batch->schema(), {batch})); + ARROW_RETURN_NOT_OK(writer->WriteTable(*table.get(), batch->num_rows())); + } + + // Write file footer and close + ARROW_RETURN_NOT_OK(writer->Close()); + + return arrow::Status::OK(); } arrow::Status RunExamples(std::string path_to_file) { - ARROW_RETURN_NOT_OK(WriteFullFile(path_to_file)); - ARROW_RETURN_NOT_OK(ReadFullFile(path_to_file)); - ARROW_RETURN_NOT_OK(WriteInBatches(path_to_file)); - ARROW_RETURN_NOT_OK(ReadInBatches(path_to_file)); - return arrow::Status::OK(); + // ARROW_RETURN_NOT_OK(WriteFullFile(path_to_file)); + // ARROW_RETURN_NOT_OK(ReadFullFile(path_to_file)); + // ARROW_RETURN_NOT_OK(WriteInBatches(path_to_file)); + ARROW_RETURN_NOT_OK(ReadInBatches(path_to_file)); + return arrow::Status::OK(); } int main(int argc, char** argv) { - if (argc != 2) { - // Fake success for CI purposes. + if (argc != 2) { + // Fake success for CI purposes. 
+ return EXIT_SUCCESS; + } + + std::string path_to_file = argv[1]; + arrow::Status status = RunExamples(path_to_file); + + if (!status.ok()) { + std::cerr << "Error occurred: " << status.message() << std::endl; + return EXIT_FAILURE; + } return EXIT_SUCCESS; - } - - std::string path_to_file = argv[1]; - arrow::Status status = RunExamples(path_to_file); - - if (!status.ok()) { - std::cerr << "Error occurred: " << status.message() << std::endl; - return EXIT_FAILURE; - } - return EXIT_SUCCESS; } diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 40fbdcbb562b..34316bf47c1b 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -17,12 +17,15 @@ #include "parquet/arrow/reader.h" +#include + #include #include #include #include #include #include +#include #include "arrow/array.h" #include "arrow/buffer.h" @@ -72,6 +75,8 @@ using arrow::internal::Iota; // Help reduce verbosity using ParquetReader = parquet::ParquetFileReader; +using parquet::RowRangesPtr; +using parquet::Range; using parquet::internal::RecordReader; namespace bit_util = arrow::bit_util; @@ -203,6 +208,7 @@ class FileReaderImpl : public FileReader { Status GetFieldReader(int i, const std::shared_ptr>& included_leaves, const std::vector& row_groups, + const std::shared_ptr>& row_ranges_map, std::unique_ptr* out) { // Should be covered by GetRecordBatchReader checks but // manifest_.schema_fields is a separate variable so be extra careful. 
@@ -219,11 +225,13 @@ class FileReaderImpl : public FileReader { ctx->iterator_factory = SomeRowGroupsFactory(row_groups); ctx->filter_leaves = true; ctx->included_leaves = included_leaves; + ctx->row_ranges_map = row_ranges_map; return GetReader(manifest_.schema_fields[i], ctx, out); } Status GetFieldReaders(const std::vector& column_indices, const std::vector& row_groups, + const std::shared_ptr>& row_ranges_map, std::vector>* out, std::shared_ptr<::arrow::Schema>* out_schema) { // We only need to read schema fields which have columns indicated @@ -238,7 +246,8 @@ class FileReaderImpl : public FileReader { for (size_t i = 0; i < out->size(); ++i) { std::unique_ptr reader; RETURN_NOT_OK( - GetFieldReader(field_indices[i], included_leaves, row_groups, &reader)); + GetFieldReader(field_indices[i], included_leaves, row_groups, + row_ranges_map, &reader)); out_fields[i] = reader->field(); out->at(i) = std::move(reader); @@ -265,7 +274,7 @@ class FileReaderImpl : public FileReader { std::vector row_groups = Iota(reader_->metadata()->num_row_groups()); std::unique_ptr reader; - RETURN_NOT_OK(GetFieldReader(i, included_leaves, row_groups, &reader)); + RETURN_NOT_OK(GetFieldReader(i, included_leaves, row_groups, NULLPTR, &reader)); return ReadColumn(i, row_groups, reader.get(), out); } @@ -336,19 +345,26 @@ class FileReaderImpl : public FileReader { return ReadRowGroup(i, Iota(reader_->metadata()->num_columns()), table); } + Status GetRecordBatchReader( + const std::vector& row_group_indices, const std::vector& column_indices, + const std::shared_ptr>& row_ranges_map, + std::unique_ptr* out) override; + Status GetRecordBatchReader(const std::vector& row_group_indices, const std::vector& column_indices, - std::unique_ptr* out) override; + std::unique_ptr* out) override { + return GetRecordBatchReader(row_group_indices, column_indices, NULLPTR, out); + } Status GetRecordBatchReader(const std::vector& row_group_indices, std::unique_ptr* out) override { return 
GetRecordBatchReader(row_group_indices, - Iota(reader_->metadata()->num_columns()), out); + Iota(reader_->metadata()->num_columns()), NULLPTR, out); } Status GetRecordBatchReader(std::unique_ptr* out) override { return GetRecordBatchReader(Iota(num_row_groups()), - Iota(reader_->metadata()->num_columns()), out); + Iota(reader_->metadata()->num_columns()), NULLPTR, out); } ::arrow::Result<::arrow::AsyncGenerator>> @@ -451,6 +467,43 @@ class RowGroupReaderImpl : public RowGroupReader { // ---------------------------------------------------------------------- // Column reader implementations +struct RowRangesPageFilter { + explicit RowRangesPageFilter(const RowRangesPtr & row_ranges_, const RowRangesPtr & page_ranges_) + : row_ranges(row_ranges_), page_ranges(page_ranges_) { + assert(row_ranges != nullptr); + assert(page_ranges != nullptr); + assert(row_ranges->getRanges().size() > 0); + assert(page_ranges->getRanges().size() > 0); + } + + bool operator() (const DataPageStats & stats) { + ++page_range_idx; + + if (row_range_idx >= row_ranges->getRanges().size()) { + return true; + } + + Range current_page_range = (*page_ranges)[page_range_idx]; + + if (current_page_range.isBefore((*row_ranges)[row_range_idx])) { + return true; + } + + while (row_range_idx < row_ranges->getRanges().size() && + current_page_range.isAfter((*row_ranges)[row_range_idx])) { + row_range_idx++; + } + + return row_range_idx >= row_ranges->getRanges().size(); + } + + size_t row_range_idx = 0; + const RowRangesPtr row_ranges; + + int page_range_idx = -1; + const RowRangesPtr page_ranges; +}; + // Leaf reader is for primitive arrays and primitive children of nested arrays class LeafReader : public ColumnReaderImpl { public: @@ -514,6 +567,43 @@ class LeafReader : public ColumnReaderImpl { std::shared_ptr out_; void NextRowGroup() { std::unique_ptr page_reader = input_->NextChunk(); + + /// using page index to reduce cost + if (page_reader != nullptr && ctx_->row_ranges_map) { + // if specific row 
range is provided for this rg + if (const auto iter = ctx_->row_ranges_map->find(input_->current_row_group()); + iter != ctx_->row_ranges_map->end()) { + + // check offset exists + auto offset_index = ctx_->reader->GetPageIndexReader() + ->RowGroup(input_->current_row_group()) + ->GetOffsetIndex(input_->column_index()); + if (!offset_index) { + throw ParquetException("Offset index is not found for column: " + field_->name()); + } + + const auto page_locations = offset_index->page_locations(); + auto page_ranges = std::make_shared(); + for (size_t i = 0; i < page_locations.size() - 1; i++) { + page_ranges->add({page_locations[i].first_row_index, + page_locations[i + 1].first_row_index - 1}, false); + } + if (page_locations.size() >= 1) { + page_ranges->add({ + page_locations[page_locations.size() - 1].first_row_index, + ctx_->reader->metadata()->RowGroup(input_->current_row_group())->num_rows() - 1}, false); + } + + // part 1, skip decompressing & decoding unnecessary pages + page_reader->set_data_page_filter(RowRangesPageFilter(iter->second, page_ranges)); + + // part 2, skip unnecessary rows in necessary pages + record_reader_->set_record_skipper(std::make_shared( + *page_ranges, *iter->second)); + } + } + + record_reader_->reset_current_rg_processed_records(); record_reader_->SetPageReader(std::move(page_reader)); } @@ -984,6 +1074,7 @@ Status GetReader(const SchemaField& field, const std::shared_ptr& Status FileReaderImpl::GetRecordBatchReader(const std::vector& row_groups, const std::vector& column_indices, + const std::shared_ptr>& row_ranges_map, std::unique_ptr* out) { RETURN_NOT_OK(BoundsCheck(row_groups, column_indices)); @@ -997,7 +1088,7 @@ Status FileReaderImpl::GetRecordBatchReader(const std::vector& row_groups, std::vector> readers; std::shared_ptr<::arrow::Schema> batch_schema; - RETURN_NOT_OK(GetFieldReaders(column_indices, row_groups, &readers, &batch_schema)); + RETURN_NOT_OK(GetFieldReaders(column_indices, row_groups,row_ranges_map, &readers, 
&batch_schema)); if (readers.empty()) { // Just generate all batches right now; they're cheap since they have no columns. @@ -1218,6 +1309,7 @@ Status FileReaderImpl::GetColumn(int i, FileColumnIteratorFactory iterator_facto ctx->pool = pool_; ctx->iterator_factory = iterator_factory; ctx->filter_leaves = false; + std::unique_ptr result; RETURN_NOT_OK(GetReader(manifest_.schema_fields[i], ctx, &result)); *out = std::move(result); @@ -1251,7 +1343,7 @@ Future> FileReaderImpl::DecodeRowGroups( // in a sync context too so use `this` over `self` std::vector> readers; std::shared_ptr<::arrow::Schema> result_schema; - RETURN_NOT_OK(GetFieldReaders(column_indices, row_groups, &readers, &result_schema)); + RETURN_NOT_OK(GetFieldReaders(column_indices, row_groups, NULLPTR, &readers, &result_schema)); // OptionalParallelForAsync requires an executor if (!cpu_executor) cpu_executor = ::arrow::internal::GetCpuThreadPool(); @@ -1314,6 +1406,16 @@ Status FileReader::GetRecordBatchReader(const std::vector& row_group_indice return Status::OK(); } +Status FileReader::GetRecordBatchReader(const std::vector& row_group_indices, + const std::vector& column_indices, + const std::shared_ptr>& row_ranges_map, + std::shared_ptr* out) { + std::unique_ptr tmp; + RETURN_NOT_OK(GetRecordBatchReader(row_group_indices, column_indices, row_ranges_map, &tmp)); + out->reset(tmp.release()); + return Status::OK(); +} + Status FileReader::Make(::arrow::MemoryPool* pool, std::unique_ptr reader, const ArrowReaderProperties& properties, diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h index 2cbd36176f5e..0fd35349b643 100644 --- a/cpp/src/parquet/arrow/reader.h +++ b/cpp/src/parquet/arrow/reader.h @@ -24,6 +24,7 @@ #include #include "parquet/file_reader.h" +#include "parquet/column_reader.h" #include "parquet/platform.h" #include "parquet/properties.h" @@ -187,6 +188,11 @@ class PARQUET_EXPORT FileReader { const std::vector& row_group_indices, const std::vector& column_indices, 
std::unique_ptr<::arrow::RecordBatchReader>* out) = 0; + virtual ::arrow::Status GetRecordBatchReader( + const std::vector& row_group_indices, const std::vector& column_indices, + const std::shared_ptr>& row_ranges_map, + std::unique_ptr<::arrow::RecordBatchReader>* out) = 0; + /// \brief Return a RecordBatchReader of row groups selected from /// row_group_indices, whose columns are selected by column_indices. /// @@ -199,6 +205,10 @@ class PARQUET_EXPORT FileReader { /// /// \returns error Status if either row_group_indices or column_indices /// contains an invalid index + ::arrow::Status GetRecordBatchReader(const std::vector& row_group_indices, + const std::vector& column_indices, + const std::shared_ptr>& row_ranges_map, + std::shared_ptr<::arrow::RecordBatchReader>* out); ::arrow::Status GetRecordBatchReader(const std::vector& row_group_indices, const std::vector& column_indices, std::shared_ptr<::arrow::RecordBatchReader>* out); diff --git a/cpp/src/parquet/arrow/reader_internal.h b/cpp/src/parquet/arrow/reader_internal.h index cf9dbb86577b..56be0f93f414 100644 --- a/cpp/src/parquet/arrow/reader_internal.h +++ b/cpp/src/parquet/arrow/reader_internal.h @@ -76,6 +76,7 @@ class FileColumnIterator { } auto row_group_reader = reader_->RowGroup(row_groups_.front()); + current_rg = row_groups_.front(); row_groups_.pop_front(); return row_group_reader->GetColumnPageReader(column_index_); } @@ -88,11 +89,14 @@ class FileColumnIterator { int column_index() const { return column_index_; } + int current_row_group() const { return current_rg; } + protected: int column_index_; ParquetFileReader* reader_; const SchemaDescriptor* schema_; std::deque row_groups_; + int current_rg = 0; }; using FileColumnIteratorFactory = @@ -109,6 +113,7 @@ struct ReaderContext { FileColumnIteratorFactory iterator_factory; bool filter_leaves; std::shared_ptr> included_leaves; + std::shared_ptr> row_ranges_map; bool IncludesLeaf(int leaf_index) const { if (this->filter_leaves) { diff --git 
a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 3294aaaf283f..5187ef94aa9c 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -1373,7 +1373,7 @@ class TypedRecordReader : public TypedColumnReaderImpl, int64_t records_read = 0; if (has_values_to_process()) { - records_read += ReadRecordData(num_records); + records_read += ReadRecordDataWithSkipCheck(num_records); } int64_t level_batch_size = std::max(kMinLevelBatchSize, num_records); @@ -1427,11 +1427,11 @@ class TypedRecordReader : public TypedColumnReaderImpl, } levels_written_ += levels_read; - records_read += ReadRecordData(num_records - records_read); + records_read += ReadRecordDataWithSkipCheck(num_records - records_read); } else { // No repetition or definition levels batch_size = std::min(num_records - records_read, batch_size); - records_read += ReadRecordData(batch_size); + records_read += ReadRecordDataWithSkipCheck(batch_size); } } @@ -1634,10 +1634,12 @@ class TypedRecordReader : public TypedColumnReaderImpl, // Top level required field. Number of records equals to number of levels, // and there is not read-ahead for levels. + int64_t skipped_records = 0; if (this->max_rep_level_ == 0 && this->max_def_level_ == 0) { - return this->Skip(num_records); + skipped_records = this->Skip(num_records); + current_rg_processed_records += skipped_records; + return skipped_records; } - int64_t skipped_records = 0; if (this->max_rep_level_ == 0) { // Non-repeated optional field. // First consume whatever is in the buffer. 
/// A closed interval [from, to] of row indices; both endpoints inclusive.
/// {-1, -1} is used as an "invalid/empty" sentinel by the static helpers.
struct Range {
  /// Union of two ranges when they overlap or are adjacent; {-1, -1} when
  /// they are disjoint and non-adjacent.
  static Range unionRange(const Range& left, const Range& right) {
    if (left.from <= right.from) {
      if (left.to + 1 >= right.from) {
        return {left.from, std::max(left.to, right.to)};
      }
    } else if (right.to + 1 >= left.from) {
      return {right.from, std::max(left.to, right.to)};
    }
    return {-1, -1};
  }

  /// Intersection of two ranges; {-1, -1} when they do not overlap.
  static Range intersection(const Range& left, const Range& right) {
    if (left.from <= right.from) {
      if (left.to >= right.from) {
        return {right.from, std::min(left.to, right.to)};
      }
    } else if (right.to >= left.from) {
      return {left.from, std::min(left.to, right.to)};
    }
    return {-1, -1};  // no intersection found
  }

  int64_t from;  // first row index, inclusive
  int64_t to;    // last row index, inclusive

  Range(const int64_t from_, const int64_t to_) : from(from_), to(to_) {
    assert(from <= to);
  }

  /// Number of rows covered by this range.
  size_t count() const { return static_cast<size_t>(to - from + 1); }

  bool isBefore(const Range& other) const { return to < other.from; }

  bool isAfter(const Range& other) const { return from > other.to; }

  bool isOverlap(const Range& other) const {
    return !isBefore(other) && !isAfter(other);
  }

  std::string toString() const {
    return "[" + std::to_string(from) + ", " + std::to_string(to) + "]";
  }
};

/// An ordered set of non-overlapping row Ranges within a single row group.
/// Ranges are kept sorted by `from`; add() keeps adjacent/overlapping ranges
/// coalesced unless explicitly told not to.
class RowRanges {
  std::vector<Range> ranges;

 public:
  RowRanges() = default;

  explicit RowRanges(const Range& range) { ranges.push_back(range); }

  RowRanges(const std::vector<Range>& ranges_) : ranges(ranges_) {}

  /// A single range covering rows [0, rowCount - 1].
  static RowRanges createSingle(const size_t rowCount) {
    return RowRanges({Range(0, static_cast<int64_t>(rowCount) - 1)});
  }

  /// Union of two ordered range sets, coalescing touching ranges.
  ///
  /// Implemented as a plain ordered merge. (The previous implementation
  /// swapped iterators between the two input vectors — undefined behavior —
  /// and walked a `left` iterator against `right`'s end() when `right` was
  /// empty, running past the end of `left`.)
  static RowRanges unionRanges(const RowRanges& left, const RowRanges& right) {
    RowRanges result;
    auto it1 = left.ranges.begin();
    auto it2 = right.ranges.begin();
    while (it1 != left.ranges.end() && it2 != right.ranges.end()) {
      if (it1->from <= it2->from) {
        result.add(*it1++);
      } else {
        result.add(*it2++);
      }
    }
    while (it1 != left.ranges.end()) result.add(*it1++);
    while (it2 != right.ranges.end()) result.add(*it2++);
    return result;
  }

  /// Intersection of two ordered range sets.
  static RowRanges intersection(const RowRanges& left, const RowRanges& right) {
    RowRanges result;

    size_t rightIndex = 0;
    for (const Range& l : left.ranges) {
      for (size_t i = rightIndex, n = right.ranges.size(); i < n; ++i) {
        const Range& r = right.ranges[i];
        if (l.isBefore(r)) {
          break;
        } else if (l.isAfter(r)) {
          // r can never match a later l either; skip it permanently.
          rightIndex = i + 1;
          continue;
        }
        result.add(Range::intersection(l, r));
      }
    }

    return result;
  }

  /// Ranges fully contained in [from, to].
  RowRanges slice(const int64_t from, const int64_t to) const {
    RowRanges result;
    for (const Range& range : ranges) {
      if (range.from >= from && range.to <= to) {
        result.add(range);
      }
    }
    return result;
  }

  /// Append a range, which must not start before the existing tail.
  /// With merge=true (default) it is coalesced with any trailing ranges it
  /// overlaps or touches; with merge=false it is appended verbatim.
  void add(const Range& range, bool merge = true) {
    Range rangeToAdd = range;
    if (merge) {
      for (int i = static_cast<int>(ranges.size()) - 1; i >= 0; --i) {
        const Range last = ranges[i];
        assert(!last.isAfter(range));
        const Range u = Range::unionRange(last, rangeToAdd);
        if (u.from == -1 && u.to == -1) {
          // Disjoint from the tail: nothing left to coalesce. (The previous
          // code asserted here and, under NDEBUG, replaced the range with the
          // {-1, -1} sentinel, corrupting the set.)
          break;
        }
        rangeToAdd = u;
        ranges.erase(ranges.begin() + i);
      }
    }
    ranges.push_back(rangeToAdd);
  }

  /// Total number of rows covered by all ranges.
  size_t rowCount() const {
    size_t cnt = 0;
    for (const Range& range : ranges) {
      cnt += range.count();
    }
    return cnt;
  }

  bool isOverlapping(int64_t from, int64_t to) const {
    return isOverlapping(Range(from, to));
  }

  /// True when any stored range overlaps searchRange. Relies on `ranges`
  /// being sorted (binary search via lower_bound).
  bool isOverlapping(const Range& searchRange) const {
    auto it = std::lower_bound(
        ranges.begin(), ranges.end(), searchRange,
        [](const Range& r1, const Range& r2) { return r1.isBefore(r2); });
    return it != ranges.end() && !(*it).isAfter(searchRange);
  }

  std::vector<Range>& getRanges() { return ranges; }

  const Range& operator[](size_t index) const { return ranges[index]; }

  std::string toString() const {
    std::string result = "[";
    for (const Range& range : ranges) {
      result += "(" + std::to_string(range.from) + ", " + std::to_string(range.to) + "), ";
    }
    if (!ranges.empty()) {
      result = result.substr(0, result.size() - 2);
    }
    result += "]";
    return result;
  }
};

using RowRangesPtr = std::shared_ptr<RowRanges>;

namespace internal {

/// Translates row-group-relative read progress into "read N / skip N" advice
/// so that TypedRecordReader honors an explicit row-range selection.
///
/// Pages that contain no selected rows are dropped by the page filter without
/// updating any row counters, so the constructor pre-shifts the requested
/// ranges as if those pages never existed.
class PARQUET_EXPORT RecordSkipper {
 public:
  /// \param pages row ranges covered by each data page of the column chunk
  /// \param row_ranges_ requested rows (row-group-relative, ordered)
  RecordSkipper(RowRanges& pages, RowRanges& row_ranges_) : row_ranges(row_ranges_) {
    RowRanges skip_pages;
    for (auto& page : pages.getRanges()) {
      if (!row_ranges.isOverlapping(page)) {
        // Page holds no selected rows: the page filter drops it silently.
        skip_pages.add(page, false);
      }
    }
    adjust_ranges(skip_pages, row_ranges);
  }

  /// \brief Return the number of records to read or to skip.
  /// If the return value is positive, read that many records.
  /// If the return value is negative, skip that many records.
  /// If the return value is 0, all records of this row group are consumed.
  int64_t advise_next(const int64_t current_rg_processed) {
    if (row_ranges.getRanges().size() == row_range_idx) {
      return 0;
    }

    if (row_ranges[row_range_idx].to < current_rg_processed) {
      row_range_idx++;
      if (row_ranges.getRanges().size() == row_range_idx) {
        return 0;
      }
    }

    if (row_ranges[row_range_idx].from > current_rg_processed) {
      // Negative: skip up to the start of the next selected range.
      return current_rg_processed - row_ranges[row_range_idx].from;
    }

    const auto ret = row_ranges[row_range_idx].to - current_rg_processed + 1;
    assert(ret >= 1);
    return ret;
  }

 private:
  /// Copy of the requested ranges, rewritten by adjust_ranges().
  RowRanges row_ranges;

  size_t row_range_idx = 0;

  /// Since skipped pages are silently dropped without updating
  /// current_rg_processed_records or records_read_, pre-shift the row ranges
  /// as if those skipped pages never existed.
  void adjust_ranges(RowRanges& skip_pages, RowRanges& to_adjust) {
    size_t skipped_rows = 0;
    auto iter = to_adjust.getRanges().begin();
    auto skip_iter = skip_pages.getRanges().begin();
    while (iter != to_adjust.getRanges().end()) {
      while (skip_iter != skip_pages.getRanges().end() && skip_iter->isBefore(*iter)) {
        skipped_rows += skip_iter->count();
        ++skip_iter;
      }
      iter->from -= static_cast<int64_t>(skipped_rows);
      iter->to -= static_cast<int64_t>(skipped_rows);
      ++iter;
    }
  }
};

}  // namespace internal
SkipRecords will not /// add any value to this buffer. @@ -463,6 +794,8 @@ class PARQUET_EXPORT RecordReader { // If true, we will not leave any space for the null values in the values_ // vector. bool read_dense_for_nullable_ = false; + + std::shared_ptr skipper = NULLPTR; }; class BinaryRecordReader : virtual public RecordReader { diff --git a/cpp/src/parquet/reader_test.cc b/cpp/src/parquet/reader_test.cc index 0a73002846ad..a9adcdf5b9c3 100644 --- a/cpp/src/parquet/reader_test.cc +++ b/cpp/src/parquet/reader_test.cc @@ -1457,3 +1457,6 @@ TEST(PageIndexReaderTest, ReadFileWithoutPageIndex) { } } // namespace parquet + + +//TODO: TEST_P ,enable dictionary \ No newline at end of file From 3e9af38857a7c2bf29f9d11dc6256cd982904ce8 Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Wed, 22 Nov 2023 22:25:30 +0800 Subject: [PATCH 02/25] happy path pass 2 --- cpp/examples/arrow/parquet_read_write.cc | 263 +++++++++-------------- 1 file changed, 104 insertions(+), 159 deletions(-) diff --git a/cpp/examples/arrow/parquet_read_write.cc b/cpp/examples/arrow/parquet_read_write.cc index cc267f38d73d..63f1a28fe475 100644 --- a/cpp/examples/arrow/parquet_read_write.cc +++ b/cpp/examples/arrow/parquet_read_write.cc @@ -17,195 +17,140 @@ #include "arrow/api.h" #include "arrow/io/api.h" +#include "arrow/io/memory.h" #include "arrow/result.h" #include "arrow/util/type_fwd.h" #include "parquet/arrow/reader.h" #include "parquet/arrow/writer.h" -#include +#include #include +#include -arrow::Status ReadInBatches(std::string path_to_file) { - // #include "arrow/io/api.h" - // #include "arrow/parquet/arrow/reader.h" - - arrow::MemoryPool* pool = arrow::default_memory_pool(); - - // Configure general Parquet reader settings - auto reader_properties = parquet::ReaderProperties(pool); - reader_properties.set_buffer_size(4096 * 4); - reader_properties.enable_buffered_stream(); - - // Configure Arrow-specific Parquet reader settings - auto arrow_reader_props = parquet::ArrowReaderProperties(); 
- arrow_reader_props.set_batch_size(10); // default 64 * 1024 - - parquet::arrow::FileReaderBuilder reader_builder; - ARROW_RETURN_NOT_OK( - reader_builder.OpenFile(path_to_file, /*memory_map=*/false, reader_properties)); - reader_builder.memory_pool(pool); - reader_builder.properties(arrow_reader_props); - - std::unique_ptr arrow_reader; - ARROW_ASSIGN_OR_RAISE(arrow_reader, reader_builder.Build()); +arrow::Result> GetTable() { + auto builder = arrow::Int32Builder(); + + std::shared_ptr arr_x; + ARROW_RETURN_NOT_OK(builder.AppendValues(arrow::internal::Iota(0, 100))); + ARROW_RETURN_NOT_OK(builder.Finish(&arr_x)); + + std::shared_ptr arr_y; + ARROW_RETURN_NOT_OK(builder.AppendValues(arrow::internal::Iota(0, 100))); + ARROW_RETURN_NOT_OK(builder.Finish(&arr_y)); + + std::shared_ptr arr_z_values; + std::shared_ptr arr_z_offsets; + ARROW_RETURN_NOT_OK(builder.AppendValues(arrow::internal::Iota(0, 300))); + ARROW_RETURN_NOT_OK(builder.Finish(&arr_z_values)); + std::vector offsets = arrow::internal::Iota(0, 101); + std::transform(offsets.begin(), offsets.end(), offsets.begin(), + [](int x) { return x * 3; }); + ARROW_RETURN_NOT_OK(builder.AppendValues(offsets)); + ARROW_RETURN_NOT_OK(builder.Finish(&arr_z_offsets)); + ARROW_ASSIGN_OR_RAISE(auto arr_z, + arrow::ListArray::FromArrays(*arr_z_offsets, *arr_z_values)); + + auto schema = + arrow::schema({arrow::field("x", arrow::int32()), arrow::field("y", arrow::int32()), + arrow::field("z", arrow::list(arrow::int32()))}); + + return arrow::Table::Make(schema, {arr_x, arr_y, arr_z}); +} - std::shared_ptr<::arrow::RecordBatchReader> rb_reader; - auto row_ranges_map = std::make_shared>(); +arrow::Result> WriteFullFile() { + using parquet::ArrowWriterProperties; + using parquet::WriterProperties; - std::vector ranges; - for (int64_t i = 0; i < 50; i++) { - if (i % 2 == 0) - ranges.push_back({i, i}); - } - row_ranges_map->insert({0, std::make_shared(ranges)}); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr table, GetTable()); + 
std::shared_ptr props = + WriterProperties::Builder() + .max_row_group_length(50) + ->enable_write_page_index() + ->write_batch_size(13) + ->data_pagesize(1) // this will cause every batch creating a page + ->compression(arrow::Compression::SNAPPY) + ->build(); - ARROW_RETURN_NOT_OK(arrow_reader->GetRecordBatchReader({0,1},{0,1},row_ranges_map,&rb_reader)); + std::shared_ptr arrow_props = + ArrowWriterProperties::Builder().store_schema()->build(); - size_t total_rows = 0; - size_t total_values = 0; - for (arrow::Result> maybe_batch: *rb_reader) { - // Operate on each batch... - auto batch = maybe_batch.ValueOrDie(); - total_rows += batch->num_rows(); - std::cout << "batch size: " << batch->num_rows() << std::endl; + ARROW_ASSIGN_OR_RAISE(auto outfile, ::arrow::io::BufferOutputStream::Create()); - auto int_array = std::dynamic_pointer_cast(batch->column(1)); - for (auto iter = int_array->begin(); iter != int_array->end(); ++iter) { - total_values += (*iter).value(); - } - } - std::cout << "total rows is : " << total_rows << std::endl; - std::cout << "total value of y is : " << total_values << std::endl; - return arrow::Status::OK(); + ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table.get(), + arrow::default_memory_pool(), outfile, + /*chunk_size=*/100, props, arrow_props)); + return outfile->Finish(); } -arrow::Result> GetTable() { - auto builder = arrow::Int32Builder(); - - std::shared_ptr arr_x; - ARROW_RETURN_NOT_OK(builder.AppendValues(arrow::internal::Iota(0,100))); - ARROW_RETURN_NOT_OK(builder.Finish(&arr_x)); - - std::shared_ptr arr_y; - ARROW_RETURN_NOT_OK(builder.AppendValues(arrow::internal::Iota(0,100))); - ARROW_RETURN_NOT_OK(builder.Finish(&arr_y)); - - std::shared_ptr arr_z_values; - std::shared_ptr arr_z_offsets; - ARROW_RETURN_NOT_OK(builder.AppendValues(arrow::internal::Iota(0,300))); - ARROW_RETURN_NOT_OK(builder.Finish(&arr_z_values)); - std::vector offsets = arrow::internal::Iota(0, 101); - std::transform(offsets.begin(), offsets.end(), 
offsets.begin(), [](int x) { return x * 3; }); - ARROW_RETURN_NOT_OK(builder.AppendValues(offsets)); - ARROW_RETURN_NOT_OK(builder.Finish(&arr_z_offsets)); - ARROW_ASSIGN_OR_RAISE(auto arr_z, arrow::ListArray::FromArrays(*arr_z_offsets, *arr_z_values)); - - - auto schema = arrow::schema( - { - arrow::field("x", arrow::int32()), - arrow::field("y", arrow::int32()), - arrow::field("z", arrow::list(arrow::int32())) - }); - - return arrow::Table::Make(schema, {arr_x, arr_y, arr_z}); -} -arrow::Result> GetRBR() { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr table, GetTable()); - auto reader = std::make_shared(table); - reader->set_chunksize(10); - return reader; -} +arrow::Status ReadInBatches(std::shared_ptr buffer) { + arrow::MemoryPool* pool = arrow::default_memory_pool(); -arrow::Status WriteFullFile(std::string path_to_file) { - using parquet::ArrowWriterProperties; - using parquet::WriterProperties; + auto reader_properties = parquet::ReaderProperties(pool); + reader_properties.set_buffer_size(4096 * 4); + reader_properties.enable_buffered_stream(); - ARROW_ASSIGN_OR_RAISE(std::shared_ptr table, GetTable()); + auto arrow_reader_props = parquet::ArrowReaderProperties(); + arrow_reader_props.set_batch_size(10); // default 64 * 1024 - // Choose compression - std::shared_ptr props = - WriterProperties::Builder().max_row_group_length(50)->enable_write_page_index()->write_batch_size(13) - ->data_pagesize(1) // this will cause every batch creating a page - ->compression(arrow::Compression::SNAPPY)->build(); - std::cout << "hello" << std::endl; + parquet::arrow::FileReaderBuilder reader_builder; + auto in_file = std::make_shared<::arrow::io::BufferReader>(buffer); + ARROW_RETURN_NOT_OK(reader_builder.Open(in_file, /*memory_map=*/reader_properties)); + reader_builder.memory_pool(pool); + reader_builder.properties(arrow_reader_props); - // Opt to store Arrow schema for easier reads back into Arrow - std::shared_ptr arrow_props = - 
ArrowWriterProperties::Builder().store_schema()->build(); + ARROW_ASSIGN_OR_RAISE(auto arrow_reader, reader_builder.Build()); - std::shared_ptr outfile; - ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open(path_to_file)); + std::shared_ptr<::arrow::RecordBatchReader> rb_reader; + auto row_ranges_map = std::make_shared>(); - ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table.get(), - arrow::default_memory_pool(), outfile, - /*chunk_size=*/100, props, arrow_props)); - return arrow::Status::OK(); -} + std::vector ranges; + for (int64_t i = 0; i < 50; i++) { + if (i % 2 == 0) ranges.push_back({i, i}); + } + row_ranges_map->insert({0, std::make_shared(ranges)}); -arrow::Status WriteInBatches(std::string path_to_file) { - // #include "parquet/arrow/writer.h" - // #include "arrow/util/type_fwd.h" - using parquet::ArrowWriterProperties; - using parquet::WriterProperties; - - // Data is in RBR - std::shared_ptr batch_stream; - ARROW_ASSIGN_OR_RAISE(batch_stream, GetRBR()); - - // Choose compression - std::shared_ptr props = - WriterProperties::Builder().compression(arrow::Compression::SNAPPY)->build(); - - // Opt to store Arrow schema for easier reads back into Arrow - std::shared_ptr arrow_props = - ArrowWriterProperties::Builder().store_schema()->build(); - - // Create a writer - std::shared_ptr outfile; - ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open(path_to_file)); - std::unique_ptr writer; - ARROW_ASSIGN_OR_RAISE( - writer, parquet::arrow::FileWriter::Open(*batch_stream->schema().get(), - arrow::default_memory_pool(), outfile, - props, arrow_props)); - - // Write each batch as a row_group - for (arrow::Result> maybe_batch: *batch_stream) { - ARROW_ASSIGN_OR_RAISE(auto batch, maybe_batch); - ARROW_ASSIGN_OR_RAISE(auto table, - arrow::Table::FromRecordBatches(batch->schema(), {batch})); - ARROW_RETURN_NOT_OK(writer->WriteTable(*table.get(), batch->num_rows())); - } + ARROW_RETURN_NOT_OK( + arrow_reader->GetRecordBatchReader({0, 1}, {0, 
1}, row_ranges_map, &rb_reader)); - // Write file footer and close - ARROW_RETURN_NOT_OK(writer->Close()); + size_t total_rows = 0; + size_t total_values = 0; + for (arrow::Result> maybe_batch : *rb_reader) { + // Operate on each batch... + auto batch = maybe_batch.ValueOrDie(); + total_rows += batch->num_rows(); + std::cout << "batch size: " << batch->num_rows() << std::endl; - return arrow::Status::OK(); + auto int_array = std::dynamic_pointer_cast(batch->column(1)); + for (auto iter = int_array->begin(); iter != int_array->end(); ++iter) { + total_values += (*iter).value(); + } + } + std::cout << "total rows is : " << total_rows << std::endl; + std::cout << "total value of y is : " << total_values << std::endl; + return arrow::Status::OK(); } -arrow::Status RunExamples(std::string path_to_file) { - // ARROW_RETURN_NOT_OK(WriteFullFile(path_to_file)); - // ARROW_RETURN_NOT_OK(ReadFullFile(path_to_file)); - // ARROW_RETURN_NOT_OK(WriteInBatches(path_to_file)); - ARROW_RETURN_NOT_OK(ReadInBatches(path_to_file)); - return arrow::Status::OK(); + +arrow::Status RunExamples() { + ARROW_ASSIGN_OR_RAISE(auto buffer, WriteFullFile()); + ARROW_RETURN_NOT_OK(ReadInBatches(buffer)); + return arrow::Status::OK(); } int main(int argc, char** argv) { - if (argc != 2) { - // Fake success for CI purposes. - return EXIT_SUCCESS; - } + if (argc != 2) { + // Fake success for CI purposes. 
+ return EXIT_SUCCESS; + } - std::string path_to_file = argv[1]; - arrow::Status status = RunExamples(path_to_file); + std::string path_to_file = argv[1]; + arrow::Status status = RunExamples(); - if (!status.ok()) { - std::cerr << "Error occurred: " << status.message() << std::endl; - return EXIT_FAILURE; - } - return EXIT_SUCCESS; + if (!status.ok()) { + std::cerr << "Error occurred: " << status.message() << std::endl; + return EXIT_FAILURE; + } + return EXIT_SUCCESS; } From a185d54cf16a95f1eec5c7809666740e52b16c17 Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Thu, 23 Nov 2023 15:00:44 +0800 Subject: [PATCH 03/25] happy path pass 3 --- cpp/examples/arrow/parquet_read_write.cc | 273 +++++++----- cpp/src/parquet/CMakeLists.txt | 1 + cpp/src/parquet/arrow/reader.cc | 107 +++-- cpp/src/parquet/column_reader.h | 527 +++++++++++------------ cpp/src/parquet/filtered_reader_test.cc | 207 +++++++++ 5 files changed, 693 insertions(+), 422 deletions(-) create mode 100644 cpp/src/parquet/filtered_reader_test.cc diff --git a/cpp/examples/arrow/parquet_read_write.cc b/cpp/examples/arrow/parquet_read_write.cc index 63f1a28fe475..fa45a34cff49 100644 --- a/cpp/examples/arrow/parquet_read_write.cc +++ b/cpp/examples/arrow/parquet_read_write.cc @@ -17,140 +17,207 @@ #include "arrow/api.h" #include "arrow/io/api.h" -#include "arrow/io/memory.h" #include "arrow/result.h" #include "arrow/util/type_fwd.h" #include "parquet/arrow/reader.h" #include "parquet/arrow/writer.h" -#include -#include #include +#include -arrow::Result> GetTable() { - auto builder = arrow::Int32Builder(); - - std::shared_ptr arr_x; - ARROW_RETURN_NOT_OK(builder.AppendValues(arrow::internal::Iota(0, 100))); - ARROW_RETURN_NOT_OK(builder.Finish(&arr_x)); - - std::shared_ptr arr_y; - ARROW_RETURN_NOT_OK(builder.AppendValues(arrow::internal::Iota(0, 100))); - ARROW_RETURN_NOT_OK(builder.Finish(&arr_y)); - - std::shared_ptr arr_z_values; - std::shared_ptr arr_z_offsets; - 
ARROW_RETURN_NOT_OK(builder.AppendValues(arrow::internal::Iota(0, 300))); - ARROW_RETURN_NOT_OK(builder.Finish(&arr_z_values)); - std::vector offsets = arrow::internal::Iota(0, 101); - std::transform(offsets.begin(), offsets.end(), offsets.begin(), - [](int x) { return x * 3; }); - ARROW_RETURN_NOT_OK(builder.AppendValues(offsets)); - ARROW_RETURN_NOT_OK(builder.Finish(&arr_z_offsets)); - ARROW_ASSIGN_OR_RAISE(auto arr_z, - arrow::ListArray::FromArrays(*arr_z_offsets, *arr_z_values)); - - auto schema = - arrow::schema({arrow::field("x", arrow::int32()), arrow::field("y", arrow::int32()), - arrow::field("z", arrow::list(arrow::int32()))}); - - return arrow::Table::Make(schema, {arr_x, arr_y, arr_z}); -} +arrow::Status ReadInBatches(std::string path_to_file) { + // #include "arrow/io/api.h" + // #include "arrow/parquet/arrow/reader.h" -arrow::Result> WriteFullFile() { - using parquet::ArrowWriterProperties; - using parquet::WriterProperties; + arrow::MemoryPool* pool = arrow::default_memory_pool(); - ARROW_ASSIGN_OR_RAISE(std::shared_ptr table, GetTable()); + // Configure general Parquet reader settings + auto reader_properties = parquet::ReaderProperties(pool); + reader_properties.set_buffer_size(4096 * 4); + reader_properties.enable_buffered_stream(); - std::shared_ptr props = - WriterProperties::Builder() - .max_row_group_length(50) - ->enable_write_page_index() - ->write_batch_size(13) - ->data_pagesize(1) // this will cause every batch creating a page - ->compression(arrow::Compression::SNAPPY) - ->build(); + // Configure Arrow-specific Parquet reader settings + auto arrow_reader_props = parquet::ArrowReaderProperties(); + arrow_reader_props.set_batch_size(10); // default 64 * 1024 - std::shared_ptr arrow_props = - ArrowWriterProperties::Builder().store_schema()->build(); + parquet::arrow::FileReaderBuilder reader_builder; + ARROW_RETURN_NOT_OK( + reader_builder.OpenFile(path_to_file, /*memory_map=*/false, reader_properties)); + reader_builder.memory_pool(pool); 
+ reader_builder.properties(arrow_reader_props); - ARROW_ASSIGN_OR_RAISE(auto outfile, ::arrow::io::BufferOutputStream::Create()); + std::unique_ptr arrow_reader; + ARROW_ASSIGN_OR_RAISE(arrow_reader, reader_builder.Build()); + + std::shared_ptr<::arrow::RecordBatchReader> rb_reader; + auto row_ranges_map = std::make_shared>(); + + std::vector ranges; + for (int64_t i = 0; i < 50; i++) { + if (i % 2 == 0) + ranges.push_back({i, i}); + } + row_ranges_map->insert({0, std::make_shared(ranges)}); - ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table.get(), - arrow::default_memory_pool(), outfile, - /*chunk_size=*/100, props, arrow_props)); - return outfile->Finish(); + + ARROW_RETURN_NOT_OK(arrow_reader->GetRecordBatchReader({0,1},{0,1},row_ranges_map,&rb_reader)); + + size_t total_rows = 0; + size_t total_values = 0; + for (arrow::Result> maybe_batch: *rb_reader) { + // Operate on each batch... + auto batch = maybe_batch.ValueOrDie(); + total_rows += batch->num_rows(); + std::cout << "batch size: " << batch->num_rows() << std::endl; + + auto int_array = std::dynamic_pointer_cast(batch->column(1)); + for (auto iter = int_array->begin(); iter != int_array->end(); ++iter) { + total_values += (*iter).value(); + } + } + std::cout << "total rows is : " << total_rows << std::endl; + std::cout << "total value of y is : " << total_values << std::endl; + return arrow::Status::OK(); } -arrow::Status ReadInBatches(std::shared_ptr buffer) { - arrow::MemoryPool* pool = arrow::default_memory_pool(); +arrow::Result> GetTable() { + auto builder = arrow::Int32Builder(); + + std::shared_ptr arr_a_values; + std::shared_ptr arr_a_offsets; + std::vector a_values; + for (int i = 0; i < 100; ++i) { + for (int j = 0; j < 3; ++j) { + a_values.push_back(i); + } + } + ARROW_RETURN_NOT_OK(builder.AppendValues(a_values)); + ARROW_RETURN_NOT_OK(builder.Finish(&arr_a_values)); + std::vector offsets = arrow::internal::Iota(0, 101); + std::transform(offsets.begin(), offsets.end(), offsets.begin(), + 
[](int x) { return x * 3; }); + ARROW_RETURN_NOT_OK(builder.AppendValues(offsets)); + ARROW_RETURN_NOT_OK(builder.Finish(&arr_a_offsets)); + ARROW_ASSIGN_OR_RAISE(auto arr_a, + arrow::ListArray::FromArrays(*arr_a_offsets, *arr_a_values)); + + std::shared_ptr arr_b; + ARROW_RETURN_NOT_OK(builder.AppendValues(arrow::internal::Iota(0, 100))); + ARROW_RETURN_NOT_OK(builder.Finish(&arr_b)); + + auto string_builder = arrow::StringBuilder(); + std::shared_ptr arr_c; + std::vector strs; + for (size_t i = 0; i < 100; i++) { + strs.push_back("" + std::to_string(i)); + } + ARROW_RETURN_NOT_OK(string_builder.AppendValues(strs)); + ARROW_RETURN_NOT_OK(string_builder.Finish(&arr_c)); + + auto schema = arrow::schema({ + arrow::field("a", arrow::list(arrow::int32())), + arrow::field("b", arrow::int32()), + arrow::field("c", arrow::utf8()), + }); - auto reader_properties = parquet::ReaderProperties(pool); - reader_properties.set_buffer_size(4096 * 4); - reader_properties.enable_buffered_stream(); + return arrow::Table::Make(schema, {arr_a, arr_b, arr_c}); +} - auto arrow_reader_props = parquet::ArrowReaderProperties(); - arrow_reader_props.set_batch_size(10); // default 64 * 1024 +arrow::Result> GetRBR() { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr table, GetTable()); + auto reader = std::make_shared(table); + reader->set_chunksize(10); + return reader; +} - parquet::arrow::FileReaderBuilder reader_builder; - auto in_file = std::make_shared<::arrow::io::BufferReader>(buffer); - ARROW_RETURN_NOT_OK(reader_builder.Open(in_file, /*memory_map=*/reader_properties)); - reader_builder.memory_pool(pool); - reader_builder.properties(arrow_reader_props); +arrow::Status WriteFullFile(std::string path_to_file) { + using parquet::ArrowWriterProperties; + using parquet::WriterProperties; - ARROW_ASSIGN_OR_RAISE(auto arrow_reader, reader_builder.Build()); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr table, GetTable()); - std::shared_ptr<::arrow::RecordBatchReader> rb_reader; - auto row_ranges_map = 
std::make_shared>(); + // Choose compression + std::shared_ptr props = + WriterProperties::Builder().max_row_group_length(50)->enable_write_page_index()->write_batch_size(13) + ->data_pagesize(1) // this will cause every batch creating a page + ->compression(arrow::Compression::SNAPPY)->build(); + std::cout << "hello" << std::endl; - std::vector ranges; - for (int64_t i = 0; i < 50; i++) { - if (i % 2 == 0) ranges.push_back({i, i}); - } - row_ranges_map->insert({0, std::make_shared(ranges)}); + // Opt to store Arrow schema for easier reads back into Arrow + std::shared_ptr arrow_props = + ArrowWriterProperties::Builder().store_schema()->build(); - ARROW_RETURN_NOT_OK( - arrow_reader->GetRecordBatchReader({0, 1}, {0, 1}, row_ranges_map, &rb_reader)); + std::shared_ptr outfile; + ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open(path_to_file)); - size_t total_rows = 0; - size_t total_values = 0; - for (arrow::Result> maybe_batch : *rb_reader) { - // Operate on each batch... 
- auto batch = maybe_batch.ValueOrDie(); - total_rows += batch->num_rows(); - std::cout << "batch size: " << batch->num_rows() << std::endl; + ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table.get(), + arrow::default_memory_pool(), outfile, + /*chunk_size=*/100, props, arrow_props)); + return arrow::Status::OK(); +} - auto int_array = std::dynamic_pointer_cast(batch->column(1)); - for (auto iter = int_array->begin(); iter != int_array->end(); ++iter) { - total_values += (*iter).value(); +arrow::Status WriteInBatches(std::string path_to_file) { + // #include "parquet/arrow/writer.h" + // #include "arrow/util/type_fwd.h" + using parquet::ArrowWriterProperties; + using parquet::WriterProperties; + + // Data is in RBR + std::shared_ptr batch_stream; + ARROW_ASSIGN_OR_RAISE(batch_stream, GetRBR()); + + // Choose compression + std::shared_ptr props = + WriterProperties::Builder().compression(arrow::Compression::SNAPPY)->build(); + + // Opt to store Arrow schema for easier reads back into Arrow + std::shared_ptr arrow_props = + ArrowWriterProperties::Builder().store_schema()->build(); + + // Create a writer + std::shared_ptr outfile; + ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open(path_to_file)); + std::unique_ptr writer; + ARROW_ASSIGN_OR_RAISE( + writer, parquet::arrow::FileWriter::Open(*batch_stream->schema().get(), + arrow::default_memory_pool(), outfile, + props, arrow_props)); + + // Write each batch as a row_group + for (arrow::Result> maybe_batch: *batch_stream) { + ARROW_ASSIGN_OR_RAISE(auto batch, maybe_batch); + ARROW_ASSIGN_OR_RAISE(auto table, + arrow::Table::FromRecordBatches(batch->schema(), {batch})); + ARROW_RETURN_NOT_OK(writer->WriteTable(*table.get(), batch->num_rows())); } - } - std::cout << "total rows is : " << total_rows << std::endl; - std::cout << "total value of y is : " << total_values << std::endl; - return arrow::Status::OK(); -} + // Write file footer and close + ARROW_RETURN_NOT_OK(writer->Close()); -arrow::Status 
RunExamples() { - ARROW_ASSIGN_OR_RAISE(auto buffer, WriteFullFile()); - ARROW_RETURN_NOT_OK(ReadInBatches(buffer)); - return arrow::Status::OK(); + return arrow::Status::OK(); +} + +arrow::Status RunExamples(std::string path_to_file) { + ARROW_RETURN_NOT_OK(WriteFullFile(path_to_file)); + // ARROW_RETURN_NOT_OK(ReadFullFile(path_to_file)); + // ARROW_RETURN_NOT_OK(WriteInBatches(path_to_file)); + // ARROW_RETURN_NOT_OK(ReadInBatches(path_to_file)); + return arrow::Status::OK(); } int main(int argc, char** argv) { - if (argc != 2) { - // Fake success for CI purposes. - return EXIT_SUCCESS; - } + if (argc != 2) { + // Fake success for CI purposes. + return EXIT_SUCCESS; + } - std::string path_to_file = argv[1]; - arrow::Status status = RunExamples(); + std::string path_to_file = argv[1]; + arrow::Status status = RunExamples(path_to_file); - if (!status.ok()) { - std::cerr << "Error occurred: " << status.message() << std::endl; - return EXIT_FAILURE; - } - return EXIT_SUCCESS; + if (!status.ok()) { + std::cerr << "Error occurred: " << status.message() << std::endl; + return EXIT_FAILURE; + } + return EXIT_SUCCESS; } diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index e6aad7cee2a3..06be0da74aa6 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -354,6 +354,7 @@ add_parquet_test(reader-test level_conversion_test.cc column_scanner_test.cc reader_test.cc + filtered_reader_test.cc stream_reader_test.cc test_util.cc) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 34316bf47c1b..81c7b1188895 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -19,13 +19,13 @@ #include +#include #include #include #include #include #include #include -#include #include "arrow/array.h" #include "arrow/buffer.h" @@ -75,8 +75,8 @@ using arrow::internal::Iota; // Help reduce verbosity using ParquetReader = parquet::ParquetFileReader; -using parquet::RowRangesPtr; 
using parquet::Range; +using parquet::RowRangesPtr; using parquet::internal::RecordReader; namespace bit_util = arrow::bit_util; @@ -205,11 +205,11 @@ class FileReaderImpl : public FileReader { return ReadRowGroups(Iota(reader_->metadata()->num_row_groups()), indices, out); } - Status GetFieldReader(int i, - const std::shared_ptr>& included_leaves, - const std::vector& row_groups, - const std::shared_ptr>& row_ranges_map, - std::unique_ptr* out) { + Status GetFieldReader( + int i, const std::shared_ptr>& included_leaves, + const std::vector& row_groups, + const std::shared_ptr>& row_ranges_map, + std::unique_ptr* out) { // Should be covered by GetRecordBatchReader checks but // manifest_.schema_fields is a separate variable so be extra careful. if (ARROW_PREDICT_FALSE(i < 0 || @@ -229,11 +229,11 @@ class FileReaderImpl : public FileReader { return GetReader(manifest_.schema_fields[i], ctx, out); } - Status GetFieldReaders(const std::vector& column_indices, - const std::vector& row_groups, - const std::shared_ptr>& row_ranges_map, - std::vector>* out, - std::shared_ptr<::arrow::Schema>* out_schema) { + Status GetFieldReaders( + const std::vector& column_indices, const std::vector& row_groups, + const std::shared_ptr>& row_ranges_map, + std::vector>* out, + std::shared_ptr<::arrow::Schema>* out_schema) { // We only need to read schema fields which have columns indicated // in the indices vector ARROW_ASSIGN_OR_RAISE(std::vector field_indices, @@ -245,9 +245,8 @@ class FileReaderImpl : public FileReader { ::arrow::FieldVector out_fields(field_indices.size()); for (size_t i = 0; i < out->size(); ++i) { std::unique_ptr reader; - RETURN_NOT_OK( - GetFieldReader(field_indices[i], included_leaves, row_groups, - row_ranges_map, &reader)); + RETURN_NOT_OK(GetFieldReader(field_indices[i], included_leaves, row_groups, + row_ranges_map, &reader)); out_fields[i] = reader->field(); out->at(i) = std::move(reader); @@ -346,9 +345,9 @@ class FileReaderImpl : public FileReader { } 
Status GetRecordBatchReader( - const std::vector& row_group_indices, const std::vector& column_indices, - const std::shared_ptr>& row_ranges_map, - std::unique_ptr* out) override; + const std::vector& row_group_indices, const std::vector& column_indices, + const std::shared_ptr>& row_ranges_map, + std::unique_ptr* out) override; Status GetRecordBatchReader(const std::vector& row_group_indices, const std::vector& column_indices, @@ -468,15 +467,16 @@ class RowGroupReaderImpl : public RowGroupReader { // Column reader implementations struct RowRangesPageFilter { - explicit RowRangesPageFilter(const RowRangesPtr & row_ranges_, const RowRangesPtr & page_ranges_) - : row_ranges(row_ranges_), page_ranges(page_ranges_) { + explicit RowRangesPageFilter(const RowRangesPtr& row_ranges_, + const RowRangesPtr& page_ranges_) + : row_ranges(row_ranges_), page_ranges(page_ranges_) { assert(row_ranges != nullptr); assert(page_ranges != nullptr); assert(row_ranges->getRanges().size() > 0); assert(page_ranges->getRanges().size() > 0); } - bool operator() (const DataPageStats & stats) { + bool operator()(const DataPageStats& stats) { ++page_range_idx; if (row_range_idx >= row_ranges->getRanges().size()) { @@ -490,8 +490,8 @@ struct RowRangesPageFilter { } while (row_range_idx < row_ranges->getRanges().size() && - current_page_range.isAfter((*row_ranges)[row_range_idx])) { - row_range_idx++; + current_page_range.isAfter((*row_ranges)[row_range_idx])) { + row_range_idx++; } return row_range_idx >= row_ranges->getRanges().size(); @@ -570,37 +570,49 @@ class LeafReader : public ColumnReaderImpl { /// using page index to reduce cost if (page_reader != nullptr && ctx_->row_ranges_map) { + // reset skipper + record_reader_->set_record_skipper(NULLPTR); + // if specific row range is provided for this rg if (const auto iter = ctx_->row_ranges_map->find(input_->current_row_group()); - iter != ctx_->row_ranges_map->end()) { - + iter != ctx_->row_ranges_map->end()) { // check offset exists auto 
offset_index = ctx_->reader->GetPageIndexReader() - ->RowGroup(input_->current_row_group()) - ->GetOffsetIndex(input_->column_index()); + ->RowGroup(input_->current_row_group()) + ->GetOffsetIndex(input_->column_index()); if (!offset_index) { - throw ParquetException("Offset index is not found for column: " + field_->name()); + throw ParquetException("Attempting to filter pages but Offset index is not found for column: " + + field_->name()); } const auto page_locations = offset_index->page_locations(); auto page_ranges = std::make_shared(); for (size_t i = 0; i < page_locations.size() - 1; i++) { page_ranges->add({page_locations[i].first_row_index, - page_locations[i + 1].first_row_index - 1}, false); + page_locations[i + 1].first_row_index - 1}, + false); } if (page_locations.size() >= 1) { - page_ranges->add({ - page_locations[page_locations.size() - 1].first_row_index, - ctx_->reader->metadata()->RowGroup(input_->current_row_group())->num_rows() - 1}, false); + page_ranges->add({page_locations[page_locations.size() - 1].first_row_index, + ctx_->reader->metadata() + ->RowGroup(input_->current_row_group()) + ->num_rows() - + 1}, + false); } // part 1, skip decompressing & decoding unnecessary pages page_reader->set_data_page_filter(RowRangesPageFilter(iter->second, page_ranges)); // part 2, skip unnecessary rows in necessary pages - record_reader_->set_record_skipper(std::make_shared( - *page_ranges, *iter->second)); - } + record_reader_->set_record_skipper( + std::make_shared(*page_ranges, + *iter->second)); + } else { + // If row_ranges_map exists but no row_ranges is found for this RG, skip this RG + NextRowGroup(); + return; + } } record_reader_->reset_current_rg_processed_records(); @@ -1072,10 +1084,10 @@ Status GetReader(const SchemaField& field, const std::shared_ptr& } // namespace -Status FileReaderImpl::GetRecordBatchReader(const std::vector& row_groups, - const std::vector& column_indices, - const std::shared_ptr>& row_ranges_map, - std::unique_ptr* 
out) { +Status FileReaderImpl::GetRecordBatchReader( + const std::vector& row_groups, const std::vector& column_indices, + const std::shared_ptr>& row_ranges_map, + std::unique_ptr* out) { RETURN_NOT_OK(BoundsCheck(row_groups, column_indices)); if (reader_properties_.pre_buffer()) { @@ -1088,7 +1100,8 @@ Status FileReaderImpl::GetRecordBatchReader(const std::vector& row_groups, std::vector> readers; std::shared_ptr<::arrow::Schema> batch_schema; - RETURN_NOT_OK(GetFieldReaders(column_indices, row_groups,row_ranges_map, &readers, &batch_schema)); + RETURN_NOT_OK(GetFieldReaders(column_indices, row_groups, row_ranges_map, &readers, + &batch_schema)); if (readers.empty()) { // Just generate all batches right now; they're cheap since they have no columns. @@ -1343,7 +1356,8 @@ Future> FileReaderImpl::DecodeRowGroups( // in a sync context too so use `this` over `self` std::vector> readers; std::shared_ptr<::arrow::Schema> result_schema; - RETURN_NOT_OK(GetFieldReaders(column_indices, row_groups, NULLPTR, &readers, &result_schema)); + RETURN_NOT_OK( + GetFieldReaders(column_indices, row_groups, NULLPTR, &readers, &result_schema)); // OptionalParallelForAsync requires an executor if (!cpu_executor) cpu_executor = ::arrow::internal::GetCpuThreadPool(); @@ -1406,12 +1420,13 @@ Status FileReader::GetRecordBatchReader(const std::vector& row_group_indice return Status::OK(); } -Status FileReader::GetRecordBatchReader(const std::vector& row_group_indices, - const std::vector& column_indices, - const std::shared_ptr>& row_ranges_map, - std::shared_ptr* out) { +Status FileReader::GetRecordBatchReader( + const std::vector& row_group_indices, const std::vector& column_indices, + const std::shared_ptr>& row_ranges_map, + std::shared_ptr* out) { std::unique_ptr tmp; - RETURN_NOT_OK(GetRecordBatchReader(row_group_indices, column_indices, row_ranges_map, &tmp)); + RETURN_NOT_OK( + GetRecordBatchReader(row_group_indices, column_indices, row_ranges_map, &tmp)); out->reset(tmp.release()); 
return Status::OK(); } diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 4468fd1aa20e..860642bb3657 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -304,254 +304,234 @@ class TypedColumnReader : public ColumnReader { }; struct Range { - static Range unionRange(const Range&left, const Range&right) { - if (left.from <= right.from) { - if (left.to + 1 >= right.from) { - return {left.from, std::max(left.to, right.to)}; - } - } - else if (right.to + 1 >= left.from) { - return {right.from, std::max(left.to, right.to)}; - } - return {-1, -1}; + static Range unionRange(const Range& left, const Range& right) { + if (left.from <= right.from) { + if (left.to + 1 >= right.from) { + return {left.from, std::max(left.to, right.to)}; + } + } else if (right.to + 1 >= left.from) { + return {right.from, std::max(left.to, right.to)}; } + return {-1, -1}; + } - static Range intersection(const Range&left, const Range&right) { - if (left.from <= right.from) { - if (left.to >= right.from) { - return {right.from, std::min(left.to, right.to)}; - } - } - else if (right.to >= left.from) { - return {left.from, std::min(left.to, right.to)}; - } - return {-1, -1}; // Return a default Range object if no intersection range found + static Range intersection(const Range& left, const Range& right) { + if (left.from <= right.from) { + if (left.to >= right.from) { + return {right.from, std::min(left.to, right.to)}; + } + } else if (right.to >= left.from) { + return {left.from, std::min(left.to, right.to)}; } + return {-1, -1}; // Return a default Range object if no intersection range found + } - int64_t from; - int64_t to; + int64_t from; + int64_t to; - Range(const int64_t from_, const int64_t to_) : from(from_), to(to_) { - assert(from <= to); - } + Range(const int64_t from_, const int64_t to_) : from(from_), to(to_) { + assert(from <= to); + } - size_t count() const { - return to - from + 1; - } + size_t count() const { return 
to - from + 1; } - bool isBefore(const Range&other) const { - return to < other.from; - } + bool isBefore(const Range& other) const { return to < other.from; } - bool isAfter(const Range&other) const { - return from > other.to; - } + bool isAfter(const Range& other) const { return from > other.to; } - bool isOverlap(const Range&other) const { - return !isBefore(other) && !isAfter(other); - } + bool isOverlap(const Range& other) const { return !isBefore(other) && !isAfter(other); } - std::string toString() const { - return "[" + std::to_string(from) + ", " + std::to_string(to) + "]"; - } + std::string toString() const { + return "[" + std::to_string(from) + ", " + std::to_string(to) + "]"; + } }; class RowRanges { - std::vector ranges; + std::vector ranges; -public: - RowRanges() = default; + public: + RowRanges() = default; - explicit RowRanges(const Range&range) { - ranges.push_back(range); - } + explicit RowRanges(const Range& range) { ranges.push_back(range); } - RowRanges(const std::vector&ranges) { - this->ranges = ranges; - } + RowRanges(const std::vector& ranges) { this->ranges = ranges; } - //copy cstr - RowRanges(const RowRanges&other) { - ranges = other.ranges; - } + // copy cstr + RowRanges(const RowRanges& other) { ranges = other.ranges; } - RowRanges(RowRanges&&other) noexcept { - ranges = std::move(other.ranges); - } + RowRanges(RowRanges&& other) noexcept { ranges = std::move(other.ranges); } - static RowRanges createSingle(const size_t rowCount) { - return RowRanges({Range(0L, rowCount - 1L)}); - } + static RowRanges createSingle(const size_t rowCount) { + return RowRanges({Range(0L, rowCount - 1L)}); + } - // static RowRanges create(size_t rowCount, const std::vector& pageIndexes, const OffsetIndex& offsetIndex) { - // RowRanges ranges; - // for (int pageIndex : pageIndexes) { - // ranges.add(Range(offsetIndex.getFirstRowIndex(pageIndex), offsetIndex.getLastRowIndex(pageIndex, rowCount))); - // } - // return ranges; - // } - - static RowRanges 
unionRanges(const RowRanges&left, const RowRanges&right) { - RowRanges result; - auto it1 = left.ranges.begin(); - auto it2 = right.ranges.begin(); - if (it2 != right.ranges.end()) { - Range range2 = *it2; - while (it1 != left.ranges.end()) { - Range range1 = *it1; - if (range1.isAfter(range2)) { - result.add(range2); - range2 = range1; - const auto tmp = it1; - it1 = it2; - it2 = tmp; - } - else { - result.add(range1); - } - ++it1; - } - result.add(range2); - } - else { - it2 = it1; + // static RowRanges create(size_t rowCount, const std::vector& pageIndexes, const + // OffsetIndex& offsetIndex) { + // RowRanges ranges; + // for (int pageIndex : pageIndexes) { + // ranges.add(Range(offsetIndex.getFirstRowIndex(pageIndex), + // offsetIndex.getLastRowIndex(pageIndex, rowCount))); + // } + // return ranges; + // } + + static RowRanges unionRanges(const RowRanges& left, const RowRanges& right) { + RowRanges result; + auto it1 = left.ranges.begin(); + auto it2 = right.ranges.begin(); + if (it2 != right.ranges.end()) { + Range range2 = *it2; + while (it1 != left.ranges.end()) { + Range range1 = *it1; + if (range1.isAfter(range2)) { + result.add(range2); + range2 = range1; + const auto tmp = it1; + it1 = it2; + it2 = tmp; + } else { + result.add(range1); } - while (it2 != right.ranges.end()) { - result.add(*it2); - ++it2; - } - - return result; + ++it1; + } + result.add(range2); + } else { + it2 = it1; } - - static RowRanges intersection(const RowRanges&left, const RowRanges&right) { - RowRanges result; - - size_t rightIndex = 0; - for (const Range&l: left.ranges) { - for (size_t i = rightIndex, n = right.ranges.size(); i < n; ++i) { - const Range&r = right.ranges[i]; - if (l.isBefore(r)) { - break; - } - else if (l.isAfter(r)) { - rightIndex = i + 1; - continue; - } - result.add(Range::intersection(l, r)); - } - } - - return result; + while (it2 != right.ranges.end()) { + result.add(*it2); + ++it2; } - RowRanges slice(const int64_t from, const int64_t to) const { - 
RowRanges result; - for (const Range&range: ranges) { - if (range.from >= from && range.to <= to) { - result.add(range); - } - } - return result; - } + return result; + } - void add(const Range&range, bool merge = true) { - Range rangeToAdd = range; - if(merge) { - for (int i = static_cast(ranges.size()) - 1; i >= 0; --i) { - Range last = ranges[i]; - assert(!last.isAfter(range)); - const Range u = Range::unionRange(last, rangeToAdd); - assert (u.from != -1 && u.to != -1); - rangeToAdd = u; - ranges.erase(ranges.begin() + i); - } + static RowRanges intersection(const RowRanges& left, const RowRanges& right) { + RowRanges result; + + size_t rightIndex = 0; + for (const Range& l : left.ranges) { + for (size_t i = rightIndex, n = right.ranges.size(); i < n; ++i) { + const Range& r = right.ranges[i]; + if (l.isBefore(r)) { + break; + } else if (l.isAfter(r)) { + rightIndex = i + 1; + continue; } - ranges.push_back(rangeToAdd); + result.add(Range::intersection(l, r)); + } } - size_t rowCount() const { - size_t cnt = 0; - for (const Range&range: ranges) { - cnt += range.count(); - } - return cnt; - } + return result; + } - // - // class Iterator { - // private: - // int currentRangeIndex; - // Range currentRange; - // long next; - // std::vector ranges; - // - // long findNext() { - // if (currentRangeIndex < ranges.size()) { - // currentRange = ranges[++currentRangeIndex]; - // next = currentRange.from; - // } else { - // return -1; - // } - // return next; - // } - // - // public: - // Iterator(const std::vector& ranges) { - // this->ranges = ranges; - // currentRangeIndex = -1; - // next = findNext(); - // } - // - // bool hasNext() const { - // return next >= 0; - // } - // - // long nextLong() { - // long ret = next; - // if (ret < 0) { - // throw std::out_of_range("No such element"); - // } - // next = findNext(); - // return ret; - // } - // }; - // - // Iterator iterator() const { - // return Iterator(ranges); - // } - - bool isOverlapping(int64_t from, int64_t 
to) const { - const Range searchRange(from, to); - return isOverlapping(searchRange); + RowRanges slice(const int64_t from, const int64_t to) const { + RowRanges result; + for (const Range& range : ranges) { + if (range.from >= from && range.to <= to) { + result.add(range); + } } + return result; + } - bool isOverlapping(const Range&searchRange) const { - auto it = std::lower_bound(ranges.begin(), ranges.end(), searchRange, [](const Range&r1, const Range&r2) { - return r1.isBefore(r2); - }); - return it != ranges.end() && !(*it).isAfter(searchRange); + void add(const Range& range, bool merge = true) { + Range rangeToAdd = range; + if (merge) { + for (int i = static_cast(ranges.size()) - 1; i >= 0; --i) { + Range last = ranges[i]; + assert(!last.isAfter(range)); + const Range u = Range::unionRange(last, rangeToAdd); + assert(u.from != -1 && u.to != -1); + rangeToAdd = u; + ranges.erase(ranges.begin() + i); + } + } else { + if (ranges.size() > 1) assert(rangeToAdd.from > ranges.back().to); } + ranges.push_back(rangeToAdd); + } - std::vector& getRanges() { - return ranges; + size_t rowCount() const { + size_t cnt = 0; + for (const Range& range : ranges) { + cnt += range.count(); } + return cnt; + } - const Range& operator[](size_t index) const { - return ranges[index]; - } + // + // class Iterator { + // private: + // int currentRangeIndex; + // Range currentRange; + // long next; + // std::vector ranges; + // + // long findNext() { + // if (currentRangeIndex < ranges.size()) { + // currentRange = ranges[++currentRangeIndex]; + // next = currentRange.from; + // } else { + // return -1; + // } + // return next; + // } + // + // public: + // Iterator(const std::vector& ranges) { + // this->ranges = ranges; + // currentRangeIndex = -1; + // next = findNext(); + // } + // + // bool hasNext() const { + // return next >= 0; + // } + // + // long nextLong() { + // long ret = next; + // if (ret < 0) { + // throw std::out_of_range("No such element"); + // } + // next = 
findNext(); + // return ret; + // } + // }; + // + // Iterator iterator() const { + // return Iterator(ranges); + // } - std::string toString() const { - std::string result = "["; - for (const Range&range: ranges) { - result += "(" + std::to_string(range.from) + ", " + std::to_string(range.to) + "), "; - } - if (!ranges.empty()) { - result = result.substr(0, result.size() - 2); - } - result += "]"; - return result; + bool isOverlapping(int64_t from, int64_t to) const { + const Range searchRange(from, to); + return isOverlapping(searchRange); + } + + bool isOverlapping(const Range& searchRange) const { + auto it = std::lower_bound( + ranges.begin(), ranges.end(), searchRange, + [](const Range& r1, const Range& r2) { return r1.isBefore(r2); }); + return it != ranges.end() && !(*it).isAfter(searchRange); + } + + std::vector& getRanges() { return ranges; } + + const Range& operator[](size_t index) const { return ranges[index]; } + + std::string toString() const { + std::string result = "["; + for (const Range& range : ranges) { + result += + "(" + std::to_string(range.from) + ", " + std::to_string(range.to) + "), "; } + if (!ranges.empty()) { + result = result.substr(0, result.size() - 2); + } + result += "]"; + return result; + } }; using RowRangesPtr = std::shared_ptr; @@ -559,74 +539,75 @@ using RowRangesPtr = std::shared_ptr; namespace internal { class PARQUET_EXPORT RecordSkipper { -public: - RecordSkipper(RowRanges & pages, RowRanges & row_ranges_) : row_ranges(row_ranges_) { - RowRanges will_process_pages, skip_pages; - for(auto & page : pages.getRanges()) { - if(row_ranges.isOverlapping(page)) { - // will_process_pages.add(page); - } else { - skip_pages.add(page, false); - } - } - adjust_ranges(skip_pages, row_ranges); - // adjust_ranges(skip_pages, will_process_pages); + public: + RecordSkipper(RowRanges& pages, RowRanges& row_ranges_) : row_ranges(row_ranges_) { + RowRanges will_process_pages, skip_pages; + for (auto& page : pages.getRanges()) { + if 
(row_ranges.isOverlapping(page)) { + // will_process_pages.add(page); + } else { + skip_pages.add(page, false); + } } + adjust_ranges(skip_pages, row_ranges); + // adjust_ranges(skip_pages, will_process_pages); - /// \brief Return the number of records to read or to skip - /// if return values is positive, it means to read N records - /// if return values is negative, it means to skip N records - /// if return values is 0, it means to skip all records in this row group - int64_t advise_next(const int64_t current_rg_procesed) - { - if (row_ranges.getRanges().size() == row_range_idx) - { - return 0; - } + total_rows_to_process = pages.rowCount() - skip_pages.rowCount() + 1; + } - if (row_ranges[row_range_idx].to < current_rg_procesed) - { - row_range_idx++; - if (row_ranges.getRanges().size() == row_range_idx) - { - return 0; - } - } + /// \brief Return the number of records to read or to skip + /// if return values is positive, it means to read N records + /// if return values is negative, it means to skip N records + /// if return values is 0, it means end of RG + int64_t advise_next(const int64_t current_rg_procesed) { + if (row_ranges.getRanges().size() == row_range_idx) { + return 0; + } - if (row_ranges[row_range_idx].from > current_rg_procesed) - { - // return negative - return current_rg_procesed - row_ranges[row_range_idx].from; - } + if (row_ranges[row_range_idx].to < current_rg_procesed) { + row_range_idx++; + if (row_ranges.getRanges().size() == row_range_idx) { + // negative, skip the ramaining rows + return current_rg_procesed - total_rows_to_process; + } + } - const auto ret = row_ranges[row_range_idx].to - current_rg_procesed + 1; - assert(ret >= 1); - return ret; + if (row_ranges[row_range_idx].from > current_rg_procesed) { + // negative, skip + return current_rg_procesed - row_ranges[row_range_idx].from; } -private: - /// Keep copy of ranges, because advise_next() will modify them - // RowRanges will_process_pages; - RowRanges row_ranges; - - size_t 
row_range_idx = 0; - - /// Since the skipped pages will be slienly skipped without updating current_rg_processed_records - /// or records_read_, we need to pre-process the row ranges as if these skipped pages never existed - void adjust_ranges(RowRanges & skip_pages, RowRanges & to_adjust) { - size_t skipped_rows = 0; - auto iter = to_adjust.getRanges().begin(); - auto skip_iter = skip_pages.getRanges().begin(); - while(iter != to_adjust.getRanges().end()) { - while(skip_iter != skip_pages.getRanges().end() && - skip_iter->isBefore(*iter)) { - skipped_rows += skip_iter->count(); - ++skip_iter; - } - iter->from -= skipped_rows; - iter->to -= skipped_rows; - ++iter; - } + + const auto ret = row_ranges[row_range_idx].to - current_rg_procesed + 1; + assert(ret > 0); + return ret; + } + + private: + /// Keep copy of ranges, because advise_next() will modify them + // RowRanges will_process_pages; + RowRanges row_ranges; + + size_t row_range_idx = 0; + + size_t total_rows_to_process = 0; + + /// Since the skipped pages will be slienly skipped without updating + /// current_rg_processed_records or records_read_, we need to pre-process the row ranges + /// as if these skipped pages never existed + void adjust_ranges(RowRanges& skip_pages, RowRanges& to_adjust) { + size_t skipped_rows = 0; + auto iter = to_adjust.getRanges().begin(); + auto skip_iter = skip_pages.getRanges().begin(); + while (iter != to_adjust.getRanges().end()) { + while (skip_iter != skip_pages.getRanges().end() && skip_iter->isBefore(*iter)) { + skipped_rows += skip_iter->count(); + ++skip_iter; + } + iter->from -= skipped_rows; + iter->to -= skipped_rows; + ++iter; } + } }; /// \brief Stateful column reader that delimits semantic records for both flat @@ -751,7 +732,7 @@ class PARQUET_EXPORT RecordReader { bool at_record_start_; int64_t records_read_; - int64_t current_rg_processed_records; // counting both read and skip records + int64_t current_rg_processed_records; // counting both read and skip 
records /// \brief Stores values. These values are populated based on each ReadRecords /// call. No extra values are buffered for the next call. SkipRecords will not diff --git a/cpp/src/parquet/filtered_reader_test.cc b/cpp/src/parquet/filtered_reader_test.cc new file mode 100644 index 000000000000..427476c7c3b1 --- /dev/null +++ b/cpp/src/parquet/filtered_reader_test.cc @@ -0,0 +1,207 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/api.h" +#include "arrow/io/api.h" +#include "arrow/io/memory.h" +#include "arrow/result.h" +#include "arrow/util/type_fwd.h" +#include "parquet/arrow/reader.h" +#include "parquet/arrow/writer.h" + +#include +#include +#include + +arrow::Result> GetTable() { + auto builder = arrow::Int32Builder(); + + std::shared_ptr arr_a_values; + std::shared_ptr arr_a_offsets; + std::vector a_values; + for (int i = 0; i < 100; ++i) { + for (int j = 0; j < 3; ++j) { + a_values.push_back(i); + } + } + ARROW_RETURN_NOT_OK(builder.AppendValues(a_values)); + ARROW_RETURN_NOT_OK(builder.Finish(&arr_a_values)); + std::vector offsets = arrow::internal::Iota(0, 101); + std::transform(offsets.begin(), offsets.end(), offsets.begin(), + [](int x) { return x * 3; }); + ARROW_RETURN_NOT_OK(builder.AppendValues(offsets)); + ARROW_RETURN_NOT_OK(builder.Finish(&arr_a_offsets)); + ARROW_ASSIGN_OR_RAISE(auto arr_a, + arrow::ListArray::FromArrays(*arr_a_offsets, *arr_a_values)); + + std::shared_ptr arr_b; + ARROW_RETURN_NOT_OK(builder.AppendValues(arrow::internal::Iota(0, 100))); + ARROW_RETURN_NOT_OK(builder.Finish(&arr_b)); + + auto string_builder = arrow::StringBuilder(); + std::shared_ptr arr_c; + std::vector strs; + for (size_t i = 0; i < 100; i++) { + strs.push_back("" + std::to_string(i)); + } + ARROW_RETURN_NOT_OK(string_builder.AppendValues(strs)); + ARROW_RETURN_NOT_OK(string_builder.Finish(&arr_c)); + + auto schema = arrow::schema({ + arrow::field("a", arrow::list(arrow::int32())), + arrow::field("b", arrow::int32()), + arrow::field("c", arrow::utf8()), + }); + + return arrow::Table::Make(schema, {arr_a, arr_b, arr_c}); +} + +arrow::Result> WriteFullFile() { + using parquet::ArrowWriterProperties; + using parquet::WriterProperties; + + ARROW_ASSIGN_OR_RAISE(std::shared_ptr table, GetTable()); + + std::shared_ptr props = + WriterProperties::Builder() + .max_row_group_length(30) + ->enable_write_page_index() + ->write_batch_size(13) + ->data_pagesize(1) // this will cause 
every batch creating a page + ->compression(arrow::Compression::SNAPPY) + ->build(); + + std::shared_ptr arrow_props = + ArrowWriterProperties::Builder().store_schema()->build(); + + ARROW_ASSIGN_OR_RAISE(auto out_stream, ::arrow::io::BufferOutputStream::Create()); + + ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table.get(), + arrow::default_memory_pool(), out_stream, + /*chunk_size=*/100, props, arrow_props)); + + // { + // // output to a local file for debugging + // ARROW_ASSIGN_OR_RAISE(auto outfile, arrow::io::FileOutputStream::Open( + // "/tmp/filtered_reader_test.parquet")); + // + // ARROW_RETURN_NOT_OK( + // parquet::arrow::WriteTable(*table.get(), arrow::default_memory_pool(), outfile, + // /*chunk_size=*/100, props, arrow_props)); + // } + + return out_stream->Finish(); +} + +void check_rb(std::shared_ptr rb_reader, size_t expect_rows, + int64_t expect_sum_of_b) { + size_t total_rows = 0; + int64_t sum_a = 0; + int64_t sum_b = 0; + int64_t sum_c = 0; + for (arrow::Result> maybe_batch : *rb_reader) { + ASSERT_OK_AND_ASSIGN(auto batch, maybe_batch); + total_rows += batch->num_rows(); + + auto a_array = std::dynamic_pointer_cast(batch->column(0)); + ASSERT_OK_AND_ASSIGN(auto flatten_a_array, a_array->Flatten()); + auto a_array_values = std::dynamic_pointer_cast(flatten_a_array); + for (auto iter = a_array_values->begin(); iter != a_array_values->end(); ++iter) { + sum_a += (*iter).value(); + } + + auto b_array = std::dynamic_pointer_cast(batch->column(1)); + for (auto iter = b_array->begin(); iter != b_array->end(); ++iter) { + sum_b += (*iter).value(); + } + + auto c_array = std::dynamic_pointer_cast(batch->column(2)); + for (auto iter = c_array->begin(); iter != c_array->end(); ++iter) { + sum_c += std::stoi(std::string((*iter).value())); + } + } + ASSERT_EQ(expect_rows, total_rows); + ASSERT_EQ(expect_sum_of_b * 3, sum_a); + ASSERT_EQ(expect_sum_of_b, sum_b); + ASSERT_EQ(expect_sum_of_b, sum_c); +} + +class TestRecordBatchReaderWithRanges : public 
::testing::Test { +public: + void SetUp() { + + } + + void TearDown() {} + +protected: + +}; + +TEST(TestRecordBatchReaderWithRanges2, Normal) { + ASSERT_OK_AND_ASSIGN(auto buffer, WriteFullFile()); + + arrow::MemoryPool* pool = arrow::default_memory_pool(); + + auto reader_properties = parquet::ReaderProperties(pool); + reader_properties.set_buffer_size(4096 * 4); + reader_properties.enable_buffered_stream(); + + auto arrow_reader_props = parquet::ArrowReaderProperties(); + // arrow_reader_props.set_batch_size(64 * 1024); // default 64 * 1024 + arrow_reader_props.set_batch_size(10); // default 64 * 1024 + + parquet::arrow::FileReaderBuilder reader_builder; + auto in_file = std::make_shared<::arrow::io::BufferReader>(buffer); + ASSERT_OK(reader_builder.Open(in_file, /*memory_map=*/reader_properties)); + reader_builder.memory_pool(pool); + reader_builder.properties(arrow_reader_props); + + ASSERT_OK_AND_ASSIGN(auto arrow_reader, reader_builder.Build()); + + // // case 1: row_ranges_map contains only RG {0}, other RGs should be skipped + // { + // std::shared_ptr<::arrow::RecordBatchReader> rb_reader; + // auto row_ranges_map = std::make_shared>(); + // std::vector ranges; + // for (int64_t i = 0; i < 30; i++) { + // if (i % 2 == 0) ranges.push_back({i, i}); + // } + // row_ranges_map->insert({0, std::make_shared(ranges)}); + // ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, {0, 1, 2}, + // row_ranges_map, + // &rb_reader)); + // + // check_rb(rb_reader, 15, 210); // 0 + 2 + ... 
+ 28 = 210 + // } + + // case 2: row_ranges_map contains only RG {0,2}, other RGs should be skipped + { + std::shared_ptr<::arrow::RecordBatchReader> rb_reader; + auto row_ranges_map = std::make_shared>(); + std::vector ranges; + for (int64_t i = 0; i < 30; i++) { + if (i % 2 == 0) ranges.push_back({i, i}); + } + row_ranges_map->insert({0, std::make_shared(ranges)}); + row_ranges_map->insert({2, std::make_shared(ranges)}); + ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, {0, 1, 2}, row_ranges_map, + &rb_reader)); + + check_rb(rb_reader, 30, 1320); // (0 + 2 + ... + 28) + (60 + 62 ... + 88) = 1320 + } +} From bdebb741943898ab6ca8d96fedf0656b2d97d99e Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Thu, 23 Nov 2023 15:49:51 +0800 Subject: [PATCH 04/25] happy path pass 4 --- cpp/src/parquet/arrow/reader.cc | 2 + cpp/src/parquet/column_reader.h | 2 +- cpp/src/parquet/filtered_reader_test.cc | 164 ++++++++++++++++-------- 3 files changed, 111 insertions(+), 57 deletions(-) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 81c7b1188895..aa07912a373b 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -566,6 +567,7 @@ class LeafReader : public ColumnReaderImpl { private: std::shared_ptr out_; void NextRowGroup() { + std::cout << "Entering NextRowGroup" << std::endl; std::unique_ptr page_reader = input_->NextChunk(); /// using page index to reduce cost diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 860642bb3657..d9227ebcb025 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -552,7 +552,7 @@ class PARQUET_EXPORT RecordSkipper { adjust_ranges(skip_pages, row_ranges); // adjust_ranges(skip_pages, will_process_pages); - total_rows_to_process = pages.rowCount() - skip_pages.rowCount() + 1; + total_rows_to_process = pages.rowCount() - 
skip_pages.rowCount(); } /// \brief Return the number of records to read or to skip diff --git a/cpp/src/parquet/filtered_reader_test.cc b/cpp/src/parquet/filtered_reader_test.cc index 427476c7c3b1..272bbe463af3 100644 --- a/cpp/src/parquet/filtered_reader_test.cc +++ b/cpp/src/parquet/filtered_reader_test.cc @@ -56,7 +56,7 @@ arrow::Result> GetTable() { std::shared_ptr arr_c; std::vector strs; for (size_t i = 0; i < 100; i++) { - strs.push_back("" + std::to_string(i)); + strs.push_back(std::to_string(i)); } ARROW_RETURN_NOT_OK(string_builder.AppendValues(strs)); ARROW_RETURN_NOT_OK(string_builder.Finish(&arr_c)); @@ -108,7 +108,7 @@ arrow::Result> WriteFullFile() { } void check_rb(std::shared_ptr rb_reader, size_t expect_rows, - int64_t expect_sum_of_b) { + int64_t expect_sum_of_b, const std::vector& column_indices) { size_t total_rows = 0; int64_t sum_a = 0; int64_t sum_b = 0; @@ -117,77 +117,91 @@ void check_rb(std::shared_ptr rb_reader, size_t expect ASSERT_OK_AND_ASSIGN(auto batch, maybe_batch); total_rows += batch->num_rows(); - auto a_array = std::dynamic_pointer_cast(batch->column(0)); - ASSERT_OK_AND_ASSIGN(auto flatten_a_array, a_array->Flatten()); - auto a_array_values = std::dynamic_pointer_cast(flatten_a_array); - for (auto iter = a_array_values->begin(); iter != a_array_values->end(); ++iter) { - sum_a += (*iter).value(); + if (std::find(column_indices.begin(), column_indices.end(), 0) != + column_indices.end()) { + auto a_array = + std::dynamic_pointer_cast(batch->GetColumnByName("a")); + ASSERT_OK_AND_ASSIGN(auto flatten_a_array, a_array->Flatten()); + auto a_array_values = std::dynamic_pointer_cast(flatten_a_array); + for (auto iter = a_array_values->begin(); iter != a_array_values->end(); ++iter) { + sum_a += (*iter).value(); + } } - auto b_array = std::dynamic_pointer_cast(batch->column(1)); - for (auto iter = b_array->begin(); iter != b_array->end(); ++iter) { - sum_b += (*iter).value(); + if (std::find(column_indices.begin(), 
column_indices.end(), 1) != + column_indices.end()) { + auto b_array = + std::dynamic_pointer_cast(batch->GetColumnByName("b")); + for (auto iter = b_array->begin(); iter != b_array->end(); ++iter) { + sum_b += (*iter).value(); + } } - auto c_array = std::dynamic_pointer_cast(batch->column(2)); - for (auto iter = c_array->begin(); iter != c_array->end(); ++iter) { - sum_c += std::stoi(std::string((*iter).value())); + if (std::find(column_indices.begin(), column_indices.end(), 2) != + column_indices.end()) { + auto c_array = + std::dynamic_pointer_cast(batch->GetColumnByName("c")); + for (auto iter = c_array->begin(); iter != c_array->end(); ++iter) { + sum_c += std::stoi(std::string((*iter).value())); + } } } ASSERT_EQ(expect_rows, total_rows); - ASSERT_EQ(expect_sum_of_b * 3, sum_a); - ASSERT_EQ(expect_sum_of_b, sum_b); - ASSERT_EQ(expect_sum_of_b, sum_c); + + if (std::find(column_indices.begin(), column_indices.end(), 0) != column_indices.end()) + ASSERT_EQ(expect_sum_of_b * 3, sum_a); + if (std::find(column_indices.begin(), column_indices.end(), 1) != column_indices.end()) + ASSERT_EQ(expect_sum_of_b, sum_b); + if (std::find(column_indices.begin(), column_indices.end(), 2) != column_indices.end()) + ASSERT_EQ(expect_sum_of_b, sum_c); } class TestRecordBatchReaderWithRanges : public ::testing::Test { -public: + public: void SetUp() { + ASSERT_OK_AND_ASSIGN(auto buffer, WriteFullFile()); - } + arrow::MemoryPool* pool = arrow::default_memory_pool(); - void TearDown() {} + auto reader_properties = parquet::ReaderProperties(pool); + reader_properties.set_buffer_size(4096 * 4); + reader_properties.enable_buffered_stream(); -protected: + auto arrow_reader_props = parquet::ArrowReaderProperties(); + // arrow_reader_props.set_batch_size(64 * 1024); // default 64 * 1024 + arrow_reader_props.set_batch_size(10); // default 64 * 1024 -}; + parquet::arrow::FileReaderBuilder reader_builder; + auto in_file = std::make_shared<::arrow::io::BufferReader>(buffer); + 
ASSERT_OK(reader_builder.Open(in_file, /*memory_map=*/reader_properties)); + reader_builder.memory_pool(pool); + reader_builder.properties(arrow_reader_props); -TEST(TestRecordBatchReaderWithRanges2, Normal) { - ASSERT_OK_AND_ASSIGN(auto buffer, WriteFullFile()); - - arrow::MemoryPool* pool = arrow::default_memory_pool(); - - auto reader_properties = parquet::ReaderProperties(pool); - reader_properties.set_buffer_size(4096 * 4); - reader_properties.enable_buffered_stream(); + ASSERT_OK_AND_ASSIGN(arrow_reader, reader_builder.Build()); + } - auto arrow_reader_props = parquet::ArrowReaderProperties(); - // arrow_reader_props.set_batch_size(64 * 1024); // default 64 * 1024 - arrow_reader_props.set_batch_size(10); // default 64 * 1024 + void TearDown() {} - parquet::arrow::FileReaderBuilder reader_builder; - auto in_file = std::make_shared<::arrow::io::BufferReader>(buffer); - ASSERT_OK(reader_builder.Open(in_file, /*memory_map=*/reader_properties)); - reader_builder.memory_pool(pool); - reader_builder.properties(arrow_reader_props); + protected: + std::unique_ptr arrow_reader; +}; - ASSERT_OK_AND_ASSIGN(auto arrow_reader, reader_builder.Build()); +TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { + // case 1: row_ranges_map contains only RG {0}, other RGs should be skipped + { + std::shared_ptr<::arrow::RecordBatchReader> rb_reader; + auto row_ranges_map = std::make_shared>(); + std::vector ranges; + for (int64_t i = 0; i < 30; i++) { + if (i % 2 == 0) ranges.push_back({i, i}); + } + row_ranges_map->insert({0, std::make_shared(ranges)}); + std::vector column_indices{0, 1, 2}; + ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, + row_ranges_map, &rb_reader)); - // // case 1: row_ranges_map contains only RG {0}, other RGs should be skipped - // { - // std::shared_ptr<::arrow::RecordBatchReader> rb_reader; - // auto row_ranges_map = std::make_shared>(); - // std::vector ranges; - // for (int64_t i = 0; i < 30; i++) { - // if (i % 
2 == 0) ranges.push_back({i, i}); - // } - // row_ranges_map->insert({0, std::make_shared(ranges)}); - // ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, {0, 1, 2}, - // row_ranges_map, - // &rb_reader)); - // - // check_rb(rb_reader, 15, 210); // 0 + 2 + ... + 28 = 210 - // } + check_rb(rb_reader, 15, 210, column_indices); // 0 + 2 + ... + 28 = 210 + } // case 2: row_ranges_map contains only RG {0,2}, other RGs should be skipped { @@ -199,9 +213,47 @@ TEST(TestRecordBatchReaderWithRanges2, Normal) { } row_ranges_map->insert({0, std::make_shared(ranges)}); row_ranges_map->insert({2, std::make_shared(ranges)}); - ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, {0, 1, 2}, row_ranges_map, - &rb_reader)); + std::vector column_indices{0, 1, 2}; + ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, + row_ranges_map, &rb_reader)); - check_rb(rb_reader, 30, 1320); // (0 + 2 + ... + 28) + (60 + 62 ... + 88) = 1320 + check_rb(rb_reader, 30, 1320, + column_indices); // (0 + 2 + ... + 28) + (60 + 62 ... 
+ 88) = 1320 } } + +TEST_F(TestRecordBatchReaderWithRanges, SelectSomePageForEachRG) { + std::shared_ptr<::arrow::RecordBatchReader> rb_reader; + auto row_ranges_map = std::make_shared>(); + row_ranges_map->insert({0, std::make_shared(parquet::Range{0, 9})}); + row_ranges_map->insert( + {1, std::make_shared(parquet::Range{10, 19})}); + row_ranges_map->insert( + {2, std::make_shared(parquet::Range{20, 29})}); + row_ranges_map->insert({3, std::make_shared(parquet::Range{0, 9})}); + + std::vector column_indices{0, 1, 2}; + ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, + row_ranges_map, &rb_reader)); + + // (0+...+9) + (40+...+49) + (80+...+89) + (90+...+99) = 2280 + check_rb(rb_reader, 40, 2280, column_indices); +} + +TEST_F(TestRecordBatchReaderWithRanges, SelectAllRange) { + std::shared_ptr<::arrow::RecordBatchReader> rb_reader; + auto row_ranges_map = std::make_shared>(); + row_ranges_map->insert({0, std::make_shared(parquet::Range{0, 29})}); + row_ranges_map->insert( + {1, std::make_shared(parquet::Range{0, 29})}); + row_ranges_map->insert( + {2, std::make_shared(parquet::Range{0, 29})}); + row_ranges_map->insert({3, std::make_shared(parquet::Range{0, 9})}); + + std::vector column_indices{0, 1, 2}; + ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, + row_ranges_map, &rb_reader)); + + // (0+...+99) = 4950 + check_rb(rb_reader, 100, 4950, column_indices); +} From 29c471a8e1528bf9cba963938f5b4b2a81df81d0 Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Thu, 23 Nov 2023 19:45:13 +0800 Subject: [PATCH 05/25] happy path pass 5 --- cpp/src/parquet/arrow/reader.cc | 79 +++++-- cpp/src/parquet/column_reader.h | 84 ++----- cpp/src/parquet/filtered_reader_test.cc | 290 ++++++++++++++++++------ 3 files changed, 296 insertions(+), 157 deletions(-) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index aa07912a373b..93b4089ef68e 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ 
b/cpp/src/parquet/arrow/reader.cc @@ -566,8 +566,59 @@ class LeafReader : public ColumnReaderImpl { private: std::shared_ptr out_; + + void checkAndGetPageRanges(const std::shared_ptr& row_ranges, + std::shared_ptr& page_ranges) { + // check offset exists + auto rg_pg_index_reader = + ctx_->reader->GetPageIndexReader()->RowGroup(input_->current_row_group()); + + if (!rg_pg_index_reader) { + throw ParquetException( + "Attempting to read with Ranges but Page Index is not found for Row " + "Group: " + + std::to_string(input_->current_row_group())); + } + auto offset_index = rg_pg_index_reader->GetOffsetIndex(input_->column_index()); + + if (!offset_index) { + throw ParquetException( + "Attempting to read with Ranges but Offset index is not found for " + "column: " + + field_->name()); + } + + if (!row_ranges->isValid()) { + throw ParquetException( + "The provided row range is invalid, keep it monotone and non-interleaving: " + + row_ranges->toString()); + } + + const auto page_locations = offset_index->page_locations(); + page_ranges = std::make_shared(); + for (size_t i = 0; i < page_locations.size() - 1; i++) { + page_ranges->add( + {page_locations[i].first_row_index, page_locations[i + 1].first_row_index - 1}, + false); + } + if (page_locations.size() >= 1) { + page_ranges->add( + {page_locations[page_locations.size() - 1].first_row_index, + ctx_->reader->metadata()->RowGroup(input_->current_row_group())->num_rows() - + 1}, + false); + } + + if (row_ranges->getRanges().size() > 0) { + if ((*row_ranges).getRanges().back().to > page_ranges->getRanges().back().to) { + throw ParquetException( + "The provided row range " + row_ranges->toString() + + " exceeds last page :" + page_ranges->getRanges().back().toString()); + } + } + } + void NextRowGroup() { - std::cout << "Entering NextRowGroup" << std::endl; std::unique_ptr page_reader = input_->NextChunk(); /// using page index to reduce cost @@ -578,30 +629,8 @@ class LeafReader : public ColumnReaderImpl { // if specific 
row range is provided for this rg if (const auto iter = ctx_->row_ranges_map->find(input_->current_row_group()); iter != ctx_->row_ranges_map->end()) { - // check offset exists - auto offset_index = ctx_->reader->GetPageIndexReader() - ->RowGroup(input_->current_row_group()) - ->GetOffsetIndex(input_->column_index()); - if (!offset_index) { - throw ParquetException("Attempting to filter pages but Offset index is not found for column: " + - field_->name()); - } - - const auto page_locations = offset_index->page_locations(); - auto page_ranges = std::make_shared(); - for (size_t i = 0; i < page_locations.size() - 1; i++) { - page_ranges->add({page_locations[i].first_row_index, - page_locations[i + 1].first_row_index - 1}, - false); - } - if (page_locations.size() >= 1) { - page_ranges->add({page_locations[page_locations.size() - 1].first_row_index, - ctx_->reader->metadata() - ->RowGroup(input_->current_row_group()) - ->num_rows() - - 1}, - false); - } + std::shared_ptr page_ranges; + checkAndGetPageRanges(iter->second, page_ranges); // part 1, skip decompressing & decoding unnecessary pages page_reader->set_data_page_filter(RowRangesPageFilter(iter->second, page_ranges)); diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index d9227ebcb025..44288e25afea 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -17,6 +17,7 @@ #pragma once +#include #include #include #include @@ -354,27 +355,15 @@ class RowRanges { explicit RowRanges(const Range& range) { ranges.push_back(range); } - RowRanges(const std::vector& ranges) { this->ranges = ranges; } + RowRanges(const std::vector& ranges) { + this->ranges = ranges; + } // copy cstr RowRanges(const RowRanges& other) { ranges = other.ranges; } RowRanges(RowRanges&& other) noexcept { ranges = std::move(other.ranges); } - static RowRanges createSingle(const size_t rowCount) { - return RowRanges({Range(0L, rowCount - 1L)}); - } - - // static RowRanges create(size_t 
rowCount, const std::vector& pageIndexes, const - // OffsetIndex& offsetIndex) { - // RowRanges ranges; - // for (int pageIndex : pageIndexes) { - // ranges.add(Range(offsetIndex.getFirstRowIndex(pageIndex), - // offsetIndex.getLastRowIndex(pageIndex, rowCount))); - // } - // return ranges; - // } - static RowRanges unionRanges(const RowRanges& left, const RowRanges& right) { RowRanges result; auto it1 = left.ranges.begin(); @@ -441,9 +430,14 @@ class RowRanges { if (merge) { for (int i = static_cast(ranges.size()) - 1; i >= 0; --i) { Range last = ranges[i]; - assert(!last.isAfter(range)); + if (last.isAfter(range)) { + throw ParquetException(range.toString() + " cannot be added to " + + this->toString()); + } const Range u = Range::unionRange(last, rangeToAdd); - assert(u.from != -1 && u.to != -1); + if (u.from == -1 && u.to == -1) { + break; + } rangeToAdd = u; ranges.erase(ranges.begin() + i); } @@ -461,48 +455,20 @@ class RowRanges { return cnt; } - // - // class Iterator { - // private: - // int currentRangeIndex; - // Range currentRange; - // long next; - // std::vector ranges; - // - // long findNext() { - // if (currentRangeIndex < ranges.size()) { - // currentRange = ranges[++currentRangeIndex]; - // next = currentRange.from; - // } else { - // return -1; - // } - // return next; - // } - // - // public: - // Iterator(const std::vector& ranges) { - // this->ranges = ranges; - // currentRangeIndex = -1; - // next = findNext(); - // } - // - // bool hasNext() const { - // return next >= 0; - // } - // - // long nextLong() { - // long ret = next; - // if (ret < 0) { - // throw std::out_of_range("No such element"); - // } - // next = findNext(); - // return ret; - // } - // }; - // - // Iterator iterator() const { - // return Iterator(ranges); - // } + bool isValid() const { + if (ranges.size() == 0) { + return false; + } + if (ranges[0].from < 0) { + return false; + } + for (size_t i = 1; i < ranges.size(); i++) { + if (ranges[i].from <= ranges[i - 1].to) { + 
return false; + } + } + return true; + } bool isOverlapping(int64_t from, int64_t to) const { const Range searchRange(from, to); diff --git a/cpp/src/parquet/filtered_reader_test.cc b/cpp/src/parquet/filtered_reader_test.cc index 272bbe463af3..9cd711cdf176 100644 --- a/cpp/src/parquet/filtered_reader_test.cc +++ b/cpp/src/parquet/filtered_reader_test.cc @@ -25,33 +25,56 @@ #include #include +#include +#include #include +/// The table looks like: +// { +// { a: {x: 0, y: 0}, b: {0, 0, 0}, c: "0", d: 0}, +// { a: {x: 1, y: 1}, b: {1, 1, 1}, c: "1", d: 1}, +// ... +// { a: {x: 99, y: 99}, b: {99, 99, 99}, c: "99", d: 99} +// } arrow::Result> GetTable() { - auto builder = arrow::Int32Builder(); - - std::shared_ptr arr_a_values; - std::shared_ptr arr_a_offsets; - std::vector a_values; + auto int32_builder = arrow::Int32Builder(); + + // Struct col + std::shared_ptr arr_a_x; + std::shared_ptr arr_a_y; + ARROW_RETURN_NOT_OK(int32_builder.AppendValues(arrow::internal::Iota(0, 100))); + ARROW_RETURN_NOT_OK(int32_builder.Finish(&arr_a_x)); + ARROW_RETURN_NOT_OK(int32_builder.AppendValues(arrow::internal::Iota(0, 100))); + ARROW_RETURN_NOT_OK(int32_builder.Finish(&arr_a_y)); + ARROW_ASSIGN_OR_RAISE( + auto arr_a, + arrow::StructArray::Make({arr_a_x, arr_a_y}, std::vector{"x", "y"})); + + // List col + std::shared_ptr arr_b_values; + std::shared_ptr arr_b_offsets; + std::vector b_values; for (int i = 0; i < 100; ++i) { for (int j = 0; j < 3; ++j) { - a_values.push_back(i); + b_values.push_back(i); } } - ARROW_RETURN_NOT_OK(builder.AppendValues(a_values)); - ARROW_RETURN_NOT_OK(builder.Finish(&arr_a_values)); + ARROW_RETURN_NOT_OK(int32_builder.AppendValues(b_values)); + ARROW_RETURN_NOT_OK(int32_builder.Finish(&arr_b_values)); std::vector offsets = arrow::internal::Iota(0, 101); std::transform(offsets.begin(), offsets.end(), offsets.begin(), [](int x) { return x * 3; }); - ARROW_RETURN_NOT_OK(builder.AppendValues(offsets)); - 
ARROW_RETURN_NOT_OK(builder.Finish(&arr_a_offsets)); - ARROW_ASSIGN_OR_RAISE(auto arr_a, - arrow::ListArray::FromArrays(*arr_a_offsets, *arr_a_values)); + ARROW_RETURN_NOT_OK(int32_builder.AppendValues(offsets)); + ARROW_RETURN_NOT_OK(int32_builder.Finish(&arr_b_offsets)); + ARROW_ASSIGN_OR_RAISE(auto arr_b, + arrow::ListArray::FromArrays(*arr_b_offsets, *arr_b_values)); - std::shared_ptr arr_b; - ARROW_RETURN_NOT_OK(builder.AppendValues(arrow::internal::Iota(0, 100))); - ARROW_RETURN_NOT_OK(builder.Finish(&arr_b)); + // int col + std::shared_ptr arr_d; + ARROW_RETURN_NOT_OK(int32_builder.AppendValues(arrow::internal::Iota(0, 100))); + ARROW_RETURN_NOT_OK(int32_builder.Finish(&arr_d)); + // string col auto string_builder = arrow::StringBuilder(); std::shared_ptr arr_c; std::vector strs; @@ -62,12 +85,14 @@ arrow::Result> GetTable() { ARROW_RETURN_NOT_OK(string_builder.Finish(&arr_c)); auto schema = arrow::schema({ - arrow::field("a", arrow::list(arrow::int32())), - arrow::field("b", arrow::int32()), + // complex types prior to simple types + arrow::field("a", arr_a->type()), + arrow::field("b", arrow::list(arrow::int32())), arrow::field("c", arrow::utf8()), + arrow::field("d", arrow::int32()), }); - return arrow::Table::Make(schema, {arr_a, arr_b, arr_c}); + return arrow::Table::Make(schema, {arr_a, arr_b, arr_c, arr_d}); } arrow::Result> WriteFullFile() { @@ -107,53 +132,70 @@ arrow::Result> WriteFullFile() { return out_stream->Finish(); } -void check_rb(std::shared_ptr rb_reader, size_t expect_rows, - int64_t expect_sum_of_b, const std::vector& column_indices) { +bool checking_col(const std::string col_name, + const std::vector& column_names) { + return std::find(column_names.begin(), column_names.end(), col_name) != + column_names.end(); +} + +void check_rb(std::shared_ptr rb_reader, + const size_t expected_rows, const int64_t expected_sum) { + const std::vector column_names = rb_reader->schema()->field_names(); + size_t total_rows = 0; int64_t sum_a = 0; 
int64_t sum_b = 0; int64_t sum_c = 0; + int64_t sum_d = 0; for (arrow::Result> maybe_batch : *rb_reader) { ASSERT_OK_AND_ASSIGN(auto batch, maybe_batch); total_rows += batch->num_rows(); - if (std::find(column_indices.begin(), column_indices.end(), 0) != - column_indices.end()) { + if (checking_col("a", column_names)) { auto a_array = - std::dynamic_pointer_cast(batch->GetColumnByName("a")); - ASSERT_OK_AND_ASSIGN(auto flatten_a_array, a_array->Flatten()); - auto a_array_values = std::dynamic_pointer_cast(flatten_a_array); - for (auto iter = a_array_values->begin(); iter != a_array_values->end(); ++iter) { + std::dynamic_pointer_cast(batch->GetColumnByName("a")); + auto a_x_array = std::dynamic_pointer_cast(a_array->field(0)); + auto a_y_array = std::dynamic_pointer_cast(a_array->field(1)); + for (auto iter = a_x_array->begin(); iter != a_x_array->end(); ++iter) { + sum_a += (*iter).value(); + } + for (auto iter = a_y_array->begin(); iter != a_y_array->end(); ++iter) { sum_a += (*iter).value(); } } - if (std::find(column_indices.begin(), column_indices.end(), 1) != - column_indices.end()) { + if (checking_col("b", column_names)) { auto b_array = - std::dynamic_pointer_cast(batch->GetColumnByName("b")); - for (auto iter = b_array->begin(); iter != b_array->end(); ++iter) { + std::dynamic_pointer_cast(batch->GetColumnByName("b")); + ASSERT_OK_AND_ASSIGN(auto flatten_b_array, b_array->Flatten()); + auto b_array_values = std::dynamic_pointer_cast(flatten_b_array); + for (auto iter = b_array_values->begin(); iter != b_array_values->end(); ++iter) { sum_b += (*iter).value(); } } - if (std::find(column_indices.begin(), column_indices.end(), 2) != - column_indices.end()) { + if (checking_col("c", column_names)) { auto c_array = std::dynamic_pointer_cast(batch->GetColumnByName("c")); for (auto iter = c_array->begin(); iter != c_array->end(); ++iter) { sum_c += std::stoi(std::string((*iter).value())); } } + + if (checking_col("d", column_names)) { + auto d_array = + 
std::dynamic_pointer_cast(batch->GetColumnByName("d")); + for (auto iter = d_array->begin(); iter != d_array->end(); ++iter) { + sum_d += (*iter).value(); + } + } } - ASSERT_EQ(expect_rows, total_rows); - - if (std::find(column_indices.begin(), column_indices.end(), 0) != column_indices.end()) - ASSERT_EQ(expect_sum_of_b * 3, sum_a); - if (std::find(column_indices.begin(), column_indices.end(), 1) != column_indices.end()) - ASSERT_EQ(expect_sum_of_b, sum_b); - if (std::find(column_indices.begin(), column_indices.end(), 2) != column_indices.end()) - ASSERT_EQ(expect_sum_of_b, sum_c); + ASSERT_EQ(expected_rows, total_rows); + + if (checking_col("a", column_names)) ASSERT_EQ(expected_sum * 2, sum_a); + if (checking_col("b", column_names)) ASSERT_EQ(expected_sum * 3, sum_b); + if (checking_col("c", column_names)) ASSERT_EQ(expected_sum, sum_c); + if (checking_col("d", column_names)) ASSERT_EQ(expected_sum, sum_d); } class TestRecordBatchReaderWithRanges : public ::testing::Test { @@ -168,7 +210,6 @@ class TestRecordBatchReaderWithRanges : public ::testing::Test { reader_properties.enable_buffered_stream(); auto arrow_reader_props = parquet::ArrowReaderProperties(); - // arrow_reader_props.set_batch_size(64 * 1024); // default 64 * 1024 arrow_reader_props.set_batch_size(10); // default 64 * 1024 parquet::arrow::FileReaderBuilder reader_builder; @@ -186,6 +227,56 @@ class TestRecordBatchReaderWithRanges : public ::testing::Test { std::unique_ptr arrow_reader; }; +TEST_F(TestRecordBatchReaderWithRanges, SelectSomePageForEachRG) { + std::shared_ptr<::arrow::RecordBatchReader> rb_reader; + auto row_ranges_map = std::make_shared>(); + row_ranges_map->insert({0, std::make_shared(parquet::Range{0, 9})}); + row_ranges_map->insert( + {1, std::make_shared(parquet::Range{10, 19})}); + row_ranges_map->insert( + {2, std::make_shared(parquet::Range{20, 29})}); + row_ranges_map->insert({3, std::make_shared(parquet::Range{0, 9})}); + + std::vector column_indices{0, 1, 2, 3, 4}; + 
ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, + row_ranges_map, &rb_reader)); + + // (0+...+9) + (40+...+49) + (80+...+89) + (90+...+99) = 2280 + check_rb(rb_reader, 40, 2280); +} + +TEST_F(TestRecordBatchReaderWithRanges, SelectAllRange) { + std::shared_ptr<::arrow::RecordBatchReader> rb_reader; + auto row_ranges_map = std::make_shared>(); + row_ranges_map->insert( + {0, std::make_shared(parquet::Range{0, 29})}); + row_ranges_map->insert( + {1, std::make_shared(parquet::Range{0, 29})}); + row_ranges_map->insert( + {2, std::make_shared(parquet::Range{0, 29})}); + row_ranges_map->insert({3, std::make_shared(parquet::Range{0, 9})}); + + std::vector column_indices{0, 1, 2, 3, 4}; + ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, + row_ranges_map, &rb_reader)); + + // (0+...+99) = 4950 + check_rb(rb_reader, 100, 4950); +} + +TEST_F(TestRecordBatchReaderWithRanges, SelectEmptyRange) { + std::shared_ptr<::arrow::RecordBatchReader> rb_reader; + auto row_ranges_map = std::make_shared>(); + row_ranges_map->insert( + {0, std::make_shared(std::vector())}); + std::vector column_indices{0, 1, 2, 3, 4}; + auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, + row_ranges_map, &rb_reader); + ASSERT_NOT_OK(status); + EXPECT_TRUE(status.message().find("The provided row range is invalid, keep it monotone " + "and non-interleaving: []") != std::string::npos); +} + TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { // case 1: row_ranges_map contains only RG {0}, other RGs should be skipped { @@ -196,11 +287,11 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { if (i % 2 == 0) ranges.push_back({i, i}); } row_ranges_map->insert({0, std::make_shared(ranges)}); - std::vector column_indices{0, 1, 2}; + std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, row_ranges_map, &rb_reader)); - check_rb(rb_reader, 15, 210, 
column_indices); // 0 + 2 + ... + 28 = 210 + check_rb(rb_reader, 15, 210); // 0 + 2 + ... + 28 = 210 } // case 2: row_ranges_map contains only RG {0,2}, other RGs should be skipped @@ -213,47 +304,100 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { } row_ranges_map->insert({0, std::make_shared(ranges)}); row_ranges_map->insert({2, std::make_shared(ranges)}); - std::vector column_indices{0, 1, 2}; + std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, row_ranges_map, &rb_reader)); - check_rb(rb_reader, 30, 1320, - column_indices); // (0 + 2 + ... + 28) + (60 + 62 ... + 88) = 1320 + check_rb(rb_reader, 30, 1320); // (0 + 2 + ... + 28) + (60 + 62 ... + 88) = 1320 } } -TEST_F(TestRecordBatchReaderWithRanges, SelectSomePageForEachRG) { +TEST_F(TestRecordBatchReaderWithRanges, InvalidRanges) { std::shared_ptr<::arrow::RecordBatchReader> rb_reader; - auto row_ranges_map = std::make_shared>(); - row_ranges_map->insert({0, std::make_shared(parquet::Range{0, 9})}); - row_ranges_map->insert( - {1, std::make_shared(parquet::Range{10, 19})}); - row_ranges_map->insert( - {2, std::make_shared(parquet::Range{20, 29})}); - row_ranges_map->insert({3, std::make_shared(parquet::Range{0, 9})}); - - std::vector column_indices{0, 1, 2}; - ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, - row_ranges_map, &rb_reader)); + { + auto row_ranges_map = std::make_shared>(); + row_ranges_map->insert( + {0, std::make_shared(parquet::Range{-1, 5})}); + std::vector column_indices{0, 1, 2, 3, 4}; + auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, + row_ranges_map, &rb_reader); + ASSERT_NOT_OK(status); + EXPECT_TRUE(status.message().find("The provided row range is invalid, keep it " + "monotone and non-interleaving: [(-1, 5)]") != + std::string::npos); + } - // (0+...+9) + (40+...+49) + (80+...+89) + (90+...+99) = 2280 - check_rb(rb_reader, 40, 2280, 
column_indices); + { + auto row_ranges_map = std::make_shared>(); + row_ranges_map->insert({0, std::make_shared(std::vector{ + parquet::Range{0, 4}, parquet::Range{2, 5}})}); + std::vector column_indices{0, 1, 2, 3, 4}; + auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, + row_ranges_map, &rb_reader); + ASSERT_NOT_OK(status); + EXPECT_TRUE( + status.message().find("The provided row range is invalid, keep it monotone and " + "non-interleaving: [(0, 4), (2, 5)]") != std::string::npos); + } + { + auto row_ranges_map = std::make_shared>(); + row_ranges_map->insert( + {0, std::make_shared(std::vector{parquet::Range{0, 30}})}); + std::vector column_indices{0, 1, 2, 3, 4}; + auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, + row_ranges_map, &rb_reader); + ASSERT_NOT_OK(status); + EXPECT_TRUE(status.message().find( + "The provided row range [(0, 30)] exceeds last page :[26, 29]") != + std::string::npos); + } } -TEST_F(TestRecordBatchReaderWithRanges, SelectAllRange) { +TEST(TestRecordBatchReaderWithRangesBadCases, NoPageIndex) { + using parquet::ArrowWriterProperties; + using parquet::WriterProperties; + + // write a file without page index + ASSERT_OK_AND_ASSIGN(std::shared_ptr table, GetTable()); + std::shared_ptr props = + WriterProperties::Builder() + .max_row_group_length(30) + ->disable_write_page_index() // NO INDEX !!!! 
+ ->write_batch_size(13) + ->data_pagesize(1) + ->compression(arrow::Compression::SNAPPY) + ->build(); + std::shared_ptr arrow_props = + ArrowWriterProperties::Builder().store_schema()->build(); + ASSERT_OK_AND_ASSIGN(auto out_stream, ::arrow::io::BufferOutputStream::Create()); + ASSERT_OK(parquet::arrow::WriteTable(*table.get(), arrow::default_memory_pool(), + out_stream, + /*chunk_size=*/100, props, arrow_props)); + ASSERT_OK_AND_ASSIGN(auto buffer, out_stream->Finish()); + + // try to read the file with Range + arrow::MemoryPool* pool = arrow::default_memory_pool(); + auto reader_properties = parquet::ReaderProperties(pool); + reader_properties.set_buffer_size(4096 * 4); + reader_properties.enable_buffered_stream(); + auto arrow_reader_props = parquet::ArrowReaderProperties(); + arrow_reader_props.set_batch_size(10); // default 64 * 1024 + + parquet::arrow::FileReaderBuilder reader_builder; + auto in_file = std::make_shared<::arrow::io::BufferReader>(buffer); + ASSERT_OK(reader_builder.Open(in_file, /*memory_map=*/reader_properties)); + reader_builder.memory_pool(pool); + reader_builder.properties(arrow_reader_props); + ASSERT_OK_AND_ASSIGN(auto arrow_reader, reader_builder.Build()); + std::shared_ptr<::arrow::RecordBatchReader> rb_reader; auto row_ranges_map = std::make_shared>(); - row_ranges_map->insert({0, std::make_shared(parquet::Range{0, 29})}); - row_ranges_map->insert( - {1, std::make_shared(parquet::Range{0, 29})}); row_ranges_map->insert( - {2, std::make_shared(parquet::Range{0, 29})}); - row_ranges_map->insert({3, std::make_shared(parquet::Range{0, 9})}); - - std::vector column_indices{0, 1, 2}; - ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, - row_ranges_map, &rb_reader)); - - // (0+...+99) = 4950 - check_rb(rb_reader, 100, 4950, column_indices); + {0, std::make_shared(parquet::Range{0, 29})}); + std::vector column_indices{0, 1, 2, 3, 4}; + auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, + 
row_ranges_map, &rb_reader); + ASSERT_NOT_OK(status); + EXPECT_TRUE(status.message().find("Attempting to read with Ranges but Page Index is " + "not found for Row Group: 0") != std::string::npos); } From 7bf0e97e9de543175d781b9712ff1d2dccb93f12 Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Thu, 23 Nov 2023 20:33:05 +0800 Subject: [PATCH 06/25] happy path pass 6 --- cpp/src/parquet/filtered_reader_test.cc | 200 ++++++++++++++++-------- 1 file changed, 137 insertions(+), 63 deletions(-) diff --git a/cpp/src/parquet/filtered_reader_test.cc b/cpp/src/parquet/filtered_reader_test.cc index 9cd711cdf176..018e2580f254 100644 --- a/cpp/src/parquet/filtered_reader_test.cc +++ b/cpp/src/parquet/filtered_reader_test.cc @@ -23,20 +23,41 @@ #include "parquet/arrow/reader.h" #include "parquet/arrow/writer.h" +#include #include #include #include #include #include -/// The table looks like: +/// The table looks like (with_nulls = false): // { // { a: {x: 0, y: 0}, b: {0, 0, 0}, c: "0", d: 0}, // { a: {x: 1, y: 1}, b: {1, 1, 1}, c: "1", d: 1}, // ... 
// { a: {x: 99, y: 99}, b: {99, 99, 99}, c: "99", d: 99} // } -arrow::Result> GetTable() { +arrow::Result> GetTable(bool with_nulls = false) { + // if with_nulls, the generated table should null values + // set first 10 rows and last 10 rows to null + std::shared_ptr null_bitmap; + std::vector flags(100, true); + if (with_nulls) { + std::fill_n(flags.begin(), 10, false); + std::fill_n(flags.begin() + 90, 10, false); + + size_t length = flags.size(); + + ARROW_ASSIGN_OR_RAISE(null_bitmap, arrow::AllocateEmptyBitmap(length)); + + uint8_t* bitmap = null_bitmap->mutable_data(); + for (size_t i = 0; i < length; ++i) { + if (flags[i]) { + arrow::bit_util::SetBit(bitmap, i); + } + } + } + auto int32_builder = arrow::Int32Builder(); // Struct col @@ -46,9 +67,9 @@ arrow::Result> GetTable() { ARROW_RETURN_NOT_OK(int32_builder.Finish(&arr_a_x)); ARROW_RETURN_NOT_OK(int32_builder.AppendValues(arrow::internal::Iota(0, 100))); ARROW_RETURN_NOT_OK(int32_builder.Finish(&arr_a_y)); - ARROW_ASSIGN_OR_RAISE( - auto arr_a, - arrow::StructArray::Make({arr_a_x, arr_a_y}, std::vector{"x", "y"})); + ARROW_ASSIGN_OR_RAISE(auto arr_a, arrow::StructArray::Make( + {arr_a_x, arr_a_y}, + std::vector{"x", "y"}, null_bitmap)); // List col std::shared_ptr arr_b_values; @@ -63,45 +84,49 @@ arrow::Result> GetTable() { ARROW_RETURN_NOT_OK(int32_builder.Finish(&arr_b_values)); std::vector offsets = arrow::internal::Iota(0, 101); std::transform(offsets.begin(), offsets.end(), offsets.begin(), - [](int x) { return x * 3; }); + [](const int x) { return x * 3; }); ARROW_RETURN_NOT_OK(int32_builder.AppendValues(offsets)); ARROW_RETURN_NOT_OK(int32_builder.Finish(&arr_b_offsets)); - ARROW_ASSIGN_OR_RAISE(auto arr_b, - arrow::ListArray::FromArrays(*arr_b_offsets, *arr_b_values)); - - // int col - std::shared_ptr arr_d; - ARROW_RETURN_NOT_OK(int32_builder.AppendValues(arrow::internal::Iota(0, 100))); - ARROW_RETURN_NOT_OK(int32_builder.Finish(&arr_d)); + ARROW_ASSIGN_OR_RAISE(auto arr_b, 
arrow::ListArray::FromArrays( + *arr_b_offsets, *arr_b_values, + arrow::default_memory_pool(), null_bitmap)); // string col auto string_builder = arrow::StringBuilder(); std::shared_ptr arr_c; std::vector strs; + uint8_t valid_bytes[100]; for (size_t i = 0; i < 100; i++) { strs.push_back(std::to_string(i)); + valid_bytes[i] = flags[i]; } - ARROW_RETURN_NOT_OK(string_builder.AppendValues(strs)); + ARROW_RETURN_NOT_OK(string_builder.AppendValues(strs, &valid_bytes[0])); ARROW_RETURN_NOT_OK(string_builder.Finish(&arr_c)); + // int col + std::shared_ptr arr_d; + ARROW_RETURN_NOT_OK(int32_builder.AppendValues(arrow::internal::Iota(0, 100), flags)); + ARROW_RETURN_NOT_OK(int32_builder.Finish(&arr_d)); + auto schema = arrow::schema({ // complex types prior to simple types - arrow::field("a", arr_a->type()), - arrow::field("b", arrow::list(arrow::int32())), - arrow::field("c", arrow::utf8()), - arrow::field("d", arrow::int32()), + field("a", arr_a->type()), + field("b", list(arrow::int32())), + field("c", arrow::utf8()), + field("d", arrow::int32()), }); return arrow::Table::Make(schema, {arr_a, arr_b, arr_c, arr_d}); } -arrow::Result> WriteFullFile() { +arrow::Result> WriteFullFile( + const bool with_nulls = false) { using parquet::ArrowWriterProperties; using parquet::WriterProperties; - ARROW_ASSIGN_OR_RAISE(std::shared_ptr table, GetTable()); + ARROW_ASSIGN_OR_RAISE(const auto table, GetTable(with_nulls)); - std::shared_ptr props = + const std::shared_ptr props = WriterProperties::Builder() .max_row_group_length(30) ->enable_write_page_index() @@ -110,10 +135,10 @@ arrow::Result> WriteFullFile() { ->compression(arrow::Compression::SNAPPY) ->build(); - std::shared_ptr arrow_props = + const std::shared_ptr arrow_props = ArrowWriterProperties::Builder().store_schema()->build(); - ARROW_ASSIGN_OR_RAISE(auto out_stream, ::arrow::io::BufferOutputStream::Create()); + ARROW_ASSIGN_OR_RAISE(const auto out_stream, ::arrow::io::BufferOutputStream::Create()); 
ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table.get(), arrow::default_memory_pool(), out_stream, @@ -132,7 +157,7 @@ arrow::Result> WriteFullFile() { return out_stream->Finish(); } -bool checking_col(const std::string col_name, +bool checking_col(const std::string& col_name, const std::vector& column_names) { return std::find(column_names.begin(), column_names.end(), col_name) != column_names.end(); @@ -157,10 +182,10 @@ void check_rb(std::shared_ptr rb_reader, auto a_x_array = std::dynamic_pointer_cast(a_array->field(0)); auto a_y_array = std::dynamic_pointer_cast(a_array->field(1)); for (auto iter = a_x_array->begin(); iter != a_x_array->end(); ++iter) { - sum_a += (*iter).value(); + sum_a += (*iter).has_value() ? (*iter).value() : 0; } for (auto iter = a_y_array->begin(); iter != a_y_array->end(); ++iter) { - sum_a += (*iter).value(); + sum_a += (*iter).has_value() ? (*iter).value() : 0; } } @@ -170,7 +195,7 @@ void check_rb(std::shared_ptr rb_reader, ASSERT_OK_AND_ASSIGN(auto flatten_b_array, b_array->Flatten()); auto b_array_values = std::dynamic_pointer_cast(flatten_b_array); for (auto iter = b_array_values->begin(); iter != b_array_values->end(); ++iter) { - sum_b += (*iter).value(); + sum_b += (*iter).has_value() ? (*iter).value() : 0; } } @@ -178,7 +203,7 @@ void check_rb(std::shared_ptr rb_reader, auto c_array = std::dynamic_pointer_cast(batch->GetColumnByName("c")); for (auto iter = c_array->begin(); iter != c_array->end(); ++iter) { - sum_c += std::stoi(std::string((*iter).value())); + sum_c += std::stoi(std::string((*iter).has_value() ? (*iter).value() : "0")); } } @@ -186,7 +211,7 @@ void check_rb(std::shared_ptr rb_reader, auto d_array = std::dynamic_pointer_cast(batch->GetColumnByName("d")); for (auto iter = d_array->begin(); iter != d_array->end(); ++iter) { - sum_d += (*iter).value(); + sum_d += (*iter).has_value() ? 
(*iter).value() : 0; } } } @@ -198,7 +223,7 @@ void check_rb(std::shared_ptr rb_reader, if (checking_col("d", column_names)) ASSERT_EQ(expected_sum, sum_d); } -class TestRecordBatchReaderWithRanges : public ::testing::Test { +class TestRecordBatchReaderWithRanges : public testing::Test { public: void SetUp() { ASSERT_OK_AND_ASSIGN(auto buffer, WriteFullFile()); @@ -213,7 +238,7 @@ class TestRecordBatchReaderWithRanges : public ::testing::Test { arrow_reader_props.set_batch_size(10); // default 64 * 1024 parquet::arrow::FileReaderBuilder reader_builder; - auto in_file = std::make_shared<::arrow::io::BufferReader>(buffer); + const auto in_file = std::make_shared(buffer); ASSERT_OK(reader_builder.Open(in_file, /*memory_map=*/reader_properties)); reader_builder.memory_pool(pool); reader_builder.properties(arrow_reader_props); @@ -228,8 +253,8 @@ class TestRecordBatchReaderWithRanges : public ::testing::Test { }; TEST_F(TestRecordBatchReaderWithRanges, SelectSomePageForEachRG) { - std::shared_ptr<::arrow::RecordBatchReader> rb_reader; - auto row_ranges_map = std::make_shared>(); + std::shared_ptr rb_reader; + const auto row_ranges_map = std::make_shared>(); row_ranges_map->insert({0, std::make_shared(parquet::Range{0, 9})}); row_ranges_map->insert( {1, std::make_shared(parquet::Range{10, 19})}); @@ -237,7 +262,7 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectSomePageForEachRG) { {2, std::make_shared(parquet::Range{20, 29})}); row_ranges_map->insert({3, std::make_shared(parquet::Range{0, 9})}); - std::vector column_indices{0, 1, 2, 3, 4}; + const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, row_ranges_map, &rb_reader)); @@ -246,8 +271,8 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectSomePageForEachRG) { } TEST_F(TestRecordBatchReaderWithRanges, SelectAllRange) { - std::shared_ptr<::arrow::RecordBatchReader> rb_reader; - auto row_ranges_map = std::make_shared>(); + std::shared_ptr rb_reader; + 
const auto row_ranges_map = std::make_shared>(); row_ranges_map->insert( {0, std::make_shared(parquet::Range{0, 29})}); row_ranges_map->insert( @@ -256,7 +281,7 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectAllRange) { {2, std::make_shared(parquet::Range{0, 29})}); row_ranges_map->insert({3, std::make_shared(parquet::Range{0, 9})}); - std::vector column_indices{0, 1, 2, 3, 4}; + const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, row_ranges_map, &rb_reader)); @@ -265,13 +290,13 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectAllRange) { } TEST_F(TestRecordBatchReaderWithRanges, SelectEmptyRange) { - std::shared_ptr<::arrow::RecordBatchReader> rb_reader; - auto row_ranges_map = std::make_shared>(); + std::shared_ptr rb_reader; + const auto row_ranges_map = std::make_shared>(); row_ranges_map->insert( {0, std::make_shared(std::vector())}); - std::vector column_indices{0, 1, 2, 3, 4}; - auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, - row_ranges_map, &rb_reader); + const std::vector column_indices{0, 1, 2, 3, 4}; + const auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, + row_ranges_map, &rb_reader); ASSERT_NOT_OK(status); EXPECT_TRUE(status.message().find("The provided row range is invalid, keep it monotone " "and non-interleaving: []") != std::string::npos); @@ -280,14 +305,14 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectEmptyRange) { TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { // case 1: row_ranges_map contains only RG {0}, other RGs should be skipped { - std::shared_ptr<::arrow::RecordBatchReader> rb_reader; - auto row_ranges_map = std::make_shared>(); + std::shared_ptr rb_reader; + const auto row_ranges_map = std::make_shared>(); std::vector ranges; for (int64_t i = 0; i < 30; i++) { if (i % 2 == 0) ranges.push_back({i, i}); } row_ranges_map->insert({0, std::make_shared(ranges)}); - std::vector 
column_indices{0, 1, 2, 3, 4}; + const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, row_ranges_map, &rb_reader)); @@ -296,15 +321,15 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { // case 2: row_ranges_map contains only RG {0,2}, other RGs should be skipped { - std::shared_ptr<::arrow::RecordBatchReader> rb_reader; - auto row_ranges_map = std::make_shared>(); + std::shared_ptr rb_reader; + const auto row_ranges_map = std::make_shared>(); std::vector ranges; for (int64_t i = 0; i < 30; i++) { if (i % 2 == 0) ranges.push_back({i, i}); } row_ranges_map->insert({0, std::make_shared(ranges)}); row_ranges_map->insert({2, std::make_shared(ranges)}); - std::vector column_indices{0, 1, 2, 3, 4}; + const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, row_ranges_map, &rb_reader)); @@ -313,14 +338,14 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { } TEST_F(TestRecordBatchReaderWithRanges, InvalidRanges) { - std::shared_ptr<::arrow::RecordBatchReader> rb_reader; + std::shared_ptr rb_reader; { - auto row_ranges_map = std::make_shared>(); + const auto row_ranges_map = std::make_shared>(); row_ranges_map->insert( {0, std::make_shared(parquet::Range{-1, 5})}); - std::vector column_indices{0, 1, 2, 3, 4}; - auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, - row_ranges_map, &rb_reader); + const std::vector column_indices{0, 1, 2, 3, 4}; + const auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, + row_ranges_map, &rb_reader); ASSERT_NOT_OK(status); EXPECT_TRUE(status.message().find("The provided row range is invalid, keep it " "monotone and non-interleaving: [(-1, 5)]") != @@ -328,24 +353,24 @@ TEST_F(TestRecordBatchReaderWithRanges, InvalidRanges) { } { - auto row_ranges_map = std::make_shared>(); + const auto row_ranges_map = 
std::make_shared>(); row_ranges_map->insert({0, std::make_shared(std::vector{ parquet::Range{0, 4}, parquet::Range{2, 5}})}); - std::vector column_indices{0, 1, 2, 3, 4}; - auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, - row_ranges_map, &rb_reader); + const std::vector column_indices{0, 1, 2, 3, 4}; + const auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, + row_ranges_map, &rb_reader); ASSERT_NOT_OK(status); EXPECT_TRUE( status.message().find("The provided row range is invalid, keep it monotone and " "non-interleaving: [(0, 4), (2, 5)]") != std::string::npos); } { - auto row_ranges_map = std::make_shared>(); + const auto row_ranges_map = std::make_shared>(); row_ranges_map->insert( {0, std::make_shared(std::vector{parquet::Range{0, 30}})}); - std::vector column_indices{0, 1, 2, 3, 4}; - auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, - row_ranges_map, &rb_reader); + const std::vector column_indices{0, 1, 2, 3, 4}; + const auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, + row_ranges_map, &rb_reader); ASSERT_NOT_OK(status); EXPECT_TRUE(status.message().find( "The provided row range [(0, 30)] exceeds last page :[26, 29]") != @@ -384,13 +409,13 @@ TEST(TestRecordBatchReaderWithRangesBadCases, NoPageIndex) { arrow_reader_props.set_batch_size(10); // default 64 * 1024 parquet::arrow::FileReaderBuilder reader_builder; - auto in_file = std::make_shared<::arrow::io::BufferReader>(buffer); + auto in_file = std::make_shared(buffer); ASSERT_OK(reader_builder.Open(in_file, /*memory_map=*/reader_properties)); reader_builder.memory_pool(pool); reader_builder.properties(arrow_reader_props); ASSERT_OK_AND_ASSIGN(auto arrow_reader, reader_builder.Build()); - std::shared_ptr<::arrow::RecordBatchReader> rb_reader; + std::shared_ptr rb_reader; auto row_ranges_map = std::make_shared>(); row_ranges_map->insert( {0, std::make_shared(parquet::Range{0, 29})}); @@ 
-401,3 +426,52 @@ TEST(TestRecordBatchReaderWithRangesBadCases, NoPageIndex) { EXPECT_TRUE(status.message().find("Attempting to read with Ranges but Page Index is " "not found for Row Group: 0") != std::string::npos); } + +class TestRecordBatchReaderWithRangesWithNulls : public testing::Test { + public: + void SetUp() { + ASSERT_OK_AND_ASSIGN(auto buffer, WriteFullFile(true)); + + arrow::MemoryPool* pool = arrow::default_memory_pool(); + + auto reader_properties = parquet::ReaderProperties(pool); + reader_properties.set_buffer_size(4096 * 4); + reader_properties.enable_buffered_stream(); + + auto arrow_reader_props = parquet::ArrowReaderProperties(); + arrow_reader_props.set_batch_size(10); // default 64 * 1024 + + parquet::arrow::FileReaderBuilder reader_builder; + const auto in_file = std::make_shared(buffer); + ASSERT_OK(reader_builder.Open(in_file, /*memory_map=*/reader_properties)); + reader_builder.memory_pool(pool); + reader_builder.properties(arrow_reader_props); + + ASSERT_OK_AND_ASSIGN(arrow_reader, reader_builder.Build()); + } + + void TearDown() {} + + protected: + std::unique_ptr arrow_reader; +}; + +TEST_F(TestRecordBatchReaderWithRangesWithNulls, SelectOneRowSkipOneRow) { + { + std::shared_ptr rb_reader; + const auto row_ranges_map = std::make_shared>(); + std::vector ranges; + for (int64_t i = 0; i < 30; i++) { + if (i % 2 == 0) ranges.push_back({i, i}); + } + row_ranges_map->insert({0, std::make_shared(ranges)}); + row_ranges_map->insert({2, std::make_shared(ranges)}); + const std::vector column_indices{0, 1, 2, 3, 4}; + ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, + row_ranges_map, &rb_reader)); + + // 0-9 is masked as null, so the ramaining is: + // (10 + 12 + ... + 28) + (60 + 62 ... 
+ 88) = 1320 + check_rb(rb_reader, 30, 1300); + } +} \ No newline at end of file From c97ea481de6191e59aca254714bae57be38aae29 Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Thu, 23 Nov 2023 20:57:42 +0800 Subject: [PATCH 07/25] happy path pass 7 --- cpp/examples/arrow/parquet_read_write.cc | 309 ++++++++---------- cpp/src/parquet/CMakeLists.txt | 2 +- cpp/src/parquet/arrow/reader.cc | 2 - cpp/src/parquet/arrow/reader.h | 10 +- cpp/src/parquet/column_reader.cc | 3 +- cpp/src/parquet/column_reader.h | 5 +- ...ed_reader_test.cc => range_reader_test.cc} | 2 +- cpp/src/parquet/reader_test.cc | 3 - 8 files changed, 147 insertions(+), 189 deletions(-) rename cpp/src/parquet/{filtered_reader_test.cc => range_reader_test.cc} (99%) diff --git a/cpp/examples/arrow/parquet_read_write.cc b/cpp/examples/arrow/parquet_read_write.cc index fa45a34cff49..3b8b4c2212b7 100644 --- a/cpp/examples/arrow/parquet_read_write.cc +++ b/cpp/examples/arrow/parquet_read_write.cc @@ -23,201 +23,168 @@ #include "parquet/arrow/writer.h" #include -#include -arrow::Status ReadInBatches(std::string path_to_file) { - // #include "arrow/io/api.h" - // #include "arrow/parquet/arrow/reader.h" - - arrow::MemoryPool* pool = arrow::default_memory_pool(); - - // Configure general Parquet reader settings - auto reader_properties = parquet::ReaderProperties(pool); - reader_properties.set_buffer_size(4096 * 4); - reader_properties.enable_buffered_stream(); - - // Configure Arrow-specific Parquet reader settings - auto arrow_reader_props = parquet::ArrowReaderProperties(); - arrow_reader_props.set_batch_size(10); // default 64 * 1024 - - parquet::arrow::FileReaderBuilder reader_builder; - ARROW_RETURN_NOT_OK( - reader_builder.OpenFile(path_to_file, /*memory_map=*/false, reader_properties)); - reader_builder.memory_pool(pool); - reader_builder.properties(arrow_reader_props); - - std::unique_ptr arrow_reader; - ARROW_ASSIGN_OR_RAISE(arrow_reader, reader_builder.Build()); - - 
std::shared_ptr<::arrow::RecordBatchReader> rb_reader; - auto row_ranges_map = std::make_shared>(); - - std::vector ranges; - for (int64_t i = 0; i < 50; i++) { - if (i % 2 == 0) - ranges.push_back({i, i}); - } - row_ranges_map->insert({0, std::make_shared(ranges)}); - - - ARROW_RETURN_NOT_OK(arrow_reader->GetRecordBatchReader({0,1},{0,1},row_ranges_map,&rb_reader)); - - size_t total_rows = 0; - size_t total_values = 0; - for (arrow::Result> maybe_batch: *rb_reader) { - // Operate on each batch... - auto batch = maybe_batch.ValueOrDie(); - total_rows += batch->num_rows(); - std::cout << "batch size: " << batch->num_rows() << std::endl; - - auto int_array = std::dynamic_pointer_cast(batch->column(1)); - for (auto iter = int_array->begin(); iter != int_array->end(); ++iter) { - total_values += (*iter).value(); - } - } - std::cout << "total rows is : " << total_rows << std::endl; - std::cout << "total value of y is : " << total_values << std::endl; - return arrow::Status::OK(); +arrow::Status ReadFullFile(std::string path_to_file) { + // #include "arrow/io/api.h" + // #include "arrow/parquet/arrow/reader.h" + + arrow::MemoryPool* pool = arrow::default_memory_pool(); + std::shared_ptr input; + ARROW_ASSIGN_OR_RAISE(input, arrow::io::ReadableFile::Open(path_to_file)); + + // Open Parquet file reader + std::unique_ptr arrow_reader; + ARROW_RETURN_NOT_OK(parquet::arrow::OpenFile(input, pool, &arrow_reader)); + + // Read entire file as a single Arrow table + std::shared_ptr table; + ARROW_RETURN_NOT_OK(arrow_reader->ReadTable(&table)); + return arrow::Status::OK(); } +arrow::Status ReadInBatches(std::string path_to_file) { + // #include "arrow/io/api.h" + // #include "arrow/parquet/arrow/reader.h" + + arrow::MemoryPool* pool = arrow::default_memory_pool(); + + // Configure general Parquet reader settings + auto reader_properties = parquet::ReaderProperties(pool); + reader_properties.set_buffer_size(4096 * 4); + reader_properties.enable_buffered_stream(); + + // Configure 
Arrow-specific Parquet reader settings + auto arrow_reader_props = parquet::ArrowReaderProperties(); + arrow_reader_props.set_batch_size(128 * 1024); // default 64 * 1024 + + parquet::arrow::FileReaderBuilder reader_builder; + ARROW_RETURN_NOT_OK( + reader_builder.OpenFile(path_to_file, /*memory_map=*/false, reader_properties)); + reader_builder.memory_pool(pool); + reader_builder.properties(arrow_reader_props); + + std::unique_ptr arrow_reader; + ARROW_ASSIGN_OR_RAISE(arrow_reader, reader_builder.Build()); + + std::shared_ptr<::arrow::RecordBatchReader> rb_reader; + ARROW_RETURN_NOT_OK(arrow_reader->GetRecordBatchReader(&rb_reader)); + + for (arrow::Result> maybe_batch : *rb_reader) { + // Operate on each batch... + } + return arrow::Status::OK(); +} arrow::Result> GetTable() { - auto builder = arrow::Int32Builder(); - - std::shared_ptr arr_a_values; - std::shared_ptr arr_a_offsets; - std::vector a_values; - for (int i = 0; i < 100; ++i) { - for (int j = 0; j < 3; ++j) { - a_values.push_back(i); - } - } - ARROW_RETURN_NOT_OK(builder.AppendValues(a_values)); - ARROW_RETURN_NOT_OK(builder.Finish(&arr_a_values)); - std::vector offsets = arrow::internal::Iota(0, 101); - std::transform(offsets.begin(), offsets.end(), offsets.begin(), - [](int x) { return x * 3; }); - ARROW_RETURN_NOT_OK(builder.AppendValues(offsets)); - ARROW_RETURN_NOT_OK(builder.Finish(&arr_a_offsets)); - ARROW_ASSIGN_OR_RAISE(auto arr_a, - arrow::ListArray::FromArrays(*arr_a_offsets, *arr_a_values)); - - std::shared_ptr arr_b; - ARROW_RETURN_NOT_OK(builder.AppendValues(arrow::internal::Iota(0, 100))); - ARROW_RETURN_NOT_OK(builder.Finish(&arr_b)); - - auto string_builder = arrow::StringBuilder(); - std::shared_ptr arr_c; - std::vector strs; - for (size_t i = 0; i < 100; i++) { - strs.push_back("" + std::to_string(i)); - } - ARROW_RETURN_NOT_OK(string_builder.AppendValues(strs)); - ARROW_RETURN_NOT_OK(string_builder.Finish(&arr_c)); - - auto schema = arrow::schema({ - arrow::field("a", 
arrow::list(arrow::int32())), - arrow::field("b", arrow::int32()), - arrow::field("c", arrow::utf8()), - }); - - return arrow::Table::Make(schema, {arr_a, arr_b, arr_c}); + auto builder = arrow::Int32Builder(); + + std::shared_ptr arr_x; + ARROW_RETURN_NOT_OK(builder.AppendValues({1, 3, 5, 7, 1})); + ARROW_RETURN_NOT_OK(builder.Finish(&arr_x)); + + std::shared_ptr arr_y; + ARROW_RETURN_NOT_OK(builder.AppendValues({2, 4, 6, 8, 10})); + ARROW_RETURN_NOT_OK(builder.Finish(&arr_y)); + + auto schema = arrow::schema( + {arrow::field("x", arrow::int32()), arrow::field("y", arrow::int32())}); + + return arrow::Table::Make(schema, {arr_x, arr_y}); } arrow::Result> GetRBR() { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr table, GetTable()); - auto reader = std::make_shared(table); - reader->set_chunksize(10); - return reader; + ARROW_ASSIGN_OR_RAISE(std::shared_ptr table, GetTable()); + auto reader = std::make_shared(table); + reader->set_chunksize(10); + return reader; } arrow::Status WriteFullFile(std::string path_to_file) { - using parquet::ArrowWriterProperties; - using parquet::WriterProperties; + // #include "parquet/arrow/writer.h" + // #include "arrow/util/type_fwd.h" + using parquet::ArrowWriterProperties; + using parquet::WriterProperties; - ARROW_ASSIGN_OR_RAISE(std::shared_ptr table, GetTable()); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr table, GetTable()); - // Choose compression - std::shared_ptr props = - WriterProperties::Builder().max_row_group_length(50)->enable_write_page_index()->write_batch_size(13) - ->data_pagesize(1) // this will cause every batch creating a page - ->compression(arrow::Compression::SNAPPY)->build(); - std::cout << "hello" << std::endl; + // Choose compression + std::shared_ptr props = + WriterProperties::Builder().compression(arrow::Compression::SNAPPY)->build(); - // Opt to store Arrow schema for easier reads back into Arrow - std::shared_ptr arrow_props = - ArrowWriterProperties::Builder().store_schema()->build(); + // Opt to store Arrow 
schema for easier reads back into Arrow + std::shared_ptr arrow_props = + ArrowWriterProperties::Builder().store_schema()->build(); - std::shared_ptr outfile; - ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open(path_to_file)); + std::shared_ptr outfile; + ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open(path_to_file)); - ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table.get(), - arrow::default_memory_pool(), outfile, - /*chunk_size=*/100, props, arrow_props)); - return arrow::Status::OK(); + ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table.get(), + arrow::default_memory_pool(), outfile, + /*chunk_size=*/3, props, arrow_props)); + return arrow::Status::OK(); } arrow::Status WriteInBatches(std::string path_to_file) { - // #include "parquet/arrow/writer.h" - // #include "arrow/util/type_fwd.h" - using parquet::ArrowWriterProperties; - using parquet::WriterProperties; - - // Data is in RBR - std::shared_ptr batch_stream; - ARROW_ASSIGN_OR_RAISE(batch_stream, GetRBR()); - - // Choose compression - std::shared_ptr props = - WriterProperties::Builder().compression(arrow::Compression::SNAPPY)->build(); - - // Opt to store Arrow schema for easier reads back into Arrow - std::shared_ptr arrow_props = - ArrowWriterProperties::Builder().store_schema()->build(); - - // Create a writer - std::shared_ptr outfile; - ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open(path_to_file)); - std::unique_ptr writer; - ARROW_ASSIGN_OR_RAISE( - writer, parquet::arrow::FileWriter::Open(*batch_stream->schema().get(), - arrow::default_memory_pool(), outfile, - props, arrow_props)); - - // Write each batch as a row_group - for (arrow::Result> maybe_batch: *batch_stream) { - ARROW_ASSIGN_OR_RAISE(auto batch, maybe_batch); - ARROW_ASSIGN_OR_RAISE(auto table, - arrow::Table::FromRecordBatches(batch->schema(), {batch})); - ARROW_RETURN_NOT_OK(writer->WriteTable(*table.get(), batch->num_rows())); - } - - // Write file footer and close - 
ARROW_RETURN_NOT_OK(writer->Close()); - - return arrow::Status::OK(); + // #include "parquet/arrow/writer.h" + // #include "arrow/util/type_fwd.h" + using parquet::ArrowWriterProperties; + using parquet::WriterProperties; + + // Data is in RBR + std::shared_ptr batch_stream; + ARROW_ASSIGN_OR_RAISE(batch_stream, GetRBR()); + + // Choose compression + std::shared_ptr props = + WriterProperties::Builder().compression(arrow::Compression::SNAPPY)->build(); + + // Opt to store Arrow schema for easier reads back into Arrow + std::shared_ptr arrow_props = + ArrowWriterProperties::Builder().store_schema()->build(); + + // Create a writer + std::shared_ptr outfile; + ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open(path_to_file)); + std::unique_ptr writer; + ARROW_ASSIGN_OR_RAISE( + writer, parquet::arrow::FileWriter::Open(*batch_stream->schema().get(), + arrow::default_memory_pool(), outfile, + props, arrow_props)); + + // Write each batch as a row_group + for (arrow::Result> maybe_batch : *batch_stream) { + ARROW_ASSIGN_OR_RAISE(auto batch, maybe_batch); + ARROW_ASSIGN_OR_RAISE(auto table, + arrow::Table::FromRecordBatches(batch->schema(), {batch})); + ARROW_RETURN_NOT_OK(writer->WriteTable(*table.get(), batch->num_rows())); + } + + // Write file footer and close + ARROW_RETURN_NOT_OK(writer->Close()); + + return arrow::Status::OK(); } arrow::Status RunExamples(std::string path_to_file) { - ARROW_RETURN_NOT_OK(WriteFullFile(path_to_file)); - // ARROW_RETURN_NOT_OK(ReadFullFile(path_to_file)); - // ARROW_RETURN_NOT_OK(WriteInBatches(path_to_file)); - // ARROW_RETURN_NOT_OK(ReadInBatches(path_to_file)); - return arrow::Status::OK(); + ARROW_RETURN_NOT_OK(WriteFullFile(path_to_file)); + ARROW_RETURN_NOT_OK(ReadFullFile(path_to_file)); + ARROW_RETURN_NOT_OK(WriteInBatches(path_to_file)); + ARROW_RETURN_NOT_OK(ReadInBatches(path_to_file)); + return arrow::Status::OK(); } int main(int argc, char** argv) { - if (argc != 2) { - // Fake success for CI purposes. 
- return EXIT_SUCCESS; - } - - std::string path_to_file = argv[1]; - arrow::Status status = RunExamples(path_to_file); - - if (!status.ok()) { - std::cerr << "Error occurred: " << status.message() << std::endl; - return EXIT_FAILURE; - } + if (argc != 2) { + // Fake success for CI purposes. return EXIT_SUCCESS; + } + + std::string path_to_file = argv[1]; + arrow::Status status = RunExamples(path_to_file); + + if (!status.ok()) { + std::cerr << "Error occurred: " << status.message() << std::endl; + return EXIT_FAILURE; + } + return EXIT_SUCCESS; } diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 06be0da74aa6..0b947af762b2 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -354,7 +354,7 @@ add_parquet_test(reader-test level_conversion_test.cc column_scanner_test.cc reader_test.cc - filtered_reader_test.cc + range_reader_test.cc stream_reader_test.cc test_util.cc) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 93b4089ef68e..52a0d36412d7 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -19,10 +19,8 @@ #include -#include #include #include -#include #include #include #include diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h index 0fd35349b643..0cd8f298d79d 100644 --- a/cpp/src/parquet/arrow/reader.h +++ b/cpp/src/parquet/arrow/reader.h @@ -23,8 +23,8 @@ #include #include -#include "parquet/file_reader.h" #include "parquet/column_reader.h" +#include "parquet/file_reader.h" #include "parquet/platform.h" #include "parquet/properties.h" @@ -205,10 +205,10 @@ class PARQUET_EXPORT FileReader { /// /// \returns error Status if either row_group_indices or column_indices /// contains an invalid index - ::arrow::Status GetRecordBatchReader(const std::vector& row_group_indices, - const std::vector& column_indices, - const std::shared_ptr>& row_ranges_map, - std::shared_ptr<::arrow::RecordBatchReader>* out); + 
::arrow::Status GetRecordBatchReader( + const std::vector& row_group_indices, const std::vector& column_indices, + const std::shared_ptr>& row_ranges_map, + std::shared_ptr<::arrow::RecordBatchReader>* out); ::arrow::Status GetRecordBatchReader(const std::vector& row_group_indices, const std::vector& column_indices, std::shared_ptr<::arrow::RecordBatchReader>* out); diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 5187ef94aa9c..b517ee7c798e 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -1636,7 +1636,7 @@ class TypedRecordReader : public TypedColumnReaderImpl, // and there is not read-ahead for levels. int64_t skipped_records = 0; if (this->max_rep_level_ == 0 && this->max_def_level_ == 0) { - skipped_records = this->Skip(num_records); + skipped_records = this->Skip(num_records); current_rg_processed_records += skipped_records; return skipped_records; } @@ -1999,7 +1999,6 @@ class TypedRecordReader : public TypedColumnReaderImpl, while (true) { const auto advise = skipper->advise_next(current_rg_processed_records); - std::cout << "advise got after current_rg_processed_records: " << current_rg_processed_records << " is: " << advise < #include #include #include @@ -355,9 +354,7 @@ class RowRanges { explicit RowRanges(const Range& range) { ranges.push_back(range); } - RowRanges(const std::vector& ranges) { - this->ranges = ranges; - } + RowRanges(const std::vector& ranges) { this->ranges = ranges; } // copy cstr RowRanges(const RowRanges& other) { ranges = other.ranges; } diff --git a/cpp/src/parquet/filtered_reader_test.cc b/cpp/src/parquet/range_reader_test.cc similarity index 99% rename from cpp/src/parquet/filtered_reader_test.cc rename to cpp/src/parquet/range_reader_test.cc index 018e2580f254..835c5e7fe1e2 100644 --- a/cpp/src/parquet/filtered_reader_test.cc +++ b/cpp/src/parquet/range_reader_test.cc @@ -147,7 +147,7 @@ arrow::Result> WriteFullFile( // { // // output to a local file 
for debugging // ARROW_ASSIGN_OR_RAISE(auto outfile, arrow::io::FileOutputStream::Open( - // "/tmp/filtered_reader_test.parquet")); + // "/tmp/range_reader_test.parquet")); // // ARROW_RETURN_NOT_OK( // parquet::arrow::WriteTable(*table.get(), arrow::default_memory_pool(), outfile, diff --git a/cpp/src/parquet/reader_test.cc b/cpp/src/parquet/reader_test.cc index a9adcdf5b9c3..0a73002846ad 100644 --- a/cpp/src/parquet/reader_test.cc +++ b/cpp/src/parquet/reader_test.cc @@ -1457,6 +1457,3 @@ TEST(PageIndexReaderTest, ReadFileWithoutPageIndex) { } } // namespace parquet - - -//TODO: TEST_P ,enable dictionary \ No newline at end of file From f11d6a87593508d8e729e1998affd3614ad187a1 Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Thu, 23 Nov 2023 21:32:05 +0800 Subject: [PATCH 08/25] happy path pass 8 --- cpp/src/parquet/column_reader.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index aee4d45ef835..40b734d33ea0 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -503,17 +503,19 @@ namespace internal { class PARQUET_EXPORT RecordSkipper { public: - RecordSkipper(RowRanges& pages, RowRanges& row_ranges_) : row_ranges(row_ranges_) { + RecordSkipper(RowRanges& pages, RowRanges& row_ranges_) + : row_ranges(row_ranges_) { // copy row_ranges RowRanges will_process_pages, skip_pages; for (auto& page : pages.getRanges()) { - if (row_ranges.isOverlapping(page)) { - // will_process_pages.add(page); - } else { + if (!row_ranges.isOverlapping(page)) { skip_pages.add(page, false); } } + + /// Since the skipped pages will be slienly skipped without updating + /// current_rg_processed_records or records_read_, we need to pre-process the row + /// ranges as if these skipped pages never existed adjust_ranges(skip_pages, row_ranges); - // adjust_ranges(skip_pages, will_process_pages); total_rows_to_process = pages.rowCount() - skip_pages.rowCount(); 
} @@ -547,16 +549,12 @@ class PARQUET_EXPORT RecordSkipper { private: /// Keep copy of ranges, because advise_next() will modify them - // RowRanges will_process_pages; RowRanges row_ranges; size_t row_range_idx = 0; size_t total_rows_to_process = 0; - /// Since the skipped pages will be slienly skipped without updating - /// current_rg_processed_records or records_read_, we need to pre-process the row ranges - /// as if these skipped pages never existed void adjust_ranges(RowRanges& skip_pages, RowRanges& to_adjust) { size_t skipped_rows = 0; auto iter = to_adjust.getRanges().begin(); From 53ea5e5d6dd2ec9a140288cd4b0a68f354ecb24e Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Wed, 29 Nov 2023 16:01:37 +0800 Subject: [PATCH 09/25] refine emtpy logic --- cpp/src/parquet/arrow/reader.cc | 37 ++++++++++++++-------------- cpp/src/parquet/column_reader.h | 9 ++++--- cpp/src/parquet/range_reader_test.cc | 26 ++++++++++++++----- 3 files changed, 44 insertions(+), 28 deletions(-) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 52a0d36412d7..10c731a6a8b9 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -469,9 +469,7 @@ struct RowRangesPageFilter { explicit RowRangesPageFilter(const RowRangesPtr& row_ranges_, const RowRangesPtr& page_ranges_) : row_ranges(row_ranges_), page_ranges(page_ranges_) { - assert(row_ranges != nullptr); assert(page_ranges != nullptr); - assert(row_ranges->getRanges().size() > 0); assert(page_ranges->getRanges().size() > 0); } @@ -568,7 +566,7 @@ class LeafReader : public ColumnReaderImpl { void checkAndGetPageRanges(const std::shared_ptr& row_ranges, std::shared_ptr& page_ranges) { // check offset exists - auto rg_pg_index_reader = + const auto rg_pg_index_reader = ctx_->reader->GetPageIndexReader()->RowGroup(input_->current_row_group()); if (!rg_pg_index_reader) { @@ -577,7 +575,7 @@ class LeafReader : public ColumnReaderImpl { "Group: " + 
std::to_string(input_->current_row_group())); } - auto offset_index = rg_pg_index_reader->GetOffsetIndex(input_->column_index()); + const auto offset_index = rg_pg_index_reader->GetOffsetIndex(input_->column_index()); if (!offset_index) { throw ParquetException( @@ -627,21 +625,24 @@ class LeafReader : public ColumnReaderImpl { // if specific row range is provided for this rg if (const auto iter = ctx_->row_ranges_map->find(input_->current_row_group()); iter != ctx_->row_ranges_map->end()) { - std::shared_ptr page_ranges; - checkAndGetPageRanges(iter->second, page_ranges); - - // part 1, skip decompressing & decoding unnecessary pages - page_reader->set_data_page_filter(RowRangesPageFilter(iter->second, page_ranges)); - - // part 2, skip unnecessary rows in necessary pages - record_reader_->set_record_skipper( - std::make_shared(*page_ranges, - *iter->second)); - } else { - // If row_ranges_map exists but no row_ranges is found for this RG, skip this RG - NextRowGroup(); - return; + if (iter->second != nullptr && iter->second->rowCount() != 0) { + std::shared_ptr page_ranges; + checkAndGetPageRanges(iter->second, page_ranges); + + // part 1, skip decompressing & decoding unnecessary pages + page_reader->set_data_page_filter( + RowRangesPageFilter(iter->second, page_ranges)); + + // part 2, skip unnecessary rows in necessary pages + record_reader_->set_record_skipper( + std::make_shared(*page_ranges, + *iter->second)); + } else { + NextRowGroup(); + return; + } } + // Else iff row_ranges_map exists but no row_ranges is found for this RG key, this RG will be read } record_reader_->reset_current_rg_processed_records(); diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 40b734d33ea0..dde78d5115c3 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -453,9 +453,7 @@ class RowRanges { } bool isValid() const { - if (ranges.size() == 0) { - return false; - } + if (ranges.size() == 0) return true; if 
(ranges[0].from < 0) { return false; } @@ -481,7 +479,10 @@ class RowRanges { std::vector& getRanges() { return ranges; } - const Range& operator[](size_t index) const { return ranges[index]; } + const Range& operator[](size_t index) const { + assert(index < ranges.size()); + return ranges[index]; + } std::string toString() const { std::string result = "["; diff --git a/cpp/src/parquet/range_reader_test.cc b/cpp/src/parquet/range_reader_test.cc index 835c5e7fe1e2..7a7c7e001bb7 100644 --- a/cpp/src/parquet/range_reader_test.cc +++ b/cpp/src/parquet/range_reader_test.cc @@ -292,18 +292,25 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectAllRange) { TEST_F(TestRecordBatchReaderWithRanges, SelectEmptyRange) { std::shared_ptr rb_reader; const auto row_ranges_map = std::make_shared>(); + // here we test four kinds of empty range: + + // rg 0 not put into map -> will read + row_ranges_map->insert({1, nullptr}); // value is nullptr -> will skip row_ranges_map->insert( - {0, std::make_shared(std::vector())}); + {2, std::make_shared( + std::vector())}); // value is empty -> will skip + row_ranges_map->insert({3, std::make_shared()}); // value is empty -> will skip + const std::vector column_indices{0, 1, 2, 3, 4}; const auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, row_ranges_map, &rb_reader); - ASSERT_NOT_OK(status); - EXPECT_TRUE(status.message().find("The provided row range is invalid, keep it monotone " - "and non-interleaving: []") != std::string::npos); + ASSERT_OK(status); + // (0+...29) = 435 + check_rb(rb_reader, 30, 435); } TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { - // case 1: row_ranges_map contains only RG {0}, other RGs should be skipped + // case 1: only care about RG 0 { std::shared_ptr rb_reader; const auto row_ranges_map = std::make_shared>(); @@ -312,6 +319,9 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { if (i % 2 == 0) ranges.push_back({i, i}); } row_ranges_map->insert({0, 
std::make_shared(ranges)}); + row_ranges_map->insert({1, nullptr}); + row_ranges_map->insert({2, nullptr}); + row_ranges_map->insert({3, nullptr}); const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, row_ranges_map, &rb_reader)); @@ -319,7 +329,7 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { check_rb(rb_reader, 15, 210); // 0 + 2 + ... + 28 = 210 } - // case 2: row_ranges_map contains only RG {0,2}, other RGs should be skipped + // case 2: care about RG 0 and 2 { std::shared_ptr rb_reader; const auto row_ranges_map = std::make_shared>(); @@ -328,7 +338,9 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { if (i % 2 == 0) ranges.push_back({i, i}); } row_ranges_map->insert({0, std::make_shared(ranges)}); + row_ranges_map->insert({1, nullptr}); row_ranges_map->insert({2, std::make_shared(ranges)}); + row_ranges_map->insert({3, nullptr}); const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, row_ranges_map, &rb_reader)); @@ -465,7 +477,9 @@ TEST_F(TestRecordBatchReaderWithRangesWithNulls, SelectOneRowSkipOneRow) { if (i % 2 == 0) ranges.push_back({i, i}); } row_ranges_map->insert({0, std::make_shared(ranges)}); + row_ranges_map->insert({1, nullptr}); row_ranges_map->insert({2, std::make_shared(ranges)}); + row_ranges_map->insert({3, nullptr}); const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, row_ranges_map, &rb_reader)); From dca69af1b5b61ae93cc992dfda1631ebc4edf694 Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Thu, 30 Nov 2023 12:18:27 +0800 Subject: [PATCH 10/25] fix bug --- cpp/src/parquet/arrow/reader.cc | 17 ++++----- cpp/src/parquet/range_reader_test.cc | 57 +++++++++++++++++++++++----- 2 files changed, 55 insertions(+), 19 deletions(-) diff --git a/cpp/src/parquet/arrow/reader.cc 
b/cpp/src/parquet/arrow/reader.cc index 10c731a6a8b9..1606c60d64e3 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -476,22 +476,18 @@ struct RowRangesPageFilter { bool operator()(const DataPageStats& stats) { ++page_range_idx; - if (row_range_idx >= row_ranges->getRanges().size()) { - return true; - } - Range current_page_range = (*page_ranges)[page_range_idx]; - if (current_page_range.isBefore((*row_ranges)[row_range_idx])) { - return true; - } - while (row_range_idx < row_ranges->getRanges().size() && current_page_range.isAfter((*row_ranges)[row_range_idx])) { row_range_idx++; } - return row_range_idx >= row_ranges->getRanges().size(); + if (row_range_idx >= row_ranges->getRanges().size()) { + return true; + } + + return current_page_range.isBefore((*row_ranges)[row_range_idx]); } size_t row_range_idx = 0; @@ -642,7 +638,8 @@ class LeafReader : public ColumnReaderImpl { return; } } - // Else iff row_ranges_map exists but no row_ranges is found for this RG key, this RG will be read + // Else iff row_ranges_map exists but no row_ranges is found for this RG key, this + // RG will be read } record_reader_->reset_current_rg_processed_records(); diff --git a/cpp/src/parquet/range_reader_test.cc b/cpp/src/parquet/range_reader_test.cc index 7a7c7e001bb7..5bccaaa0c0f6 100644 --- a/cpp/src/parquet/range_reader_test.cc +++ b/cpp/src/parquet/range_reader_test.cc @@ -30,6 +30,22 @@ #include #include +#include +#include + +std::string random_string(std::string::size_type length) { + static auto& chrs = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; + + static std::mt19937 rg{std::random_device{}()}; + static std::uniform_int_distribution pick(0, sizeof(chrs) - 2); + + std::string s; + s.reserve(length); + while (length--) s += chrs[pick(rg)]; + + return s; +} + /// The table looks like (with_nulls = false): // { // { a: {x: 0, y: 0}, b: {0, 0, 0}, c: "0", d: 0}, @@ -97,7 +113,8 @@ arrow::Result> GetTable(bool with_nulls = false) { std::vector strs; uint8_t 
valid_bytes[100]; for (size_t i = 0; i < 100; i++) { - strs.push_back(std::to_string(i)); + // add more chars to make this column unaligned with other columns' page + strs.push_back(std::to_string(i) + random_string(20)); valid_bytes[i] = flags[i]; } ARROW_RETURN_NOT_OK(string_builder.AppendValues(strs, &valid_bytes[0])); @@ -130,8 +147,9 @@ arrow::Result> WriteFullFile( WriterProperties::Builder() .max_row_group_length(30) ->enable_write_page_index() - ->write_batch_size(13) - ->data_pagesize(1) // this will cause every batch creating a page + ->disable_dictionary() + ->write_batch_size(1) + ->data_pagesize(30) // small pages ->compression(arrow::Compression::SNAPPY) ->build(); @@ -203,7 +221,9 @@ void check_rb(std::shared_ptr rb_reader, auto c_array = std::dynamic_pointer_cast(batch->GetColumnByName("c")); for (auto iter = c_array->begin(); iter != c_array->end(); ++iter) { - sum_c += std::stoi(std::string((*iter).has_value() ? (*iter).value() : "0")); + sum_c += std::stoi(std::string( + (*iter).has_value() ? 
(*iter).value().substr(0, (*iter).value().size() - 20) + : "0")); } } @@ -252,7 +272,7 @@ class TestRecordBatchReaderWithRanges : public testing::Test { std::unique_ptr arrow_reader; }; -TEST_F(TestRecordBatchReaderWithRanges, SelectSomePageForEachRG) { +TEST_F(TestRecordBatchReaderWithRanges, SelectOnePageForEachRG) { std::shared_ptr rb_reader; const auto row_ranges_map = std::make_shared>(); row_ranges_map->insert({0, std::make_shared(parquet::Range{0, 9})}); @@ -270,6 +290,24 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectSomePageForEachRG) { check_rb(rb_reader, 40, 2280); } +TEST_F(TestRecordBatchReaderWithRanges, SelectSomePageForOneRG) { + std::shared_ptr rb_reader; + const auto row_ranges_map = std::make_shared>(); + row_ranges_map->insert( + {0, std::make_shared( + std::vector{parquet::Range{0, 7}, parquet::Range{16, 23}})}); + row_ranges_map->insert({1, nullptr}); + row_ranges_map->insert({2, nullptr}); + row_ranges_map->insert({3, nullptr}); + + const std::vector column_indices{0, 1, 2, 3, 4}; + ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, + row_ranges_map, &rb_reader)); + + // (0+...+7) + (16+...+23) = 184 + check_rb(rb_reader, 16, 184); +} + TEST_F(TestRecordBatchReaderWithRanges, SelectAllRange) { std::shared_ptr rb_reader; const auto row_ranges_map = std::make_shared>(); @@ -299,7 +337,8 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectEmptyRange) { row_ranges_map->insert( {2, std::make_shared( std::vector())}); // value is empty -> will skip - row_ranges_map->insert({3, std::make_shared()}); // value is empty -> will skip + row_ranges_map->insert( + {3, std::make_shared()}); // value is empty -> will skip const std::vector column_indices{0, 1, 2, 3, 4}; const auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, @@ -384,9 +423,9 @@ TEST_F(TestRecordBatchReaderWithRanges, InvalidRanges) { const auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, row_ranges_map, 
&rb_reader); ASSERT_NOT_OK(status); - EXPECT_TRUE(status.message().find( - "The provided row range [(0, 30)] exceeds last page :[26, 29]") != - std::string::npos); + EXPECT_TRUE( + status.message().find("The provided row range [(0, 30)] exceeds last page :") != + std::string::npos); } } From 2c8b06872b3dd93a0d7bdba18aa101b279e7ff06 Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Mon, 1 Jan 2024 17:12:43 +0800 Subject: [PATCH 11/25] camel naming --- cpp/src/parquet/arrow/reader.cc | 28 +- cpp/src/parquet/column_reader.h | 1390 +++++++++++++++---------------- 2 files changed, 681 insertions(+), 737 deletions(-) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 1606c60d64e3..06e8b5bcf026 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -470,7 +470,7 @@ struct RowRangesPageFilter { const RowRangesPtr& page_ranges_) : row_ranges(row_ranges_), page_ranges(page_ranges_) { assert(page_ranges != nullptr); - assert(page_ranges->getRanges().size() > 0); + assert(page_ranges->GetRanges().size() > 0); } bool operator()(const DataPageStats& stats) { @@ -478,16 +478,16 @@ struct RowRangesPageFilter { Range current_page_range = (*page_ranges)[page_range_idx]; - while (row_range_idx < row_ranges->getRanges().size() && - current_page_range.isAfter((*row_ranges)[row_range_idx])) { + while (row_range_idx < row_ranges->GetRanges().size() && + current_page_range.IsAfter((*row_ranges)[row_range_idx])) { row_range_idx++; } - if (row_range_idx >= row_ranges->getRanges().size()) { + if (row_range_idx >= row_ranges->GetRanges().size()) { return true; } - return current_page_range.isBefore((*row_ranges)[row_range_idx]); + return current_page_range.IsBefore((*row_ranges)[row_range_idx]); } size_t row_range_idx = 0; @@ -580,32 +580,32 @@ class LeafReader : public ColumnReaderImpl { field_->name()); } - if (!row_ranges->isValid()) { + if (!row_ranges->IsValid()) { throw ParquetException( "The provided row range is invalid, 
keep it monotone and non-interleaving: " + - row_ranges->toString()); + row_ranges->ToString()); } const auto page_locations = offset_index->page_locations(); page_ranges = std::make_shared(); for (size_t i = 0; i < page_locations.size() - 1; i++) { - page_ranges->add( + page_ranges->Add( {page_locations[i].first_row_index, page_locations[i + 1].first_row_index - 1}, false); } if (page_locations.size() >= 1) { - page_ranges->add( + page_ranges->Add( {page_locations[page_locations.size() - 1].first_row_index, ctx_->reader->metadata()->RowGroup(input_->current_row_group())->num_rows() - 1}, false); } - if (row_ranges->getRanges().size() > 0) { - if ((*row_ranges).getRanges().back().to > page_ranges->getRanges().back().to) { + if (row_ranges->GetRanges().size() > 0) { + if ((*row_ranges).GetRanges().back().to > page_ranges->GetRanges().back().to) { throw ParquetException( - "The provided row range " + row_ranges->toString() + - " exceeds last page :" + page_ranges->getRanges().back().toString()); + "The provided row range " + row_ranges->ToString() + + " exceeds last page :" + page_ranges->GetRanges().back().ToString()); } } } @@ -621,7 +621,7 @@ class LeafReader : public ColumnReaderImpl { // if specific row range is provided for this rg if (const auto iter = ctx_->row_ranges_map->find(input_->current_row_group()); iter != ctx_->row_ranges_map->end()) { - if (iter->second != nullptr && iter->second->rowCount() != 0) { + if (iter->second != nullptr && iter->second->RowCount() != 0) { std::shared_ptr page_ranges; checkAndGetPageRanges(iter->second, page_ranges); diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index dde78d5115c3..0c81087a3770 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -32,736 +32,680 @@ #include "parquet/types.h" namespace arrow { + class Array; + class ChunkedArray; -class Array; -class ChunkedArray; + namespace bit_util { + class BitReader; + } // namespace bit_util -namespace 
bit_util { -class BitReader; -} // namespace bit_util + namespace util { + class RleDecoder; + } // namespace util +} // namespace arrow -namespace util { -class RleDecoder; -} // namespace util +namespace parquet { + class Decryptor; + class Page; + + // 16 MB is the default maximum page header size + static constexpr uint32_t kDefaultMaxPageHeaderSize = 16 * 1024 * 1024; + + // 16 KB is the default expected page header size + static constexpr uint32_t kDefaultPageHeaderSize = 16 * 1024; + + // \brief DataPageStats stores encoded statistics and number of values/rows for + // a page. + struct PARQUET_EXPORT DataPageStats { + DataPageStats(const EncodedStatistics* encoded_statistics, int32_t num_values, + std::optional num_rows) + : encoded_statistics(encoded_statistics), + num_values(num_values), + num_rows(num_rows) { + } -} // namespace arrow + // Encoded statistics extracted from the page header. + // Nullptr if there are no statistics in the page header. + const EncodedStatistics* encoded_statistics; + // Number of values stored in the page. Filled for both V1 and V2 data pages. + // For repeated fields, this can be greater than number of rows. For + // non-repeated fields, this will be the same as the number of rows. + int32_t num_values; + // Number of rows stored in the page. std::nullopt if not available. 
+ std::optional num_rows; + }; + + class PARQUET_EXPORT LevelDecoder { + public: + LevelDecoder(); + + ~LevelDecoder(); + + // Initialize the LevelDecoder state with new data + // and return the number of bytes consumed + int SetData(Encoding::type encoding, int16_t max_level, int num_buffered_values, + const uint8_t* data, int32_t data_size); + + void SetDataV2(int32_t num_bytes, int16_t max_level, int num_buffered_values, + const uint8_t* data); + + // Decodes a batch of levels into an array and returns the number of levels decoded + int Decode(int batch_size, int16_t* levels); + + private: + int bit_width_; + int num_values_remaining_; + Encoding::type encoding_; + std::unique_ptr<::arrow::util::RleDecoder> rle_decoder_; + std::unique_ptr<::arrow::bit_util::BitReader> bit_packed_decoder_; + int16_t max_level_; + }; + + struct CryptoContext { + CryptoContext(bool start_with_dictionary_page, int16_t rg_ordinal, int16_t col_ordinal, + std::shared_ptr meta, std::shared_ptr data) + : start_decrypt_with_dictionary_page(start_with_dictionary_page), + row_group_ordinal(rg_ordinal), + column_ordinal(col_ordinal), + meta_decryptor(std::move(meta)), + data_decryptor(std::move(data)) { + } -namespace parquet { + CryptoContext() { + } + + bool start_decrypt_with_dictionary_page = false; + int16_t row_group_ordinal = -1; + int16_t column_ordinal = -1; + std::shared_ptr meta_decryptor; + std::shared_ptr data_decryptor; + }; + + // Abstract page iterator interface. 
This way, we can feed column pages to the + // ColumnReader through whatever mechanism we choose + class PARQUET_EXPORT PageReader { + using DataPageFilter = std::function; + + public: + virtual ~PageReader() = default; + + static std::unique_ptr Open( + std::shared_ptr stream, int64_t total_num_values, + Compression::type codec, bool always_compressed = false, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), + const CryptoContext* ctx = NULLPTR); + + static std::unique_ptr Open(std::shared_ptr stream, + int64_t total_num_values, + Compression::type codec, + const ReaderProperties&properties, + bool always_compressed = false, + const CryptoContext* ctx = NULLPTR); + + // If data_page_filter is present (not null), NextPage() will call the + // callback function exactly once per page in the order the pages appear in + // the column. If the callback function returns true the page will be + // skipped. The callback will be called only if the page type is DATA_PAGE or + // DATA_PAGE_V2. Dictionary pages will not be skipped. + // Caller is responsible for checking that statistics are correct using + // ApplicationVersion::HasCorrectStatistics(). + // \note API EXPERIMENTAL + void set_data_page_filter(DataPageFilter data_page_filter) { + data_page_filter_ = std::move(data_page_filter); + } + + // @returns: shared_ptr(nullptr) on EOS, std::shared_ptr + // containing new Page otherwise + // + // The returned Page may contain references that aren't guaranteed to live + // beyond the next call to NextPage(). + virtual std::shared_ptr NextPage() = 0; + + virtual void set_max_page_header_size(uint32_t size) = 0; + + protected: + // Callback that decides if we should skip a page or not. 
+ DataPageFilter data_page_filter_; + }; + + class PARQUET_EXPORT ColumnReader { + public: + virtual ~ColumnReader() = default; + + static std::shared_ptr Make( + const ColumnDescriptor* descr, std::unique_ptr pager, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); + + // Returns true if there are still values in this column. + virtual bool HasNext() = 0; + + virtual Type::type type() const = 0; + + virtual const ColumnDescriptor* descr() const = 0; + + // Get the encoding that can be exposed by this reader. If it returns + // dictionary encoding, then ReadBatchWithDictionary can be used to read data. + // + // \note API EXPERIMENTAL + virtual ExposedEncoding GetExposedEncoding() = 0; + + protected: + friend class RowGroupReader; + // Set the encoding that can be exposed by this reader. + // + // \note API EXPERIMENTAL + virtual void SetExposedEncoding(ExposedEncoding encoding) = 0; + }; + + // API to read values from a single column. This is a main client facing API. + template + class TypedColumnReader : public ColumnReader { + public: + typedef typename DType::c_type T; + + // Read a batch of repetition levels, definition levels, and values from the + // column. + // + // Since null values are not stored in the values, the number of values read + // may be less than the number of repetition and definition levels. With + // nested data this is almost certainly true. + // + // Set def_levels or rep_levels to nullptr if you want to skip reading them. + // This is only safe if you know through some other source that there are no + // undefined values. + // + // To fully exhaust a row group, you must read batches until the number of + // values read reaches the number of stored values according to the metadata. 
+ // + // This API is the same for both V1 and V2 of the DataPage + // + // @returns: actual number of levels read (see values_read for number of values read) + virtual int64_t ReadBatch(int64_t batch_size, int16_t* def_levels, int16_t* rep_levels, + T* values, int64_t* values_read) = 0; + + /// Read a batch of repetition levels, definition levels, and values from the + /// column and leave spaces for null entries on the lowest level in the values + /// buffer. + /// + /// In comparison to ReadBatch the length of repetition and definition levels + /// is the same as of the number of values read for max_definition_level == 1. + /// In the case of max_definition_level > 1, the repetition and definition + /// levels are larger than the values but the values include the null entries + /// with definition_level == (max_definition_level - 1). + /// + /// To fully exhaust a row group, you must read batches until the number of + /// values read reaches the number of stored values according to the metadata. + /// + /// @param batch_size the number of levels to read + /// @param[out] def_levels The Parquet definition levels, output has + /// the length levels_read. + /// @param[out] rep_levels The Parquet repetition levels, output has + /// the length levels_read. + /// @param[out] values The values in the lowest nested level including + /// spacing for nulls on the lowest levels; output has the length + /// values_read. + /// @param[out] valid_bits Memory allocated for a bitmap that indicates if + /// the row is null or on the maximum definition level. For performance + /// reasons the underlying buffer should be able to store 1 bit more than + /// required. If this requires an additional byte, this byte is only read + /// but never written to. + /// @param valid_bits_offset The offset in bits of the valid_bits where the + /// first relevant bit resides. + /// @param[out] levels_read The number of repetition/definition levels that were read. 
+ /// @param[out] values_read The number of values read, this includes all + /// non-null entries as well as all null-entries on the lowest level + /// (i.e. definition_level == max_definition_level - 1) + /// @param[out] null_count The number of nulls on the lowest levels. + /// (i.e. (values_read - null_count) is total number of non-null entries) + /// + /// \deprecated Since 4.0.0 + ARROW_DEPRECATED("Doesn't handle nesting correctly and unused outside of unit tests.") + virtual int64_t ReadBatchSpaced(int64_t batch_size, int16_t* def_levels, + int16_t* rep_levels, T* values, uint8_t* valid_bits, + int64_t valid_bits_offset, int64_t* levels_read, + int64_t* values_read, int64_t* null_count) = 0; + + // Skip reading values. This method will work for both repeated and + // non-repeated fields. Note that this method is skipping values and not + // records. This distinction is important for repeated fields, meaning that + // we are not skipping over the values to the next record. For example, + // consider the following two consecutive records containing one repeated field: + // {[1, 2, 3]}, {[4, 5]}. If we Skip(2), our next read value will be 3, which + // is inside the first record. + // Returns the number of values skipped. + virtual int64_t Skip(int64_t num_values_to_skip) = 0; + + // Read a batch of repetition levels, definition levels, and indices from the + // column. And read the dictionary if a dictionary page is encountered during + // reading pages. This API is similar to ReadBatch(), with ability to read + // dictionary and indices. It is only valid to call this method when the reader can + // expose dictionary encoding. (i.e., the reader's GetExposedEncoding() returns + // DICTIONARY). + // + // The dictionary is read along with the data page. When there's no data page, + // the dictionary won't be returned. + // + // @param batch_size The batch size to read + // @param[out] def_levels The Parquet definition levels. 
+ // @param[out] rep_levels The Parquet repetition levels. + // @param[out] indices The dictionary indices. + // @param[out] indices_read The number of indices read. + // @param[out] dict The pointer to dictionary values. It will return nullptr if + // there's no data page. Each column chunk only has one dictionary page. The dictionary + // is owned by the reader, so the caller is responsible for copying the dictionary + // values before the reader gets destroyed. + // @param[out] dict_len The dictionary length. It will return 0 if there's no data + // page. + // @returns: actual number of levels read (see indices_read for number of + // indices read + // + // \note API EXPERIMENTAL + virtual int64_t ReadBatchWithDictionary(int64_t batch_size, int16_t* def_levels, + int16_t* rep_levels, int32_t* indices, + int64_t* indices_read, const T** dict, + int32_t* dict_len) = 0; + }; + + struct Range { + static Range UnionRange(const Range&left, const Range&right) { + if (left.from <= right.from) { + if (left.to + 1 >= right.from) { + return {left.from, std::max(left.to, right.to)}; + } + } + else if (right.to + 1 >= left.from) { + return {right.from, std::max(left.to, right.to)}; + } + return {-1, -1}; + } + + static Range Intersection(const Range&left, const Range&right) { + if (left.from <= right.from) { + if (left.to >= right.from) { + return {right.from, std::min(left.to, right.to)}; + } + } + else if (right.to >= left.from) { + return {left.from, std::min(left.to, right.to)}; + } + return {-1, -1}; // Return a default Range object if no intersection range found + } + + Range(const int64_t from_, const int64_t to_) : from(from_), to(to_) { + assert(from <= to); + } + + size_t Count() const { return to - from + 1; } + + bool IsBefore(const Range&other) const { return to < other.from; } -class Decryptor; -class Page; - -// 16 MB is the default maximum page header size -static constexpr uint32_t kDefaultMaxPageHeaderSize = 16 * 1024 * 1024; - -// 16 KB is the default 
expected page header size -static constexpr uint32_t kDefaultPageHeaderSize = 16 * 1024; - -// \brief DataPageStats stores encoded statistics and number of values/rows for -// a page. -struct PARQUET_EXPORT DataPageStats { - DataPageStats(const EncodedStatistics* encoded_statistics, int32_t num_values, - std::optional num_rows) - : encoded_statistics(encoded_statistics), - num_values(num_values), - num_rows(num_rows) {} - - // Encoded statistics extracted from the page header. - // Nullptr if there are no statistics in the page header. - const EncodedStatistics* encoded_statistics; - // Number of values stored in the page. Filled for both V1 and V2 data pages. - // For repeated fields, this can be greater than number of rows. For - // non-repeated fields, this will be the same as the number of rows. - int32_t num_values; - // Number of rows stored in the page. std::nullopt if not available. - std::optional num_rows; -}; - -class PARQUET_EXPORT LevelDecoder { - public: - LevelDecoder(); - ~LevelDecoder(); - - // Initialize the LevelDecoder state with new data - // and return the number of bytes consumed - int SetData(Encoding::type encoding, int16_t max_level, int num_buffered_values, - const uint8_t* data, int32_t data_size); - - void SetDataV2(int32_t num_bytes, int16_t max_level, int num_buffered_values, - const uint8_t* data); - - // Decodes a batch of levels into an array and returns the number of levels decoded - int Decode(int batch_size, int16_t* levels); - - private: - int bit_width_; - int num_values_remaining_; - Encoding::type encoding_; - std::unique_ptr<::arrow::util::RleDecoder> rle_decoder_; - std::unique_ptr<::arrow::bit_util::BitReader> bit_packed_decoder_; - int16_t max_level_; -}; - -struct CryptoContext { - CryptoContext(bool start_with_dictionary_page, int16_t rg_ordinal, int16_t col_ordinal, - std::shared_ptr meta, std::shared_ptr data) - : start_decrypt_with_dictionary_page(start_with_dictionary_page), - row_group_ordinal(rg_ordinal), - 
column_ordinal(col_ordinal), - meta_decryptor(std::move(meta)), - data_decryptor(std::move(data)) {} - CryptoContext() {} - - bool start_decrypt_with_dictionary_page = false; - int16_t row_group_ordinal = -1; - int16_t column_ordinal = -1; - std::shared_ptr meta_decryptor; - std::shared_ptr data_decryptor; -}; - -// Abstract page iterator interface. This way, we can feed column pages to the -// ColumnReader through whatever mechanism we choose -class PARQUET_EXPORT PageReader { - using DataPageFilter = std::function; - - public: - virtual ~PageReader() = default; - - static std::unique_ptr Open( - std::shared_ptr stream, int64_t total_num_values, - Compression::type codec, bool always_compressed = false, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), - const CryptoContext* ctx = NULLPTR); - static std::unique_ptr Open(std::shared_ptr stream, - int64_t total_num_values, - Compression::type codec, - const ReaderProperties& properties, - bool always_compressed = false, - const CryptoContext* ctx = NULLPTR); - - // If data_page_filter is present (not null), NextPage() will call the - // callback function exactly once per page in the order the pages appear in - // the column. If the callback function returns true the page will be - // skipped. The callback will be called only if the page type is DATA_PAGE or - // DATA_PAGE_V2. Dictionary pages will not be skipped. - // Caller is responsible for checking that statistics are correct using - // ApplicationVersion::HasCorrectStatistics(). - // \note API EXPERIMENTAL - void set_data_page_filter(DataPageFilter data_page_filter) { - data_page_filter_ = std::move(data_page_filter); - } - - // @returns: shared_ptr(nullptr) on EOS, std::shared_ptr - // containing new Page otherwise - // - // The returned Page may contain references that aren't guaranteed to live - // beyond the next call to NextPage(). 
- virtual std::shared_ptr NextPage() = 0; - - virtual void set_max_page_header_size(uint32_t size) = 0; - - protected: - // Callback that decides if we should skip a page or not. - DataPageFilter data_page_filter_; -}; - -class PARQUET_EXPORT ColumnReader { - public: - virtual ~ColumnReader() = default; - - static std::shared_ptr Make( - const ColumnDescriptor* descr, std::unique_ptr pager, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); - - // Returns true if there are still values in this column. - virtual bool HasNext() = 0; - - virtual Type::type type() const = 0; - - virtual const ColumnDescriptor* descr() const = 0; - - // Get the encoding that can be exposed by this reader. If it returns - // dictionary encoding, then ReadBatchWithDictionary can be used to read data. - // - // \note API EXPERIMENTAL - virtual ExposedEncoding GetExposedEncoding() = 0; - - protected: - friend class RowGroupReader; - // Set the encoding that can be exposed by this reader. - // - // \note API EXPERIMENTAL - virtual void SetExposedEncoding(ExposedEncoding encoding) = 0; -}; - -// API to read values from a single column. This is a main client facing API. -template -class TypedColumnReader : public ColumnReader { - public: - typedef typename DType::c_type T; - - // Read a batch of repetition levels, definition levels, and values from the - // column. - // - // Since null values are not stored in the values, the number of values read - // may be less than the number of repetition and definition levels. With - // nested data this is almost certainly true. - // - // Set def_levels or rep_levels to nullptr if you want to skip reading them. - // This is only safe if you know through some other source that there are no - // undefined values. - // - // To fully exhaust a row group, you must read batches until the number of - // values read reaches the number of stored values according to the metadata. 
- // - // This API is the same for both V1 and V2 of the DataPage - // - // @returns: actual number of levels read (see values_read for number of values read) - virtual int64_t ReadBatch(int64_t batch_size, int16_t* def_levels, int16_t* rep_levels, - T* values, int64_t* values_read) = 0; - - /// Read a batch of repetition levels, definition levels, and values from the - /// column and leave spaces for null entries on the lowest level in the values - /// buffer. - /// - /// In comparison to ReadBatch the length of repetition and definition levels - /// is the same as of the number of values read for max_definition_level == 1. - /// In the case of max_definition_level > 1, the repetition and definition - /// levels are larger than the values but the values include the null entries - /// with definition_level == (max_definition_level - 1). - /// - /// To fully exhaust a row group, you must read batches until the number of - /// values read reaches the number of stored values according to the metadata. - /// - /// @param batch_size the number of levels to read - /// @param[out] def_levels The Parquet definition levels, output has - /// the length levels_read. - /// @param[out] rep_levels The Parquet repetition levels, output has - /// the length levels_read. - /// @param[out] values The values in the lowest nested level including - /// spacing for nulls on the lowest levels; output has the length - /// values_read. - /// @param[out] valid_bits Memory allocated for a bitmap that indicates if - /// the row is null or on the maximum definition level. For performance - /// reasons the underlying buffer should be able to store 1 bit more than - /// required. If this requires an additional byte, this byte is only read - /// but never written to. - /// @param valid_bits_offset The offset in bits of the valid_bits where the - /// first relevant bit resides. - /// @param[out] levels_read The number of repetition/definition levels that were read. 
- /// @param[out] values_read The number of values read, this includes all - /// non-null entries as well as all null-entries on the lowest level - /// (i.e. definition_level == max_definition_level - 1) - /// @param[out] null_count The number of nulls on the lowest levels. - /// (i.e. (values_read - null_count) is total number of non-null entries) - /// - /// \deprecated Since 4.0.0 - ARROW_DEPRECATED("Doesn't handle nesting correctly and unused outside of unit tests.") - virtual int64_t ReadBatchSpaced(int64_t batch_size, int16_t* def_levels, - int16_t* rep_levels, T* values, uint8_t* valid_bits, - int64_t valid_bits_offset, int64_t* levels_read, - int64_t* values_read, int64_t* null_count) = 0; - - // Skip reading values. This method will work for both repeated and - // non-repeated fields. Note that this method is skipping values and not - // records. This distinction is important for repeated fields, meaning that - // we are not skipping over the values to the next record. For example, - // consider the following two consecutive records containing one repeated field: - // {[1, 2, 3]}, {[4, 5]}. If we Skip(2), our next read value will be 3, which - // is inside the first record. - // Returns the number of values skipped. - virtual int64_t Skip(int64_t num_values_to_skip) = 0; - - // Read a batch of repetition levels, definition levels, and indices from the - // column. And read the dictionary if a dictionary page is encountered during - // reading pages. This API is similar to ReadBatch(), with ability to read - // dictionary and indices. It is only valid to call this method when the reader can - // expose dictionary encoding. (i.e., the reader's GetExposedEncoding() returns - // DICTIONARY). - // - // The dictionary is read along with the data page. When there's no data page, - // the dictionary won't be returned. - // - // @param batch_size The batch size to read - // @param[out] def_levels The Parquet definition levels. 
- // @param[out] rep_levels The Parquet repetition levels. - // @param[out] indices The dictionary indices. - // @param[out] indices_read The number of indices read. - // @param[out] dict The pointer to dictionary values. It will return nullptr if - // there's no data page. Each column chunk only has one dictionary page. The dictionary - // is owned by the reader, so the caller is responsible for copying the dictionary - // values before the reader gets destroyed. - // @param[out] dict_len The dictionary length. It will return 0 if there's no data - // page. - // @returns: actual number of levels read (see indices_read for number of - // indices read - // - // \note API EXPERIMENTAL - virtual int64_t ReadBatchWithDictionary(int64_t batch_size, int16_t* def_levels, - int16_t* rep_levels, int32_t* indices, - int64_t* indices_read, const T** dict, - int32_t* dict_len) = 0; -}; - -struct Range { - static Range unionRange(const Range& left, const Range& right) { - if (left.from <= right.from) { - if (left.to + 1 >= right.from) { - return {left.from, std::max(left.to, right.to)}; - } - } else if (right.to + 1 >= left.from) { - return {right.from, std::max(left.to, right.to)}; - } - return {-1, -1}; - } - - static Range intersection(const Range& left, const Range& right) { - if (left.from <= right.from) { - if (left.to >= right.from) { - return {right.from, std::min(left.to, right.to)}; - } - } else if (right.to >= left.from) { - return {left.from, std::min(left.to, right.to)}; - } - return {-1, -1}; // Return a default Range object if no intersection range found - } - - int64_t from; - int64_t to; - - Range(const int64_t from_, const int64_t to_) : from(from_), to(to_) { - assert(from <= to); - } - - size_t count() const { return to - from + 1; } - - bool isBefore(const Range& other) const { return to < other.from; } - - bool isAfter(const Range& other) const { return from > other.to; } - - bool isOverlap(const Range& other) const { return !isBefore(other) && 
!isAfter(other); } - - std::string toString() const { - return "[" + std::to_string(from) + ", " + std::to_string(to) + "]"; - } -}; - -class RowRanges { - std::vector ranges; - - public: - RowRanges() = default; - - explicit RowRanges(const Range& range) { ranges.push_back(range); } - - RowRanges(const std::vector& ranges) { this->ranges = ranges; } - - // copy cstr - RowRanges(const RowRanges& other) { ranges = other.ranges; } - - RowRanges(RowRanges&& other) noexcept { ranges = std::move(other.ranges); } - - static RowRanges unionRanges(const RowRanges& left, const RowRanges& right) { - RowRanges result; - auto it1 = left.ranges.begin(); - auto it2 = right.ranges.begin(); - if (it2 != right.ranges.end()) { - Range range2 = *it2; - while (it1 != left.ranges.end()) { - Range range1 = *it1; - if (range1.isAfter(range2)) { - result.add(range2); - range2 = range1; - const auto tmp = it1; - it1 = it2; - it2 = tmp; - } else { - result.add(range1); + bool IsAfter(const Range&other) const { return from > other.to; } + + bool IsOverlap(const Range&other) const { return !IsBefore(other) && !IsAfter(other); } + + std::string ToString() const { + return "[" + std::to_string(from) + ", " + std::to_string(to) + "]"; + } + + int64_t from; + int64_t to; + }; + + class RowRanges { + public: + RowRanges() = default; + + explicit RowRanges(const Range&range) { ranges.push_back(range); } + + RowRanges(const std::vector&ranges) { this->ranges = ranges; } + + // copy cstr + RowRanges(const RowRanges&other) { ranges = other.ranges; } + + RowRanges(RowRanges&&other) noexcept { ranges = std::move(other.ranges); } + + void Add(const Range&range, bool merge = true) { + Range rangeToAdd = range; + if (merge) { + for (int i = static_cast(ranges.size()) - 1; i >= 0; --i) { + Range last = ranges[i]; + if (last.IsAfter(range)) { + throw ParquetException(range.ToString() + " cannot be added to " + + this->ToString()); + } + const Range u = Range::UnionRange(last, rangeToAdd); + if (u.from == -1 
&& u.to == -1) { + break; + } + rangeToAdd = u; + ranges.erase(ranges.begin() + i); + } + } + else { + if (ranges.size() > 1) + assert(rangeToAdd.from > ranges.back().to); + } + ranges.push_back(rangeToAdd); } - ++it1; - } - result.add(range2); - } else { - it2 = it1; - } - while (it2 != right.ranges.end()) { - result.add(*it2); - ++it2; - } - - return result; - } - - static RowRanges intersection(const RowRanges& left, const RowRanges& right) { - RowRanges result; - - size_t rightIndex = 0; - for (const Range& l : left.ranges) { - for (size_t i = rightIndex, n = right.ranges.size(); i < n; ++i) { - const Range& r = right.ranges[i]; - if (l.isBefore(r)) { - break; - } else if (l.isAfter(r)) { - rightIndex = i + 1; - continue; + + size_t RowCount() const { + size_t cnt = 0; + for (const Range&range: ranges) { + cnt += range.Count(); + } + return cnt; } - result.add(Range::intersection(l, r)); - } - } - - return result; - } - - RowRanges slice(const int64_t from, const int64_t to) const { - RowRanges result; - for (const Range& range : ranges) { - if (range.from >= from && range.to <= to) { - result.add(range); - } - } - return result; - } - - void add(const Range& range, bool merge = true) { - Range rangeToAdd = range; - if (merge) { - for (int i = static_cast(ranges.size()) - 1; i >= 0; --i) { - Range last = ranges[i]; - if (last.isAfter(range)) { - throw ParquetException(range.toString() + " cannot be added to " + - this->toString()); + + bool IsValid() const { + if (ranges.size() == 0) return true; + if (ranges[0].from < 0) { + return false; + } + for (size_t i = 1; i < ranges.size(); i++) { + if (ranges[i].from <= ranges[i - 1].to) { + return false; + } + } + return true; } - const Range u = Range::unionRange(last, rangeToAdd); - if (u.from == -1 && u.to == -1) { - break; + + bool IsOverlapping(int64_t from, int64_t to) const { + const Range searchRange(from, to); + return IsOverlapping(searchRange); } - rangeToAdd = u; - ranges.erase(ranges.begin() + i); - } - 
} else { - if (ranges.size() > 1) assert(rangeToAdd.from > ranges.back().to); - } - ranges.push_back(rangeToAdd); - } - - size_t rowCount() const { - size_t cnt = 0; - for (const Range& range : ranges) { - cnt += range.count(); - } - return cnt; - } - - bool isValid() const { - if (ranges.size() == 0) return true; - if (ranges[0].from < 0) { - return false; - } - for (size_t i = 1; i < ranges.size(); i++) { - if (ranges[i].from <= ranges[i - 1].to) { - return false; - } - } - return true; - } - - bool isOverlapping(int64_t from, int64_t to) const { - const Range searchRange(from, to); - return isOverlapping(searchRange); - } - - bool isOverlapping(const Range& searchRange) const { - auto it = std::lower_bound( - ranges.begin(), ranges.end(), searchRange, - [](const Range& r1, const Range& r2) { return r1.isBefore(r2); }); - return it != ranges.end() && !(*it).isAfter(searchRange); - } - - std::vector& getRanges() { return ranges; } - - const Range& operator[](size_t index) const { - assert(index < ranges.size()); - return ranges[index]; - } - - std::string toString() const { - std::string result = "["; - for (const Range& range : ranges) { - result += - "(" + std::to_string(range.from) + ", " + std::to_string(range.to) + "), "; - } - if (!ranges.empty()) { - result = result.substr(0, result.size() - 2); - } - result += "]"; - return result; - } -}; - -using RowRangesPtr = std::shared_ptr; - -namespace internal { - -class PARQUET_EXPORT RecordSkipper { - public: - RecordSkipper(RowRanges& pages, RowRanges& row_ranges_) - : row_ranges(row_ranges_) { // copy row_ranges - RowRanges will_process_pages, skip_pages; - for (auto& page : pages.getRanges()) { - if (!row_ranges.isOverlapping(page)) { - skip_pages.add(page, false); - } - } - - /// Since the skipped pages will be slienly skipped without updating - /// current_rg_processed_records or records_read_, we need to pre-process the row - /// ranges as if these skipped pages never existed - adjust_ranges(skip_pages, 
row_ranges); - - total_rows_to_process = pages.rowCount() - skip_pages.rowCount(); - } - - /// \brief Return the number of records to read or to skip - /// if return values is positive, it means to read N records - /// if return values is negative, it means to skip N records - /// if return values is 0, it means end of RG - int64_t advise_next(const int64_t current_rg_procesed) { - if (row_ranges.getRanges().size() == row_range_idx) { - return 0; - } - - if (row_ranges[row_range_idx].to < current_rg_procesed) { - row_range_idx++; - if (row_ranges.getRanges().size() == row_range_idx) { - // negative, skip the ramaining rows - return current_rg_procesed - total_rows_to_process; - } - } - - if (row_ranges[row_range_idx].from > current_rg_procesed) { - // negative, skip - return current_rg_procesed - row_ranges[row_range_idx].from; - } - - const auto ret = row_ranges[row_range_idx].to - current_rg_procesed + 1; - assert(ret > 0); - return ret; - } - - private: - /// Keep copy of ranges, because advise_next() will modify them - RowRanges row_ranges; - - size_t row_range_idx = 0; - - size_t total_rows_to_process = 0; - - void adjust_ranges(RowRanges& skip_pages, RowRanges& to_adjust) { - size_t skipped_rows = 0; - auto iter = to_adjust.getRanges().begin(); - auto skip_iter = skip_pages.getRanges().begin(); - while (iter != to_adjust.getRanges().end()) { - while (skip_iter != skip_pages.getRanges().end() && skip_iter->isBefore(*iter)) { - skipped_rows += skip_iter->count(); - ++skip_iter; - } - iter->from -= skipped_rows; - iter->to -= skipped_rows; - ++iter; - } - } -}; - -/// \brief Stateful column reader that delimits semantic records for both flat -/// and nested columns -/// -/// \note API EXPERIMENTAL -/// \since 1.3.0 -class PARQUET_EXPORT RecordReader { - public: - /// \brief Creates a record reader. 
- /// @param descr Column descriptor - /// @param leaf_info Level info, used to determine if a column is nullable or not - /// @param pool Memory pool to use for buffering values and rep/def levels - /// @param read_dictionary True if reading directly as Arrow dictionary-encoded - /// @param read_dense_for_nullable True if reading dense and not leaving space for null - /// values - static std::shared_ptr Make( - const ColumnDescriptor* descr, LevelInfo leaf_info, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), - bool read_dictionary = false, bool read_dense_for_nullable = false); - - virtual ~RecordReader() = default; - - /// \brief Attempt to read indicated number of records from column chunk - /// Note that for repeated fields, a record may have more than one value - /// and all of them are read. If read_dense_for_nullable() it will - /// not leave any space for null values. Otherwise, it will read spaced. - /// \return number of records read - virtual int64_t ReadRecords(int64_t num_records) = 0; - - /// \brief Attempt to skip indicated number of records from column chunk. - /// Note that for repeated fields, a record may have more than one value - /// and all of them are skipped. - /// \return number of records skipped - virtual int64_t SkipRecords(int64_t num_records) = 0; - - /// \brief Pre-allocate space for data. Results in better flat read performance - virtual void Reserve(int64_t num_values) = 0; - - /// \brief Clear consumed values and repetition/definition levels as the - /// result of calling ReadRecords - /// For FLBA and ByteArray types, call GetBuilderChunks() to reset them. - virtual void Reset() = 0; - - /// \brief Transfer filled values buffer to caller. A new one will be - /// allocated in subsequent ReadRecords calls - virtual std::shared_ptr ReleaseValues() = 0; - - /// \brief Transfer filled validity bitmap buffer to caller. 
A new one will - /// be allocated in subsequent ReadRecords calls - virtual std::shared_ptr ReleaseIsValid() = 0; - - /// \brief Return true if the record reader has more internal data yet to - /// process - virtual bool HasMoreData() const = 0; - - /// \brief Advance record reader to the next row group. Must be set before - /// any records could be read/skipped. - /// \param[in] reader obtained from RowGroupReader::GetColumnPageReader - virtual void SetPageReader(std::unique_ptr reader) = 0; - - /// \brief Returns the underlying column reader's descriptor. - virtual const ColumnDescriptor* descr() const = 0; - - virtual void DebugPrintState() = 0; - - /// \brief Decoded definition levels - int16_t* def_levels() const { - return reinterpret_cast(def_levels_->mutable_data()); - } - - /// \brief Decoded repetition levels - int16_t* rep_levels() const { - return reinterpret_cast(rep_levels_->mutable_data()); - } - - /// \brief Decoded values, including nulls, if any - /// FLBA and ByteArray types do not use this array and read into their own - /// builders. - uint8_t* values() const { return values_->mutable_data(); } - - /// \brief Number of values written, including space left for nulls if any. - /// If this Reader was constructed with read_dense_for_nullable(), there is no space for - /// nulls and null_count() will be 0. There is no read-ahead/buffering for values. For - /// FLBA and ByteArray types this value reflects the values written with the last - /// ReadRecords call since those readers will reset the values after each call. - int64_t values_written() const { return values_written_; } - - /// \brief Number of definition / repetition levels (from those that have - /// been decoded) that have been consumed inside the reader. - int64_t levels_position() const { return levels_position_; } - - /// \brief Number of definition / repetition levels that have been written - /// internally in the reader. 
This may be larger than values_written() because - /// for repeated fields we need to look at the levels in advance to figure out - /// the record boundaries. - int64_t levels_written() const { return levels_written_; } - - /// \brief Number of nulls in the leaf that we have read so far into the - /// values vector. This is only valid when !read_dense_for_nullable(). When - /// read_dense_for_nullable() it will always be 0. - int64_t null_count() const { return null_count_; } - - /// \brief True if the leaf values are nullable - bool nullable_values() const { return nullable_values_; } - - /// \brief True if reading directly as Arrow dictionary-encoded - bool read_dictionary() const { return read_dictionary_; } - - /// \brief True if reading dense for nullable columns. - bool read_dense_for_nullable() const { return read_dense_for_nullable_; } - - void reset_current_rg_processed_records() { current_rg_processed_records = 0; } - - void set_record_skipper(std::shared_ptr skipper_) { skipper = skipper_; } - - protected: - /// \brief Indicates if we can have nullable values. Note that repeated fields - /// may or may not be nullable. - bool nullable_values_; - - bool at_record_start_; - int64_t records_read_; - - int64_t current_rg_processed_records; // counting both read and skip records - - /// \brief Stores values. These values are populated based on each ReadRecords - /// call. No extra values are buffered for the next call. SkipRecords will not - /// add any value to this buffer. - std::shared_ptr<::arrow::ResizableBuffer> values_; - /// \brief False for BYTE_ARRAY, in which case we don't allocate the values - /// buffer and we directly read into builder classes. - bool uses_values_; - - /// \brief Values that we have read into 'values_' + 'null_count_'. - int64_t values_written_; - int64_t values_capacity_; - int64_t null_count_; - - /// \brief Each bit corresponds to one element in 'values_' and specifies if it - /// is null or not null. 
Not set if read_dense_for_nullable_ is true. - std::shared_ptr<::arrow::ResizableBuffer> valid_bits_; - - /// \brief Buffer for definition levels. May contain more levels than - /// is actually read. This is because we read levels ahead to - /// figure out record boundaries for repeated fields. - /// For flat required fields, 'def_levels_' and 'rep_levels_' are not - /// populated. For non-repeated fields 'rep_levels_' is not populated. - /// 'def_levels_' and 'rep_levels_' must be of the same size if present. - std::shared_ptr<::arrow::ResizableBuffer> def_levels_; - /// \brief Buffer for repetition levels. Only populated for repeated - /// fields. - std::shared_ptr<::arrow::ResizableBuffer> rep_levels_; - - /// \brief Number of definition / repetition levels that have been written - /// internally in the reader. This may be larger than values_written() since - /// for repeated fields we need to look at the levels in advance to figure out - /// the record boundaries. - int64_t levels_written_; - /// \brief Position of the next level that should be consumed. - int64_t levels_position_; - int64_t levels_capacity_; - - bool read_dictionary_ = false; - // If true, we will not leave any space for the null values in the values_ - // vector. - bool read_dense_for_nullable_ = false; - - std::shared_ptr skipper = NULLPTR; -}; - -class BinaryRecordReader : virtual public RecordReader { - public: - virtual std::vector> GetBuilderChunks() = 0; -}; - -/// \brief Read records directly to dictionary-encoded Arrow form (int32 -/// indices). 
Only valid for BYTE_ARRAY columns -class DictionaryRecordReader : virtual public RecordReader { - public: - virtual std::shared_ptr<::arrow::ChunkedArray> GetResult() = 0; -}; - -} // namespace internal - -using BoolReader = TypedColumnReader; -using Int32Reader = TypedColumnReader; -using Int64Reader = TypedColumnReader; -using Int96Reader = TypedColumnReader; -using FloatReader = TypedColumnReader; -using DoubleReader = TypedColumnReader; -using ByteArrayReader = TypedColumnReader; -using FixedLenByteArrayReader = TypedColumnReader; - -} // namespace parquet + + bool IsOverlapping(const Range&searchRange) const { + auto it = std::lower_bound( + ranges.begin(), ranges.end(), searchRange, + [](const Range&r1, const Range&r2) { return r1.IsBefore(r2); }); + return it != ranges.end() && !(*it).IsAfter(searchRange); + } + + std::vector& GetRanges() { return ranges; } + + const Range& operator[](size_t index) const { + assert(index < ranges.size()); + return ranges[index]; + } + + std::string ToString() const { + std::string result = "["; + for (const Range&range: ranges) { + result += + "(" + std::to_string(range.from) + ", " + std::to_string(range.to) + "), "; + } + if (!ranges.empty()) { + result = result.substr(0, result.size() - 2); + } + result += "]"; + return result; + } + + private: + std::vector ranges; + }; + + using RowRangesPtr = std::shared_ptr; + + namespace internal { + class PARQUET_EXPORT RecordSkipper { + public: + RecordSkipper(RowRanges&pages, RowRanges&row_ranges_) + : row_ranges(row_ranges_) { + // copy row_ranges + RowRanges will_process_pages, skip_pages; + for (auto&page: pages.GetRanges()) { + if (!row_ranges.IsOverlapping(page)) { + skip_pages.Add(page, false); + } + } + + /// Since the skipped pages will be silently skipped without updating + /// current_rg_processed_records or records_read_, we need to pre-process the row + /// ranges as if these skipped pages never existed + adjust_ranges(skip_pages, row_ranges); + + total_rows_to_process 
= pages.RowCount() - skip_pages.RowCount(); + } + + /// \brief Return the number of records to read or to skip + /// if return value is positive, it means to read N records + /// if return value is negative, it means to skip N records + /// if return value is 0, it means end of RG + int64_t advise_next(const int64_t current_rg_procesed) { + if (row_ranges.GetRanges().size() == row_range_idx) { + return 0; + } + + if (row_ranges[row_range_idx].to < current_rg_procesed) { + row_range_idx++; + if (row_ranges.GetRanges().size() == row_range_idx) { + // negative, skip the remaining rows + return current_rg_procesed - total_rows_to_process; + } + } + + if (row_ranges[row_range_idx].from > current_rg_procesed) { + // negative, skip + return current_rg_procesed - row_ranges[row_range_idx].from; + } + + const auto ret = row_ranges[row_range_idx].to - current_rg_procesed + 1; + assert(ret > 0); + return ret; + } + + private: + void adjust_ranges(RowRanges& skip_pages, RowRanges& to_adjust) { + size_t skipped_rows = 0; + auto iter = to_adjust.GetRanges().begin(); + auto skip_iter = skip_pages.GetRanges().begin(); + while (iter != to_adjust.GetRanges().end()) { + while (skip_iter != skip_pages.GetRanges().end() && skip_iter->IsBefore(*iter)) { + skipped_rows += skip_iter->Count(); + ++skip_iter; + } + iter->from -= skipped_rows; + iter->to -= skipped_rows; + ++iter; + } + } + + /// Keep copy of ranges, because advise_next() will modify them + RowRanges row_ranges; + + size_t row_range_idx = 0; + size_t total_rows_to_process = 0; + }; + + /// \brief Stateful column reader that delimits semantic records for both flat + /// and nested columns + /// + /// \note API EXPERIMENTAL + /// \since 1.3.0 + class PARQUET_EXPORT RecordReader { + public: + /// \brief Creates a record reader. 
+ /// @param descr Column descriptor + /// @param leaf_info Level info, used to determine if a column is nullable or not + /// @param pool Memory pool to use for buffering values and rep/def levels + /// @param read_dictionary True if reading directly as Arrow dictionary-encoded + /// @param read_dense_for_nullable True if reading dense and not leaving space for null + /// values + static std::shared_ptr Make( + const ColumnDescriptor* descr, LevelInfo leaf_info, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), + bool read_dictionary = false, bool read_dense_for_nullable = false); + + virtual ~RecordReader() = default; + + /// \brief Attempt to read indicated number of records from column chunk + /// Note that for repeated fields, a record may have more than one value + /// and all of them are read. If read_dense_for_nullable() it will + /// not leave any space for null values. Otherwise, it will read spaced. + /// \return number of records read + virtual int64_t ReadRecords(int64_t num_records) = 0; + + /// \brief Attempt to skip indicated number of records from column chunk. + /// Note that for repeated fields, a record may have more than one value + /// and all of them are skipped. + /// \return number of records skipped + virtual int64_t SkipRecords(int64_t num_records) = 0; + + /// \brief Pre-allocate space for data. Results in better flat read performance + virtual void Reserve(int64_t num_values) = 0; + + /// \brief Clear consumed values and repetition/definition levels as the + /// result of calling ReadRecords + /// For FLBA and ByteArray types, call GetBuilderChunks() to reset them. + virtual void Reset() = 0; + + /// \brief Transfer filled values buffer to caller. A new one will be + /// allocated in subsequent ReadRecords calls + virtual std::shared_ptr ReleaseValues() = 0; + + /// \brief Transfer filled validity bitmap buffer to caller. 
A new one will + /// be allocated in subsequent ReadRecords calls + virtual std::shared_ptr ReleaseIsValid() = 0; + + /// \brief Return true if the record reader has more internal data yet to + /// process + virtual bool HasMoreData() const = 0; + + /// \brief Advance record reader to the next row group. Must be set before + /// any records could be read/skipped. + /// \param[in] reader obtained from RowGroupReader::GetColumnPageReader + virtual void SetPageReader(std::unique_ptr reader) = 0; + + /// \brief Returns the underlying column reader's descriptor. + virtual const ColumnDescriptor* descr() const = 0; + + virtual void DebugPrintState() = 0; + + /// \brief Decoded definition levels + int16_t* def_levels() const { + return reinterpret_cast(def_levels_->mutable_data()); + } + + /// \brief Decoded repetition levels + int16_t* rep_levels() const { + return reinterpret_cast(rep_levels_->mutable_data()); + } + + /// \brief Decoded values, including nulls, if any + /// FLBA and ByteArray types do not use this array and read into their own + /// builders. + uint8_t* values() const { return values_->mutable_data(); } + + /// \brief Number of values written, including space left for nulls if any. + /// If this Reader was constructed with read_dense_for_nullable(), there is no space for + /// nulls and null_count() will be 0. There is no read-ahead/buffering for values. For + /// FLBA and ByteArray types this value reflects the values written with the last + /// ReadRecords call since those readers will reset the values after each call. + int64_t values_written() const { return values_written_; } + + /// \brief Number of definition / repetition levels (from those that have + /// been decoded) that have been consumed inside the reader. + int64_t levels_position() const { return levels_position_; } + + /// \brief Number of definition / repetition levels that have been written + /// internally in the reader. 
This may be larger than values_written() because + /// for repeated fields we need to look at the levels in advance to figure out + /// the record boundaries. + int64_t levels_written() const { return levels_written_; } + + /// \brief Number of nulls in the leaf that we have read so far into the + /// values vector. This is only valid when !read_dense_for_nullable(). When + /// read_dense_for_nullable() it will always be 0. + int64_t null_count() const { return null_count_; } + + /// \brief True if the leaf values are nullable + bool nullable_values() const { return nullable_values_; } + + /// \brief True if reading directly as Arrow dictionary-encoded + bool read_dictionary() const { return read_dictionary_; } + + /// \brief True if reading dense for nullable columns. + bool read_dense_for_nullable() const { return read_dense_for_nullable_; } + + void reset_current_rg_processed_records() { current_rg_processed_records = 0; } + + void set_record_skipper(std::shared_ptr skipper_) { skipper = skipper_; } + + protected: + /// \brief Indicates if we can have nullable values. Note that repeated fields + /// may or may not be nullable. + bool nullable_values_; + + bool at_record_start_; + int64_t records_read_; + + int64_t current_rg_processed_records; // counting both read and skip records + + /// \brief Stores values. These values are populated based on each ReadRecords + /// call. No extra values are buffered for the next call. SkipRecords will not + /// add any value to this buffer. + std::shared_ptr<::arrow::ResizableBuffer> values_; + /// \brief False for BYTE_ARRAY, in which case we don't allocate the values + /// buffer and we directly read into builder classes. + bool uses_values_; + + /// \brief Values that we have read into 'values_' + 'null_count_'. + int64_t values_written_; + int64_t values_capacity_; + int64_t null_count_; + + /// \brief Each bit corresponds to one element in 'values_' and specifies if it + /// is null or not null. 
Not set if read_dense_for_nullable_ is true. + std::shared_ptr<::arrow::ResizableBuffer> valid_bits_; + + /// \brief Buffer for definition levels. May contain more levels than + /// is actually read. This is because we read levels ahead to + /// figure out record boundaries for repeated fields. + /// For flat required fields, 'def_levels_' and 'rep_levels_' are not + /// populated. For non-repeated fields 'rep_levels_' is not populated. + /// 'def_levels_' and 'rep_levels_' must be of the same size if present. + std::shared_ptr<::arrow::ResizableBuffer> def_levels_; + /// \brief Buffer for repetition levels. Only populated for repeated + /// fields. + std::shared_ptr<::arrow::ResizableBuffer> rep_levels_; + + /// \brief Number of definition / repetition levels that have been written + /// internally in the reader. This may be larger than values_written() since + /// for repeated fields we need to look at the levels in advance to figure out + /// the record boundaries. + int64_t levels_written_; + /// \brief Position of the next level that should be consumed. + int64_t levels_position_; + int64_t levels_capacity_; + + bool read_dictionary_ = false; + // If true, we will not leave any space for the null values in the values_ + // vector. + bool read_dense_for_nullable_ = false; + + std::shared_ptr skipper = NULLPTR; + }; + + class BinaryRecordReader : virtual public RecordReader { + public: + virtual std::vector> GetBuilderChunks() = 0; + }; + + /// \brief Read records directly to dictionary-encoded Arrow form (int32 + /// indices). 
Only valid for BYTE_ARRAY columns + class DictionaryRecordReader : virtual public RecordReader { + public: + virtual std::shared_ptr<::arrow::ChunkedArray> GetResult() = 0; + }; + } // namespace internal + + using BoolReader = TypedColumnReader; + using Int32Reader = TypedColumnReader; + using Int64Reader = TypedColumnReader; + using Int96Reader = TypedColumnReader; + using FloatReader = TypedColumnReader; + using DoubleReader = TypedColumnReader; + using ByteArrayReader = TypedColumnReader; + using FixedLenByteArrayReader = TypedColumnReader; +} // namespace parquet From 945e543164a01a3c29774966c0b291dc49684e06 Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Mon, 1 Jan 2024 18:48:04 +0800 Subject: [PATCH 12/25] simply map --- cpp/src/parquet/arrow/reader.cc | 28 ++++---- cpp/src/parquet/arrow/reader.h | 4 +- cpp/src/parquet/arrow/reader_internal.h | 2 +- cpp/src/parquet/range_reader_test.cc | 94 ++++++++++++------------- 4 files changed, 62 insertions(+), 66 deletions(-) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 06e8b5bcf026..cbca49435e29 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -207,7 +207,7 @@ class FileReaderImpl : public FileReader { Status GetFieldReader( int i, const std::shared_ptr>& included_leaves, const std::vector& row_groups, - const std::shared_ptr>& row_ranges_map, + const std::map& row_ranges_map, std::unique_ptr* out) { // Should be covered by GetRecordBatchReader checks but // manifest_.schema_fields is a separate variable so be extra careful. 
@@ -224,13 +224,13 @@ class FileReaderImpl : public FileReader { ctx->iterator_factory = SomeRowGroupsFactory(row_groups); ctx->filter_leaves = true; ctx->included_leaves = included_leaves; - ctx->row_ranges_map = row_ranges_map; + ctx->row_ranges_map = row_ranges_map; // it will be shared by all field readers, so copy instead of std::move() return GetReader(manifest_.schema_fields[i], ctx, out); } Status GetFieldReaders( const std::vector& column_indices, const std::vector& row_groups, - const std::shared_ptr>& row_ranges_map, + const std::map& row_ranges_map, std::vector>* out, std::shared_ptr<::arrow::Schema>* out_schema) { // We only need to read schema fields which have columns indicated @@ -272,7 +272,7 @@ class FileReaderImpl : public FileReader { std::vector row_groups = Iota(reader_->metadata()->num_row_groups()); std::unique_ptr reader; - RETURN_NOT_OK(GetFieldReader(i, included_leaves, row_groups, NULLPTR, &reader)); + RETURN_NOT_OK(GetFieldReader(i, included_leaves, row_groups, {}, &reader)); return ReadColumn(i, row_groups, reader.get(), out); } @@ -345,24 +345,24 @@ class FileReaderImpl : public FileReader { Status GetRecordBatchReader( const std::vector& row_group_indices, const std::vector& column_indices, - const std::shared_ptr>& row_ranges_map, + const std::map& row_ranges_map, std::unique_ptr* out) override; Status GetRecordBatchReader(const std::vector& row_group_indices, const std::vector& column_indices, std::unique_ptr* out) override { - return GetRecordBatchReader(row_group_indices, column_indices, NULLPTR, out); + return GetRecordBatchReader(row_group_indices, column_indices, {}, out); } Status GetRecordBatchReader(const std::vector& row_group_indices, std::unique_ptr* out) override { return GetRecordBatchReader(row_group_indices, - Iota(reader_->metadata()->num_columns()), NULLPTR, out); + Iota(reader_->metadata()->num_columns()), {}, out); } Status GetRecordBatchReader(std::unique_ptr* out) override { return 
GetRecordBatchReader(Iota(num_row_groups()), - Iota(reader_->metadata()->num_columns()), NULLPTR, out); + Iota(reader_->metadata()->num_columns()), {}, out); } ::arrow::Result<::arrow::AsyncGenerator>> @@ -614,13 +614,13 @@ class LeafReader : public ColumnReaderImpl { std::unique_ptr page_reader = input_->NextChunk(); /// using page index to reduce cost - if (page_reader != nullptr && ctx_->row_ranges_map) { + if (page_reader != nullptr) { // reset skipper record_reader_->set_record_skipper(NULLPTR); // if specific row range is provided for this rg - if (const auto iter = ctx_->row_ranges_map->find(input_->current_row_group()); - iter != ctx_->row_ranges_map->end()) { + if (const auto iter = ctx_->row_ranges_map.find(input_->current_row_group()); + iter != ctx_->row_ranges_map.end()) { if (iter->second != nullptr && iter->second->RowCount() != 0) { std::shared_ptr page_ranges; checkAndGetPageRanges(iter->second, page_ranges); @@ -1113,7 +1113,7 @@ Status GetReader(const SchemaField& field, const std::shared_ptr& Status FileReaderImpl::GetRecordBatchReader( const std::vector& row_groups, const std::vector& column_indices, - const std::shared_ptr>& row_ranges_map, + const std::map& row_ranges_map, std::unique_ptr* out) { RETURN_NOT_OK(BoundsCheck(row_groups, column_indices)); @@ -1384,7 +1384,7 @@ Future> FileReaderImpl::DecodeRowGroups( std::vector> readers; std::shared_ptr<::arrow::Schema> result_schema; RETURN_NOT_OK( - GetFieldReaders(column_indices, row_groups, NULLPTR, &readers, &result_schema)); + GetFieldReaders(column_indices, row_groups, {}, &readers, &result_schema)); // OptionalParallelForAsync requires an executor if (!cpu_executor) cpu_executor = ::arrow::internal::GetCpuThreadPool(); @@ -1449,7 +1449,7 @@ Status FileReader::GetRecordBatchReader(const std::vector& row_group_indice Status FileReader::GetRecordBatchReader( const std::vector& row_group_indices, const std::vector& column_indices, - const std::shared_ptr>& row_ranges_map, + const std::map& 
row_ranges_map, std::shared_ptr* out) { std::unique_ptr tmp; RETURN_NOT_OK( diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h index 0cd8f298d79d..d5b5cf54f131 100644 --- a/cpp/src/parquet/arrow/reader.h +++ b/cpp/src/parquet/arrow/reader.h @@ -190,7 +190,7 @@ class PARQUET_EXPORT FileReader { virtual ::arrow::Status GetRecordBatchReader( const std::vector& row_group_indices, const std::vector& column_indices, - const std::shared_ptr>& row_ranges_map, + const std::map& row_ranges_map, std::unique_ptr<::arrow::RecordBatchReader>* out) = 0; /// \brief Return a RecordBatchReader of row groups selected from @@ -207,7 +207,7 @@ class PARQUET_EXPORT FileReader { /// contains an invalid index ::arrow::Status GetRecordBatchReader( const std::vector& row_group_indices, const std::vector& column_indices, - const std::shared_ptr>& row_ranges_map, + const std::map& row_ranges_map, std::shared_ptr<::arrow::RecordBatchReader>* out); ::arrow::Status GetRecordBatchReader(const std::vector& row_group_indices, const std::vector& column_indices, diff --git a/cpp/src/parquet/arrow/reader_internal.h b/cpp/src/parquet/arrow/reader_internal.h index 56be0f93f414..fba583d27b06 100644 --- a/cpp/src/parquet/arrow/reader_internal.h +++ b/cpp/src/parquet/arrow/reader_internal.h @@ -113,7 +113,7 @@ struct ReaderContext { FileColumnIteratorFactory iterator_factory; bool filter_leaves; std::shared_ptr> included_leaves; - std::shared_ptr> row_ranges_map; + std::map row_ranges_map; bool IncludesLeaf(int leaf_index) const { if (this->filter_leaves) { diff --git a/cpp/src/parquet/range_reader_test.cc b/cpp/src/parquet/range_reader_test.cc index 5bccaaa0c0f6..abbbb5fa60e4 100644 --- a/cpp/src/parquet/range_reader_test.cc +++ b/cpp/src/parquet/range_reader_test.cc @@ -274,13 +274,13 @@ class TestRecordBatchReaderWithRanges : public testing::Test { TEST_F(TestRecordBatchReaderWithRanges, SelectOnePageForEachRG) { std::shared_ptr rb_reader; - const auto row_ranges_map = 
std::make_shared>(); - row_ranges_map->insert({0, std::make_shared(parquet::Range{0, 9})}); - row_ranges_map->insert( + auto row_ranges_map = std::map(); + row_ranges_map.insert({0, std::make_shared(parquet::Range{0, 9})}); + row_ranges_map.insert( {1, std::make_shared(parquet::Range{10, 19})}); - row_ranges_map->insert( + row_ranges_map.insert( {2, std::make_shared(parquet::Range{20, 29})}); - row_ranges_map->insert({3, std::make_shared(parquet::Range{0, 9})}); + row_ranges_map.insert({3, std::make_shared(parquet::Range{0, 9})}); const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, @@ -292,13 +292,13 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOnePageForEachRG) { TEST_F(TestRecordBatchReaderWithRanges, SelectSomePageForOneRG) { std::shared_ptr rb_reader; - const auto row_ranges_map = std::make_shared>(); - row_ranges_map->insert( - {0, std::make_shared( - std::vector{parquet::Range{0, 7}, parquet::Range{16, 23}})}); - row_ranges_map->insert({1, nullptr}); - row_ranges_map->insert({2, nullptr}); - row_ranges_map->insert({3, nullptr}); + auto row_ranges_map = std::map(); + row_ranges_map.insert( + {0, std::make_shared(std::vector{ + parquet::Range{0, 7}, parquet::Range{16, 23}})}); + row_ranges_map.insert({1, nullptr}); + row_ranges_map.insert({2, nullptr}); + row_ranges_map.insert({3, nullptr}); const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, @@ -310,14 +310,11 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectSomePageForOneRG) { TEST_F(TestRecordBatchReaderWithRanges, SelectAllRange) { std::shared_ptr rb_reader; - const auto row_ranges_map = std::make_shared>(); - row_ranges_map->insert( - {0, std::make_shared(parquet::Range{0, 29})}); - row_ranges_map->insert( - {1, std::make_shared(parquet::Range{0, 29})}); - row_ranges_map->insert( - {2, std::make_shared(parquet::Range{0, 29})}); - row_ranges_map->insert({3, 
std::make_shared(parquet::Range{0, 9})}); + auto row_ranges_map = std::map(); + row_ranges_map.insert({0, std::make_shared(parquet::Range{0, 29})}); + row_ranges_map.insert({1, std::make_shared(parquet::Range{0, 29})}); + row_ranges_map.insert({2, std::make_shared(parquet::Range{0, 29})}); + row_ranges_map.insert({3, std::make_shared(parquet::Range{0, 9})}); const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, @@ -329,15 +326,15 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectAllRange) { TEST_F(TestRecordBatchReaderWithRanges, SelectEmptyRange) { std::shared_ptr rb_reader; - const auto row_ranges_map = std::make_shared>(); + auto row_ranges_map = std::map(); // here we test four kinds of empty range: // rg 0 not put into map -> will read - row_ranges_map->insert({1, nullptr}); // value is nullptr -> will skip - row_ranges_map->insert( + row_ranges_map.insert({1, nullptr}); // value is nullptr -> will skip + row_ranges_map.insert( {2, std::make_shared( std::vector())}); // value is empty -> will skip - row_ranges_map->insert( + row_ranges_map.insert( {3, std::make_shared()}); // value is empty -> will skip const std::vector column_indices{0, 1, 2, 3, 4}; @@ -352,15 +349,15 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { // case 1: only care about RG 0 { std::shared_ptr rb_reader; - const auto row_ranges_map = std::make_shared>(); + auto row_ranges_map = std::map(); std::vector ranges; for (int64_t i = 0; i < 30; i++) { if (i % 2 == 0) ranges.push_back({i, i}); } - row_ranges_map->insert({0, std::make_shared(ranges)}); - row_ranges_map->insert({1, nullptr}); - row_ranges_map->insert({2, nullptr}); - row_ranges_map->insert({3, nullptr}); + row_ranges_map.insert({0, std::make_shared(ranges)}); + row_ranges_map.insert({1, nullptr}); + row_ranges_map.insert({2, nullptr}); + row_ranges_map.insert({3, nullptr}); const std::vector column_indices{0, 1, 2, 3, 4}; 
ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, row_ranges_map, &rb_reader)); @@ -371,15 +368,15 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { // case 2: care about RG 0 and 2 { std::shared_ptr rb_reader; - const auto row_ranges_map = std::make_shared>(); + auto row_ranges_map = std::map(); std::vector ranges; for (int64_t i = 0; i < 30; i++) { if (i % 2 == 0) ranges.push_back({i, i}); } - row_ranges_map->insert({0, std::make_shared(ranges)}); - row_ranges_map->insert({1, nullptr}); - row_ranges_map->insert({2, std::make_shared(ranges)}); - row_ranges_map->insert({3, nullptr}); + row_ranges_map.insert({0, std::make_shared(ranges)}); + row_ranges_map.insert({1, nullptr}); + row_ranges_map.insert({2, std::make_shared(ranges)}); + row_ranges_map.insert({3, nullptr}); const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, row_ranges_map, &rb_reader)); @@ -391,8 +388,8 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { TEST_F(TestRecordBatchReaderWithRanges, InvalidRanges) { std::shared_ptr rb_reader; { - const auto row_ranges_map = std::make_shared>(); - row_ranges_map->insert( + auto row_ranges_map = std::map(); + row_ranges_map.insert( {0, std::make_shared(parquet::Range{-1, 5})}); const std::vector column_indices{0, 1, 2, 3, 4}; const auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, @@ -404,9 +401,9 @@ TEST_F(TestRecordBatchReaderWithRanges, InvalidRanges) { } { - const auto row_ranges_map = std::make_shared>(); - row_ranges_map->insert({0, std::make_shared(std::vector{ - parquet::Range{0, 4}, parquet::Range{2, 5}})}); + auto row_ranges_map = std::map(); + row_ranges_map.insert({0, std::make_shared(std::vector{ + parquet::Range{0, 4}, parquet::Range{2, 5}})}); const std::vector column_indices{0, 1, 2, 3, 4}; const auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, 
row_ranges_map, &rb_reader); @@ -416,8 +413,8 @@ TEST_F(TestRecordBatchReaderWithRanges, InvalidRanges) { "non-interleaving: [(0, 4), (2, 5)]") != std::string::npos); } { - const auto row_ranges_map = std::make_shared>(); - row_ranges_map->insert( + auto row_ranges_map = std::map(); + row_ranges_map.insert( {0, std::make_shared(std::vector{parquet::Range{0, 30}})}); const std::vector column_indices{0, 1, 2, 3, 4}; const auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, @@ -467,9 +464,8 @@ TEST(TestRecordBatchReaderWithRangesBadCases, NoPageIndex) { ASSERT_OK_AND_ASSIGN(auto arrow_reader, reader_builder.Build()); std::shared_ptr rb_reader; - auto row_ranges_map = std::make_shared>(); - row_ranges_map->insert( - {0, std::make_shared(parquet::Range{0, 29})}); + auto row_ranges_map = std::map(); + row_ranges_map.insert({0, std::make_shared(parquet::Range{0, 29})}); std::vector column_indices{0, 1, 2, 3, 4}; auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, row_ranges_map, &rb_reader); @@ -510,15 +506,15 @@ class TestRecordBatchReaderWithRangesWithNulls : public testing::Test { TEST_F(TestRecordBatchReaderWithRangesWithNulls, SelectOneRowSkipOneRow) { { std::shared_ptr rb_reader; - const auto row_ranges_map = std::make_shared>(); + auto row_ranges_map = std::map(); std::vector ranges; for (int64_t i = 0; i < 30; i++) { if (i % 2 == 0) ranges.push_back({i, i}); } - row_ranges_map->insert({0, std::make_shared(ranges)}); - row_ranges_map->insert({1, nullptr}); - row_ranges_map->insert({2, std::make_shared(ranges)}); - row_ranges_map->insert({3, nullptr}); + row_ranges_map.insert({0, std::make_shared(ranges)}); + row_ranges_map.insert({1, nullptr}); + row_ranges_map.insert({2, std::make_shared(ranges)}); + row_ranges_map.insert({3, nullptr}); const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, row_ranges_map, &rb_reader)); From 
ed9d02b36c6e5ad5ae60b57bbfa0cd6470706b1b Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Mon, 1 Jan 2024 23:26:42 +0800 Subject: [PATCH 13/25] api refined --- cpp/src/parquet/CMakeLists.txt | 1 + cpp/src/parquet/arrow/reader.cc | 143 ++++++++++++--------- cpp/src/parquet/arrow/reader.h | 20 +-- cpp/src/parquet/arrow/reader_internal.h | 2 +- cpp/src/parquet/column_reader.h | 74 ++++++++++- cpp/src/parquet/range_reader_test.cc | 159 +++++++++--------------- cpp/src/parquet/row_range_test.cc | 102 +++++++++++++++ 7 files changed, 330 insertions(+), 171 deletions(-) create mode 100644 cpp/src/parquet/row_range_test.cc diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 0b947af762b2..9f9a7f2336aa 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -355,6 +355,7 @@ add_parquet_test(reader-test column_scanner_test.cc reader_test.cc range_reader_test.cc + row_range_test.cc stream_reader_test.cc test_util.cc) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index cbca49435e29..222493487cc4 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -207,7 +207,7 @@ class FileReaderImpl : public FileReader { Status GetFieldReader( int i, const std::shared_ptr>& included_leaves, const std::vector& row_groups, - const std::map& row_ranges_map, + const std::shared_ptr> & row_ranges_map, std::unique_ptr* out) { // Should be covered by GetRecordBatchReader checks but // manifest_.schema_fields is a separate variable so be extra careful. 
@@ -224,13 +224,13 @@ class FileReaderImpl : public FileReader { ctx->iterator_factory = SomeRowGroupsFactory(row_groups); ctx->filter_leaves = true; ctx->included_leaves = included_leaves; - ctx->row_ranges_map = row_ranges_map; // it will be shared by all field readers, so copy instead of std::move() + ctx->row_ranges_map = row_ranges_map; return GetReader(manifest_.schema_fields[i], ctx, out); } Status GetFieldReaders( const std::vector& column_indices, const std::vector& row_groups, - const std::map& row_ranges_map, + const std::shared_ptr> & row_ranges_map, std::vector>* out, std::shared_ptr<::arrow::Schema>* out_schema) { // We only need to read schema fields which have columns indicated @@ -343,25 +343,65 @@ class FileReaderImpl : public FileReader { return ReadRowGroup(i, Iota(reader_->metadata()->num_columns()), table); } - Status GetRecordBatchReader( - const std::vector& row_group_indices, const std::vector& column_indices, - const std::map& row_ranges_map, - std::unique_ptr* out) override; + // This is a internal API owned by FileReaderImpl, not exposed in FileReader + Status GetRecordBatchReaderWithRowRanges(const std::vector& row_group_indices, + const std::vector& column_indices, + const std::shared_ptr> & row_ranges_map, + std::unique_ptr* out); + + Status GetRecordBatchReader(const RowRanges& rows_to_return, + const std::vector& column_indices, + std::unique_ptr* out) override { + const auto metadata = reader_->metadata(); + // check if the row ranges are valid + if (!rows_to_return.IsValid()) { + return Status::Invalid("The provided row range is invalid, keep it monotone and non-interleaving: " + + rows_to_return.ToString()); + } + // check if the row ranges are within the row group boundaries + if (rows_to_return.RowCount() != 0 && rows_to_return.GetRanges().back().to >= metadata->num_rows()) { + return Status::Invalid("The provided row range " + rows_to_return.ToString() + + " exceeds the number of rows in the file: " + + 
std::to_string(metadata->num_rows())); + } + + std::vector split_points; + int64_t rows_so_far = 0; + for (int i = 0 ; i < metadata->num_row_groups() - 1; i++) { + rows_so_far += metadata->RowGroup(i)->num_rows(); + split_points.push_back(rows_so_far); + } + // We'll assign a RowRanges for each RG, even if it's not required to return any rows + const std::vector splits = rows_to_return.SplitAt(split_points); + // Call row_ranges_map because array index is the row group index + const std::shared_ptr> row_ranges_map = + std::make_shared>(); + rows_so_far = 0; + std::vector row_group_indices; + for (int i = 0 ; i < metadata->num_row_groups(); i++) { + row_ranges_map->push_back(splits[i].shift(-rows_so_far)); + rows_so_far += metadata->RowGroup(i)->num_rows(); + if (row_ranges_map->at(i).RowCount() > 0) + row_group_indices.push_back(i); + } + + return GetRecordBatchReaderWithRowRanges(row_group_indices, column_indices, row_ranges_map, out); + } Status GetRecordBatchReader(const std::vector& row_group_indices, const std::vector& column_indices, std::unique_ptr* out) override { - return GetRecordBatchReader(row_group_indices, column_indices, {}, out); + return GetRecordBatchReaderWithRowRanges(row_group_indices, column_indices, {}, out); } Status GetRecordBatchReader(const std::vector& row_group_indices, std::unique_ptr* out) override { - return GetRecordBatchReader(row_group_indices, + return GetRecordBatchReaderWithRowRanges(row_group_indices, Iota(reader_->metadata()->num_columns()), {}, out); } Status GetRecordBatchReader(std::unique_ptr* out) override { - return GetRecordBatchReader(Iota(num_row_groups()), + return GetRecordBatchReaderWithRowRanges(Iota(num_row_groups()), Iota(reader_->metadata()->num_columns()), {}, out); } @@ -466,7 +506,7 @@ class RowGroupReaderImpl : public RowGroupReader { // Column reader implementations struct RowRangesPageFilter { - explicit RowRangesPageFilter(const RowRangesPtr& row_ranges_, + explicit RowRangesPageFilter(const RowRanges& 
row_ranges_, const RowRangesPtr& page_ranges_) : row_ranges(row_ranges_), page_ranges(page_ranges_) { assert(page_ranges != nullptr); @@ -478,20 +518,20 @@ struct RowRangesPageFilter { Range current_page_range = (*page_ranges)[page_range_idx]; - while (row_range_idx < row_ranges->GetRanges().size() && - current_page_range.IsAfter((*row_ranges)[row_range_idx])) { + while (row_range_idx < row_ranges.GetRanges().size() && + current_page_range.IsAfter(row_ranges[row_range_idx])) { row_range_idx++; } - if (row_range_idx >= row_ranges->GetRanges().size()) { + if (row_range_idx >= row_ranges.GetRanges().size()) { return true; } - return current_page_range.IsBefore((*row_ranges)[row_range_idx]); + return current_page_range.IsBefore(row_ranges[row_range_idx]); } size_t row_range_idx = 0; - const RowRangesPtr row_ranges; + const RowRanges & row_ranges; int page_range_idx = -1; const RowRangesPtr page_ranges; @@ -559,8 +599,8 @@ class LeafReader : public ColumnReaderImpl { private: std::shared_ptr out_; - void checkAndGetPageRanges(const std::shared_ptr& row_ranges, - std::shared_ptr& page_ranges) { + void checkAndGetPageRanges(const RowRanges & row_ranges, + std::shared_ptr& page_ranges) const { // check offset exists const auto rg_pg_index_reader = ctx_->reader->GetPageIndexReader()->RowGroup(input_->current_row_group()); @@ -580,12 +620,6 @@ class LeafReader : public ColumnReaderImpl { field_->name()); } - if (!row_ranges->IsValid()) { - throw ParquetException( - "The provided row range is invalid, keep it monotone and non-interleaving: " + - row_ranges->ToString()); - } - const auto page_locations = offset_index->page_locations(); page_ranges = std::make_shared(); for (size_t i = 0; i < page_locations.size() - 1; i++) { @@ -601,10 +635,10 @@ class LeafReader : public ColumnReaderImpl { false); } - if (row_ranges->GetRanges().size() > 0) { - if ((*row_ranges).GetRanges().back().to > page_ranges->GetRanges().back().to) { + if (row_ranges.GetRanges().size() > 0) { + if 
(row_ranges.GetRanges().back().to > page_ranges->GetRanges().back().to) { throw ParquetException( - "The provided row range " + row_ranges->ToString() + + "The provided row range " + row_ranges.ToString() + " exceeds last page :" + page_ranges->GetRanges().back().ToString()); } } @@ -614,32 +648,28 @@ class LeafReader : public ColumnReaderImpl { std::unique_ptr page_reader = input_->NextChunk(); /// using page index to reduce cost - if (page_reader != nullptr) { + if (page_reader != nullptr && ctx_->row_ranges_map) { // reset skipper record_reader_->set_record_skipper(NULLPTR); - // if specific row range is provided for this rg - if (const auto iter = ctx_->row_ranges_map.find(input_->current_row_group()); - iter != ctx_->row_ranges_map.end()) { - if (iter->second != nullptr && iter->second->RowCount() != 0) { - std::shared_ptr page_ranges; - checkAndGetPageRanges(iter->second, page_ranges); - - // part 1, skip decompressing & decoding unnecessary pages - page_reader->set_data_page_filter( - RowRangesPageFilter(iter->second, page_ranges)); - - // part 2, skip unnecessary rows in necessary pages - record_reader_->set_record_skipper( - std::make_shared(*page_ranges, - *iter->second)); - } else { - NextRowGroup(); - return; - } + const auto & row_ranges = (*ctx_->row_ranges_map)[input_->current_row_group()]; + if (row_ranges.RowCount() != 0) { + // if specific row range is provided for this rg + std::shared_ptr page_ranges; + checkAndGetPageRanges(row_ranges, page_ranges); + + // part 1, skip decompressing & decoding unnecessary pages + page_reader->set_data_page_filter( + RowRangesPageFilter(row_ranges, page_ranges)); + + // part 2, skip unnecessary rows in necessary pages + record_reader_->set_record_skipper( + std::make_shared(*page_ranges, + row_ranges)); + } else { + NextRowGroup(); + return; } - // Else iff row_ranges_map exists but no row_ranges is found for this RG key, this - // RG will be read } record_reader_->reset_current_rg_processed_records(); @@ 
-1111,9 +1141,9 @@ Status GetReader(const SchemaField& field, const std::shared_ptr& } // namespace -Status FileReaderImpl::GetRecordBatchReader( +Status FileReaderImpl::GetRecordBatchReaderWithRowRanges( const std::vector& row_groups, const std::vector& column_indices, - const std::map& row_ranges_map, + const std::shared_ptr> & row_ranges_map, std::unique_ptr* out) { RETURN_NOT_OK(BoundsCheck(row_groups, column_indices)); @@ -1447,17 +1477,6 @@ Status FileReader::GetRecordBatchReader(const std::vector& row_group_indice return Status::OK(); } -Status FileReader::GetRecordBatchReader( - const std::vector& row_group_indices, const std::vector& column_indices, - const std::map& row_ranges_map, - std::shared_ptr* out) { - std::unique_ptr tmp; - RETURN_NOT_OK( - GetRecordBatchReader(row_group_indices, column_indices, row_ranges_map, &tmp)); - out->reset(tmp.release()); - return Status::OK(); -} - Status FileReader::Make(::arrow::MemoryPool* pool, std::unique_ptr reader, const ArrowReaderProperties& properties, diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h index d5b5cf54f131..807be797aad6 100644 --- a/cpp/src/parquet/arrow/reader.h +++ b/cpp/src/parquet/arrow/reader.h @@ -188,10 +188,17 @@ class PARQUET_EXPORT FileReader { const std::vector& row_group_indices, const std::vector& column_indices, std::unique_ptr<::arrow::RecordBatchReader>* out) = 0; - virtual ::arrow::Status GetRecordBatchReader( - const std::vector& row_group_indices, const std::vector& column_indices, - const std::map& row_ranges_map, - std::unique_ptr<::arrow::RecordBatchReader>* out) = 0; + /// \brief Return a RecordBatchReader of row groups selected from + /// rows_to_return, whose columns are selected by column_indices. + /// + /// Notice that rows_to_return is file based, it not only decides which row groups to read, + /// but also which rows to read in each row group. 
+ /// + /// + /// \returns error Status if either rows_to_return or column_indices + /// contains an invalid index + virtual ::arrow::Status GetRecordBatchReader(const RowRanges& rows_to_return, + const std::vector& column_indices, std::unique_ptr<::arrow::RecordBatchReader>* out) = 0; /// \brief Return a RecordBatchReader of row groups selected from /// row_group_indices, whose columns are selected by column_indices. @@ -205,10 +212,6 @@ class PARQUET_EXPORT FileReader { /// /// \returns error Status if either row_group_indices or column_indices /// contains an invalid index - ::arrow::Status GetRecordBatchReader( - const std::vector& row_group_indices, const std::vector& column_indices, - const std::map& row_ranges_map, - std::shared_ptr<::arrow::RecordBatchReader>* out); ::arrow::Status GetRecordBatchReader(const std::vector& row_group_indices, const std::vector& column_indices, std::shared_ptr<::arrow::RecordBatchReader>* out); @@ -216,6 +219,7 @@ class PARQUET_EXPORT FileReader { std::shared_ptr<::arrow::RecordBatchReader>* out); ::arrow::Status GetRecordBatchReader(std::shared_ptr<::arrow::RecordBatchReader>* out); + /// \brief Return a generator of record batches. 
/// /// The FileReader must outlive the generator, so this requires that you pass in a diff --git a/cpp/src/parquet/arrow/reader_internal.h b/cpp/src/parquet/arrow/reader_internal.h index fba583d27b06..4d98f8a7fe5c 100644 --- a/cpp/src/parquet/arrow/reader_internal.h +++ b/cpp/src/parquet/arrow/reader_internal.h @@ -113,7 +113,7 @@ struct ReaderContext { FileColumnIteratorFactory iterator_factory; bool filter_leaves; std::shared_ptr> included_leaves; - std::map row_ranges_map; + std::shared_ptr> row_ranges_map; bool IncludesLeaf(int leaf_index) const { if (this->filter_leaves) { diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 0c81087a3770..d884e0144e4b 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -343,11 +343,15 @@ namespace parquet { bool IsOverlap(const Range&other) const { return !IsBefore(other) && !IsAfter(other); } + bool IsValid() const { return from >= 0 && to >= 0 && to >= from; } + std::string ToString() const { return "[" + std::to_string(from) + ", " + std::to_string(to) + "]"; } + // inclusive int64_t from; + // inclusive int64_t to; }; @@ -364,6 +368,26 @@ namespace parquet { RowRanges(RowRanges&&other) noexcept { ranges = std::move(other.ranges); } + static RowRanges Intersection(const RowRanges& left, const RowRanges& right) { + RowRanges result; + + size_t rightIndex = 0; + for (const Range& l : left.ranges) { + for (size_t i = rightIndex, n = right.ranges.size(); i < n; ++i) { + const Range& r = right.ranges[i]; + if (l.IsBefore(r)) { + break; + } else if (l.IsAfter(r)) { + rightIndex = i + 1; + continue; + } + result.Add(Range::Intersection(l, r)); + } + } + + return result; + } + void Add(const Range&range, bool merge = true) { Range rangeToAdd = range; if (merge) { @@ -423,11 +447,57 @@ namespace parquet { std::vector& GetRanges() { return ranges; } + const std::vector& GetRanges() const { return ranges; } + + // Split the ranges into N+1 parts at the given split 
point, where N = split_points.size() + // The RowRows object itself is not modified + std::vector SplitAt(const std::vector&split_points) const { + if (split_points.size() == 0) { + return {*this}; + } + + std::vector result; + int64_t last_split_point = -1; + for (const int64_t split_point: split_points) { + if (split_point <= 0) { + throw ParquetException("Invalid split point " + std::to_string(split_point)); + } + if (split_point <= last_split_point) { + throw ParquetException("Split points must be in ascending order"); + } + last_split_point = split_point; + } + + RowRanges spaces; + for (size_t i = 0 ; i < split_points.size(); ++i) { + auto start = i == 0 ? 0 : split_points[i - 1]; + auto end = split_points[i] - 1; + spaces.Add({start, end}, false); + } + spaces.Add({split_points[split_points.size() - 1], std::numeric_limits::max()}, + false); + + for(Range space : spaces.GetRanges()) { + RowRanges intersection = RowRanges::Intersection(RowRanges(space), *this); + result.push_back(intersection); + } + + return result; + } + const Range& operator[](size_t index) const { assert(index < ranges.size()); return ranges[index]; } + RowRanges shift(const int64_t offset) const { + RowRanges result; + for (const Range&range: ranges) { + result.Add({range.from + offset, range.to + offset}); + } + return result; + } + std::string ToString() const { std::string result = "["; for (const Range&range: ranges) { @@ -450,7 +520,7 @@ namespace parquet { namespace internal { class PARQUET_EXPORT RecordSkipper { public: - RecordSkipper(RowRanges&pages, RowRanges&row_ranges_) + RecordSkipper(RowRanges&pages, const RowRanges&row_ranges_) : row_ranges(row_ranges_) { // copy row_ranges RowRanges will_process_pages, skip_pages; @@ -496,7 +566,7 @@ namespace parquet { } private: - void adjust_ranges(RowRanges& skip_pages, RowRanges& to_adjust) { + void adjust_ranges(RowRanges&skip_pages, RowRanges&to_adjust) { size_t skipped_rows = 0; auto iter = to_adjust.GetRanges().begin(); auto 
skip_iter = skip_pages.GetRanges().begin(); diff --git a/cpp/src/parquet/range_reader_test.cc b/cpp/src/parquet/range_reader_test.cc index abbbb5fa60e4..6fcc35ec4fd4 100644 --- a/cpp/src/parquet/range_reader_test.cc +++ b/cpp/src/parquet/range_reader_test.cc @@ -33,6 +33,9 @@ #include #include +using parquet::Range; +using parquet::RowRanges; + std::string random_string(std::string::size_type length) { static auto& chrs = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; @@ -181,7 +184,7 @@ bool checking_col(const std::string& col_name, column_names.end(); } -void check_rb(std::shared_ptr rb_reader, +void check_rb(std::unique_ptr rb_reader, const size_t expected_rows, const int64_t expected_sum) { const std::vector column_names = rb_reader->schema()->field_names(); @@ -272,128 +275,94 @@ class TestRecordBatchReaderWithRanges : public testing::Test { std::unique_ptr arrow_reader; }; +TEST_F(TestRecordBatchReaderWithRanges, TestRangesSplit) {} + TEST_F(TestRecordBatchReaderWithRanges, SelectOnePageForEachRG) { - std::shared_ptr rb_reader; - auto row_ranges_map = std::map(); - row_ranges_map.insert({0, std::make_shared(parquet::Range{0, 9})}); - row_ranges_map.insert( - {1, std::make_shared(parquet::Range{10, 19})}); - row_ranges_map.insert( - {2, std::make_shared(parquet::Range{20, 29})}); - row_ranges_map.insert({3, std::make_shared(parquet::Range{0, 9})}); + std::unique_ptr rb_reader; + RowRanges rows{{Range{0, 9}, Range{40, 49}, Range{80, 89}, Range{90, 99}}}; const std::vector column_indices{0, 1, 2, 3, 4}; - ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, - row_ranges_map, &rb_reader)); + ASSERT_OK(arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader)); // (0+...+9) + (40+...+49) + (80+...+89) + (90+...+99) = 2280 - check_rb(rb_reader, 40, 2280); + check_rb(std::move(rb_reader), 40, 2280); } TEST_F(TestRecordBatchReaderWithRanges, SelectSomePageForOneRG) { - std::shared_ptr rb_reader; - auto row_ranges_map = std::map(); - 
row_ranges_map.insert( - {0, std::make_shared(std::vector{ - parquet::Range{0, 7}, parquet::Range{16, 23}})}); - row_ranges_map.insert({1, nullptr}); - row_ranges_map.insert({2, nullptr}); - row_ranges_map.insert({3, nullptr}); + std::unique_ptr rb_reader; + RowRanges rows{{Range{0, 7}, Range{16, 23}}}; const std::vector column_indices{0, 1, 2, 3, 4}; - ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, - row_ranges_map, &rb_reader)); + ASSERT_OK(arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader)); // (0+...+7) + (16+...+23) = 184 - check_rb(rb_reader, 16, 184); + check_rb(std::move(rb_reader), 16, 184); } TEST_F(TestRecordBatchReaderWithRanges, SelectAllRange) { - std::shared_ptr rb_reader; - auto row_ranges_map = std::map(); - row_ranges_map.insert({0, std::make_shared(parquet::Range{0, 29})}); - row_ranges_map.insert({1, std::make_shared(parquet::Range{0, 29})}); - row_ranges_map.insert({2, std::make_shared(parquet::Range{0, 29})}); - row_ranges_map.insert({3, std::make_shared(parquet::Range{0, 9})}); + std::unique_ptr rb_reader; + RowRanges rows{{Range{0, 29}, Range{30, 59}, Range{60, 89}, Range{90, 99}}}; const std::vector column_indices{0, 1, 2, 3, 4}; - ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, - row_ranges_map, &rb_reader)); + ASSERT_OK(arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader)); // (0+...+99) = 4950 - check_rb(rb_reader, 100, 4950); + check_rb(std::move(rb_reader), 100, 4950); } TEST_F(TestRecordBatchReaderWithRanges, SelectEmptyRange) { - std::shared_ptr rb_reader; - auto row_ranges_map = std::map(); - // here we test four kinds of empty range: - - // rg 0 not put into map -> will read - row_ranges_map.insert({1, nullptr}); // value is nullptr -> will skip - row_ranges_map.insert( - {2, std::make_shared( - std::vector())}); // value is empty -> will skip - row_ranges_map.insert( - {3, std::make_shared()}); // value is empty -> will skip + std::unique_ptr 
rb_reader; + RowRanges rows{}; const std::vector column_indices{0, 1, 2, 3, 4}; - const auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, - row_ranges_map, &rb_reader); + const auto status = + arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader); ASSERT_OK(status); - // (0+...29) = 435 - check_rb(rb_reader, 30, 435); + check_rb(std::move(rb_reader), 0, 0); } TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { // case 1: only care about RG 0 { - std::shared_ptr rb_reader; - auto row_ranges_map = std::map(); + std::unique_ptr rb_reader; std::vector ranges; for (int64_t i = 0; i < 30; i++) { if (i % 2 == 0) ranges.push_back({i, i}); } - row_ranges_map.insert({0, std::make_shared(ranges)}); - row_ranges_map.insert({1, nullptr}); - row_ranges_map.insert({2, nullptr}); - row_ranges_map.insert({3, nullptr}); const std::vector column_indices{0, 1, 2, 3, 4}; - ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, - row_ranges_map, &rb_reader)); + ASSERT_OK(arrow_reader->GetRecordBatchReader(RowRanges(ranges), column_indices, + &rb_reader)); - check_rb(rb_reader, 15, 210); // 0 + 2 + ... + 28 = 210 + check_rb(std::move(rb_reader), 15, 210); // 0 + 2 + ... 
+ 28 = 210 } // case 2: care about RG 0 and 2 { - std::shared_ptr rb_reader; - auto row_ranges_map = std::map(); + std::unique_ptr rb_reader; std::vector ranges; for (int64_t i = 0; i < 30; i++) { if (i % 2 == 0) ranges.push_back({i, i}); } - row_ranges_map.insert({0, std::make_shared(ranges)}); - row_ranges_map.insert({1, nullptr}); - row_ranges_map.insert({2, std::make_shared(ranges)}); - row_ranges_map.insert({3, nullptr}); + + for (int64_t i = 60; i < 90; i++) { + if (i % 2 == 0) ranges.push_back({i, i}); + } const std::vector column_indices{0, 1, 2, 3, 4}; - ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, - row_ranges_map, &rb_reader)); + ASSERT_OK(arrow_reader->GetRecordBatchReader(RowRanges(ranges), column_indices, + &rb_reader)); - check_rb(rb_reader, 30, 1320); // (0 + 2 + ... + 28) + (60 + 62 ... + 88) = 1320 + check_rb(std::move(rb_reader), 30, + 1320); // (0 + 2 + ... + 28) + (60 + 62 ... + 88) = 1320 } } TEST_F(TestRecordBatchReaderWithRanges, InvalidRanges) { - std::shared_ptr rb_reader; + std::unique_ptr rb_reader; { - auto row_ranges_map = std::map(); - row_ranges_map.insert( - {0, std::make_shared(parquet::Range{-1, 5})}); + RowRanges rows{{Range{-1, 5}}}; const std::vector column_indices{0, 1, 2, 3, 4}; - const auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, - row_ranges_map, &rb_reader); + const auto status = + arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader); ASSERT_NOT_OK(status); EXPECT_TRUE(status.message().find("The provided row range is invalid, keep it " "monotone and non-interleaving: [(-1, 5)]") != @@ -401,28 +370,25 @@ TEST_F(TestRecordBatchReaderWithRanges, InvalidRanges) { } { - auto row_ranges_map = std::map(); - row_ranges_map.insert({0, std::make_shared(std::vector{ - parquet::Range{0, 4}, parquet::Range{2, 5}})}); + RowRanges rows{{Range{0, 4}, {2, 5}}}; const std::vector column_indices{0, 1, 2, 3, 4}; - const auto status = 
arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, - row_ranges_map, &rb_reader); + const auto status = + arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader); ASSERT_NOT_OK(status); EXPECT_TRUE( status.message().find("The provided row range is invalid, keep it monotone and " "non-interleaving: [(0, 4), (2, 5)]") != std::string::npos); } { - auto row_ranges_map = std::map(); - row_ranges_map.insert( - {0, std::make_shared(std::vector{parquet::Range{0, 30}})}); + // will treat as {0,99} + RowRanges rows{{Range{0, 100}}}; const std::vector column_indices{0, 1, 2, 3, 4}; - const auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, - row_ranges_map, &rb_reader); + const auto status = + arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader); ASSERT_NOT_OK(status); - EXPECT_TRUE( - status.message().find("The provided row range [(0, 30)] exceeds last page :") != - std::string::npos); + EXPECT_TRUE(status.message().find("The provided row range [(0, 100)] exceeds the " + "number of rows in the file: 100") != + std::string::npos); } } @@ -463,12 +429,10 @@ TEST(TestRecordBatchReaderWithRangesBadCases, NoPageIndex) { reader_builder.properties(arrow_reader_props); ASSERT_OK_AND_ASSIGN(auto arrow_reader, reader_builder.Build()); - std::shared_ptr rb_reader; - auto row_ranges_map = std::map(); - row_ranges_map.insert({0, std::make_shared(parquet::Range{0, 29})}); + std::unique_ptr rb_reader; + RowRanges rows{{Range{0, 29}}}; std::vector column_indices{0, 1, 2, 3, 4}; - auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, - row_ranges_map, &rb_reader); + auto status = arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader); ASSERT_NOT_OK(status); EXPECT_TRUE(status.message().find("Attempting to read with Ranges but Page Index is " "not found for Row Group: 0") != std::string::npos); @@ -505,22 +469,21 @@ class TestRecordBatchReaderWithRangesWithNulls : public testing::Test 
{ TEST_F(TestRecordBatchReaderWithRangesWithNulls, SelectOneRowSkipOneRow) { { - std::shared_ptr rb_reader; - auto row_ranges_map = std::map(); + std::unique_ptr rb_reader; std::vector ranges; for (int64_t i = 0; i < 30; i++) { if (i % 2 == 0) ranges.push_back({i, i}); } - row_ranges_map.insert({0, std::make_shared(ranges)}); - row_ranges_map.insert({1, nullptr}); - row_ranges_map.insert({2, std::make_shared(ranges)}); - row_ranges_map.insert({3, nullptr}); + + for (int64_t i = 60; i < 90; i++) { + if (i % 2 == 0) ranges.push_back({i, i}); + } const std::vector column_indices{0, 1, 2, 3, 4}; - ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, - row_ranges_map, &rb_reader)); + ASSERT_OK(arrow_reader->GetRecordBatchReader(RowRanges(ranges), column_indices, + &rb_reader)); // 0-9 is masked as null, so the ramaining is: // (10 + 12 + ... + 28) + (60 + 62 ... + 88) = 1320 - check_rb(rb_reader, 30, 1300); + check_rb(std::move(rb_reader), 30, 1300); } } \ No newline at end of file diff --git a/cpp/src/parquet/row_range_test.cc b/cpp/src/parquet/row_range_test.cc new file mode 100644 index 000000000000..3766df5e0fb5 --- /dev/null +++ b/cpp/src/parquet/row_range_test.cc @@ -0,0 +1,102 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +#include +#include "parquet/column_reader.h" + +using namespace parquet; + +class RowRangesTest : public ::testing::Test { + protected: + RowRanges rowRanges; +}; + +TEST_F(RowRangesTest, SplitAt_EmptySplitPoints_ReturnsOriginalRowRanges) { + rowRanges.Add(Range(0, 10)); + std::vector split_points; + + auto result = rowRanges.SplitAt(split_points); + + ASSERT_EQ(result.size(), 1); + ASSERT_EQ(result[0].GetRanges().size(), 1); + ASSERT_EQ(result[0][0].from, 0); + ASSERT_EQ(result[0][0].to, 10); +} + +TEST_F(RowRangesTest, SplitAt_SingleSplitPoint_ReturnsTwoRowRanges) { + rowRanges.Add(Range(0, 10)); + std::vector split_points = {5}; + + auto result = rowRanges.SplitAt(split_points); + + ASSERT_EQ(result.size(), 2); + ASSERT_EQ(result[0].GetRanges().size(), 1); + ASSERT_EQ(result[0][0].from, 0); + ASSERT_EQ(result[0][0].to, 4); + ASSERT_EQ(result[1].GetRanges().size(), 1); + ASSERT_EQ(result[1][0].from, 5); + ASSERT_EQ(result[1][0].to, 10); +} + +TEST_F(RowRangesTest, SplitAt_MultipleSplitPoints_ReturnsMultipleRowRanges) { + rowRanges.Add(Range(0, 10)); + std::vector split_points = {3, 7}; + + auto result = rowRanges.SplitAt(split_points); + + ASSERT_EQ(result.size(), 3); + ASSERT_EQ(result[0].GetRanges().size(), 1); + ASSERT_EQ(result[0][0].from, 0); + ASSERT_EQ(result[0][0].to, 2); + ASSERT_EQ(result[1].GetRanges().size(), 1); + ASSERT_EQ(result[1][0].from, 3); + ASSERT_EQ(result[1][0].to, 6); + ASSERT_EQ(result[2].GetRanges().size(), 1); + ASSERT_EQ(result[2][0].from, 7); + ASSERT_EQ(result[2][0].to, 10); +} + +TEST_F(RowRangesTest, SplitAt_MultipleSplitPoints_ReturnWithEmptyRowRanges) { + rowRanges.Add(Range(11, 18)); + std::vector split_points = {5, 10, 15, 20}; + + auto result = rowRanges.SplitAt(split_points); + + ASSERT_EQ(result.size(), 5); + ASSERT_EQ(result[0].GetRanges().size(), 0); + ASSERT_EQ(result[1].GetRanges().size(), 0); + 
ASSERT_EQ(result[2].GetRanges().size(), 1); + ASSERT_EQ(result[2][0].from, 11); + ASSERT_EQ(result[2][0].to, 14); + ASSERT_EQ(result[3].GetRanges().size(), 1); + ASSERT_EQ(result[3][0].from, 15); + ASSERT_EQ(result[3][0].to, 18); + ASSERT_EQ(result[4].GetRanges().size(), 0); +} + +TEST_F(RowRangesTest, SplitAt_InvalidSplitPoint_ThrowsException) { + rowRanges.Add(Range(0, 10)); + std::vector split_points = {-1}; + + ASSERT_THROW(rowRanges.SplitAt(split_points), ParquetException); +} + +TEST_F(RowRangesTest, SplitAt_UnorderedSplitPoints_ThrowsException) { + rowRanges.Add(Range(0, 10)); + std::vector split_points = {5, 3}; + + ASSERT_THROW(rowRanges.SplitAt(split_points), ParquetException); +} From 14974c096f181c6ae6be8f7f054e5282501ec8b9 Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Tue, 2 Jan 2024 00:03:34 +0800 Subject: [PATCH 14/25] clean code --- cpp/src/parquet/arrow/reader.cc | 25 ++-- cpp/src/parquet/column_reader.h | 175 +++++++++++++-------------- cpp/src/parquet/range_reader_test.cc | 22 ++-- cpp/src/parquet/row_range_test.cc | 44 +++---- 4 files changed, 129 insertions(+), 137 deletions(-) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 222493487cc4..92b746b8ad92 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -74,8 +74,7 @@ using arrow::internal::Iota; // Help reduce verbosity using ParquetReader = parquet::ParquetFileReader; -using parquet::Range; -using parquet::RowRangesPtr; +using parquet::IntervalRange; using parquet::internal::RecordReader; namespace bit_util = arrow::bit_util; @@ -359,7 +358,7 @@ class FileReaderImpl : public FileReader { rows_to_return.ToString()); } // check if the row ranges are within the row group boundaries - if (rows_to_return.RowCount() != 0 && rows_to_return.GetRanges().back().to >= metadata->num_rows()) { + if (rows_to_return.RowCount() != 0 && rows_to_return.GetRanges().back().end >= metadata->num_rows()) { return Status::Invalid("The provided 
row range " + rows_to_return.ToString() + " exceeds the number of rows in the file: " + std::to_string(metadata->num_rows())); @@ -507,16 +506,18 @@ class RowGroupReaderImpl : public RowGroupReader { struct RowRangesPageFilter { explicit RowRangesPageFilter(const RowRanges& row_ranges_, - const RowRangesPtr& page_ranges_) + const std::shared_ptr& page_ranges_) : row_ranges(row_ranges_), page_ranges(page_ranges_) { - assert(page_ranges != nullptr); - assert(page_ranges->GetRanges().size() > 0); + + if (page_ranges == nullptr || page_ranges->GetRanges().size() == 0) { + throw ParquetException("Page ranges is empty"); + } } bool operator()(const DataPageStats& stats) { ++page_range_idx; - Range current_page_range = (*page_ranges)[page_range_idx]; + IntervalRange current_page_range = (*page_ranges)[page_range_idx]; while (row_range_idx < row_ranges.GetRanges().size() && current_page_range.IsAfter(row_ranges[row_range_idx])) { @@ -534,7 +535,7 @@ struct RowRangesPageFilter { const RowRanges & row_ranges; int page_range_idx = -1; - const RowRangesPtr page_ranges; + const std::shared_ptr page_ranges; }; // Leaf reader is for primitive arrays and primitive children of nested arrays @@ -624,19 +625,17 @@ class LeafReader : public ColumnReaderImpl { page_ranges = std::make_shared(); for (size_t i = 0; i < page_locations.size() - 1; i++) { page_ranges->Add( - {page_locations[i].first_row_index, page_locations[i + 1].first_row_index - 1}, - false); + {page_locations[i].first_row_index, page_locations[i + 1].first_row_index - 1}); } if (page_locations.size() >= 1) { page_ranges->Add( {page_locations[page_locations.size() - 1].first_row_index, ctx_->reader->metadata()->RowGroup(input_->current_row_group())->num_rows() - - 1}, - false); + 1}); } if (row_ranges.GetRanges().size() > 0) { - if (row_ranges.GetRanges().back().to > page_ranges->GetRanges().back().to) { + if (row_ranges.GetRanges().back().end > page_ranges->GetRanges().back().end) { throw ParquetException( "The provided 
row range " + row_ranges.ToString() + " exceeds last page :" + page_ranges->GetRanges().back().ToString()); diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index d884e0144e4b..9b9393e4ecc2 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -17,12 +17,12 @@ #pragma once -#include #include #include #include #include +#include "page_index.h" #include "parquet/exception.h" #include "parquet/level_conversion.h" #include "parquet/metadata.h" @@ -306,64 +306,64 @@ namespace parquet { int32_t* dict_len) = 0; }; - struct Range { - static Range UnionRange(const Range&left, const Range&right) { - if (left.from <= right.from) { - if (left.to + 1 >= right.from) { - return {left.from, std::max(left.to, right.to)}; + // Represent a range to read. The range is inclusive on both ends. + struct IntervalRange { + static IntervalRange Intersection(const IntervalRange&left, const IntervalRange&right) { + if (left.start <= right.start) { + if (left.end >= right.start) { + return {right.start, std::min(left.end, right.end)}; } } - else if (right.to + 1 >= left.from) { - return {right.from, std::max(left.to, right.to)}; - } - return {-1, -1}; - } - - static Range Intersection(const Range&left, const Range&right) { - if (left.from <= right.from) { - if (left.to >= right.from) { - return {right.from, std::min(left.to, right.to)}; - } - } - else if (right.to >= left.from) { - return {left.from, std::min(left.to, right.to)}; + else if (right.end >= left.start) { + return {left.start, std::min(left.end, right.end)}; } return {-1, -1}; // Return a default Range object if no intersection range found } - Range(const int64_t from_, const int64_t to_) : from(from_), to(to_) { - assert(from <= to); + IntervalRange(const int64_t start_, const int64_t end_) : start(start_), end(end_) { + if (start > end) { + throw ParquetException("Invalid range with start: " + std::to_string(start) + + " and end: " + std::to_string(end)); + } } - 
size_t Count() const { return to - from + 1; } + size_t Count() const { return end - start + 1; } - bool IsBefore(const Range&other) const { return to < other.from; } + bool IsBefore(const IntervalRange&other) const { return end < other.start; } - bool IsAfter(const Range&other) const { return from > other.to; } + bool IsAfter(const IntervalRange&other) const { return start > other.end; } - bool IsOverlap(const Range&other) const { return !IsBefore(other) && !IsAfter(other); } + bool IsOverlap(const IntervalRange&other) const { return !IsBefore(other) && !IsAfter(other); } - bool IsValid() const { return from >= 0 && to >= 0 && to >= from; } + bool IsValid() const { return start >= 0 && end >= 0 && end >= start; } std::string ToString() const { - return "[" + std::to_string(from) + ", " + std::to_string(to) + "]"; + return "[" + std::to_string(start) + ", " + std::to_string(end) + "]"; } // inclusive - int64_t from; + int64_t start; // inclusive - int64_t to; + int64_t end; + }; + + struct BitmapRange { + int64_t offset; + // zero added to, if there are less than 64 elements left in the column. + uint64_t bitmap; }; + struct End {}; + + // Represent a set of ranges to read. The ranges are sorted and non-overlapping. 
class RowRanges { public: RowRanges() = default; - explicit RowRanges(const Range&range) { ranges.push_back(range); } + explicit RowRanges(const IntervalRange&range) { ranges.push_back(range); } - RowRanges(const std::vector&ranges) { this->ranges = ranges; } + RowRanges(const std::vector&ranges) { this->ranges = ranges; } - // copy cstr RowRanges(const RowRanges&other) { ranges = other.ranges; } RowRanges(RowRanges&&other) noexcept { ranges = std::move(other.ranges); } @@ -372,49 +372,33 @@ namespace parquet { RowRanges result; size_t rightIndex = 0; - for (const Range& l : left.ranges) { + for (const IntervalRange& l : left.ranges) { for (size_t i = rightIndex, n = right.ranges.size(); i < n; ++i) { - const Range& r = right.ranges[i]; + const IntervalRange& r = right.ranges[i]; if (l.IsBefore(r)) { break; } else if (l.IsAfter(r)) { rightIndex = i + 1; continue; } - result.Add(Range::Intersection(l, r)); + result.Add(IntervalRange::Intersection(l, r)); } } return result; } - void Add(const Range&range, bool merge = true) { - Range rangeToAdd = range; - if (merge) { - for (int i = static_cast(ranges.size()) - 1; i >= 0; --i) { - Range last = ranges[i]; - if (last.IsAfter(range)) { - throw ParquetException(range.ToString() + " cannot be added to " + - this->ToString()); - } - const Range u = Range::UnionRange(last, rangeToAdd); - if (u.from == -1 && u.to == -1) { - break; - } - rangeToAdd = u; - ranges.erase(ranges.begin() + i); - } - } - else { - if (ranges.size() > 1) - assert(rangeToAdd.from > ranges.back().to); + void Add(const IntervalRange&range) { + const IntervalRange rangeToAdd = range; + if (ranges.size() > 1 && rangeToAdd.start <= ranges.back().end) { + throw ParquetException("Ranges must be added in order"); } ranges.push_back(rangeToAdd); } size_t RowCount() const { size_t cnt = 0; - for (const Range&range: ranges) { + for (const IntervalRange&range: ranges) { cnt += range.Count(); } return cnt; @@ -422,32 +406,32 @@ namespace parquet { bool IsValid() 
const { if (ranges.size() == 0) return true; - if (ranges[0].from < 0) { + if (ranges[0].start < 0) { return false; } for (size_t i = 1; i < ranges.size(); i++) { - if (ranges[i].from <= ranges[i - 1].to) { + if (ranges[i].start <= ranges[i - 1].end) { return false; } } return true; } - bool IsOverlapping(int64_t from, int64_t to) const { - const Range searchRange(from, to); + bool IsOverlapping(int64_t start, int64_t end) const { + const IntervalRange searchRange(start, end); return IsOverlapping(searchRange); } - bool IsOverlapping(const Range&searchRange) const { + bool IsOverlapping(const IntervalRange&searchRange) const { auto it = std::lower_bound( ranges.begin(), ranges.end(), searchRange, - [](const Range&r1, const Range&r2) { return r1.IsBefore(r2); }); + [](const IntervalRange&r1, const IntervalRange&r2) { return r1.IsBefore(r2); }); return it != ranges.end() && !(*it).IsAfter(searchRange); } - std::vector& GetRanges() { return ranges; } + std::vector& GetRanges() { return ranges; } - const std::vector& GetRanges() const { return ranges; } + const std::vector& GetRanges() const { return ranges; } // Split the ranges into N+1 parts at the given split point, where N = split_points.size() // The RowRows object itself is not modified @@ -472,12 +456,11 @@ namespace parquet { for (size_t i = 0 ; i < split_points.size(); ++i) { auto start = i == 0 ? 
0 : split_points[i - 1]; auto end = split_points[i] - 1; - spaces.Add({start, end}, false); + spaces.Add({start, end}); } - spaces.Add({split_points[split_points.size() - 1], std::numeric_limits::max()}, - false); + spaces.Add({split_points[split_points.size() - 1], std::numeric_limits::max()}); - for(Range space : spaces.GetRanges()) { + for(IntervalRange space : spaces.GetRanges()) { RowRanges intersection = RowRanges::Intersection(RowRanges(space), *this); result.push_back(intersection); } @@ -485,24 +468,27 @@ namespace parquet { return result; } - const Range& operator[](size_t index) const { - assert(index < ranges.size()); + const IntervalRange& operator[](size_t index) const { + // check index + if (index >= ranges.size() || index < 0) { + throw ParquetException("Index out of range"); + } return ranges[index]; } RowRanges shift(const int64_t offset) const { RowRanges result; - for (const Range&range: ranges) { - result.Add({range.from + offset, range.to + offset}); + for (const IntervalRange&range: ranges) { + result.Add({range.start + offset, range.end + offset}); } return result; } std::string ToString() const { std::string result = "["; - for (const Range&range: ranges) { + for (const IntervalRange&range: ranges) { result += - "(" + std::to_string(range.from) + ", " + std::to_string(range.to) + "), "; + "(" + std::to_string(range.start) + ", " + std::to_string(range.end) + "), "; } if (!ranges.empty()) { result = result.substr(0, result.size() - 2); @@ -511,12 +497,20 @@ namespace parquet { return result; } + /// The following APIs are to be implemented + /// Comment out for now to pass compile + +// // Returns a vector of PageLocations that must be read all to get values for all included in this range +// virtual std::vector PageIndexesToInclude(const std::vector& all_pages) = 0; +// class Iterator { +// virtual std::variant NextRange() = 0; +// }; +// virtual std::unique_ptr NewIterator() = 0; + private: - std::vector ranges; + std::vector ranges; }; - 
using RowRangesPtr = std::shared_ptr; - namespace internal { class PARQUET_EXPORT RecordSkipper { public: @@ -526,11 +520,11 @@ namespace parquet { RowRanges will_process_pages, skip_pages; for (auto&page: pages.GetRanges()) { if (!row_ranges.IsOverlapping(page)) { - skip_pages.Add(page, false); + skip_pages.Add(page); } } - /// Since the skipped pages will be slienly skipped without updating + /// Since the skipped pages will be silently skipped without updating /// current_rg_processed_records or records_read_, we need to pre-process the row /// ranges as if these skipped pages never existed adjust_ranges(skip_pages, row_ranges); @@ -542,31 +536,30 @@ namespace parquet { /// if return values is positive, it means to read N records /// if return values is negative, it means to skip N records /// if return values is 0, it means end of RG - int64_t advise_next(const int64_t current_rg_procesed) { + int64_t advise_next(const int64_t current_rg_processed) { if (row_ranges.GetRanges().size() == row_range_idx) { return 0; } - if (row_ranges[row_range_idx].to < current_rg_procesed) { + if (row_ranges[row_range_idx].end < current_rg_processed) { row_range_idx++; if (row_ranges.GetRanges().size() == row_range_idx) { // negative, skip the ramaining rows - return current_rg_procesed - total_rows_to_process; + return current_rg_processed - total_rows_to_process; } } - if (row_ranges[row_range_idx].from > current_rg_procesed) { + if (row_ranges[row_range_idx].start > current_rg_processed) { // negative, skip - return current_rg_procesed - row_ranges[row_range_idx].from; + return current_rg_processed - row_ranges[row_range_idx].start; } - const auto ret = row_ranges[row_range_idx].to - current_rg_procesed + 1; - assert(ret > 0); + const auto ret = row_ranges[row_range_idx].end - current_rg_processed + 1; return ret; } private: - void adjust_ranges(RowRanges&skip_pages, RowRanges&to_adjust) { + void adjust_ranges(RowRanges & skip_pages, RowRanges & to_adjust) { size_t 
skipped_rows = 0; auto iter = to_adjust.GetRanges().begin(); auto skip_iter = skip_pages.GetRanges().begin(); @@ -575,13 +568,13 @@ namespace parquet { skipped_rows += skip_iter->Count(); ++skip_iter; } - iter->from -= skipped_rows; - iter->to -= skipped_rows; + iter->start -= skipped_rows; + iter->end -= skipped_rows; ++iter; } } - /// Keep copy of ranges, because advise_next() will modify them + /// Keep copy of ranges, because adjust_ranges() will modify them RowRanges row_ranges; size_t row_range_idx = 0; diff --git a/cpp/src/parquet/range_reader_test.cc b/cpp/src/parquet/range_reader_test.cc index 6fcc35ec4fd4..b3127a8e346c 100644 --- a/cpp/src/parquet/range_reader_test.cc +++ b/cpp/src/parquet/range_reader_test.cc @@ -33,7 +33,7 @@ #include #include -using parquet::Range; +using parquet::IntervalRange; using parquet::RowRanges; std::string random_string(std::string::size_type length) { @@ -279,7 +279,7 @@ TEST_F(TestRecordBatchReaderWithRanges, TestRangesSplit) {} TEST_F(TestRecordBatchReaderWithRanges, SelectOnePageForEachRG) { std::unique_ptr rb_reader; - RowRanges rows{{Range{0, 9}, Range{40, 49}, Range{80, 89}, Range{90, 99}}}; + RowRanges rows{{IntervalRange{0, 9}, IntervalRange{40, 49}, IntervalRange{80, 89}, IntervalRange{90, 99}}}; const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader)); @@ -290,7 +290,7 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOnePageForEachRG) { TEST_F(TestRecordBatchReaderWithRanges, SelectSomePageForOneRG) { std::unique_ptr rb_reader; - RowRanges rows{{Range{0, 7}, Range{16, 23}}}; + RowRanges rows{{IntervalRange{0, 7}, IntervalRange{16, 23}}}; const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader)); @@ -301,7 +301,7 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectSomePageForOneRG) { TEST_F(TestRecordBatchReaderWithRanges, SelectAllRange) { std::unique_ptr rb_reader; - RowRanges 
rows{{Range{0, 29}, Range{30, 59}, Range{60, 89}, Range{90, 99}}}; + RowRanges rows{{IntervalRange{0, 29}, IntervalRange{30, 59}, IntervalRange{60, 89}, IntervalRange{90, 99}}}; const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader)); @@ -325,7 +325,7 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { // case 1: only care about RG 0 { std::unique_ptr rb_reader; - std::vector ranges; + std::vector ranges; for (int64_t i = 0; i < 30; i++) { if (i % 2 == 0) ranges.push_back({i, i}); } @@ -339,7 +339,7 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { // case 2: care about RG 0 and 2 { std::unique_ptr rb_reader; - std::vector ranges; + std::vector ranges; for (int64_t i = 0; i < 30; i++) { if (i % 2 == 0) ranges.push_back({i, i}); } @@ -359,7 +359,7 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { TEST_F(TestRecordBatchReaderWithRanges, InvalidRanges) { std::unique_ptr rb_reader; { - RowRanges rows{{Range{-1, 5}}}; + RowRanges rows{{IntervalRange{-1, 5}}}; const std::vector column_indices{0, 1, 2, 3, 4}; const auto status = arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader); @@ -370,7 +370,7 @@ TEST_F(TestRecordBatchReaderWithRanges, InvalidRanges) { } { - RowRanges rows{{Range{0, 4}, {2, 5}}}; + RowRanges rows{{IntervalRange{0, 4}, {2, 5}}}; const std::vector column_indices{0, 1, 2, 3, 4}; const auto status = arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader); @@ -381,7 +381,7 @@ TEST_F(TestRecordBatchReaderWithRanges, InvalidRanges) { } { // will treat as {0,99} - RowRanges rows{{Range{0, 100}}}; + RowRanges rows{{IntervalRange{0, 100}}}; const std::vector column_indices{0, 1, 2, 3, 4}; const auto status = arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader); @@ -430,7 +430,7 @@ TEST(TestRecordBatchReaderWithRangesBadCases, NoPageIndex) { ASSERT_OK_AND_ASSIGN(auto arrow_reader, 
reader_builder.Build()); std::unique_ptr rb_reader; - RowRanges rows{{Range{0, 29}}}; + RowRanges rows{{IntervalRange{0, 29}}}; std::vector column_indices{0, 1, 2, 3, 4}; auto status = arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader); ASSERT_NOT_OK(status); @@ -470,7 +470,7 @@ class TestRecordBatchReaderWithRangesWithNulls : public testing::Test { TEST_F(TestRecordBatchReaderWithRangesWithNulls, SelectOneRowSkipOneRow) { { std::unique_ptr rb_reader; - std::vector ranges; + std::vector ranges; for (int64_t i = 0; i < 30; i++) { if (i % 2 == 0) ranges.push_back({i, i}); } diff --git a/cpp/src/parquet/row_range_test.cc b/cpp/src/parquet/row_range_test.cc index 3766df5e0fb5..2e043f57a7b2 100644 --- a/cpp/src/parquet/row_range_test.cc +++ b/cpp/src/parquet/row_range_test.cc @@ -25,52 +25,52 @@ class RowRangesTest : public ::testing::Test { }; TEST_F(RowRangesTest, SplitAt_EmptySplitPoints_ReturnsOriginalRowRanges) { - rowRanges.Add(Range(0, 10)); + rowRanges.Add(IntervalRange(0, 10)); std::vector split_points; auto result = rowRanges.SplitAt(split_points); ASSERT_EQ(result.size(), 1); ASSERT_EQ(result[0].GetRanges().size(), 1); - ASSERT_EQ(result[0][0].from, 0); - ASSERT_EQ(result[0][0].to, 10); + ASSERT_EQ(result[0][0].start, 0); + ASSERT_EQ(result[0][0].end, 10); } TEST_F(RowRangesTest, SplitAt_SingleSplitPoint_ReturnsTwoRowRanges) { - rowRanges.Add(Range(0, 10)); + rowRanges.Add(IntervalRange(0, 10)); std::vector split_points = {5}; auto result = rowRanges.SplitAt(split_points); ASSERT_EQ(result.size(), 2); ASSERT_EQ(result[0].GetRanges().size(), 1); - ASSERT_EQ(result[0][0].from, 0); - ASSERT_EQ(result[0][0].to, 4); + ASSERT_EQ(result[0][0].start, 0); + ASSERT_EQ(result[0][0].end, 4); ASSERT_EQ(result[1].GetRanges().size(), 1); - ASSERT_EQ(result[1][0].from, 5); - ASSERT_EQ(result[1][0].to, 10); + ASSERT_EQ(result[1][0].start, 5); + ASSERT_EQ(result[1][0].end, 10); } TEST_F(RowRangesTest, SplitAt_MultipleSplitPoints_ReturnsMultipleRowRanges) { - 
rowRanges.Add(Range(0, 10)); + rowRanges.Add(IntervalRange(0, 10)); std::vector split_points = {3, 7}; auto result = rowRanges.SplitAt(split_points); ASSERT_EQ(result.size(), 3); ASSERT_EQ(result[0].GetRanges().size(), 1); - ASSERT_EQ(result[0][0].from, 0); - ASSERT_EQ(result[0][0].to, 2); + ASSERT_EQ(result[0][0].start, 0); + ASSERT_EQ(result[0][0].end, 2); ASSERT_EQ(result[1].GetRanges().size(), 1); - ASSERT_EQ(result[1][0].from, 3); - ASSERT_EQ(result[1][0].to, 6); + ASSERT_EQ(result[1][0].start, 3); + ASSERT_EQ(result[1][0].end, 6); ASSERT_EQ(result[2].GetRanges().size(), 1); - ASSERT_EQ(result[2][0].from, 7); - ASSERT_EQ(result[2][0].to, 10); + ASSERT_EQ(result[2][0].start, 7); + ASSERT_EQ(result[2][0].end, 10); } TEST_F(RowRangesTest, SplitAt_MultipleSplitPoints_ReturnWithEmptyRowRanges) { - rowRanges.Add(Range(11, 18)); + rowRanges.Add(IntervalRange(11, 18)); std::vector split_points = {5, 10, 15, 20}; auto result = rowRanges.SplitAt(split_points); @@ -79,23 +79,23 @@ TEST_F(RowRangesTest, SplitAt_MultipleSplitPoints_ReturnWithEmptyRowRanges) { ASSERT_EQ(result[0].GetRanges().size(), 0); ASSERT_EQ(result[1].GetRanges().size(), 0); ASSERT_EQ(result[2].GetRanges().size(), 1); - ASSERT_EQ(result[2][0].from, 11); - ASSERT_EQ(result[2][0].to, 14); + ASSERT_EQ(result[2][0].start, 11); + ASSERT_EQ(result[2][0].end, 14); ASSERT_EQ(result[3].GetRanges().size(), 1); - ASSERT_EQ(result[3][0].from, 15); - ASSERT_EQ(result[3][0].to, 18); + ASSERT_EQ(result[3][0].start, 15); + ASSERT_EQ(result[3][0].end, 18); ASSERT_EQ(result[4].GetRanges().size(), 0); } TEST_F(RowRangesTest, SplitAt_InvalidSplitPoint_ThrowsException) { - rowRanges.Add(Range(0, 10)); + rowRanges.Add(IntervalRange(0, 10)); std::vector split_points = {-1}; ASSERT_THROW(rowRanges.SplitAt(split_points), ParquetException); } TEST_F(RowRangesTest, SplitAt_UnorderedSplitPoints_ThrowsException) { - rowRanges.Add(Range(0, 10)); + rowRanges.Add(IntervalRange(0, 10)); std::vector split_points = {5, 3}; 
ASSERT_THROW(rowRanges.SplitAt(split_points), ParquetException); From ef8a7c8350f1c0e761915101a15ad9ff37d72c70 Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Tue, 2 Jan 2024 00:06:22 +0800 Subject: [PATCH 15/25] clean code --- cpp/src/parquet/column_reader.h | 1470 +++++++++++++++---------------- 1 file changed, 735 insertions(+), 735 deletions(-) diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 9b9393e4ecc2..7ebabf1f2095 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -32,743 +32,743 @@ #include "parquet/types.h" namespace arrow { - class Array; - class ChunkedArray; +class Array; +class ChunkedArray; - namespace bit_util { - class BitReader; - } // namespace bit_util +namespace bit_util { +class BitReader; +} // namespace bit_util - namespace util { - class RleDecoder; - } // namespace util -} // namespace arrow +namespace util { +class RleDecoder; +} // namespace util +} // namespace arrow namespace parquet { - class Decryptor; - class Page; - - // 16 MB is the default maximum page header size - static constexpr uint32_t kDefaultMaxPageHeaderSize = 16 * 1024 * 1024; - - // 16 KB is the default expected page header size - static constexpr uint32_t kDefaultPageHeaderSize = 16 * 1024; - - // \brief DataPageStats stores encoded statistics and number of values/rows for - // a page. - struct PARQUET_EXPORT DataPageStats { - DataPageStats(const EncodedStatistics* encoded_statistics, int32_t num_values, - std::optional num_rows) - : encoded_statistics(encoded_statistics), - num_values(num_values), - num_rows(num_rows) { +class Decryptor; +class Page; + +// 16 MB is the default maximum page header size +static constexpr uint32_t kDefaultMaxPageHeaderSize = 16 * 1024 * 1024; + +// 16 KB is the default expected page header size +static constexpr uint32_t kDefaultPageHeaderSize = 16 * 1024; + +// \brief DataPageStats stores encoded statistics and number of values/rows for +// a page. 
+struct PARQUET_EXPORT DataPageStats { + DataPageStats(const EncodedStatistics* encoded_statistics, int32_t num_values, + std::optional num_rows) + : encoded_statistics(encoded_statistics), + num_values(num_values), + num_rows(num_rows) {} + + // Encoded statistics extracted from the page header. + // Nullptr if there are no statistics in the page header. + const EncodedStatistics* encoded_statistics; + // Number of values stored in the page. Filled for both V1 and V2 data pages. + // For repeated fields, this can be greater than number of rows. For + // non-repeated fields, this will be the same as the number of rows. + int32_t num_values; + // Number of rows stored in the page. std::nullopt if not available. + std::optional num_rows; +}; + +class PARQUET_EXPORT LevelDecoder { + public: + LevelDecoder(); + ~LevelDecoder(); + + // Initialize the LevelDecoder state with new data + // and return the number of bytes consumed + int SetData(Encoding::type encoding, int16_t max_level, int num_buffered_values, + const uint8_t* data, int32_t data_size); + + void SetDataV2(int32_t num_bytes, int16_t max_level, int num_buffered_values, + const uint8_t* data); + + // Decodes a batch of levels into an array and returns the number of levels decoded + int Decode(int batch_size, int16_t* levels); + + private: + int bit_width_; + int num_values_remaining_; + Encoding::type encoding_; + std::unique_ptr<::arrow::util::RleDecoder> rle_decoder_; + std::unique_ptr<::arrow::bit_util::BitReader> bit_packed_decoder_; + int16_t max_level_; +}; + +struct CryptoContext { + CryptoContext(bool start_with_dictionary_page, int16_t rg_ordinal, int16_t col_ordinal, + std::shared_ptr meta, std::shared_ptr data) + : start_decrypt_with_dictionary_page(start_with_dictionary_page), + row_group_ordinal(rg_ordinal), + column_ordinal(col_ordinal), + meta_decryptor(std::move(meta)), + data_decryptor(std::move(data)) {} + CryptoContext() {} + + bool start_decrypt_with_dictionary_page = false; + int16_t 
row_group_ordinal = -1; + int16_t column_ordinal = -1; + std::shared_ptr meta_decryptor; + std::shared_ptr data_decryptor; +}; + +// Abstract page iterator interface. This way, we can feed column pages to the +// ColumnReader through whatever mechanism we choose +class PARQUET_EXPORT PageReader { + using DataPageFilter = std::function; + + public: + virtual ~PageReader() = default; + + static std::unique_ptr Open( + std::shared_ptr stream, int64_t total_num_values, + Compression::type codec, bool always_compressed = false, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), + const CryptoContext* ctx = NULLPTR); + static std::unique_ptr Open(std::shared_ptr stream, + int64_t total_num_values, + Compression::type codec, + const ReaderProperties& properties, + bool always_compressed = false, + const CryptoContext* ctx = NULLPTR); + + // If data_page_filter is present (not null), NextPage() will call the + // callback function exactly once per page in the order the pages appear in + // the column. If the callback function returns true the page will be + // skipped. The callback will be called only if the page type is DATA_PAGE or + // DATA_PAGE_V2. Dictionary pages will not be skipped. + // Caller is responsible for checking that statistics are correct using + // ApplicationVersion::HasCorrectStatistics(). + // \note API EXPERIMENTAL + void set_data_page_filter(DataPageFilter data_page_filter) { + data_page_filter_ = std::move(data_page_filter); + } + + // @returns: shared_ptr(nullptr) on EOS, std::shared_ptr + // containing new Page otherwise + // + // The returned Page may contain references that aren't guaranteed to live + // beyond the next call to NextPage(). + virtual std::shared_ptr NextPage() = 0; + + virtual void set_max_page_header_size(uint32_t size) = 0; + + protected: + // Callback that decides if we should skip a page or not. 
+ DataPageFilter data_page_filter_; +}; + +class PARQUET_EXPORT ColumnReader { + public: + virtual ~ColumnReader() = default; + + static std::shared_ptr Make( + const ColumnDescriptor* descr, std::unique_ptr pager, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); + + // Returns true if there are still values in this column. + virtual bool HasNext() = 0; + + virtual Type::type type() const = 0; + + virtual const ColumnDescriptor* descr() const = 0; + + // Get the encoding that can be exposed by this reader. If it returns + // dictionary encoding, then ReadBatchWithDictionary can be used to read data. + // + // \note API EXPERIMENTAL + virtual ExposedEncoding GetExposedEncoding() = 0; + + protected: + friend class RowGroupReader; + // Set the encoding that can be exposed by this reader. + // + // \note API EXPERIMENTAL + virtual void SetExposedEncoding(ExposedEncoding encoding) = 0; +}; + +// API to read values from a single column. This is a main client facing API. +template +class TypedColumnReader : public ColumnReader { + public: + typedef typename DType::c_type T; + + // Read a batch of repetition levels, definition levels, and values from the + // column. + // + // Since null values are not stored in the values, the number of values read + // may be less than the number of repetition and definition levels. With + // nested data this is almost certainly true. + // + // Set def_levels or rep_levels to nullptr if you want to skip reading them. + // This is only safe if you know through some other source that there are no + // undefined values. + // + // To fully exhaust a row group, you must read batches until the number of + // values read reaches the number of stored values according to the metadata. 
+ // + // This API is the same for both V1 and V2 of the DataPage + // + // @returns: actual number of levels read (see values_read for number of values read) + virtual int64_t ReadBatch(int64_t batch_size, int16_t* def_levels, int16_t* rep_levels, + T* values, int64_t* values_read) = 0; + + /// Read a batch of repetition levels, definition levels, and values from the + /// column and leave spaces for null entries on the lowest level in the values + /// buffer. + /// + /// In comparison to ReadBatch the length of repetition and definition levels + /// is the same as of the number of values read for max_definition_level == 1. + /// In the case of max_definition_level > 1, the repetition and definition + /// levels are larger than the values but the values include the null entries + /// with definition_level == (max_definition_level - 1). + /// + /// To fully exhaust a row group, you must read batches until the number of + /// values read reaches the number of stored values according to the metadata. + /// + /// @param batch_size the number of levels to read + /// @param[out] def_levels The Parquet definition levels, output has + /// the length levels_read. + /// @param[out] rep_levels The Parquet repetition levels, output has + /// the length levels_read. + /// @param[out] values The values in the lowest nested level including + /// spacing for nulls on the lowest levels; output has the length + /// values_read. + /// @param[out] valid_bits Memory allocated for a bitmap that indicates if + /// the row is null or on the maximum definition level. For performance + /// reasons the underlying buffer should be able to store 1 bit more than + /// required. If this requires an additional byte, this byte is only read + /// but never written to. + /// @param valid_bits_offset The offset in bits of the valid_bits where the + /// first relevant bit resides. + /// @param[out] levels_read The number of repetition/definition levels that were read. 
+ /// @param[out] values_read The number of values read, this includes all + /// non-null entries as well as all null-entries on the lowest level + /// (i.e. definition_level == max_definition_level - 1) + /// @param[out] null_count The number of nulls on the lowest levels. + /// (i.e. (values_read - null_count) is total number of non-null entries) + /// + /// \deprecated Since 4.0.0 + ARROW_DEPRECATED("Doesn't handle nesting correctly and unused outside of unit tests.") + virtual int64_t ReadBatchSpaced(int64_t batch_size, int16_t* def_levels, + int16_t* rep_levels, T* values, uint8_t* valid_bits, + int64_t valid_bits_offset, int64_t* levels_read, + int64_t* values_read, int64_t* null_count) = 0; + + // Skip reading values. This method will work for both repeated and + // non-repeated fields. Note that this method is skipping values and not + // records. This distinction is important for repeated fields, meaning that + // we are not skipping over the values to the next record. For example, + // consider the following two consecutive records containing one repeated field: + // {[1, 2, 3]}, {[4, 5]}. If we Skip(2), our next read value will be 3, which + // is inside the first record. + // Returns the number of values skipped. + virtual int64_t Skip(int64_t num_values_to_skip) = 0; + + // Read a batch of repetition levels, definition levels, and indices from the + // column. And read the dictionary if a dictionary page is encountered during + // reading pages. This API is similar to ReadBatch(), with ability to read + // dictionary and indices. It is only valid to call this method when the reader can + // expose dictionary encoding. (i.e., the reader's GetExposedEncoding() returns + // DICTIONARY). + // + // The dictionary is read along with the data page. When there's no data page, + // the dictionary won't be returned. + // + // @param batch_size The batch size to read + // @param[out] def_levels The Parquet definition levels. 
+ // @param[out] rep_levels The Parquet repetition levels. + // @param[out] indices The dictionary indices. + // @param[out] indices_read The number of indices read. + // @param[out] dict The pointer to dictionary values. It will return nullptr if + // there's no data page. Each column chunk only has one dictionary page. The dictionary + // is owned by the reader, so the caller is responsible for copying the dictionary + // values before the reader gets destroyed. + // @param[out] dict_len The dictionary length. It will return 0 if there's no data + // page. + // @returns: actual number of levels read (see indices_read for number of + // indices read + // + // \note API EXPERIMENTAL + virtual int64_t ReadBatchWithDictionary(int64_t batch_size, int16_t* def_levels, + int16_t* rep_levels, int32_t* indices, + int64_t* indices_read, const T** dict, + int32_t* dict_len) = 0; +}; + +// Represent a range to read. The range is inclusive on both ends. +struct IntervalRange { + static IntervalRange Intersection(const IntervalRange& left, + const IntervalRange& right) { + if (left.start <= right.start) { + if (left.end >= right.start) { + return {right.start, std::min(left.end, right.end)}; + } + } else if (right.end >= left.start) { + return {left.start, std::min(left.end, right.end)}; + } + return {-1, -1}; // Return a default Range object if no intersection range found + } + + IntervalRange(const int64_t start_, const int64_t end_) : start(start_), end(end_) { + if (start > end) { + throw ParquetException("Invalid range with start: " + std::to_string(start) + + " and end: " + std::to_string(end)); + } + } + + size_t Count() const { return end - start + 1; } + + bool IsBefore(const IntervalRange& other) const { return end < other.start; } + + bool IsAfter(const IntervalRange& other) const { return start > other.end; } + + bool IsOverlap(const IntervalRange& other) const { + return !IsBefore(other) && !IsAfter(other); + } + + bool IsValid() const { return start >= 0 && end 
>= 0 && end >= start; } + + std::string ToString() const { + return "[" + std::to_string(start) + ", " + std::to_string(end) + "]"; + } + + // inclusive + int64_t start; + // inclusive + int64_t end; +}; + +struct BitmapRange { + int64_t offset; + // zero added to, if there are less than 64 elements left in the column. + uint64_t bitmap; +}; + +struct End {}; + +// Represent a set of ranges to read. The ranges are sorted and non-overlapping. +class RowRanges { + public: + RowRanges() = default; + + explicit RowRanges(const IntervalRange& range) { ranges.push_back(range); } + + RowRanges(const std::vector& ranges) { this->ranges = ranges; } + + RowRanges(const RowRanges& other) { ranges = other.ranges; } + + RowRanges(RowRanges&& other) noexcept { ranges = std::move(other.ranges); } + + static RowRanges Intersection(const RowRanges& left, const RowRanges& right) { + RowRanges result; + + size_t rightIndex = 0; + for (const IntervalRange& l : left.ranges) { + for (size_t i = rightIndex, n = right.ranges.size(); i < n; ++i) { + const IntervalRange& r = right.ranges[i]; + if (l.IsBefore(r)) { + break; + } else if (l.IsAfter(r)) { + rightIndex = i + 1; + continue; } - - // Encoded statistics extracted from the page header. - // Nullptr if there are no statistics in the page header. - const EncodedStatistics* encoded_statistics; - // Number of values stored in the page. Filled for both V1 and V2 data pages. - // For repeated fields, this can be greater than number of rows. For - // non-repeated fields, this will be the same as the number of rows. - int32_t num_values; - // Number of rows stored in the page. std::nullopt if not available. 
- std::optional num_rows; - }; - - class PARQUET_EXPORT LevelDecoder { - public: - LevelDecoder(); - - ~LevelDecoder(); - - // Initialize the LevelDecoder state with new data - // and return the number of bytes consumed - int SetData(Encoding::type encoding, int16_t max_level, int num_buffered_values, - const uint8_t* data, int32_t data_size); - - void SetDataV2(int32_t num_bytes, int16_t max_level, int num_buffered_values, - const uint8_t* data); - - // Decodes a batch of levels into an array and returns the number of levels decoded - int Decode(int batch_size, int16_t* levels); - - private: - int bit_width_; - int num_values_remaining_; - Encoding::type encoding_; - std::unique_ptr<::arrow::util::RleDecoder> rle_decoder_; - std::unique_ptr<::arrow::bit_util::BitReader> bit_packed_decoder_; - int16_t max_level_; - }; - - struct CryptoContext { - CryptoContext(bool start_with_dictionary_page, int16_t rg_ordinal, int16_t col_ordinal, - std::shared_ptr meta, std::shared_ptr data) - : start_decrypt_with_dictionary_page(start_with_dictionary_page), - row_group_ordinal(rg_ordinal), - column_ordinal(col_ordinal), - meta_decryptor(std::move(meta)), - data_decryptor(std::move(data)) { - } - - CryptoContext() { - } - - bool start_decrypt_with_dictionary_page = false; - int16_t row_group_ordinal = -1; - int16_t column_ordinal = -1; - std::shared_ptr meta_decryptor; - std::shared_ptr data_decryptor; - }; - - // Abstract page iterator interface. 
This way, we can feed column pages to the - // ColumnReader through whatever mechanism we choose - class PARQUET_EXPORT PageReader { - using DataPageFilter = std::function; - - public: - virtual ~PageReader() = default; - - static std::unique_ptr Open( - std::shared_ptr stream, int64_t total_num_values, - Compression::type codec, bool always_compressed = false, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), - const CryptoContext* ctx = NULLPTR); - - static std::unique_ptr Open(std::shared_ptr stream, - int64_t total_num_values, - Compression::type codec, - const ReaderProperties&properties, - bool always_compressed = false, - const CryptoContext* ctx = NULLPTR); - - // If data_page_filter is present (not null), NextPage() will call the - // callback function exactly once per page in the order the pages appear in - // the column. If the callback function returns true the page will be - // skipped. The callback will be called only if the page type is DATA_PAGE or - // DATA_PAGE_V2. Dictionary pages will not be skipped. - // Caller is responsible for checking that statistics are correct using - // ApplicationVersion::HasCorrectStatistics(). - // \note API EXPERIMENTAL - void set_data_page_filter(DataPageFilter data_page_filter) { - data_page_filter_ = std::move(data_page_filter); - } - - // @returns: shared_ptr(nullptr) on EOS, std::shared_ptr - // containing new Page otherwise - // - // The returned Page may contain references that aren't guaranteed to live - // beyond the next call to NextPage(). - virtual std::shared_ptr NextPage() = 0; - - virtual void set_max_page_header_size(uint32_t size) = 0; - - protected: - // Callback that decides if we should skip a page or not. 
- DataPageFilter data_page_filter_; - }; - - class PARQUET_EXPORT ColumnReader { - public: - virtual ~ColumnReader() = default; - - static std::shared_ptr Make( - const ColumnDescriptor* descr, std::unique_ptr pager, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); - - // Returns true if there are still values in this column. - virtual bool HasNext() = 0; - - virtual Type::type type() const = 0; - - virtual const ColumnDescriptor* descr() const = 0; - - // Get the encoding that can be exposed by this reader. If it returns - // dictionary encoding, then ReadBatchWithDictionary can be used to read data. - // - // \note API EXPERIMENTAL - virtual ExposedEncoding GetExposedEncoding() = 0; - - protected: - friend class RowGroupReader; - // Set the encoding that can be exposed by this reader. - // - // \note API EXPERIMENTAL - virtual void SetExposedEncoding(ExposedEncoding encoding) = 0; - }; - - // API to read values from a single column. This is a main client facing API. - template - class TypedColumnReader : public ColumnReader { - public: - typedef typename DType::c_type T; - - // Read a batch of repetition levels, definition levels, and values from the - // column. - // - // Since null values are not stored in the values, the number of values read - // may be less than the number of repetition and definition levels. With - // nested data this is almost certainly true. - // - // Set def_levels or rep_levels to nullptr if you want to skip reading them. - // This is only safe if you know through some other source that there are no - // undefined values. - // - // To fully exhaust a row group, you must read batches until the number of - // values read reaches the number of stored values according to the metadata. 
- // - // This API is the same for both V1 and V2 of the DataPage - // - // @returns: actual number of levels read (see values_read for number of values read) - virtual int64_t ReadBatch(int64_t batch_size, int16_t* def_levels, int16_t* rep_levels, - T* values, int64_t* values_read) = 0; - - /// Read a batch of repetition levels, definition levels, and values from the - /// column and leave spaces for null entries on the lowest level in the values - /// buffer. - /// - /// In comparison to ReadBatch the length of repetition and definition levels - /// is the same as of the number of values read for max_definition_level == 1. - /// In the case of max_definition_level > 1, the repetition and definition - /// levels are larger than the values but the values include the null entries - /// with definition_level == (max_definition_level - 1). - /// - /// To fully exhaust a row group, you must read batches until the number of - /// values read reaches the number of stored values according to the metadata. - /// - /// @param batch_size the number of levels to read - /// @param[out] def_levels The Parquet definition levels, output has - /// the length levels_read. - /// @param[out] rep_levels The Parquet repetition levels, output has - /// the length levels_read. - /// @param[out] values The values in the lowest nested level including - /// spacing for nulls on the lowest levels; output has the length - /// values_read. - /// @param[out] valid_bits Memory allocated for a bitmap that indicates if - /// the row is null or on the maximum definition level. For performance - /// reasons the underlying buffer should be able to store 1 bit more than - /// required. If this requires an additional byte, this byte is only read - /// but never written to. - /// @param valid_bits_offset The offset in bits of the valid_bits where the - /// first relevant bit resides. - /// @param[out] levels_read The number of repetition/definition levels that were read. 
- /// @param[out] values_read The number of values read, this includes all - /// non-null entries as well as all null-entries on the lowest level - /// (i.e. definition_level == max_definition_level - 1) - /// @param[out] null_count The number of nulls on the lowest levels. - /// (i.e. (values_read - null_count) is total number of non-null entries) - /// - /// \deprecated Since 4.0.0 - ARROW_DEPRECATED("Doesn't handle nesting correctly and unused outside of unit tests.") - virtual int64_t ReadBatchSpaced(int64_t batch_size, int16_t* def_levels, - int16_t* rep_levels, T* values, uint8_t* valid_bits, - int64_t valid_bits_offset, int64_t* levels_read, - int64_t* values_read, int64_t* null_count) = 0; - - // Skip reading values. This method will work for both repeated and - // non-repeated fields. Note that this method is skipping values and not - // records. This distinction is important for repeated fields, meaning that - // we are not skipping over the values to the next record. For example, - // consider the following two consecutive records containing one repeated field: - // {[1, 2, 3]}, {[4, 5]}. If we Skip(2), our next read value will be 3, which - // is inside the first record. - // Returns the number of values skipped. - virtual int64_t Skip(int64_t num_values_to_skip) = 0; - - // Read a batch of repetition levels, definition levels, and indices from the - // column. And read the dictionary if a dictionary page is encountered during - // reading pages. This API is similar to ReadBatch(), with ability to read - // dictionary and indices. It is only valid to call this method when the reader can - // expose dictionary encoding. (i.e., the reader's GetExposedEncoding() returns - // DICTIONARY). - // - // The dictionary is read along with the data page. When there's no data page, - // the dictionary won't be returned. - // - // @param batch_size The batch size to read - // @param[out] def_levels The Parquet definition levels. 
- // @param[out] rep_levels The Parquet repetition levels. - // @param[out] indices The dictionary indices. - // @param[out] indices_read The number of indices read. - // @param[out] dict The pointer to dictionary values. It will return nullptr if - // there's no data page. Each column chunk only has one dictionary page. The dictionary - // is owned by the reader, so the caller is responsible for copying the dictionary - // values before the reader gets destroyed. - // @param[out] dict_len The dictionary length. It will return 0 if there's no data - // page. - // @returns: actual number of levels read (see indices_read for number of - // indices read - // - // \note API EXPERIMENTAL - virtual int64_t ReadBatchWithDictionary(int64_t batch_size, int16_t* def_levels, - int16_t* rep_levels, int32_t* indices, - int64_t* indices_read, const T** dict, - int32_t* dict_len) = 0; - }; - - // Represent a range to read. The range is inclusive on both ends. - struct IntervalRange { - static IntervalRange Intersection(const IntervalRange&left, const IntervalRange&right) { - if (left.start <= right.start) { - if (left.end >= right.start) { - return {right.start, std::min(left.end, right.end)}; - } - } - else if (right.end >= left.start) { - return {left.start, std::min(left.end, right.end)}; - } - return {-1, -1}; // Return a default Range object if no intersection range found - } - - IntervalRange(const int64_t start_, const int64_t end_) : start(start_), end(end_) { - if (start > end) { - throw ParquetException("Invalid range with start: " + std::to_string(start) - + " and end: " + std::to_string(end)); - } - } - - size_t Count() const { return end - start + 1; } - - bool IsBefore(const IntervalRange&other) const { return end < other.start; } - - bool IsAfter(const IntervalRange&other) const { return start > other.end; } - - bool IsOverlap(const IntervalRange&other) const { return !IsBefore(other) && !IsAfter(other); } - - bool IsValid() const { return start >= 0 && end >= 0 && 
end >= start; } - - std::string ToString() const { - return "[" + std::to_string(start) + ", " + std::to_string(end) + "]"; - } - - // inclusive - int64_t start; - // inclusive - int64_t end; - }; - - struct BitmapRange { - int64_t offset; - // zero added to, if there are less than 64 elements left in the column. - uint64_t bitmap; - }; - - struct End {}; - - // Represent a set of ranges to read. The ranges are sorted and non-overlapping. - class RowRanges { - public: - RowRanges() = default; - - explicit RowRanges(const IntervalRange&range) { ranges.push_back(range); } - - RowRanges(const std::vector&ranges) { this->ranges = ranges; } - - RowRanges(const RowRanges&other) { ranges = other.ranges; } - - RowRanges(RowRanges&&other) noexcept { ranges = std::move(other.ranges); } - - static RowRanges Intersection(const RowRanges& left, const RowRanges& right) { - RowRanges result; - - size_t rightIndex = 0; - for (const IntervalRange& l : left.ranges) { - for (size_t i = rightIndex, n = right.ranges.size(); i < n; ++i) { - const IntervalRange& r = right.ranges[i]; - if (l.IsBefore(r)) { - break; - } else if (l.IsAfter(r)) { - rightIndex = i + 1; - continue; - } - result.Add(IntervalRange::Intersection(l, r)); - } - } - - return result; - } - - void Add(const IntervalRange&range) { - const IntervalRange rangeToAdd = range; - if (ranges.size() > 1 && rangeToAdd.start <= ranges.back().end) { - throw ParquetException("Ranges must be added in order"); - } - ranges.push_back(rangeToAdd); - } - - size_t RowCount() const { - size_t cnt = 0; - for (const IntervalRange&range: ranges) { - cnt += range.Count(); - } - return cnt; - } - - bool IsValid() const { - if (ranges.size() == 0) return true; - if (ranges[0].start < 0) { - return false; - } - for (size_t i = 1; i < ranges.size(); i++) { - if (ranges[i].start <= ranges[i - 1].end) { - return false; - } - } - return true; - } - - bool IsOverlapping(int64_t start, int64_t end) const { - const IntervalRange searchRange(start, 
end); - return IsOverlapping(searchRange); - } - - bool IsOverlapping(const IntervalRange&searchRange) const { - auto it = std::lower_bound( - ranges.begin(), ranges.end(), searchRange, - [](const IntervalRange&r1, const IntervalRange&r2) { return r1.IsBefore(r2); }); - return it != ranges.end() && !(*it).IsAfter(searchRange); - } - - std::vector& GetRanges() { return ranges; } - - const std::vector& GetRanges() const { return ranges; } - - // Split the ranges into N+1 parts at the given split point, where N = split_points.size() - // The RowRows object itself is not modified - std::vector SplitAt(const std::vector&split_points) const { - if (split_points.size() == 0) { - return {*this}; - } - - std::vector result; - int64_t last_split_point = -1; - for (const int64_t split_point: split_points) { - if (split_point <= 0) { - throw ParquetException("Invalid split point " + std::to_string(split_point)); - } - if (split_point <= last_split_point) { - throw ParquetException("Split points must be in ascending order"); - } - last_split_point = split_point; - } - - RowRanges spaces; - for (size_t i = 0 ; i < split_points.size(); ++i) { - auto start = i == 0 ? 
0 : split_points[i - 1]; - auto end = split_points[i] - 1; - spaces.Add({start, end}); - } - spaces.Add({split_points[split_points.size() - 1], std::numeric_limits::max()}); - - for(IntervalRange space : spaces.GetRanges()) { - RowRanges intersection = RowRanges::Intersection(RowRanges(space), *this); - result.push_back(intersection); - } - - return result; - } - - const IntervalRange& operator[](size_t index) const { - // check index - if (index >= ranges.size() || index < 0) { - throw ParquetException("Index out of range"); - } - return ranges[index]; - } - - RowRanges shift(const int64_t offset) const { - RowRanges result; - for (const IntervalRange&range: ranges) { - result.Add({range.start + offset, range.end + offset}); - } - return result; - } - - std::string ToString() const { - std::string result = "["; - for (const IntervalRange&range: ranges) { - result += - "(" + std::to_string(range.start) + ", " + std::to_string(range.end) + "), "; - } - if (!ranges.empty()) { - result = result.substr(0, result.size() - 2); - } - result += "]"; - return result; - } - - /// The following APIs are to be implemented - /// Comment out for now to pass compile - -// // Returns a vector of PageLocations that must be read all to get values for all included in this range -// virtual std::vector PageIndexesToInclude(const std::vector& all_pages) = 0; -// class Iterator { -// virtual std::variant NextRange() = 0; -// }; -// virtual std::unique_ptr NewIterator() = 0; - - private: - std::vector ranges; - }; - - namespace internal { - class PARQUET_EXPORT RecordSkipper { - public: - RecordSkipper(RowRanges&pages, const RowRanges&row_ranges_) - : row_ranges(row_ranges_) { - // copy row_ranges - RowRanges will_process_pages, skip_pages; - for (auto&page: pages.GetRanges()) { - if (!row_ranges.IsOverlapping(page)) { - skip_pages.Add(page); - } - } - - /// Since the skipped pages will be silently skipped without updating - /// current_rg_processed_records or records_read_, we need to 
pre-process the row - /// ranges as if these skipped pages never existed - adjust_ranges(skip_pages, row_ranges); - - total_rows_to_process = pages.RowCount() - skip_pages.RowCount(); - } - - /// \brief Return the number of records to read or to skip - /// if return values is positive, it means to read N records - /// if return values is negative, it means to skip N records - /// if return values is 0, it means end of RG - int64_t advise_next(const int64_t current_rg_processed) { - if (row_ranges.GetRanges().size() == row_range_idx) { - return 0; - } - - if (row_ranges[row_range_idx].end < current_rg_processed) { - row_range_idx++; - if (row_ranges.GetRanges().size() == row_range_idx) { - // negative, skip the ramaining rows - return current_rg_processed - total_rows_to_process; - } - } - - if (row_ranges[row_range_idx].start > current_rg_processed) { - // negative, skip - return current_rg_processed - row_ranges[row_range_idx].start; - } - - const auto ret = row_ranges[row_range_idx].end - current_rg_processed + 1; - return ret; - } - - private: - void adjust_ranges(RowRanges & skip_pages, RowRanges & to_adjust) { - size_t skipped_rows = 0; - auto iter = to_adjust.GetRanges().begin(); - auto skip_iter = skip_pages.GetRanges().begin(); - while (iter != to_adjust.GetRanges().end()) { - while (skip_iter != skip_pages.GetRanges().end() && skip_iter->IsBefore(*iter)) { - skipped_rows += skip_iter->Count(); - ++skip_iter; - } - iter->start -= skipped_rows; - iter->end -= skipped_rows; - ++iter; - } - } - - /// Keep copy of ranges, because adjust_ranges() will modify them - RowRanges row_ranges; - - size_t row_range_idx = 0; - size_t total_rows_to_process = 0; - }; - - /// \brief Stateful column reader that delimits semantic records for both flat - /// and nested columns - /// - /// \note API EXPERIMENTAL - /// \since 1.3.0 - class PARQUET_EXPORT RecordReader { - public: - /// \brief Creates a record reader. 
- /// @param descr Column descriptor - /// @param leaf_info Level info, used to determine if a column is nullable or not - /// @param pool Memory pool to use for buffering values and rep/def levels - /// @param read_dictionary True if reading directly as Arrow dictionary-encoded - /// @param read_dense_for_nullable True if reading dense and not leaving space for null - /// values - static std::shared_ptr Make( - const ColumnDescriptor* descr, LevelInfo leaf_info, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), - bool read_dictionary = false, bool read_dense_for_nullable = false); - - virtual ~RecordReader() = default; - - /// \brief Attempt to read indicated number of records from column chunk - /// Note that for repeated fields, a record may have more than one value - /// and all of them are read. If read_dense_for_nullable() it will - /// not leave any space for null values. Otherwise, it will read spaced. - /// \return number of records read - virtual int64_t ReadRecords(int64_t num_records) = 0; - - /// \brief Attempt to skip indicated number of records from column chunk. - /// Note that for repeated fields, a record may have more than one value - /// and all of them are skipped. - /// \return number of records skipped - virtual int64_t SkipRecords(int64_t num_records) = 0; - - /// \brief Pre-allocate space for data. Results in better flat read performance - virtual void Reserve(int64_t num_values) = 0; - - /// \brief Clear consumed values and repetition/definition levels as the - /// result of calling ReadRecords - /// For FLBA and ByteArray types, call GetBuilderChunks() to reset them. - virtual void Reset() = 0; - - /// \brief Transfer filled values buffer to caller. A new one will be - /// allocated in subsequent ReadRecords calls - virtual std::shared_ptr ReleaseValues() = 0; - - /// \brief Transfer filled validity bitmap buffer to caller. 
A new one will - /// be allocated in subsequent ReadRecords calls - virtual std::shared_ptr ReleaseIsValid() = 0; - - /// \brief Return true if the record reader has more internal data yet to - /// process - virtual bool HasMoreData() const = 0; - - /// \brief Advance record reader to the next row group. Must be set before - /// any records could be read/skipped. - /// \param[in] reader obtained from RowGroupReader::GetColumnPageReader - virtual void SetPageReader(std::unique_ptr reader) = 0; - - /// \brief Returns the underlying column reader's descriptor. - virtual const ColumnDescriptor* descr() const = 0; - - virtual void DebugPrintState() = 0; - - /// \brief Decoded definition levels - int16_t* def_levels() const { - return reinterpret_cast(def_levels_->mutable_data()); - } - - /// \brief Decoded repetition levels - int16_t* rep_levels() const { - return reinterpret_cast(rep_levels_->mutable_data()); - } - - /// \brief Decoded values, including nulls, if any - /// FLBA and ByteArray types do not use this array and read into their own - /// builders. - uint8_t* values() const { return values_->mutable_data(); } - - /// \brief Number of values written, including space left for nulls if any. - /// If this Reader was constructed with read_dense_for_nullable(), there is no space for - /// nulls and null_count() will be 0. There is no read-ahead/buffering for values. For - /// FLBA and ByteArray types this value reflects the values written with the last - /// ReadRecords call since those readers will reset the values after each call. - int64_t values_written() const { return values_written_; } - - /// \brief Number of definition / repetition levels (from those that have - /// been decoded) that have been consumed inside the reader. - int64_t levels_position() const { return levels_position_; } - - /// \brief Number of definition / repetition levels that have been written - /// internally in the reader. 
This may be larger than values_written() because - /// for repeated fields we need to look at the levels in advance to figure out - /// the record boundaries. - int64_t levels_written() const { return levels_written_; } - - /// \brief Number of nulls in the leaf that we have read so far into the - /// values vector. This is only valid when !read_dense_for_nullable(). When - /// read_dense_for_nullable() it will always be 0. - int64_t null_count() const { return null_count_; } - - /// \brief True if the leaf values are nullable - bool nullable_values() const { return nullable_values_; } - - /// \brief True if reading directly as Arrow dictionary-encoded - bool read_dictionary() const { return read_dictionary_; } - - /// \brief True if reading dense for nullable columns. - bool read_dense_for_nullable() const { return read_dense_for_nullable_; } - - void reset_current_rg_processed_records() { current_rg_processed_records = 0; } - - void set_record_skipper(std::shared_ptr skipper_) { skipper = skipper_; } - - protected: - /// \brief Indicates if we can have nullable values. Note that repeated fields - /// may or may not be nullable. - bool nullable_values_; - - bool at_record_start_; - int64_t records_read_; - - int64_t current_rg_processed_records; // counting both read and skip records - - /// \brief Stores values. These values are populated based on each ReadRecords - /// call. No extra values are buffered for the next call. SkipRecords will not - /// add any value to this buffer. - std::shared_ptr<::arrow::ResizableBuffer> values_; - /// \brief False for BYTE_ARRAY, in which case we don't allocate the values - /// buffer and we directly read into builder classes. - bool uses_values_; - - /// \brief Values that we have read into 'values_' + 'null_count_'. - int64_t values_written_; - int64_t values_capacity_; - int64_t null_count_; - - /// \brief Each bit corresponds to one element in 'values_' and specifies if it - /// is null or not null. 
Not set if read_dense_for_nullable_ is true. - std::shared_ptr<::arrow::ResizableBuffer> valid_bits_; - - /// \brief Buffer for definition levels. May contain more levels than - /// is actually read. This is because we read levels ahead to - /// figure out record boundaries for repeated fields. - /// For flat required fields, 'def_levels_' and 'rep_levels_' are not - /// populated. For non-repeated fields 'rep_levels_' is not populated. - /// 'def_levels_' and 'rep_levels_' must be of the same size if present. - std::shared_ptr<::arrow::ResizableBuffer> def_levels_; - /// \brief Buffer for repetition levels. Only populated for repeated - /// fields. - std::shared_ptr<::arrow::ResizableBuffer> rep_levels_; - - /// \brief Number of definition / repetition levels that have been written - /// internally in the reader. This may be larger than values_written() since - /// for repeated fields we need to look at the levels in advance to figure out - /// the record boundaries. - int64_t levels_written_; - /// \brief Position of the next level that should be consumed. - int64_t levels_position_; - int64_t levels_capacity_; - - bool read_dictionary_ = false; - // If true, we will not leave any space for the null values in the values_ - // vector. - bool read_dense_for_nullable_ = false; - - std::shared_ptr skipper = NULLPTR; - }; - - class BinaryRecordReader : virtual public RecordReader { - public: - virtual std::vector> GetBuilderChunks() = 0; - }; - - /// \brief Read records directly to dictionary-encoded Arrow form (int32 - /// indices). 
Only valid for BYTE_ARRAY columns - class DictionaryRecordReader : virtual public RecordReader { - public: - virtual std::shared_ptr<::arrow::ChunkedArray> GetResult() = 0; - }; - } // namespace internal - - using BoolReader = TypedColumnReader; - using Int32Reader = TypedColumnReader; - using Int64Reader = TypedColumnReader; - using Int96Reader = TypedColumnReader; - using FloatReader = TypedColumnReader; - using DoubleReader = TypedColumnReader; - using ByteArrayReader = TypedColumnReader; - using FixedLenByteArrayReader = TypedColumnReader; -} // namespace parquet + result.Add(IntervalRange::Intersection(l, r)); + } + } + + return result; + } + + void Add(const IntervalRange& range) { + const IntervalRange rangeToAdd = range; + if (ranges.size() > 1 && rangeToAdd.start <= ranges.back().end) { + throw ParquetException("Ranges must be added in order"); + } + ranges.push_back(rangeToAdd); + } + + size_t RowCount() const { + size_t cnt = 0; + for (const IntervalRange& range : ranges) { + cnt += range.Count(); + } + return cnt; + } + + bool IsValid() const { + if (ranges.size() == 0) return true; + if (ranges[0].start < 0) { + return false; + } + for (size_t i = 1; i < ranges.size(); i++) { + if (ranges[i].start <= ranges[i - 1].end) { + return false; + } + } + return true; + } + + bool IsOverlapping(int64_t start, int64_t end) const { + const IntervalRange searchRange(start, end); + return IsOverlapping(searchRange); + } + + bool IsOverlapping(const IntervalRange& searchRange) const { + auto it = std::lower_bound( + ranges.begin(), ranges.end(), searchRange, + [](const IntervalRange& r1, const IntervalRange& r2) { return r1.IsBefore(r2); }); + return it != ranges.end() && !(*it).IsAfter(searchRange); + } + + std::vector& GetRanges() { return ranges; } + + const std::vector& GetRanges() const { return ranges; } + + // Split the ranges into N+1 parts at the given split point, where N = + // split_points.size() The RowRows object itself is not modified + std::vector 
SplitAt(const std::vector& split_points) const { + if (split_points.size() == 0) { + return {*this}; + } + + std::vector result; + int64_t last_split_point = -1; + for (const int64_t split_point : split_points) { + if (split_point <= 0) { + throw ParquetException("Invalid split point " + std::to_string(split_point)); + } + if (split_point <= last_split_point) { + throw ParquetException("Split points must be in ascending order"); + } + last_split_point = split_point; + } + + RowRanges spaces; + for (size_t i = 0; i < split_points.size(); ++i) { + auto start = i == 0 ? 0 : split_points[i - 1]; + auto end = split_points[i] - 1; + spaces.Add({start, end}); + } + spaces.Add( + {split_points[split_points.size() - 1], std::numeric_limits::max()}); + + for (IntervalRange space : spaces.GetRanges()) { + RowRanges intersection = RowRanges::Intersection(RowRanges(space), *this); + result.push_back(intersection); + } + + return result; + } + + const IntervalRange& operator[](size_t index) const { + // check index + if (index >= ranges.size() || index < 0) { + throw ParquetException("Index out of range"); + } + return ranges[index]; + } + + RowRanges shift(const int64_t offset) const { + RowRanges result; + for (const IntervalRange& range : ranges) { + result.Add({range.start + offset, range.end + offset}); + } + return result; + } + + std::string ToString() const { + std::string result = "["; + for (const IntervalRange& range : ranges) { + result += + "(" + std::to_string(range.start) + ", " + std::to_string(range.end) + "), "; + } + if (!ranges.empty()) { + result = result.substr(0, result.size() - 2); + } + result += "]"; + return result; + } + + /// The following APIs are to be implemented + /// Comment out for now to pass compile + + // // Returns a vector of PageLocations that must be read all to get values for + // all included in this range virtual std::vector + // PageIndexesToInclude(const std::vector& all_pages) = 0; class + // Iterator { + // virtual std::variant 
NextRange() = 0; + // }; + // virtual std::unique_ptr NewIterator() = 0; + + private: + std::vector ranges; +}; + +namespace internal { +class PARQUET_EXPORT RecordSkipper { + public: + RecordSkipper(RowRanges& pages, const RowRanges& row_ranges_) + : row_ranges(row_ranges_) { + // copy row_ranges + RowRanges will_process_pages, skip_pages; + for (auto& page : pages.GetRanges()) { + if (!row_ranges.IsOverlapping(page)) { + skip_pages.Add(page); + } + } + + /// Since the skipped pages will be silently skipped without updating + /// current_rg_processed_records or records_read_, we need to pre-process the row + /// ranges as if these skipped pages never existed + adjust_ranges(skip_pages, row_ranges); + + total_rows_to_process = pages.RowCount() - skip_pages.RowCount(); + } + + /// \brief Return the number of records to read or to skip + /// if return values is positive, it means to read N records + /// if return values is negative, it means to skip N records + /// if return values is 0, it means end of RG + int64_t advise_next(const int64_t current_rg_processed) { + if (row_ranges.GetRanges().size() == row_range_idx) { + return 0; + } + + if (row_ranges[row_range_idx].end < current_rg_processed) { + row_range_idx++; + if (row_ranges.GetRanges().size() == row_range_idx) { + // negative, skip the ramaining rows + return current_rg_processed - total_rows_to_process; + } + } + + if (row_ranges[row_range_idx].start > current_rg_processed) { + // negative, skip + return current_rg_processed - row_ranges[row_range_idx].start; + } + + const auto ret = row_ranges[row_range_idx].end - current_rg_processed + 1; + return ret; + } + + private: + void adjust_ranges(RowRanges& skip_pages, RowRanges& to_adjust) { + size_t skipped_rows = 0; + auto iter = to_adjust.GetRanges().begin(); + auto skip_iter = skip_pages.GetRanges().begin(); + while (iter != to_adjust.GetRanges().end()) { + while (skip_iter != skip_pages.GetRanges().end() && skip_iter->IsBefore(*iter)) { + skipped_rows += 
skip_iter->Count(); + ++skip_iter; + } + iter->start -= skipped_rows; + iter->end -= skipped_rows; + ++iter; + } + } + + /// Keep copy of ranges, because adjust_ranges() will modify them + RowRanges row_ranges; + + size_t row_range_idx = 0; + size_t total_rows_to_process = 0; +}; + +/// \brief Stateful column reader that delimits semantic records for both flat +/// and nested columns +/// +/// \note API EXPERIMENTAL +/// \since 1.3.0 +class PARQUET_EXPORT RecordReader { + public: + /// \brief Creates a record reader. + /// @param descr Column descriptor + /// @param leaf_info Level info, used to determine if a column is nullable or not + /// @param pool Memory pool to use for buffering values and rep/def levels + /// @param read_dictionary True if reading directly as Arrow dictionary-encoded + /// @param read_dense_for_nullable True if reading dense and not leaving space for null + /// values + static std::shared_ptr Make( + const ColumnDescriptor* descr, LevelInfo leaf_info, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), + bool read_dictionary = false, bool read_dense_for_nullable = false); + + virtual ~RecordReader() = default; + + /// \brief Attempt to read indicated number of records from column chunk + /// Note that for repeated fields, a record may have more than one value + /// and all of them are read. If read_dense_for_nullable() it will + /// not leave any space for null values. Otherwise, it will read spaced. + /// \return number of records read + virtual int64_t ReadRecords(int64_t num_records) = 0; + + /// \brief Attempt to skip indicated number of records from column chunk. + /// Note that for repeated fields, a record may have more than one value + /// and all of them are skipped. + /// \return number of records skipped + virtual int64_t SkipRecords(int64_t num_records) = 0; + + /// \brief Pre-allocate space for data. 
Results in better flat read performance + virtual void Reserve(int64_t num_values) = 0; + + /// \brief Clear consumed values and repetition/definition levels as the + /// result of calling ReadRecords + /// For FLBA and ByteArray types, call GetBuilderChunks() to reset them. + virtual void Reset() = 0; + + /// \brief Transfer filled values buffer to caller. A new one will be + /// allocated in subsequent ReadRecords calls + virtual std::shared_ptr ReleaseValues() = 0; + + /// \brief Transfer filled validity bitmap buffer to caller. A new one will + /// be allocated in subsequent ReadRecords calls + virtual std::shared_ptr ReleaseIsValid() = 0; + + /// \brief Return true if the record reader has more internal data yet to + /// process + virtual bool HasMoreData() const = 0; + + /// \brief Advance record reader to the next row group. Must be set before + /// any records could be read/skipped. + /// \param[in] reader obtained from RowGroupReader::GetColumnPageReader + virtual void SetPageReader(std::unique_ptr reader) = 0; + + /// \brief Returns the underlying column reader's descriptor. + virtual const ColumnDescriptor* descr() const = 0; + + virtual void DebugPrintState() = 0; + + /// \brief Decoded definition levels + int16_t* def_levels() const { + return reinterpret_cast(def_levels_->mutable_data()); + } + + /// \brief Decoded repetition levels + int16_t* rep_levels() const { + return reinterpret_cast(rep_levels_->mutable_data()); + } + + /// \brief Decoded values, including nulls, if any + /// FLBA and ByteArray types do not use this array and read into their own + /// builders. + uint8_t* values() const { return values_->mutable_data(); } + + /// \brief Number of values written, including space left for nulls if any. + /// If this Reader was constructed with read_dense_for_nullable(), there is no space for + /// nulls and null_count() will be 0. There is no read-ahead/buffering for values. 
For + /// FLBA and ByteArray types this value reflects the values written with the last + /// ReadRecords call since those readers will reset the values after each call. + int64_t values_written() const { return values_written_; } + + /// \brief Number of definition / repetition levels (from those that have + /// been decoded) that have been consumed inside the reader. + int64_t levels_position() const { return levels_position_; } + + /// \brief Number of definition / repetition levels that have been written + /// internally in the reader. This may be larger than values_written() because + /// for repeated fields we need to look at the levels in advance to figure out + /// the record boundaries. + int64_t levels_written() const { return levels_written_; } + + /// \brief Number of nulls in the leaf that we have read so far into the + /// values vector. This is only valid when !read_dense_for_nullable(). When + /// read_dense_for_nullable() it will always be 0. + int64_t null_count() const { return null_count_; } + + /// \brief True if the leaf values are nullable + bool nullable_values() const { return nullable_values_; } + + /// \brief True if reading directly as Arrow dictionary-encoded + bool read_dictionary() const { return read_dictionary_; } + + /// \brief True if reading dense for nullable columns. + bool read_dense_for_nullable() const { return read_dense_for_nullable_; } + + void reset_current_rg_processed_records() { current_rg_processed_records = 0; } + + void set_record_skipper(std::shared_ptr skipper_) { skipper = skipper_; } + + protected: + /// \brief Indicates if we can have nullable values. Note that repeated fields + /// may or may not be nullable. + bool nullable_values_; + + bool at_record_start_; + int64_t records_read_; + + int64_t current_rg_processed_records; // counting both read and skip records + + /// \brief Stores values. These values are populated based on each ReadRecords + /// call. No extra values are buffered for the next call. 
SkipRecords will not + /// add any value to this buffer. + std::shared_ptr<::arrow::ResizableBuffer> values_; + /// \brief False for BYTE_ARRAY, in which case we don't allocate the values + /// buffer and we directly read into builder classes. + bool uses_values_; + + /// \brief Values that we have read into 'values_' + 'null_count_'. + int64_t values_written_; + int64_t values_capacity_; + int64_t null_count_; + + /// \brief Each bit corresponds to one element in 'values_' and specifies if it + /// is null or not null. Not set if read_dense_for_nullable_ is true. + std::shared_ptr<::arrow::ResizableBuffer> valid_bits_; + + /// \brief Buffer for definition levels. May contain more levels than + /// is actually read. This is because we read levels ahead to + /// figure out record boundaries for repeated fields. + /// For flat required fields, 'def_levels_' and 'rep_levels_' are not + /// populated. For non-repeated fields 'rep_levels_' is not populated. + /// 'def_levels_' and 'rep_levels_' must be of the same size if present. + std::shared_ptr<::arrow::ResizableBuffer> def_levels_; + /// \brief Buffer for repetition levels. Only populated for repeated + /// fields. + std::shared_ptr<::arrow::ResizableBuffer> rep_levels_; + + /// \brief Number of definition / repetition levels that have been written + /// internally in the reader. This may be larger than values_written() since + /// for repeated fields we need to look at the levels in advance to figure out + /// the record boundaries. + int64_t levels_written_; + /// \brief Position of the next level that should be consumed. + int64_t levels_position_; + int64_t levels_capacity_; + + bool read_dictionary_ = false; + // If true, we will not leave any space for the null values in the values_ + // vector. 
+ bool read_dense_for_nullable_ = false; + + std::shared_ptr skipper = NULLPTR; +}; + +class BinaryRecordReader : virtual public RecordReader { + public: + virtual std::vector> GetBuilderChunks() = 0; +}; + +/// \brief Read records directly to dictionary-encoded Arrow form (int32 +/// indices). Only valid for BYTE_ARRAY columns +class DictionaryRecordReader : virtual public RecordReader { + public: + virtual std::shared_ptr<::arrow::ChunkedArray> GetResult() = 0; +}; + +} // namespace internal + +using BoolReader = TypedColumnReader; +using Int32Reader = TypedColumnReader; +using Int64Reader = TypedColumnReader; +using Int96Reader = TypedColumnReader; +using FloatReader = TypedColumnReader; +using DoubleReader = TypedColumnReader; +using ByteArrayReader = TypedColumnReader; +using FixedLenByteArrayReader = TypedColumnReader; + +} // namespace parquet From 7b5d4a627e67f212b274deecb6abab7687842513 Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Sun, 14 Jan 2024 22:38:46 +0800 Subject: [PATCH 16/25] RowRangesPageFilter refactored --- cpp/src/parquet/arrow/reader.cc | 112 +++++++------ cpp/src/parquet/arrow/reader.h | 3 +- cpp/src/parquet/arrow/reader_internal.h | 8 +- cpp/src/parquet/column_reader.cc | 2 +- cpp/src/parquet/column_reader.h | 199 ++++++++++++++---------- cpp/src/parquet/range_reader_test.cc | 24 +-- cpp/src/parquet/row_range_test.cc | 2 +- 7 files changed, 207 insertions(+), 143 deletions(-) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 92b746b8ad92..8289b63b475d 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -17,7 +17,7 @@ #include "parquet/arrow/reader.h" -#include +#include "parquet/page_index.h" #include #include @@ -74,7 +74,6 @@ using arrow::internal::Iota; // Help reduce verbosity using ParquetReader = parquet::ParquetFileReader; -using parquet::IntervalRange; using parquet::internal::RecordReader; namespace bit_util = arrow::bit_util; @@ -206,7 +205,7 @@ class 
FileReaderImpl : public FileReader { Status GetFieldReader( int i, const std::shared_ptr>& included_leaves, const std::vector& row_groups, - const std::shared_ptr> & row_ranges_map, + const std::shared_ptr> & row_ranges_per_rg, std::unique_ptr* out) { // Should be covered by GetRecordBatchReader checks but // manifest_.schema_fields is a separate variable so be extra careful. @@ -223,13 +222,13 @@ class FileReaderImpl : public FileReader { ctx->iterator_factory = SomeRowGroupsFactory(row_groups); ctx->filter_leaves = true; ctx->included_leaves = included_leaves; - ctx->row_ranges_map = row_ranges_map; + ctx->row_ranges_per_rg = row_ranges_per_rg; return GetReader(manifest_.schema_fields[i], ctx, out); } Status GetFieldReaders( const std::vector& column_indices, const std::vector& row_groups, - const std::shared_ptr> & row_ranges_map, + const std::shared_ptr> & row_ranges_per_rg, std::vector>* out, std::shared_ptr<::arrow::Schema>* out_schema) { // We only need to read schema fields which have columns indicated @@ -244,7 +243,7 @@ class FileReaderImpl : public FileReader { for (size_t i = 0; i < out->size(); ++i) { std::unique_ptr reader; RETURN_NOT_OK(GetFieldReader(field_indices[i], included_leaves, row_groups, - row_ranges_map, &reader)); + row_ranges_per_rg, &reader)); out_fields[i] = reader->field(); out->at(i) = std::move(reader); @@ -345,10 +344,10 @@ class FileReaderImpl : public FileReader { // This is a internal API owned by FileReaderImpl, not exposed in FileReader Status GetRecordBatchReaderWithRowRanges(const std::vector& row_group_indices, const std::vector& column_indices, - const std::shared_ptr> & row_ranges_map, + const std::shared_ptr> & row_ranges_per_rg, std::unique_ptr* out); - Status GetRecordBatchReader(const RowRanges& rows_to_return, + Status GetRecordBatchReader(const IntervalRanges& rows_to_return, const std::vector& column_indices, std::unique_ptr* out) override { const auto metadata = reader_->metadata(); @@ -358,7 +357,7 @@ class 
FileReaderImpl : public FileReader { rows_to_return.ToString()); } // check if the row ranges are within the row group boundaries - if (rows_to_return.RowCount() != 0 && rows_to_return.GetRanges().back().end >= metadata->num_rows()) { + if (rows_to_return.RowCount() != 0 && rows_to_return.LastRow() >= metadata->num_rows()) { return Status::Invalid("The provided row range " + rows_to_return.ToString() + " exceeds the number of rows in the file: " + std::to_string(metadata->num_rows())); @@ -371,20 +370,19 @@ class FileReaderImpl : public FileReader { split_points.push_back(rows_so_far); } // We'll assign a RowRanges for each RG, even if it's not required to return any rows - const std::vector splits = rows_to_return.SplitAt(split_points); - // Call row_ranges_map because array index is the row group index - const std::shared_ptr> row_ranges_map = - std::make_shared>(); + const std::vector splits = rows_to_return.SplitAt(split_points); + const std::shared_ptr> row_ranges_per_rg = + std::make_shared>(); rows_so_far = 0; std::vector row_group_indices; for (int i = 0 ; i < metadata->num_row_groups(); i++) { - row_ranges_map->push_back(splits[i].shift(-rows_so_far)); + row_ranges_per_rg->push_back(splits[i].shift(-rows_so_far)); rows_so_far += metadata->RowGroup(i)->num_rows(); - if (row_ranges_map->at(i).RowCount() > 0) + if (row_ranges_per_rg->at(i).RowCount() > 0) row_group_indices.push_back(i); } - return GetRecordBatchReaderWithRowRanges(row_group_indices, column_indices, row_ranges_map, out); + return GetRecordBatchReaderWithRowRanges(row_group_indices, column_indices, row_ranges_per_rg, out); } Status GetRecordBatchReader(const std::vector& row_group_indices, @@ -504,38 +502,59 @@ class RowGroupReaderImpl : public RowGroupReader { // ---------------------------------------------------------------------- // Column reader implementations -struct RowRangesPageFilter { - explicit RowRangesPageFilter(const RowRanges& row_ranges_, - const std::shared_ptr& page_ranges_) 
- : row_ranges(row_ranges_), page_ranges(page_ranges_) { +// Only support IntervalRange case for now +class RowRangesPageFilter { + public: + RowRangesPageFilter(const RowRanges& row_ranges, const std::shared_ptr& page_ranges) + : row_ranges_(row_ranges), page_ranges_(page_ranges) { + } - if (page_ranges == nullptr || page_ranges->GetRanges().size() == 0) { - throw ParquetException("Page ranges is empty"); - } + // To avoid error "std::function target must be copy-constructible", we must define copy constructor + RowRangesPageFilter(const RowRangesPageFilter& other) + : row_ranges_(other.row_ranges_), page_ranges_(other.page_ranges_) { } bool operator()(const DataPageStats& stats) { - ++page_range_idx; - IntervalRange current_page_range = (*page_ranges)[page_range_idx]; + if (!initted) { + row_ranges_itr_ = row_ranges_.NewIterator(); + page_ranges_itr_ = page_ranges_->NewIterator(); + + current_row_range_ = row_ranges_itr_->NextRange(); + + if (current_row_range_.index() != 0) { + throw ParquetException("RowRangesPageFilter expects first NextRange() to be a IntervalRange"); + } + initted = true; + } + + current_page_range_ = page_ranges_itr_->NextRange(); + if (current_page_range_.index() != 0) { + throw ParquetException("RowRangesPageFilter expects first NextRange() to be a IntervalRange"); + } - while (row_range_idx < row_ranges.GetRanges().size() && - current_page_range.IsAfter(row_ranges[row_range_idx])) { - row_range_idx++; + while (current_row_range_.index() == 0 && + std::get(current_page_range_).IsAfter( + std::get(current_row_range_))) { + current_row_range_ = row_ranges_itr_->NextRange(); } - if (row_range_idx >= row_ranges.GetRanges().size()) { + if (current_row_range_.index() != 0) { return true; } - return current_page_range.IsBefore(row_ranges[row_range_idx]); + return std::get(current_page_range_).IsBefore( + std::get(current_row_range_)); } - size_t row_range_idx = 0; - const RowRanges & row_ranges; - - int page_range_idx = -1; - const 
std::shared_ptr page_ranges; + private: + const RowRanges& row_ranges_; + const std::shared_ptr page_ranges_; + std::unique_ptr row_ranges_itr_ = NULLPTR; + std::unique_ptr page_ranges_itr_ = NULLPTR; + std::variant current_row_range_ = End(); + std::variant current_page_range_ = End(); + bool initted = false; }; // Leaf reader is for primitive arrays and primitive children of nested arrays @@ -600,8 +619,8 @@ class LeafReader : public ColumnReaderImpl { private: std::shared_ptr out_; - void checkAndGetPageRanges(const RowRanges & row_ranges, - std::shared_ptr& page_ranges) const { + void checkAndGetPageRanges(const IntervalRanges& row_ranges, + std::shared_ptr& page_ranges) const { // check offset exists const auto rg_pg_index_reader = ctx_->reader->GetPageIndexReader()->RowGroup(input_->current_row_group()); @@ -622,7 +641,7 @@ class LeafReader : public ColumnReaderImpl { } const auto page_locations = offset_index->page_locations(); - page_ranges = std::make_shared(); + page_ranges = std::make_shared(); for (size_t i = 0; i < page_locations.size() - 1; i++) { page_ranges->Add( {page_locations[i].first_row_index, page_locations[i + 1].first_row_index - 1}); @@ -634,8 +653,8 @@ class LeafReader : public ColumnReaderImpl { 1}); } - if (row_ranges.GetRanges().size() > 0) { - if (row_ranges.GetRanges().back().end > page_ranges->GetRanges().back().end) { + if (row_ranges.RowCount() > 0) { + if (row_ranges.LastRow() > page_ranges->LastRow()) { throw ParquetException( "The provided row range " + row_ranges.ToString() + " exceeds last page :" + page_ranges->GetRanges().back().ToString()); @@ -647,14 +666,17 @@ class LeafReader : public ColumnReaderImpl { std::unique_ptr page_reader = input_->NextChunk(); /// using page index to reduce cost - if (page_reader != nullptr && ctx_->row_ranges_map) { + if (page_reader != nullptr && ctx_->row_ranges_per_rg) { // reset skipper record_reader_->set_record_skipper(NULLPTR); - const auto & row_ranges = 
(*ctx_->row_ranges_map)[input_->current_row_group()]; + const auto & row_ranges = (*ctx_->row_ranges_per_rg)[input_->current_row_group()]; if (row_ranges.RowCount() != 0) { + // BitmapRange is not supported yet, the following implementations + // are based on ItervalRanges assumption !!! + // if specific row range is provided for this rg - std::shared_ptr page_ranges; + std::shared_ptr page_ranges; checkAndGetPageRanges(row_ranges, page_ranges); // part 1, skip decompressing & decoding unnecessary pages @@ -1142,7 +1164,7 @@ Status GetReader(const SchemaField& field, const std::shared_ptr& Status FileReaderImpl::GetRecordBatchReaderWithRowRanges( const std::vector& row_groups, const std::vector& column_indices, - const std::shared_ptr> & row_ranges_map, + const std::shared_ptr> & row_ranges_per_rg, std::unique_ptr* out) { RETURN_NOT_OK(BoundsCheck(row_groups, column_indices)); @@ -1156,7 +1178,7 @@ Status FileReaderImpl::GetRecordBatchReaderWithRowRanges( std::vector> readers; std::shared_ptr<::arrow::Schema> batch_schema; - RETURN_NOT_OK(GetFieldReaders(column_indices, row_groups, row_ranges_map, &readers, + RETURN_NOT_OK(GetFieldReaders(column_indices, row_groups, row_ranges_per_rg, &readers, &batch_schema)); if (readers.empty()) { diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h index 807be797aad6..b439f82789a0 100644 --- a/cpp/src/parquet/arrow/reader.h +++ b/cpp/src/parquet/arrow/reader.h @@ -197,7 +197,7 @@ class PARQUET_EXPORT FileReader { /// /// \returns error Status if either rows_to_return or column_indices /// contains an invalid index - virtual ::arrow::Status GetRecordBatchReader(const RowRanges& rows_to_return, + virtual ::arrow::Status GetRecordBatchReader(const IntervalRanges& rows_to_return, const std::vector& column_indices, std::unique_ptr<::arrow::RecordBatchReader>* out) = 0; /// \brief Return a RecordBatchReader of row groups selected from @@ -219,7 +219,6 @@ class PARQUET_EXPORT FileReader { 
std::shared_ptr<::arrow::RecordBatchReader>* out); ::arrow::Status GetRecordBatchReader(std::shared_ptr<::arrow::RecordBatchReader>* out); - /// \brief Return a generator of record batches. /// /// The FileReader must outlive the generator, so this requires that you pass in a diff --git a/cpp/src/parquet/arrow/reader_internal.h b/cpp/src/parquet/arrow/reader_internal.h index 4d98f8a7fe5c..f579e62f610f 100644 --- a/cpp/src/parquet/arrow/reader_internal.h +++ b/cpp/src/parquet/arrow/reader_internal.h @@ -76,7 +76,7 @@ class FileColumnIterator { } auto row_group_reader = reader_->RowGroup(row_groups_.front()); - current_rg = row_groups_.front(); + current_rg_ = row_groups_.front(); row_groups_.pop_front(); return row_group_reader->GetColumnPageReader(column_index_); } @@ -89,14 +89,14 @@ class FileColumnIterator { int column_index() const { return column_index_; } - int current_row_group() const { return current_rg; } + int current_row_group() const { return current_rg_; } protected: int column_index_; ParquetFileReader* reader_; const SchemaDescriptor* schema_; std::deque row_groups_; - int current_rg = 0; + int current_rg_ = 0; }; using FileColumnIteratorFactory = @@ -113,7 +113,7 @@ struct ReaderContext { FileColumnIteratorFactory iterator_factory; bool filter_leaves; std::shared_ptr> included_leaves; - std::shared_ptr> row_ranges_map; + std::shared_ptr> row_ranges_per_rg; bool IncludesLeaf(int leaf_index) const { if (this->filter_leaves) { diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index b517ee7c798e..56e0f0b99450 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -1998,7 +1998,7 @@ class TypedRecordReader : public TypedColumnReaderImpl, } while (true) { - const auto advise = skipper->advise_next(current_rg_processed_records); + const auto advise = skipper->AdviseNext(current_rg_processed_records); if (advise == 0) { return 0; } diff --git a/cpp/src/parquet/column_reader.h 
b/cpp/src/parquet/column_reader.h index 7ebabf1f2095..4d9770296d92 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -321,7 +321,13 @@ struct IntervalRange { } } - size_t Count() const { return end - start + 1; } + size_t Count() const { + if(!IsValid()) { + throw ParquetException("Invalid range with start: " + std::to_string(start) + + " and end: " + std::to_string(end)); + } + return end - start + 1; + } bool IsBefore(const IntervalRange& other) const { return end < other.start; } @@ -355,22 +361,89 @@ struct End {}; class RowRanges { public: RowRanges() = default; + virtual ~RowRanges() = default; + virtual size_t RowCount() const = 0; + virtual int64_t LastRow() const = 0; + virtual bool IsValid() const = 0; + + // Returns a vector of PageLocations that must be read all to get values for + // all included in this range virtual std::vector + // PageIndexesToInclude(const std::vector& all_pages) = 0; + + class Iterator { + public: + virtual std::variant NextRange() = 0; + virtual ~Iterator() = default; + }; + virtual std::unique_ptr NewIterator() const = 0; + +}; - explicit RowRanges(const IntervalRange& range) { ranges.push_back(range); } +class IntervalRanges : public RowRanges { + public: + IntervalRanges() = default; + + explicit IntervalRanges(const IntervalRange& range) { ranges_.push_back(range); } + + IntervalRanges(const std::vector& ranges) { this->ranges_ = ranges; } - RowRanges(const std::vector& ranges) { this->ranges = ranges; } + IntervalRanges(const IntervalRanges& other) { ranges_ = other.ranges_; } - RowRanges(const RowRanges& other) { ranges = other.ranges; } + IntervalRanges(IntervalRanges&& other) noexcept { ranges_ = std::move(other.ranges_); } - RowRanges(RowRanges&& other) noexcept { ranges = std::move(other.ranges); } + class IntervalRowRangesIterator : public Iterator { + public: + IntervalRowRangesIterator(const std::vector & ranges) : ranges_(ranges) {} + ~IntervalRowRangesIterator() override {} + + 
std::variant NextRange() override { + if(current_index_ >= ranges_.size()) + return End(); + + return ranges_[current_index_++]; + } - static RowRanges Intersection(const RowRanges& left, const RowRanges& right) { - RowRanges result; + private: + const std::vector & ranges_; + size_t current_index_ = 0; + }; + + std::unique_ptr NewIterator() const override { + return std::make_unique(ranges_); + } + + size_t RowCount() const override { + size_t cnt = 0; + for (const IntervalRange& range : ranges_) { + cnt += range.Count(); + } + return cnt; + } + + int64_t LastRow() const override { + return ranges_.back().end; + } + + bool IsValid() const override { + if (ranges_.size() == 0) return true; + if (ranges_[0].start < 0) { + return false; + } + for (size_t i = 1; i < ranges_.size(); i++) { + if (ranges_[i].start <= ranges_[i - 1].end) { + return false; + } + } + return true; + } + + static IntervalRanges Intersection(const IntervalRanges& left, const IntervalRanges& right) { + IntervalRanges result; size_t rightIndex = 0; - for (const IntervalRange& l : left.ranges) { - for (size_t i = rightIndex, n = right.ranges.size(); i < n; ++i) { - const IntervalRange& r = right.ranges[i]; + for (const IntervalRange& l : left.ranges_) { + for (size_t i = rightIndex, n = right.ranges_.size(); i < n; ++i) { + const IntervalRange& r = right.ranges_[i]; if (l.IsBefore(r)) { break; } else if (l.IsAfter(r)) { @@ -386,31 +459,10 @@ class RowRanges { void Add(const IntervalRange& range) { const IntervalRange rangeToAdd = range; - if (ranges.size() > 1 && rangeToAdd.start <= ranges.back().end) { + if (ranges_.size() > 1 && rangeToAdd.start <= ranges_.back().end) { throw ParquetException("Ranges must be added in order"); } - ranges.push_back(rangeToAdd); - } - - size_t RowCount() const { - size_t cnt = 0; - for (const IntervalRange& range : ranges) { - cnt += range.Count(); - } - return cnt; - } - - bool IsValid() const { - if (ranges.size() == 0) return true; - if (ranges[0].start < 0) { 
- return false; - } - for (size_t i = 1; i < ranges.size(); i++) { - if (ranges[i].start <= ranges[i - 1].end) { - return false; - } - } - return true; + ranges_.push_back(rangeToAdd); } bool IsOverlapping(int64_t start, int64_t end) const { @@ -420,23 +472,23 @@ class RowRanges { bool IsOverlapping(const IntervalRange& searchRange) const { auto it = std::lower_bound( - ranges.begin(), ranges.end(), searchRange, + ranges_.begin(), ranges_.end(), searchRange, [](const IntervalRange& r1, const IntervalRange& r2) { return r1.IsBefore(r2); }); - return it != ranges.end() && !(*it).IsAfter(searchRange); + return it != ranges_.end() && !(*it).IsAfter(searchRange); } - std::vector& GetRanges() { return ranges; } + std::vector& GetRanges() { return ranges_; } - const std::vector& GetRanges() const { return ranges; } + const std::vector& GetRanges() const { return ranges_; } // Split the ranges into N+1 parts at the given split point, where N = - // split_points.size() The RowRows object itself is not modified - std::vector SplitAt(const std::vector& split_points) const { + // split_points.size(). The RowRows object itself is not modified + std::vector SplitAt(const std::vector& split_points) const { if (split_points.size() == 0) { return {*this}; } - std::vector result; + std::vector result; int64_t last_split_point = -1; for (const int64_t split_point : split_points) { if (split_point <= 0) { @@ -448,7 +500,7 @@ class RowRanges { last_split_point = split_point; } - RowRanges spaces; + IntervalRanges spaces; for (size_t i = 0; i < split_points.size(); ++i) { auto start = i == 0 ? 
0 : split_points[i - 1]; auto end = split_points[i] - 1; @@ -458,7 +510,7 @@ class RowRanges { {split_points[split_points.size() - 1], std::numeric_limits::max()}); for (IntervalRange space : spaces.GetRanges()) { - RowRanges intersection = RowRanges::Intersection(RowRanges(space), *this); + IntervalRanges intersection = IntervalRanges::Intersection(IntervalRanges(space), *this); result.push_back(intersection); } @@ -467,15 +519,15 @@ class RowRanges { const IntervalRange& operator[](size_t index) const { // check index - if (index >= ranges.size() || index < 0) { + if (index >= ranges_.size() || index < 0) { throw ParquetException("Index out of range"); } - return ranges[index]; + return ranges_[index]; } - RowRanges shift(const int64_t offset) const { - RowRanges result; - for (const IntervalRange& range : ranges) { + IntervalRanges shift(const int64_t offset) const { + IntervalRanges result; + for (const IntervalRange& range : ranges_) { result.Add({range.start + offset, range.end + offset}); } return result; @@ -483,39 +535,30 @@ class RowRanges { std::string ToString() const { std::string result = "["; - for (const IntervalRange& range : ranges) { + for (const IntervalRange& range : ranges_) { result += "(" + std::to_string(range.start) + ", " + std::to_string(range.end) + "), "; } - if (!ranges.empty()) { + if (!ranges_.empty()) { result = result.substr(0, result.size() - 2); } result += "]"; return result; } - /// The following APIs are to be implemented - /// Comment out for now to pass compile - // // Returns a vector of PageLocations that must be read all to get values for - // all included in this range virtual std::vector - // PageIndexesToInclude(const std::vector& all_pages) = 0; class - // Iterator { - // virtual std::variant NextRange() = 0; - // }; - // virtual std::unique_ptr NewIterator() = 0; private: - std::vector ranges; + std::vector ranges_; }; namespace internal { class PARQUET_EXPORT RecordSkipper { public: - RecordSkipper(RowRanges& 
pages, const RowRanges& row_ranges_) - : row_ranges(row_ranges_) { + RecordSkipper(IntervalRanges& pages, const IntervalRanges& row_ranges) + : row_ranges_(row_ranges) { // copy row_ranges - RowRanges will_process_pages, skip_pages; + IntervalRanges skip_pages; for (auto& page : pages.GetRanges()) { if (!row_ranges.IsOverlapping(page)) { skip_pages.Add(page); @@ -525,39 +568,39 @@ class PARQUET_EXPORT RecordSkipper { /// Since the skipped pages will be silently skipped without updating /// current_rg_processed_records or records_read_, we need to pre-process the row /// ranges as if these skipped pages never existed - adjust_ranges(skip_pages, row_ranges); + AdjustRanges(skip_pages, row_ranges_); - total_rows_to_process = pages.RowCount() - skip_pages.RowCount(); + total_rows_to_process_ = pages.RowCount() - skip_pages.RowCount(); } /// \brief Return the number of records to read or to skip /// if return values is positive, it means to read N records /// if return values is negative, it means to skip N records /// if return values is 0, it means end of RG - int64_t advise_next(const int64_t current_rg_processed) { - if (row_ranges.GetRanges().size() == row_range_idx) { + int64_t AdviseNext(const int64_t current_rg_processed) { + if (row_ranges_.GetRanges().size() == row_range_idx_) { return 0; } - if (row_ranges[row_range_idx].end < current_rg_processed) { - row_range_idx++; - if (row_ranges.GetRanges().size() == row_range_idx) { + if (row_ranges_[row_range_idx_].end < current_rg_processed) { + row_range_idx_++; + if (row_ranges_.GetRanges().size() == row_range_idx_) { // negative, skip the ramaining rows - return current_rg_processed - total_rows_to_process; + return current_rg_processed - total_rows_to_process_; } } - if (row_ranges[row_range_idx].start > current_rg_processed) { + if (row_ranges_[row_range_idx_].start > current_rg_processed) { // negative, skip - return current_rg_processed - row_ranges[row_range_idx].start; + return current_rg_processed - 
row_ranges_[row_range_idx_].start; } - const auto ret = row_ranges[row_range_idx].end - current_rg_processed + 1; + const auto ret = row_ranges_[row_range_idx_].end - current_rg_processed + 1; return ret; } private: - void adjust_ranges(RowRanges& skip_pages, RowRanges& to_adjust) { + void AdjustRanges(IntervalRanges& skip_pages, IntervalRanges& to_adjust) { size_t skipped_rows = 0; auto iter = to_adjust.GetRanges().begin(); auto skip_iter = skip_pages.GetRanges().begin(); @@ -572,11 +615,11 @@ class PARQUET_EXPORT RecordSkipper { } } - /// Keep copy of ranges, because adjust_ranges() will modify them - RowRanges row_ranges; + /// Keep copy of ranges, because AdjustRanges() will modify them + IntervalRanges row_ranges_; - size_t row_range_idx = 0; - size_t total_rows_to_process = 0; + size_t row_range_idx_ = 0; + size_t total_rows_to_process_ = 0; }; /// \brief Stateful column reader that delimits semantic records for both flat diff --git a/cpp/src/parquet/range_reader_test.cc b/cpp/src/parquet/range_reader_test.cc index b3127a8e346c..cde60c583f50 100644 --- a/cpp/src/parquet/range_reader_test.cc +++ b/cpp/src/parquet/range_reader_test.cc @@ -34,7 +34,7 @@ #include using parquet::IntervalRange; -using parquet::RowRanges; +using parquet::IntervalRanges; std::string random_string(std::string::size_type length) { static auto& chrs = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; @@ -279,7 +279,7 @@ TEST_F(TestRecordBatchReaderWithRanges, TestRangesSplit) {} TEST_F(TestRecordBatchReaderWithRanges, SelectOnePageForEachRG) { std::unique_ptr rb_reader; - RowRanges rows{{IntervalRange{0, 9}, IntervalRange{40, 49}, IntervalRange{80, 89}, IntervalRange{90, 99}}}; + IntervalRanges rows{{IntervalRange{0, 9}, IntervalRange{40, 49}, IntervalRange{80, 89}, IntervalRange{90, 99}}}; const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader)); @@ -290,7 +290,7 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOnePageForEachRG) { 
TEST_F(TestRecordBatchReaderWithRanges, SelectSomePageForOneRG) { std::unique_ptr rb_reader; - RowRanges rows{{IntervalRange{0, 7}, IntervalRange{16, 23}}}; + IntervalRanges rows{{IntervalRange{0, 7}, IntervalRange{16, 23}}}; const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader)); @@ -301,7 +301,7 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectSomePageForOneRG) { TEST_F(TestRecordBatchReaderWithRanges, SelectAllRange) { std::unique_ptr rb_reader; - RowRanges rows{{IntervalRange{0, 29}, IntervalRange{30, 59}, IntervalRange{60, 89}, IntervalRange{90, 99}}}; + IntervalRanges rows{{IntervalRange{0, 29}, IntervalRange{30, 59}, IntervalRange{60, 89}, IntervalRange{90, 99}}}; const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader)); @@ -312,7 +312,7 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectAllRange) { TEST_F(TestRecordBatchReaderWithRanges, SelectEmptyRange) { std::unique_ptr rb_reader; - RowRanges rows{}; + IntervalRanges rows{}; const std::vector column_indices{0, 1, 2, 3, 4}; const auto status = @@ -330,7 +330,7 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { if (i % 2 == 0) ranges.push_back({i, i}); } const std::vector column_indices{0, 1, 2, 3, 4}; - ASSERT_OK(arrow_reader->GetRecordBatchReader(RowRanges(ranges), column_indices, + ASSERT_OK(arrow_reader->GetRecordBatchReader(IntervalRanges(ranges), column_indices, &rb_reader)); check_rb(std::move(rb_reader), 15, 210); // 0 + 2 + ... 
+ 28 = 210 @@ -348,7 +348,7 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { if (i % 2 == 0) ranges.push_back({i, i}); } const std::vector column_indices{0, 1, 2, 3, 4}; - ASSERT_OK(arrow_reader->GetRecordBatchReader(RowRanges(ranges), column_indices, + ASSERT_OK(arrow_reader->GetRecordBatchReader(IntervalRanges(ranges), column_indices, &rb_reader)); check_rb(std::move(rb_reader), 30, @@ -359,7 +359,7 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { TEST_F(TestRecordBatchReaderWithRanges, InvalidRanges) { std::unique_ptr rb_reader; { - RowRanges rows{{IntervalRange{-1, 5}}}; + IntervalRanges rows{{IntervalRange{-1, 5}}}; const std::vector column_indices{0, 1, 2, 3, 4}; const auto status = arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader); @@ -370,7 +370,7 @@ TEST_F(TestRecordBatchReaderWithRanges, InvalidRanges) { } { - RowRanges rows{{IntervalRange{0, 4}, {2, 5}}}; + IntervalRanges rows{{IntervalRange{0, 4}, {2, 5}}}; const std::vector column_indices{0, 1, 2, 3, 4}; const auto status = arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader); @@ -381,7 +381,7 @@ TEST_F(TestRecordBatchReaderWithRanges, InvalidRanges) { } { // will treat as {0,99} - RowRanges rows{{IntervalRange{0, 100}}}; + IntervalRanges rows{{IntervalRange{0, 100}}}; const std::vector column_indices{0, 1, 2, 3, 4}; const auto status = arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader); @@ -430,7 +430,7 @@ TEST(TestRecordBatchReaderWithRangesBadCases, NoPageIndex) { ASSERT_OK_AND_ASSIGN(auto arrow_reader, reader_builder.Build()); std::unique_ptr rb_reader; - RowRanges rows{{IntervalRange{0, 29}}}; + IntervalRanges rows{{IntervalRange{0, 29}}}; std::vector column_indices{0, 1, 2, 3, 4}; auto status = arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader); ASSERT_NOT_OK(status); @@ -479,7 +479,7 @@ TEST_F(TestRecordBatchReaderWithRangesWithNulls, SelectOneRowSkipOneRow) { if (i % 2 == 0) 
ranges.push_back({i, i}); } const std::vector column_indices{0, 1, 2, 3, 4}; - ASSERT_OK(arrow_reader->GetRecordBatchReader(RowRanges(ranges), column_indices, + ASSERT_OK(arrow_reader->GetRecordBatchReader(IntervalRanges(ranges), column_indices, &rb_reader)); // 0-9 is masked as null, so the ramaining is: diff --git a/cpp/src/parquet/row_range_test.cc b/cpp/src/parquet/row_range_test.cc index 2e043f57a7b2..81b38bc28268 100644 --- a/cpp/src/parquet/row_range_test.cc +++ b/cpp/src/parquet/row_range_test.cc @@ -21,7 +21,7 @@ using namespace parquet; class RowRangesTest : public ::testing::Test { protected: - RowRanges rowRanges; + IntervalRanges rowRanges; }; TEST_F(RowRangesTest, SplitAt_EmptySplitPoints_ReturnsOriginalRowRanges) { From 25f83f8850005961fa4b6529d61f3af63fb9eaa0 Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Sun, 14 Jan 2024 22:44:51 +0800 Subject: [PATCH 17/25] checkAndGetPageRanges refactored --- cpp/src/parquet/arrow/reader.cc | 7 +++---- cpp/src/parquet/column_reader.h | 8 ++++---- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 8289b63b475d..843e4f227659 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -619,7 +619,7 @@ class LeafReader : public ColumnReaderImpl { private: std::shared_ptr out_; - void checkAndGetPageRanges(const IntervalRanges& row_ranges, + void checkAndGetPageRanges(const RowRanges& row_ranges, std::shared_ptr& page_ranges) const { // check offset exists const auto rg_pg_index_reader = @@ -671,11 +671,10 @@ class LeafReader : public ColumnReaderImpl { record_reader_->set_record_skipper(NULLPTR); const auto & row_ranges = (*ctx_->row_ranges_per_rg)[input_->current_row_group()]; + // if specific row range is provided for this rg if (row_ranges.RowCount() != 0) { - // BitmapRange is not supported yet, the following implementations - // are based on ItervalRanges assumption !!! 
- // if specific row range is provided for this rg + // Use IntervalRanges to represent pages std::shared_ptr page_ranges; checkAndGetPageRanges(row_ranges, page_ranges); diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 4d9770296d92..35924581c8a5 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -340,7 +340,7 @@ struct IntervalRange { bool IsValid() const { return start >= 0 && end >= 0 && end >= start; } std::string ToString() const { - return "[" + std::to_string(start) + ", " + std::to_string(end) + "]"; + return "(" + std::to_string(start) + ", " + std::to_string(end) + ")"; } // inclusive @@ -365,6 +365,7 @@ class RowRanges { virtual size_t RowCount() const = 0; virtual int64_t LastRow() const = 0; virtual bool IsValid() const = 0; + virtual std::string ToString() const = 0; // Returns a vector of PageLocations that must be read all to get values for // all included in this range virtual std::vector @@ -533,11 +534,10 @@ class IntervalRanges : public RowRanges { return result; } - std::string ToString() const { + std::string ToString() const override { std::string result = "["; for (const IntervalRange& range : ranges_) { - result += - "(" + std::to_string(range.start) + ", " + std::to_string(range.end) + "), "; + result += range.ToString() + ", "; } if (!ranges_.empty()) { result = result.substr(0, result.size() - 2); From 2e43866ab4b6e0d2375096c2d47c8aaf2ea0a79e Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Sun, 14 Jan 2024 23:22:30 +0800 Subject: [PATCH 18/25] RecordSkipper refactored --- cpp/src/parquet/column_reader.h | 59 +++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 25 deletions(-) diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 35924581c8a5..b18ef38c7006 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -365,6 +365,7 @@ class RowRanges { virtual size_t RowCount() const = 0; virtual 
int64_t LastRow() const = 0; virtual bool IsValid() const = 0; + virtual bool IsOverlapping(const IntervalRange& searchRange) const = 0; virtual std::string ToString() const = 0; // Returns a vector of PageLocations that must be read all to get values for @@ -471,7 +472,7 @@ class IntervalRanges : public RowRanges { return IsOverlapping(searchRange); } - bool IsOverlapping(const IntervalRange& searchRange) const { + bool IsOverlapping(const IntervalRange& searchRange) const override { auto it = std::lower_bound( ranges_.begin(), ranges_.end(), searchRange, [](const IntervalRange& r1, const IntervalRange& r2) { return r1.IsBefore(r2); }); @@ -546,8 +547,6 @@ class IntervalRanges : public RowRanges { return result; } - - private: std::vector ranges_; }; @@ -555,12 +554,11 @@ class IntervalRanges : public RowRanges { namespace internal { class PARQUET_EXPORT RecordSkipper { public: - RecordSkipper(IntervalRanges& pages, const IntervalRanges& row_ranges) - : row_ranges_(row_ranges) { + RecordSkipper(IntervalRanges& pages, const RowRanges& orig_row_ranges) { // copy row_ranges IntervalRanges skip_pages; for (auto& page : pages.GetRanges()) { - if (!row_ranges.IsOverlapping(page)) { + if (!orig_row_ranges.IsOverlapping(page)) { skip_pages.Add(page); } } @@ -568,7 +566,9 @@ class PARQUET_EXPORT RecordSkipper { /// Since the skipped pages will be silently skipped without updating /// current_rg_processed_records or records_read_, we need to pre-process the row /// ranges as if these skipped pages never existed - AdjustRanges(skip_pages, row_ranges_); + AdjustRanges(skip_pages, orig_row_ranges, row_ranges_); + range_iter_ = row_ranges_->NewIterator(); + current_range_variant = range_iter_->NextRange(); total_rows_to_process_ = pages.RowCount() - skip_pages.RowCount(); } @@ -578,47 +578,56 @@ class PARQUET_EXPORT RecordSkipper { /// if return values is negative, it means to skip N records /// if return values is 0, it means end of RG int64_t AdviseNext(const int64_t 
current_rg_processed) { - if (row_ranges_.GetRanges().size() == row_range_idx_) { + if (current_range_variant.index() == 2) { return 0; } - if (row_ranges_[row_range_idx_].end < current_rg_processed) { - row_range_idx_++; - if (row_ranges_.GetRanges().size() == row_range_idx_) { + auto & current_range = std::get(current_range_variant); + + if (current_range.end < current_rg_processed) { + current_range_variant = range_iter_->NextRange(); + if (current_range_variant.index() == 2) { // negative, skip the ramaining rows return current_rg_processed - total_rows_to_process_; } } - if (row_ranges_[row_range_idx_].start > current_rg_processed) { + current_range = std::get(current_range_variant); + + if (current_range.start > current_rg_processed) { // negative, skip - return current_rg_processed - row_ranges_[row_range_idx_].start; + return current_rg_processed - current_range.start; } - const auto ret = row_ranges_[row_range_idx_].end - current_rg_processed + 1; + const auto ret = current_range.end - current_rg_processed + 1; return ret; } - private: - void AdjustRanges(IntervalRanges& skip_pages, IntervalRanges& to_adjust) { +private: + void AdjustRanges(IntervalRanges& skip_pages, const RowRanges& orig_row_ranges, std::unique_ptr& ret) { + std::unique_ptr temp = std::make_unique(); + size_t skipped_rows = 0; - auto iter = to_adjust.GetRanges().begin(); + const auto orig_range_iter = orig_row_ranges.NewIterator(); + auto orig_range_variant = orig_range_iter->NextRange(); auto skip_iter = skip_pages.GetRanges().begin(); - while (iter != to_adjust.GetRanges().end()) { - while (skip_iter != skip_pages.GetRanges().end() && skip_iter->IsBefore(*iter)) { + while (orig_range_variant.index() != 2) { + const auto & origin_range = std::get(orig_range_variant); + while (skip_iter != skip_pages.GetRanges().end() && skip_iter->IsBefore(origin_range)) { skipped_rows += skip_iter->Count(); ++skip_iter; } - iter->start -= skipped_rows; - iter->end -= skipped_rows; - ++iter; + + 
temp->Add(IntervalRange(origin_range.start - skipped_rows, origin_range.end - skipped_rows)); + orig_range_variant = orig_range_iter->NextRange(); } + ret = std::move(temp); } - /// Keep copy of ranges, because AdjustRanges() will modify them - IntervalRanges row_ranges_; + std::unique_ptr row_ranges_; + std::unique_ptr range_iter_; + std::variant current_range_variant = End(); - size_t row_range_idx_ = 0; size_t total_rows_to_process_ = 0; }; From cb0d67b27dfd0c01ac7ea04f147d3ec484034846 Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Mon, 15 Jan 2024 18:20:04 +0800 Subject: [PATCH 19/25] refactor complete --- cpp/src/parquet/arrow/reader.cc | 47 +++--- cpp/src/parquet/arrow/reader.h | 2 +- cpp/src/parquet/arrow/reader_internal.h | 2 +- cpp/src/parquet/column_reader.h | 165 +++++++++----------- cpp/src/parquet/row_range_test.cc | 193 ++++++++++++++++++------ 5 files changed, 239 insertions(+), 170 deletions(-) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 843e4f227659..49aeeb3f0f5e 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -205,7 +205,7 @@ class FileReaderImpl : public FileReader { Status GetFieldReader( int i, const std::shared_ptr>& included_leaves, const std::vector& row_groups, - const std::shared_ptr> & row_ranges_per_rg, + const std::shared_ptr>> & row_ranges_per_rg, std::unique_ptr* out) { // Should be covered by GetRecordBatchReader checks but // manifest_.schema_fields is a separate variable so be extra careful. 
@@ -222,13 +222,13 @@ class FileReaderImpl : public FileReader { ctx->iterator_factory = SomeRowGroupsFactory(row_groups); ctx->filter_leaves = true; ctx->included_leaves = included_leaves; - ctx->row_ranges_per_rg = row_ranges_per_rg; + ctx->row_ranges_per_rg = row_ranges_per_rg; // copy the shared pointer to extend its lifecycle return GetReader(manifest_.schema_fields[i], ctx, out); } Status GetFieldReaders( const std::vector& column_indices, const std::vector& row_groups, - const std::shared_ptr> & row_ranges_per_rg, + const std::shared_ptr>> & row_ranges_per_rg, std::vector>* out, std::shared_ptr<::arrow::Schema>* out_schema) { // We only need to read schema fields which have columns indicated @@ -344,10 +344,10 @@ class FileReaderImpl : public FileReader { // This is a internal API owned by FileReaderImpl, not exposed in FileReader Status GetRecordBatchReaderWithRowRanges(const std::vector& row_group_indices, const std::vector& column_indices, - const std::shared_ptr> & row_ranges_per_rg, + const std::shared_ptr>> & row_ranges_per_rg, std::unique_ptr* out); - Status GetRecordBatchReader(const IntervalRanges& rows_to_return, + Status GetRecordBatchReader(const RowRanges& rows_to_return, const std::vector& column_indices, std::unique_ptr* out) override { const auto metadata = reader_->metadata(); @@ -362,27 +362,24 @@ class FileReaderImpl : public FileReader { " exceeds the number of rows in the file: " + std::to_string(metadata->num_rows())); } + if (rows_to_return.RowCount() == 0) { + return GetRecordBatchReaderWithRowRanges({}, column_indices, {}, out); + } - std::vector split_points; - int64_t rows_so_far = 0; - for (int i = 0 ; i < metadata->num_row_groups() - 1; i++) { - rows_so_far += metadata->RowGroup(i)->num_rows(); - split_points.push_back(rows_so_far); + std::vector rows_per_rg; + for (int i = 0 ; i < metadata->num_row_groups(); i++) { + rows_per_rg.push_back( metadata->RowGroup(i)->num_rows()); } // We'll assign a RowRanges for each RG, even if 
it's not required to return any rows - const std::vector splits = rows_to_return.SplitAt(split_points); - const std::shared_ptr> row_ranges_per_rg = - std::make_shared>(); - rows_so_far = 0; + std::vector> row_ranges_per_rg = rows_to_return.SplitByRowGroups(rows_per_rg); std::vector row_group_indices; for (int i = 0 ; i < metadata->num_row_groups(); i++) { - row_ranges_per_rg->push_back(splits[i].shift(-rows_so_far)); - rows_so_far += metadata->RowGroup(i)->num_rows(); - if (row_ranges_per_rg->at(i).RowCount() > 0) + if (row_ranges_per_rg.at(i)->RowCount() > 0) row_group_indices.push_back(i); } - return GetRecordBatchReaderWithRowRanges(row_group_indices, column_indices, row_ranges_per_rg, out); + return GetRecordBatchReaderWithRowRanges(row_group_indices, column_indices, + std::make_shared>>(std::move(row_ranges_per_rg)), out); } Status GetRecordBatchReader(const std::vector& row_group_indices, @@ -502,7 +499,9 @@ class RowGroupReaderImpl : public RowGroupReader { // ---------------------------------------------------------------------- // Column reader implementations -// Only support IntervalRange case for now +// This class is used to skip decompressing & decoding unnecessary pages by comparing user-specified row_ranges +// and page_ranges from metadata. +// Only support IntervalRange case for now. 
class RowRangesPageFilter { public: RowRangesPageFilter(const RowRanges& row_ranges, const std::shared_ptr& page_ranges) @@ -672,20 +671,20 @@ class LeafReader : public ColumnReaderImpl { const auto & row_ranges = (*ctx_->row_ranges_per_rg)[input_->current_row_group()]; // if specific row range is provided for this rg - if (row_ranges.RowCount() != 0) { + if (row_ranges->RowCount() != 0) { // Use IntervalRanges to represent pages std::shared_ptr page_ranges; - checkAndGetPageRanges(row_ranges, page_ranges); + checkAndGetPageRanges(*row_ranges, page_ranges); // part 1, skip decompressing & decoding unnecessary pages page_reader->set_data_page_filter( - RowRangesPageFilter(row_ranges, page_ranges)); + RowRangesPageFilter(*row_ranges, page_ranges)); // part 2, skip unnecessary rows in necessary pages record_reader_->set_record_skipper( std::make_shared(*page_ranges, - row_ranges)); + *row_ranges)); } else { NextRowGroup(); return; @@ -1163,7 +1162,7 @@ Status GetReader(const SchemaField& field, const std::shared_ptr& Status FileReaderImpl::GetRecordBatchReaderWithRowRanges( const std::vector& row_groups, const std::vector& column_indices, - const std::shared_ptr> & row_ranges_per_rg, + const std::shared_ptr>> & row_ranges_per_rg, std::unique_ptr* out) { RETURN_NOT_OK(BoundsCheck(row_groups, column_indices)); diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h index b439f82789a0..98ea6f5c1a05 100644 --- a/cpp/src/parquet/arrow/reader.h +++ b/cpp/src/parquet/arrow/reader.h @@ -197,7 +197,7 @@ class PARQUET_EXPORT FileReader { /// /// \returns error Status if either rows_to_return or column_indices /// contains an invalid index - virtual ::arrow::Status GetRecordBatchReader(const IntervalRanges& rows_to_return, + virtual ::arrow::Status GetRecordBatchReader(const RowRanges& rows_to_return, const std::vector& column_indices, std::unique_ptr<::arrow::RecordBatchReader>* out) = 0; /// \brief Return a RecordBatchReader of row groups selected from 
diff --git a/cpp/src/parquet/arrow/reader_internal.h b/cpp/src/parquet/arrow/reader_internal.h index f579e62f610f..b30aef2691c1 100644 --- a/cpp/src/parquet/arrow/reader_internal.h +++ b/cpp/src/parquet/arrow/reader_internal.h @@ -113,7 +113,7 @@ struct ReaderContext { FileColumnIteratorFactory iterator_factory; bool filter_leaves; std::shared_ptr> included_leaves; - std::shared_ptr> row_ranges_per_rg; + std::shared_ptr>> row_ranges_per_rg; bool IncludesLeaf(int leaf_index) const { if (this->filter_leaves) { diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index b18ef38c7006..5b9a96b27df9 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -366,6 +366,10 @@ class RowRanges { virtual int64_t LastRow() const = 0; virtual bool IsValid() const = 0; virtual bool IsOverlapping(const IntervalRange& searchRange) const = 0; + // Given a RowRanges with rows accross all RGs, split it into N RowRanges, where N = number of RGs + // e.g.: suppose we have 2 RGs: [0-99] and [100-199], and user is interested in RowRanges [90-110], then + // this function will return 2 RowRanges: [90-99] and [0-10] + virtual std::vector> SplitByRowGroups(const std::vector& rows_per_rg) const = 0; virtual std::string ToString() const = 0; // Returns a vector of PageLocations that must be read all to get values for @@ -387,26 +391,20 @@ class IntervalRanges : public RowRanges { explicit IntervalRanges(const IntervalRange& range) { ranges_.push_back(range); } - IntervalRanges(const std::vector& ranges) { this->ranges_ = ranges; } - - IntervalRanges(const IntervalRanges& other) { ranges_ = other.ranges_; } - - IntervalRanges(IntervalRanges&& other) noexcept { ranges_ = std::move(other.ranges_); } - class IntervalRowRangesIterator : public Iterator { - public: - IntervalRowRangesIterator(const std::vector & ranges) : ranges_(ranges) {} + public: + IntervalRowRangesIterator(const std::vector& ranges) + : ranges_(ranges) {} 
~IntervalRowRangesIterator() override {} std::variant NextRange() override { - if(current_index_ >= ranges_.size()) - return End(); + if (current_index_ >= ranges_.size()) return End(); return ranges_[current_index_++]; } - private: - const std::vector & ranges_; + private: + const std::vector& ranges_; size_t current_index_ = 0; }; @@ -422,9 +420,7 @@ class IntervalRanges : public RowRanges { return cnt; } - int64_t LastRow() const override { - return ranges_.back().end; - } + int64_t LastRow() const override { return ranges_.back().end; } bool IsValid() const override { if (ranges_.size() == 0) return true; @@ -439,39 +435,6 @@ class IntervalRanges : public RowRanges { return true; } - static IntervalRanges Intersection(const IntervalRanges& left, const IntervalRanges& right) { - IntervalRanges result; - - size_t rightIndex = 0; - for (const IntervalRange& l : left.ranges_) { - for (size_t i = rightIndex, n = right.ranges_.size(); i < n; ++i) { - const IntervalRange& r = right.ranges_[i]; - if (l.IsBefore(r)) { - break; - } else if (l.IsAfter(r)) { - rightIndex = i + 1; - continue; - } - result.Add(IntervalRange::Intersection(l, r)); - } - } - - return result; - } - - void Add(const IntervalRange& range) { - const IntervalRange rangeToAdd = range; - if (ranges_.size() > 1 && rangeToAdd.start <= ranges_.back().end) { - throw ParquetException("Ranges must be added in order"); - } - ranges_.push_back(rangeToAdd); - } - - bool IsOverlapping(int64_t start, int64_t end) const { - const IntervalRange searchRange(start, end); - return IsOverlapping(searchRange); - } - bool IsOverlapping(const IntervalRange& searchRange) const override { auto it = std::lower_bound( ranges_.begin(), ranges_.end(), searchRange, @@ -479,79 +442,93 @@ class IntervalRanges : public RowRanges { return it != ranges_.end() && !(*it).IsAfter(searchRange); } - std::vector& GetRanges() { return ranges_; } - - const std::vector& GetRanges() const { return ranges_; } - - // Split the ranges into N+1 
parts at the given split point, where N = - // split_points.size(). The RowRows object itself is not modified - std::vector SplitAt(const std::vector& split_points) const { - if (split_points.size() == 0) { - return {*this}; + std::string ToString() const override { + std::string result = "["; + for (const IntervalRange& range : ranges_) { + result += range.ToString() + ", "; + } + if (!ranges_.empty()) { + result = result.substr(0, result.size() - 2); } + result += "]"; + return result; + } - std::vector result; - int64_t last_split_point = -1; - for (const int64_t split_point : split_points) { - if (split_point <= 0) { - throw ParquetException("Invalid split point " + std::to_string(split_point)); - } - if (split_point <= last_split_point) { - throw ParquetException("Split points must be in ascending order"); - } - last_split_point = split_point; + std::vector> SplitByRowGroups( + const std::vector& rows_per_rg) const override { + if (rows_per_rg.size() <= 1) { + std::unique_ptr single = + std::make_unique(*this); // return a copy of itself + auto ret = std::vector>(); + ret.push_back(std::move(single)); + return ret; } + std::vector> result; + IntervalRanges spaces; - for (size_t i = 0; i < split_points.size(); ++i) { - auto start = i == 0 ? 0 : split_points[i - 1]; - auto end = split_points[i] - 1; + int64_t rows_so_far = 0; + for (size_t i = 0; i < rows_per_rg.size(); ++i) { + auto start = rows_so_far; + rows_so_far += rows_per_rg[i]; + auto end = rows_so_far - 1; spaces.Add({start, end}); } - spaces.Add( - {split_points[split_points.size() - 1], std::numeric_limits::max()}); + // each RG's row range forms a space, we need to adjust RowRanges in each space to + // zero based. 
for (IntervalRange space : spaces.GetRanges()) { - IntervalRanges intersection = IntervalRanges::Intersection(IntervalRanges(space), *this); - result.push_back(intersection); + auto intersection = Intersection(IntervalRanges(space), *this); + + std::unique_ptr zero_based_ranges = + std::make_unique(); + for (const IntervalRange& range : intersection.GetRanges()) { + zero_based_ranges->Add({range.start - space.start, range.end - space.start}); + } + result.push_back(std::move(zero_based_ranges)); } return result; } - const IntervalRange& operator[](size_t index) const { - // check index - if (index >= ranges_.size() || index < 0) { - throw ParquetException("Index out of range"); - } - return ranges_[index]; - } - - IntervalRanges shift(const int64_t offset) const { + static IntervalRanges Intersection(const IntervalRanges& left, + const IntervalRanges& right) { IntervalRanges result; - for (const IntervalRange& range : ranges_) { - result.Add({range.start + offset, range.end + offset}); + + size_t rightIndex = 0; + for (const IntervalRange& l : left.ranges_) { + for (size_t i = rightIndex, n = right.ranges_.size(); i < n; ++i) { + const IntervalRange& r = right.ranges_[i]; + if (l.IsBefore(r)) { + break; + } else if (l.IsAfter(r)) { + rightIndex = i + 1; + continue; + } + result.Add(IntervalRange::Intersection(l, r)); + } } + return result; } - std::string ToString() const override { - std::string result = "["; - for (const IntervalRange& range : ranges_) { - result += range.ToString() + ", "; - } - if (!ranges_.empty()) { - result = result.substr(0, result.size() - 2); + void Add(const IntervalRange& range) { + const IntervalRange rangeToAdd = range; + if (ranges_.size() > 1 && rangeToAdd.start <= ranges_.back().end) { + throw ParquetException("Ranges must be added in order"); } - result += "]"; - return result; + ranges_.push_back(rangeToAdd); } + const std::vector& GetRanges() const { return ranges_; } + private: std::vector ranges_; }; namespace internal { + +// 
A RecordSkipper is used to skip uncessary rows within each pages. class PARQUET_EXPORT RecordSkipper { public: RecordSkipper(IntervalRanges& pages, const RowRanges& orig_row_ranges) { diff --git a/cpp/src/parquet/row_range_test.cc b/cpp/src/parquet/row_range_test.cc index 81b38bc28268..82ad60c6b3fe 100644 --- a/cpp/src/parquet/row_range_test.cc +++ b/cpp/src/parquet/row_range_test.cc @@ -24,79 +24,172 @@ class RowRangesTest : public ::testing::Test { IntervalRanges rowRanges; }; -TEST_F(RowRangesTest, SplitAt_EmptySplitPoints_ReturnsOriginalRowRanges) { +TEST_F(RowRangesTest, EmptyRG_ReturnsOriginalRowRanges) { rowRanges.Add(IntervalRange(0, 10)); - std::vector split_points; - - auto result = rowRanges.SplitAt(split_points); + std::vector rows_per_rg; + auto result = rowRanges.SplitByRowGroups(rows_per_rg); ASSERT_EQ(result.size(), 1); - ASSERT_EQ(result[0].GetRanges().size(), 1); - ASSERT_EQ(result[0][0].start, 0); - ASSERT_EQ(result[0][0].end, 10); + + auto iter = result[0]->NewIterator(); + auto range = std::get(iter->NextRange()); + ASSERT_EQ(range.start, 0); + ASSERT_EQ(range.end, 10); + ASSERT_EQ(iter->NextRange().index(), 2); } -TEST_F(RowRangesTest, SplitAt_SingleSplitPoint_ReturnsTwoRowRanges) { +TEST_F(RowRangesTest, SingleRG_ReturnsOriginalRowRanges2) { rowRanges.Add(IntervalRange(0, 10)); - std::vector split_points = {5}; + std::vector rows_per_rg = {11}; - auto result = rowRanges.SplitAt(split_points); + auto result = rowRanges.SplitByRowGroups(rows_per_rg); + ASSERT_EQ(result.size(), 1); - ASSERT_EQ(result.size(), 2); - ASSERT_EQ(result[0].GetRanges().size(), 1); - ASSERT_EQ(result[0][0].start, 0); - ASSERT_EQ(result[0][0].end, 4); - ASSERT_EQ(result[1].GetRanges().size(), 1); - ASSERT_EQ(result[1][0].start, 5); - ASSERT_EQ(result[1][0].end, 10); + auto iter = result[0]->NewIterator(); + auto range = std::get(iter->NextRange()); + ASSERT_EQ(range.start, 0); + ASSERT_EQ(range.end, 10); + ASSERT_EQ(iter->NextRange().index(), 2); } -TEST_F(RowRangesTest, 
SplitAt_MultipleSplitPoints_ReturnsMultipleRowRanges) { +TEST_F(RowRangesTest, ReturnsTwoRowRanges) { rowRanges.Add(IntervalRange(0, 10)); - std::vector split_points = {3, 7}; + std::vector rows_per_rg = {5, 6}; - auto result = rowRanges.SplitAt(split_points); + auto result = rowRanges.SplitByRowGroups(rows_per_rg); + ASSERT_EQ(result.size(), 2); + { + auto iter = result[0]->NewIterator(); + auto range = std::get(iter->NextRange()); + ASSERT_EQ(range.start, 0); + ASSERT_EQ(range.end, 4); + ASSERT_EQ(iter->NextRange().index(), 2); + } + { + auto iter = result[1]->NewIterator(); + auto range = std::get(iter->NextRange()); + ASSERT_EQ(range.start, 0); + ASSERT_EQ(range.end, 5); + ASSERT_EQ(iter->NextRange().index(), 2); + } +} +TEST_F(RowRangesTest, ReturnsMultipleRowRanges) { + rowRanges.Add(IntervalRange(0, 11)); + std::vector rows_per_rg = {3, 4, 100}; + + auto result = rowRanges.SplitByRowGroups(rows_per_rg); ASSERT_EQ(result.size(), 3); - ASSERT_EQ(result[0].GetRanges().size(), 1); - ASSERT_EQ(result[0][0].start, 0); - ASSERT_EQ(result[0][0].end, 2); - ASSERT_EQ(result[1].GetRanges().size(), 1); - ASSERT_EQ(result[1][0].start, 3); - ASSERT_EQ(result[1][0].end, 6); - ASSERT_EQ(result[2].GetRanges().size(), 1); - ASSERT_EQ(result[2][0].start, 7); - ASSERT_EQ(result[2][0].end, 10); + { + auto iter = result[0]->NewIterator(); + auto range = std::get(iter->NextRange()); + ASSERT_EQ(range.start, 0); + ASSERT_EQ(range.end, 2); + ASSERT_EQ(iter->NextRange().index(), 2); + } + { + auto iter = result[1]->NewIterator(); + auto range = std::get(iter->NextRange()); + ASSERT_EQ(range.start, 0); + ASSERT_EQ(range.end, 3); + ASSERT_EQ(iter->NextRange().index(), 2); + } + { + auto iter = result[2]->NewIterator(); + auto range = std::get(iter->NextRange()); + ASSERT_EQ(range.start, 0); + ASSERT_EQ(range.end, 4); + ASSERT_EQ(iter->NextRange().index(), 2); + } } -TEST_F(RowRangesTest, SplitAt_MultipleSplitPoints_ReturnWithEmptyRowRanges) { - rowRanges.Add(IntervalRange(11, 18)); - 
std::vector split_points = {5, 10, 15, 20}; +TEST_F(RowRangesTest, MultipleInputRange) { + rowRanges.Add(IntervalRange(0, 10)); + rowRanges.Add(IntervalRange(90, 111)); + rowRanges.Add(IntervalRange(191, 210)); - auto result = rowRanges.SplitAt(split_points); + std::vector rows_per_rg = {100, 100}; - ASSERT_EQ(result.size(), 5); - ASSERT_EQ(result[0].GetRanges().size(), 0); - ASSERT_EQ(result[1].GetRanges().size(), 0); - ASSERT_EQ(result[2].GetRanges().size(), 1); - ASSERT_EQ(result[2][0].start, 11); - ASSERT_EQ(result[2][0].end, 14); - ASSERT_EQ(result[3].GetRanges().size(), 1); - ASSERT_EQ(result[3][0].start, 15); - ASSERT_EQ(result[3][0].end, 18); - ASSERT_EQ(result[4].GetRanges().size(), 0); + auto result = rowRanges.SplitByRowGroups(rows_per_rg); + ASSERT_EQ(result.size(), 2); + { + auto iter = result[0]->NewIterator(); + auto range = std::get(iter->NextRange()); + ASSERT_EQ(range.start, 0); + ASSERT_EQ(range.end, 10); + + range = std::get(iter->NextRange()); + ASSERT_EQ(range.start, 90); + ASSERT_EQ(range.end, 99); + + ASSERT_EQ(iter->NextRange().index(), 2); + } + { + auto iter = result[1]->NewIterator(); + auto range = std::get(iter->NextRange()); + ASSERT_EQ(range.start, 0); + ASSERT_EQ(range.end, 11); + + range = std::get(iter->NextRange()); + ASSERT_EQ(range.start, 91); + ASSERT_EQ(range.end, 99); + + ASSERT_EQ(iter->NextRange().index(), 2); + } } -TEST_F(RowRangesTest, SplitAt_InvalidSplitPoint_ThrowsException) { - rowRanges.Add(IntervalRange(0, 10)); - std::vector split_points = {-1}; +TEST_F(RowRangesTest, MultipleSplitPoints_ReturnWithEmptyRowRanges) { + rowRanges.Add(IntervalRange(11, 18)); + std::vector rows_per_rg = {5, 5, 5, 5, 5}; - ASSERT_THROW(rowRanges.SplitAt(split_points), ParquetException); + auto result = rowRanges.SplitByRowGroups(rows_per_rg); + ASSERT_EQ(result.size(), 5); + { + auto iter = result[0]->NewIterator(); + ASSERT_EQ(iter->NextRange().index(), 2); + } + { + auto iter = result[1]->NewIterator(); + 
ASSERT_EQ(iter->NextRange().index(), 2); + } + { + auto iter = result[2]->NewIterator(); + auto range = std::get(iter->NextRange()); + ASSERT_EQ(range.start, 1); + ASSERT_EQ(range.end, 4); + ASSERT_EQ(iter->NextRange().index(), 2); + } + { + auto iter = result[3]->NewIterator(); + auto range = std::get(iter->NextRange()); + ASSERT_EQ(range.start, 0); + ASSERT_EQ(range.end, 3); + ASSERT_EQ(iter->NextRange().index(), 2); + } + { + auto iter = result[4]->NewIterator(); + ASSERT_EQ(iter->NextRange().index(), 2); + } } -TEST_F(RowRangesTest, SplitAt_UnorderedSplitPoints_ThrowsException) { +TEST_F(RowRangesTest, RangeExceedRG) { rowRanges.Add(IntervalRange(0, 10)); - std::vector split_points = {5, 3}; + std::vector rows_per_rg = {5, 3}; - ASSERT_THROW(rowRanges.SplitAt(split_points), ParquetException); + auto result = rowRanges.SplitByRowGroups(rows_per_rg); + ASSERT_EQ(result.size(), 2); + { + auto iter = result[0]->NewIterator(); + auto range = std::get(iter->NextRange()); + ASSERT_EQ(range.start, 0); + ASSERT_EQ(range.end, 4); + ASSERT_EQ(iter->NextRange().index(), 2); + } + { + auto iter = result[1]->NewIterator(); + auto range = std::get(iter->NextRange()); + ASSERT_EQ(range.start, 0); + ASSERT_EQ(range.end, 2); + ASSERT_EQ(iter->NextRange().index(), 2); + } } From 5805b976d5b920c4e27d8c984973a1442b32713d Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Mon, 15 Jan 2024 19:35:53 +0800 Subject: [PATCH 20/25] fix style --- cpp/src/parquet/arrow/reader.cc | 2 +- cpp/src/parquet/column_reader.cc | 10 +++++----- cpp/src/parquet/column_reader.h | 16 ++++++++++++---- 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 49aeeb3f0f5e..8782f9d84b2a 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -205,7 +205,7 @@ class FileReaderImpl : public FileReader { Status GetFieldReader( int i, const std::shared_ptr>& included_leaves, const std::vector& row_groups, - 
const std::shared_ptr>> & row_ranges_per_rg, + const std::shared_ptr>>& row_ranges_per_rg, std::unique_ptr* out) { // Should be covered by GetRecordBatchReader checks but // manifest_.schema_fields is a separate variable so be extra careful. diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 56e0f0b99450..763274ed74a2 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -1637,7 +1637,7 @@ class TypedRecordReader : public TypedColumnReaderImpl, int64_t skipped_records = 0; if (this->max_rep_level_ == 0 && this->max_def_level_ == 0) { skipped_records = this->Skip(num_records); - current_rg_processed_records += skipped_records; + current_rg_processed_records_ += skipped_records; return skipped_records; } if (this->max_rep_level_ == 0) { @@ -1656,7 +1656,7 @@ class TypedRecordReader : public TypedColumnReaderImpl, skipped_records += this->SkipRecordsRepeated(num_records); } - current_rg_processed_records += skipped_records; + current_rg_processed_records_ += skipped_records; return skipped_records; } @@ -1988,17 +1988,17 @@ class TypedRecordReader : public TypedColumnReaderImpl, this->ConsumeBufferedValues(values_to_read); } - current_rg_processed_records += records_read; + current_rg_processed_records_ += records_read; return records_read; } int64_t ReadRecordDataWithSkipCheck(const int64_t num_records) { - if (!skipper) { + if (!skipper_) { return ReadRecordData(num_records); } while (true) { - const auto advise = skipper->AdviseNext(current_rg_processed_records); + const auto advise = skipper_->AdviseNext(current_rg_processed_records_); if (advise == 0) { return 0; } diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 5b9a96b27df9..cae7a1336590 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -32,6 +32,7 @@ #include "parquet/types.h" namespace arrow { + class Array; class ChunkedArray; @@ -42,9 +43,11 @@ class BitReader; 
namespace util { class RleDecoder; } // namespace util + } // namespace arrow namespace parquet { + class Decryptor; class Page; @@ -427,6 +430,11 @@ class IntervalRanges : public RowRanges { if (ranges_[0].start < 0) { return false; } + for (size_t i = 0; i < ranges_.size(); i++) { + if (!ranges_[i].IsValid()) { + return false; + } + } for (size_t i = 1; i < ranges_.size(); i++) { if (ranges_[i].start <= ranges_[i - 1].end) { return false; @@ -718,9 +726,9 @@ class PARQUET_EXPORT RecordReader { /// \brief True if reading dense for nullable columns. bool read_dense_for_nullable() const { return read_dense_for_nullable_; } - void reset_current_rg_processed_records() { current_rg_processed_records = 0; } + void reset_current_rg_processed_records() { current_rg_processed_records_ = 0; } - void set_record_skipper(std::shared_ptr skipper_) { skipper = skipper_; } + void set_record_skipper(std::shared_ptr skipper_) { skipper_ = skipper_; } protected: /// \brief Indicates if we can have nullable values. Note that repeated fields @@ -730,7 +738,7 @@ class PARQUET_EXPORT RecordReader { bool at_record_start_; int64_t records_read_; - int64_t current_rg_processed_records; // counting both read and skip records + int64_t current_rg_processed_records_; // counting both read and skip records /// \brief Stores values. These values are populated based on each ReadRecords /// call. No extra values are buffered for the next call. SkipRecords will not @@ -774,7 +782,7 @@ class PARQUET_EXPORT RecordReader { // vector. 
bool read_dense_for_nullable_ = false; - std::shared_ptr skipper = NULLPTR; + std::shared_ptr skipper_ = NULLPTR; }; class BinaryRecordReader : virtual public RecordReader { From 639d94a221d6cb1d5ee4e311918043643f5de12e Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Mon, 15 Jan 2024 20:04:39 +0800 Subject: [PATCH 21/25] separete definition --- cpp/src/parquet/column_reader.cc | 141 +++++++++++++++++++++++++++ cpp/src/parquet/column_reader.h | 161 +++++-------------------------- 2 files changed, 165 insertions(+), 137 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 763274ed74a2..954b94ad47e0 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -1319,6 +1319,147 @@ std::shared_ptr ColumnReader::Make(const ColumnDescriptor* descr, return std::shared_ptr(nullptr); } +// ---------------------------------------------------------------------- +// RowRanges and ins implementations + +IntervalRanges::IntervalRanges() = default; + +IntervalRanges::IntervalRanges(const IntervalRange& range) { ranges_.push_back(range); } + +IntervalRanges::IntervalRanges(const std::vector& ranges) { + this->ranges_ = ranges; +} + +std::unique_ptr IntervalRanges::NewIterator() const { + return std::make_unique(ranges_); +} + +size_t IntervalRanges::RowCount() const { + size_t cnt = 0; + for (const IntervalRange& range : ranges_) { + cnt += range.Count(); + } + return cnt; +} + +int64_t IntervalRanges::LastRow() const { return ranges_.back().end; } + +bool IntervalRanges::IsValid() const { + if (ranges_.size() == 0) return true; + if (ranges_[0].start < 0) { + return false; + } + for (size_t i = 0; i < ranges_.size(); i++) { + if (!ranges_[i].IsValid()) { + return false; + } + } + for (size_t i = 1; i < ranges_.size(); i++) { + if (ranges_[i].start <= ranges_[i - 1].end) { + return false; + } + } + return true; +} + +bool IntervalRanges::IsOverlapping(const IntervalRange& searchRange) const { + auto it = 
std::lower_bound( + ranges_.begin(), ranges_.end(), searchRange, + [](const IntervalRange& r1, const IntervalRange& r2) { return r1.IsBefore(r2); }); + return it != ranges_.end() && !(*it).IsAfter(searchRange); +} + +std::string IntervalRanges::ToString() const { + std::string result = "["; + for (const IntervalRange& range : ranges_) { + result += range.ToString() + ", "; + } + if (!ranges_.empty()) { + result = result.substr(0, result.size() - 2); + } + result += "]"; + return result; +} + +std::vector> IntervalRanges::SplitByRowGroups( + const std::vector& rows_per_rg) const { + if (rows_per_rg.size() <= 1) { + std::unique_ptr single = + std::make_unique(*this); // return a copy of itself + auto ret = std::vector>(); + ret.push_back(std::move(single)); + return ret; + } + + std::vector> result; + + IntervalRanges spaces; + int64_t rows_so_far = 0; + for (size_t i = 0; i < rows_per_rg.size(); ++i) { + auto start = rows_so_far; + rows_so_far += rows_per_rg[i]; + auto end = rows_so_far - 1; + spaces.Add({start, end}); + } + + // each RG's row range forms a space, we need to adjust RowRanges in each space to + // zero based. 
+ for (IntervalRange space : spaces.GetRanges()) { + auto intersection = Intersection(IntervalRanges(space), *this); + + std::unique_ptr zero_based_ranges = + std::make_unique(); + for (const IntervalRange& range : intersection.GetRanges()) { + zero_based_ranges->Add({range.start - space.start, range.end - space.start}); + } + result.push_back(std::move(zero_based_ranges)); + } + + return result; +} + +IntervalRanges IntervalRanges::Intersection(const IntervalRanges& left, + const IntervalRanges& right) { + IntervalRanges result; + + size_t rightIndex = 0; + for (const IntervalRange& l : left.ranges_) { + for (size_t i = rightIndex, n = right.ranges_.size(); i < n; ++i) { + const IntervalRange& r = right.ranges_[i]; + if (l.IsBefore(r)) { + break; + } else if (l.IsAfter(r)) { + rightIndex = i + 1; + continue; + } + result.Add(IntervalRange::Intersection(l, r)); + } + } + + return result; +} + +void IntervalRanges::Add(const IntervalRange& range) { + const IntervalRange rangeToAdd = range; + if (ranges_.size() > 1 && rangeToAdd.start <= ranges_.back().end) { + throw ParquetException("Ranges must be added in order"); + } + ranges_.push_back(rangeToAdd); +} + +const std::vector& IntervalRanges::GetRanges() const { return ranges_; } + +IntervalRowRangesIterator::IntervalRowRangesIterator( + const std::vector& ranges) + : ranges_(ranges) {} +IntervalRowRangesIterator::~IntervalRowRangesIterator() {} + +std::variant IntervalRowRangesIterator::NextRange() { + if (current_index_ >= ranges_.size()) return End(); + + return ranges_[current_index_++]; +} + // ---------------------------------------------------------------------- // RecordReader diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index cae7a1336590..1a921e6c26df 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -390,148 +390,35 @@ class RowRanges { class IntervalRanges : public RowRanges { public: - IntervalRanges() = default; - - explicit 
IntervalRanges(const IntervalRange& range) { ranges_.push_back(range); } - - class IntervalRowRangesIterator : public Iterator { - public: - IntervalRowRangesIterator(const std::vector& ranges) - : ranges_(ranges) {} - ~IntervalRowRangesIterator() override {} - - std::variant NextRange() override { - if (current_index_ >= ranges_.size()) return End(); - - return ranges_[current_index_++]; - } - - private: - const std::vector& ranges_; - size_t current_index_ = 0; - }; - - std::unique_ptr NewIterator() const override { - return std::make_unique(ranges_); - } - - size_t RowCount() const override { - size_t cnt = 0; - for (const IntervalRange& range : ranges_) { - cnt += range.Count(); - } - return cnt; - } - - int64_t LastRow() const override { return ranges_.back().end; } - - bool IsValid() const override { - if (ranges_.size() == 0) return true; - if (ranges_[0].start < 0) { - return false; - } - for (size_t i = 0; i < ranges_.size(); i++) { - if (!ranges_[i].IsValid()) { - return false; - } - } - for (size_t i = 1; i < ranges_.size(); i++) { - if (ranges_[i].start <= ranges_[i - 1].end) { - return false; - } - } - return true; - } - - bool IsOverlapping(const IntervalRange& searchRange) const override { - auto it = std::lower_bound( - ranges_.begin(), ranges_.end(), searchRange, - [](const IntervalRange& r1, const IntervalRange& r2) { return r1.IsBefore(r2); }); - return it != ranges_.end() && !(*it).IsAfter(searchRange); - } - - std::string ToString() const override { - std::string result = "["; - for (const IntervalRange& range : ranges_) { - result += range.ToString() + ", "; - } - if (!ranges_.empty()) { - result = result.substr(0, result.size() - 2); - } - result += "]"; - return result; - } - + IntervalRanges(); + explicit IntervalRanges(const IntervalRange& range); + explicit IntervalRanges(const std::vector& ranges); + std::unique_ptr NewIterator() const override; + size_t RowCount() const override; + int64_t LastRow() const override; + bool IsValid() 
const override; + bool IsOverlapping(const IntervalRange& searchRange) const override; + std::string ToString() const override; std::vector> SplitByRowGroups( - const std::vector& rows_per_rg) const override { - if (rows_per_rg.size() <= 1) { - std::unique_ptr single = - std::make_unique(*this); // return a copy of itself - auto ret = std::vector>(); - ret.push_back(std::move(single)); - return ret; - } - - std::vector> result; - - IntervalRanges spaces; - int64_t rows_so_far = 0; - for (size_t i = 0; i < rows_per_rg.size(); ++i) { - auto start = rows_so_far; - rows_so_far += rows_per_rg[i]; - auto end = rows_so_far - 1; - spaces.Add({start, end}); - } - - // each RG's row range forms a space, we need to adjust RowRanges in each space to - // zero based. - for (IntervalRange space : spaces.GetRanges()) { - auto intersection = Intersection(IntervalRanges(space), *this); - - std::unique_ptr zero_based_ranges = - std::make_unique(); - for (const IntervalRange& range : intersection.GetRanges()) { - zero_based_ranges->Add({range.start - space.start, range.end - space.start}); - } - result.push_back(std::move(zero_based_ranges)); - } - - return result; - } - + const std::vector& rows_per_rg) const override; static IntervalRanges Intersection(const IntervalRanges& left, - const IntervalRanges& right) { - IntervalRanges result; - - size_t rightIndex = 0; - for (const IntervalRange& l : left.ranges_) { - for (size_t i = rightIndex, n = right.ranges_.size(); i < n; ++i) { - const IntervalRange& r = right.ranges_[i]; - if (l.IsBefore(r)) { - break; - } else if (l.IsAfter(r)) { - rightIndex = i + 1; - continue; - } - result.Add(IntervalRange::Intersection(l, r)); - } - } + const IntervalRanges& right); + void Add(const IntervalRange& range); + const std::vector& GetRanges() const; - return result; - } - - void Add(const IntervalRange& range) { - const IntervalRange rangeToAdd = range; - if (ranges_.size() > 1 && rangeToAdd.start <= ranges_.back().end) { - throw 
ParquetException("Ranges must be added in order"); - } - ranges_.push_back(rangeToAdd); - } + private: + std::vector ranges_; +}; - const std::vector& GetRanges() const { return ranges_; } +class IntervalRowRangesIterator : public RowRanges::Iterator { + public: + IntervalRowRangesIterator(const std::vector& ranges); + ~IntervalRowRangesIterator() override; + std::variant NextRange() override; private: - std::vector ranges_; + const std::vector& ranges_; + size_t current_index_; }; namespace internal { @@ -728,7 +615,7 @@ class PARQUET_EXPORT RecordReader { void reset_current_rg_processed_records() { current_rg_processed_records_ = 0; } - void set_record_skipper(std::shared_ptr skipper_) { skipper_ = skipper_; } + void set_record_skipper(const std::shared_ptr& skipper) { skipper_ = skipper; } protected: /// \brief Indicates if we can have nullable values. Note that repeated fields From 8f5a88a481142ea54fae90c7306d17fac80c2200 Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Mon, 15 Jan 2024 20:16:35 +0800 Subject: [PATCH 22/25] separete definition 2 --- cpp/src/parquet/column_reader.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 1a921e6c26df..7d75caba05cb 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -347,9 +347,9 @@ struct IntervalRange { } // inclusive - int64_t start; + int64_t start = -1; // inclusive - int64_t end; + int64_t end = -1; }; struct BitmapRange { @@ -418,7 +418,7 @@ class IntervalRowRangesIterator : public RowRanges::Iterator { private: const std::vector& ranges_; - size_t current_index_; + size_t current_index_ = 0; }; namespace internal { From 09286d7537b575331bda06e5c42f2efb16fc7a4d Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Mon, 15 Jan 2024 20:29:01 +0800 Subject: [PATCH 23/25] separete definition 3 --- cpp/src/parquet/arrow/reader.cc | 1 - cpp/src/parquet/column_reader.cc | 67 
+++++++++++++++++++++++++++++ cpp/src/parquet/column_reader.h | 74 ++++---------------------------- 3 files changed, 76 insertions(+), 66 deletions(-) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 8782f9d84b2a..e471696a401d 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -1398,7 +1398,6 @@ Status FileReaderImpl::GetColumn(int i, FileColumnIteratorFactory iterator_facto ctx->pool = pool_; ctx->iterator_factory = iterator_factory; ctx->filter_leaves = false; - std::unique_ptr result; RETURN_NOT_OK(GetReader(manifest_.schema_fields[i], ctx, &result)); *out = std::move(result); diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 954b94ad47e0..4ba8243f696e 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -2447,5 +2447,72 @@ std::shared_ptr RecordReader::Make(const ColumnDescriptor* descr, return nullptr; } +RecordSkipper::RecordSkipper(IntervalRanges& pages, const RowRanges& orig_row_ranges) { + // copy row_ranges + IntervalRanges skip_pages; + for (auto& page : pages.GetRanges()) { + if (!orig_row_ranges.IsOverlapping(page)) { + skip_pages.Add(page); + } + } + + AdjustRanges(skip_pages, orig_row_ranges, row_ranges_); + range_iter_ = row_ranges_->NewIterator(); + current_range_variant = range_iter_->NextRange(); + + total_rows_to_process_ = pages.RowCount() - skip_pages.RowCount(); +} + + +int64_t RecordSkipper::AdviseNext(const int64_t current_rg_processed) { + if (current_range_variant.index() == 2) { + return 0; + } + + auto& current_range = std::get(current_range_variant); + + if (current_range.end < current_rg_processed) { + current_range_variant = range_iter_->NextRange(); + if (current_range_variant.index() == 2) { + // negative, skip the ramaining rows + return current_rg_processed - total_rows_to_process_; + } + } + + current_range = std::get(current_range_variant); + + if (current_range.start > 
current_rg_processed) { + // negative, skip + return current_rg_processed - current_range.start; + } + + const auto ret = current_range.end - current_rg_processed + 1; + return ret; +} + +void RecordSkipper::AdjustRanges(IntervalRanges& skip_pages, + const RowRanges& orig_row_ranges, + std::unique_ptr& ret) { + std::unique_ptr temp = std::make_unique(); + + size_t skipped_rows = 0; + const auto orig_range_iter = orig_row_ranges.NewIterator(); + auto orig_range_variant = orig_range_iter->NextRange(); + auto skip_iter = skip_pages.GetRanges().begin(); + while (orig_range_variant.index() != 2) { + const auto& origin_range = std::get(orig_range_variant); + while (skip_iter != skip_pages.GetRanges().end() && + skip_iter->IsBefore(origin_range)) { + skipped_rows += skip_iter->Count(); + ++skip_iter; + } + + temp->Add(IntervalRange(origin_range.start - skipped_rows, + origin_range.end - skipped_rows)); + orig_range_variant = orig_range_iter->NextRange(); + } + ret = std::move(temp); +} + } // namespace internal } // namespace parquet diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 7d75caba05cb..34fb43b28eed 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -426,75 +426,19 @@ namespace internal { // A RecordSkipper is used to skip uncessary rows within each pages. 
class PARQUET_EXPORT RecordSkipper { public: - RecordSkipper(IntervalRanges& pages, const RowRanges& orig_row_ranges) { - // copy row_ranges - IntervalRanges skip_pages; - for (auto& page : pages.GetRanges()) { - if (!orig_row_ranges.IsOverlapping(page)) { - skip_pages.Add(page); - } - } - - /// Since the skipped pages will be silently skipped without updating - /// current_rg_processed_records or records_read_, we need to pre-process the row - /// ranges as if these skipped pages never existed - AdjustRanges(skip_pages, orig_row_ranges, row_ranges_); - range_iter_ = row_ranges_->NewIterator(); - current_range_variant = range_iter_->NextRange(); - - total_rows_to_process_ = pages.RowCount() - skip_pages.RowCount(); - } - - /// \brief Return the number of records to read or to skip + RecordSkipper(IntervalRanges& pages, const RowRanges& orig_row_ranges); + /// Return the number of records to read or to skip /// if return values is positive, it means to read N records /// if return values is negative, it means to skip N records /// if return values is 0, it means end of RG - int64_t AdviseNext(const int64_t current_rg_processed) { - if (current_range_variant.index() == 2) { - return 0; - } - - auto & current_range = std::get(current_range_variant); - - if (current_range.end < current_rg_processed) { - current_range_variant = range_iter_->NextRange(); - if (current_range_variant.index() == 2) { - // negative, skip the ramaining rows - return current_rg_processed - total_rows_to_process_; - } - } + int64_t AdviseNext(const int64_t current_rg_processed); - current_range = std::get(current_range_variant); - - if (current_range.start > current_rg_processed) { - // negative, skip - return current_rg_processed - current_range.start; - } - - const auto ret = current_range.end - current_rg_processed + 1; - return ret; - } - -private: - void AdjustRanges(IntervalRanges& skip_pages, const RowRanges& orig_row_ranges, std::unique_ptr& ret) { - std::unique_ptr temp = 
std::make_unique(); - - size_t skipped_rows = 0; - const auto orig_range_iter = orig_row_ranges.NewIterator(); - auto orig_range_variant = orig_range_iter->NextRange(); - auto skip_iter = skip_pages.GetRanges().begin(); - while (orig_range_variant.index() != 2) { - const auto & origin_range = std::get(orig_range_variant); - while (skip_iter != skip_pages.GetRanges().end() && skip_iter->IsBefore(origin_range)) { - skipped_rows += skip_iter->Count(); - ++skip_iter; - } - - temp->Add(IntervalRange(origin_range.start - skipped_rows, origin_range.end - skipped_rows)); - orig_range_variant = orig_range_iter->NextRange(); - } - ret = std::move(temp); - } + private: + /// Since the skipped pages will be silently skipped without updating + /// current_rg_processed_records or records_read_, we need to pre-process the row + /// ranges as if these skipped pages never existed + static void AdjustRanges(IntervalRanges& skip_pages, const RowRanges& orig_row_ranges, + std::unique_ptr& ret); std::unique_ptr row_ranges_; std::unique_ptr range_iter_; From b75abdf1e00eafa60f215cfea75fb8b8fb55837e Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Mon, 15 Jan 2024 20:35:01 +0800 Subject: [PATCH 24/25] minor --- cpp/src/parquet/column_reader.h | 2 +- cpp/src/parquet/row_range_test.cc | 34 +++++++++++++++---------------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 34fb43b28eed..cf11c8975dc5 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -569,7 +569,7 @@ class PARQUET_EXPORT RecordReader { bool at_record_start_; int64_t records_read_; - int64_t current_rg_processed_records_; // counting both read and skip records + int64_t current_rg_processed_records_ = 0; // counting both read and skip records /// \brief Stores values. These values are populated based on each ReadRecords /// call. No extra values are buffered for the next call. 
SkipRecords will not diff --git a/cpp/src/parquet/row_range_test.cc b/cpp/src/parquet/row_range_test.cc index 82ad60c6b3fe..44327baab04c 100644 --- a/cpp/src/parquet/row_range_test.cc +++ b/cpp/src/parquet/row_range_test.cc @@ -21,14 +21,14 @@ using namespace parquet; class RowRangesTest : public ::testing::Test { protected: - IntervalRanges rowRanges; + IntervalRanges row_ranges; }; TEST_F(RowRangesTest, EmptyRG_ReturnsOriginalRowRanges) { - rowRanges.Add(IntervalRange(0, 10)); + row_ranges.Add(IntervalRange(0, 10)); std::vector rows_per_rg; - auto result = rowRanges.SplitByRowGroups(rows_per_rg); + auto result = row_ranges.SplitByRowGroups(rows_per_rg); ASSERT_EQ(result.size(), 1); auto iter = result[0]->NewIterator(); @@ -39,10 +39,10 @@ TEST_F(RowRangesTest, EmptyRG_ReturnsOriginalRowRanges) { } TEST_F(RowRangesTest, SingleRG_ReturnsOriginalRowRanges2) { - rowRanges.Add(IntervalRange(0, 10)); + row_ranges.Add(IntervalRange(0, 10)); std::vector rows_per_rg = {11}; - auto result = rowRanges.SplitByRowGroups(rows_per_rg); + auto result = row_ranges.SplitByRowGroups(rows_per_rg); ASSERT_EQ(result.size(), 1); auto iter = result[0]->NewIterator(); @@ -53,10 +53,10 @@ TEST_F(RowRangesTest, SingleRG_ReturnsOriginalRowRanges2) { } TEST_F(RowRangesTest, ReturnsTwoRowRanges) { - rowRanges.Add(IntervalRange(0, 10)); + row_ranges.Add(IntervalRange(0, 10)); std::vector rows_per_rg = {5, 6}; - auto result = rowRanges.SplitByRowGroups(rows_per_rg); + auto result = row_ranges.SplitByRowGroups(rows_per_rg); ASSERT_EQ(result.size(), 2); { auto iter = result[0]->NewIterator(); @@ -75,10 +75,10 @@ TEST_F(RowRangesTest, ReturnsTwoRowRanges) { } TEST_F(RowRangesTest, ReturnsMultipleRowRanges) { - rowRanges.Add(IntervalRange(0, 11)); + row_ranges.Add(IntervalRange(0, 11)); std::vector rows_per_rg = {3, 4, 100}; - auto result = rowRanges.SplitByRowGroups(rows_per_rg); + auto result = row_ranges.SplitByRowGroups(rows_per_rg); ASSERT_EQ(result.size(), 3); { auto iter = 
result[0]->NewIterator(); @@ -104,13 +104,13 @@ TEST_F(RowRangesTest, ReturnsMultipleRowRanges) { } TEST_F(RowRangesTest, MultipleInputRange) { - rowRanges.Add(IntervalRange(0, 10)); - rowRanges.Add(IntervalRange(90, 111)); - rowRanges.Add(IntervalRange(191, 210)); + row_ranges.Add(IntervalRange(0, 10)); + row_ranges.Add(IntervalRange(90, 111)); + row_ranges.Add(IntervalRange(191, 210)); std::vector rows_per_rg = {100, 100}; - auto result = rowRanges.SplitByRowGroups(rows_per_rg); + auto result = row_ranges.SplitByRowGroups(rows_per_rg); ASSERT_EQ(result.size(), 2); { auto iter = result[0]->NewIterator(); @@ -139,10 +139,10 @@ TEST_F(RowRangesTest, MultipleInputRange) { } TEST_F(RowRangesTest, MultipleSplitPoints_ReturnWithEmptyRowRanges) { - rowRanges.Add(IntervalRange(11, 18)); + row_ranges.Add(IntervalRange(11, 18)); std::vector rows_per_rg = {5, 5, 5, 5, 5}; - auto result = rowRanges.SplitByRowGroups(rows_per_rg); + auto result = row_ranges.SplitByRowGroups(rows_per_rg); ASSERT_EQ(result.size(), 5); { auto iter = result[0]->NewIterator(); @@ -173,10 +173,10 @@ TEST_F(RowRangesTest, MultipleSplitPoints_ReturnWithEmptyRowRanges) { } TEST_F(RowRangesTest, RangeExceedRG) { - rowRanges.Add(IntervalRange(0, 10)); + row_ranges.Add(IntervalRange(0, 10)); std::vector rows_per_rg = {5, 3}; - auto result = rowRanges.SplitByRowGroups(rows_per_rg); + auto result = row_ranges.SplitByRowGroups(rows_per_rg); ASSERT_EQ(result.size(), 2); { auto iter = result[0]->NewIterator(); From e361c66f59b22534bb20d9cfa4c94aa67bc050a5 Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Mon, 22 Jan 2024 16:16:26 +0800 Subject: [PATCH 25/25] fix comments --- cpp/src/parquet/CMakeLists.txt | 1 + cpp/src/parquet/arrow/reader.cc | 105 ++++++++------- cpp/src/parquet/arrow/reader.h | 9 +- cpp/src/parquet/column_reader.cc | 150 +-------------------- cpp/src/parquet/column_reader.h | 128 +----------------- cpp/src/parquet/range_reader_test.cc | 65 +++++---- cpp/src/parquet/row_range.cc | 190 
+++++++++++++++++++++++++++ cpp/src/parquet/row_range.h | 156 ++++++++++++++++++++++ cpp/src/parquet/row_range_test.cc | 17 +-- 9 files changed, 461 insertions(+), 360 deletions(-) create mode 100644 cpp/src/parquet/row_range.cc create mode 100644 cpp/src/parquet/row_range.h diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 9f9a7f2336aa..7d12d87e5d9c 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -156,6 +156,7 @@ set(PARQUET_SRCS arrow/writer.cc bloom_filter.cc bloom_filter_reader.cc + row_range.cc column_reader.cc column_scanner.cc column_writer.cc diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index e471696a401d..cb15145b8a78 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -222,13 +222,14 @@ class FileReaderImpl : public FileReader { ctx->iterator_factory = SomeRowGroupsFactory(row_groups); ctx->filter_leaves = true; ctx->included_leaves = included_leaves; - ctx->row_ranges_per_rg = row_ranges_per_rg; // copy the shared pointer to extend its lifecycle + ctx->row_ranges_per_rg = + row_ranges_per_rg; // copy the shared pointer to extend its lifecycle return GetReader(manifest_.schema_fields[i], ctx, out); } Status GetFieldReaders( const std::vector& column_indices, const std::vector& row_groups, - const std::shared_ptr>> & row_ranges_per_rg, + const std::shared_ptr>>& row_ranges_per_rg, std::vector>* out, std::shared_ptr<::arrow::Schema>* out_schema) { // We only need to read schema fields which have columns indicated @@ -342,44 +343,43 @@ class FileReaderImpl : public FileReader { } // This is a internal API owned by FileReaderImpl, not exposed in FileReader - Status GetRecordBatchReaderWithRowRanges(const std::vector& row_group_indices, - const std::vector& column_indices, - const std::shared_ptr>> & row_ranges_per_rg, - std::unique_ptr* out); + Status GetRecordBatchReaderWithRowRanges( + const std::vector& row_group_indices, 
const std::vector& column_indices, + const std::shared_ptr>>& row_ranges_per_rg, + std::unique_ptr* out); Status GetRecordBatchReader(const RowRanges& rows_to_return, const std::vector& column_indices, std::unique_ptr* out) override { const auto metadata = reader_->metadata(); - // check if the row ranges are valid - if (!rows_to_return.IsValid()) { - return Status::Invalid("The provided row range is invalid, keep it monotone and non-interleaving: " + - rows_to_return.ToString()); - } // check if the row ranges are within the row group boundaries - if (rows_to_return.RowCount() != 0 && rows_to_return.LastRow() >= metadata->num_rows()) { + if (rows_to_return.num_rows() != 0 && + rows_to_return.last_row() >= metadata->num_rows()) { return Status::Invalid("The provided row range " + rows_to_return.ToString() + " exceeds the number of rows in the file: " + std::to_string(metadata->num_rows())); } - if (rows_to_return.RowCount() == 0) { + if (rows_to_return.num_rows() == 0) { return GetRecordBatchReaderWithRowRanges({}, column_indices, {}, out); } std::vector rows_per_rg; - for (int i = 0 ; i < metadata->num_row_groups(); i++) { - rows_per_rg.push_back( metadata->RowGroup(i)->num_rows()); + for (int i = 0; i < metadata->num_row_groups(); i++) { + rows_per_rg.push_back(metadata->RowGroup(i)->num_rows()); } // We'll assign a RowRanges for each RG, even if it's not required to return any rows - std::vector> row_ranges_per_rg = rows_to_return.SplitByRowGroups(rows_per_rg); + std::vector> row_ranges_per_rg = + rows_to_return.SplitByRowRange(rows_per_rg); std::vector row_group_indices; - for (int i = 0 ; i < metadata->num_row_groups(); i++) { - if (row_ranges_per_rg.at(i)->RowCount() > 0) - row_group_indices.push_back(i); + for (int i = 0; i < metadata->num_row_groups(); i++) { + if (row_ranges_per_rg.at(i)->num_rows() > 0) row_group_indices.push_back(i); } - return GetRecordBatchReaderWithRowRanges(row_group_indices, column_indices, - 
std::make_shared>>(std::move(row_ranges_per_rg)), out); + return GetRecordBatchReaderWithRowRanges( + row_group_indices, column_indices, + std::make_shared>>( + std::move(row_ranges_per_rg)), + out); } Status GetRecordBatchReader(const std::vector& row_group_indices, @@ -390,13 +390,13 @@ class FileReaderImpl : public FileReader { Status GetRecordBatchReader(const std::vector& row_group_indices, std::unique_ptr* out) override { - return GetRecordBatchReaderWithRowRanges(row_group_indices, - Iota(reader_->metadata()->num_columns()), {}, out); + return GetRecordBatchReaderWithRowRanges( + row_group_indices, Iota(reader_->metadata()->num_columns()), {}, out); } Status GetRecordBatchReader(std::unique_ptr* out) override { - return GetRecordBatchReaderWithRowRanges(Iota(num_row_groups()), - Iota(reader_->metadata()->num_columns()), {}, out); + return GetRecordBatchReaderWithRowRanges( + Iota(num_row_groups()), Iota(reader_->metadata()->num_columns()), {}, out); } ::arrow::Result<::arrow::AsyncGenerator>> @@ -499,22 +499,21 @@ class RowGroupReaderImpl : public RowGroupReader { // ---------------------------------------------------------------------- // Column reader implementations -// This class is used to skip decompressing & decoding unnecessary pages by comparing user-specified row_ranges -// and page_ranges from metadata. -// Only support IntervalRange case for now. +// This class is used to skip decompressing & decoding unnecessary pages by comparing +// user-specified row_ranges and page_ranges from metadata. Only support IntervalRange +// case for now. 
class RowRangesPageFilter { public: - RowRangesPageFilter(const RowRanges& row_ranges, const std::shared_ptr& page_ranges) - : row_ranges_(row_ranges), page_ranges_(page_ranges) { - } + RowRangesPageFilter(const RowRanges& row_ranges, + const std::shared_ptr& page_ranges) + : row_ranges_(row_ranges), page_ranges_(page_ranges) {} - // To avoid error "std::function target must be copy-constructible", we must define copy constructor + // To avoid error "std::function target must be copy-constructible", we must define copy + // constructor RowRangesPageFilter(const RowRangesPageFilter& other) - : row_ranges_(other.row_ranges_), page_ranges_(other.page_ranges_) { - } + : row_ranges_(other.row_ranges_), page_ranges_(other.page_ranges_) {} bool operator()(const DataPageStats& stats) { - if (!initted) { row_ranges_itr_ = row_ranges_.NewIterator(); page_ranges_itr_ = page_ranges_->NewIterator(); @@ -522,19 +521,21 @@ class RowRangesPageFilter { current_row_range_ = row_ranges_itr_->NextRange(); if (current_row_range_.index() != 0) { - throw ParquetException("RowRangesPageFilter expects first NextRange() to be a IntervalRange"); + throw ParquetException( + "RowRangesPageFilter expects first NextRange() to be a IntervalRange"); } initted = true; } current_page_range_ = page_ranges_itr_->NextRange(); if (current_page_range_.index() != 0) { - throw ParquetException("RowRangesPageFilter expects first NextRange() to be a IntervalRange"); + throw ParquetException( + "RowRangesPageFilter expects first NextRange() to be a IntervalRange"); } while (current_row_range_.index() == 0 && - std::get(current_page_range_).IsAfter( - std::get(current_row_range_))) { + IntervalRangeUtils::IsAfter(std::get(current_page_range_), + std::get(current_row_range_))) { current_row_range_ = row_ranges_itr_->NextRange(); } @@ -542,8 +543,8 @@ class RowRangesPageFilter { return true; } - return std::get(current_page_range_).IsBefore( - std::get(current_row_range_)); + return 
IntervalRangeUtils::IsBefore(std::get(current_page_range_), + std::get(current_row_range_)); } private: @@ -652,11 +653,11 @@ class LeafReader : public ColumnReaderImpl { 1}); } - if (row_ranges.RowCount() > 0) { - if (row_ranges.LastRow() > page_ranges->LastRow()) { + if (row_ranges.num_rows() > 0) { + if (row_ranges.last_row() > page_ranges->last_row()) { throw ParquetException( - "The provided row range " + row_ranges.ToString() + - " exceeds last page :" + page_ranges->GetRanges().back().ToString()); + "The provided row range " + row_ranges.ToString() + " exceeds last page :" + + IntervalRangeUtils::ToString(page_ranges->GetRanges().back())); } } } @@ -667,23 +668,21 @@ class LeafReader : public ColumnReaderImpl { /// using page index to reduce cost if (page_reader != nullptr && ctx_->row_ranges_per_rg) { // reset skipper - record_reader_->set_record_skipper(NULLPTR); + record_reader_->reset_record_skipper(); - const auto & row_ranges = (*ctx_->row_ranges_per_rg)[input_->current_row_group()]; + const auto& row_ranges = (*ctx_->row_ranges_per_rg)[input_->current_row_group()]; // if specific row range is provided for this rg - if (row_ranges->RowCount() != 0) { - + if (row_ranges->num_rows() != 0) { // Use IntervalRanges to represent pages std::shared_ptr page_ranges; checkAndGetPageRanges(*row_ranges, page_ranges); // part 1, skip decompressing & decoding unnecessary pages - page_reader->set_data_page_filter( - RowRangesPageFilter(*row_ranges, page_ranges)); + page_reader->set_data_page_filter(RowRangesPageFilter(*row_ranges, page_ranges)); // part 2, skip unnecessary rows in necessary pages record_reader_->set_record_skipper( - std::make_shared(*page_ranges, + std::make_unique(*page_ranges, *row_ranges)); } else { NextRowGroup(); @@ -1162,7 +1161,7 @@ Status GetReader(const SchemaField& field, const std::shared_ptr& Status FileReaderImpl::GetRecordBatchReaderWithRowRanges( const std::vector& row_groups, const std::vector& column_indices, - const 
std::shared_ptr>> & row_ranges_per_rg, + const std::shared_ptr>>& row_ranges_per_rg, std::unique_ptr* out) { RETURN_NOT_OK(BoundsCheck(row_groups, column_indices)); diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h index 98ea6f5c1a05..1bcf04ee867e 100644 --- a/cpp/src/parquet/arrow/reader.h +++ b/cpp/src/parquet/arrow/reader.h @@ -191,14 +191,15 @@ class PARQUET_EXPORT FileReader { /// \brief Return a RecordBatchReader of row groups selected from /// rows_to_return, whose columns are selected by column_indices. /// - /// Notice that rows_to_return is file based, it not only decides which row groups to read, - /// but also which rows to read in each row group. + /// Notice that rows_to_return is file based, it not only decides which row groups to + /// read, but also which rows to read in each row group. /// /// /// \returns error Status if either rows_to_return or column_indices /// contains an invalid index - virtual ::arrow::Status GetRecordBatchReader(const RowRanges& rows_to_return, - const std::vector& column_indices, std::unique_ptr<::arrow::RecordBatchReader>* out) = 0; + virtual ::arrow::Status GetRecordBatchReader( + const RowRanges& rows_to_return, const std::vector& column_indices, + std::unique_ptr<::arrow::RecordBatchReader>* out) = 0; /// \brief Return a RecordBatchReader of row groups selected from /// row_group_indices, whose columns are selected by column_indices. 
diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 4ba8243f696e..76fad7a75486 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -1319,147 +1319,6 @@ std::shared_ptr ColumnReader::Make(const ColumnDescriptor* descr, return std::shared_ptr(nullptr); } -// ---------------------------------------------------------------------- -// RowRanges and ins implementations - -IntervalRanges::IntervalRanges() = default; - -IntervalRanges::IntervalRanges(const IntervalRange& range) { ranges_.push_back(range); } - -IntervalRanges::IntervalRanges(const std::vector& ranges) { - this->ranges_ = ranges; -} - -std::unique_ptr IntervalRanges::NewIterator() const { - return std::make_unique(ranges_); -} - -size_t IntervalRanges::RowCount() const { - size_t cnt = 0; - for (const IntervalRange& range : ranges_) { - cnt += range.Count(); - } - return cnt; -} - -int64_t IntervalRanges::LastRow() const { return ranges_.back().end; } - -bool IntervalRanges::IsValid() const { - if (ranges_.size() == 0) return true; - if (ranges_[0].start < 0) { - return false; - } - for (size_t i = 0; i < ranges_.size(); i++) { - if (!ranges_[i].IsValid()) { - return false; - } - } - for (size_t i = 1; i < ranges_.size(); i++) { - if (ranges_[i].start <= ranges_[i - 1].end) { - return false; - } - } - return true; -} - -bool IntervalRanges::IsOverlapping(const IntervalRange& searchRange) const { - auto it = std::lower_bound( - ranges_.begin(), ranges_.end(), searchRange, - [](const IntervalRange& r1, const IntervalRange& r2) { return r1.IsBefore(r2); }); - return it != ranges_.end() && !(*it).IsAfter(searchRange); -} - -std::string IntervalRanges::ToString() const { - std::string result = "["; - for (const IntervalRange& range : ranges_) { - result += range.ToString() + ", "; - } - if (!ranges_.empty()) { - result = result.substr(0, result.size() - 2); - } - result += "]"; - return result; -} - -std::vector> 
IntervalRanges::SplitByRowGroups( - const std::vector& rows_per_rg) const { - if (rows_per_rg.size() <= 1) { - std::unique_ptr single = - std::make_unique(*this); // return a copy of itself - auto ret = std::vector>(); - ret.push_back(std::move(single)); - return ret; - } - - std::vector> result; - - IntervalRanges spaces; - int64_t rows_so_far = 0; - for (size_t i = 0; i < rows_per_rg.size(); ++i) { - auto start = rows_so_far; - rows_so_far += rows_per_rg[i]; - auto end = rows_so_far - 1; - spaces.Add({start, end}); - } - - // each RG's row range forms a space, we need to adjust RowRanges in each space to - // zero based. - for (IntervalRange space : spaces.GetRanges()) { - auto intersection = Intersection(IntervalRanges(space), *this); - - std::unique_ptr zero_based_ranges = - std::make_unique(); - for (const IntervalRange& range : intersection.GetRanges()) { - zero_based_ranges->Add({range.start - space.start, range.end - space.start}); - } - result.push_back(std::move(zero_based_ranges)); - } - - return result; -} - -IntervalRanges IntervalRanges::Intersection(const IntervalRanges& left, - const IntervalRanges& right) { - IntervalRanges result; - - size_t rightIndex = 0; - for (const IntervalRange& l : left.ranges_) { - for (size_t i = rightIndex, n = right.ranges_.size(); i < n; ++i) { - const IntervalRange& r = right.ranges_[i]; - if (l.IsBefore(r)) { - break; - } else if (l.IsAfter(r)) { - rightIndex = i + 1; - continue; - } - result.Add(IntervalRange::Intersection(l, r)); - } - } - - return result; -} - -void IntervalRanges::Add(const IntervalRange& range) { - const IntervalRange rangeToAdd = range; - if (ranges_.size() > 1 && rangeToAdd.start <= ranges_.back().end) { - throw ParquetException("Ranges must be added in order"); - } - ranges_.push_back(rangeToAdd); -} - -const std::vector& IntervalRanges::GetRanges() const { return ranges_; } - -IntervalRowRangesIterator::IntervalRowRangesIterator( - const std::vector& ranges) - : ranges_(ranges) {} 
-IntervalRowRangesIterator::~IntervalRowRangesIterator() {} - -std::variant IntervalRowRangesIterator::NextRange() { - if (current_index_ >= ranges_.size()) return End(); - - return ranges_[current_index_++]; -} - // ---------------------------------------------------------------------- // RecordReader @@ -2451,7 +2310,7 @@ RecordSkipper::RecordSkipper(IntervalRanges& pages, const RowRanges& orig_row_ra // copy row_ranges IntervalRanges skip_pages; for (auto& page : pages.GetRanges()) { - if (!orig_row_ranges.IsOverlapping(page)) { + if (!orig_row_ranges.IsOverlapping(page.start, page.end)) { skip_pages.Add(page); } } @@ -2460,10 +2319,9 @@ RecordSkipper::RecordSkipper(IntervalRanges& pages, const RowRanges& orig_row_ra range_iter_ = row_ranges_->NewIterator(); current_range_variant = range_iter_->NextRange(); - total_rows_to_process_ = pages.RowCount() - skip_pages.RowCount(); + total_rows_to_process_ = pages.num_rows() - skip_pages.num_rows(); } - int64_t RecordSkipper::AdviseNext(const int64_t current_rg_processed) { if (current_range_variant.index() == 2) { return 0; @@ -2502,8 +2360,8 @@ void RecordSkipper::AdjustRanges(IntervalRanges& skip_pages, while (orig_range_variant.index() != 2) { const auto& origin_range = std::get(orig_range_variant); while (skip_iter != skip_pages.GetRanges().end() && - skip_iter->IsBefore(origin_range)) { - skipped_rows += skip_iter->Count(); + IntervalRangeUtils::IsBefore(*skip_iter, origin_range)) { + skipped_rows += IntervalRangeUtils::Count(*skip_iter); ++skip_iter; } diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index cf11c8975dc5..f41995a0138f 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -22,12 +22,12 @@ #include #include -#include "page_index.h" #include "parquet/exception.h" #include "parquet/level_conversion.h" #include "parquet/metadata.h" #include "parquet/platform.h" #include "parquet/properties.h" +#include "parquet/row_range.h" #include 
"parquet/schema.h" #include "parquet/types.h" @@ -303,124 +303,6 @@ class TypedColumnReader : public ColumnReader { int32_t* dict_len) = 0; }; -// Represent a range to read. The range is inclusive on both ends. -struct IntervalRange { - static IntervalRange Intersection(const IntervalRange& left, - const IntervalRange& right) { - if (left.start <= right.start) { - if (left.end >= right.start) { - return {right.start, std::min(left.end, right.end)}; - } - } else if (right.end >= left.start) { - return {left.start, std::min(left.end, right.end)}; - } - return {-1, -1}; // Return a default Range object if no intersection range found - } - - IntervalRange(const int64_t start_, const int64_t end_) : start(start_), end(end_) { - if (start > end) { - throw ParquetException("Invalid range with start: " + std::to_string(start) + - " and end: " + std::to_string(end)); - } - } - - size_t Count() const { - if(!IsValid()) { - throw ParquetException("Invalid range with start: " + std::to_string(start) + - " and end: " + std::to_string(end)); - } - return end - start + 1; - } - - bool IsBefore(const IntervalRange& other) const { return end < other.start; } - - bool IsAfter(const IntervalRange& other) const { return start > other.end; } - - bool IsOverlap(const IntervalRange& other) const { - return !IsBefore(other) && !IsAfter(other); - } - - bool IsValid() const { return start >= 0 && end >= 0 && end >= start; } - - std::string ToString() const { - return "(" + std::to_string(start) + ", " + std::to_string(end) + ")"; - } - - // inclusive - int64_t start = -1; - // inclusive - int64_t end = -1; -}; - -struct BitmapRange { - int64_t offset; - // zero added to, if there are less than 64 elements left in the column. - uint64_t bitmap; -}; - -struct End {}; - -// Represent a set of ranges to read. The ranges are sorted and non-overlapping. 
-class RowRanges { - public: - RowRanges() = default; - virtual ~RowRanges() = default; - virtual size_t RowCount() const = 0; - virtual int64_t LastRow() const = 0; - virtual bool IsValid() const = 0; - virtual bool IsOverlapping(const IntervalRange& searchRange) const = 0; - // Given a RowRanges with rows accross all RGs, split it into N RowRanges, where N = number of RGs - // e.g.: suppose we have 2 RGs: [0-99] and [100-199], and user is interested in RowRanges [90-110], then - // this function will return 2 RowRanges: [90-99] and [0-10] - virtual std::vector> SplitByRowGroups(const std::vector& rows_per_rg) const = 0; - virtual std::string ToString() const = 0; - - // Returns a vector of PageLocations that must be read all to get values for - // all included in this range virtual std::vector - // PageIndexesToInclude(const std::vector& all_pages) = 0; - - class Iterator { - public: - virtual std::variant NextRange() = 0; - virtual ~Iterator() = default; - }; - virtual std::unique_ptr NewIterator() const = 0; - -}; - -class IntervalRanges : public RowRanges { - public: - IntervalRanges(); - explicit IntervalRanges(const IntervalRange& range); - explicit IntervalRanges(const std::vector& ranges); - std::unique_ptr NewIterator() const override; - size_t RowCount() const override; - int64_t LastRow() const override; - bool IsValid() const override; - bool IsOverlapping(const IntervalRange& searchRange) const override; - std::string ToString() const override; - std::vector> SplitByRowGroups( - const std::vector& rows_per_rg) const override; - static IntervalRanges Intersection(const IntervalRanges& left, - const IntervalRanges& right); - void Add(const IntervalRange& range); - const std::vector& GetRanges() const; - - private: - std::vector ranges_; -}; - -class IntervalRowRangesIterator : public RowRanges::Iterator { - public: - IntervalRowRangesIterator(const std::vector& ranges); - ~IntervalRowRangesIterator() override; - std::variant NextRange() override; - - 
private: - const std::vector& ranges_; - size_t current_index_ = 0; -}; - namespace internal { // A RecordSkipper is used to skip uncessary rows within each pages. @@ -559,7 +441,11 @@ class PARQUET_EXPORT RecordReader { void reset_current_rg_processed_records() { current_rg_processed_records_ = 0; } - void set_record_skipper(const std::shared_ptr& skipper) { skipper_ = skipper; } + void set_record_skipper(std::unique_ptr skipper) { + skipper_ = std::move(skipper); + } + + void reset_record_skipper() { skipper_.reset(); } protected: /// \brief Indicates if we can have nullable values. Note that repeated fields @@ -613,7 +499,7 @@ class PARQUET_EXPORT RecordReader { // vector. bool read_dense_for_nullable_ = false; - std::shared_ptr skipper_ = NULLPTR; + std::unique_ptr skipper_ = NULLPTR; }; class BinaryRecordReader : virtual public RecordReader { diff --git a/cpp/src/parquet/range_reader_test.cc b/cpp/src/parquet/range_reader_test.cc index cde60c583f50..04510143e54c 100644 --- a/cpp/src/parquet/range_reader_test.cc +++ b/cpp/src/parquet/range_reader_test.cc @@ -39,7 +39,7 @@ using parquet::IntervalRanges; std::string random_string(std::string::size_type length) { static auto& chrs = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; - static std::mt19937 rg{std::random_device{}()}; + static std::mt19937 rg = std::mt19937(std::random_device()()); static std::uniform_int_distribution pick(0, sizeof(chrs) - 2); std::string s; @@ -240,10 +240,18 @@ void check_rb(std::unique_ptr rb_reader, } ASSERT_EQ(expected_rows, total_rows); - if (checking_col("a", column_names)) ASSERT_EQ(expected_sum * 2, sum_a); - if (checking_col("b", column_names)) ASSERT_EQ(expected_sum * 3, sum_b); - if (checking_col("c", column_names)) ASSERT_EQ(expected_sum, sum_c); - if (checking_col("d", column_names)) ASSERT_EQ(expected_sum, sum_d); + if (checking_col("a", column_names)) { + ASSERT_EQ(expected_sum * 2, sum_a); + } + if (checking_col("b", column_names)) { + ASSERT_EQ(expected_sum * 3, sum_b); + } + if 
(checking_col("c", column_names)) { + ASSERT_EQ(expected_sum, sum_c); + } + if (checking_col("d", column_names)) { + ASSERT_EQ(expected_sum, sum_d); + } } class TestRecordBatchReaderWithRanges : public testing::Test { @@ -279,7 +287,7 @@ TEST_F(TestRecordBatchReaderWithRanges, TestRangesSplit) {} TEST_F(TestRecordBatchReaderWithRanges, SelectOnePageForEachRG) { std::unique_ptr rb_reader; - IntervalRanges rows{{IntervalRange{0, 9}, IntervalRange{40, 49}, IntervalRange{80, 89}, IntervalRange{90, 99}}}; + IntervalRanges rows{{{0, 9}, {40, 49}, {80, 89}, {90, 99}}}; const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader)); @@ -301,7 +309,8 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectSomePageForOneRG) { TEST_F(TestRecordBatchReaderWithRanges, SelectAllRange) { std::unique_ptr rb_reader; - IntervalRanges rows{{IntervalRange{0, 29}, IntervalRange{30, 59}, IntervalRange{60, 89}, IntervalRange{90, 99}}}; + IntervalRanges rows{{IntervalRange{0, 29}, IntervalRange{30, 59}, IntervalRange{60, 89}, + IntervalRange{90, 99}}}; const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader)); @@ -341,11 +350,15 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { std::unique_ptr rb_reader; std::vector ranges; for (int64_t i = 0; i < 30; i++) { - if (i % 2 == 0) ranges.push_back({i, i}); + if (i % 2 == 0) { + ranges.push_back({i, i}); + } } for (int64_t i = 60; i < 90; i++) { - if (i % 2 == 0) ranges.push_back({i, i}); + if (i % 2 == 0) { + ranges.push_back({i, i}); + } } const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader(IntervalRanges(ranges), column_indices, @@ -359,25 +372,17 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { TEST_F(TestRecordBatchReaderWithRanges, InvalidRanges) { std::unique_ptr rb_reader; { - IntervalRanges rows{{IntervalRange{-1, 5}}}; - const 
std::vector column_indices{0, 1, 2, 3, 4}; - const auto status = - arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader); - ASSERT_NOT_OK(status); - EXPECT_TRUE(status.message().find("The provided row range is invalid, keep it " - "monotone and non-interleaving: [(-1, 5)]") != - std::string::npos); + auto create_ranges = []() -> IntervalRanges { + return IntervalRanges{{IntervalRange{-1, 5}}}; + }; + EXPECT_THROW(create_ranges(), parquet::ParquetException); } { - IntervalRanges rows{{IntervalRange{0, 4}, {2, 5}}}; - const std::vector column_indices{0, 1, 2, 3, 4}; - const auto status = - arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader); - ASSERT_NOT_OK(status); - EXPECT_TRUE( - status.message().find("The provided row range is invalid, keep it monotone and " - "non-interleaving: [(0, 4), (2, 5)]") != std::string::npos); + auto create_ranges = []() -> IntervalRanges { + return IntervalRanges{{{0, 4}, {2, 5}}}; + }; + EXPECT_THROW(create_ranges(), parquet::ParquetException); } { // will treat as {0,99} @@ -472,11 +477,15 @@ TEST_F(TestRecordBatchReaderWithRangesWithNulls, SelectOneRowSkipOneRow) { std::unique_ptr rb_reader; std::vector ranges; for (int64_t i = 0; i < 30; i++) { - if (i % 2 == 0) ranges.push_back({i, i}); + if (i % 2 == 0) { + ranges.push_back({i, i}); + } } for (int64_t i = 60; i < 90; i++) { - if (i % 2 == 0) ranges.push_back({i, i}); + if (i % 2 == 0) { + ranges.push_back({i, i}); + } } const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader(IntervalRanges(ranges), column_indices, @@ -486,4 +495,4 @@ TEST_F(TestRecordBatchReaderWithRangesWithNulls, SelectOneRowSkipOneRow) { // (10 + 12 + ... + 28) + (60 + 62 ... 
+ 88) = 1320 check_rb(std::move(rb_reader), 30, 1300); } -} \ No newline at end of file +} diff --git a/cpp/src/parquet/row_range.cc b/cpp/src/parquet/row_range.cc new file mode 100644 index 000000000000..fa996a198f43 --- /dev/null +++ b/cpp/src/parquet/row_range.cc @@ -0,0 +1,190 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+#include "parquet/row_range.h"
+
+#include <algorithm>
+
+#include "parquet/exception.h"
+
+namespace parquet {
+// ----------------------------------------------------------------------
+// RowRanges and its implementations
+bool IsValid(const std::vector<IntervalRange>& ranges) {
+  if (ranges.size() == 0) return true;
+  if (ranges[0].start < 0) {
+    return false;
+  }
+  for (size_t i = 0; i < ranges.size(); i++) {
+    if (!IntervalRangeUtils::IsValid(ranges[i])) {
+      return false;
+    }
+  }
+  for (size_t i = 1; i < ranges.size(); i++) {
+    if (ranges[i].start <= ranges[i - 1].end) {
+      return false;
+    }
+  }
+  return true;
+}
+
+IntervalRanges::IntervalRanges() = default;
+
+IntervalRanges::IntervalRanges(const IntervalRange& range) {
+  ranges_.push_back(range);
+  if (!IsValid(ranges_)) {
+    throw ParquetException("Invalid range with start: " + std::to_string(range.start) +
+                           " and end: " + std::to_string(range.end) +
+                           ", keep it monotone and non-interleaving");
+  }
+}
+
+IntervalRanges::IntervalRanges(const std::vector<IntervalRange>& ranges) {
+  this->ranges_ = ranges;
+  if (!IsValid(ranges_)) {
+    throw ParquetException("Invalid ranges: " + this->IntervalRanges::ToString() +
+                           ", keep it monotone and non-interleaving");
+  }
+}
+
+std::unique_ptr<RowRanges::Iterator> IntervalRanges::NewIterator() const {
+  return std::make_unique<IntervalRowRangesIterator>(ranges_);
+}
+
+size_t IntervalRanges::num_rows() const {
+  size_t cnt = 0;
+  for (const IntervalRange& range : ranges_) {
+    cnt += IntervalRangeUtils::Count(range);
+  }
+  return cnt;
+}
+
+int64_t IntervalRanges::first_row() const {
+  if (ranges_.empty()) {
+    throw ParquetException("first_row() called on empty IntervalRanges");
+  }
+  return ranges_.front().start;
+}
+
+int64_t IntervalRanges::last_row() const {
+  if (ranges_.empty()) {
+    throw ParquetException("last_row() called on empty IntervalRanges");
+  }
+  return ranges_.back().end;
+}
+
+bool IntervalRanges::IsOverlapping(const int64_t start, const int64_t end) const {
+  auto searchRange = IntervalRange{start, end};
+  auto it =
std::lower_bound(ranges_.begin(), ranges_.end(), searchRange, + [](const IntervalRange& r1, const IntervalRange& r2) { + return IntervalRangeUtils::IsBefore(r1, r2); + }); + return it != ranges_.end() && !IntervalRangeUtils::IsAfter(*it, searchRange); +} + +std::string IntervalRanges::ToString() const { + std::string result = "["; + for (const IntervalRange& range : ranges_) { + result += IntervalRangeUtils::ToString(range) + ", "; + } + if (!ranges_.empty()) { + result = result.substr(0, result.size() - 2); + } + result += "]"; + return result; +} + +std::vector> IntervalRanges::SplitByRowRange( + const std::vector& num_rows_per_sub_ranges) const { + if (num_rows_per_sub_ranges.size() <= 1) { + std::unique_ptr single = + std::make_unique(*this); // return a copy of itself + auto ret = std::vector>(); + ret.push_back(std::move(single)); + return ret; + } + + std::vector> result; + + IntervalRanges spaces; + int64_t rows_so_far = 0; + for (size_t i = 0; i < num_rows_per_sub_ranges.size(); ++i) { + auto start = rows_so_far; + rows_so_far += num_rows_per_sub_ranges[i]; + auto end = rows_so_far - 1; + spaces.Add({start, end}); + } + + // each RG's row range forms a space, we need to adjust RowRanges in each space to + // zero based. 
+ for (IntervalRange space : spaces.GetRanges()) { + auto intersection = Intersection(IntervalRanges(space), *this); + + std::unique_ptr zero_based_ranges = + std::make_unique(); + for (const IntervalRange& range : intersection.GetRanges()) { + zero_based_ranges->Add({range.start - space.start, range.end - space.start}); + } + result.push_back(std::move(zero_based_ranges)); + } + + return result; +} + +IntervalRanges IntervalRanges::Intersection(const IntervalRanges& left, + const IntervalRanges& right) { + IntervalRanges result; + + size_t rightIndex = 0; + for (const IntervalRange& l : left.ranges_) { + for (size_t i = rightIndex, n = right.ranges_.size(); i < n; ++i) { + const IntervalRange& r = right.ranges_[i]; + if (IntervalRangeUtils::IsBefore(l, r)) { + break; + } else if (IntervalRangeUtils::IsAfter(l, r)) { + rightIndex = i + 1; + continue; + } + result.Add(IntervalRangeUtils::Intersection(l, r)); + } + } + + return result; +} + +void IntervalRanges::Add(const IntervalRange& range) { + const IntervalRange rangeToAdd = range; + if (ranges_.size() > 1 && rangeToAdd.start <= ranges_.back().end) { + throw ParquetException("Ranges must be added in order"); + } + ranges_.push_back(rangeToAdd); +} + +const std::vector& IntervalRanges::GetRanges() const { return ranges_; } + +IntervalRowRangesIterator::IntervalRowRangesIterator( + const std::vector& ranges) + : ranges_(ranges) {} + +IntervalRowRangesIterator::~IntervalRowRangesIterator() {} + +std::variant IntervalRowRangesIterator::NextRange() { + if (current_index_ >= ranges_.size()) return End(); + + return ranges_[current_index_++]; +} +} // namespace parquet diff --git a/cpp/src/parquet/row_range.h b/cpp/src/parquet/row_range.h new file mode 100644 index 000000000000..4e7c2631eb6a --- /dev/null +++ b/cpp/src/parquet/row_range.h @@ -0,0 +1,156 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This module defines row-range abstractions (RowRanges / IntervalRanges) used
+// to select subsets of rows to read from a Parquet file.
+
+#pragma once
+#include
+
+#include "parquet/exception.h"
+
+namespace parquet {
+
+// Represent a range to read. The range is inclusive on both ends.
+struct IntervalRange {
+  IntervalRange(const int64_t start_, const int64_t end_) : start(start_), end(end_) {
+    if (start > end) {
+      throw ParquetException("Invalid range with start: " + std::to_string(start) +
+                             " bigger than end: " + std::to_string(end));
+    }
+  }
+
+  // inclusive
+  int64_t start = -1;
+  // inclusive
+  int64_t end = -1;
+};
+
+class IntervalRangeUtils {
+ public:
+  static IntervalRange Intersection(const IntervalRange& left,
+                                    const IntervalRange& right) {
+    if (left.start <= right.start) {
+      if (left.end >= right.start) {
+        return {right.start, std::min(left.end, right.end)};
+      }
+    } else if (right.end >= left.start) {
+      return {left.start, std::min(left.end, right.end)};
+    }
+    return {-1, -1};  // Return a default Range object if no intersection range found
+  }
+
+  static std::string ToString(const IntervalRange& range) {
+    return "(" + std::to_string(range.start) + ", " + std::to_string(range.end) + ")";
+  }
+
+  static bool IsValid(const IntervalRange& range) {
+    return
range.start >= 0 && range.end >= 0 && range.end >= range.start; + } + + static size_t Count(const IntervalRange& range) { + if (!IsValid(range)) { + throw ParquetException("Invalid range: " + ToString(range)); + } + return range.end - range.start + 1; + } + + static bool IsBefore(const IntervalRange& self, const IntervalRange& other) { + return self.end < other.start; + } + + static bool IsAfter(const IntervalRange& self, const IntervalRange& other) { + return self.start > other.end; + } + + static bool IsOverlap(const IntervalRange& self, const IntervalRange& other) { + return !IsBefore(self, other) && !IsAfter(self, other); + } +}; + +struct BitmapRange { + int64_t offset; + // zero added to, if there are less than 64 elements left in the column. + uint64_t bitmap; +}; + +struct End {}; + +// Represent a set of ranges to read. The ranges are sorted and non-overlapping. +class RowRanges { + public: + virtual ~RowRanges() = default; + /// \brief Total number of rows in the row ranges. + virtual size_t num_rows() const = 0; + /// \brief First row in the ranges + virtual int64_t first_row() const = 0; + /// \brief Last row in the ranges + virtual int64_t last_row() const = 0; + /// \brief Whether the given range from start to end overlaps with the row ranges. + virtual bool IsOverlapping(int64_t start, int64_t end) const = 0; + /// \brief Split the row ranges into sub row ranges according to the + /// specified number of rows per sub row ranges. A typical use case is + /// to convert file based RowRanges to row group based RowRanges. + /// + /// \param num_rows_per_sub_ranges number of rows per sub row range. 
+ virtual std::vector> SplitByRowRange( + const std::vector& num_rows_per_sub_ranges) const = 0; + /// \brief Readable string representation + virtual std::string ToString() const = 0; + + class Iterator { + public: + virtual std::variant NextRange() = 0; + virtual ~Iterator() = default; + }; + /// \brief Create an iterator to iterate over the ranges + virtual std::unique_ptr NewIterator() const = 0; +}; + +class IntervalRanges : public RowRanges { + public: + IntervalRanges(); + explicit IntervalRanges(const IntervalRange& range); + explicit IntervalRanges(const std::vector& ranges); + std::unique_ptr NewIterator() const override; + size_t num_rows() const override; + int64_t first_row() const override; + int64_t last_row() const override; + bool IsOverlapping(int64_t start, int64_t end) const override; + std::string ToString() const override; + std::vector> SplitByRowRange( + const std::vector& num_rows_per_sub_ranges) const override; + static IntervalRanges Intersection(const IntervalRanges& left, + const IntervalRanges& right); + void Add(const IntervalRange& range); + const std::vector& GetRanges() const; + + private: + std::vector ranges_; +}; + +class IntervalRowRangesIterator : public RowRanges::Iterator { + public: + explicit IntervalRowRangesIterator(const std::vector& ranges); + ~IntervalRowRangesIterator() override; + std::variant NextRange() override; + + private: + const std::vector& ranges_; + size_t current_index_ = 0; +}; +} // namespace parquet diff --git a/cpp/src/parquet/row_range_test.cc b/cpp/src/parquet/row_range_test.cc index 44327baab04c..bf0563211b8e 100644 --- a/cpp/src/parquet/row_range_test.cc +++ b/cpp/src/parquet/row_range_test.cc @@ -17,7 +17,8 @@ #include #include "parquet/column_reader.h" -using namespace parquet; +using parquet::IntervalRange; +using parquet::IntervalRanges; class RowRangesTest : public ::testing::Test { protected: @@ -28,7 +29,7 @@ TEST_F(RowRangesTest, EmptyRG_ReturnsOriginalRowRanges) { 
row_ranges.Add(IntervalRange(0, 10)); std::vector rows_per_rg; - auto result = row_ranges.SplitByRowGroups(rows_per_rg); + auto result = row_ranges.SplitByRowRange(rows_per_rg); ASSERT_EQ(result.size(), 1); auto iter = result[0]->NewIterator(); @@ -42,7 +43,7 @@ TEST_F(RowRangesTest, SingleRG_ReturnsOriginalRowRanges2) { row_ranges.Add(IntervalRange(0, 10)); std::vector rows_per_rg = {11}; - auto result = row_ranges.SplitByRowGroups(rows_per_rg); + auto result = row_ranges.SplitByRowRange(rows_per_rg); ASSERT_EQ(result.size(), 1); auto iter = result[0]->NewIterator(); @@ -56,7 +57,7 @@ TEST_F(RowRangesTest, ReturnsTwoRowRanges) { row_ranges.Add(IntervalRange(0, 10)); std::vector rows_per_rg = {5, 6}; - auto result = row_ranges.SplitByRowGroups(rows_per_rg); + auto result = row_ranges.SplitByRowRange(rows_per_rg); ASSERT_EQ(result.size(), 2); { auto iter = result[0]->NewIterator(); @@ -78,7 +79,7 @@ TEST_F(RowRangesTest, ReturnsMultipleRowRanges) { row_ranges.Add(IntervalRange(0, 11)); std::vector rows_per_rg = {3, 4, 100}; - auto result = row_ranges.SplitByRowGroups(rows_per_rg); + auto result = row_ranges.SplitByRowRange(rows_per_rg); ASSERT_EQ(result.size(), 3); { auto iter = result[0]->NewIterator(); @@ -110,7 +111,7 @@ TEST_F(RowRangesTest, MultipleInputRange) { std::vector rows_per_rg = {100, 100}; - auto result = row_ranges.SplitByRowGroups(rows_per_rg); + auto result = row_ranges.SplitByRowRange(rows_per_rg); ASSERT_EQ(result.size(), 2); { auto iter = result[0]->NewIterator(); @@ -142,7 +143,7 @@ TEST_F(RowRangesTest, MultipleSplitPoints_ReturnWithEmptyRowRanges) { row_ranges.Add(IntervalRange(11, 18)); std::vector rows_per_rg = {5, 5, 5, 5, 5}; - auto result = row_ranges.SplitByRowGroups(rows_per_rg); + auto result = row_ranges.SplitByRowRange(rows_per_rg); ASSERT_EQ(result.size(), 5); { auto iter = result[0]->NewIterator(); @@ -176,7 +177,7 @@ TEST_F(RowRangesTest, RangeExceedRG) { row_ranges.Add(IntervalRange(0, 10)); std::vector rows_per_rg = {5, 3}; - 
auto result = row_ranges.SplitByRowGroups(rows_per_rg); + auto result = row_ranges.SplitByRowRange(rows_per_rg); ASSERT_EQ(result.size(), 2); { auto iter = result[0]->NewIterator();