From 2ead01724d7baab88af49755e0c3a9c8bde6528a Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Wed, 22 Nov 2023 21:36:44 +0800 Subject: [PATCH 01/25] happy path pass --- cpp/examples/arrow/parquet_read_write.cc | 297 ++++++++++---------- cpp/src/parquet/arrow/reader.cc | 116 +++++++- cpp/src/parquet/arrow/reader.h | 10 + cpp/src/parquet/arrow/reader_internal.h | 5 + cpp/src/parquet/column_reader.cc | 33 ++- cpp/src/parquet/column_reader.h | 333 +++++++++++++++++++++++ cpp/src/parquet/reader_test.cc | 3 + 7 files changed, 647 insertions(+), 150 deletions(-) diff --git a/cpp/examples/arrow/parquet_read_write.cc b/cpp/examples/arrow/parquet_read_write.cc index 3b8b4c2212b7..cc267f38d73d 100644 --- a/cpp/examples/arrow/parquet_read_write.cc +++ b/cpp/examples/arrow/parquet_read_write.cc @@ -23,168 +23,189 @@ #include "parquet/arrow/writer.h" #include - -arrow::Status ReadFullFile(std::string path_to_file) { - // #include "arrow/io/api.h" - // #include "arrow/parquet/arrow/reader.h" - - arrow::MemoryPool* pool = arrow::default_memory_pool(); - std::shared_ptr input; - ARROW_ASSIGN_OR_RAISE(input, arrow::io::ReadableFile::Open(path_to_file)); - - // Open Parquet file reader - std::unique_ptr arrow_reader; - ARROW_RETURN_NOT_OK(parquet::arrow::OpenFile(input, pool, &arrow_reader)); - - // Read entire file as a single Arrow table - std::shared_ptr table; - ARROW_RETURN_NOT_OK(arrow_reader->ReadTable(&table)); - return arrow::Status::OK(); -} +#include arrow::Status ReadInBatches(std::string path_to_file) { - // #include "arrow/io/api.h" - // #include "arrow/parquet/arrow/reader.h" - - arrow::MemoryPool* pool = arrow::default_memory_pool(); - - // Configure general Parquet reader settings - auto reader_properties = parquet::ReaderProperties(pool); - reader_properties.set_buffer_size(4096 * 4); - reader_properties.enable_buffered_stream(); - - // Configure Arrow-specific Parquet reader settings - auto arrow_reader_props = parquet::ArrowReaderProperties(); - 
arrow_reader_props.set_batch_size(128 * 1024); // default 64 * 1024 - - parquet::arrow::FileReaderBuilder reader_builder; - ARROW_RETURN_NOT_OK( - reader_builder.OpenFile(path_to_file, /*memory_map=*/false, reader_properties)); - reader_builder.memory_pool(pool); - reader_builder.properties(arrow_reader_props); - - std::unique_ptr arrow_reader; - ARROW_ASSIGN_OR_RAISE(arrow_reader, reader_builder.Build()); - - std::shared_ptr<::arrow::RecordBatchReader> rb_reader; - ARROW_RETURN_NOT_OK(arrow_reader->GetRecordBatchReader(&rb_reader)); - - for (arrow::Result> maybe_batch : *rb_reader) { - // Operate on each batch... - } - return arrow::Status::OK(); + // #include "arrow/io/api.h" + // #include "arrow/parquet/arrow/reader.h" + + arrow::MemoryPool* pool = arrow::default_memory_pool(); + + // Configure general Parquet reader settings + auto reader_properties = parquet::ReaderProperties(pool); + reader_properties.set_buffer_size(4096 * 4); + reader_properties.enable_buffered_stream(); + + // Configure Arrow-specific Parquet reader settings + auto arrow_reader_props = parquet::ArrowReaderProperties(); + arrow_reader_props.set_batch_size(10); // default 64 * 1024 + + parquet::arrow::FileReaderBuilder reader_builder; + ARROW_RETURN_NOT_OK( + reader_builder.OpenFile(path_to_file, /*memory_map=*/false, reader_properties)); + reader_builder.memory_pool(pool); + reader_builder.properties(arrow_reader_props); + + std::unique_ptr arrow_reader; + ARROW_ASSIGN_OR_RAISE(arrow_reader, reader_builder.Build()); + + std::shared_ptr<::arrow::RecordBatchReader> rb_reader; + auto row_ranges_map = std::make_shared>(); + + std::vector ranges; + for (int64_t i = 0; i < 50; i++) { + if (i % 2 == 0) + ranges.push_back({i, i}); + } + row_ranges_map->insert({0, std::make_shared(ranges)}); + + + ARROW_RETURN_NOT_OK(arrow_reader->GetRecordBatchReader({0,1},{0,1},row_ranges_map,&rb_reader)); + + size_t total_rows = 0; + size_t total_values = 0; + for (arrow::Result> maybe_batch: *rb_reader) { + // 
Operate on each batch... + auto batch = maybe_batch.ValueOrDie(); + total_rows += batch->num_rows(); + std::cout << "batch size: " << batch->num_rows() << std::endl; + + auto int_array = std::dynamic_pointer_cast(batch->column(1)); + for (auto iter = int_array->begin(); iter != int_array->end(); ++iter) { + total_values += (*iter).value(); + } + } + std::cout << "total rows is : " << total_rows << std::endl; + std::cout << "total value of y is : " << total_values << std::endl; + return arrow::Status::OK(); } arrow::Result> GetTable() { - auto builder = arrow::Int32Builder(); - - std::shared_ptr arr_x; - ARROW_RETURN_NOT_OK(builder.AppendValues({1, 3, 5, 7, 1})); - ARROW_RETURN_NOT_OK(builder.Finish(&arr_x)); - - std::shared_ptr arr_y; - ARROW_RETURN_NOT_OK(builder.AppendValues({2, 4, 6, 8, 10})); - ARROW_RETURN_NOT_OK(builder.Finish(&arr_y)); - - auto schema = arrow::schema( - {arrow::field("x", arrow::int32()), arrow::field("y", arrow::int32())}); - - return arrow::Table::Make(schema, {arr_x, arr_y}); + auto builder = arrow::Int32Builder(); + + std::shared_ptr arr_x; + ARROW_RETURN_NOT_OK(builder.AppendValues(arrow::internal::Iota(0,100))); + ARROW_RETURN_NOT_OK(builder.Finish(&arr_x)); + + std::shared_ptr arr_y; + ARROW_RETURN_NOT_OK(builder.AppendValues(arrow::internal::Iota(0,100))); + ARROW_RETURN_NOT_OK(builder.Finish(&arr_y)); + + std::shared_ptr arr_z_values; + std::shared_ptr arr_z_offsets; + ARROW_RETURN_NOT_OK(builder.AppendValues(arrow::internal::Iota(0,300))); + ARROW_RETURN_NOT_OK(builder.Finish(&arr_z_values)); + std::vector offsets = arrow::internal::Iota(0, 101); + std::transform(offsets.begin(), offsets.end(), offsets.begin(), [](int x) { return x * 3; }); + ARROW_RETURN_NOT_OK(builder.AppendValues(offsets)); + ARROW_RETURN_NOT_OK(builder.Finish(&arr_z_offsets)); + ARROW_ASSIGN_OR_RAISE(auto arr_z, arrow::ListArray::FromArrays(*arr_z_offsets, *arr_z_values)); + + + auto schema = arrow::schema( + { + arrow::field("x", arrow::int32()), + 
arrow::field("y", arrow::int32()), + arrow::field("z", arrow::list(arrow::int32())) + }); + + return arrow::Table::Make(schema, {arr_x, arr_y, arr_z}); } arrow::Result> GetRBR() { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr table, GetTable()); - auto reader = std::make_shared(table); - reader->set_chunksize(10); - return reader; + ARROW_ASSIGN_OR_RAISE(std::shared_ptr table, GetTable()); + auto reader = std::make_shared(table); + reader->set_chunksize(10); + return reader; } arrow::Status WriteFullFile(std::string path_to_file) { - // #include "parquet/arrow/writer.h" - // #include "arrow/util/type_fwd.h" - using parquet::ArrowWriterProperties; - using parquet::WriterProperties; + using parquet::ArrowWriterProperties; + using parquet::WriterProperties; - ARROW_ASSIGN_OR_RAISE(std::shared_ptr table, GetTable()); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr table, GetTable()); - // Choose compression - std::shared_ptr props = - WriterProperties::Builder().compression(arrow::Compression::SNAPPY)->build(); + // Choose compression + std::shared_ptr props = + WriterProperties::Builder().max_row_group_length(50)->enable_write_page_index()->write_batch_size(13) + ->data_pagesize(1) // this will cause every batch creating a page + ->compression(arrow::Compression::SNAPPY)->build(); + std::cout << "hello" << std::endl; - // Opt to store Arrow schema for easier reads back into Arrow - std::shared_ptr arrow_props = - ArrowWriterProperties::Builder().store_schema()->build(); + // Opt to store Arrow schema for easier reads back into Arrow + std::shared_ptr arrow_props = + ArrowWriterProperties::Builder().store_schema()->build(); - std::shared_ptr outfile; - ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open(path_to_file)); + std::shared_ptr outfile; + ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open(path_to_file)); - ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table.get(), - arrow::default_memory_pool(), outfile, - /*chunk_size=*/3, props, arrow_props)); 
- return arrow::Status::OK(); + ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table.get(), + arrow::default_memory_pool(), outfile, + /*chunk_size=*/100, props, arrow_props)); + return arrow::Status::OK(); } arrow::Status WriteInBatches(std::string path_to_file) { - // #include "parquet/arrow/writer.h" - // #include "arrow/util/type_fwd.h" - using parquet::ArrowWriterProperties; - using parquet::WriterProperties; - - // Data is in RBR - std::shared_ptr batch_stream; - ARROW_ASSIGN_OR_RAISE(batch_stream, GetRBR()); - - // Choose compression - std::shared_ptr props = - WriterProperties::Builder().compression(arrow::Compression::SNAPPY)->build(); - - // Opt to store Arrow schema for easier reads back into Arrow - std::shared_ptr arrow_props = - ArrowWriterProperties::Builder().store_schema()->build(); - - // Create a writer - std::shared_ptr outfile; - ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open(path_to_file)); - std::unique_ptr writer; - ARROW_ASSIGN_OR_RAISE( - writer, parquet::arrow::FileWriter::Open(*batch_stream->schema().get(), - arrow::default_memory_pool(), outfile, - props, arrow_props)); - - // Write each batch as a row_group - for (arrow::Result> maybe_batch : *batch_stream) { - ARROW_ASSIGN_OR_RAISE(auto batch, maybe_batch); - ARROW_ASSIGN_OR_RAISE(auto table, - arrow::Table::FromRecordBatches(batch->schema(), {batch})); - ARROW_RETURN_NOT_OK(writer->WriteTable(*table.get(), batch->num_rows())); - } - - // Write file footer and close - ARROW_RETURN_NOT_OK(writer->Close()); - - return arrow::Status::OK(); + // #include "parquet/arrow/writer.h" + // #include "arrow/util/type_fwd.h" + using parquet::ArrowWriterProperties; + using parquet::WriterProperties; + + // Data is in RBR + std::shared_ptr batch_stream; + ARROW_ASSIGN_OR_RAISE(batch_stream, GetRBR()); + + // Choose compression + std::shared_ptr props = + WriterProperties::Builder().compression(arrow::Compression::SNAPPY)->build(); + + // Opt to store Arrow schema for easier reads 
back into Arrow + std::shared_ptr arrow_props = + ArrowWriterProperties::Builder().store_schema()->build(); + + // Create a writer + std::shared_ptr outfile; + ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open(path_to_file)); + std::unique_ptr writer; + ARROW_ASSIGN_OR_RAISE( + writer, parquet::arrow::FileWriter::Open(*batch_stream->schema().get(), + arrow::default_memory_pool(), outfile, + props, arrow_props)); + + // Write each batch as a row_group + for (arrow::Result> maybe_batch: *batch_stream) { + ARROW_ASSIGN_OR_RAISE(auto batch, maybe_batch); + ARROW_ASSIGN_OR_RAISE(auto table, + arrow::Table::FromRecordBatches(batch->schema(), {batch})); + ARROW_RETURN_NOT_OK(writer->WriteTable(*table.get(), batch->num_rows())); + } + + // Write file footer and close + ARROW_RETURN_NOT_OK(writer->Close()); + + return arrow::Status::OK(); } arrow::Status RunExamples(std::string path_to_file) { - ARROW_RETURN_NOT_OK(WriteFullFile(path_to_file)); - ARROW_RETURN_NOT_OK(ReadFullFile(path_to_file)); - ARROW_RETURN_NOT_OK(WriteInBatches(path_to_file)); - ARROW_RETURN_NOT_OK(ReadInBatches(path_to_file)); - return arrow::Status::OK(); + // ARROW_RETURN_NOT_OK(WriteFullFile(path_to_file)); + // ARROW_RETURN_NOT_OK(ReadFullFile(path_to_file)); + // ARROW_RETURN_NOT_OK(WriteInBatches(path_to_file)); + ARROW_RETURN_NOT_OK(ReadInBatches(path_to_file)); + return arrow::Status::OK(); } int main(int argc, char** argv) { - if (argc != 2) { - // Fake success for CI purposes. + if (argc != 2) { + // Fake success for CI purposes. 
+ return EXIT_SUCCESS; + } + + std::string path_to_file = argv[1]; + arrow::Status status = RunExamples(path_to_file); + + if (!status.ok()) { + std::cerr << "Error occurred: " << status.message() << std::endl; + return EXIT_FAILURE; + } return EXIT_SUCCESS; - } - - std::string path_to_file = argv[1]; - arrow::Status status = RunExamples(path_to_file); - - if (!status.ok()) { - std::cerr << "Error occurred: " << status.message() << std::endl; - return EXIT_FAILURE; - } - return EXIT_SUCCESS; } diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 40fbdcbb562b..34316bf47c1b 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -17,12 +17,15 @@ #include "parquet/arrow/reader.h" +#include + #include #include #include #include #include #include +#include #include "arrow/array.h" #include "arrow/buffer.h" @@ -72,6 +75,8 @@ using arrow::internal::Iota; // Help reduce verbosity using ParquetReader = parquet::ParquetFileReader; +using parquet::RowRangesPtr; +using parquet::Range; using parquet::internal::RecordReader; namespace bit_util = arrow::bit_util; @@ -203,6 +208,7 @@ class FileReaderImpl : public FileReader { Status GetFieldReader(int i, const std::shared_ptr>& included_leaves, const std::vector& row_groups, + const std::shared_ptr>& row_ranges_map, std::unique_ptr* out) { // Should be covered by GetRecordBatchReader checks but // manifest_.schema_fields is a separate variable so be extra careful. 
@@ -219,11 +225,13 @@ class FileReaderImpl : public FileReader { ctx->iterator_factory = SomeRowGroupsFactory(row_groups); ctx->filter_leaves = true; ctx->included_leaves = included_leaves; + ctx->row_ranges_map = row_ranges_map; return GetReader(manifest_.schema_fields[i], ctx, out); } Status GetFieldReaders(const std::vector& column_indices, const std::vector& row_groups, + const std::shared_ptr>& row_ranges_map, std::vector>* out, std::shared_ptr<::arrow::Schema>* out_schema) { // We only need to read schema fields which have columns indicated @@ -238,7 +246,8 @@ class FileReaderImpl : public FileReader { for (size_t i = 0; i < out->size(); ++i) { std::unique_ptr reader; RETURN_NOT_OK( - GetFieldReader(field_indices[i], included_leaves, row_groups, &reader)); + GetFieldReader(field_indices[i], included_leaves, row_groups, + row_ranges_map, &reader)); out_fields[i] = reader->field(); out->at(i) = std::move(reader); @@ -265,7 +274,7 @@ class FileReaderImpl : public FileReader { std::vector row_groups = Iota(reader_->metadata()->num_row_groups()); std::unique_ptr reader; - RETURN_NOT_OK(GetFieldReader(i, included_leaves, row_groups, &reader)); + RETURN_NOT_OK(GetFieldReader(i, included_leaves, row_groups, NULLPTR, &reader)); return ReadColumn(i, row_groups, reader.get(), out); } @@ -336,19 +345,26 @@ class FileReaderImpl : public FileReader { return ReadRowGroup(i, Iota(reader_->metadata()->num_columns()), table); } + Status GetRecordBatchReader( + const std::vector& row_group_indices, const std::vector& column_indices, + const std::shared_ptr>& row_ranges_map, + std::unique_ptr* out) override; + Status GetRecordBatchReader(const std::vector& row_group_indices, const std::vector& column_indices, - std::unique_ptr* out) override; + std::unique_ptr* out) override { + return GetRecordBatchReader(row_group_indices, column_indices, NULLPTR, out); + } Status GetRecordBatchReader(const std::vector& row_group_indices, std::unique_ptr* out) override { return 
GetRecordBatchReader(row_group_indices, - Iota(reader_->metadata()->num_columns()), out); + Iota(reader_->metadata()->num_columns()), NULLPTR, out); } Status GetRecordBatchReader(std::unique_ptr* out) override { return GetRecordBatchReader(Iota(num_row_groups()), - Iota(reader_->metadata()->num_columns()), out); + Iota(reader_->metadata()->num_columns()), NULLPTR, out); } ::arrow::Result<::arrow::AsyncGenerator>> @@ -451,6 +467,43 @@ class RowGroupReaderImpl : public RowGroupReader { // ---------------------------------------------------------------------- // Column reader implementations +struct RowRangesPageFilter { + explicit RowRangesPageFilter(const RowRangesPtr & row_ranges_, const RowRangesPtr & page_ranges_) + : row_ranges(row_ranges_), page_ranges(page_ranges_) { + assert(row_ranges != nullptr); + assert(page_ranges != nullptr); + assert(row_ranges->getRanges().size() > 0); + assert(page_ranges->getRanges().size() > 0); + } + + bool operator() (const DataPageStats & stats) { + ++page_range_idx; + + if (row_range_idx >= row_ranges->getRanges().size()) { + return true; + } + + Range current_page_range = (*page_ranges)[page_range_idx]; + + if (current_page_range.isBefore((*row_ranges)[row_range_idx])) { + return true; + } + + while (row_range_idx < row_ranges->getRanges().size() && + current_page_range.isAfter((*row_ranges)[row_range_idx])) { + row_range_idx++; + } + + return row_range_idx >= row_ranges->getRanges().size(); + } + + size_t row_range_idx = 0; + const RowRangesPtr row_ranges; + + int page_range_idx = -1; + const RowRangesPtr page_ranges; +}; + // Leaf reader is for primitive arrays and primitive children of nested arrays class LeafReader : public ColumnReaderImpl { public: @@ -514,6 +567,43 @@ class LeafReader : public ColumnReaderImpl { std::shared_ptr out_; void NextRowGroup() { std::unique_ptr page_reader = input_->NextChunk(); + + /// using page index to reduce cost + if (page_reader != nullptr && ctx_->row_ranges_map) { + // if specific row 
range is provided for this rg + if (const auto iter = ctx_->row_ranges_map->find(input_->current_row_group()); + iter != ctx_->row_ranges_map->end()) { + + // check offset exists + auto offset_index = ctx_->reader->GetPageIndexReader() + ->RowGroup(input_->current_row_group()) + ->GetOffsetIndex(input_->column_index()); + if (!offset_index) { + throw ParquetException("Offset index is not found for column: " + field_->name()); + } + + const auto page_locations = offset_index->page_locations(); + auto page_ranges = std::make_shared(); + for (size_t i = 0; i < page_locations.size() - 1; i++) { + page_ranges->add({page_locations[i].first_row_index, + page_locations[i + 1].first_row_index - 1}, false); + } + if (page_locations.size() >= 1) { + page_ranges->add({ + page_locations[page_locations.size() - 1].first_row_index, + ctx_->reader->metadata()->RowGroup(input_->current_row_group())->num_rows() - 1}, false); + } + + // part 1, skip decompressing & decoding unnecessary pages + page_reader->set_data_page_filter(RowRangesPageFilter(iter->second, page_ranges)); + + // part 2, skip unnecessary rows in necessary pages + record_reader_->set_record_skipper(std::make_shared( + *page_ranges, *iter->second)); + } + } + + record_reader_->reset_current_rg_processed_records(); record_reader_->SetPageReader(std::move(page_reader)); } @@ -984,6 +1074,7 @@ Status GetReader(const SchemaField& field, const std::shared_ptr& Status FileReaderImpl::GetRecordBatchReader(const std::vector& row_groups, const std::vector& column_indices, + const std::shared_ptr>& row_ranges_map, std::unique_ptr* out) { RETURN_NOT_OK(BoundsCheck(row_groups, column_indices)); @@ -997,7 +1088,7 @@ Status FileReaderImpl::GetRecordBatchReader(const std::vector& row_groups, std::vector> readers; std::shared_ptr<::arrow::Schema> batch_schema; - RETURN_NOT_OK(GetFieldReaders(column_indices, row_groups, &readers, &batch_schema)); + RETURN_NOT_OK(GetFieldReaders(column_indices, row_groups,row_ranges_map, &readers, 
&batch_schema)); if (readers.empty()) { // Just generate all batches right now; they're cheap since they have no columns. @@ -1218,6 +1309,7 @@ Status FileReaderImpl::GetColumn(int i, FileColumnIteratorFactory iterator_facto ctx->pool = pool_; ctx->iterator_factory = iterator_factory; ctx->filter_leaves = false; + std::unique_ptr result; RETURN_NOT_OK(GetReader(manifest_.schema_fields[i], ctx, &result)); *out = std::move(result); @@ -1251,7 +1343,7 @@ Future> FileReaderImpl::DecodeRowGroups( // in a sync context too so use `this` over `self` std::vector> readers; std::shared_ptr<::arrow::Schema> result_schema; - RETURN_NOT_OK(GetFieldReaders(column_indices, row_groups, &readers, &result_schema)); + RETURN_NOT_OK(GetFieldReaders(column_indices, row_groups, NULLPTR, &readers, &result_schema)); // OptionalParallelForAsync requires an executor if (!cpu_executor) cpu_executor = ::arrow::internal::GetCpuThreadPool(); @@ -1314,6 +1406,16 @@ Status FileReader::GetRecordBatchReader(const std::vector& row_group_indice return Status::OK(); } +Status FileReader::GetRecordBatchReader(const std::vector& row_group_indices, + const std::vector& column_indices, + const std::shared_ptr>& row_ranges_map, + std::shared_ptr* out) { + std::unique_ptr tmp; + RETURN_NOT_OK(GetRecordBatchReader(row_group_indices, column_indices, row_ranges_map, &tmp)); + out->reset(tmp.release()); + return Status::OK(); +} + Status FileReader::Make(::arrow::MemoryPool* pool, std::unique_ptr reader, const ArrowReaderProperties& properties, diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h index 2cbd36176f5e..0fd35349b643 100644 --- a/cpp/src/parquet/arrow/reader.h +++ b/cpp/src/parquet/arrow/reader.h @@ -24,6 +24,7 @@ #include #include "parquet/file_reader.h" +#include "parquet/column_reader.h" #include "parquet/platform.h" #include "parquet/properties.h" @@ -187,6 +188,11 @@ class PARQUET_EXPORT FileReader { const std::vector& row_group_indices, const std::vector& column_indices, 
std::unique_ptr<::arrow::RecordBatchReader>* out) = 0; + virtual ::arrow::Status GetRecordBatchReader( + const std::vector& row_group_indices, const std::vector& column_indices, + const std::shared_ptr>& row_ranges_map, + std::unique_ptr<::arrow::RecordBatchReader>* out) = 0; + /// \brief Return a RecordBatchReader of row groups selected from /// row_group_indices, whose columns are selected by column_indices. /// @@ -199,6 +205,10 @@ class PARQUET_EXPORT FileReader { /// /// \returns error Status if either row_group_indices or column_indices /// contains an invalid index + ::arrow::Status GetRecordBatchReader(const std::vector& row_group_indices, + const std::vector& column_indices, + const std::shared_ptr>& row_ranges_map, + std::shared_ptr<::arrow::RecordBatchReader>* out); ::arrow::Status GetRecordBatchReader(const std::vector& row_group_indices, const std::vector& column_indices, std::shared_ptr<::arrow::RecordBatchReader>* out); diff --git a/cpp/src/parquet/arrow/reader_internal.h b/cpp/src/parquet/arrow/reader_internal.h index cf9dbb86577b..56be0f93f414 100644 --- a/cpp/src/parquet/arrow/reader_internal.h +++ b/cpp/src/parquet/arrow/reader_internal.h @@ -76,6 +76,7 @@ class FileColumnIterator { } auto row_group_reader = reader_->RowGroup(row_groups_.front()); + current_rg = row_groups_.front(); row_groups_.pop_front(); return row_group_reader->GetColumnPageReader(column_index_); } @@ -88,11 +89,14 @@ class FileColumnIterator { int column_index() const { return column_index_; } + int current_row_group() const { return current_rg; } + protected: int column_index_; ParquetFileReader* reader_; const SchemaDescriptor* schema_; std::deque row_groups_; + int current_rg = 0; }; using FileColumnIteratorFactory = @@ -109,6 +113,7 @@ struct ReaderContext { FileColumnIteratorFactory iterator_factory; bool filter_leaves; std::shared_ptr> included_leaves; + std::shared_ptr> row_ranges_map; bool IncludesLeaf(int leaf_index) const { if (this->filter_leaves) { diff --git 
a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 3294aaaf283f..5187ef94aa9c 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -1373,7 +1373,7 @@ class TypedRecordReader : public TypedColumnReaderImpl, int64_t records_read = 0; if (has_values_to_process()) { - records_read += ReadRecordData(num_records); + records_read += ReadRecordDataWithSkipCheck(num_records); } int64_t level_batch_size = std::max(kMinLevelBatchSize, num_records); @@ -1427,11 +1427,11 @@ class TypedRecordReader : public TypedColumnReaderImpl, } levels_written_ += levels_read; - records_read += ReadRecordData(num_records - records_read); + records_read += ReadRecordDataWithSkipCheck(num_records - records_read); } else { // No repetition or definition levels batch_size = std::min(num_records - records_read, batch_size); - records_read += ReadRecordData(batch_size); + records_read += ReadRecordDataWithSkipCheck(batch_size); } } @@ -1634,10 +1634,12 @@ class TypedRecordReader : public TypedColumnReaderImpl, // Top level required field. Number of records equals to number of levels, // and there is not read-ahead for levels. + int64_t skipped_records = 0; if (this->max_rep_level_ == 0 && this->max_def_level_ == 0) { - return this->Skip(num_records); + skipped_records = this->Skip(num_records); + current_rg_processed_records += skipped_records; + return skipped_records; } - int64_t skipped_records = 0; if (this->max_rep_level_ == 0) { // Non-repeated optional field. // First consume whatever is in the buffer. 
/// A closed interval [from, to] of row indices; both endpoints inclusive.
/// {-1, -1} is used as an "invalid/empty" sentinel by the static helpers.
struct Range {
  /// Union of two ranges when they overlap or are adjacent; {-1, -1} when
  /// they are disjoint and non-adjacent.
  static Range unionRange(const Range& left, const Range& right) {
    if (left.from <= right.from) {
      if (left.to + 1 >= right.from) {
        return {left.from, std::max(left.to, right.to)};
      }
    } else if (right.to + 1 >= left.from) {
      return {right.from, std::max(left.to, right.to)};
    }
    return {-1, -1};
  }

  /// Intersection of two ranges; {-1, -1} when they do not overlap.
  static Range intersection(const Range& left, const Range& right) {
    if (left.from <= right.from) {
      if (left.to >= right.from) {
        return {right.from, std::min(left.to, right.to)};
      }
    } else if (right.to >= left.from) {
      return {left.from, std::min(left.to, right.to)};
    }
    return {-1, -1};  // no intersection found
  }

  int64_t from;  // first row index, inclusive
  int64_t to;    // last row index, inclusive

  Range(const int64_t from_, const int64_t to_) : from(from_), to(to_) {
    assert(from <= to);
  }

  /// Number of rows covered by this range.
  size_t count() const { return static_cast<size_t>(to - from + 1); }

  bool isBefore(const Range& other) const { return to < other.from; }

  bool isAfter(const Range& other) const { return from > other.to; }

  bool isOverlap(const Range& other) const {
    return !isBefore(other) && !isAfter(other);
  }

  std::string toString() const {
    return "[" + std::to_string(from) + ", " + std::to_string(to) + "]";
  }
};

/// An ordered set of non-overlapping row Ranges within a single row group.
/// Ranges are kept sorted by `from`; add() keeps adjacent/overlapping ranges
/// coalesced unless explicitly told not to.
class RowRanges {
  std::vector<Range> ranges;

 public:
  RowRanges() = default;

  explicit RowRanges(const Range& range) { ranges.push_back(range); }

  RowRanges(const std::vector<Range>& ranges_) : ranges(ranges_) {}

  /// A single range covering rows [0, rowCount - 1].
  static RowRanges createSingle(const size_t rowCount) {
    return RowRanges({Range(0, static_cast<int64_t>(rowCount) - 1)});
  }

  /// Union of two ordered range sets, coalescing touching ranges.
  ///
  /// Implemented as a plain ordered merge. (The previous implementation
  /// swapped iterators between the two input vectors — undefined behavior —
  /// and walked a `left` iterator against `right`'s end() when `right` was
  /// empty, running past the end of `left`.)
  static RowRanges unionRanges(const RowRanges& left, const RowRanges& right) {
    RowRanges result;
    auto it1 = left.ranges.begin();
    auto it2 = right.ranges.begin();
    while (it1 != left.ranges.end() && it2 != right.ranges.end()) {
      if (it1->from <= it2->from) {
        result.add(*it1++);
      } else {
        result.add(*it2++);
      }
    }
    while (it1 != left.ranges.end()) result.add(*it1++);
    while (it2 != right.ranges.end()) result.add(*it2++);
    return result;
  }

  /// Intersection of two ordered range sets.
  static RowRanges intersection(const RowRanges& left, const RowRanges& right) {
    RowRanges result;

    size_t rightIndex = 0;
    for (const Range& l : left.ranges) {
      for (size_t i = rightIndex, n = right.ranges.size(); i < n; ++i) {
        const Range& r = right.ranges[i];
        if (l.isBefore(r)) {
          break;
        } else if (l.isAfter(r)) {
          // r can never match a later l either; skip it permanently.
          rightIndex = i + 1;
          continue;
        }
        result.add(Range::intersection(l, r));
      }
    }

    return result;
  }

  /// Ranges fully contained in [from, to].
  RowRanges slice(const int64_t from, const int64_t to) const {
    RowRanges result;
    for (const Range& range : ranges) {
      if (range.from >= from && range.to <= to) {
        result.add(range);
      }
    }
    return result;
  }

  /// Append a range, which must not start before the existing tail.
  /// With merge=true (default) it is coalesced with any trailing ranges it
  /// overlaps or touches; with merge=false it is appended verbatim.
  void add(const Range& range, bool merge = true) {
    Range rangeToAdd = range;
    if (merge) {
      for (int i = static_cast<int>(ranges.size()) - 1; i >= 0; --i) {
        const Range last = ranges[i];
        assert(!last.isAfter(range));
        const Range u = Range::unionRange(last, rangeToAdd);
        if (u.from == -1 && u.to == -1) {
          // Disjoint from the tail: nothing left to coalesce. (The previous
          // code asserted here and, under NDEBUG, replaced the range with the
          // {-1, -1} sentinel, corrupting the set.)
          break;
        }
        rangeToAdd = u;
        ranges.erase(ranges.begin() + i);
      }
    }
    ranges.push_back(rangeToAdd);
  }

  /// Total number of rows covered by all ranges.
  size_t rowCount() const {
    size_t cnt = 0;
    for (const Range& range : ranges) {
      cnt += range.count();
    }
    return cnt;
  }

  bool isOverlapping(int64_t from, int64_t to) const {
    return isOverlapping(Range(from, to));
  }

  /// True when any stored range overlaps searchRange. Relies on `ranges`
  /// being sorted (binary search via lower_bound).
  bool isOverlapping(const Range& searchRange) const {
    auto it = std::lower_bound(
        ranges.begin(), ranges.end(), searchRange,
        [](const Range& r1, const Range& r2) { return r1.isBefore(r2); });
    return it != ranges.end() && !(*it).isAfter(searchRange);
  }

  std::vector<Range>& getRanges() { return ranges; }

  const Range& operator[](size_t index) const { return ranges[index]; }

  std::string toString() const {
    std::string result = "[";
    for (const Range& range : ranges) {
      result += "(" + std::to_string(range.from) + ", " + std::to_string(range.to) + "), ";
    }
    if (!ranges.empty()) {
      result = result.substr(0, result.size() - 2);
    }
    result += "]";
    return result;
  }
};

using RowRangesPtr = std::shared_ptr<RowRanges>;

namespace internal {

/// Translates row-group-relative read progress into "read N / skip N" advice
/// so that TypedRecordReader honors an explicit row-range selection.
///
/// Pages that contain no selected rows are dropped by the page filter without
/// updating any row counters, so the constructor pre-shifts the requested
/// ranges as if those pages never existed.
class PARQUET_EXPORT RecordSkipper {
 public:
  /// \param pages row ranges covered by each data page of the column chunk
  /// \param row_ranges_ requested rows (row-group-relative, ordered)
  RecordSkipper(RowRanges& pages, RowRanges& row_ranges_) : row_ranges(row_ranges_) {
    RowRanges skip_pages;
    for (auto& page : pages.getRanges()) {
      if (!row_ranges.isOverlapping(page)) {
        // Page holds no selected rows: the page filter drops it silently.
        skip_pages.add(page, false);
      }
    }
    adjust_ranges(skip_pages, row_ranges);
  }

  /// \brief Return the number of records to read or to skip.
  /// If the return value is positive, read that many records.
  /// If the return value is negative, skip that many records.
  /// If the return value is 0, all records of this row group are consumed.
  int64_t advise_next(const int64_t current_rg_processed) {
    if (row_ranges.getRanges().size() == row_range_idx) {
      return 0;
    }

    if (row_ranges[row_range_idx].to < current_rg_processed) {
      row_range_idx++;
      if (row_ranges.getRanges().size() == row_range_idx) {
        return 0;
      }
    }

    if (row_ranges[row_range_idx].from > current_rg_processed) {
      // Negative: skip up to the start of the next selected range.
      return current_rg_processed - row_ranges[row_range_idx].from;
    }

    const auto ret = row_ranges[row_range_idx].to - current_rg_processed + 1;
    assert(ret >= 1);
    return ret;
  }

 private:
  /// Copy of the requested ranges, rewritten by adjust_ranges().
  RowRanges row_ranges;

  size_t row_range_idx = 0;

  /// Since skipped pages are silently dropped without updating
  /// current_rg_processed_records or records_read_, pre-shift the row ranges
  /// as if those skipped pages never existed.
  void adjust_ranges(RowRanges& skip_pages, RowRanges& to_adjust) {
    size_t skipped_rows = 0;
    auto iter = to_adjust.getRanges().begin();
    auto skip_iter = skip_pages.getRanges().begin();
    while (iter != to_adjust.getRanges().end()) {
      while (skip_iter != skip_pages.getRanges().end() && skip_iter->isBefore(*iter)) {
        skipped_rows += skip_iter->count();
        ++skip_iter;
      }
      iter->from -= static_cast<int64_t>(skipped_rows);
      iter->to -= static_cast<int64_t>(skipped_rows);
      ++iter;
    }
  }
};

}  // namespace internal
SkipRecords will not /// add any value to this buffer. @@ -463,6 +794,8 @@ class PARQUET_EXPORT RecordReader { // If true, we will not leave any space for the null values in the values_ // vector. bool read_dense_for_nullable_ = false; + + std::shared_ptr skipper = NULLPTR; }; class BinaryRecordReader : virtual public RecordReader { diff --git a/cpp/src/parquet/reader_test.cc b/cpp/src/parquet/reader_test.cc index 0a73002846ad..a9adcdf5b9c3 100644 --- a/cpp/src/parquet/reader_test.cc +++ b/cpp/src/parquet/reader_test.cc @@ -1457,3 +1457,6 @@ TEST(PageIndexReaderTest, ReadFileWithoutPageIndex) { } } // namespace parquet + + +//TODO: TEST_P ,enable dictionary \ No newline at end of file From 3e9af38857a7c2bf29f9d11dc6256cd982904ce8 Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Wed, 22 Nov 2023 22:25:30 +0800 Subject: [PATCH 02/25] happy path pass 2 --- cpp/examples/arrow/parquet_read_write.cc | 263 +++++++++-------------- 1 file changed, 104 insertions(+), 159 deletions(-) diff --git a/cpp/examples/arrow/parquet_read_write.cc b/cpp/examples/arrow/parquet_read_write.cc index cc267f38d73d..63f1a28fe475 100644 --- a/cpp/examples/arrow/parquet_read_write.cc +++ b/cpp/examples/arrow/parquet_read_write.cc @@ -17,195 +17,140 @@ #include "arrow/api.h" #include "arrow/io/api.h" +#include "arrow/io/memory.h" #include "arrow/result.h" #include "arrow/util/type_fwd.h" #include "parquet/arrow/reader.h" #include "parquet/arrow/writer.h" -#include +#include #include +#include -arrow::Status ReadInBatches(std::string path_to_file) { - // #include "arrow/io/api.h" - // #include "arrow/parquet/arrow/reader.h" - - arrow::MemoryPool* pool = arrow::default_memory_pool(); - - // Configure general Parquet reader settings - auto reader_properties = parquet::ReaderProperties(pool); - reader_properties.set_buffer_size(4096 * 4); - reader_properties.enable_buffered_stream(); - - // Configure Arrow-specific Parquet reader settings - auto arrow_reader_props = parquet::ArrowReaderProperties(); 
- arrow_reader_props.set_batch_size(10); // default 64 * 1024 - - parquet::arrow::FileReaderBuilder reader_builder; - ARROW_RETURN_NOT_OK( - reader_builder.OpenFile(path_to_file, /*memory_map=*/false, reader_properties)); - reader_builder.memory_pool(pool); - reader_builder.properties(arrow_reader_props); - - std::unique_ptr arrow_reader; - ARROW_ASSIGN_OR_RAISE(arrow_reader, reader_builder.Build()); +arrow::Result> GetTable() { + auto builder = arrow::Int32Builder(); + + std::shared_ptr arr_x; + ARROW_RETURN_NOT_OK(builder.AppendValues(arrow::internal::Iota(0, 100))); + ARROW_RETURN_NOT_OK(builder.Finish(&arr_x)); + + std::shared_ptr arr_y; + ARROW_RETURN_NOT_OK(builder.AppendValues(arrow::internal::Iota(0, 100))); + ARROW_RETURN_NOT_OK(builder.Finish(&arr_y)); + + std::shared_ptr arr_z_values; + std::shared_ptr arr_z_offsets; + ARROW_RETURN_NOT_OK(builder.AppendValues(arrow::internal::Iota(0, 300))); + ARROW_RETURN_NOT_OK(builder.Finish(&arr_z_values)); + std::vector offsets = arrow::internal::Iota(0, 101); + std::transform(offsets.begin(), offsets.end(), offsets.begin(), + [](int x) { return x * 3; }); + ARROW_RETURN_NOT_OK(builder.AppendValues(offsets)); + ARROW_RETURN_NOT_OK(builder.Finish(&arr_z_offsets)); + ARROW_ASSIGN_OR_RAISE(auto arr_z, + arrow::ListArray::FromArrays(*arr_z_offsets, *arr_z_values)); + + auto schema = + arrow::schema({arrow::field("x", arrow::int32()), arrow::field("y", arrow::int32()), + arrow::field("z", arrow::list(arrow::int32()))}); + + return arrow::Table::Make(schema, {arr_x, arr_y, arr_z}); +} - std::shared_ptr<::arrow::RecordBatchReader> rb_reader; - auto row_ranges_map = std::make_shared>(); +arrow::Result> WriteFullFile() { + using parquet::ArrowWriterProperties; + using parquet::WriterProperties; - std::vector ranges; - for (int64_t i = 0; i < 50; i++) { - if (i % 2 == 0) - ranges.push_back({i, i}); - } - row_ranges_map->insert({0, std::make_shared(ranges)}); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr table, GetTable()); + 
std::shared_ptr props = + WriterProperties::Builder() + .max_row_group_length(50) + ->enable_write_page_index() + ->write_batch_size(13) + ->data_pagesize(1) // this will cause every batch creating a page + ->compression(arrow::Compression::SNAPPY) + ->build(); - ARROW_RETURN_NOT_OK(arrow_reader->GetRecordBatchReader({0,1},{0,1},row_ranges_map,&rb_reader)); + std::shared_ptr arrow_props = + ArrowWriterProperties::Builder().store_schema()->build(); - size_t total_rows = 0; - size_t total_values = 0; - for (arrow::Result> maybe_batch: *rb_reader) { - // Operate on each batch... - auto batch = maybe_batch.ValueOrDie(); - total_rows += batch->num_rows(); - std::cout << "batch size: " << batch->num_rows() << std::endl; + ARROW_ASSIGN_OR_RAISE(auto outfile, ::arrow::io::BufferOutputStream::Create()); - auto int_array = std::dynamic_pointer_cast(batch->column(1)); - for (auto iter = int_array->begin(); iter != int_array->end(); ++iter) { - total_values += (*iter).value(); - } - } - std::cout << "total rows is : " << total_rows << std::endl; - std::cout << "total value of y is : " << total_values << std::endl; - return arrow::Status::OK(); + ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table.get(), + arrow::default_memory_pool(), outfile, + /*chunk_size=*/100, props, arrow_props)); + return outfile->Finish(); } -arrow::Result> GetTable() { - auto builder = arrow::Int32Builder(); - - std::shared_ptr arr_x; - ARROW_RETURN_NOT_OK(builder.AppendValues(arrow::internal::Iota(0,100))); - ARROW_RETURN_NOT_OK(builder.Finish(&arr_x)); - - std::shared_ptr arr_y; - ARROW_RETURN_NOT_OK(builder.AppendValues(arrow::internal::Iota(0,100))); - ARROW_RETURN_NOT_OK(builder.Finish(&arr_y)); - - std::shared_ptr arr_z_values; - std::shared_ptr arr_z_offsets; - ARROW_RETURN_NOT_OK(builder.AppendValues(arrow::internal::Iota(0,300))); - ARROW_RETURN_NOT_OK(builder.Finish(&arr_z_values)); - std::vector offsets = arrow::internal::Iota(0, 101); - std::transform(offsets.begin(), offsets.end(), 
offsets.begin(), [](int x) { return x * 3; }); - ARROW_RETURN_NOT_OK(builder.AppendValues(offsets)); - ARROW_RETURN_NOT_OK(builder.Finish(&arr_z_offsets)); - ARROW_ASSIGN_OR_RAISE(auto arr_z, arrow::ListArray::FromArrays(*arr_z_offsets, *arr_z_values)); - - - auto schema = arrow::schema( - { - arrow::field("x", arrow::int32()), - arrow::field("y", arrow::int32()), - arrow::field("z", arrow::list(arrow::int32())) - }); - - return arrow::Table::Make(schema, {arr_x, arr_y, arr_z}); -} -arrow::Result> GetRBR() { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr table, GetTable()); - auto reader = std::make_shared(table); - reader->set_chunksize(10); - return reader; -} +arrow::Status ReadInBatches(std::shared_ptr buffer) { + arrow::MemoryPool* pool = arrow::default_memory_pool(); -arrow::Status WriteFullFile(std::string path_to_file) { - using parquet::ArrowWriterProperties; - using parquet::WriterProperties; + auto reader_properties = parquet::ReaderProperties(pool); + reader_properties.set_buffer_size(4096 * 4); + reader_properties.enable_buffered_stream(); - ARROW_ASSIGN_OR_RAISE(std::shared_ptr table, GetTable()); + auto arrow_reader_props = parquet::ArrowReaderProperties(); + arrow_reader_props.set_batch_size(10); // default 64 * 1024 - // Choose compression - std::shared_ptr props = - WriterProperties::Builder().max_row_group_length(50)->enable_write_page_index()->write_batch_size(13) - ->data_pagesize(1) // this will cause every batch creating a page - ->compression(arrow::Compression::SNAPPY)->build(); - std::cout << "hello" << std::endl; + parquet::arrow::FileReaderBuilder reader_builder; + auto in_file = std::make_shared<::arrow::io::BufferReader>(buffer); + ARROW_RETURN_NOT_OK(reader_builder.Open(in_file, /*memory_map=*/reader_properties)); + reader_builder.memory_pool(pool); + reader_builder.properties(arrow_reader_props); - // Opt to store Arrow schema for easier reads back into Arrow - std::shared_ptr arrow_props = - 
ArrowWriterProperties::Builder().store_schema()->build(); + ARROW_ASSIGN_OR_RAISE(auto arrow_reader, reader_builder.Build()); - std::shared_ptr outfile; - ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open(path_to_file)); + std::shared_ptr<::arrow::RecordBatchReader> rb_reader; + auto row_ranges_map = std::make_shared>(); - ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table.get(), - arrow::default_memory_pool(), outfile, - /*chunk_size=*/100, props, arrow_props)); - return arrow::Status::OK(); -} + std::vector ranges; + for (int64_t i = 0; i < 50; i++) { + if (i % 2 == 0) ranges.push_back({i, i}); + } + row_ranges_map->insert({0, std::make_shared(ranges)}); -arrow::Status WriteInBatches(std::string path_to_file) { - // #include "parquet/arrow/writer.h" - // #include "arrow/util/type_fwd.h" - using parquet::ArrowWriterProperties; - using parquet::WriterProperties; - - // Data is in RBR - std::shared_ptr batch_stream; - ARROW_ASSIGN_OR_RAISE(batch_stream, GetRBR()); - - // Choose compression - std::shared_ptr props = - WriterProperties::Builder().compression(arrow::Compression::SNAPPY)->build(); - - // Opt to store Arrow schema for easier reads back into Arrow - std::shared_ptr arrow_props = - ArrowWriterProperties::Builder().store_schema()->build(); - - // Create a writer - std::shared_ptr outfile; - ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open(path_to_file)); - std::unique_ptr writer; - ARROW_ASSIGN_OR_RAISE( - writer, parquet::arrow::FileWriter::Open(*batch_stream->schema().get(), - arrow::default_memory_pool(), outfile, - props, arrow_props)); - - // Write each batch as a row_group - for (arrow::Result> maybe_batch: *batch_stream) { - ARROW_ASSIGN_OR_RAISE(auto batch, maybe_batch); - ARROW_ASSIGN_OR_RAISE(auto table, - arrow::Table::FromRecordBatches(batch->schema(), {batch})); - ARROW_RETURN_NOT_OK(writer->WriteTable(*table.get(), batch->num_rows())); - } + ARROW_RETURN_NOT_OK( + arrow_reader->GetRecordBatchReader({0, 1}, {0, 
1}, row_ranges_map, &rb_reader)); - // Write file footer and close - ARROW_RETURN_NOT_OK(writer->Close()); + size_t total_rows = 0; + size_t total_values = 0; + for (arrow::Result> maybe_batch : *rb_reader) { + // Operate on each batch... + auto batch = maybe_batch.ValueOrDie(); + total_rows += batch->num_rows(); + std::cout << "batch size: " << batch->num_rows() << std::endl; - return arrow::Status::OK(); + auto int_array = std::dynamic_pointer_cast(batch->column(1)); + for (auto iter = int_array->begin(); iter != int_array->end(); ++iter) { + total_values += (*iter).value(); + } + } + std::cout << "total rows is : " << total_rows << std::endl; + std::cout << "total value of y is : " << total_values << std::endl; + return arrow::Status::OK(); } -arrow::Status RunExamples(std::string path_to_file) { - // ARROW_RETURN_NOT_OK(WriteFullFile(path_to_file)); - // ARROW_RETURN_NOT_OK(ReadFullFile(path_to_file)); - // ARROW_RETURN_NOT_OK(WriteInBatches(path_to_file)); - ARROW_RETURN_NOT_OK(ReadInBatches(path_to_file)); - return arrow::Status::OK(); + +arrow::Status RunExamples() { + ARROW_ASSIGN_OR_RAISE(auto buffer, WriteFullFile()); + ARROW_RETURN_NOT_OK(ReadInBatches(buffer)); + return arrow::Status::OK(); } int main(int argc, char** argv) { - if (argc != 2) { - // Fake success for CI purposes. - return EXIT_SUCCESS; - } + if (argc != 2) { + // Fake success for CI purposes. 
+ return EXIT_SUCCESS; + } - std::string path_to_file = argv[1]; - arrow::Status status = RunExamples(path_to_file); + std::string path_to_file = argv[1]; + arrow::Status status = RunExamples(); - if (!status.ok()) { - std::cerr << "Error occurred: " << status.message() << std::endl; - return EXIT_FAILURE; - } - return EXIT_SUCCESS; + if (!status.ok()) { + std::cerr << "Error occurred: " << status.message() << std::endl; + return EXIT_FAILURE; + } + return EXIT_SUCCESS; } From a185d54cf16a95f1eec5c7809666740e52b16c17 Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Thu, 23 Nov 2023 15:00:44 +0800 Subject: [PATCH 03/25] happy path pass 3 --- cpp/examples/arrow/parquet_read_write.cc | 273 +++++++----- cpp/src/parquet/CMakeLists.txt | 1 + cpp/src/parquet/arrow/reader.cc | 107 +++-- cpp/src/parquet/column_reader.h | 527 +++++++++++------------ cpp/src/parquet/filtered_reader_test.cc | 207 +++++++++ 5 files changed, 693 insertions(+), 422 deletions(-) create mode 100644 cpp/src/parquet/filtered_reader_test.cc diff --git a/cpp/examples/arrow/parquet_read_write.cc b/cpp/examples/arrow/parquet_read_write.cc index 63f1a28fe475..fa45a34cff49 100644 --- a/cpp/examples/arrow/parquet_read_write.cc +++ b/cpp/examples/arrow/parquet_read_write.cc @@ -17,140 +17,207 @@ #include "arrow/api.h" #include "arrow/io/api.h" -#include "arrow/io/memory.h" #include "arrow/result.h" #include "arrow/util/type_fwd.h" #include "parquet/arrow/reader.h" #include "parquet/arrow/writer.h" -#include -#include #include +#include -arrow::Result> GetTable() { - auto builder = arrow::Int32Builder(); - - std::shared_ptr arr_x; - ARROW_RETURN_NOT_OK(builder.AppendValues(arrow::internal::Iota(0, 100))); - ARROW_RETURN_NOT_OK(builder.Finish(&arr_x)); - - std::shared_ptr arr_y; - ARROW_RETURN_NOT_OK(builder.AppendValues(arrow::internal::Iota(0, 100))); - ARROW_RETURN_NOT_OK(builder.Finish(&arr_y)); - - std::shared_ptr arr_z_values; - std::shared_ptr arr_z_offsets; - 
ARROW_RETURN_NOT_OK(builder.AppendValues(arrow::internal::Iota(0, 300))); - ARROW_RETURN_NOT_OK(builder.Finish(&arr_z_values)); - std::vector offsets = arrow::internal::Iota(0, 101); - std::transform(offsets.begin(), offsets.end(), offsets.begin(), - [](int x) { return x * 3; }); - ARROW_RETURN_NOT_OK(builder.AppendValues(offsets)); - ARROW_RETURN_NOT_OK(builder.Finish(&arr_z_offsets)); - ARROW_ASSIGN_OR_RAISE(auto arr_z, - arrow::ListArray::FromArrays(*arr_z_offsets, *arr_z_values)); - - auto schema = - arrow::schema({arrow::field("x", arrow::int32()), arrow::field("y", arrow::int32()), - arrow::field("z", arrow::list(arrow::int32()))}); - - return arrow::Table::Make(schema, {arr_x, arr_y, arr_z}); -} +arrow::Status ReadInBatches(std::string path_to_file) { + // #include "arrow/io/api.h" + // #include "arrow/parquet/arrow/reader.h" -arrow::Result> WriteFullFile() { - using parquet::ArrowWriterProperties; - using parquet::WriterProperties; + arrow::MemoryPool* pool = arrow::default_memory_pool(); - ARROW_ASSIGN_OR_RAISE(std::shared_ptr table, GetTable()); + // Configure general Parquet reader settings + auto reader_properties = parquet::ReaderProperties(pool); + reader_properties.set_buffer_size(4096 * 4); + reader_properties.enable_buffered_stream(); - std::shared_ptr props = - WriterProperties::Builder() - .max_row_group_length(50) - ->enable_write_page_index() - ->write_batch_size(13) - ->data_pagesize(1) // this will cause every batch creating a page - ->compression(arrow::Compression::SNAPPY) - ->build(); + // Configure Arrow-specific Parquet reader settings + auto arrow_reader_props = parquet::ArrowReaderProperties(); + arrow_reader_props.set_batch_size(10); // default 64 * 1024 - std::shared_ptr arrow_props = - ArrowWriterProperties::Builder().store_schema()->build(); + parquet::arrow::FileReaderBuilder reader_builder; + ARROW_RETURN_NOT_OK( + reader_builder.OpenFile(path_to_file, /*memory_map=*/false, reader_properties)); + reader_builder.memory_pool(pool); 
+ reader_builder.properties(arrow_reader_props); - ARROW_ASSIGN_OR_RAISE(auto outfile, ::arrow::io::BufferOutputStream::Create()); + std::unique_ptr arrow_reader; + ARROW_ASSIGN_OR_RAISE(arrow_reader, reader_builder.Build()); + + std::shared_ptr<::arrow::RecordBatchReader> rb_reader; + auto row_ranges_map = std::make_shared>(); + + std::vector ranges; + for (int64_t i = 0; i < 50; i++) { + if (i % 2 == 0) + ranges.push_back({i, i}); + } + row_ranges_map->insert({0, std::make_shared(ranges)}); - ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table.get(), - arrow::default_memory_pool(), outfile, - /*chunk_size=*/100, props, arrow_props)); - return outfile->Finish(); + + ARROW_RETURN_NOT_OK(arrow_reader->GetRecordBatchReader({0,1},{0,1},row_ranges_map,&rb_reader)); + + size_t total_rows = 0; + size_t total_values = 0; + for (arrow::Result> maybe_batch: *rb_reader) { + // Operate on each batch... + auto batch = maybe_batch.ValueOrDie(); + total_rows += batch->num_rows(); + std::cout << "batch size: " << batch->num_rows() << std::endl; + + auto int_array = std::dynamic_pointer_cast(batch->column(1)); + for (auto iter = int_array->begin(); iter != int_array->end(); ++iter) { + total_values += (*iter).value(); + } + } + std::cout << "total rows is : " << total_rows << std::endl; + std::cout << "total value of y is : " << total_values << std::endl; + return arrow::Status::OK(); } -arrow::Status ReadInBatches(std::shared_ptr buffer) { - arrow::MemoryPool* pool = arrow::default_memory_pool(); +arrow::Result> GetTable() { + auto builder = arrow::Int32Builder(); + + std::shared_ptr arr_a_values; + std::shared_ptr arr_a_offsets; + std::vector a_values; + for (int i = 0; i < 100; ++i) { + for (int j = 0; j < 3; ++j) { + a_values.push_back(i); + } + } + ARROW_RETURN_NOT_OK(builder.AppendValues(a_values)); + ARROW_RETURN_NOT_OK(builder.Finish(&arr_a_values)); + std::vector offsets = arrow::internal::Iota(0, 101); + std::transform(offsets.begin(), offsets.end(), offsets.begin(), + 
[](int x) { return x * 3; }); + ARROW_RETURN_NOT_OK(builder.AppendValues(offsets)); + ARROW_RETURN_NOT_OK(builder.Finish(&arr_a_offsets)); + ARROW_ASSIGN_OR_RAISE(auto arr_a, + arrow::ListArray::FromArrays(*arr_a_offsets, *arr_a_values)); + + std::shared_ptr arr_b; + ARROW_RETURN_NOT_OK(builder.AppendValues(arrow::internal::Iota(0, 100))); + ARROW_RETURN_NOT_OK(builder.Finish(&arr_b)); + + auto string_builder = arrow::StringBuilder(); + std::shared_ptr arr_c; + std::vector strs; + for (size_t i = 0; i < 100; i++) { + strs.push_back("" + std::to_string(i)); + } + ARROW_RETURN_NOT_OK(string_builder.AppendValues(strs)); + ARROW_RETURN_NOT_OK(string_builder.Finish(&arr_c)); + + auto schema = arrow::schema({ + arrow::field("a", arrow::list(arrow::int32())), + arrow::field("b", arrow::int32()), + arrow::field("c", arrow::utf8()), + }); - auto reader_properties = parquet::ReaderProperties(pool); - reader_properties.set_buffer_size(4096 * 4); - reader_properties.enable_buffered_stream(); + return arrow::Table::Make(schema, {arr_a, arr_b, arr_c}); +} - auto arrow_reader_props = parquet::ArrowReaderProperties(); - arrow_reader_props.set_batch_size(10); // default 64 * 1024 +arrow::Result> GetRBR() { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr table, GetTable()); + auto reader = std::make_shared(table); + reader->set_chunksize(10); + return reader; +} - parquet::arrow::FileReaderBuilder reader_builder; - auto in_file = std::make_shared<::arrow::io::BufferReader>(buffer); - ARROW_RETURN_NOT_OK(reader_builder.Open(in_file, /*memory_map=*/reader_properties)); - reader_builder.memory_pool(pool); - reader_builder.properties(arrow_reader_props); +arrow::Status WriteFullFile(std::string path_to_file) { + using parquet::ArrowWriterProperties; + using parquet::WriterProperties; - ARROW_ASSIGN_OR_RAISE(auto arrow_reader, reader_builder.Build()); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr table, GetTable()); - std::shared_ptr<::arrow::RecordBatchReader> rb_reader; - auto row_ranges_map = 
std::make_shared>(); + // Choose compression + std::shared_ptr props = + WriterProperties::Builder().max_row_group_length(50)->enable_write_page_index()->write_batch_size(13) + ->data_pagesize(1) // this will cause every batch creating a page + ->compression(arrow::Compression::SNAPPY)->build(); + std::cout << "hello" << std::endl; - std::vector ranges; - for (int64_t i = 0; i < 50; i++) { - if (i % 2 == 0) ranges.push_back({i, i}); - } - row_ranges_map->insert({0, std::make_shared(ranges)}); + // Opt to store Arrow schema for easier reads back into Arrow + std::shared_ptr arrow_props = + ArrowWriterProperties::Builder().store_schema()->build(); - ARROW_RETURN_NOT_OK( - arrow_reader->GetRecordBatchReader({0, 1}, {0, 1}, row_ranges_map, &rb_reader)); + std::shared_ptr outfile; + ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open(path_to_file)); - size_t total_rows = 0; - size_t total_values = 0; - for (arrow::Result> maybe_batch : *rb_reader) { - // Operate on each batch... 
- auto batch = maybe_batch.ValueOrDie(); - total_rows += batch->num_rows(); - std::cout << "batch size: " << batch->num_rows() << std::endl; + ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table.get(), + arrow::default_memory_pool(), outfile, + /*chunk_size=*/100, props, arrow_props)); + return arrow::Status::OK(); +} - auto int_array = std::dynamic_pointer_cast(batch->column(1)); - for (auto iter = int_array->begin(); iter != int_array->end(); ++iter) { - total_values += (*iter).value(); +arrow::Status WriteInBatches(std::string path_to_file) { + // #include "parquet/arrow/writer.h" + // #include "arrow/util/type_fwd.h" + using parquet::ArrowWriterProperties; + using parquet::WriterProperties; + + // Data is in RBR + std::shared_ptr batch_stream; + ARROW_ASSIGN_OR_RAISE(batch_stream, GetRBR()); + + // Choose compression + std::shared_ptr props = + WriterProperties::Builder().compression(arrow::Compression::SNAPPY)->build(); + + // Opt to store Arrow schema for easier reads back into Arrow + std::shared_ptr arrow_props = + ArrowWriterProperties::Builder().store_schema()->build(); + + // Create a writer + std::shared_ptr outfile; + ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open(path_to_file)); + std::unique_ptr writer; + ARROW_ASSIGN_OR_RAISE( + writer, parquet::arrow::FileWriter::Open(*batch_stream->schema().get(), + arrow::default_memory_pool(), outfile, + props, arrow_props)); + + // Write each batch as a row_group + for (arrow::Result> maybe_batch: *batch_stream) { + ARROW_ASSIGN_OR_RAISE(auto batch, maybe_batch); + ARROW_ASSIGN_OR_RAISE(auto table, + arrow::Table::FromRecordBatches(batch->schema(), {batch})); + ARROW_RETURN_NOT_OK(writer->WriteTable(*table.get(), batch->num_rows())); } - } - std::cout << "total rows is : " << total_rows << std::endl; - std::cout << "total value of y is : " << total_values << std::endl; - return arrow::Status::OK(); -} + // Write file footer and close + ARROW_RETURN_NOT_OK(writer->Close()); -arrow::Status 
RunExamples() { - ARROW_ASSIGN_OR_RAISE(auto buffer, WriteFullFile()); - ARROW_RETURN_NOT_OK(ReadInBatches(buffer)); - return arrow::Status::OK(); + return arrow::Status::OK(); +} + +arrow::Status RunExamples(std::string path_to_file) { + ARROW_RETURN_NOT_OK(WriteFullFile(path_to_file)); + // ARROW_RETURN_NOT_OK(ReadFullFile(path_to_file)); + // ARROW_RETURN_NOT_OK(WriteInBatches(path_to_file)); + // ARROW_RETURN_NOT_OK(ReadInBatches(path_to_file)); + return arrow::Status::OK(); } int main(int argc, char** argv) { - if (argc != 2) { - // Fake success for CI purposes. - return EXIT_SUCCESS; - } + if (argc != 2) { + // Fake success for CI purposes. + return EXIT_SUCCESS; + } - std::string path_to_file = argv[1]; - arrow::Status status = RunExamples(); + std::string path_to_file = argv[1]; + arrow::Status status = RunExamples(path_to_file); - if (!status.ok()) { - std::cerr << "Error occurred: " << status.message() << std::endl; - return EXIT_FAILURE; - } - return EXIT_SUCCESS; + if (!status.ok()) { + std::cerr << "Error occurred: " << status.message() << std::endl; + return EXIT_FAILURE; + } + return EXIT_SUCCESS; } diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index e6aad7cee2a3..06be0da74aa6 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -354,6 +354,7 @@ add_parquet_test(reader-test level_conversion_test.cc column_scanner_test.cc reader_test.cc + filtered_reader_test.cc stream_reader_test.cc test_util.cc) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 34316bf47c1b..81c7b1188895 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -19,13 +19,13 @@ #include +#include #include #include #include #include #include #include -#include #include "arrow/array.h" #include "arrow/buffer.h" @@ -75,8 +75,8 @@ using arrow::internal::Iota; // Help reduce verbosity using ParquetReader = parquet::ParquetFileReader; -using parquet::RowRangesPtr; 
using parquet::Range; +using parquet::RowRangesPtr; using parquet::internal::RecordReader; namespace bit_util = arrow::bit_util; @@ -205,11 +205,11 @@ class FileReaderImpl : public FileReader { return ReadRowGroups(Iota(reader_->metadata()->num_row_groups()), indices, out); } - Status GetFieldReader(int i, - const std::shared_ptr>& included_leaves, - const std::vector& row_groups, - const std::shared_ptr>& row_ranges_map, - std::unique_ptr* out) { + Status GetFieldReader( + int i, const std::shared_ptr>& included_leaves, + const std::vector& row_groups, + const std::shared_ptr>& row_ranges_map, + std::unique_ptr* out) { // Should be covered by GetRecordBatchReader checks but // manifest_.schema_fields is a separate variable so be extra careful. if (ARROW_PREDICT_FALSE(i < 0 || @@ -229,11 +229,11 @@ class FileReaderImpl : public FileReader { return GetReader(manifest_.schema_fields[i], ctx, out); } - Status GetFieldReaders(const std::vector& column_indices, - const std::vector& row_groups, - const std::shared_ptr>& row_ranges_map, - std::vector>* out, - std::shared_ptr<::arrow::Schema>* out_schema) { + Status GetFieldReaders( + const std::vector& column_indices, const std::vector& row_groups, + const std::shared_ptr>& row_ranges_map, + std::vector>* out, + std::shared_ptr<::arrow::Schema>* out_schema) { // We only need to read schema fields which have columns indicated // in the indices vector ARROW_ASSIGN_OR_RAISE(std::vector field_indices, @@ -245,9 +245,8 @@ class FileReaderImpl : public FileReader { ::arrow::FieldVector out_fields(field_indices.size()); for (size_t i = 0; i < out->size(); ++i) { std::unique_ptr reader; - RETURN_NOT_OK( - GetFieldReader(field_indices[i], included_leaves, row_groups, - row_ranges_map, &reader)); + RETURN_NOT_OK(GetFieldReader(field_indices[i], included_leaves, row_groups, + row_ranges_map, &reader)); out_fields[i] = reader->field(); out->at(i) = std::move(reader); @@ -346,9 +345,9 @@ class FileReaderImpl : public FileReader { } 
Status GetRecordBatchReader( - const std::vector& row_group_indices, const std::vector& column_indices, - const std::shared_ptr>& row_ranges_map, - std::unique_ptr* out) override; + const std::vector& row_group_indices, const std::vector& column_indices, + const std::shared_ptr>& row_ranges_map, + std::unique_ptr* out) override; Status GetRecordBatchReader(const std::vector& row_group_indices, const std::vector& column_indices, @@ -468,15 +467,16 @@ class RowGroupReaderImpl : public RowGroupReader { // Column reader implementations struct RowRangesPageFilter { - explicit RowRangesPageFilter(const RowRangesPtr & row_ranges_, const RowRangesPtr & page_ranges_) - : row_ranges(row_ranges_), page_ranges(page_ranges_) { + explicit RowRangesPageFilter(const RowRangesPtr& row_ranges_, + const RowRangesPtr& page_ranges_) + : row_ranges(row_ranges_), page_ranges(page_ranges_) { assert(row_ranges != nullptr); assert(page_ranges != nullptr); assert(row_ranges->getRanges().size() > 0); assert(page_ranges->getRanges().size() > 0); } - bool operator() (const DataPageStats & stats) { + bool operator()(const DataPageStats& stats) { ++page_range_idx; if (row_range_idx >= row_ranges->getRanges().size()) { @@ -490,8 +490,8 @@ struct RowRangesPageFilter { } while (row_range_idx < row_ranges->getRanges().size() && - current_page_range.isAfter((*row_ranges)[row_range_idx])) { - row_range_idx++; + current_page_range.isAfter((*row_ranges)[row_range_idx])) { + row_range_idx++; } return row_range_idx >= row_ranges->getRanges().size(); @@ -570,37 +570,49 @@ class LeafReader : public ColumnReaderImpl { /// using page index to reduce cost if (page_reader != nullptr && ctx_->row_ranges_map) { + // reset skipper + record_reader_->set_record_skipper(NULLPTR); + // if specific row range is provided for this rg if (const auto iter = ctx_->row_ranges_map->find(input_->current_row_group()); - iter != ctx_->row_ranges_map->end()) { - + iter != ctx_->row_ranges_map->end()) { // check offset exists auto 
offset_index = ctx_->reader->GetPageIndexReader() - ->RowGroup(input_->current_row_group()) - ->GetOffsetIndex(input_->column_index()); + ->RowGroup(input_->current_row_group()) + ->GetOffsetIndex(input_->column_index()); if (!offset_index) { - throw ParquetException("Offset index is not found for column: " + field_->name()); + throw ParquetException("Attempting to filter pages but Offset index is not found for column: " + + field_->name()); } const auto page_locations = offset_index->page_locations(); auto page_ranges = std::make_shared(); for (size_t i = 0; i < page_locations.size() - 1; i++) { page_ranges->add({page_locations[i].first_row_index, - page_locations[i + 1].first_row_index - 1}, false); + page_locations[i + 1].first_row_index - 1}, + false); } if (page_locations.size() >= 1) { - page_ranges->add({ - page_locations[page_locations.size() - 1].first_row_index, - ctx_->reader->metadata()->RowGroup(input_->current_row_group())->num_rows() - 1}, false); + page_ranges->add({page_locations[page_locations.size() - 1].first_row_index, + ctx_->reader->metadata() + ->RowGroup(input_->current_row_group()) + ->num_rows() - + 1}, + false); } // part 1, skip decompressing & decoding unnecessary pages page_reader->set_data_page_filter(RowRangesPageFilter(iter->second, page_ranges)); // part 2, skip unnecessary rows in necessary pages - record_reader_->set_record_skipper(std::make_shared( - *page_ranges, *iter->second)); - } + record_reader_->set_record_skipper( + std::make_shared(*page_ranges, + *iter->second)); + } else { + // If row_ranges_map exists but no row_ranges is found for this RG, skip this RG + NextRowGroup(); + return; + } } record_reader_->reset_current_rg_processed_records(); @@ -1072,10 +1084,10 @@ Status GetReader(const SchemaField& field, const std::shared_ptr& } // namespace -Status FileReaderImpl::GetRecordBatchReader(const std::vector& row_groups, - const std::vector& column_indices, - const std::shared_ptr>& row_ranges_map, - std::unique_ptr* 
out) { +Status FileReaderImpl::GetRecordBatchReader( + const std::vector& row_groups, const std::vector& column_indices, + const std::shared_ptr>& row_ranges_map, + std::unique_ptr* out) { RETURN_NOT_OK(BoundsCheck(row_groups, column_indices)); if (reader_properties_.pre_buffer()) { @@ -1088,7 +1100,8 @@ Status FileReaderImpl::GetRecordBatchReader(const std::vector& row_groups, std::vector> readers; std::shared_ptr<::arrow::Schema> batch_schema; - RETURN_NOT_OK(GetFieldReaders(column_indices, row_groups,row_ranges_map, &readers, &batch_schema)); + RETURN_NOT_OK(GetFieldReaders(column_indices, row_groups, row_ranges_map, &readers, + &batch_schema)); if (readers.empty()) { // Just generate all batches right now; they're cheap since they have no columns. @@ -1343,7 +1356,8 @@ Future> FileReaderImpl::DecodeRowGroups( // in a sync context too so use `this` over `self` std::vector> readers; std::shared_ptr<::arrow::Schema> result_schema; - RETURN_NOT_OK(GetFieldReaders(column_indices, row_groups, NULLPTR, &readers, &result_schema)); + RETURN_NOT_OK( + GetFieldReaders(column_indices, row_groups, NULLPTR, &readers, &result_schema)); // OptionalParallelForAsync requires an executor if (!cpu_executor) cpu_executor = ::arrow::internal::GetCpuThreadPool(); @@ -1406,12 +1420,13 @@ Status FileReader::GetRecordBatchReader(const std::vector& row_group_indice return Status::OK(); } -Status FileReader::GetRecordBatchReader(const std::vector& row_group_indices, - const std::vector& column_indices, - const std::shared_ptr>& row_ranges_map, - std::shared_ptr* out) { +Status FileReader::GetRecordBatchReader( + const std::vector& row_group_indices, const std::vector& column_indices, + const std::shared_ptr>& row_ranges_map, + std::shared_ptr* out) { std::unique_ptr tmp; - RETURN_NOT_OK(GetRecordBatchReader(row_group_indices, column_indices, row_ranges_map, &tmp)); + RETURN_NOT_OK( + GetRecordBatchReader(row_group_indices, column_indices, row_ranges_map, &tmp)); out->reset(tmp.release()); 
return Status::OK(); } diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 4468fd1aa20e..860642bb3657 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -304,254 +304,234 @@ class TypedColumnReader : public ColumnReader { }; struct Range { - static Range unionRange(const Range&left, const Range&right) { - if (left.from <= right.from) { - if (left.to + 1 >= right.from) { - return {left.from, std::max(left.to, right.to)}; - } - } - else if (right.to + 1 >= left.from) { - return {right.from, std::max(left.to, right.to)}; - } - return {-1, -1}; + static Range unionRange(const Range& left, const Range& right) { + if (left.from <= right.from) { + if (left.to + 1 >= right.from) { + return {left.from, std::max(left.to, right.to)}; + } + } else if (right.to + 1 >= left.from) { + return {right.from, std::max(left.to, right.to)}; } + return {-1, -1}; + } - static Range intersection(const Range&left, const Range&right) { - if (left.from <= right.from) { - if (left.to >= right.from) { - return {right.from, std::min(left.to, right.to)}; - } - } - else if (right.to >= left.from) { - return {left.from, std::min(left.to, right.to)}; - } - return {-1, -1}; // Return a default Range object if no intersection range found + static Range intersection(const Range& left, const Range& right) { + if (left.from <= right.from) { + if (left.to >= right.from) { + return {right.from, std::min(left.to, right.to)}; + } + } else if (right.to >= left.from) { + return {left.from, std::min(left.to, right.to)}; } + return {-1, -1}; // Return a default Range object if no intersection range found + } - int64_t from; - int64_t to; + int64_t from; + int64_t to; - Range(const int64_t from_, const int64_t to_) : from(from_), to(to_) { - assert(from <= to); - } + Range(const int64_t from_, const int64_t to_) : from(from_), to(to_) { + assert(from <= to); + } - size_t count() const { - return to - from + 1; - } + size_t count() const { return 
to - from + 1; } - bool isBefore(const Range&other) const { - return to < other.from; - } + bool isBefore(const Range& other) const { return to < other.from; } - bool isAfter(const Range&other) const { - return from > other.to; - } + bool isAfter(const Range& other) const { return from > other.to; } - bool isOverlap(const Range&other) const { - return !isBefore(other) && !isAfter(other); - } + bool isOverlap(const Range& other) const { return !isBefore(other) && !isAfter(other); } - std::string toString() const { - return "[" + std::to_string(from) + ", " + std::to_string(to) + "]"; - } + std::string toString() const { + return "[" + std::to_string(from) + ", " + std::to_string(to) + "]"; + } }; class RowRanges { - std::vector ranges; + std::vector ranges; -public: - RowRanges() = default; + public: + RowRanges() = default; - explicit RowRanges(const Range&range) { - ranges.push_back(range); - } + explicit RowRanges(const Range& range) { ranges.push_back(range); } - RowRanges(const std::vector&ranges) { - this->ranges = ranges; - } + RowRanges(const std::vector& ranges) { this->ranges = ranges; } - //copy cstr - RowRanges(const RowRanges&other) { - ranges = other.ranges; - } + // copy cstr + RowRanges(const RowRanges& other) { ranges = other.ranges; } - RowRanges(RowRanges&&other) noexcept { - ranges = std::move(other.ranges); - } + RowRanges(RowRanges&& other) noexcept { ranges = std::move(other.ranges); } - static RowRanges createSingle(const size_t rowCount) { - return RowRanges({Range(0L, rowCount - 1L)}); - } + static RowRanges createSingle(const size_t rowCount) { + return RowRanges({Range(0L, rowCount - 1L)}); + } - // static RowRanges create(size_t rowCount, const std::vector& pageIndexes, const OffsetIndex& offsetIndex) { - // RowRanges ranges; - // for (int pageIndex : pageIndexes) { - // ranges.add(Range(offsetIndex.getFirstRowIndex(pageIndex), offsetIndex.getLastRowIndex(pageIndex, rowCount))); - // } - // return ranges; - // } - - static RowRanges 
unionRanges(const RowRanges&left, const RowRanges&right) { - RowRanges result; - auto it1 = left.ranges.begin(); - auto it2 = right.ranges.begin(); - if (it2 != right.ranges.end()) { - Range range2 = *it2; - while (it1 != left.ranges.end()) { - Range range1 = *it1; - if (range1.isAfter(range2)) { - result.add(range2); - range2 = range1; - const auto tmp = it1; - it1 = it2; - it2 = tmp; - } - else { - result.add(range1); - } - ++it1; - } - result.add(range2); - } - else { - it2 = it1; + // static RowRanges create(size_t rowCount, const std::vector& pageIndexes, const + // OffsetIndex& offsetIndex) { + // RowRanges ranges; + // for (int pageIndex : pageIndexes) { + // ranges.add(Range(offsetIndex.getFirstRowIndex(pageIndex), + // offsetIndex.getLastRowIndex(pageIndex, rowCount))); + // } + // return ranges; + // } + + static RowRanges unionRanges(const RowRanges& left, const RowRanges& right) { + RowRanges result; + auto it1 = left.ranges.begin(); + auto it2 = right.ranges.begin(); + if (it2 != right.ranges.end()) { + Range range2 = *it2; + while (it1 != left.ranges.end()) { + Range range1 = *it1; + if (range1.isAfter(range2)) { + result.add(range2); + range2 = range1; + const auto tmp = it1; + it1 = it2; + it2 = tmp; + } else { + result.add(range1); } - while (it2 != right.ranges.end()) { - result.add(*it2); - ++it2; - } - - return result; + ++it1; + } + result.add(range2); + } else { + it2 = it1; } - - static RowRanges intersection(const RowRanges&left, const RowRanges&right) { - RowRanges result; - - size_t rightIndex = 0; - for (const Range&l: left.ranges) { - for (size_t i = rightIndex, n = right.ranges.size(); i < n; ++i) { - const Range&r = right.ranges[i]; - if (l.isBefore(r)) { - break; - } - else if (l.isAfter(r)) { - rightIndex = i + 1; - continue; - } - result.add(Range::intersection(l, r)); - } - } - - return result; + while (it2 != right.ranges.end()) { + result.add(*it2); + ++it2; } - RowRanges slice(const int64_t from, const int64_t to) const { - 
RowRanges result; - for (const Range&range: ranges) { - if (range.from >= from && range.to <= to) { - result.add(range); - } - } - return result; - } + return result; + } - void add(const Range&range, bool merge = true) { - Range rangeToAdd = range; - if(merge) { - for (int i = static_cast(ranges.size()) - 1; i >= 0; --i) { - Range last = ranges[i]; - assert(!last.isAfter(range)); - const Range u = Range::unionRange(last, rangeToAdd); - assert (u.from != -1 && u.to != -1); - rangeToAdd = u; - ranges.erase(ranges.begin() + i); - } + static RowRanges intersection(const RowRanges& left, const RowRanges& right) { + RowRanges result; + + size_t rightIndex = 0; + for (const Range& l : left.ranges) { + for (size_t i = rightIndex, n = right.ranges.size(); i < n; ++i) { + const Range& r = right.ranges[i]; + if (l.isBefore(r)) { + break; + } else if (l.isAfter(r)) { + rightIndex = i + 1; + continue; } - ranges.push_back(rangeToAdd); + result.add(Range::intersection(l, r)); + } } - size_t rowCount() const { - size_t cnt = 0; - for (const Range&range: ranges) { - cnt += range.count(); - } - return cnt; - } + return result; + } - // - // class Iterator { - // private: - // int currentRangeIndex; - // Range currentRange; - // long next; - // std::vector ranges; - // - // long findNext() { - // if (currentRangeIndex < ranges.size()) { - // currentRange = ranges[++currentRangeIndex]; - // next = currentRange.from; - // } else { - // return -1; - // } - // return next; - // } - // - // public: - // Iterator(const std::vector& ranges) { - // this->ranges = ranges; - // currentRangeIndex = -1; - // next = findNext(); - // } - // - // bool hasNext() const { - // return next >= 0; - // } - // - // long nextLong() { - // long ret = next; - // if (ret < 0) { - // throw std::out_of_range("No such element"); - // } - // next = findNext(); - // return ret; - // } - // }; - // - // Iterator iterator() const { - // return Iterator(ranges); - // } - - bool isOverlapping(int64_t from, int64_t 
to) const { - const Range searchRange(from, to); - return isOverlapping(searchRange); + RowRanges slice(const int64_t from, const int64_t to) const { + RowRanges result; + for (const Range& range : ranges) { + if (range.from >= from && range.to <= to) { + result.add(range); + } } + return result; + } - bool isOverlapping(const Range&searchRange) const { - auto it = std::lower_bound(ranges.begin(), ranges.end(), searchRange, [](const Range&r1, const Range&r2) { - return r1.isBefore(r2); - }); - return it != ranges.end() && !(*it).isAfter(searchRange); + void add(const Range& range, bool merge = true) { + Range rangeToAdd = range; + if (merge) { + for (int i = static_cast(ranges.size()) - 1; i >= 0; --i) { + Range last = ranges[i]; + assert(!last.isAfter(range)); + const Range u = Range::unionRange(last, rangeToAdd); + assert(u.from != -1 && u.to != -1); + rangeToAdd = u; + ranges.erase(ranges.begin() + i); + } + } else { + if (ranges.size() > 1) assert(rangeToAdd.from > ranges.back().to); } + ranges.push_back(rangeToAdd); + } - std::vector& getRanges() { - return ranges; + size_t rowCount() const { + size_t cnt = 0; + for (const Range& range : ranges) { + cnt += range.count(); } + return cnt; + } - const Range& operator[](size_t index) const { - return ranges[index]; - } + // + // class Iterator { + // private: + // int currentRangeIndex; + // Range currentRange; + // long next; + // std::vector ranges; + // + // long findNext() { + // if (currentRangeIndex < ranges.size()) { + // currentRange = ranges[++currentRangeIndex]; + // next = currentRange.from; + // } else { + // return -1; + // } + // return next; + // } + // + // public: + // Iterator(const std::vector& ranges) { + // this->ranges = ranges; + // currentRangeIndex = -1; + // next = findNext(); + // } + // + // bool hasNext() const { + // return next >= 0; + // } + // + // long nextLong() { + // long ret = next; + // if (ret < 0) { + // throw std::out_of_range("No such element"); + // } + // next = 
findNext(); + // return ret; + // } + // }; + // + // Iterator iterator() const { + // return Iterator(ranges); + // } - std::string toString() const { - std::string result = "["; - for (const Range&range: ranges) { - result += "(" + std::to_string(range.from) + ", " + std::to_string(range.to) + "), "; - } - if (!ranges.empty()) { - result = result.substr(0, result.size() - 2); - } - result += "]"; - return result; + bool isOverlapping(int64_t from, int64_t to) const { + const Range searchRange(from, to); + return isOverlapping(searchRange); + } + + bool isOverlapping(const Range& searchRange) const { + auto it = std::lower_bound( + ranges.begin(), ranges.end(), searchRange, + [](const Range& r1, const Range& r2) { return r1.isBefore(r2); }); + return it != ranges.end() && !(*it).isAfter(searchRange); + } + + std::vector& getRanges() { return ranges; } + + const Range& operator[](size_t index) const { return ranges[index]; } + + std::string toString() const { + std::string result = "["; + for (const Range& range : ranges) { + result += + "(" + std::to_string(range.from) + ", " + std::to_string(range.to) + "), "; } + if (!ranges.empty()) { + result = result.substr(0, result.size() - 2); + } + result += "]"; + return result; + } }; using RowRangesPtr = std::shared_ptr; @@ -559,74 +539,75 @@ using RowRangesPtr = std::shared_ptr; namespace internal { class PARQUET_EXPORT RecordSkipper { -public: - RecordSkipper(RowRanges & pages, RowRanges & row_ranges_) : row_ranges(row_ranges_) { - RowRanges will_process_pages, skip_pages; - for(auto & page : pages.getRanges()) { - if(row_ranges.isOverlapping(page)) { - // will_process_pages.add(page); - } else { - skip_pages.add(page, false); - } - } - adjust_ranges(skip_pages, row_ranges); - // adjust_ranges(skip_pages, will_process_pages); + public: + RecordSkipper(RowRanges& pages, RowRanges& row_ranges_) : row_ranges(row_ranges_) { + RowRanges will_process_pages, skip_pages; + for (auto& page : pages.getRanges()) { + if 
(row_ranges.isOverlapping(page)) { + // will_process_pages.add(page); + } else { + skip_pages.add(page, false); + } } + adjust_ranges(skip_pages, row_ranges); + // adjust_ranges(skip_pages, will_process_pages); - /// \brief Return the number of records to read or to skip - /// if return values is positive, it means to read N records - /// if return values is negative, it means to skip N records - /// if return values is 0, it means to skip all records in this row group - int64_t advise_next(const int64_t current_rg_procesed) - { - if (row_ranges.getRanges().size() == row_range_idx) - { - return 0; - } + total_rows_to_process = pages.rowCount() - skip_pages.rowCount() + 1; + } - if (row_ranges[row_range_idx].to < current_rg_procesed) - { - row_range_idx++; - if (row_ranges.getRanges().size() == row_range_idx) - { - return 0; - } - } + /// \brief Return the number of records to read or to skip + /// if return values is positive, it means to read N records + /// if return values is negative, it means to skip N records + /// if return values is 0, it means end of RG + int64_t advise_next(const int64_t current_rg_procesed) { + if (row_ranges.getRanges().size() == row_range_idx) { + return 0; + } - if (row_ranges[row_range_idx].from > current_rg_procesed) - { - // return negative - return current_rg_procesed - row_ranges[row_range_idx].from; - } + if (row_ranges[row_range_idx].to < current_rg_procesed) { + row_range_idx++; + if (row_ranges.getRanges().size() == row_range_idx) { + // negative, skip the ramaining rows + return current_rg_procesed - total_rows_to_process; + } + } - const auto ret = row_ranges[row_range_idx].to - current_rg_procesed + 1; - assert(ret >= 1); - return ret; + if (row_ranges[row_range_idx].from > current_rg_procesed) { + // negative, skip + return current_rg_procesed - row_ranges[row_range_idx].from; } -private: - /// Keep copy of ranges, because advise_next() will modify them - // RowRanges will_process_pages; - RowRanges row_ranges; - - size_t 
row_range_idx = 0; - - /// Since the skipped pages will be slienly skipped without updating current_rg_processed_records - /// or records_read_, we need to pre-process the row ranges as if these skipped pages never existed - void adjust_ranges(RowRanges & skip_pages, RowRanges & to_adjust) { - size_t skipped_rows = 0; - auto iter = to_adjust.getRanges().begin(); - auto skip_iter = skip_pages.getRanges().begin(); - while(iter != to_adjust.getRanges().end()) { - while(skip_iter != skip_pages.getRanges().end() && - skip_iter->isBefore(*iter)) { - skipped_rows += skip_iter->count(); - ++skip_iter; - } - iter->from -= skipped_rows; - iter->to -= skipped_rows; - ++iter; - } + + const auto ret = row_ranges[row_range_idx].to - current_rg_procesed + 1; + assert(ret > 0); + return ret; + } + + private: + /// Keep copy of ranges, because advise_next() will modify them + // RowRanges will_process_pages; + RowRanges row_ranges; + + size_t row_range_idx = 0; + + size_t total_rows_to_process = 0; + + /// Since the skipped pages will be slienly skipped without updating + /// current_rg_processed_records or records_read_, we need to pre-process the row ranges + /// as if these skipped pages never existed + void adjust_ranges(RowRanges& skip_pages, RowRanges& to_adjust) { + size_t skipped_rows = 0; + auto iter = to_adjust.getRanges().begin(); + auto skip_iter = skip_pages.getRanges().begin(); + while (iter != to_adjust.getRanges().end()) { + while (skip_iter != skip_pages.getRanges().end() && skip_iter->isBefore(*iter)) { + skipped_rows += skip_iter->count(); + ++skip_iter; + } + iter->from -= skipped_rows; + iter->to -= skipped_rows; + ++iter; } + } }; /// \brief Stateful column reader that delimits semantic records for both flat @@ -751,7 +732,7 @@ class PARQUET_EXPORT RecordReader { bool at_record_start_; int64_t records_read_; - int64_t current_rg_processed_records; // counting both read and skip records + int64_t current_rg_processed_records; // counting both read and skip 
records /// \brief Stores values. These values are populated based on each ReadRecords /// call. No extra values are buffered for the next call. SkipRecords will not diff --git a/cpp/src/parquet/filtered_reader_test.cc b/cpp/src/parquet/filtered_reader_test.cc new file mode 100644 index 000000000000..427476c7c3b1 --- /dev/null +++ b/cpp/src/parquet/filtered_reader_test.cc @@ -0,0 +1,207 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/api.h" +#include "arrow/io/api.h" +#include "arrow/io/memory.h" +#include "arrow/result.h" +#include "arrow/util/type_fwd.h" +#include "parquet/arrow/reader.h" +#include "parquet/arrow/writer.h" + +#include +#include +#include + +arrow::Result> GetTable() { + auto builder = arrow::Int32Builder(); + + std::shared_ptr arr_a_values; + std::shared_ptr arr_a_offsets; + std::vector a_values; + for (int i = 0; i < 100; ++i) { + for (int j = 0; j < 3; ++j) { + a_values.push_back(i); + } + } + ARROW_RETURN_NOT_OK(builder.AppendValues(a_values)); + ARROW_RETURN_NOT_OK(builder.Finish(&arr_a_values)); + std::vector offsets = arrow::internal::Iota(0, 101); + std::transform(offsets.begin(), offsets.end(), offsets.begin(), + [](int x) { return x * 3; }); + ARROW_RETURN_NOT_OK(builder.AppendValues(offsets)); + ARROW_RETURN_NOT_OK(builder.Finish(&arr_a_offsets)); + ARROW_ASSIGN_OR_RAISE(auto arr_a, + arrow::ListArray::FromArrays(*arr_a_offsets, *arr_a_values)); + + std::shared_ptr arr_b; + ARROW_RETURN_NOT_OK(builder.AppendValues(arrow::internal::Iota(0, 100))); + ARROW_RETURN_NOT_OK(builder.Finish(&arr_b)); + + auto string_builder = arrow::StringBuilder(); + std::shared_ptr arr_c; + std::vector strs; + for (size_t i = 0; i < 100; i++) { + strs.push_back("" + std::to_string(i)); + } + ARROW_RETURN_NOT_OK(string_builder.AppendValues(strs)); + ARROW_RETURN_NOT_OK(string_builder.Finish(&arr_c)); + + auto schema = arrow::schema({ + arrow::field("a", arrow::list(arrow::int32())), + arrow::field("b", arrow::int32()), + arrow::field("c", arrow::utf8()), + }); + + return arrow::Table::Make(schema, {arr_a, arr_b, arr_c}); +} + +arrow::Result> WriteFullFile() { + using parquet::ArrowWriterProperties; + using parquet::WriterProperties; + + ARROW_ASSIGN_OR_RAISE(std::shared_ptr table, GetTable()); + + std::shared_ptr props = + WriterProperties::Builder() + .max_row_group_length(30) + ->enable_write_page_index() + ->write_batch_size(13) + ->data_pagesize(1) // this will cause 
every batch creating a page + ->compression(arrow::Compression::SNAPPY) + ->build(); + + std::shared_ptr arrow_props = + ArrowWriterProperties::Builder().store_schema()->build(); + + ARROW_ASSIGN_OR_RAISE(auto out_stream, ::arrow::io::BufferOutputStream::Create()); + + ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table.get(), + arrow::default_memory_pool(), out_stream, + /*chunk_size=*/100, props, arrow_props)); + + // { + // // output to a local file for debugging + // ARROW_ASSIGN_OR_RAISE(auto outfile, arrow::io::FileOutputStream::Open( + // "/tmp/filtered_reader_test.parquet")); + // + // ARROW_RETURN_NOT_OK( + // parquet::arrow::WriteTable(*table.get(), arrow::default_memory_pool(), outfile, + // /*chunk_size=*/100, props, arrow_props)); + // } + + return out_stream->Finish(); +} + +void check_rb(std::shared_ptr rb_reader, size_t expect_rows, + int64_t expect_sum_of_b) { + size_t total_rows = 0; + int64_t sum_a = 0; + int64_t sum_b = 0; + int64_t sum_c = 0; + for (arrow::Result> maybe_batch : *rb_reader) { + ASSERT_OK_AND_ASSIGN(auto batch, maybe_batch); + total_rows += batch->num_rows(); + + auto a_array = std::dynamic_pointer_cast(batch->column(0)); + ASSERT_OK_AND_ASSIGN(auto flatten_a_array, a_array->Flatten()); + auto a_array_values = std::dynamic_pointer_cast(flatten_a_array); + for (auto iter = a_array_values->begin(); iter != a_array_values->end(); ++iter) { + sum_a += (*iter).value(); + } + + auto b_array = std::dynamic_pointer_cast(batch->column(1)); + for (auto iter = b_array->begin(); iter != b_array->end(); ++iter) { + sum_b += (*iter).value(); + } + + auto c_array = std::dynamic_pointer_cast(batch->column(2)); + for (auto iter = c_array->begin(); iter != c_array->end(); ++iter) { + sum_c += std::stoi(std::string((*iter).value())); + } + } + ASSERT_EQ(expect_rows, total_rows); + ASSERT_EQ(expect_sum_of_b * 3, sum_a); + ASSERT_EQ(expect_sum_of_b, sum_b); + ASSERT_EQ(expect_sum_of_b, sum_c); +} + +class TestRecordBatchReaderWithRanges : public 
::testing::Test { +public: + void SetUp() { + + } + + void TearDown() {} + +protected: + +}; + +TEST(TestRecordBatchReaderWithRanges2, Normal) { + ASSERT_OK_AND_ASSIGN(auto buffer, WriteFullFile()); + + arrow::MemoryPool* pool = arrow::default_memory_pool(); + + auto reader_properties = parquet::ReaderProperties(pool); + reader_properties.set_buffer_size(4096 * 4); + reader_properties.enable_buffered_stream(); + + auto arrow_reader_props = parquet::ArrowReaderProperties(); + // arrow_reader_props.set_batch_size(64 * 1024); // default 64 * 1024 + arrow_reader_props.set_batch_size(10); // default 64 * 1024 + + parquet::arrow::FileReaderBuilder reader_builder; + auto in_file = std::make_shared<::arrow::io::BufferReader>(buffer); + ASSERT_OK(reader_builder.Open(in_file, /*memory_map=*/reader_properties)); + reader_builder.memory_pool(pool); + reader_builder.properties(arrow_reader_props); + + ASSERT_OK_AND_ASSIGN(auto arrow_reader, reader_builder.Build()); + + // // case 1: row_ranges_map contains only RG {0}, other RGs should be skipped + // { + // std::shared_ptr<::arrow::RecordBatchReader> rb_reader; + // auto row_ranges_map = std::make_shared>(); + // std::vector ranges; + // for (int64_t i = 0; i < 30; i++) { + // if (i % 2 == 0) ranges.push_back({i, i}); + // } + // row_ranges_map->insert({0, std::make_shared(ranges)}); + // ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, {0, 1, 2}, + // row_ranges_map, + // &rb_reader)); + // + // check_rb(rb_reader, 15, 210); // 0 + 2 + ... 
+ 28 = 210 + // } + + // case 2: row_ranges_map contains only RG {0,2}, other RGs should be skipped + { + std::shared_ptr<::arrow::RecordBatchReader> rb_reader; + auto row_ranges_map = std::make_shared>(); + std::vector ranges; + for (int64_t i = 0; i < 30; i++) { + if (i % 2 == 0) ranges.push_back({i, i}); + } + row_ranges_map->insert({0, std::make_shared(ranges)}); + row_ranges_map->insert({2, std::make_shared(ranges)}); + ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, {0, 1, 2}, row_ranges_map, + &rb_reader)); + + check_rb(rb_reader, 30, 1320); // (0 + 2 + ... + 28) + (60 + 62 ... + 88) = 1320 + } +} From bdebb741943898ab6ca8d96fedf0656b2d97d99e Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Thu, 23 Nov 2023 15:49:51 +0800 Subject: [PATCH 04/25] happy path pass 4 --- cpp/src/parquet/arrow/reader.cc | 2 + cpp/src/parquet/column_reader.h | 2 +- cpp/src/parquet/filtered_reader_test.cc | 164 ++++++++++++++++-------- 3 files changed, 111 insertions(+), 57 deletions(-) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 81c7b1188895..aa07912a373b 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -566,6 +567,7 @@ class LeafReader : public ColumnReaderImpl { private: std::shared_ptr out_; void NextRowGroup() { + std::cout << "Entering NextRowGroup" << std::endl; std::unique_ptr page_reader = input_->NextChunk(); /// using page index to reduce cost diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 860642bb3657..d9227ebcb025 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -552,7 +552,7 @@ class PARQUET_EXPORT RecordSkipper { adjust_ranges(skip_pages, row_ranges); // adjust_ranges(skip_pages, will_process_pages); - total_rows_to_process = pages.rowCount() - skip_pages.rowCount() + 1; + total_rows_to_process = pages.rowCount() - 
skip_pages.rowCount(); } /// \brief Return the number of records to read or to skip diff --git a/cpp/src/parquet/filtered_reader_test.cc b/cpp/src/parquet/filtered_reader_test.cc index 427476c7c3b1..272bbe463af3 100644 --- a/cpp/src/parquet/filtered_reader_test.cc +++ b/cpp/src/parquet/filtered_reader_test.cc @@ -56,7 +56,7 @@ arrow::Result> GetTable() { std::shared_ptr arr_c; std::vector strs; for (size_t i = 0; i < 100; i++) { - strs.push_back("" + std::to_string(i)); + strs.push_back(std::to_string(i)); } ARROW_RETURN_NOT_OK(string_builder.AppendValues(strs)); ARROW_RETURN_NOT_OK(string_builder.Finish(&arr_c)); @@ -108,7 +108,7 @@ arrow::Result> WriteFullFile() { } void check_rb(std::shared_ptr rb_reader, size_t expect_rows, - int64_t expect_sum_of_b) { + int64_t expect_sum_of_b, const std::vector& column_indices) { size_t total_rows = 0; int64_t sum_a = 0; int64_t sum_b = 0; @@ -117,77 +117,91 @@ void check_rb(std::shared_ptr rb_reader, size_t expect ASSERT_OK_AND_ASSIGN(auto batch, maybe_batch); total_rows += batch->num_rows(); - auto a_array = std::dynamic_pointer_cast(batch->column(0)); - ASSERT_OK_AND_ASSIGN(auto flatten_a_array, a_array->Flatten()); - auto a_array_values = std::dynamic_pointer_cast(flatten_a_array); - for (auto iter = a_array_values->begin(); iter != a_array_values->end(); ++iter) { - sum_a += (*iter).value(); + if (std::find(column_indices.begin(), column_indices.end(), 0) != + column_indices.end()) { + auto a_array = + std::dynamic_pointer_cast(batch->GetColumnByName("a")); + ASSERT_OK_AND_ASSIGN(auto flatten_a_array, a_array->Flatten()); + auto a_array_values = std::dynamic_pointer_cast(flatten_a_array); + for (auto iter = a_array_values->begin(); iter != a_array_values->end(); ++iter) { + sum_a += (*iter).value(); + } } - auto b_array = std::dynamic_pointer_cast(batch->column(1)); - for (auto iter = b_array->begin(); iter != b_array->end(); ++iter) { - sum_b += (*iter).value(); + if (std::find(column_indices.begin(), 
column_indices.end(), 1) != + column_indices.end()) { + auto b_array = + std::dynamic_pointer_cast(batch->GetColumnByName("b")); + for (auto iter = b_array->begin(); iter != b_array->end(); ++iter) { + sum_b += (*iter).value(); + } } - auto c_array = std::dynamic_pointer_cast(batch->column(2)); - for (auto iter = c_array->begin(); iter != c_array->end(); ++iter) { - sum_c += std::stoi(std::string((*iter).value())); + if (std::find(column_indices.begin(), column_indices.end(), 2) != + column_indices.end()) { + auto c_array = + std::dynamic_pointer_cast(batch->GetColumnByName("c")); + for (auto iter = c_array->begin(); iter != c_array->end(); ++iter) { + sum_c += std::stoi(std::string((*iter).value())); + } } } ASSERT_EQ(expect_rows, total_rows); - ASSERT_EQ(expect_sum_of_b * 3, sum_a); - ASSERT_EQ(expect_sum_of_b, sum_b); - ASSERT_EQ(expect_sum_of_b, sum_c); + + if (std::find(column_indices.begin(), column_indices.end(), 0) != column_indices.end()) + ASSERT_EQ(expect_sum_of_b * 3, sum_a); + if (std::find(column_indices.begin(), column_indices.end(), 1) != column_indices.end()) + ASSERT_EQ(expect_sum_of_b, sum_b); + if (std::find(column_indices.begin(), column_indices.end(), 2) != column_indices.end()) + ASSERT_EQ(expect_sum_of_b, sum_c); } class TestRecordBatchReaderWithRanges : public ::testing::Test { -public: + public: void SetUp() { + ASSERT_OK_AND_ASSIGN(auto buffer, WriteFullFile()); - } + arrow::MemoryPool* pool = arrow::default_memory_pool(); - void TearDown() {} + auto reader_properties = parquet::ReaderProperties(pool); + reader_properties.set_buffer_size(4096 * 4); + reader_properties.enable_buffered_stream(); -protected: + auto arrow_reader_props = parquet::ArrowReaderProperties(); + // arrow_reader_props.set_batch_size(64 * 1024); // default 64 * 1024 + arrow_reader_props.set_batch_size(10); // default 64 * 1024 -}; + parquet::arrow::FileReaderBuilder reader_builder; + auto in_file = std::make_shared<::arrow::io::BufferReader>(buffer); + 
ASSERT_OK(reader_builder.Open(in_file, /*memory_map=*/reader_properties)); + reader_builder.memory_pool(pool); + reader_builder.properties(arrow_reader_props); -TEST(TestRecordBatchReaderWithRanges2, Normal) { - ASSERT_OK_AND_ASSIGN(auto buffer, WriteFullFile()); - - arrow::MemoryPool* pool = arrow::default_memory_pool(); - - auto reader_properties = parquet::ReaderProperties(pool); - reader_properties.set_buffer_size(4096 * 4); - reader_properties.enable_buffered_stream(); + ASSERT_OK_AND_ASSIGN(arrow_reader, reader_builder.Build()); + } - auto arrow_reader_props = parquet::ArrowReaderProperties(); - // arrow_reader_props.set_batch_size(64 * 1024); // default 64 * 1024 - arrow_reader_props.set_batch_size(10); // default 64 * 1024 + void TearDown() {} - parquet::arrow::FileReaderBuilder reader_builder; - auto in_file = std::make_shared<::arrow::io::BufferReader>(buffer); - ASSERT_OK(reader_builder.Open(in_file, /*memory_map=*/reader_properties)); - reader_builder.memory_pool(pool); - reader_builder.properties(arrow_reader_props); + protected: + std::unique_ptr arrow_reader; +}; - ASSERT_OK_AND_ASSIGN(auto arrow_reader, reader_builder.Build()); +TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { + // case 1: row_ranges_map contains only RG {0}, other RGs should be skipped + { + std::shared_ptr<::arrow::RecordBatchReader> rb_reader; + auto row_ranges_map = std::make_shared>(); + std::vector ranges; + for (int64_t i = 0; i < 30; i++) { + if (i % 2 == 0) ranges.push_back({i, i}); + } + row_ranges_map->insert({0, std::make_shared(ranges)}); + std::vector column_indices{0, 1, 2}; + ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, + row_ranges_map, &rb_reader)); - // // case 1: row_ranges_map contains only RG {0}, other RGs should be skipped - // { - // std::shared_ptr<::arrow::RecordBatchReader> rb_reader; - // auto row_ranges_map = std::make_shared>(); - // std::vector ranges; - // for (int64_t i = 0; i < 30; i++) { - // if (i % 
2 == 0) ranges.push_back({i, i}); - // } - // row_ranges_map->insert({0, std::make_shared(ranges)}); - // ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, {0, 1, 2}, - // row_ranges_map, - // &rb_reader)); - // - // check_rb(rb_reader, 15, 210); // 0 + 2 + ... + 28 = 210 - // } + check_rb(rb_reader, 15, 210, column_indices); // 0 + 2 + ... + 28 = 210 + } // case 2: row_ranges_map contains only RG {0,2}, other RGs should be skipped { @@ -199,9 +213,47 @@ TEST(TestRecordBatchReaderWithRanges2, Normal) { } row_ranges_map->insert({0, std::make_shared(ranges)}); row_ranges_map->insert({2, std::make_shared(ranges)}); - ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, {0, 1, 2}, row_ranges_map, - &rb_reader)); + std::vector column_indices{0, 1, 2}; + ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, + row_ranges_map, &rb_reader)); - check_rb(rb_reader, 30, 1320); // (0 + 2 + ... + 28) + (60 + 62 ... + 88) = 1320 + check_rb(rb_reader, 30, 1320, + column_indices); // (0 + 2 + ... + 28) + (60 + 62 ... 
+ 88) = 1320 } } + +TEST_F(TestRecordBatchReaderWithRanges, SelectSomePageForEachRG) { + std::shared_ptr<::arrow::RecordBatchReader> rb_reader; + auto row_ranges_map = std::make_shared>(); + row_ranges_map->insert({0, std::make_shared(parquet::Range{0, 9})}); + row_ranges_map->insert( + {1, std::make_shared(parquet::Range{10, 19})}); + row_ranges_map->insert( + {2, std::make_shared(parquet::Range{20, 29})}); + row_ranges_map->insert({3, std::make_shared(parquet::Range{0, 9})}); + + std::vector column_indices{0, 1, 2}; + ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, + row_ranges_map, &rb_reader)); + + // (0+...+9) + (40+...+49) + (80+...+89) + (90+...+99) = 2280 + check_rb(rb_reader, 40, 2280, column_indices); +} + +TEST_F(TestRecordBatchReaderWithRanges, SelectAllRange) { + std::shared_ptr<::arrow::RecordBatchReader> rb_reader; + auto row_ranges_map = std::make_shared>(); + row_ranges_map->insert({0, std::make_shared(parquet::Range{0, 29})}); + row_ranges_map->insert( + {1, std::make_shared(parquet::Range{0, 29})}); + row_ranges_map->insert( + {2, std::make_shared(parquet::Range{0, 29})}); + row_ranges_map->insert({3, std::make_shared(parquet::Range{0, 9})}); + + std::vector column_indices{0, 1, 2}; + ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, + row_ranges_map, &rb_reader)); + + // (0+...+99) = 4950 + check_rb(rb_reader, 100, 4950, column_indices); +} From 29c471a8e1528bf9cba963938f5b4b2a81df81d0 Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Thu, 23 Nov 2023 19:45:13 +0800 Subject: [PATCH 05/25] happy path pass 5 --- cpp/src/parquet/arrow/reader.cc | 79 +++++-- cpp/src/parquet/column_reader.h | 84 ++----- cpp/src/parquet/filtered_reader_test.cc | 290 ++++++++++++++++++------ 3 files changed, 296 insertions(+), 157 deletions(-) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index aa07912a373b..93b4089ef68e 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ 
b/cpp/src/parquet/arrow/reader.cc @@ -566,8 +566,59 @@ class LeafReader : public ColumnReaderImpl { private: std::shared_ptr out_; + + void checkAndGetPageRanges(const std::shared_ptr& row_ranges, + std::shared_ptr& page_ranges) { + // check offset exists + auto rg_pg_index_reader = + ctx_->reader->GetPageIndexReader()->RowGroup(input_->current_row_group()); + + if (!rg_pg_index_reader) { + throw ParquetException( + "Attempting to read with Ranges but Page Index is not found for Row " + "Group: " + + std::to_string(input_->current_row_group())); + } + auto offset_index = rg_pg_index_reader->GetOffsetIndex(input_->column_index()); + + if (!offset_index) { + throw ParquetException( + "Attempting to read with Ranges but Offset index is not found for " + "column: " + + field_->name()); + } + + if (!row_ranges->isValid()) { + throw ParquetException( + "The provided row range is invalid, keep it monotone and non-interleaving: " + + row_ranges->toString()); + } + + const auto page_locations = offset_index->page_locations(); + page_ranges = std::make_shared(); + for (size_t i = 0; i < page_locations.size() - 1; i++) { + page_ranges->add( + {page_locations[i].first_row_index, page_locations[i + 1].first_row_index - 1}, + false); + } + if (page_locations.size() >= 1) { + page_ranges->add( + {page_locations[page_locations.size() - 1].first_row_index, + ctx_->reader->metadata()->RowGroup(input_->current_row_group())->num_rows() - + 1}, + false); + } + + if (row_ranges->getRanges().size() > 0) { + if ((*row_ranges).getRanges().back().to > page_ranges->getRanges().back().to) { + throw ParquetException( + "The provided row range " + row_ranges->toString() + + " exceeds last page :" + page_ranges->getRanges().back().toString()); + } + } + } + void NextRowGroup() { - std::cout << "Entering NextRowGroup" << std::endl; std::unique_ptr page_reader = input_->NextChunk(); /// using page index to reduce cost @@ -578,30 +629,8 @@ class LeafReader : public ColumnReaderImpl { // if specific 
row range is provided for this rg if (const auto iter = ctx_->row_ranges_map->find(input_->current_row_group()); iter != ctx_->row_ranges_map->end()) { - // check offset exists - auto offset_index = ctx_->reader->GetPageIndexReader() - ->RowGroup(input_->current_row_group()) - ->GetOffsetIndex(input_->column_index()); - if (!offset_index) { - throw ParquetException("Attempting to filter pages but Offset index is not found for column: " + - field_->name()); - } - - const auto page_locations = offset_index->page_locations(); - auto page_ranges = std::make_shared(); - for (size_t i = 0; i < page_locations.size() - 1; i++) { - page_ranges->add({page_locations[i].first_row_index, - page_locations[i + 1].first_row_index - 1}, - false); - } - if (page_locations.size() >= 1) { - page_ranges->add({page_locations[page_locations.size() - 1].first_row_index, - ctx_->reader->metadata() - ->RowGroup(input_->current_row_group()) - ->num_rows() - - 1}, - false); - } + std::shared_ptr page_ranges; + checkAndGetPageRanges(iter->second, page_ranges); // part 1, skip decompressing & decoding unnecessary pages page_reader->set_data_page_filter(RowRangesPageFilter(iter->second, page_ranges)); diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index d9227ebcb025..44288e25afea 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -17,6 +17,7 @@ #pragma once +#include #include #include #include @@ -354,27 +355,15 @@ class RowRanges { explicit RowRanges(const Range& range) { ranges.push_back(range); } - RowRanges(const std::vector& ranges) { this->ranges = ranges; } + RowRanges(const std::vector& ranges) { + this->ranges = ranges; + } // copy cstr RowRanges(const RowRanges& other) { ranges = other.ranges; } RowRanges(RowRanges&& other) noexcept { ranges = std::move(other.ranges); } - static RowRanges createSingle(const size_t rowCount) { - return RowRanges({Range(0L, rowCount - 1L)}); - } - - // static RowRanges create(size_t 
rowCount, const std::vector& pageIndexes, const - // OffsetIndex& offsetIndex) { - // RowRanges ranges; - // for (int pageIndex : pageIndexes) { - // ranges.add(Range(offsetIndex.getFirstRowIndex(pageIndex), - // offsetIndex.getLastRowIndex(pageIndex, rowCount))); - // } - // return ranges; - // } - static RowRanges unionRanges(const RowRanges& left, const RowRanges& right) { RowRanges result; auto it1 = left.ranges.begin(); @@ -441,9 +430,14 @@ class RowRanges { if (merge) { for (int i = static_cast(ranges.size()) - 1; i >= 0; --i) { Range last = ranges[i]; - assert(!last.isAfter(range)); + if (last.isAfter(range)) { + throw ParquetException(range.toString() + " cannot be added to " + + this->toString()); + } const Range u = Range::unionRange(last, rangeToAdd); - assert(u.from != -1 && u.to != -1); + if (u.from == -1 && u.to == -1) { + break; + } rangeToAdd = u; ranges.erase(ranges.begin() + i); } @@ -461,48 +455,20 @@ class RowRanges { return cnt; } - // - // class Iterator { - // private: - // int currentRangeIndex; - // Range currentRange; - // long next; - // std::vector ranges; - // - // long findNext() { - // if (currentRangeIndex < ranges.size()) { - // currentRange = ranges[++currentRangeIndex]; - // next = currentRange.from; - // } else { - // return -1; - // } - // return next; - // } - // - // public: - // Iterator(const std::vector& ranges) { - // this->ranges = ranges; - // currentRangeIndex = -1; - // next = findNext(); - // } - // - // bool hasNext() const { - // return next >= 0; - // } - // - // long nextLong() { - // long ret = next; - // if (ret < 0) { - // throw std::out_of_range("No such element"); - // } - // next = findNext(); - // return ret; - // } - // }; - // - // Iterator iterator() const { - // return Iterator(ranges); - // } + bool isValid() const { + if (ranges.size() == 0) { + return false; + } + if (ranges[0].from < 0) { + return false; + } + for (size_t i = 1; i < ranges.size(); i++) { + if (ranges[i].from <= ranges[i - 1].to) { + 
return false; + } + } + return true; + } bool isOverlapping(int64_t from, int64_t to) const { const Range searchRange(from, to); diff --git a/cpp/src/parquet/filtered_reader_test.cc b/cpp/src/parquet/filtered_reader_test.cc index 272bbe463af3..9cd711cdf176 100644 --- a/cpp/src/parquet/filtered_reader_test.cc +++ b/cpp/src/parquet/filtered_reader_test.cc @@ -25,33 +25,56 @@ #include #include +#include +#include #include +/// The table looks like: +// { +// { a: {x: 0, y: 0}, b: {0, 0, 0}, c: "0", d: 0}, +// { a: {x: 1, y: 1}, b: {1, 1, 1}, c: "1", d: 1}, +// ... +// { a: {x: 99, y: 99}, b: {99, 99, 99}, c: "99", d: 99} +// } arrow::Result> GetTable() { - auto builder = arrow::Int32Builder(); - - std::shared_ptr arr_a_values; - std::shared_ptr arr_a_offsets; - std::vector a_values; + auto int32_builder = arrow::Int32Builder(); + + // Struct col + std::shared_ptr arr_a_x; + std::shared_ptr arr_a_y; + ARROW_RETURN_NOT_OK(int32_builder.AppendValues(arrow::internal::Iota(0, 100))); + ARROW_RETURN_NOT_OK(int32_builder.Finish(&arr_a_x)); + ARROW_RETURN_NOT_OK(int32_builder.AppendValues(arrow::internal::Iota(0, 100))); + ARROW_RETURN_NOT_OK(int32_builder.Finish(&arr_a_y)); + ARROW_ASSIGN_OR_RAISE( + auto arr_a, + arrow::StructArray::Make({arr_a_x, arr_a_y}, std::vector{"x", "y"})); + + // List col + std::shared_ptr arr_b_values; + std::shared_ptr arr_b_offsets; + std::vector b_values; for (int i = 0; i < 100; ++i) { for (int j = 0; j < 3; ++j) { - a_values.push_back(i); + b_values.push_back(i); } } - ARROW_RETURN_NOT_OK(builder.AppendValues(a_values)); - ARROW_RETURN_NOT_OK(builder.Finish(&arr_a_values)); + ARROW_RETURN_NOT_OK(int32_builder.AppendValues(b_values)); + ARROW_RETURN_NOT_OK(int32_builder.Finish(&arr_b_values)); std::vector offsets = arrow::internal::Iota(0, 101); std::transform(offsets.begin(), offsets.end(), offsets.begin(), [](int x) { return x * 3; }); - ARROW_RETURN_NOT_OK(builder.AppendValues(offsets)); - 
ARROW_RETURN_NOT_OK(builder.Finish(&arr_a_offsets)); - ARROW_ASSIGN_OR_RAISE(auto arr_a, - arrow::ListArray::FromArrays(*arr_a_offsets, *arr_a_values)); + ARROW_RETURN_NOT_OK(int32_builder.AppendValues(offsets)); + ARROW_RETURN_NOT_OK(int32_builder.Finish(&arr_b_offsets)); + ARROW_ASSIGN_OR_RAISE(auto arr_b, + arrow::ListArray::FromArrays(*arr_b_offsets, *arr_b_values)); - std::shared_ptr arr_b; - ARROW_RETURN_NOT_OK(builder.AppendValues(arrow::internal::Iota(0, 100))); - ARROW_RETURN_NOT_OK(builder.Finish(&arr_b)); + // int col + std::shared_ptr arr_d; + ARROW_RETURN_NOT_OK(int32_builder.AppendValues(arrow::internal::Iota(0, 100))); + ARROW_RETURN_NOT_OK(int32_builder.Finish(&arr_d)); + // string col auto string_builder = arrow::StringBuilder(); std::shared_ptr arr_c; std::vector strs; @@ -62,12 +85,14 @@ arrow::Result> GetTable() { ARROW_RETURN_NOT_OK(string_builder.Finish(&arr_c)); auto schema = arrow::schema({ - arrow::field("a", arrow::list(arrow::int32())), - arrow::field("b", arrow::int32()), + // complex types prior to simple types + arrow::field("a", arr_a->type()), + arrow::field("b", arrow::list(arrow::int32())), arrow::field("c", arrow::utf8()), + arrow::field("d", arrow::int32()), }); - return arrow::Table::Make(schema, {arr_a, arr_b, arr_c}); + return arrow::Table::Make(schema, {arr_a, arr_b, arr_c, arr_d}); } arrow::Result> WriteFullFile() { @@ -107,53 +132,70 @@ arrow::Result> WriteFullFile() { return out_stream->Finish(); } -void check_rb(std::shared_ptr rb_reader, size_t expect_rows, - int64_t expect_sum_of_b, const std::vector& column_indices) { +bool checking_col(const std::string col_name, + const std::vector& column_names) { + return std::find(column_names.begin(), column_names.end(), col_name) != + column_names.end(); +} + +void check_rb(std::shared_ptr rb_reader, + const size_t expected_rows, const int64_t expected_sum) { + const std::vector column_names = rb_reader->schema()->field_names(); + size_t total_rows = 0; int64_t sum_a = 0; 
int64_t sum_b = 0; int64_t sum_c = 0; + int64_t sum_d = 0; for (arrow::Result> maybe_batch : *rb_reader) { ASSERT_OK_AND_ASSIGN(auto batch, maybe_batch); total_rows += batch->num_rows(); - if (std::find(column_indices.begin(), column_indices.end(), 0) != - column_indices.end()) { + if (checking_col("a", column_names)) { auto a_array = - std::dynamic_pointer_cast(batch->GetColumnByName("a")); - ASSERT_OK_AND_ASSIGN(auto flatten_a_array, a_array->Flatten()); - auto a_array_values = std::dynamic_pointer_cast(flatten_a_array); - for (auto iter = a_array_values->begin(); iter != a_array_values->end(); ++iter) { + std::dynamic_pointer_cast(batch->GetColumnByName("a")); + auto a_x_array = std::dynamic_pointer_cast(a_array->field(0)); + auto a_y_array = std::dynamic_pointer_cast(a_array->field(1)); + for (auto iter = a_x_array->begin(); iter != a_x_array->end(); ++iter) { + sum_a += (*iter).value(); + } + for (auto iter = a_y_array->begin(); iter != a_y_array->end(); ++iter) { sum_a += (*iter).value(); } } - if (std::find(column_indices.begin(), column_indices.end(), 1) != - column_indices.end()) { + if (checking_col("b", column_names)) { auto b_array = - std::dynamic_pointer_cast(batch->GetColumnByName("b")); - for (auto iter = b_array->begin(); iter != b_array->end(); ++iter) { + std::dynamic_pointer_cast(batch->GetColumnByName("b")); + ASSERT_OK_AND_ASSIGN(auto flatten_b_array, b_array->Flatten()); + auto b_array_values = std::dynamic_pointer_cast(flatten_b_array); + for (auto iter = b_array_values->begin(); iter != b_array_values->end(); ++iter) { sum_b += (*iter).value(); } } - if (std::find(column_indices.begin(), column_indices.end(), 2) != - column_indices.end()) { + if (checking_col("c", column_names)) { auto c_array = std::dynamic_pointer_cast(batch->GetColumnByName("c")); for (auto iter = c_array->begin(); iter != c_array->end(); ++iter) { sum_c += std::stoi(std::string((*iter).value())); } } + + if (checking_col("d", column_names)) { + auto d_array = + 
std::dynamic_pointer_cast(batch->GetColumnByName("d")); + for (auto iter = d_array->begin(); iter != d_array->end(); ++iter) { + sum_d += (*iter).value(); + } + } } - ASSERT_EQ(expect_rows, total_rows); - - if (std::find(column_indices.begin(), column_indices.end(), 0) != column_indices.end()) - ASSERT_EQ(expect_sum_of_b * 3, sum_a); - if (std::find(column_indices.begin(), column_indices.end(), 1) != column_indices.end()) - ASSERT_EQ(expect_sum_of_b, sum_b); - if (std::find(column_indices.begin(), column_indices.end(), 2) != column_indices.end()) - ASSERT_EQ(expect_sum_of_b, sum_c); + ASSERT_EQ(expected_rows, total_rows); + + if (checking_col("a", column_names)) ASSERT_EQ(expected_sum * 2, sum_a); + if (checking_col("b", column_names)) ASSERT_EQ(expected_sum * 3, sum_b); + if (checking_col("c", column_names)) ASSERT_EQ(expected_sum, sum_c); + if (checking_col("d", column_names)) ASSERT_EQ(expected_sum, sum_d); } class TestRecordBatchReaderWithRanges : public ::testing::Test { @@ -168,7 +210,6 @@ class TestRecordBatchReaderWithRanges : public ::testing::Test { reader_properties.enable_buffered_stream(); auto arrow_reader_props = parquet::ArrowReaderProperties(); - // arrow_reader_props.set_batch_size(64 * 1024); // default 64 * 1024 arrow_reader_props.set_batch_size(10); // default 64 * 1024 parquet::arrow::FileReaderBuilder reader_builder; @@ -186,6 +227,56 @@ class TestRecordBatchReaderWithRanges : public ::testing::Test { std::unique_ptr arrow_reader; }; +TEST_F(TestRecordBatchReaderWithRanges, SelectSomePageForEachRG) { + std::shared_ptr<::arrow::RecordBatchReader> rb_reader; + auto row_ranges_map = std::make_shared>(); + row_ranges_map->insert({0, std::make_shared(parquet::Range{0, 9})}); + row_ranges_map->insert( + {1, std::make_shared(parquet::Range{10, 19})}); + row_ranges_map->insert( + {2, std::make_shared(parquet::Range{20, 29})}); + row_ranges_map->insert({3, std::make_shared(parquet::Range{0, 9})}); + + std::vector column_indices{0, 1, 2, 3, 4}; + 
ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, + row_ranges_map, &rb_reader)); + + // (0+...+9) + (40+...+49) + (80+...+89) + (90+...+99) = 2280 + check_rb(rb_reader, 40, 2280); +} + +TEST_F(TestRecordBatchReaderWithRanges, SelectAllRange) { + std::shared_ptr<::arrow::RecordBatchReader> rb_reader; + auto row_ranges_map = std::make_shared>(); + row_ranges_map->insert( + {0, std::make_shared(parquet::Range{0, 29})}); + row_ranges_map->insert( + {1, std::make_shared(parquet::Range{0, 29})}); + row_ranges_map->insert( + {2, std::make_shared(parquet::Range{0, 29})}); + row_ranges_map->insert({3, std::make_shared(parquet::Range{0, 9})}); + + std::vector column_indices{0, 1, 2, 3, 4}; + ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, + row_ranges_map, &rb_reader)); + + // (0+...+99) = 4950 + check_rb(rb_reader, 100, 4950); +} + +TEST_F(TestRecordBatchReaderWithRanges, SelectEmptyRange) { + std::shared_ptr<::arrow::RecordBatchReader> rb_reader; + auto row_ranges_map = std::make_shared>(); + row_ranges_map->insert( + {0, std::make_shared(std::vector())}); + std::vector column_indices{0, 1, 2, 3, 4}; + auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, + row_ranges_map, &rb_reader); + ASSERT_NOT_OK(status); + EXPECT_TRUE(status.message().find("The provided row range is invalid, keep it monotone " + "and non-interleaving: []") != std::string::npos); +} + TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { // case 1: row_ranges_map contains only RG {0}, other RGs should be skipped { @@ -196,11 +287,11 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { if (i % 2 == 0) ranges.push_back({i, i}); } row_ranges_map->insert({0, std::make_shared(ranges)}); - std::vector column_indices{0, 1, 2}; + std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, row_ranges_map, &rb_reader)); - check_rb(rb_reader, 15, 210, 
column_indices); // 0 + 2 + ... + 28 = 210 + check_rb(rb_reader, 15, 210); // 0 + 2 + ... + 28 = 210 } // case 2: row_ranges_map contains only RG {0,2}, other RGs should be skipped @@ -213,47 +304,100 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { } row_ranges_map->insert({0, std::make_shared(ranges)}); row_ranges_map->insert({2, std::make_shared(ranges)}); - std::vector column_indices{0, 1, 2}; + std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, row_ranges_map, &rb_reader)); - check_rb(rb_reader, 30, 1320, - column_indices); // (0 + 2 + ... + 28) + (60 + 62 ... + 88) = 1320 + check_rb(rb_reader, 30, 1320); // (0 + 2 + ... + 28) + (60 + 62 ... + 88) = 1320 } } -TEST_F(TestRecordBatchReaderWithRanges, SelectSomePageForEachRG) { +TEST_F(TestRecordBatchReaderWithRanges, InvalidRanges) { std::shared_ptr<::arrow::RecordBatchReader> rb_reader; - auto row_ranges_map = std::make_shared>(); - row_ranges_map->insert({0, std::make_shared(parquet::Range{0, 9})}); - row_ranges_map->insert( - {1, std::make_shared(parquet::Range{10, 19})}); - row_ranges_map->insert( - {2, std::make_shared(parquet::Range{20, 29})}); - row_ranges_map->insert({3, std::make_shared(parquet::Range{0, 9})}); - - std::vector column_indices{0, 1, 2}; - ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, - row_ranges_map, &rb_reader)); + { + auto row_ranges_map = std::make_shared>(); + row_ranges_map->insert( + {0, std::make_shared(parquet::Range{-1, 5})}); + std::vector column_indices{0, 1, 2, 3, 4}; + auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, + row_ranges_map, &rb_reader); + ASSERT_NOT_OK(status); + EXPECT_TRUE(status.message().find("The provided row range is invalid, keep it " + "monotone and non-interleaving: [(-1, 5)]") != + std::string::npos); + } - // (0+...+9) + (40+...+49) + (80+...+89) + (90+...+99) = 2280 - check_rb(rb_reader, 40, 2280, 
column_indices); + { + auto row_ranges_map = std::make_shared>(); + row_ranges_map->insert({0, std::make_shared(std::vector{ + parquet::Range{0, 4}, parquet::Range{2, 5}})}); + std::vector column_indices{0, 1, 2, 3, 4}; + auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, + row_ranges_map, &rb_reader); + ASSERT_NOT_OK(status); + EXPECT_TRUE( + status.message().find("The provided row range is invalid, keep it monotone and " + "non-interleaving: [(0, 4), (2, 5)]") != std::string::npos); + } + { + auto row_ranges_map = std::make_shared>(); + row_ranges_map->insert( + {0, std::make_shared(std::vector{parquet::Range{0, 30}})}); + std::vector column_indices{0, 1, 2, 3, 4}; + auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, + row_ranges_map, &rb_reader); + ASSERT_NOT_OK(status); + EXPECT_TRUE(status.message().find( + "The provided row range [(0, 30)] exceeds last page :[26, 29]") != + std::string::npos); + } } -TEST_F(TestRecordBatchReaderWithRanges, SelectAllRange) { +TEST(TestRecordBatchReaderWithRangesBadCases, NoPageIndex) { + using parquet::ArrowWriterProperties; + using parquet::WriterProperties; + + // write a file without page index + ASSERT_OK_AND_ASSIGN(std::shared_ptr table, GetTable()); + std::shared_ptr props = + WriterProperties::Builder() + .max_row_group_length(30) + ->disable_write_page_index() // NO INDEX !!!! 
+ ->write_batch_size(13) + ->data_pagesize(1) + ->compression(arrow::Compression::SNAPPY) + ->build(); + std::shared_ptr arrow_props = + ArrowWriterProperties::Builder().store_schema()->build(); + ASSERT_OK_AND_ASSIGN(auto out_stream, ::arrow::io::BufferOutputStream::Create()); + ASSERT_OK(parquet::arrow::WriteTable(*table.get(), arrow::default_memory_pool(), + out_stream, + /*chunk_size=*/100, props, arrow_props)); + ASSERT_OK_AND_ASSIGN(auto buffer, out_stream->Finish()); + + // try to read the file with Range + arrow::MemoryPool* pool = arrow::default_memory_pool(); + auto reader_properties = parquet::ReaderProperties(pool); + reader_properties.set_buffer_size(4096 * 4); + reader_properties.enable_buffered_stream(); + auto arrow_reader_props = parquet::ArrowReaderProperties(); + arrow_reader_props.set_batch_size(10); // default 64 * 1024 + + parquet::arrow::FileReaderBuilder reader_builder; + auto in_file = std::make_shared<::arrow::io::BufferReader>(buffer); + ASSERT_OK(reader_builder.Open(in_file, /*memory_map=*/reader_properties)); + reader_builder.memory_pool(pool); + reader_builder.properties(arrow_reader_props); + ASSERT_OK_AND_ASSIGN(auto arrow_reader, reader_builder.Build()); + std::shared_ptr<::arrow::RecordBatchReader> rb_reader; auto row_ranges_map = std::make_shared>(); - row_ranges_map->insert({0, std::make_shared(parquet::Range{0, 29})}); - row_ranges_map->insert( - {1, std::make_shared(parquet::Range{0, 29})}); row_ranges_map->insert( - {2, std::make_shared(parquet::Range{0, 29})}); - row_ranges_map->insert({3, std::make_shared(parquet::Range{0, 9})}); - - std::vector column_indices{0, 1, 2}; - ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, - row_ranges_map, &rb_reader)); - - // (0+...+99) = 4950 - check_rb(rb_reader, 100, 4950, column_indices); + {0, std::make_shared(parquet::Range{0, 29})}); + std::vector column_indices{0, 1, 2, 3, 4}; + auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, + 
row_ranges_map, &rb_reader); + ASSERT_NOT_OK(status); + EXPECT_TRUE(status.message().find("Attempting to read with Ranges but Page Index is " + "not found for Row Group: 0") != std::string::npos); } From 7bf0e97e9de543175d781b9712ff1d2dccb93f12 Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Thu, 23 Nov 2023 20:33:05 +0800 Subject: [PATCH 06/25] happy path pass 6 --- cpp/src/parquet/filtered_reader_test.cc | 200 ++++++++++++++++-------- 1 file changed, 137 insertions(+), 63 deletions(-) diff --git a/cpp/src/parquet/filtered_reader_test.cc b/cpp/src/parquet/filtered_reader_test.cc index 9cd711cdf176..018e2580f254 100644 --- a/cpp/src/parquet/filtered_reader_test.cc +++ b/cpp/src/parquet/filtered_reader_test.cc @@ -23,20 +23,41 @@ #include "parquet/arrow/reader.h" #include "parquet/arrow/writer.h" +#include #include #include #include #include #include -/// The table looks like: +/// The table looks like (with_nulls = false): // { // { a: {x: 0, y: 0}, b: {0, 0, 0}, c: "0", d: 0}, // { a: {x: 1, y: 1}, b: {1, 1, 1}, c: "1", d: 1}, // ... 
// { a: {x: 99, y: 99}, b: {99, 99, 99}, c: "99", d: 99} // } -arrow::Result> GetTable() { +arrow::Result> GetTable(bool with_nulls = false) { + // if with_nulls, the generated table should null values + // set first 10 rows and last 10 rows to null + std::shared_ptr null_bitmap; + std::vector flags(100, true); + if (with_nulls) { + std::fill_n(flags.begin(), 10, false); + std::fill_n(flags.begin() + 90, 10, false); + + size_t length = flags.size(); + + ARROW_ASSIGN_OR_RAISE(null_bitmap, arrow::AllocateEmptyBitmap(length)); + + uint8_t* bitmap = null_bitmap->mutable_data(); + for (size_t i = 0; i < length; ++i) { + if (flags[i]) { + arrow::bit_util::SetBit(bitmap, i); + } + } + } + auto int32_builder = arrow::Int32Builder(); // Struct col @@ -46,9 +67,9 @@ arrow::Result> GetTable() { ARROW_RETURN_NOT_OK(int32_builder.Finish(&arr_a_x)); ARROW_RETURN_NOT_OK(int32_builder.AppendValues(arrow::internal::Iota(0, 100))); ARROW_RETURN_NOT_OK(int32_builder.Finish(&arr_a_y)); - ARROW_ASSIGN_OR_RAISE( - auto arr_a, - arrow::StructArray::Make({arr_a_x, arr_a_y}, std::vector{"x", "y"})); + ARROW_ASSIGN_OR_RAISE(auto arr_a, arrow::StructArray::Make( + {arr_a_x, arr_a_y}, + std::vector{"x", "y"}, null_bitmap)); // List col std::shared_ptr arr_b_values; @@ -63,45 +84,49 @@ arrow::Result> GetTable() { ARROW_RETURN_NOT_OK(int32_builder.Finish(&arr_b_values)); std::vector offsets = arrow::internal::Iota(0, 101); std::transform(offsets.begin(), offsets.end(), offsets.begin(), - [](int x) { return x * 3; }); + [](const int x) { return x * 3; }); ARROW_RETURN_NOT_OK(int32_builder.AppendValues(offsets)); ARROW_RETURN_NOT_OK(int32_builder.Finish(&arr_b_offsets)); - ARROW_ASSIGN_OR_RAISE(auto arr_b, - arrow::ListArray::FromArrays(*arr_b_offsets, *arr_b_values)); - - // int col - std::shared_ptr arr_d; - ARROW_RETURN_NOT_OK(int32_builder.AppendValues(arrow::internal::Iota(0, 100))); - ARROW_RETURN_NOT_OK(int32_builder.Finish(&arr_d)); + ARROW_ASSIGN_OR_RAISE(auto arr_b, 
arrow::ListArray::FromArrays( + *arr_b_offsets, *arr_b_values, + arrow::default_memory_pool(), null_bitmap)); // string col auto string_builder = arrow::StringBuilder(); std::shared_ptr arr_c; std::vector strs; + uint8_t valid_bytes[100]; for (size_t i = 0; i < 100; i++) { strs.push_back(std::to_string(i)); + valid_bytes[i] = flags[i]; } - ARROW_RETURN_NOT_OK(string_builder.AppendValues(strs)); + ARROW_RETURN_NOT_OK(string_builder.AppendValues(strs, &valid_bytes[0])); ARROW_RETURN_NOT_OK(string_builder.Finish(&arr_c)); + // int col + std::shared_ptr arr_d; + ARROW_RETURN_NOT_OK(int32_builder.AppendValues(arrow::internal::Iota(0, 100), flags)); + ARROW_RETURN_NOT_OK(int32_builder.Finish(&arr_d)); + auto schema = arrow::schema({ // complex types prior to simple types - arrow::field("a", arr_a->type()), - arrow::field("b", arrow::list(arrow::int32())), - arrow::field("c", arrow::utf8()), - arrow::field("d", arrow::int32()), + field("a", arr_a->type()), + field("b", list(arrow::int32())), + field("c", arrow::utf8()), + field("d", arrow::int32()), }); return arrow::Table::Make(schema, {arr_a, arr_b, arr_c, arr_d}); } -arrow::Result> WriteFullFile() { +arrow::Result> WriteFullFile( + const bool with_nulls = false) { using parquet::ArrowWriterProperties; using parquet::WriterProperties; - ARROW_ASSIGN_OR_RAISE(std::shared_ptr table, GetTable()); + ARROW_ASSIGN_OR_RAISE(const auto table, GetTable(with_nulls)); - std::shared_ptr props = + const std::shared_ptr props = WriterProperties::Builder() .max_row_group_length(30) ->enable_write_page_index() @@ -110,10 +135,10 @@ arrow::Result> WriteFullFile() { ->compression(arrow::Compression::SNAPPY) ->build(); - std::shared_ptr arrow_props = + const std::shared_ptr arrow_props = ArrowWriterProperties::Builder().store_schema()->build(); - ARROW_ASSIGN_OR_RAISE(auto out_stream, ::arrow::io::BufferOutputStream::Create()); + ARROW_ASSIGN_OR_RAISE(const auto out_stream, ::arrow::io::BufferOutputStream::Create()); 
ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table.get(), arrow::default_memory_pool(), out_stream, @@ -132,7 +157,7 @@ arrow::Result> WriteFullFile() { return out_stream->Finish(); } -bool checking_col(const std::string col_name, +bool checking_col(const std::string& col_name, const std::vector& column_names) { return std::find(column_names.begin(), column_names.end(), col_name) != column_names.end(); @@ -157,10 +182,10 @@ void check_rb(std::shared_ptr rb_reader, auto a_x_array = std::dynamic_pointer_cast(a_array->field(0)); auto a_y_array = std::dynamic_pointer_cast(a_array->field(1)); for (auto iter = a_x_array->begin(); iter != a_x_array->end(); ++iter) { - sum_a += (*iter).value(); + sum_a += (*iter).has_value() ? (*iter).value() : 0; } for (auto iter = a_y_array->begin(); iter != a_y_array->end(); ++iter) { - sum_a += (*iter).value(); + sum_a += (*iter).has_value() ? (*iter).value() : 0; } } @@ -170,7 +195,7 @@ void check_rb(std::shared_ptr rb_reader, ASSERT_OK_AND_ASSIGN(auto flatten_b_array, b_array->Flatten()); auto b_array_values = std::dynamic_pointer_cast(flatten_b_array); for (auto iter = b_array_values->begin(); iter != b_array_values->end(); ++iter) { - sum_b += (*iter).value(); + sum_b += (*iter).has_value() ? (*iter).value() : 0; } } @@ -178,7 +203,7 @@ void check_rb(std::shared_ptr rb_reader, auto c_array = std::dynamic_pointer_cast(batch->GetColumnByName("c")); for (auto iter = c_array->begin(); iter != c_array->end(); ++iter) { - sum_c += std::stoi(std::string((*iter).value())); + sum_c += std::stoi(std::string((*iter).has_value() ? (*iter).value() : "0")); } } @@ -186,7 +211,7 @@ void check_rb(std::shared_ptr rb_reader, auto d_array = std::dynamic_pointer_cast(batch->GetColumnByName("d")); for (auto iter = d_array->begin(); iter != d_array->end(); ++iter) { - sum_d += (*iter).value(); + sum_d += (*iter).has_value() ? 
(*iter).value() : 0; } } } @@ -198,7 +223,7 @@ void check_rb(std::shared_ptr rb_reader, if (checking_col("d", column_names)) ASSERT_EQ(expected_sum, sum_d); } -class TestRecordBatchReaderWithRanges : public ::testing::Test { +class TestRecordBatchReaderWithRanges : public testing::Test { public: void SetUp() { ASSERT_OK_AND_ASSIGN(auto buffer, WriteFullFile()); @@ -213,7 +238,7 @@ class TestRecordBatchReaderWithRanges : public ::testing::Test { arrow_reader_props.set_batch_size(10); // default 64 * 1024 parquet::arrow::FileReaderBuilder reader_builder; - auto in_file = std::make_shared<::arrow::io::BufferReader>(buffer); + const auto in_file = std::make_shared(buffer); ASSERT_OK(reader_builder.Open(in_file, /*memory_map=*/reader_properties)); reader_builder.memory_pool(pool); reader_builder.properties(arrow_reader_props); @@ -228,8 +253,8 @@ class TestRecordBatchReaderWithRanges : public ::testing::Test { }; TEST_F(TestRecordBatchReaderWithRanges, SelectSomePageForEachRG) { - std::shared_ptr<::arrow::RecordBatchReader> rb_reader; - auto row_ranges_map = std::make_shared>(); + std::shared_ptr rb_reader; + const auto row_ranges_map = std::make_shared>(); row_ranges_map->insert({0, std::make_shared(parquet::Range{0, 9})}); row_ranges_map->insert( {1, std::make_shared(parquet::Range{10, 19})}); @@ -237,7 +262,7 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectSomePageForEachRG) { {2, std::make_shared(parquet::Range{20, 29})}); row_ranges_map->insert({3, std::make_shared(parquet::Range{0, 9})}); - std::vector column_indices{0, 1, 2, 3, 4}; + const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, row_ranges_map, &rb_reader)); @@ -246,8 +271,8 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectSomePageForEachRG) { } TEST_F(TestRecordBatchReaderWithRanges, SelectAllRange) { - std::shared_ptr<::arrow::RecordBatchReader> rb_reader; - auto row_ranges_map = std::make_shared>(); + std::shared_ptr rb_reader; + 
const auto row_ranges_map = std::make_shared>(); row_ranges_map->insert( {0, std::make_shared(parquet::Range{0, 29})}); row_ranges_map->insert( @@ -256,7 +281,7 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectAllRange) { {2, std::make_shared(parquet::Range{0, 29})}); row_ranges_map->insert({3, std::make_shared(parquet::Range{0, 9})}); - std::vector column_indices{0, 1, 2, 3, 4}; + const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, row_ranges_map, &rb_reader)); @@ -265,13 +290,13 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectAllRange) { } TEST_F(TestRecordBatchReaderWithRanges, SelectEmptyRange) { - std::shared_ptr<::arrow::RecordBatchReader> rb_reader; - auto row_ranges_map = std::make_shared>(); + std::shared_ptr rb_reader; + const auto row_ranges_map = std::make_shared>(); row_ranges_map->insert( {0, std::make_shared(std::vector())}); - std::vector column_indices{0, 1, 2, 3, 4}; - auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, - row_ranges_map, &rb_reader); + const std::vector column_indices{0, 1, 2, 3, 4}; + const auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, + row_ranges_map, &rb_reader); ASSERT_NOT_OK(status); EXPECT_TRUE(status.message().find("The provided row range is invalid, keep it monotone " "and non-interleaving: []") != std::string::npos); @@ -280,14 +305,14 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectEmptyRange) { TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { // case 1: row_ranges_map contains only RG {0}, other RGs should be skipped { - std::shared_ptr<::arrow::RecordBatchReader> rb_reader; - auto row_ranges_map = std::make_shared>(); + std::shared_ptr rb_reader; + const auto row_ranges_map = std::make_shared>(); std::vector ranges; for (int64_t i = 0; i < 30; i++) { if (i % 2 == 0) ranges.push_back({i, i}); } row_ranges_map->insert({0, std::make_shared(ranges)}); - std::vector 
column_indices{0, 1, 2, 3, 4}; + const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, row_ranges_map, &rb_reader)); @@ -296,15 +321,15 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { // case 2: row_ranges_map contains only RG {0,2}, other RGs should be skipped { - std::shared_ptr<::arrow::RecordBatchReader> rb_reader; - auto row_ranges_map = std::make_shared>(); + std::shared_ptr rb_reader; + const auto row_ranges_map = std::make_shared>(); std::vector ranges; for (int64_t i = 0; i < 30; i++) { if (i % 2 == 0) ranges.push_back({i, i}); } row_ranges_map->insert({0, std::make_shared(ranges)}); row_ranges_map->insert({2, std::make_shared(ranges)}); - std::vector column_indices{0, 1, 2, 3, 4}; + const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, row_ranges_map, &rb_reader)); @@ -313,14 +338,14 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { } TEST_F(TestRecordBatchReaderWithRanges, InvalidRanges) { - std::shared_ptr<::arrow::RecordBatchReader> rb_reader; + std::shared_ptr rb_reader; { - auto row_ranges_map = std::make_shared>(); + const auto row_ranges_map = std::make_shared>(); row_ranges_map->insert( {0, std::make_shared(parquet::Range{-1, 5})}); - std::vector column_indices{0, 1, 2, 3, 4}; - auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, - row_ranges_map, &rb_reader); + const std::vector column_indices{0, 1, 2, 3, 4}; + const auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, + row_ranges_map, &rb_reader); ASSERT_NOT_OK(status); EXPECT_TRUE(status.message().find("The provided row range is invalid, keep it " "monotone and non-interleaving: [(-1, 5)]") != @@ -328,24 +353,24 @@ TEST_F(TestRecordBatchReaderWithRanges, InvalidRanges) { } { - auto row_ranges_map = std::make_shared>(); + const auto row_ranges_map = 
std::make_shared>(); row_ranges_map->insert({0, std::make_shared(std::vector{ parquet::Range{0, 4}, parquet::Range{2, 5}})}); - std::vector column_indices{0, 1, 2, 3, 4}; - auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, - row_ranges_map, &rb_reader); + const std::vector column_indices{0, 1, 2, 3, 4}; + const auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, + row_ranges_map, &rb_reader); ASSERT_NOT_OK(status); EXPECT_TRUE( status.message().find("The provided row range is invalid, keep it monotone and " "non-interleaving: [(0, 4), (2, 5)]") != std::string::npos); } { - auto row_ranges_map = std::make_shared>(); + const auto row_ranges_map = std::make_shared>(); row_ranges_map->insert( {0, std::make_shared(std::vector{parquet::Range{0, 30}})}); - std::vector column_indices{0, 1, 2, 3, 4}; - auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, - row_ranges_map, &rb_reader); + const std::vector column_indices{0, 1, 2, 3, 4}; + const auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, + row_ranges_map, &rb_reader); ASSERT_NOT_OK(status); EXPECT_TRUE(status.message().find( "The provided row range [(0, 30)] exceeds last page :[26, 29]") != @@ -384,13 +409,13 @@ TEST(TestRecordBatchReaderWithRangesBadCases, NoPageIndex) { arrow_reader_props.set_batch_size(10); // default 64 * 1024 parquet::arrow::FileReaderBuilder reader_builder; - auto in_file = std::make_shared<::arrow::io::BufferReader>(buffer); + auto in_file = std::make_shared(buffer); ASSERT_OK(reader_builder.Open(in_file, /*memory_map=*/reader_properties)); reader_builder.memory_pool(pool); reader_builder.properties(arrow_reader_props); ASSERT_OK_AND_ASSIGN(auto arrow_reader, reader_builder.Build()); - std::shared_ptr<::arrow::RecordBatchReader> rb_reader; + std::shared_ptr rb_reader; auto row_ranges_map = std::make_shared>(); row_ranges_map->insert( {0, std::make_shared(parquet::Range{0, 29})}); @@ 
-401,3 +426,52 @@ TEST(TestRecordBatchReaderWithRangesBadCases, NoPageIndex) { EXPECT_TRUE(status.message().find("Attempting to read with Ranges but Page Index is " "not found for Row Group: 0") != std::string::npos); } + +class TestRecordBatchReaderWithRangesWithNulls : public testing::Test { + public: + void SetUp() { + ASSERT_OK_AND_ASSIGN(auto buffer, WriteFullFile(true)); + + arrow::MemoryPool* pool = arrow::default_memory_pool(); + + auto reader_properties = parquet::ReaderProperties(pool); + reader_properties.set_buffer_size(4096 * 4); + reader_properties.enable_buffered_stream(); + + auto arrow_reader_props = parquet::ArrowReaderProperties(); + arrow_reader_props.set_batch_size(10); // default 64 * 1024 + + parquet::arrow::FileReaderBuilder reader_builder; + const auto in_file = std::make_shared(buffer); + ASSERT_OK(reader_builder.Open(in_file, /*memory_map=*/reader_properties)); + reader_builder.memory_pool(pool); + reader_builder.properties(arrow_reader_props); + + ASSERT_OK_AND_ASSIGN(arrow_reader, reader_builder.Build()); + } + + void TearDown() {} + + protected: + std::unique_ptr arrow_reader; +}; + +TEST_F(TestRecordBatchReaderWithRangesWithNulls, SelectOneRowSkipOneRow) { + { + std::shared_ptr rb_reader; + const auto row_ranges_map = std::make_shared>(); + std::vector ranges; + for (int64_t i = 0; i < 30; i++) { + if (i % 2 == 0) ranges.push_back({i, i}); + } + row_ranges_map->insert({0, std::make_shared(ranges)}); + row_ranges_map->insert({2, std::make_shared(ranges)}); + const std::vector column_indices{0, 1, 2, 3, 4}; + ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, + row_ranges_map, &rb_reader)); + + // 0-9 is masked as null, so the ramaining is: + // (10 + 12 + ... + 28) + (60 + 62 ... 
+ 88) = 1320 + check_rb(rb_reader, 30, 1300); + } +} \ No newline at end of file From c97ea481de6191e59aca254714bae57be38aae29 Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Thu, 23 Nov 2023 20:57:42 +0800 Subject: [PATCH 07/25] happy path pass 7 --- cpp/examples/arrow/parquet_read_write.cc | 309 ++++++++---------- cpp/src/parquet/CMakeLists.txt | 2 +- cpp/src/parquet/arrow/reader.cc | 2 - cpp/src/parquet/arrow/reader.h | 10 +- cpp/src/parquet/column_reader.cc | 3 +- cpp/src/parquet/column_reader.h | 5 +- ...ed_reader_test.cc => range_reader_test.cc} | 2 +- cpp/src/parquet/reader_test.cc | 3 - 8 files changed, 147 insertions(+), 189 deletions(-) rename cpp/src/parquet/{filtered_reader_test.cc => range_reader_test.cc} (99%) diff --git a/cpp/examples/arrow/parquet_read_write.cc b/cpp/examples/arrow/parquet_read_write.cc index fa45a34cff49..3b8b4c2212b7 100644 --- a/cpp/examples/arrow/parquet_read_write.cc +++ b/cpp/examples/arrow/parquet_read_write.cc @@ -23,201 +23,168 @@ #include "parquet/arrow/writer.h" #include -#include -arrow::Status ReadInBatches(std::string path_to_file) { - // #include "arrow/io/api.h" - // #include "arrow/parquet/arrow/reader.h" - - arrow::MemoryPool* pool = arrow::default_memory_pool(); - - // Configure general Parquet reader settings - auto reader_properties = parquet::ReaderProperties(pool); - reader_properties.set_buffer_size(4096 * 4); - reader_properties.enable_buffered_stream(); - - // Configure Arrow-specific Parquet reader settings - auto arrow_reader_props = parquet::ArrowReaderProperties(); - arrow_reader_props.set_batch_size(10); // default 64 * 1024 - - parquet::arrow::FileReaderBuilder reader_builder; - ARROW_RETURN_NOT_OK( - reader_builder.OpenFile(path_to_file, /*memory_map=*/false, reader_properties)); - reader_builder.memory_pool(pool); - reader_builder.properties(arrow_reader_props); - - std::unique_ptr arrow_reader; - ARROW_ASSIGN_OR_RAISE(arrow_reader, reader_builder.Build()); - - 
std::shared_ptr<::arrow::RecordBatchReader> rb_reader; - auto row_ranges_map = std::make_shared>(); - - std::vector ranges; - for (int64_t i = 0; i < 50; i++) { - if (i % 2 == 0) - ranges.push_back({i, i}); - } - row_ranges_map->insert({0, std::make_shared(ranges)}); - - - ARROW_RETURN_NOT_OK(arrow_reader->GetRecordBatchReader({0,1},{0,1},row_ranges_map,&rb_reader)); - - size_t total_rows = 0; - size_t total_values = 0; - for (arrow::Result> maybe_batch: *rb_reader) { - // Operate on each batch... - auto batch = maybe_batch.ValueOrDie(); - total_rows += batch->num_rows(); - std::cout << "batch size: " << batch->num_rows() << std::endl; - - auto int_array = std::dynamic_pointer_cast(batch->column(1)); - for (auto iter = int_array->begin(); iter != int_array->end(); ++iter) { - total_values += (*iter).value(); - } - } - std::cout << "total rows is : " << total_rows << std::endl; - std::cout << "total value of y is : " << total_values << std::endl; - return arrow::Status::OK(); +arrow::Status ReadFullFile(std::string path_to_file) { + // #include "arrow/io/api.h" + // #include "arrow/parquet/arrow/reader.h" + + arrow::MemoryPool* pool = arrow::default_memory_pool(); + std::shared_ptr input; + ARROW_ASSIGN_OR_RAISE(input, arrow::io::ReadableFile::Open(path_to_file)); + + // Open Parquet file reader + std::unique_ptr arrow_reader; + ARROW_RETURN_NOT_OK(parquet::arrow::OpenFile(input, pool, &arrow_reader)); + + // Read entire file as a single Arrow table + std::shared_ptr table; + ARROW_RETURN_NOT_OK(arrow_reader->ReadTable(&table)); + return arrow::Status::OK(); } +arrow::Status ReadInBatches(std::string path_to_file) { + // #include "arrow/io/api.h" + // #include "arrow/parquet/arrow/reader.h" + + arrow::MemoryPool* pool = arrow::default_memory_pool(); + + // Configure general Parquet reader settings + auto reader_properties = parquet::ReaderProperties(pool); + reader_properties.set_buffer_size(4096 * 4); + reader_properties.enable_buffered_stream(); + + // Configure 
Arrow-specific Parquet reader settings + auto arrow_reader_props = parquet::ArrowReaderProperties(); + arrow_reader_props.set_batch_size(128 * 1024); // default 64 * 1024 + + parquet::arrow::FileReaderBuilder reader_builder; + ARROW_RETURN_NOT_OK( + reader_builder.OpenFile(path_to_file, /*memory_map=*/false, reader_properties)); + reader_builder.memory_pool(pool); + reader_builder.properties(arrow_reader_props); + + std::unique_ptr arrow_reader; + ARROW_ASSIGN_OR_RAISE(arrow_reader, reader_builder.Build()); + + std::shared_ptr<::arrow::RecordBatchReader> rb_reader; + ARROW_RETURN_NOT_OK(arrow_reader->GetRecordBatchReader(&rb_reader)); + + for (arrow::Result> maybe_batch : *rb_reader) { + // Operate on each batch... + } + return arrow::Status::OK(); +} arrow::Result> GetTable() { - auto builder = arrow::Int32Builder(); - - std::shared_ptr arr_a_values; - std::shared_ptr arr_a_offsets; - std::vector a_values; - for (int i = 0; i < 100; ++i) { - for (int j = 0; j < 3; ++j) { - a_values.push_back(i); - } - } - ARROW_RETURN_NOT_OK(builder.AppendValues(a_values)); - ARROW_RETURN_NOT_OK(builder.Finish(&arr_a_values)); - std::vector offsets = arrow::internal::Iota(0, 101); - std::transform(offsets.begin(), offsets.end(), offsets.begin(), - [](int x) { return x * 3; }); - ARROW_RETURN_NOT_OK(builder.AppendValues(offsets)); - ARROW_RETURN_NOT_OK(builder.Finish(&arr_a_offsets)); - ARROW_ASSIGN_OR_RAISE(auto arr_a, - arrow::ListArray::FromArrays(*arr_a_offsets, *arr_a_values)); - - std::shared_ptr arr_b; - ARROW_RETURN_NOT_OK(builder.AppendValues(arrow::internal::Iota(0, 100))); - ARROW_RETURN_NOT_OK(builder.Finish(&arr_b)); - - auto string_builder = arrow::StringBuilder(); - std::shared_ptr arr_c; - std::vector strs; - for (size_t i = 0; i < 100; i++) { - strs.push_back("" + std::to_string(i)); - } - ARROW_RETURN_NOT_OK(string_builder.AppendValues(strs)); - ARROW_RETURN_NOT_OK(string_builder.Finish(&arr_c)); - - auto schema = arrow::schema({ - arrow::field("a", 
arrow::list(arrow::int32())), - arrow::field("b", arrow::int32()), - arrow::field("c", arrow::utf8()), - }); - - return arrow::Table::Make(schema, {arr_a, arr_b, arr_c}); + auto builder = arrow::Int32Builder(); + + std::shared_ptr arr_x; + ARROW_RETURN_NOT_OK(builder.AppendValues({1, 3, 5, 7, 1})); + ARROW_RETURN_NOT_OK(builder.Finish(&arr_x)); + + std::shared_ptr arr_y; + ARROW_RETURN_NOT_OK(builder.AppendValues({2, 4, 6, 8, 10})); + ARROW_RETURN_NOT_OK(builder.Finish(&arr_y)); + + auto schema = arrow::schema( + {arrow::field("x", arrow::int32()), arrow::field("y", arrow::int32())}); + + return arrow::Table::Make(schema, {arr_x, arr_y}); } arrow::Result> GetRBR() { - ARROW_ASSIGN_OR_RAISE(std::shared_ptr table, GetTable()); - auto reader = std::make_shared(table); - reader->set_chunksize(10); - return reader; + ARROW_ASSIGN_OR_RAISE(std::shared_ptr table, GetTable()); + auto reader = std::make_shared(table); + reader->set_chunksize(10); + return reader; } arrow::Status WriteFullFile(std::string path_to_file) { - using parquet::ArrowWriterProperties; - using parquet::WriterProperties; + // #include "parquet/arrow/writer.h" + // #include "arrow/util/type_fwd.h" + using parquet::ArrowWriterProperties; + using parquet::WriterProperties; - ARROW_ASSIGN_OR_RAISE(std::shared_ptr table, GetTable()); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr table, GetTable()); - // Choose compression - std::shared_ptr props = - WriterProperties::Builder().max_row_group_length(50)->enable_write_page_index()->write_batch_size(13) - ->data_pagesize(1) // this will cause every batch creating a page - ->compression(arrow::Compression::SNAPPY)->build(); - std::cout << "hello" << std::endl; + // Choose compression + std::shared_ptr props = + WriterProperties::Builder().compression(arrow::Compression::SNAPPY)->build(); - // Opt to store Arrow schema for easier reads back into Arrow - std::shared_ptr arrow_props = - ArrowWriterProperties::Builder().store_schema()->build(); + // Opt to store Arrow 
schema for easier reads back into Arrow + std::shared_ptr arrow_props = + ArrowWriterProperties::Builder().store_schema()->build(); - std::shared_ptr outfile; - ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open(path_to_file)); + std::shared_ptr outfile; + ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open(path_to_file)); - ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table.get(), - arrow::default_memory_pool(), outfile, - /*chunk_size=*/100, props, arrow_props)); - return arrow::Status::OK(); + ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table.get(), + arrow::default_memory_pool(), outfile, + /*chunk_size=*/3, props, arrow_props)); + return arrow::Status::OK(); } arrow::Status WriteInBatches(std::string path_to_file) { - // #include "parquet/arrow/writer.h" - // #include "arrow/util/type_fwd.h" - using parquet::ArrowWriterProperties; - using parquet::WriterProperties; - - // Data is in RBR - std::shared_ptr batch_stream; - ARROW_ASSIGN_OR_RAISE(batch_stream, GetRBR()); - - // Choose compression - std::shared_ptr props = - WriterProperties::Builder().compression(arrow::Compression::SNAPPY)->build(); - - // Opt to store Arrow schema for easier reads back into Arrow - std::shared_ptr arrow_props = - ArrowWriterProperties::Builder().store_schema()->build(); - - // Create a writer - std::shared_ptr outfile; - ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open(path_to_file)); - std::unique_ptr writer; - ARROW_ASSIGN_OR_RAISE( - writer, parquet::arrow::FileWriter::Open(*batch_stream->schema().get(), - arrow::default_memory_pool(), outfile, - props, arrow_props)); - - // Write each batch as a row_group - for (arrow::Result> maybe_batch: *batch_stream) { - ARROW_ASSIGN_OR_RAISE(auto batch, maybe_batch); - ARROW_ASSIGN_OR_RAISE(auto table, - arrow::Table::FromRecordBatches(batch->schema(), {batch})); - ARROW_RETURN_NOT_OK(writer->WriteTable(*table.get(), batch->num_rows())); - } - - // Write file footer and close - 
ARROW_RETURN_NOT_OK(writer->Close()); - - return arrow::Status::OK(); + // #include "parquet/arrow/writer.h" + // #include "arrow/util/type_fwd.h" + using parquet::ArrowWriterProperties; + using parquet::WriterProperties; + + // Data is in RBR + std::shared_ptr batch_stream; + ARROW_ASSIGN_OR_RAISE(batch_stream, GetRBR()); + + // Choose compression + std::shared_ptr props = + WriterProperties::Builder().compression(arrow::Compression::SNAPPY)->build(); + + // Opt to store Arrow schema for easier reads back into Arrow + std::shared_ptr arrow_props = + ArrowWriterProperties::Builder().store_schema()->build(); + + // Create a writer + std::shared_ptr outfile; + ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open(path_to_file)); + std::unique_ptr writer; + ARROW_ASSIGN_OR_RAISE( + writer, parquet::arrow::FileWriter::Open(*batch_stream->schema().get(), + arrow::default_memory_pool(), outfile, + props, arrow_props)); + + // Write each batch as a row_group + for (arrow::Result> maybe_batch : *batch_stream) { + ARROW_ASSIGN_OR_RAISE(auto batch, maybe_batch); + ARROW_ASSIGN_OR_RAISE(auto table, + arrow::Table::FromRecordBatches(batch->schema(), {batch})); + ARROW_RETURN_NOT_OK(writer->WriteTable(*table.get(), batch->num_rows())); + } + + // Write file footer and close + ARROW_RETURN_NOT_OK(writer->Close()); + + return arrow::Status::OK(); } arrow::Status RunExamples(std::string path_to_file) { - ARROW_RETURN_NOT_OK(WriteFullFile(path_to_file)); - // ARROW_RETURN_NOT_OK(ReadFullFile(path_to_file)); - // ARROW_RETURN_NOT_OK(WriteInBatches(path_to_file)); - // ARROW_RETURN_NOT_OK(ReadInBatches(path_to_file)); - return arrow::Status::OK(); + ARROW_RETURN_NOT_OK(WriteFullFile(path_to_file)); + ARROW_RETURN_NOT_OK(ReadFullFile(path_to_file)); + ARROW_RETURN_NOT_OK(WriteInBatches(path_to_file)); + ARROW_RETURN_NOT_OK(ReadInBatches(path_to_file)); + return arrow::Status::OK(); } int main(int argc, char** argv) { - if (argc != 2) { - // Fake success for CI purposes. 
- return EXIT_SUCCESS; - } - - std::string path_to_file = argv[1]; - arrow::Status status = RunExamples(path_to_file); - - if (!status.ok()) { - std::cerr << "Error occurred: " << status.message() << std::endl; - return EXIT_FAILURE; - } + if (argc != 2) { + // Fake success for CI purposes. return EXIT_SUCCESS; + } + + std::string path_to_file = argv[1]; + arrow::Status status = RunExamples(path_to_file); + + if (!status.ok()) { + std::cerr << "Error occurred: " << status.message() << std::endl; + return EXIT_FAILURE; + } + return EXIT_SUCCESS; } diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 06be0da74aa6..0b947af762b2 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -354,7 +354,7 @@ add_parquet_test(reader-test level_conversion_test.cc column_scanner_test.cc reader_test.cc - filtered_reader_test.cc + range_reader_test.cc stream_reader_test.cc test_util.cc) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 93b4089ef68e..52a0d36412d7 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -19,10 +19,8 @@ #include -#include #include #include -#include #include #include #include diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h index 0fd35349b643..0cd8f298d79d 100644 --- a/cpp/src/parquet/arrow/reader.h +++ b/cpp/src/parquet/arrow/reader.h @@ -23,8 +23,8 @@ #include #include -#include "parquet/file_reader.h" #include "parquet/column_reader.h" +#include "parquet/file_reader.h" #include "parquet/platform.h" #include "parquet/properties.h" @@ -205,10 +205,10 @@ class PARQUET_EXPORT FileReader { /// /// \returns error Status if either row_group_indices or column_indices /// contains an invalid index - ::arrow::Status GetRecordBatchReader(const std::vector& row_group_indices, - const std::vector& column_indices, - const std::shared_ptr>& row_ranges_map, - std::shared_ptr<::arrow::RecordBatchReader>* out); + 
::arrow::Status GetRecordBatchReader( + const std::vector& row_group_indices, const std::vector& column_indices, + const std::shared_ptr>& row_ranges_map, + std::shared_ptr<::arrow::RecordBatchReader>* out); ::arrow::Status GetRecordBatchReader(const std::vector& row_group_indices, const std::vector& column_indices, std::shared_ptr<::arrow::RecordBatchReader>* out); diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 5187ef94aa9c..b517ee7c798e 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -1636,7 +1636,7 @@ class TypedRecordReader : public TypedColumnReaderImpl, // and there is not read-ahead for levels. int64_t skipped_records = 0; if (this->max_rep_level_ == 0 && this->max_def_level_ == 0) { - skipped_records = this->Skip(num_records); + skipped_records = this->Skip(num_records); current_rg_processed_records += skipped_records; return skipped_records; } @@ -1999,7 +1999,6 @@ class TypedRecordReader : public TypedColumnReaderImpl, while (true) { const auto advise = skipper->advise_next(current_rg_processed_records); - std::cout << "advise got after current_rg_processed_records: " << current_rg_processed_records << " is: " << advise < #include #include #include @@ -355,9 +354,7 @@ class RowRanges { explicit RowRanges(const Range& range) { ranges.push_back(range); } - RowRanges(const std::vector& ranges) { - this->ranges = ranges; - } + RowRanges(const std::vector& ranges) { this->ranges = ranges; } // copy cstr RowRanges(const RowRanges& other) { ranges = other.ranges; } diff --git a/cpp/src/parquet/filtered_reader_test.cc b/cpp/src/parquet/range_reader_test.cc similarity index 99% rename from cpp/src/parquet/filtered_reader_test.cc rename to cpp/src/parquet/range_reader_test.cc index 018e2580f254..835c5e7fe1e2 100644 --- a/cpp/src/parquet/filtered_reader_test.cc +++ b/cpp/src/parquet/range_reader_test.cc @@ -147,7 +147,7 @@ arrow::Result> WriteFullFile( // { // // output to a local file 
for debugging // ARROW_ASSIGN_OR_RAISE(auto outfile, arrow::io::FileOutputStream::Open( - // "/tmp/filtered_reader_test.parquet")); + // "/tmp/range_reader_test.parquet")); // // ARROW_RETURN_NOT_OK( // parquet::arrow::WriteTable(*table.get(), arrow::default_memory_pool(), outfile, diff --git a/cpp/src/parquet/reader_test.cc b/cpp/src/parquet/reader_test.cc index a9adcdf5b9c3..0a73002846ad 100644 --- a/cpp/src/parquet/reader_test.cc +++ b/cpp/src/parquet/reader_test.cc @@ -1457,6 +1457,3 @@ TEST(PageIndexReaderTest, ReadFileWithoutPageIndex) { } } // namespace parquet - - -//TODO: TEST_P ,enable dictionary \ No newline at end of file From f11d6a87593508d8e729e1998affd3614ad187a1 Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Thu, 23 Nov 2023 21:32:05 +0800 Subject: [PATCH 08/25] happy path pass 8 --- cpp/src/parquet/column_reader.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index aee4d45ef835..40b734d33ea0 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -503,17 +503,19 @@ namespace internal { class PARQUET_EXPORT RecordSkipper { public: - RecordSkipper(RowRanges& pages, RowRanges& row_ranges_) : row_ranges(row_ranges_) { + RecordSkipper(RowRanges& pages, RowRanges& row_ranges_) + : row_ranges(row_ranges_) { // copy row_ranges RowRanges will_process_pages, skip_pages; for (auto& page : pages.getRanges()) { - if (row_ranges.isOverlapping(page)) { - // will_process_pages.add(page); - } else { + if (!row_ranges.isOverlapping(page)) { skip_pages.add(page, false); } } + + /// Since the skipped pages will be slienly skipped without updating + /// current_rg_processed_records or records_read_, we need to pre-process the row + /// ranges as if these skipped pages never existed adjust_ranges(skip_pages, row_ranges); - // adjust_ranges(skip_pages, will_process_pages); total_rows_to_process = pages.rowCount() - skip_pages.rowCount(); 
} @@ -547,16 +549,12 @@ class PARQUET_EXPORT RecordSkipper { private: /// Keep copy of ranges, because advise_next() will modify them - // RowRanges will_process_pages; RowRanges row_ranges; size_t row_range_idx = 0; size_t total_rows_to_process = 0; - /// Since the skipped pages will be slienly skipped without updating - /// current_rg_processed_records or records_read_, we need to pre-process the row ranges - /// as if these skipped pages never existed void adjust_ranges(RowRanges& skip_pages, RowRanges& to_adjust) { size_t skipped_rows = 0; auto iter = to_adjust.getRanges().begin(); From 53ea5e5d6dd2ec9a140288cd4b0a68f354ecb24e Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Wed, 29 Nov 2023 16:01:37 +0800 Subject: [PATCH 09/25] refine emtpy logic --- cpp/src/parquet/arrow/reader.cc | 37 ++++++++++++++-------------- cpp/src/parquet/column_reader.h | 9 ++++--- cpp/src/parquet/range_reader_test.cc | 26 ++++++++++++++----- 3 files changed, 44 insertions(+), 28 deletions(-) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 52a0d36412d7..10c731a6a8b9 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -469,9 +469,7 @@ struct RowRangesPageFilter { explicit RowRangesPageFilter(const RowRangesPtr& row_ranges_, const RowRangesPtr& page_ranges_) : row_ranges(row_ranges_), page_ranges(page_ranges_) { - assert(row_ranges != nullptr); assert(page_ranges != nullptr); - assert(row_ranges->getRanges().size() > 0); assert(page_ranges->getRanges().size() > 0); } @@ -568,7 +566,7 @@ class LeafReader : public ColumnReaderImpl { void checkAndGetPageRanges(const std::shared_ptr& row_ranges, std::shared_ptr& page_ranges) { // check offset exists - auto rg_pg_index_reader = + const auto rg_pg_index_reader = ctx_->reader->GetPageIndexReader()->RowGroup(input_->current_row_group()); if (!rg_pg_index_reader) { @@ -577,7 +575,7 @@ class LeafReader : public ColumnReaderImpl { "Group: " + 
std::to_string(input_->current_row_group())); } - auto offset_index = rg_pg_index_reader->GetOffsetIndex(input_->column_index()); + const auto offset_index = rg_pg_index_reader->GetOffsetIndex(input_->column_index()); if (!offset_index) { throw ParquetException( @@ -627,21 +625,24 @@ class LeafReader : public ColumnReaderImpl { // if specific row range is provided for this rg if (const auto iter = ctx_->row_ranges_map->find(input_->current_row_group()); iter != ctx_->row_ranges_map->end()) { - std::shared_ptr page_ranges; - checkAndGetPageRanges(iter->second, page_ranges); - - // part 1, skip decompressing & decoding unnecessary pages - page_reader->set_data_page_filter(RowRangesPageFilter(iter->second, page_ranges)); - - // part 2, skip unnecessary rows in necessary pages - record_reader_->set_record_skipper( - std::make_shared(*page_ranges, - *iter->second)); - } else { - // If row_ranges_map exists but no row_ranges is found for this RG, skip this RG - NextRowGroup(); - return; + if (iter->second != nullptr && iter->second->rowCount() != 0) { + std::shared_ptr page_ranges; + checkAndGetPageRanges(iter->second, page_ranges); + + // part 1, skip decompressing & decoding unnecessary pages + page_reader->set_data_page_filter( + RowRangesPageFilter(iter->second, page_ranges)); + + // part 2, skip unnecessary rows in necessary pages + record_reader_->set_record_skipper( + std::make_shared(*page_ranges, + *iter->second)); + } else { + NextRowGroup(); + return; + } } + // Else iff row_ranges_map exists but no row_ranges is found for this RG key, this RG will be read } record_reader_->reset_current_rg_processed_records(); diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 40b734d33ea0..dde78d5115c3 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -453,9 +453,7 @@ class RowRanges { } bool isValid() const { - if (ranges.size() == 0) { - return false; - } + if (ranges.size() == 0) return true; if 
(ranges[0].from < 0) { return false; } @@ -481,7 +479,10 @@ class RowRanges { std::vector& getRanges() { return ranges; } - const Range& operator[](size_t index) const { return ranges[index]; } + const Range& operator[](size_t index) const { + assert(index < ranges.size()); + return ranges[index]; + } std::string toString() const { std::string result = "["; diff --git a/cpp/src/parquet/range_reader_test.cc b/cpp/src/parquet/range_reader_test.cc index 835c5e7fe1e2..7a7c7e001bb7 100644 --- a/cpp/src/parquet/range_reader_test.cc +++ b/cpp/src/parquet/range_reader_test.cc @@ -292,18 +292,25 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectAllRange) { TEST_F(TestRecordBatchReaderWithRanges, SelectEmptyRange) { std::shared_ptr rb_reader; const auto row_ranges_map = std::make_shared>(); + // here we test four kinds of empty range: + + // rg 0 not put into map -> will read + row_ranges_map->insert({1, nullptr}); // value is nullptr -> will skip row_ranges_map->insert( - {0, std::make_shared(std::vector())}); + {2, std::make_shared( + std::vector())}); // value is empty -> will skip + row_ranges_map->insert({3, std::make_shared()}); // value is empty -> will skip + const std::vector column_indices{0, 1, 2, 3, 4}; const auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, row_ranges_map, &rb_reader); - ASSERT_NOT_OK(status); - EXPECT_TRUE(status.message().find("The provided row range is invalid, keep it monotone " - "and non-interleaving: []") != std::string::npos); + ASSERT_OK(status); + // (0+...29) = 435 + check_rb(rb_reader, 30, 435); } TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { - // case 1: row_ranges_map contains only RG {0}, other RGs should be skipped + // case 1: only care about RG 0 { std::shared_ptr rb_reader; const auto row_ranges_map = std::make_shared>(); @@ -312,6 +319,9 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { if (i % 2 == 0) ranges.push_back({i, i}); } row_ranges_map->insert({0, 
std::make_shared(ranges)}); + row_ranges_map->insert({1, nullptr}); + row_ranges_map->insert({2, nullptr}); + row_ranges_map->insert({3, nullptr}); const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, row_ranges_map, &rb_reader)); @@ -319,7 +329,7 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { check_rb(rb_reader, 15, 210); // 0 + 2 + ... + 28 = 210 } - // case 2: row_ranges_map contains only RG {0,2}, other RGs should be skipped + // case 2: care about RG 0 and 2 { std::shared_ptr rb_reader; const auto row_ranges_map = std::make_shared>(); @@ -328,7 +338,9 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { if (i % 2 == 0) ranges.push_back({i, i}); } row_ranges_map->insert({0, std::make_shared(ranges)}); + row_ranges_map->insert({1, nullptr}); row_ranges_map->insert({2, std::make_shared(ranges)}); + row_ranges_map->insert({3, nullptr}); const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, row_ranges_map, &rb_reader)); @@ -465,7 +477,9 @@ TEST_F(TestRecordBatchReaderWithRangesWithNulls, SelectOneRowSkipOneRow) { if (i % 2 == 0) ranges.push_back({i, i}); } row_ranges_map->insert({0, std::make_shared(ranges)}); + row_ranges_map->insert({1, nullptr}); row_ranges_map->insert({2, std::make_shared(ranges)}); + row_ranges_map->insert({3, nullptr}); const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, row_ranges_map, &rb_reader)); From dca69af1b5b61ae93cc992dfda1631ebc4edf694 Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Thu, 30 Nov 2023 12:18:27 +0800 Subject: [PATCH 10/25] fix bug --- cpp/src/parquet/arrow/reader.cc | 17 ++++----- cpp/src/parquet/range_reader_test.cc | 57 +++++++++++++++++++++++----- 2 files changed, 55 insertions(+), 19 deletions(-) diff --git a/cpp/src/parquet/arrow/reader.cc 
b/cpp/src/parquet/arrow/reader.cc index 10c731a6a8b9..1606c60d64e3 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -476,22 +476,18 @@ struct RowRangesPageFilter { bool operator()(const DataPageStats& stats) { ++page_range_idx; - if (row_range_idx >= row_ranges->getRanges().size()) { - return true; - } - Range current_page_range = (*page_ranges)[page_range_idx]; - if (current_page_range.isBefore((*row_ranges)[row_range_idx])) { - return true; - } - while (row_range_idx < row_ranges->getRanges().size() && current_page_range.isAfter((*row_ranges)[row_range_idx])) { row_range_idx++; } - return row_range_idx >= row_ranges->getRanges().size(); + if (row_range_idx >= row_ranges->getRanges().size()) { + return true; + } + + return current_page_range.isBefore((*row_ranges)[row_range_idx]); } size_t row_range_idx = 0; @@ -642,7 +638,8 @@ class LeafReader : public ColumnReaderImpl { return; } } - // Else iff row_ranges_map exists but no row_ranges is found for this RG key, this RG will be read + // Else iff row_ranges_map exists but no row_ranges is found for this RG key, this + // RG will be read } record_reader_->reset_current_rg_processed_records(); diff --git a/cpp/src/parquet/range_reader_test.cc b/cpp/src/parquet/range_reader_test.cc index 7a7c7e001bb7..5bccaaa0c0f6 100644 --- a/cpp/src/parquet/range_reader_test.cc +++ b/cpp/src/parquet/range_reader_test.cc @@ -30,6 +30,22 @@ #include #include +#include +#include + +std::string random_string(std::string::size_type length) { + static auto& chrs = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; + + static std::mt19937 rg{std::random_device{}()}; + static std::uniform_int_distribution pick(0, sizeof(chrs) - 2); + + std::string s; + s.reserve(length); + while (length--) s += chrs[pick(rg)]; + + return s; +} + /// The table looks like (with_nulls = false): // { // { a: {x: 0, y: 0}, b: {0, 0, 0}, c: "0", d: 0}, @@ -97,7 +113,8 @@ arrow::Result> GetTable(bool with_nulls = false) { std::vector strs; uint8_t 
valid_bytes[100]; for (size_t i = 0; i < 100; i++) { - strs.push_back(std::to_string(i)); + // add more chars to make this column unaligned with other columns' page + strs.push_back(std::to_string(i) + random_string(20)); valid_bytes[i] = flags[i]; } ARROW_RETURN_NOT_OK(string_builder.AppendValues(strs, &valid_bytes[0])); @@ -130,8 +147,9 @@ arrow::Result> WriteFullFile( WriterProperties::Builder() .max_row_group_length(30) ->enable_write_page_index() - ->write_batch_size(13) - ->data_pagesize(1) // this will cause every batch creating a page + ->disable_dictionary() + ->write_batch_size(1) + ->data_pagesize(30) // small pages ->compression(arrow::Compression::SNAPPY) ->build(); @@ -203,7 +221,9 @@ void check_rb(std::shared_ptr rb_reader, auto c_array = std::dynamic_pointer_cast(batch->GetColumnByName("c")); for (auto iter = c_array->begin(); iter != c_array->end(); ++iter) { - sum_c += std::stoi(std::string((*iter).has_value() ? (*iter).value() : "0")); + sum_c += std::stoi(std::string( + (*iter).has_value() ? 
(*iter).value().substr(0, (*iter).value().size() - 20) + : "0")); } } @@ -252,7 +272,7 @@ class TestRecordBatchReaderWithRanges : public testing::Test { std::unique_ptr arrow_reader; }; -TEST_F(TestRecordBatchReaderWithRanges, SelectSomePageForEachRG) { +TEST_F(TestRecordBatchReaderWithRanges, SelectOnePageForEachRG) { std::shared_ptr rb_reader; const auto row_ranges_map = std::make_shared>(); row_ranges_map->insert({0, std::make_shared(parquet::Range{0, 9})}); @@ -270,6 +290,24 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectSomePageForEachRG) { check_rb(rb_reader, 40, 2280); } +TEST_F(TestRecordBatchReaderWithRanges, SelectSomePageForOneRG) { + std::shared_ptr rb_reader; + const auto row_ranges_map = std::make_shared>(); + row_ranges_map->insert( + {0, std::make_shared( + std::vector{parquet::Range{0, 7}, parquet::Range{16, 23}})}); + row_ranges_map->insert({1, nullptr}); + row_ranges_map->insert({2, nullptr}); + row_ranges_map->insert({3, nullptr}); + + const std::vector column_indices{0, 1, 2, 3, 4}; + ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, + row_ranges_map, &rb_reader)); + + // (0+...+7) + (16+...+23) = 184 + check_rb(rb_reader, 16, 184); +} + TEST_F(TestRecordBatchReaderWithRanges, SelectAllRange) { std::shared_ptr rb_reader; const auto row_ranges_map = std::make_shared>(); @@ -299,7 +337,8 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectEmptyRange) { row_ranges_map->insert( {2, std::make_shared( std::vector())}); // value is empty -> will skip - row_ranges_map->insert({3, std::make_shared()}); // value is empty -> will skip + row_ranges_map->insert( + {3, std::make_shared()}); // value is empty -> will skip const std::vector column_indices{0, 1, 2, 3, 4}; const auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, @@ -384,9 +423,9 @@ TEST_F(TestRecordBatchReaderWithRanges, InvalidRanges) { const auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, row_ranges_map, 
&rb_reader); ASSERT_NOT_OK(status); - EXPECT_TRUE(status.message().find( - "The provided row range [(0, 30)] exceeds last page :[26, 29]") != - std::string::npos); + EXPECT_TRUE( + status.message().find("The provided row range [(0, 30)] exceeds last page :") != + std::string::npos); } } From 2c8b06872b3dd93a0d7bdba18aa101b279e7ff06 Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Mon, 1 Jan 2024 17:12:43 +0800 Subject: [PATCH 11/25] camel naming --- cpp/src/parquet/arrow/reader.cc | 28 +- cpp/src/parquet/column_reader.h | 1390 +++++++++++++++---------------- 2 files changed, 681 insertions(+), 737 deletions(-) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 1606c60d64e3..06e8b5bcf026 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -470,7 +470,7 @@ struct RowRangesPageFilter { const RowRangesPtr& page_ranges_) : row_ranges(row_ranges_), page_ranges(page_ranges_) { assert(page_ranges != nullptr); - assert(page_ranges->getRanges().size() > 0); + assert(page_ranges->GetRanges().size() > 0); } bool operator()(const DataPageStats& stats) { @@ -478,16 +478,16 @@ struct RowRangesPageFilter { Range current_page_range = (*page_ranges)[page_range_idx]; - while (row_range_idx < row_ranges->getRanges().size() && - current_page_range.isAfter((*row_ranges)[row_range_idx])) { + while (row_range_idx < row_ranges->GetRanges().size() && + current_page_range.IsAfter((*row_ranges)[row_range_idx])) { row_range_idx++; } - if (row_range_idx >= row_ranges->getRanges().size()) { + if (row_range_idx >= row_ranges->GetRanges().size()) { return true; } - return current_page_range.isBefore((*row_ranges)[row_range_idx]); + return current_page_range.IsBefore((*row_ranges)[row_range_idx]); } size_t row_range_idx = 0; @@ -580,32 +580,32 @@ class LeafReader : public ColumnReaderImpl { field_->name()); } - if (!row_ranges->isValid()) { + if (!row_ranges->IsValid()) { throw ParquetException( "The provided row range is invalid, 
keep it monotone and non-interleaving: " + - row_ranges->toString()); + row_ranges->ToString()); } const auto page_locations = offset_index->page_locations(); page_ranges = std::make_shared(); for (size_t i = 0; i < page_locations.size() - 1; i++) { - page_ranges->add( + page_ranges->Add( {page_locations[i].first_row_index, page_locations[i + 1].first_row_index - 1}, false); } if (page_locations.size() >= 1) { - page_ranges->add( + page_ranges->Add( {page_locations[page_locations.size() - 1].first_row_index, ctx_->reader->metadata()->RowGroup(input_->current_row_group())->num_rows() - 1}, false); } - if (row_ranges->getRanges().size() > 0) { - if ((*row_ranges).getRanges().back().to > page_ranges->getRanges().back().to) { + if (row_ranges->GetRanges().size() > 0) { + if ((*row_ranges).GetRanges().back().to > page_ranges->GetRanges().back().to) { throw ParquetException( - "The provided row range " + row_ranges->toString() + - " exceeds last page :" + page_ranges->getRanges().back().toString()); + "The provided row range " + row_ranges->ToString() + + " exceeds last page :" + page_ranges->GetRanges().back().ToString()); } } } @@ -621,7 +621,7 @@ class LeafReader : public ColumnReaderImpl { // if specific row range is provided for this rg if (const auto iter = ctx_->row_ranges_map->find(input_->current_row_group()); iter != ctx_->row_ranges_map->end()) { - if (iter->second != nullptr && iter->second->rowCount() != 0) { + if (iter->second != nullptr && iter->second->RowCount() != 0) { std::shared_ptr page_ranges; checkAndGetPageRanges(iter->second, page_ranges); diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index dde78d5115c3..0c81087a3770 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -32,736 +32,680 @@ #include "parquet/types.h" namespace arrow { + class Array; + class ChunkedArray; -class Array; -class ChunkedArray; + namespace bit_util { + class BitReader; + } // namespace bit_util -namespace 
bit_util { -class BitReader; -} // namespace bit_util + namespace util { + class RleDecoder; + } // namespace util +} // namespace arrow -namespace util { -class RleDecoder; -} // namespace util +namespace parquet { + class Decryptor; + class Page; + + // 16 MB is the default maximum page header size + static constexpr uint32_t kDefaultMaxPageHeaderSize = 16 * 1024 * 1024; + + // 16 KB is the default expected page header size + static constexpr uint32_t kDefaultPageHeaderSize = 16 * 1024; + + // \brief DataPageStats stores encoded statistics and number of values/rows for + // a page. + struct PARQUET_EXPORT DataPageStats { + DataPageStats(const EncodedStatistics* encoded_statistics, int32_t num_values, + std::optional num_rows) + : encoded_statistics(encoded_statistics), + num_values(num_values), + num_rows(num_rows) { + } -} // namespace arrow + // Encoded statistics extracted from the page header. + // Nullptr if there are no statistics in the page header. + const EncodedStatistics* encoded_statistics; + // Number of values stored in the page. Filled for both V1 and V2 data pages. + // For repeated fields, this can be greater than number of rows. For + // non-repeated fields, this will be the same as the number of rows. + int32_t num_values; + // Number of rows stored in the page. std::nullopt if not available. 
+ std::optional num_rows; + }; + + class PARQUET_EXPORT LevelDecoder { + public: + LevelDecoder(); + + ~LevelDecoder(); + + // Initialize the LevelDecoder state with new data + // and return the number of bytes consumed + int SetData(Encoding::type encoding, int16_t max_level, int num_buffered_values, + const uint8_t* data, int32_t data_size); + + void SetDataV2(int32_t num_bytes, int16_t max_level, int num_buffered_values, + const uint8_t* data); + + // Decodes a batch of levels into an array and returns the number of levels decoded + int Decode(int batch_size, int16_t* levels); + + private: + int bit_width_; + int num_values_remaining_; + Encoding::type encoding_; + std::unique_ptr<::arrow::util::RleDecoder> rle_decoder_; + std::unique_ptr<::arrow::bit_util::BitReader> bit_packed_decoder_; + int16_t max_level_; + }; + + struct CryptoContext { + CryptoContext(bool start_with_dictionary_page, int16_t rg_ordinal, int16_t col_ordinal, + std::shared_ptr meta, std::shared_ptr data) + : start_decrypt_with_dictionary_page(start_with_dictionary_page), + row_group_ordinal(rg_ordinal), + column_ordinal(col_ordinal), + meta_decryptor(std::move(meta)), + data_decryptor(std::move(data)) { + } -namespace parquet { + CryptoContext() { + } + + bool start_decrypt_with_dictionary_page = false; + int16_t row_group_ordinal = -1; + int16_t column_ordinal = -1; + std::shared_ptr meta_decryptor; + std::shared_ptr data_decryptor; + }; + + // Abstract page iterator interface. 
This way, we can feed column pages to the + // ColumnReader through whatever mechanism we choose + class PARQUET_EXPORT PageReader { + using DataPageFilter = std::function; + + public: + virtual ~PageReader() = default; + + static std::unique_ptr Open( + std::shared_ptr stream, int64_t total_num_values, + Compression::type codec, bool always_compressed = false, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), + const CryptoContext* ctx = NULLPTR); + + static std::unique_ptr Open(std::shared_ptr stream, + int64_t total_num_values, + Compression::type codec, + const ReaderProperties&properties, + bool always_compressed = false, + const CryptoContext* ctx = NULLPTR); + + // If data_page_filter is present (not null), NextPage() will call the + // callback function exactly once per page in the order the pages appear in + // the column. If the callback function returns true the page will be + // skipped. The callback will be called only if the page type is DATA_PAGE or + // DATA_PAGE_V2. Dictionary pages will not be skipped. + // Caller is responsible for checking that statistics are correct using + // ApplicationVersion::HasCorrectStatistics(). + // \note API EXPERIMENTAL + void set_data_page_filter(DataPageFilter data_page_filter) { + data_page_filter_ = std::move(data_page_filter); + } + + // @returns: shared_ptr(nullptr) on EOS, std::shared_ptr + // containing new Page otherwise + // + // The returned Page may contain references that aren't guaranteed to live + // beyond the next call to NextPage(). + virtual std::shared_ptr NextPage() = 0; + + virtual void set_max_page_header_size(uint32_t size) = 0; + + protected: + // Callback that decides if we should skip a page or not. 
+ DataPageFilter data_page_filter_; + }; + + class PARQUET_EXPORT ColumnReader { + public: + virtual ~ColumnReader() = default; + + static std::shared_ptr Make( + const ColumnDescriptor* descr, std::unique_ptr pager, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); + + // Returns true if there are still values in this column. + virtual bool HasNext() = 0; + + virtual Type::type type() const = 0; + + virtual const ColumnDescriptor* descr() const = 0; + + // Get the encoding that can be exposed by this reader. If it returns + // dictionary encoding, then ReadBatchWithDictionary can be used to read data. + // + // \note API EXPERIMENTAL + virtual ExposedEncoding GetExposedEncoding() = 0; + + protected: + friend class RowGroupReader; + // Set the encoding that can be exposed by this reader. + // + // \note API EXPERIMENTAL + virtual void SetExposedEncoding(ExposedEncoding encoding) = 0; + }; + + // API to read values from a single column. This is a main client facing API. + template + class TypedColumnReader : public ColumnReader { + public: + typedef typename DType::c_type T; + + // Read a batch of repetition levels, definition levels, and values from the + // column. + // + // Since null values are not stored in the values, the number of values read + // may be less than the number of repetition and definition levels. With + // nested data this is almost certainly true. + // + // Set def_levels or rep_levels to nullptr if you want to skip reading them. + // This is only safe if you know through some other source that there are no + // undefined values. + // + // To fully exhaust a row group, you must read batches until the number of + // values read reaches the number of stored values according to the metadata. 
+ // + // This API is the same for both V1 and V2 of the DataPage + // + // @returns: actual number of levels read (see values_read for number of values read) + virtual int64_t ReadBatch(int64_t batch_size, int16_t* def_levels, int16_t* rep_levels, + T* values, int64_t* values_read) = 0; + + /// Read a batch of repetition levels, definition levels, and values from the + /// column and leave spaces for null entries on the lowest level in the values + /// buffer. + /// + /// In comparison to ReadBatch the length of repetition and definition levels + /// is the same as of the number of values read for max_definition_level == 1. + /// In the case of max_definition_level > 1, the repetition and definition + /// levels are larger than the values but the values include the null entries + /// with definition_level == (max_definition_level - 1). + /// + /// To fully exhaust a row group, you must read batches until the number of + /// values read reaches the number of stored values according to the metadata. + /// + /// @param batch_size the number of levels to read + /// @param[out] def_levels The Parquet definition levels, output has + /// the length levels_read. + /// @param[out] rep_levels The Parquet repetition levels, output has + /// the length levels_read. + /// @param[out] values The values in the lowest nested level including + /// spacing for nulls on the lowest levels; output has the length + /// values_read. + /// @param[out] valid_bits Memory allocated for a bitmap that indicates if + /// the row is null or on the maximum definition level. For performance + /// reasons the underlying buffer should be able to store 1 bit more than + /// required. If this requires an additional byte, this byte is only read + /// but never written to. + /// @param valid_bits_offset The offset in bits of the valid_bits where the + /// first relevant bit resides. + /// @param[out] levels_read The number of repetition/definition levels that were read. 
+ /// @param[out] values_read The number of values read, this includes all + /// non-null entries as well as all null-entries on the lowest level + /// (i.e. definition_level == max_definition_level - 1) + /// @param[out] null_count The number of nulls on the lowest levels. + /// (i.e. (values_read - null_count) is total number of non-null entries) + /// + /// \deprecated Since 4.0.0 + ARROW_DEPRECATED("Doesn't handle nesting correctly and unused outside of unit tests.") + virtual int64_t ReadBatchSpaced(int64_t batch_size, int16_t* def_levels, + int16_t* rep_levels, T* values, uint8_t* valid_bits, + int64_t valid_bits_offset, int64_t* levels_read, + int64_t* values_read, int64_t* null_count) = 0; + + // Skip reading values. This method will work for both repeated and + // non-repeated fields. Note that this method is skipping values and not + // records. This distinction is important for repeated fields, meaning that + // we are not skipping over the values to the next record. For example, + // consider the following two consecutive records containing one repeated field: + // {[1, 2, 3]}, {[4, 5]}. If we Skip(2), our next read value will be 3, which + // is inside the first record. + // Returns the number of values skipped. + virtual int64_t Skip(int64_t num_values_to_skip) = 0; + + // Read a batch of repetition levels, definition levels, and indices from the + // column. And read the dictionary if a dictionary page is encountered during + // reading pages. This API is similar to ReadBatch(), with ability to read + // dictionary and indices. It is only valid to call this method when the reader can + // expose dictionary encoding. (i.e., the reader's GetExposedEncoding() returns + // DICTIONARY). + // + // The dictionary is read along with the data page. When there's no data page, + // the dictionary won't be returned. + // + // @param batch_size The batch size to read + // @param[out] def_levels The Parquet definition levels. 
+ // @param[out] rep_levels The Parquet repetition levels. + // @param[out] indices The dictionary indices. + // @param[out] indices_read The number of indices read. + // @param[out] dict The pointer to dictionary values. It will return nullptr if + // there's no data page. Each column chunk only has one dictionary page. The dictionary + // is owned by the reader, so the caller is responsible for copying the dictionary + // values before the reader gets destroyed. + // @param[out] dict_len The dictionary length. It will return 0 if there's no data + // page. + // @returns: actual number of levels read (see indices_read for number of + // indices read + // + // \note API EXPERIMENTAL + virtual int64_t ReadBatchWithDictionary(int64_t batch_size, int16_t* def_levels, + int16_t* rep_levels, int32_t* indices, + int64_t* indices_read, const T** dict, + int32_t* dict_len) = 0; + }; + + struct Range { + static Range UnionRange(const Range&left, const Range&right) { + if (left.from <= right.from) { + if (left.to + 1 >= right.from) { + return {left.from, std::max(left.to, right.to)}; + } + } + else if (right.to + 1 >= left.from) { + return {right.from, std::max(left.to, right.to)}; + } + return {-1, -1}; + } + + static Range Intersection(const Range&left, const Range&right) { + if (left.from <= right.from) { + if (left.to >= right.from) { + return {right.from, std::min(left.to, right.to)}; + } + } + else if (right.to >= left.from) { + return {left.from, std::min(left.to, right.to)}; + } + return {-1, -1}; // Return a default Range object if no intersection range found + } + + Range(const int64_t from_, const int64_t to_) : from(from_), to(to_) { + assert(from <= to); + } + + size_t Count() const { return to - from + 1; } + + bool IsBefore(const Range&other) const { return to < other.from; } -class Decryptor; -class Page; - -// 16 MB is the default maximum page header size -static constexpr uint32_t kDefaultMaxPageHeaderSize = 16 * 1024 * 1024; - -// 16 KB is the default 
expected page header size -static constexpr uint32_t kDefaultPageHeaderSize = 16 * 1024; - -// \brief DataPageStats stores encoded statistics and number of values/rows for -// a page. -struct PARQUET_EXPORT DataPageStats { - DataPageStats(const EncodedStatistics* encoded_statistics, int32_t num_values, - std::optional num_rows) - : encoded_statistics(encoded_statistics), - num_values(num_values), - num_rows(num_rows) {} - - // Encoded statistics extracted from the page header. - // Nullptr if there are no statistics in the page header. - const EncodedStatistics* encoded_statistics; - // Number of values stored in the page. Filled for both V1 and V2 data pages. - // For repeated fields, this can be greater than number of rows. For - // non-repeated fields, this will be the same as the number of rows. - int32_t num_values; - // Number of rows stored in the page. std::nullopt if not available. - std::optional num_rows; -}; - -class PARQUET_EXPORT LevelDecoder { - public: - LevelDecoder(); - ~LevelDecoder(); - - // Initialize the LevelDecoder state with new data - // and return the number of bytes consumed - int SetData(Encoding::type encoding, int16_t max_level, int num_buffered_values, - const uint8_t* data, int32_t data_size); - - void SetDataV2(int32_t num_bytes, int16_t max_level, int num_buffered_values, - const uint8_t* data); - - // Decodes a batch of levels into an array and returns the number of levels decoded - int Decode(int batch_size, int16_t* levels); - - private: - int bit_width_; - int num_values_remaining_; - Encoding::type encoding_; - std::unique_ptr<::arrow::util::RleDecoder> rle_decoder_; - std::unique_ptr<::arrow::bit_util::BitReader> bit_packed_decoder_; - int16_t max_level_; -}; - -struct CryptoContext { - CryptoContext(bool start_with_dictionary_page, int16_t rg_ordinal, int16_t col_ordinal, - std::shared_ptr meta, std::shared_ptr data) - : start_decrypt_with_dictionary_page(start_with_dictionary_page), - row_group_ordinal(rg_ordinal), - 
column_ordinal(col_ordinal), - meta_decryptor(std::move(meta)), - data_decryptor(std::move(data)) {} - CryptoContext() {} - - bool start_decrypt_with_dictionary_page = false; - int16_t row_group_ordinal = -1; - int16_t column_ordinal = -1; - std::shared_ptr meta_decryptor; - std::shared_ptr data_decryptor; -}; - -// Abstract page iterator interface. This way, we can feed column pages to the -// ColumnReader through whatever mechanism we choose -class PARQUET_EXPORT PageReader { - using DataPageFilter = std::function; - - public: - virtual ~PageReader() = default; - - static std::unique_ptr Open( - std::shared_ptr stream, int64_t total_num_values, - Compression::type codec, bool always_compressed = false, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), - const CryptoContext* ctx = NULLPTR); - static std::unique_ptr Open(std::shared_ptr stream, - int64_t total_num_values, - Compression::type codec, - const ReaderProperties& properties, - bool always_compressed = false, - const CryptoContext* ctx = NULLPTR); - - // If data_page_filter is present (not null), NextPage() will call the - // callback function exactly once per page in the order the pages appear in - // the column. If the callback function returns true the page will be - // skipped. The callback will be called only if the page type is DATA_PAGE or - // DATA_PAGE_V2. Dictionary pages will not be skipped. - // Caller is responsible for checking that statistics are correct using - // ApplicationVersion::HasCorrectStatistics(). - // \note API EXPERIMENTAL - void set_data_page_filter(DataPageFilter data_page_filter) { - data_page_filter_ = std::move(data_page_filter); - } - - // @returns: shared_ptr(nullptr) on EOS, std::shared_ptr - // containing new Page otherwise - // - // The returned Page may contain references that aren't guaranteed to live - // beyond the next call to NextPage(). 
- virtual std::shared_ptr NextPage() = 0; - - virtual void set_max_page_header_size(uint32_t size) = 0; - - protected: - // Callback that decides if we should skip a page or not. - DataPageFilter data_page_filter_; -}; - -class PARQUET_EXPORT ColumnReader { - public: - virtual ~ColumnReader() = default; - - static std::shared_ptr Make( - const ColumnDescriptor* descr, std::unique_ptr pager, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); - - // Returns true if there are still values in this column. - virtual bool HasNext() = 0; - - virtual Type::type type() const = 0; - - virtual const ColumnDescriptor* descr() const = 0; - - // Get the encoding that can be exposed by this reader. If it returns - // dictionary encoding, then ReadBatchWithDictionary can be used to read data. - // - // \note API EXPERIMENTAL - virtual ExposedEncoding GetExposedEncoding() = 0; - - protected: - friend class RowGroupReader; - // Set the encoding that can be exposed by this reader. - // - // \note API EXPERIMENTAL - virtual void SetExposedEncoding(ExposedEncoding encoding) = 0; -}; - -// API to read values from a single column. This is a main client facing API. -template -class TypedColumnReader : public ColumnReader { - public: - typedef typename DType::c_type T; - - // Read a batch of repetition levels, definition levels, and values from the - // column. - // - // Since null values are not stored in the values, the number of values read - // may be less than the number of repetition and definition levels. With - // nested data this is almost certainly true. - // - // Set def_levels or rep_levels to nullptr if you want to skip reading them. - // This is only safe if you know through some other source that there are no - // undefined values. - // - // To fully exhaust a row group, you must read batches until the number of - // values read reaches the number of stored values according to the metadata. 
- // - // This API is the same for both V1 and V2 of the DataPage - // - // @returns: actual number of levels read (see values_read for number of values read) - virtual int64_t ReadBatch(int64_t batch_size, int16_t* def_levels, int16_t* rep_levels, - T* values, int64_t* values_read) = 0; - - /// Read a batch of repetition levels, definition levels, and values from the - /// column and leave spaces for null entries on the lowest level in the values - /// buffer. - /// - /// In comparison to ReadBatch the length of repetition and definition levels - /// is the same as of the number of values read for max_definition_level == 1. - /// In the case of max_definition_level > 1, the repetition and definition - /// levels are larger than the values but the values include the null entries - /// with definition_level == (max_definition_level - 1). - /// - /// To fully exhaust a row group, you must read batches until the number of - /// values read reaches the number of stored values according to the metadata. - /// - /// @param batch_size the number of levels to read - /// @param[out] def_levels The Parquet definition levels, output has - /// the length levels_read. - /// @param[out] rep_levels The Parquet repetition levels, output has - /// the length levels_read. - /// @param[out] values The values in the lowest nested level including - /// spacing for nulls on the lowest levels; output has the length - /// values_read. - /// @param[out] valid_bits Memory allocated for a bitmap that indicates if - /// the row is null or on the maximum definition level. For performance - /// reasons the underlying buffer should be able to store 1 bit more than - /// required. If this requires an additional byte, this byte is only read - /// but never written to. - /// @param valid_bits_offset The offset in bits of the valid_bits where the - /// first relevant bit resides. - /// @param[out] levels_read The number of repetition/definition levels that were read. 
- /// @param[out] values_read The number of values read, this includes all - /// non-null entries as well as all null-entries on the lowest level - /// (i.e. definition_level == max_definition_level - 1) - /// @param[out] null_count The number of nulls on the lowest levels. - /// (i.e. (values_read - null_count) is total number of non-null entries) - /// - /// \deprecated Since 4.0.0 - ARROW_DEPRECATED("Doesn't handle nesting correctly and unused outside of unit tests.") - virtual int64_t ReadBatchSpaced(int64_t batch_size, int16_t* def_levels, - int16_t* rep_levels, T* values, uint8_t* valid_bits, - int64_t valid_bits_offset, int64_t* levels_read, - int64_t* values_read, int64_t* null_count) = 0; - - // Skip reading values. This method will work for both repeated and - // non-repeated fields. Note that this method is skipping values and not - // records. This distinction is important for repeated fields, meaning that - // we are not skipping over the values to the next record. For example, - // consider the following two consecutive records containing one repeated field: - // {[1, 2, 3]}, {[4, 5]}. If we Skip(2), our next read value will be 3, which - // is inside the first record. - // Returns the number of values skipped. - virtual int64_t Skip(int64_t num_values_to_skip) = 0; - - // Read a batch of repetition levels, definition levels, and indices from the - // column. And read the dictionary if a dictionary page is encountered during - // reading pages. This API is similar to ReadBatch(), with ability to read - // dictionary and indices. It is only valid to call this method when the reader can - // expose dictionary encoding. (i.e., the reader's GetExposedEncoding() returns - // DICTIONARY). - // - // The dictionary is read along with the data page. When there's no data page, - // the dictionary won't be returned. - // - // @param batch_size The batch size to read - // @param[out] def_levels The Parquet definition levels. 
- // @param[out] rep_levels The Parquet repetition levels. - // @param[out] indices The dictionary indices. - // @param[out] indices_read The number of indices read. - // @param[out] dict The pointer to dictionary values. It will return nullptr if - // there's no data page. Each column chunk only has one dictionary page. The dictionary - // is owned by the reader, so the caller is responsible for copying the dictionary - // values before the reader gets destroyed. - // @param[out] dict_len The dictionary length. It will return 0 if there's no data - // page. - // @returns: actual number of levels read (see indices_read for number of - // indices read - // - // \note API EXPERIMENTAL - virtual int64_t ReadBatchWithDictionary(int64_t batch_size, int16_t* def_levels, - int16_t* rep_levels, int32_t* indices, - int64_t* indices_read, const T** dict, - int32_t* dict_len) = 0; -}; - -struct Range { - static Range unionRange(const Range& left, const Range& right) { - if (left.from <= right.from) { - if (left.to + 1 >= right.from) { - return {left.from, std::max(left.to, right.to)}; - } - } else if (right.to + 1 >= left.from) { - return {right.from, std::max(left.to, right.to)}; - } - return {-1, -1}; - } - - static Range intersection(const Range& left, const Range& right) { - if (left.from <= right.from) { - if (left.to >= right.from) { - return {right.from, std::min(left.to, right.to)}; - } - } else if (right.to >= left.from) { - return {left.from, std::min(left.to, right.to)}; - } - return {-1, -1}; // Return a default Range object if no intersection range found - } - - int64_t from; - int64_t to; - - Range(const int64_t from_, const int64_t to_) : from(from_), to(to_) { - assert(from <= to); - } - - size_t count() const { return to - from + 1; } - - bool isBefore(const Range& other) const { return to < other.from; } - - bool isAfter(const Range& other) const { return from > other.to; } - - bool isOverlap(const Range& other) const { return !isBefore(other) && 
!isAfter(other); } - - std::string toString() const { - return "[" + std::to_string(from) + ", " + std::to_string(to) + "]"; - } -}; - -class RowRanges { - std::vector ranges; - - public: - RowRanges() = default; - - explicit RowRanges(const Range& range) { ranges.push_back(range); } - - RowRanges(const std::vector& ranges) { this->ranges = ranges; } - - // copy cstr - RowRanges(const RowRanges& other) { ranges = other.ranges; } - - RowRanges(RowRanges&& other) noexcept { ranges = std::move(other.ranges); } - - static RowRanges unionRanges(const RowRanges& left, const RowRanges& right) { - RowRanges result; - auto it1 = left.ranges.begin(); - auto it2 = right.ranges.begin(); - if (it2 != right.ranges.end()) { - Range range2 = *it2; - while (it1 != left.ranges.end()) { - Range range1 = *it1; - if (range1.isAfter(range2)) { - result.add(range2); - range2 = range1; - const auto tmp = it1; - it1 = it2; - it2 = tmp; - } else { - result.add(range1); + bool IsAfter(const Range&other) const { return from > other.to; } + + bool IsOverlap(const Range&other) const { return !IsBefore(other) && !IsAfter(other); } + + std::string ToString() const { + return "[" + std::to_string(from) + ", " + std::to_string(to) + "]"; + } + + int64_t from; + int64_t to; + }; + + class RowRanges { + public: + RowRanges() = default; + + explicit RowRanges(const Range&range) { ranges.push_back(range); } + + RowRanges(const std::vector&ranges) { this->ranges = ranges; } + + // copy cstr + RowRanges(const RowRanges&other) { ranges = other.ranges; } + + RowRanges(RowRanges&&other) noexcept { ranges = std::move(other.ranges); } + + void Add(const Range&range, bool merge = true) { + Range rangeToAdd = range; + if (merge) { + for (int i = static_cast(ranges.size()) - 1; i >= 0; --i) { + Range last = ranges[i]; + if (last.IsAfter(range)) { + throw ParquetException(range.ToString() + " cannot be added to " + + this->ToString()); + } + const Range u = Range::UnionRange(last, rangeToAdd); + if (u.from == -1 
&& u.to == -1) { + break; + } + rangeToAdd = u; + ranges.erase(ranges.begin() + i); + } + } + else { + if (ranges.size() > 1) + assert(rangeToAdd.from > ranges.back().to); + } + ranges.push_back(rangeToAdd); } - ++it1; - } - result.add(range2); - } else { - it2 = it1; - } - while (it2 != right.ranges.end()) { - result.add(*it2); - ++it2; - } - - return result; - } - - static RowRanges intersection(const RowRanges& left, const RowRanges& right) { - RowRanges result; - - size_t rightIndex = 0; - for (const Range& l : left.ranges) { - for (size_t i = rightIndex, n = right.ranges.size(); i < n; ++i) { - const Range& r = right.ranges[i]; - if (l.isBefore(r)) { - break; - } else if (l.isAfter(r)) { - rightIndex = i + 1; - continue; + + size_t RowCount() const { + size_t cnt = 0; + for (const Range&range: ranges) { + cnt += range.Count(); + } + return cnt; } - result.add(Range::intersection(l, r)); - } - } - - return result; - } - - RowRanges slice(const int64_t from, const int64_t to) const { - RowRanges result; - for (const Range& range : ranges) { - if (range.from >= from && range.to <= to) { - result.add(range); - } - } - return result; - } - - void add(const Range& range, bool merge = true) { - Range rangeToAdd = range; - if (merge) { - for (int i = static_cast(ranges.size()) - 1; i >= 0; --i) { - Range last = ranges[i]; - if (last.isAfter(range)) { - throw ParquetException(range.toString() + " cannot be added to " + - this->toString()); + + bool IsValid() const { + if (ranges.size() == 0) return true; + if (ranges[0].from < 0) { + return false; + } + for (size_t i = 1; i < ranges.size(); i++) { + if (ranges[i].from <= ranges[i - 1].to) { + return false; + } + } + return true; } - const Range u = Range::unionRange(last, rangeToAdd); - if (u.from == -1 && u.to == -1) { - break; + + bool IsOverlapping(int64_t from, int64_t to) const { + const Range searchRange(from, to); + return IsOverlapping(searchRange); } - rangeToAdd = u; - ranges.erase(ranges.begin() + i); - } - 
} else { - if (ranges.size() > 1) assert(rangeToAdd.from > ranges.back().to); - } - ranges.push_back(rangeToAdd); - } - - size_t rowCount() const { - size_t cnt = 0; - for (const Range& range : ranges) { - cnt += range.count(); - } - return cnt; - } - - bool isValid() const { - if (ranges.size() == 0) return true; - if (ranges[0].from < 0) { - return false; - } - for (size_t i = 1; i < ranges.size(); i++) { - if (ranges[i].from <= ranges[i - 1].to) { - return false; - } - } - return true; - } - - bool isOverlapping(int64_t from, int64_t to) const { - const Range searchRange(from, to); - return isOverlapping(searchRange); - } - - bool isOverlapping(const Range& searchRange) const { - auto it = std::lower_bound( - ranges.begin(), ranges.end(), searchRange, - [](const Range& r1, const Range& r2) { return r1.isBefore(r2); }); - return it != ranges.end() && !(*it).isAfter(searchRange); - } - - std::vector& getRanges() { return ranges; } - - const Range& operator[](size_t index) const { - assert(index < ranges.size()); - return ranges[index]; - } - - std::string toString() const { - std::string result = "["; - for (const Range& range : ranges) { - result += - "(" + std::to_string(range.from) + ", " + std::to_string(range.to) + "), "; - } - if (!ranges.empty()) { - result = result.substr(0, result.size() - 2); - } - result += "]"; - return result; - } -}; - -using RowRangesPtr = std::shared_ptr; - -namespace internal { - -class PARQUET_EXPORT RecordSkipper { - public: - RecordSkipper(RowRanges& pages, RowRanges& row_ranges_) - : row_ranges(row_ranges_) { // copy row_ranges - RowRanges will_process_pages, skip_pages; - for (auto& page : pages.getRanges()) { - if (!row_ranges.isOverlapping(page)) { - skip_pages.add(page, false); - } - } - - /// Since the skipped pages will be slienly skipped without updating - /// current_rg_processed_records or records_read_, we need to pre-process the row - /// ranges as if these skipped pages never existed - adjust_ranges(skip_pages, 
row_ranges); - - total_rows_to_process = pages.rowCount() - skip_pages.rowCount(); - } - - /// \brief Return the number of records to read or to skip - /// if return values is positive, it means to read N records - /// if return values is negative, it means to skip N records - /// if return values is 0, it means end of RG - int64_t advise_next(const int64_t current_rg_procesed) { - if (row_ranges.getRanges().size() == row_range_idx) { - return 0; - } - - if (row_ranges[row_range_idx].to < current_rg_procesed) { - row_range_idx++; - if (row_ranges.getRanges().size() == row_range_idx) { - // negative, skip the ramaining rows - return current_rg_procesed - total_rows_to_process; - } - } - - if (row_ranges[row_range_idx].from > current_rg_procesed) { - // negative, skip - return current_rg_procesed - row_ranges[row_range_idx].from; - } - - const auto ret = row_ranges[row_range_idx].to - current_rg_procesed + 1; - assert(ret > 0); - return ret; - } - - private: - /// Keep copy of ranges, because advise_next() will modify them - RowRanges row_ranges; - - size_t row_range_idx = 0; - - size_t total_rows_to_process = 0; - - void adjust_ranges(RowRanges& skip_pages, RowRanges& to_adjust) { - size_t skipped_rows = 0; - auto iter = to_adjust.getRanges().begin(); - auto skip_iter = skip_pages.getRanges().begin(); - while (iter != to_adjust.getRanges().end()) { - while (skip_iter != skip_pages.getRanges().end() && skip_iter->isBefore(*iter)) { - skipped_rows += skip_iter->count(); - ++skip_iter; - } - iter->from -= skipped_rows; - iter->to -= skipped_rows; - ++iter; - } - } -}; - -/// \brief Stateful column reader that delimits semantic records for both flat -/// and nested columns -/// -/// \note API EXPERIMENTAL -/// \since 1.3.0 -class PARQUET_EXPORT RecordReader { - public: - /// \brief Creates a record reader. 
- /// @param descr Column descriptor - /// @param leaf_info Level info, used to determine if a column is nullable or not - /// @param pool Memory pool to use for buffering values and rep/def levels - /// @param read_dictionary True if reading directly as Arrow dictionary-encoded - /// @param read_dense_for_nullable True if reading dense and not leaving space for null - /// values - static std::shared_ptr Make( - const ColumnDescriptor* descr, LevelInfo leaf_info, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), - bool read_dictionary = false, bool read_dense_for_nullable = false); - - virtual ~RecordReader() = default; - - /// \brief Attempt to read indicated number of records from column chunk - /// Note that for repeated fields, a record may have more than one value - /// and all of them are read. If read_dense_for_nullable() it will - /// not leave any space for null values. Otherwise, it will read spaced. - /// \return number of records read - virtual int64_t ReadRecords(int64_t num_records) = 0; - - /// \brief Attempt to skip indicated number of records from column chunk. - /// Note that for repeated fields, a record may have more than one value - /// and all of them are skipped. - /// \return number of records skipped - virtual int64_t SkipRecords(int64_t num_records) = 0; - - /// \brief Pre-allocate space for data. Results in better flat read performance - virtual void Reserve(int64_t num_values) = 0; - - /// \brief Clear consumed values and repetition/definition levels as the - /// result of calling ReadRecords - /// For FLBA and ByteArray types, call GetBuilderChunks() to reset them. - virtual void Reset() = 0; - - /// \brief Transfer filled values buffer to caller. A new one will be - /// allocated in subsequent ReadRecords calls - virtual std::shared_ptr ReleaseValues() = 0; - - /// \brief Transfer filled validity bitmap buffer to caller. 
A new one will - /// be allocated in subsequent ReadRecords calls - virtual std::shared_ptr ReleaseIsValid() = 0; - - /// \brief Return true if the record reader has more internal data yet to - /// process - virtual bool HasMoreData() const = 0; - - /// \brief Advance record reader to the next row group. Must be set before - /// any records could be read/skipped. - /// \param[in] reader obtained from RowGroupReader::GetColumnPageReader - virtual void SetPageReader(std::unique_ptr reader) = 0; - - /// \brief Returns the underlying column reader's descriptor. - virtual const ColumnDescriptor* descr() const = 0; - - virtual void DebugPrintState() = 0; - - /// \brief Decoded definition levels - int16_t* def_levels() const { - return reinterpret_cast(def_levels_->mutable_data()); - } - - /// \brief Decoded repetition levels - int16_t* rep_levels() const { - return reinterpret_cast(rep_levels_->mutable_data()); - } - - /// \brief Decoded values, including nulls, if any - /// FLBA and ByteArray types do not use this array and read into their own - /// builders. - uint8_t* values() const { return values_->mutable_data(); } - - /// \brief Number of values written, including space left for nulls if any. - /// If this Reader was constructed with read_dense_for_nullable(), there is no space for - /// nulls and null_count() will be 0. There is no read-ahead/buffering for values. For - /// FLBA and ByteArray types this value reflects the values written with the last - /// ReadRecords call since those readers will reset the values after each call. - int64_t values_written() const { return values_written_; } - - /// \brief Number of definition / repetition levels (from those that have - /// been decoded) that have been consumed inside the reader. - int64_t levels_position() const { return levels_position_; } - - /// \brief Number of definition / repetition levels that have been written - /// internally in the reader. 
This may be larger than values_written() because - /// for repeated fields we need to look at the levels in advance to figure out - /// the record boundaries. - int64_t levels_written() const { return levels_written_; } - - /// \brief Number of nulls in the leaf that we have read so far into the - /// values vector. This is only valid when !read_dense_for_nullable(). When - /// read_dense_for_nullable() it will always be 0. - int64_t null_count() const { return null_count_; } - - /// \brief True if the leaf values are nullable - bool nullable_values() const { return nullable_values_; } - - /// \brief True if reading directly as Arrow dictionary-encoded - bool read_dictionary() const { return read_dictionary_; } - - /// \brief True if reading dense for nullable columns. - bool read_dense_for_nullable() const { return read_dense_for_nullable_; } - - void reset_current_rg_processed_records() { current_rg_processed_records = 0; } - - void set_record_skipper(std::shared_ptr skipper_) { skipper = skipper_; } - - protected: - /// \brief Indicates if we can have nullable values. Note that repeated fields - /// may or may not be nullable. - bool nullable_values_; - - bool at_record_start_; - int64_t records_read_; - - int64_t current_rg_processed_records; // counting both read and skip records - - /// \brief Stores values. These values are populated based on each ReadRecords - /// call. No extra values are buffered for the next call. SkipRecords will not - /// add any value to this buffer. - std::shared_ptr<::arrow::ResizableBuffer> values_; - /// \brief False for BYTE_ARRAY, in which case we don't allocate the values - /// buffer and we directly read into builder classes. - bool uses_values_; - - /// \brief Values that we have read into 'values_' + 'null_count_'. - int64_t values_written_; - int64_t values_capacity_; - int64_t null_count_; - - /// \brief Each bit corresponds to one element in 'values_' and specifies if it - /// is null or not null. 
Not set if read_dense_for_nullable_ is true. - std::shared_ptr<::arrow::ResizableBuffer> valid_bits_; - - /// \brief Buffer for definition levels. May contain more levels than - /// is actually read. This is because we read levels ahead to - /// figure out record boundaries for repeated fields. - /// For flat required fields, 'def_levels_' and 'rep_levels_' are not - /// populated. For non-repeated fields 'rep_levels_' is not populated. - /// 'def_levels_' and 'rep_levels_' must be of the same size if present. - std::shared_ptr<::arrow::ResizableBuffer> def_levels_; - /// \brief Buffer for repetition levels. Only populated for repeated - /// fields. - std::shared_ptr<::arrow::ResizableBuffer> rep_levels_; - - /// \brief Number of definition / repetition levels that have been written - /// internally in the reader. This may be larger than values_written() since - /// for repeated fields we need to look at the levels in advance to figure out - /// the record boundaries. - int64_t levels_written_; - /// \brief Position of the next level that should be consumed. - int64_t levels_position_; - int64_t levels_capacity_; - - bool read_dictionary_ = false; - // If true, we will not leave any space for the null values in the values_ - // vector. - bool read_dense_for_nullable_ = false; - - std::shared_ptr skipper = NULLPTR; -}; - -class BinaryRecordReader : virtual public RecordReader { - public: - virtual std::vector> GetBuilderChunks() = 0; -}; - -/// \brief Read records directly to dictionary-encoded Arrow form (int32 -/// indices). 
Only valid for BYTE_ARRAY columns -class DictionaryRecordReader : virtual public RecordReader { - public: - virtual std::shared_ptr<::arrow::ChunkedArray> GetResult() = 0; -}; - -} // namespace internal - -using BoolReader = TypedColumnReader; -using Int32Reader = TypedColumnReader; -using Int64Reader = TypedColumnReader; -using Int96Reader = TypedColumnReader; -using FloatReader = TypedColumnReader; -using DoubleReader = TypedColumnReader; -using ByteArrayReader = TypedColumnReader; -using FixedLenByteArrayReader = TypedColumnReader; - -} // namespace parquet + + bool IsOverlapping(const Range&searchRange) const { + auto it = std::lower_bound( + ranges.begin(), ranges.end(), searchRange, + [](const Range&r1, const Range&r2) { return r1.IsBefore(r2); }); + return it != ranges.end() && !(*it).IsAfter(searchRange); + } + + std::vector& GetRanges() { return ranges; } + + const Range& operator[](size_t index) const { + assert(index < ranges.size()); + return ranges[index]; + } + + std::string ToString() const { + std::string result = "["; + for (const Range&range: ranges) { + result += + "(" + std::to_string(range.from) + ", " + std::to_string(range.to) + "), "; + } + if (!ranges.empty()) { + result = result.substr(0, result.size() - 2); + } + result += "]"; + return result; + } + + private: + std::vector ranges; + }; + + using RowRangesPtr = std::shared_ptr; + + namespace internal { + class PARQUET_EXPORT RecordSkipper { + public: + RecordSkipper(RowRanges&pages, RowRanges&row_ranges_) + : row_ranges(row_ranges_) { + // copy row_ranges + RowRanges will_process_pages, skip_pages; + for (auto&page: pages.GetRanges()) { + if (!row_ranges.IsOverlapping(page)) { + skip_pages.Add(page, false); + } + } + + /// Since the skipped pages will be silently skipped without updating + /// current_rg_processed_records or records_read_, we need to pre-process the row + /// ranges as if these skipped pages never existed + adjust_ranges(skip_pages, row_ranges); + + total_rows_to_process 
= pages.RowCount() - skip_pages.RowCount(); + } + + /// \brief Return the number of records to read or to skip + /// if return value is positive, it means to read N records + /// if return value is negative, it means to skip N records + /// if return value is 0, it means end of RG + int64_t advise_next(const int64_t current_rg_procesed) { + if (row_ranges.GetRanges().size() == row_range_idx) { + return 0; + } + + if (row_ranges[row_range_idx].to < current_rg_procesed) { + row_range_idx++; + if (row_ranges.GetRanges().size() == row_range_idx) { + // negative, skip the remaining rows + return current_rg_procesed - total_rows_to_process; + } + } + + if (row_ranges[row_range_idx].from > current_rg_procesed) { + // negative, skip + return current_rg_procesed - row_ranges[row_range_idx].from; + } + + const auto ret = row_ranges[row_range_idx].to - current_rg_procesed + 1; + assert(ret > 0); + return ret; + } + + private: + void adjust_ranges(RowRanges& skip_pages, RowRanges& to_adjust) { + size_t skipped_rows = 0; + auto iter = to_adjust.GetRanges().begin(); + auto skip_iter = skip_pages.GetRanges().begin(); + while (iter != to_adjust.GetRanges().end()) { + while (skip_iter != skip_pages.GetRanges().end() && skip_iter->IsBefore(*iter)) { + skipped_rows += skip_iter->Count(); + ++skip_iter; + } + iter->from -= skipped_rows; + iter->to -= skipped_rows; + ++iter; + } + } + + /// Keep copy of ranges, because advise_next() will modify them + RowRanges row_ranges; + + size_t row_range_idx = 0; + size_t total_rows_to_process = 0; + }; + + /// \brief Stateful column reader that delimits semantic records for both flat + /// and nested columns + /// + /// \note API EXPERIMENTAL + /// \since 1.3.0 + class PARQUET_EXPORT RecordReader { + public: + /// \brief Creates a record reader. 
+ /// @param descr Column descriptor + /// @param leaf_info Level info, used to determine if a column is nullable or not + /// @param pool Memory pool to use for buffering values and rep/def levels + /// @param read_dictionary True if reading directly as Arrow dictionary-encoded + /// @param read_dense_for_nullable True if reading dense and not leaving space for null + /// values + static std::shared_ptr Make( + const ColumnDescriptor* descr, LevelInfo leaf_info, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), + bool read_dictionary = false, bool read_dense_for_nullable = false); + + virtual ~RecordReader() = default; + + /// \brief Attempt to read indicated number of records from column chunk + /// Note that for repeated fields, a record may have more than one value + /// and all of them are read. If read_dense_for_nullable() it will + /// not leave any space for null values. Otherwise, it will read spaced. + /// \return number of records read + virtual int64_t ReadRecords(int64_t num_records) = 0; + + /// \brief Attempt to skip indicated number of records from column chunk. + /// Note that for repeated fields, a record may have more than one value + /// and all of them are skipped. + /// \return number of records skipped + virtual int64_t SkipRecords(int64_t num_records) = 0; + + /// \brief Pre-allocate space for data. Results in better flat read performance + virtual void Reserve(int64_t num_values) = 0; + + /// \brief Clear consumed values and repetition/definition levels as the + /// result of calling ReadRecords + /// For FLBA and ByteArray types, call GetBuilderChunks() to reset them. + virtual void Reset() = 0; + + /// \brief Transfer filled values buffer to caller. A new one will be + /// allocated in subsequent ReadRecords calls + virtual std::shared_ptr ReleaseValues() = 0; + + /// \brief Transfer filled validity bitmap buffer to caller. 
A new one will + /// be allocated in subsequent ReadRecords calls + virtual std::shared_ptr ReleaseIsValid() = 0; + + /// \brief Return true if the record reader has more internal data yet to + /// process + virtual bool HasMoreData() const = 0; + + /// \brief Advance record reader to the next row group. Must be set before + /// any records could be read/skipped. + /// \param[in] reader obtained from RowGroupReader::GetColumnPageReader + virtual void SetPageReader(std::unique_ptr reader) = 0; + + /// \brief Returns the underlying column reader's descriptor. + virtual const ColumnDescriptor* descr() const = 0; + + virtual void DebugPrintState() = 0; + + /// \brief Decoded definition levels + int16_t* def_levels() const { + return reinterpret_cast(def_levels_->mutable_data()); + } + + /// \brief Decoded repetition levels + int16_t* rep_levels() const { + return reinterpret_cast(rep_levels_->mutable_data()); + } + + /// \brief Decoded values, including nulls, if any + /// FLBA and ByteArray types do not use this array and read into their own + /// builders. + uint8_t* values() const { return values_->mutable_data(); } + + /// \brief Number of values written, including space left for nulls if any. + /// If this Reader was constructed with read_dense_for_nullable(), there is no space for + /// nulls and null_count() will be 0. There is no read-ahead/buffering for values. For + /// FLBA and ByteArray types this value reflects the values written with the last + /// ReadRecords call since those readers will reset the values after each call. + int64_t values_written() const { return values_written_; } + + /// \brief Number of definition / repetition levels (from those that have + /// been decoded) that have been consumed inside the reader. + int64_t levels_position() const { return levels_position_; } + + /// \brief Number of definition / repetition levels that have been written + /// internally in the reader. 
This may be larger than values_written() because + /// for repeated fields we need to look at the levels in advance to figure out + /// the record boundaries. + int64_t levels_written() const { return levels_written_; } + + /// \brief Number of nulls in the leaf that we have read so far into the + /// values vector. This is only valid when !read_dense_for_nullable(). When + /// read_dense_for_nullable() it will always be 0. + int64_t null_count() const { return null_count_; } + + /// \brief True if the leaf values are nullable + bool nullable_values() const { return nullable_values_; } + + /// \brief True if reading directly as Arrow dictionary-encoded + bool read_dictionary() const { return read_dictionary_; } + + /// \brief True if reading dense for nullable columns. + bool read_dense_for_nullable() const { return read_dense_for_nullable_; } + + void reset_current_rg_processed_records() { current_rg_processed_records = 0; } + + void set_record_skipper(std::shared_ptr skipper_) { skipper = skipper_; } + + protected: + /// \brief Indicates if we can have nullable values. Note that repeated fields + /// may or may not be nullable. + bool nullable_values_; + + bool at_record_start_; + int64_t records_read_; + + int64_t current_rg_processed_records; // counting both read and skip records + + /// \brief Stores values. These values are populated based on each ReadRecords + /// call. No extra values are buffered for the next call. SkipRecords will not + /// add any value to this buffer. + std::shared_ptr<::arrow::ResizableBuffer> values_; + /// \brief False for BYTE_ARRAY, in which case we don't allocate the values + /// buffer and we directly read into builder classes. + bool uses_values_; + + /// \brief Values that we have read into 'values_' + 'null_count_'. + int64_t values_written_; + int64_t values_capacity_; + int64_t null_count_; + + /// \brief Each bit corresponds to one element in 'values_' and specifies if it + /// is null or not null. 
Not set if read_dense_for_nullable_ is true. + std::shared_ptr<::arrow::ResizableBuffer> valid_bits_; + + /// \brief Buffer for definition levels. May contain more levels than + /// is actually read. This is because we read levels ahead to + /// figure out record boundaries for repeated fields. + /// For flat required fields, 'def_levels_' and 'rep_levels_' are not + /// populated. For non-repeated fields 'rep_levels_' is not populated. + /// 'def_levels_' and 'rep_levels_' must be of the same size if present. + std::shared_ptr<::arrow::ResizableBuffer> def_levels_; + /// \brief Buffer for repetition levels. Only populated for repeated + /// fields. + std::shared_ptr<::arrow::ResizableBuffer> rep_levels_; + + /// \brief Number of definition / repetition levels that have been written + /// internally in the reader. This may be larger than values_written() since + /// for repeated fields we need to look at the levels in advance to figure out + /// the record boundaries. + int64_t levels_written_; + /// \brief Position of the next level that should be consumed. + int64_t levels_position_; + int64_t levels_capacity_; + + bool read_dictionary_ = false; + // If true, we will not leave any space for the null values in the values_ + // vector. + bool read_dense_for_nullable_ = false; + + std::shared_ptr skipper = NULLPTR; + }; + + class BinaryRecordReader : virtual public RecordReader { + public: + virtual std::vector> GetBuilderChunks() = 0; + }; + + /// \brief Read records directly to dictionary-encoded Arrow form (int32 + /// indices). 
Only valid for BYTE_ARRAY columns + class DictionaryRecordReader : virtual public RecordReader { + public: + virtual std::shared_ptr<::arrow::ChunkedArray> GetResult() = 0; + }; + } // namespace internal + + using BoolReader = TypedColumnReader; + using Int32Reader = TypedColumnReader; + using Int64Reader = TypedColumnReader; + using Int96Reader = TypedColumnReader; + using FloatReader = TypedColumnReader; + using DoubleReader = TypedColumnReader; + using ByteArrayReader = TypedColumnReader; + using FixedLenByteArrayReader = TypedColumnReader; +} // namespace parquet From 945e543164a01a3c29774966c0b291dc49684e06 Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Mon, 1 Jan 2024 18:48:04 +0800 Subject: [PATCH 12/25] simply map --- cpp/src/parquet/arrow/reader.cc | 28 ++++---- cpp/src/parquet/arrow/reader.h | 4 +- cpp/src/parquet/arrow/reader_internal.h | 2 +- cpp/src/parquet/range_reader_test.cc | 94 ++++++++++++------------- 4 files changed, 62 insertions(+), 66 deletions(-) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 06e8b5bcf026..cbca49435e29 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -207,7 +207,7 @@ class FileReaderImpl : public FileReader { Status GetFieldReader( int i, const std::shared_ptr>& included_leaves, const std::vector& row_groups, - const std::shared_ptr>& row_ranges_map, + const std::map& row_ranges_map, std::unique_ptr* out) { // Should be covered by GetRecordBatchReader checks but // manifest_.schema_fields is a separate variable so be extra careful. 
@@ -224,13 +224,13 @@ class FileReaderImpl : public FileReader { ctx->iterator_factory = SomeRowGroupsFactory(row_groups); ctx->filter_leaves = true; ctx->included_leaves = included_leaves; - ctx->row_ranges_map = row_ranges_map; + ctx->row_ranges_map = row_ranges_map; // it will be shared by all field readers, so copy instead of std::move() return GetReader(manifest_.schema_fields[i], ctx, out); } Status GetFieldReaders( const std::vector& column_indices, const std::vector& row_groups, - const std::shared_ptr>& row_ranges_map, + const std::map& row_ranges_map, std::vector>* out, std::shared_ptr<::arrow::Schema>* out_schema) { // We only need to read schema fields which have columns indicated @@ -272,7 +272,7 @@ class FileReaderImpl : public FileReader { std::vector row_groups = Iota(reader_->metadata()->num_row_groups()); std::unique_ptr reader; - RETURN_NOT_OK(GetFieldReader(i, included_leaves, row_groups, NULLPTR, &reader)); + RETURN_NOT_OK(GetFieldReader(i, included_leaves, row_groups, {}, &reader)); return ReadColumn(i, row_groups, reader.get(), out); } @@ -345,24 +345,24 @@ class FileReaderImpl : public FileReader { Status GetRecordBatchReader( const std::vector& row_group_indices, const std::vector& column_indices, - const std::shared_ptr>& row_ranges_map, + const std::map& row_ranges_map, std::unique_ptr* out) override; Status GetRecordBatchReader(const std::vector& row_group_indices, const std::vector& column_indices, std::unique_ptr* out) override { - return GetRecordBatchReader(row_group_indices, column_indices, NULLPTR, out); + return GetRecordBatchReader(row_group_indices, column_indices, {}, out); } Status GetRecordBatchReader(const std::vector& row_group_indices, std::unique_ptr* out) override { return GetRecordBatchReader(row_group_indices, - Iota(reader_->metadata()->num_columns()), NULLPTR, out); + Iota(reader_->metadata()->num_columns()), {}, out); } Status GetRecordBatchReader(std::unique_ptr* out) override { return 
GetRecordBatchReader(Iota(num_row_groups()), - Iota(reader_->metadata()->num_columns()), NULLPTR, out); + Iota(reader_->metadata()->num_columns()), {}, out); } ::arrow::Result<::arrow::AsyncGenerator>> @@ -614,13 +614,13 @@ class LeafReader : public ColumnReaderImpl { std::unique_ptr page_reader = input_->NextChunk(); /// using page index to reduce cost - if (page_reader != nullptr && ctx_->row_ranges_map) { + if (page_reader != nullptr) { // reset skipper record_reader_->set_record_skipper(NULLPTR); // if specific row range is provided for this rg - if (const auto iter = ctx_->row_ranges_map->find(input_->current_row_group()); - iter != ctx_->row_ranges_map->end()) { + if (const auto iter = ctx_->row_ranges_map.find(input_->current_row_group()); + iter != ctx_->row_ranges_map.end()) { if (iter->second != nullptr && iter->second->RowCount() != 0) { std::shared_ptr page_ranges; checkAndGetPageRanges(iter->second, page_ranges); @@ -1113,7 +1113,7 @@ Status GetReader(const SchemaField& field, const std::shared_ptr& Status FileReaderImpl::GetRecordBatchReader( const std::vector& row_groups, const std::vector& column_indices, - const std::shared_ptr>& row_ranges_map, + const std::map& row_ranges_map, std::unique_ptr* out) { RETURN_NOT_OK(BoundsCheck(row_groups, column_indices)); @@ -1384,7 +1384,7 @@ Future> FileReaderImpl::DecodeRowGroups( std::vector> readers; std::shared_ptr<::arrow::Schema> result_schema; RETURN_NOT_OK( - GetFieldReaders(column_indices, row_groups, NULLPTR, &readers, &result_schema)); + GetFieldReaders(column_indices, row_groups, {}, &readers, &result_schema)); // OptionalParallelForAsync requires an executor if (!cpu_executor) cpu_executor = ::arrow::internal::GetCpuThreadPool(); @@ -1449,7 +1449,7 @@ Status FileReader::GetRecordBatchReader(const std::vector& row_group_indice Status FileReader::GetRecordBatchReader( const std::vector& row_group_indices, const std::vector& column_indices, - const std::shared_ptr>& row_ranges_map, + const std::map& 
row_ranges_map, std::shared_ptr* out) { std::unique_ptr tmp; RETURN_NOT_OK( diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h index 0cd8f298d79d..d5b5cf54f131 100644 --- a/cpp/src/parquet/arrow/reader.h +++ b/cpp/src/parquet/arrow/reader.h @@ -190,7 +190,7 @@ class PARQUET_EXPORT FileReader { virtual ::arrow::Status GetRecordBatchReader( const std::vector& row_group_indices, const std::vector& column_indices, - const std::shared_ptr>& row_ranges_map, + const std::map& row_ranges_map, std::unique_ptr<::arrow::RecordBatchReader>* out) = 0; /// \brief Return a RecordBatchReader of row groups selected from @@ -207,7 +207,7 @@ class PARQUET_EXPORT FileReader { /// contains an invalid index ::arrow::Status GetRecordBatchReader( const std::vector& row_group_indices, const std::vector& column_indices, - const std::shared_ptr>& row_ranges_map, + const std::map& row_ranges_map, std::shared_ptr<::arrow::RecordBatchReader>* out); ::arrow::Status GetRecordBatchReader(const std::vector& row_group_indices, const std::vector& column_indices, diff --git a/cpp/src/parquet/arrow/reader_internal.h b/cpp/src/parquet/arrow/reader_internal.h index 56be0f93f414..fba583d27b06 100644 --- a/cpp/src/parquet/arrow/reader_internal.h +++ b/cpp/src/parquet/arrow/reader_internal.h @@ -113,7 +113,7 @@ struct ReaderContext { FileColumnIteratorFactory iterator_factory; bool filter_leaves; std::shared_ptr> included_leaves; - std::shared_ptr> row_ranges_map; + std::map row_ranges_map; bool IncludesLeaf(int leaf_index) const { if (this->filter_leaves) { diff --git a/cpp/src/parquet/range_reader_test.cc b/cpp/src/parquet/range_reader_test.cc index 5bccaaa0c0f6..abbbb5fa60e4 100644 --- a/cpp/src/parquet/range_reader_test.cc +++ b/cpp/src/parquet/range_reader_test.cc @@ -274,13 +274,13 @@ class TestRecordBatchReaderWithRanges : public testing::Test { TEST_F(TestRecordBatchReaderWithRanges, SelectOnePageForEachRG) { std::shared_ptr rb_reader; - const auto row_ranges_map = 
std::make_shared>(); - row_ranges_map->insert({0, std::make_shared(parquet::Range{0, 9})}); - row_ranges_map->insert( + auto row_ranges_map = std::map(); + row_ranges_map.insert({0, std::make_shared(parquet::Range{0, 9})}); + row_ranges_map.insert( {1, std::make_shared(parquet::Range{10, 19})}); - row_ranges_map->insert( + row_ranges_map.insert( {2, std::make_shared(parquet::Range{20, 29})}); - row_ranges_map->insert({3, std::make_shared(parquet::Range{0, 9})}); + row_ranges_map.insert({3, std::make_shared(parquet::Range{0, 9})}); const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, @@ -292,13 +292,13 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOnePageForEachRG) { TEST_F(TestRecordBatchReaderWithRanges, SelectSomePageForOneRG) { std::shared_ptr rb_reader; - const auto row_ranges_map = std::make_shared>(); - row_ranges_map->insert( - {0, std::make_shared( - std::vector{parquet::Range{0, 7}, parquet::Range{16, 23}})}); - row_ranges_map->insert({1, nullptr}); - row_ranges_map->insert({2, nullptr}); - row_ranges_map->insert({3, nullptr}); + auto row_ranges_map = std::map(); + row_ranges_map.insert( + {0, std::make_shared(std::vector{ + parquet::Range{0, 7}, parquet::Range{16, 23}})}); + row_ranges_map.insert({1, nullptr}); + row_ranges_map.insert({2, nullptr}); + row_ranges_map.insert({3, nullptr}); const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, @@ -310,14 +310,11 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectSomePageForOneRG) { TEST_F(TestRecordBatchReaderWithRanges, SelectAllRange) { std::shared_ptr rb_reader; - const auto row_ranges_map = std::make_shared>(); - row_ranges_map->insert( - {0, std::make_shared(parquet::Range{0, 29})}); - row_ranges_map->insert( - {1, std::make_shared(parquet::Range{0, 29})}); - row_ranges_map->insert( - {2, std::make_shared(parquet::Range{0, 29})}); - row_ranges_map->insert({3, 
std::make_shared(parquet::Range{0, 9})}); + auto row_ranges_map = std::map(); + row_ranges_map.insert({0, std::make_shared(parquet::Range{0, 29})}); + row_ranges_map.insert({1, std::make_shared(parquet::Range{0, 29})}); + row_ranges_map.insert({2, std::make_shared(parquet::Range{0, 29})}); + row_ranges_map.insert({3, std::make_shared(parquet::Range{0, 9})}); const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, @@ -329,15 +326,15 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectAllRange) { TEST_F(TestRecordBatchReaderWithRanges, SelectEmptyRange) { std::shared_ptr rb_reader; - const auto row_ranges_map = std::make_shared>(); + auto row_ranges_map = std::map(); // here we test four kinds of empty range: // rg 0 not put into map -> will read - row_ranges_map->insert({1, nullptr}); // value is nullptr -> will skip - row_ranges_map->insert( + row_ranges_map.insert({1, nullptr}); // value is nullptr -> will skip + row_ranges_map.insert( {2, std::make_shared( std::vector())}); // value is empty -> will skip - row_ranges_map->insert( + row_ranges_map.insert( {3, std::make_shared()}); // value is empty -> will skip const std::vector column_indices{0, 1, 2, 3, 4}; @@ -352,15 +349,15 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { // case 1: only care about RG 0 { std::shared_ptr rb_reader; - const auto row_ranges_map = std::make_shared>(); + auto row_ranges_map = std::map(); std::vector ranges; for (int64_t i = 0; i < 30; i++) { if (i % 2 == 0) ranges.push_back({i, i}); } - row_ranges_map->insert({0, std::make_shared(ranges)}); - row_ranges_map->insert({1, nullptr}); - row_ranges_map->insert({2, nullptr}); - row_ranges_map->insert({3, nullptr}); + row_ranges_map.insert({0, std::make_shared(ranges)}); + row_ranges_map.insert({1, nullptr}); + row_ranges_map.insert({2, nullptr}); + row_ranges_map.insert({3, nullptr}); const std::vector column_indices{0, 1, 2, 3, 4}; 
ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, row_ranges_map, &rb_reader)); @@ -371,15 +368,15 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { // case 2: care about RG 0 and 2 { std::shared_ptr rb_reader; - const auto row_ranges_map = std::make_shared>(); + auto row_ranges_map = std::map(); std::vector ranges; for (int64_t i = 0; i < 30; i++) { if (i % 2 == 0) ranges.push_back({i, i}); } - row_ranges_map->insert({0, std::make_shared(ranges)}); - row_ranges_map->insert({1, nullptr}); - row_ranges_map->insert({2, std::make_shared(ranges)}); - row_ranges_map->insert({3, nullptr}); + row_ranges_map.insert({0, std::make_shared(ranges)}); + row_ranges_map.insert({1, nullptr}); + row_ranges_map.insert({2, std::make_shared(ranges)}); + row_ranges_map.insert({3, nullptr}); const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, row_ranges_map, &rb_reader)); @@ -391,8 +388,8 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { TEST_F(TestRecordBatchReaderWithRanges, InvalidRanges) { std::shared_ptr rb_reader; { - const auto row_ranges_map = std::make_shared>(); - row_ranges_map->insert( + auto row_ranges_map = std::map(); + row_ranges_map.insert( {0, std::make_shared(parquet::Range{-1, 5})}); const std::vector column_indices{0, 1, 2, 3, 4}; const auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, @@ -404,9 +401,9 @@ TEST_F(TestRecordBatchReaderWithRanges, InvalidRanges) { } { - const auto row_ranges_map = std::make_shared>(); - row_ranges_map->insert({0, std::make_shared(std::vector{ - parquet::Range{0, 4}, parquet::Range{2, 5}})}); + auto row_ranges_map = std::map(); + row_ranges_map.insert({0, std::make_shared(std::vector{ + parquet::Range{0, 4}, parquet::Range{2, 5}})}); const std::vector column_indices{0, 1, 2, 3, 4}; const auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, 
row_ranges_map, &rb_reader); @@ -416,8 +413,8 @@ TEST_F(TestRecordBatchReaderWithRanges, InvalidRanges) { "non-interleaving: [(0, 4), (2, 5)]") != std::string::npos); } { - const auto row_ranges_map = std::make_shared>(); - row_ranges_map->insert( + auto row_ranges_map = std::map(); + row_ranges_map.insert( {0, std::make_shared(std::vector{parquet::Range{0, 30}})}); const std::vector column_indices{0, 1, 2, 3, 4}; const auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, @@ -467,9 +464,8 @@ TEST(TestRecordBatchReaderWithRangesBadCases, NoPageIndex) { ASSERT_OK_AND_ASSIGN(auto arrow_reader, reader_builder.Build()); std::shared_ptr rb_reader; - auto row_ranges_map = std::make_shared>(); - row_ranges_map->insert( - {0, std::make_shared(parquet::Range{0, 29})}); + auto row_ranges_map = std::map(); + row_ranges_map.insert({0, std::make_shared(parquet::Range{0, 29})}); std::vector column_indices{0, 1, 2, 3, 4}; auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, row_ranges_map, &rb_reader); @@ -510,15 +506,15 @@ class TestRecordBatchReaderWithRangesWithNulls : public testing::Test { TEST_F(TestRecordBatchReaderWithRangesWithNulls, SelectOneRowSkipOneRow) { { std::shared_ptr rb_reader; - const auto row_ranges_map = std::make_shared>(); + auto row_ranges_map = std::map(); std::vector ranges; for (int64_t i = 0; i < 30; i++) { if (i % 2 == 0) ranges.push_back({i, i}); } - row_ranges_map->insert({0, std::make_shared(ranges)}); - row_ranges_map->insert({1, nullptr}); - row_ranges_map->insert({2, std::make_shared(ranges)}); - row_ranges_map->insert({3, nullptr}); + row_ranges_map.insert({0, std::make_shared(ranges)}); + row_ranges_map.insert({1, nullptr}); + row_ranges_map.insert({2, std::make_shared(ranges)}); + row_ranges_map.insert({3, nullptr}); const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, row_ranges_map, &rb_reader)); From 
ed9d02b36c6e5ad5ae60b57bbfa0cd6470706b1b Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Mon, 1 Jan 2024 23:26:42 +0800 Subject: [PATCH 13/25] api refined --- cpp/src/parquet/CMakeLists.txt | 1 + cpp/src/parquet/arrow/reader.cc | 143 ++++++++++++--------- cpp/src/parquet/arrow/reader.h | 20 +-- cpp/src/parquet/arrow/reader_internal.h | 2 +- cpp/src/parquet/column_reader.h | 74 ++++++++++- cpp/src/parquet/range_reader_test.cc | 159 +++++++++--------------- cpp/src/parquet/row_range_test.cc | 102 +++++++++++++++ 7 files changed, 330 insertions(+), 171 deletions(-) create mode 100644 cpp/src/parquet/row_range_test.cc diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 0b947af762b2..9f9a7f2336aa 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -355,6 +355,7 @@ add_parquet_test(reader-test column_scanner_test.cc reader_test.cc range_reader_test.cc + row_range_test.cc stream_reader_test.cc test_util.cc) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index cbca49435e29..222493487cc4 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -207,7 +207,7 @@ class FileReaderImpl : public FileReader { Status GetFieldReader( int i, const std::shared_ptr>& included_leaves, const std::vector& row_groups, - const std::map& row_ranges_map, + const std::shared_ptr> & row_ranges_map, std::unique_ptr* out) { // Should be covered by GetRecordBatchReader checks but // manifest_.schema_fields is a separate variable so be extra careful. 
@@ -224,13 +224,13 @@ class FileReaderImpl : public FileReader { ctx->iterator_factory = SomeRowGroupsFactory(row_groups); ctx->filter_leaves = true; ctx->included_leaves = included_leaves; - ctx->row_ranges_map = row_ranges_map; // it will be shared by all field readers, so copy instead of std::move() + ctx->row_ranges_map = row_ranges_map; return GetReader(manifest_.schema_fields[i], ctx, out); } Status GetFieldReaders( const std::vector& column_indices, const std::vector& row_groups, - const std::map& row_ranges_map, + const std::shared_ptr> & row_ranges_map, std::vector>* out, std::shared_ptr<::arrow::Schema>* out_schema) { // We only need to read schema fields which have columns indicated @@ -343,25 +343,65 @@ class FileReaderImpl : public FileReader { return ReadRowGroup(i, Iota(reader_->metadata()->num_columns()), table); } - Status GetRecordBatchReader( - const std::vector& row_group_indices, const std::vector& column_indices, - const std::map& row_ranges_map, - std::unique_ptr* out) override; + // This is a internal API owned by FileReaderImpl, not exposed in FileReader + Status GetRecordBatchReaderWithRowRanges(const std::vector& row_group_indices, + const std::vector& column_indices, + const std::shared_ptr> & row_ranges_map, + std::unique_ptr* out); + + Status GetRecordBatchReader(const RowRanges& rows_to_return, + const std::vector& column_indices, + std::unique_ptr* out) override { + const auto metadata = reader_->metadata(); + // check if the row ranges are valid + if (!rows_to_return.IsValid()) { + return Status::Invalid("The provided row range is invalid, keep it monotone and non-interleaving: " + + rows_to_return.ToString()); + } + // check if the row ranges are within the row group boundaries + if (rows_to_return.RowCount() != 0 && rows_to_return.GetRanges().back().to >= metadata->num_rows()) { + return Status::Invalid("The provided row range " + rows_to_return.ToString() + + " exceeds the number of rows in the file: " + + 
std::to_string(metadata->num_rows())); + } + + std::vector split_points; + int64_t rows_so_far = 0; + for (int i = 0 ; i < metadata->num_row_groups() - 1; i++) { + rows_so_far += metadata->RowGroup(i)->num_rows(); + split_points.push_back(rows_so_far); + } + // We'll assign a RowRanges for each RG, even if it's not required to return any rows + const std::vector splits = rows_to_return.SplitAt(split_points); + // Call row_ranges_map because array index is the row group index + const std::shared_ptr> row_ranges_map = + std::make_shared>(); + rows_so_far = 0; + std::vector row_group_indices; + for (int i = 0 ; i < metadata->num_row_groups(); i++) { + row_ranges_map->push_back(splits[i].shift(-rows_so_far)); + rows_so_far += metadata->RowGroup(i)->num_rows(); + if (row_ranges_map->at(i).RowCount() > 0) + row_group_indices.push_back(i); + } + + return GetRecordBatchReaderWithRowRanges(row_group_indices, column_indices, row_ranges_map, out); + } Status GetRecordBatchReader(const std::vector& row_group_indices, const std::vector& column_indices, std::unique_ptr* out) override { - return GetRecordBatchReader(row_group_indices, column_indices, {}, out); + return GetRecordBatchReaderWithRowRanges(row_group_indices, column_indices, {}, out); } Status GetRecordBatchReader(const std::vector& row_group_indices, std::unique_ptr* out) override { - return GetRecordBatchReader(row_group_indices, + return GetRecordBatchReaderWithRowRanges(row_group_indices, Iota(reader_->metadata()->num_columns()), {}, out); } Status GetRecordBatchReader(std::unique_ptr* out) override { - return GetRecordBatchReader(Iota(num_row_groups()), + return GetRecordBatchReaderWithRowRanges(Iota(num_row_groups()), Iota(reader_->metadata()->num_columns()), {}, out); } @@ -466,7 +506,7 @@ class RowGroupReaderImpl : public RowGroupReader { // Column reader implementations struct RowRangesPageFilter { - explicit RowRangesPageFilter(const RowRangesPtr& row_ranges_, + explicit RowRangesPageFilter(const RowRanges& 
row_ranges_, const RowRangesPtr& page_ranges_) : row_ranges(row_ranges_), page_ranges(page_ranges_) { assert(page_ranges != nullptr); @@ -478,20 +518,20 @@ struct RowRangesPageFilter { Range current_page_range = (*page_ranges)[page_range_idx]; - while (row_range_idx < row_ranges->GetRanges().size() && - current_page_range.IsAfter((*row_ranges)[row_range_idx])) { + while (row_range_idx < row_ranges.GetRanges().size() && + current_page_range.IsAfter(row_ranges[row_range_idx])) { row_range_idx++; } - if (row_range_idx >= row_ranges->GetRanges().size()) { + if (row_range_idx >= row_ranges.GetRanges().size()) { return true; } - return current_page_range.IsBefore((*row_ranges)[row_range_idx]); + return current_page_range.IsBefore(row_ranges[row_range_idx]); } size_t row_range_idx = 0; - const RowRangesPtr row_ranges; + const RowRanges & row_ranges; int page_range_idx = -1; const RowRangesPtr page_ranges; @@ -559,8 +599,8 @@ class LeafReader : public ColumnReaderImpl { private: std::shared_ptr out_; - void checkAndGetPageRanges(const std::shared_ptr& row_ranges, - std::shared_ptr& page_ranges) { + void checkAndGetPageRanges(const RowRanges & row_ranges, + std::shared_ptr& page_ranges) const { // check offset exists const auto rg_pg_index_reader = ctx_->reader->GetPageIndexReader()->RowGroup(input_->current_row_group()); @@ -580,12 +620,6 @@ class LeafReader : public ColumnReaderImpl { field_->name()); } - if (!row_ranges->IsValid()) { - throw ParquetException( - "The provided row range is invalid, keep it monotone and non-interleaving: " + - row_ranges->ToString()); - } - const auto page_locations = offset_index->page_locations(); page_ranges = std::make_shared(); for (size_t i = 0; i < page_locations.size() - 1; i++) { @@ -601,10 +635,10 @@ class LeafReader : public ColumnReaderImpl { false); } - if (row_ranges->GetRanges().size() > 0) { - if ((*row_ranges).GetRanges().back().to > page_ranges->GetRanges().back().to) { + if (row_ranges.GetRanges().size() > 0) { + if 
(row_ranges.GetRanges().back().to > page_ranges->GetRanges().back().to) { throw ParquetException( - "The provided row range " + row_ranges->ToString() + + "The provided row range " + row_ranges.ToString() + " exceeds last page :" + page_ranges->GetRanges().back().ToString()); } } @@ -614,32 +648,28 @@ class LeafReader : public ColumnReaderImpl { std::unique_ptr page_reader = input_->NextChunk(); /// using page index to reduce cost - if (page_reader != nullptr) { + if (page_reader != nullptr && ctx_->row_ranges_map) { // reset skipper record_reader_->set_record_skipper(NULLPTR); - // if specific row range is provided for this rg - if (const auto iter = ctx_->row_ranges_map.find(input_->current_row_group()); - iter != ctx_->row_ranges_map.end()) { - if (iter->second != nullptr && iter->second->RowCount() != 0) { - std::shared_ptr page_ranges; - checkAndGetPageRanges(iter->second, page_ranges); - - // part 1, skip decompressing & decoding unnecessary pages - page_reader->set_data_page_filter( - RowRangesPageFilter(iter->second, page_ranges)); - - // part 2, skip unnecessary rows in necessary pages - record_reader_->set_record_skipper( - std::make_shared(*page_ranges, - *iter->second)); - } else { - NextRowGroup(); - return; - } + const auto & row_ranges = (*ctx_->row_ranges_map)[input_->current_row_group()]; + if (row_ranges.RowCount() != 0) { + // if specific row range is provided for this rg + std::shared_ptr page_ranges; + checkAndGetPageRanges(row_ranges, page_ranges); + + // part 1, skip decompressing & decoding unnecessary pages + page_reader->set_data_page_filter( + RowRangesPageFilter(row_ranges, page_ranges)); + + // part 2, skip unnecessary rows in necessary pages + record_reader_->set_record_skipper( + std::make_shared(*page_ranges, + row_ranges)); + } else { + NextRowGroup(); + return; } - // Else iff row_ranges_map exists but no row_ranges is found for this RG key, this - // RG will be read } record_reader_->reset_current_rg_processed_records(); @@ 
-1111,9 +1141,9 @@ Status GetReader(const SchemaField& field, const std::shared_ptr& } // namespace -Status FileReaderImpl::GetRecordBatchReader( +Status FileReaderImpl::GetRecordBatchReaderWithRowRanges( const std::vector& row_groups, const std::vector& column_indices, - const std::map& row_ranges_map, + const std::shared_ptr> & row_ranges_map, std::unique_ptr* out) { RETURN_NOT_OK(BoundsCheck(row_groups, column_indices)); @@ -1447,17 +1477,6 @@ Status FileReader::GetRecordBatchReader(const std::vector& row_group_indice return Status::OK(); } -Status FileReader::GetRecordBatchReader( - const std::vector& row_group_indices, const std::vector& column_indices, - const std::map& row_ranges_map, - std::shared_ptr* out) { - std::unique_ptr tmp; - RETURN_NOT_OK( - GetRecordBatchReader(row_group_indices, column_indices, row_ranges_map, &tmp)); - out->reset(tmp.release()); - return Status::OK(); -} - Status FileReader::Make(::arrow::MemoryPool* pool, std::unique_ptr reader, const ArrowReaderProperties& properties, diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h index d5b5cf54f131..807be797aad6 100644 --- a/cpp/src/parquet/arrow/reader.h +++ b/cpp/src/parquet/arrow/reader.h @@ -188,10 +188,17 @@ class PARQUET_EXPORT FileReader { const std::vector& row_group_indices, const std::vector& column_indices, std::unique_ptr<::arrow::RecordBatchReader>* out) = 0; - virtual ::arrow::Status GetRecordBatchReader( - const std::vector& row_group_indices, const std::vector& column_indices, - const std::map& row_ranges_map, - std::unique_ptr<::arrow::RecordBatchReader>* out) = 0; + /// \brief Return a RecordBatchReader of row groups selected from + /// rows_to_return, whose columns are selected by column_indices. + /// + /// Notice that rows_to_return is file based, it not only decides which row groups to read, + /// but also which rows to read in each row group. 
+ /// + /// + /// \returns error Status if either rows_to_return or column_indices + /// contains an invalid index + virtual ::arrow::Status GetRecordBatchReader(const RowRanges& rows_to_return, + const std::vector& column_indices, std::unique_ptr<::arrow::RecordBatchReader>* out) = 0; /// \brief Return a RecordBatchReader of row groups selected from /// row_group_indices, whose columns are selected by column_indices. @@ -205,10 +212,6 @@ class PARQUET_EXPORT FileReader { /// /// \returns error Status if either row_group_indices or column_indices /// contains an invalid index - ::arrow::Status GetRecordBatchReader( - const std::vector& row_group_indices, const std::vector& column_indices, - const std::map& row_ranges_map, - std::shared_ptr<::arrow::RecordBatchReader>* out); ::arrow::Status GetRecordBatchReader(const std::vector& row_group_indices, const std::vector& column_indices, std::shared_ptr<::arrow::RecordBatchReader>* out); @@ -216,6 +219,7 @@ class PARQUET_EXPORT FileReader { std::shared_ptr<::arrow::RecordBatchReader>* out); ::arrow::Status GetRecordBatchReader(std::shared_ptr<::arrow::RecordBatchReader>* out); + /// \brief Return a generator of record batches. 
/// /// The FileReader must outlive the generator, so this requires that you pass in a diff --git a/cpp/src/parquet/arrow/reader_internal.h b/cpp/src/parquet/arrow/reader_internal.h index fba583d27b06..4d98f8a7fe5c 100644 --- a/cpp/src/parquet/arrow/reader_internal.h +++ b/cpp/src/parquet/arrow/reader_internal.h @@ -113,7 +113,7 @@ struct ReaderContext { FileColumnIteratorFactory iterator_factory; bool filter_leaves; std::shared_ptr> included_leaves; - std::map row_ranges_map; + std::shared_ptr> row_ranges_map; bool IncludesLeaf(int leaf_index) const { if (this->filter_leaves) { diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 0c81087a3770..d884e0144e4b 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -343,11 +343,15 @@ namespace parquet { bool IsOverlap(const Range&other) const { return !IsBefore(other) && !IsAfter(other); } + bool IsValid() const { return from >= 0 && to >= 0 && to >= from; } + std::string ToString() const { return "[" + std::to_string(from) + ", " + std::to_string(to) + "]"; } + // inclusive int64_t from; + // inclusive int64_t to; }; @@ -364,6 +368,26 @@ namespace parquet { RowRanges(RowRanges&&other) noexcept { ranges = std::move(other.ranges); } + static RowRanges Intersection(const RowRanges& left, const RowRanges& right) { + RowRanges result; + + size_t rightIndex = 0; + for (const Range& l : left.ranges) { + for (size_t i = rightIndex, n = right.ranges.size(); i < n; ++i) { + const Range& r = right.ranges[i]; + if (l.IsBefore(r)) { + break; + } else if (l.IsAfter(r)) { + rightIndex = i + 1; + continue; + } + result.Add(Range::Intersection(l, r)); + } + } + + return result; + } + void Add(const Range&range, bool merge = true) { Range rangeToAdd = range; if (merge) { @@ -423,11 +447,57 @@ namespace parquet { std::vector& GetRanges() { return ranges; } + const std::vector& GetRanges() const { return ranges; } + + // Split the ranges into N+1 parts at the given split 
point, where N = split_points.size() + // The RowRows object itself is not modified + std::vector SplitAt(const std::vector&split_points) const { + if (split_points.size() == 0) { + return {*this}; + } + + std::vector result; + int64_t last_split_point = -1; + for (const int64_t split_point: split_points) { + if (split_point <= 0) { + throw ParquetException("Invalid split point " + std::to_string(split_point)); + } + if (split_point <= last_split_point) { + throw ParquetException("Split points must be in ascending order"); + } + last_split_point = split_point; + } + + RowRanges spaces; + for (size_t i = 0 ; i < split_points.size(); ++i) { + auto start = i == 0 ? 0 : split_points[i - 1]; + auto end = split_points[i] - 1; + spaces.Add({start, end}, false); + } + spaces.Add({split_points[split_points.size() - 1], std::numeric_limits::max()}, + false); + + for(Range space : spaces.GetRanges()) { + RowRanges intersection = RowRanges::Intersection(RowRanges(space), *this); + result.push_back(intersection); + } + + return result; + } + const Range& operator[](size_t index) const { assert(index < ranges.size()); return ranges[index]; } + RowRanges shift(const int64_t offset) const { + RowRanges result; + for (const Range&range: ranges) { + result.Add({range.from + offset, range.to + offset}); + } + return result; + } + std::string ToString() const { std::string result = "["; for (const Range&range: ranges) { @@ -450,7 +520,7 @@ namespace parquet { namespace internal { class PARQUET_EXPORT RecordSkipper { public: - RecordSkipper(RowRanges&pages, RowRanges&row_ranges_) + RecordSkipper(RowRanges&pages, const RowRanges&row_ranges_) : row_ranges(row_ranges_) { // copy row_ranges RowRanges will_process_pages, skip_pages; @@ -496,7 +566,7 @@ namespace parquet { } private: - void adjust_ranges(RowRanges& skip_pages, RowRanges& to_adjust) { + void adjust_ranges(RowRanges&skip_pages, RowRanges&to_adjust) { size_t skipped_rows = 0; auto iter = to_adjust.GetRanges().begin(); auto 
skip_iter = skip_pages.GetRanges().begin(); diff --git a/cpp/src/parquet/range_reader_test.cc b/cpp/src/parquet/range_reader_test.cc index abbbb5fa60e4..6fcc35ec4fd4 100644 --- a/cpp/src/parquet/range_reader_test.cc +++ b/cpp/src/parquet/range_reader_test.cc @@ -33,6 +33,9 @@ #include #include +using parquet::Range; +using parquet::RowRanges; + std::string random_string(std::string::size_type length) { static auto& chrs = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; @@ -181,7 +184,7 @@ bool checking_col(const std::string& col_name, column_names.end(); } -void check_rb(std::shared_ptr rb_reader, +void check_rb(std::unique_ptr rb_reader, const size_t expected_rows, const int64_t expected_sum) { const std::vector column_names = rb_reader->schema()->field_names(); @@ -272,128 +275,94 @@ class TestRecordBatchReaderWithRanges : public testing::Test { std::unique_ptr arrow_reader; }; +TEST_F(TestRecordBatchReaderWithRanges, TestRangesSplit) {} + TEST_F(TestRecordBatchReaderWithRanges, SelectOnePageForEachRG) { - std::shared_ptr rb_reader; - auto row_ranges_map = std::map(); - row_ranges_map.insert({0, std::make_shared(parquet::Range{0, 9})}); - row_ranges_map.insert( - {1, std::make_shared(parquet::Range{10, 19})}); - row_ranges_map.insert( - {2, std::make_shared(parquet::Range{20, 29})}); - row_ranges_map.insert({3, std::make_shared(parquet::Range{0, 9})}); + std::unique_ptr rb_reader; + RowRanges rows{{Range{0, 9}, Range{40, 49}, Range{80, 89}, Range{90, 99}}}; const std::vector column_indices{0, 1, 2, 3, 4}; - ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, - row_ranges_map, &rb_reader)); + ASSERT_OK(arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader)); // (0+...+9) + (40+...+49) + (80+...+89) + (90+...+99) = 2280 - check_rb(rb_reader, 40, 2280); + check_rb(std::move(rb_reader), 40, 2280); } TEST_F(TestRecordBatchReaderWithRanges, SelectSomePageForOneRG) { - std::shared_ptr rb_reader; - auto row_ranges_map = std::map(); - 
row_ranges_map.insert( - {0, std::make_shared(std::vector{ - parquet::Range{0, 7}, parquet::Range{16, 23}})}); - row_ranges_map.insert({1, nullptr}); - row_ranges_map.insert({2, nullptr}); - row_ranges_map.insert({3, nullptr}); + std::unique_ptr rb_reader; + RowRanges rows{{Range{0, 7}, Range{16, 23}}}; const std::vector column_indices{0, 1, 2, 3, 4}; - ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, - row_ranges_map, &rb_reader)); + ASSERT_OK(arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader)); // (0+...+7) + (16+...+23) = 184 - check_rb(rb_reader, 16, 184); + check_rb(std::move(rb_reader), 16, 184); } TEST_F(TestRecordBatchReaderWithRanges, SelectAllRange) { - std::shared_ptr rb_reader; - auto row_ranges_map = std::map(); - row_ranges_map.insert({0, std::make_shared(parquet::Range{0, 29})}); - row_ranges_map.insert({1, std::make_shared(parquet::Range{0, 29})}); - row_ranges_map.insert({2, std::make_shared(parquet::Range{0, 29})}); - row_ranges_map.insert({3, std::make_shared(parquet::Range{0, 9})}); + std::unique_ptr rb_reader; + RowRanges rows{{Range{0, 29}, Range{30, 59}, Range{60, 89}, Range{90, 99}}}; const std::vector column_indices{0, 1, 2, 3, 4}; - ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, - row_ranges_map, &rb_reader)); + ASSERT_OK(arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader)); // (0+...+99) = 4950 - check_rb(rb_reader, 100, 4950); + check_rb(std::move(rb_reader), 100, 4950); } TEST_F(TestRecordBatchReaderWithRanges, SelectEmptyRange) { - std::shared_ptr rb_reader; - auto row_ranges_map = std::map(); - // here we test four kinds of empty range: - - // rg 0 not put into map -> will read - row_ranges_map.insert({1, nullptr}); // value is nullptr -> will skip - row_ranges_map.insert( - {2, std::make_shared( - std::vector())}); // value is empty -> will skip - row_ranges_map.insert( - {3, std::make_shared()}); // value is empty -> will skip + std::unique_ptr 
rb_reader; + RowRanges rows{}; const std::vector column_indices{0, 1, 2, 3, 4}; - const auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, - row_ranges_map, &rb_reader); + const auto status = + arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader); ASSERT_OK(status); - // (0+...29) = 435 - check_rb(rb_reader, 30, 435); + check_rb(std::move(rb_reader), 0, 0); } TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { // case 1: only care about RG 0 { - std::shared_ptr rb_reader; - auto row_ranges_map = std::map(); + std::unique_ptr rb_reader; std::vector ranges; for (int64_t i = 0; i < 30; i++) { if (i % 2 == 0) ranges.push_back({i, i}); } - row_ranges_map.insert({0, std::make_shared(ranges)}); - row_ranges_map.insert({1, nullptr}); - row_ranges_map.insert({2, nullptr}); - row_ranges_map.insert({3, nullptr}); const std::vector column_indices{0, 1, 2, 3, 4}; - ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, - row_ranges_map, &rb_reader)); + ASSERT_OK(arrow_reader->GetRecordBatchReader(RowRanges(ranges), column_indices, + &rb_reader)); - check_rb(rb_reader, 15, 210); // 0 + 2 + ... + 28 = 210 + check_rb(std::move(rb_reader), 15, 210); // 0 + 2 + ... 
+ 28 = 210 } // case 2: care about RG 0 and 2 { - std::shared_ptr rb_reader; - auto row_ranges_map = std::map(); + std::unique_ptr rb_reader; std::vector ranges; for (int64_t i = 0; i < 30; i++) { if (i % 2 == 0) ranges.push_back({i, i}); } - row_ranges_map.insert({0, std::make_shared(ranges)}); - row_ranges_map.insert({1, nullptr}); - row_ranges_map.insert({2, std::make_shared(ranges)}); - row_ranges_map.insert({3, nullptr}); + + for (int64_t i = 60; i < 90; i++) { + if (i % 2 == 0) ranges.push_back({i, i}); + } const std::vector column_indices{0, 1, 2, 3, 4}; - ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, - row_ranges_map, &rb_reader)); + ASSERT_OK(arrow_reader->GetRecordBatchReader(RowRanges(ranges), column_indices, + &rb_reader)); - check_rb(rb_reader, 30, 1320); // (0 + 2 + ... + 28) + (60 + 62 ... + 88) = 1320 + check_rb(std::move(rb_reader), 30, + 1320); // (0 + 2 + ... + 28) + (60 + 62 ... + 88) = 1320 } } TEST_F(TestRecordBatchReaderWithRanges, InvalidRanges) { - std::shared_ptr rb_reader; + std::unique_ptr rb_reader; { - auto row_ranges_map = std::map(); - row_ranges_map.insert( - {0, std::make_shared(parquet::Range{-1, 5})}); + RowRanges rows{{Range{-1, 5}}}; const std::vector column_indices{0, 1, 2, 3, 4}; - const auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, - row_ranges_map, &rb_reader); + const auto status = + arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader); ASSERT_NOT_OK(status); EXPECT_TRUE(status.message().find("The provided row range is invalid, keep it " "monotone and non-interleaving: [(-1, 5)]") != @@ -401,28 +370,25 @@ TEST_F(TestRecordBatchReaderWithRanges, InvalidRanges) { } { - auto row_ranges_map = std::map(); - row_ranges_map.insert({0, std::make_shared(std::vector{ - parquet::Range{0, 4}, parquet::Range{2, 5}})}); + RowRanges rows{{Range{0, 4}, {2, 5}}}; const std::vector column_indices{0, 1, 2, 3, 4}; - const auto status = 
arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, - row_ranges_map, &rb_reader); + const auto status = + arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader); ASSERT_NOT_OK(status); EXPECT_TRUE( status.message().find("The provided row range is invalid, keep it monotone and " "non-interleaving: [(0, 4), (2, 5)]") != std::string::npos); } { - auto row_ranges_map = std::map(); - row_ranges_map.insert( - {0, std::make_shared(std::vector{parquet::Range{0, 30}})}); + // will treat as {0,99} + RowRanges rows{{Range{0, 100}}}; const std::vector column_indices{0, 1, 2, 3, 4}; - const auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, - row_ranges_map, &rb_reader); + const auto status = + arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader); ASSERT_NOT_OK(status); - EXPECT_TRUE( - status.message().find("The provided row range [(0, 30)] exceeds last page :") != - std::string::npos); + EXPECT_TRUE(status.message().find("The provided row range [(0, 100)] exceeds the " + "number of rows in the file: 100") != + std::string::npos); } } @@ -463,12 +429,10 @@ TEST(TestRecordBatchReaderWithRangesBadCases, NoPageIndex) { reader_builder.properties(arrow_reader_props); ASSERT_OK_AND_ASSIGN(auto arrow_reader, reader_builder.Build()); - std::shared_ptr rb_reader; - auto row_ranges_map = std::map(); - row_ranges_map.insert({0, std::make_shared(parquet::Range{0, 29})}); + std::unique_ptr rb_reader; + RowRanges rows{{Range{0, 29}}}; std::vector column_indices{0, 1, 2, 3, 4}; - auto status = arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, - row_ranges_map, &rb_reader); + auto status = arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader); ASSERT_NOT_OK(status); EXPECT_TRUE(status.message().find("Attempting to read with Ranges but Page Index is " "not found for Row Group: 0") != std::string::npos); @@ -505,22 +469,21 @@ class TestRecordBatchReaderWithRangesWithNulls : public testing::Test 
{ TEST_F(TestRecordBatchReaderWithRangesWithNulls, SelectOneRowSkipOneRow) { { - std::shared_ptr rb_reader; - auto row_ranges_map = std::map(); + std::unique_ptr rb_reader; std::vector ranges; for (int64_t i = 0; i < 30; i++) { if (i % 2 == 0) ranges.push_back({i, i}); } - row_ranges_map.insert({0, std::make_shared(ranges)}); - row_ranges_map.insert({1, nullptr}); - row_ranges_map.insert({2, std::make_shared(ranges)}); - row_ranges_map.insert({3, nullptr}); + + for (int64_t i = 60; i < 90; i++) { + if (i % 2 == 0) ranges.push_back({i, i}); + } const std::vector column_indices{0, 1, 2, 3, 4}; - ASSERT_OK(arrow_reader->GetRecordBatchReader({0, 1, 2, 3}, column_indices, - row_ranges_map, &rb_reader)); + ASSERT_OK(arrow_reader->GetRecordBatchReader(RowRanges(ranges), column_indices, + &rb_reader)); // 0-9 is masked as null, so the ramaining is: // (10 + 12 + ... + 28) + (60 + 62 ... + 88) = 1320 - check_rb(rb_reader, 30, 1300); + check_rb(std::move(rb_reader), 30, 1300); } } \ No newline at end of file diff --git a/cpp/src/parquet/row_range_test.cc b/cpp/src/parquet/row_range_test.cc new file mode 100644 index 000000000000..3766df5e0fb5 --- /dev/null +++ b/cpp/src/parquet/row_range_test.cc @@ -0,0 +1,102 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +#include +#include "parquet/column_reader.h" + +using namespace parquet; + +class RowRangesTest : public ::testing::Test { + protected: + RowRanges rowRanges; +}; + +TEST_F(RowRangesTest, SplitAt_EmptySplitPoints_ReturnsOriginalRowRanges) { + rowRanges.Add(Range(0, 10)); + std::vector split_points; + + auto result = rowRanges.SplitAt(split_points); + + ASSERT_EQ(result.size(), 1); + ASSERT_EQ(result[0].GetRanges().size(), 1); + ASSERT_EQ(result[0][0].from, 0); + ASSERT_EQ(result[0][0].to, 10); +} + +TEST_F(RowRangesTest, SplitAt_SingleSplitPoint_ReturnsTwoRowRanges) { + rowRanges.Add(Range(0, 10)); + std::vector split_points = {5}; + + auto result = rowRanges.SplitAt(split_points); + + ASSERT_EQ(result.size(), 2); + ASSERT_EQ(result[0].GetRanges().size(), 1); + ASSERT_EQ(result[0][0].from, 0); + ASSERT_EQ(result[0][0].to, 4); + ASSERT_EQ(result[1].GetRanges().size(), 1); + ASSERT_EQ(result[1][0].from, 5); + ASSERT_EQ(result[1][0].to, 10); +} + +TEST_F(RowRangesTest, SplitAt_MultipleSplitPoints_ReturnsMultipleRowRanges) { + rowRanges.Add(Range(0, 10)); + std::vector split_points = {3, 7}; + + auto result = rowRanges.SplitAt(split_points); + + ASSERT_EQ(result.size(), 3); + ASSERT_EQ(result[0].GetRanges().size(), 1); + ASSERT_EQ(result[0][0].from, 0); + ASSERT_EQ(result[0][0].to, 2); + ASSERT_EQ(result[1].GetRanges().size(), 1); + ASSERT_EQ(result[1][0].from, 3); + ASSERT_EQ(result[1][0].to, 6); + ASSERT_EQ(result[2].GetRanges().size(), 1); + ASSERT_EQ(result[2][0].from, 7); + ASSERT_EQ(result[2][0].to, 10); +} + +TEST_F(RowRangesTest, SplitAt_MultipleSplitPoints_ReturnWithEmptyRowRanges) { + rowRanges.Add(Range(11, 18)); + std::vector split_points = {5, 10, 15, 20}; + + auto result = rowRanges.SplitAt(split_points); + + ASSERT_EQ(result.size(), 5); + ASSERT_EQ(result[0].GetRanges().size(), 0); + ASSERT_EQ(result[1].GetRanges().size(), 0); + 
ASSERT_EQ(result[2].GetRanges().size(), 1); + ASSERT_EQ(result[2][0].from, 11); + ASSERT_EQ(result[2][0].to, 14); + ASSERT_EQ(result[3].GetRanges().size(), 1); + ASSERT_EQ(result[3][0].from, 15); + ASSERT_EQ(result[3][0].to, 18); + ASSERT_EQ(result[4].GetRanges().size(), 0); +} + +TEST_F(RowRangesTest, SplitAt_InvalidSplitPoint_ThrowsException) { + rowRanges.Add(Range(0, 10)); + std::vector split_points = {-1}; + + ASSERT_THROW(rowRanges.SplitAt(split_points), ParquetException); +} + +TEST_F(RowRangesTest, SplitAt_UnorderedSplitPoints_ThrowsException) { + rowRanges.Add(Range(0, 10)); + std::vector split_points = {5, 3}; + + ASSERT_THROW(rowRanges.SplitAt(split_points), ParquetException); +} From 14974c096f181c6ae6be8f7f054e5282501ec8b9 Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Tue, 2 Jan 2024 00:03:34 +0800 Subject: [PATCH 14/25] clean code --- cpp/src/parquet/arrow/reader.cc | 25 ++-- cpp/src/parquet/column_reader.h | 175 +++++++++++++-------------- cpp/src/parquet/range_reader_test.cc | 22 ++-- cpp/src/parquet/row_range_test.cc | 44 +++---- 4 files changed, 129 insertions(+), 137 deletions(-) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 222493487cc4..92b746b8ad92 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -74,8 +74,7 @@ using arrow::internal::Iota; // Help reduce verbosity using ParquetReader = parquet::ParquetFileReader; -using parquet::Range; -using parquet::RowRangesPtr; +using parquet::IntervalRange; using parquet::internal::RecordReader; namespace bit_util = arrow::bit_util; @@ -359,7 +358,7 @@ class FileReaderImpl : public FileReader { rows_to_return.ToString()); } // check if the row ranges are within the row group boundaries - if (rows_to_return.RowCount() != 0 && rows_to_return.GetRanges().back().to >= metadata->num_rows()) { + if (rows_to_return.RowCount() != 0 && rows_to_return.GetRanges().back().end >= metadata->num_rows()) { return Status::Invalid("The provided 
row range " + rows_to_return.ToString() + " exceeds the number of rows in the file: " + std::to_string(metadata->num_rows())); @@ -507,16 +506,18 @@ class RowGroupReaderImpl : public RowGroupReader { struct RowRangesPageFilter { explicit RowRangesPageFilter(const RowRanges& row_ranges_, - const RowRangesPtr& page_ranges_) + const std::shared_ptr& page_ranges_) : row_ranges(row_ranges_), page_ranges(page_ranges_) { - assert(page_ranges != nullptr); - assert(page_ranges->GetRanges().size() > 0); + + if (page_ranges == nullptr || page_ranges->GetRanges().size() == 0) { + throw ParquetException("Page ranges is empty"); + } } bool operator()(const DataPageStats& stats) { ++page_range_idx; - Range current_page_range = (*page_ranges)[page_range_idx]; + IntervalRange current_page_range = (*page_ranges)[page_range_idx]; while (row_range_idx < row_ranges.GetRanges().size() && current_page_range.IsAfter(row_ranges[row_range_idx])) { @@ -534,7 +535,7 @@ struct RowRangesPageFilter { const RowRanges & row_ranges; int page_range_idx = -1; - const RowRangesPtr page_ranges; + const std::shared_ptr page_ranges; }; // Leaf reader is for primitive arrays and primitive children of nested arrays @@ -624,19 +625,17 @@ class LeafReader : public ColumnReaderImpl { page_ranges = std::make_shared(); for (size_t i = 0; i < page_locations.size() - 1; i++) { page_ranges->Add( - {page_locations[i].first_row_index, page_locations[i + 1].first_row_index - 1}, - false); + {page_locations[i].first_row_index, page_locations[i + 1].first_row_index - 1}); } if (page_locations.size() >= 1) { page_ranges->Add( {page_locations[page_locations.size() - 1].first_row_index, ctx_->reader->metadata()->RowGroup(input_->current_row_group())->num_rows() - - 1}, - false); + 1}); } if (row_ranges.GetRanges().size() > 0) { - if (row_ranges.GetRanges().back().to > page_ranges->GetRanges().back().to) { + if (row_ranges.GetRanges().back().end > page_ranges->GetRanges().back().end) { throw ParquetException( "The provided 
row range " + row_ranges.ToString() + " exceeds last page :" + page_ranges->GetRanges().back().ToString()); diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index d884e0144e4b..9b9393e4ecc2 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -17,12 +17,12 @@ #pragma once -#include #include #include #include #include +#include "page_index.h" #include "parquet/exception.h" #include "parquet/level_conversion.h" #include "parquet/metadata.h" @@ -306,64 +306,64 @@ namespace parquet { int32_t* dict_len) = 0; }; - struct Range { - static Range UnionRange(const Range&left, const Range&right) { - if (left.from <= right.from) { - if (left.to + 1 >= right.from) { - return {left.from, std::max(left.to, right.to)}; + // Represent a range to read. The range is inclusive on both ends. + struct IntervalRange { + static IntervalRange Intersection(const IntervalRange&left, const IntervalRange&right) { + if (left.start <= right.start) { + if (left.end >= right.start) { + return {right.start, std::min(left.end, right.end)}; } } - else if (right.to + 1 >= left.from) { - return {right.from, std::max(left.to, right.to)}; - } - return {-1, -1}; - } - - static Range Intersection(const Range&left, const Range&right) { - if (left.from <= right.from) { - if (left.to >= right.from) { - return {right.from, std::min(left.to, right.to)}; - } - } - else if (right.to >= left.from) { - return {left.from, std::min(left.to, right.to)}; + else if (right.end >= left.start) { + return {left.start, std::min(left.end, right.end)}; } return {-1, -1}; // Return a default Range object if no intersection range found } - Range(const int64_t from_, const int64_t to_) : from(from_), to(to_) { - assert(from <= to); + IntervalRange(const int64_t start_, const int64_t end_) : start(start_), end(end_) { + if (start > end) { + throw ParquetException("Invalid range with start: " + std::to_string(start) + + " and end: " + std::to_string(end)); + } } - 
size_t Count() const { return to - from + 1; } + size_t Count() const { return end - start + 1; } - bool IsBefore(const Range&other) const { return to < other.from; } + bool IsBefore(const IntervalRange&other) const { return end < other.start; } - bool IsAfter(const Range&other) const { return from > other.to; } + bool IsAfter(const IntervalRange&other) const { return start > other.end; } - bool IsOverlap(const Range&other) const { return !IsBefore(other) && !IsAfter(other); } + bool IsOverlap(const IntervalRange&other) const { return !IsBefore(other) && !IsAfter(other); } - bool IsValid() const { return from >= 0 && to >= 0 && to >= from; } + bool IsValid() const { return start >= 0 && end >= 0 && end >= start; } std::string ToString() const { - return "[" + std::to_string(from) + ", " + std::to_string(to) + "]"; + return "[" + std::to_string(start) + ", " + std::to_string(end) + "]"; } // inclusive - int64_t from; + int64_t start; // inclusive - int64_t to; + int64_t end; + }; + + struct BitmapRange { + int64_t offset; + // zero added to, if there are less than 64 elements left in the column. + uint64_t bitmap; }; + struct End {}; + + // Represent a set of ranges to read. The ranges are sorted and non-overlapping. 
class RowRanges { public: RowRanges() = default; - explicit RowRanges(const Range&range) { ranges.push_back(range); } + explicit RowRanges(const IntervalRange&range) { ranges.push_back(range); } - RowRanges(const std::vector&ranges) { this->ranges = ranges; } + RowRanges(const std::vector&ranges) { this->ranges = ranges; } - // copy cstr RowRanges(const RowRanges&other) { ranges = other.ranges; } RowRanges(RowRanges&&other) noexcept { ranges = std::move(other.ranges); } @@ -372,49 +372,33 @@ namespace parquet { RowRanges result; size_t rightIndex = 0; - for (const Range& l : left.ranges) { + for (const IntervalRange& l : left.ranges) { for (size_t i = rightIndex, n = right.ranges.size(); i < n; ++i) { - const Range& r = right.ranges[i]; + const IntervalRange& r = right.ranges[i]; if (l.IsBefore(r)) { break; } else if (l.IsAfter(r)) { rightIndex = i + 1; continue; } - result.Add(Range::Intersection(l, r)); + result.Add(IntervalRange::Intersection(l, r)); } } return result; } - void Add(const Range&range, bool merge = true) { - Range rangeToAdd = range; - if (merge) { - for (int i = static_cast(ranges.size()) - 1; i >= 0; --i) { - Range last = ranges[i]; - if (last.IsAfter(range)) { - throw ParquetException(range.ToString() + " cannot be added to " + - this->ToString()); - } - const Range u = Range::UnionRange(last, rangeToAdd); - if (u.from == -1 && u.to == -1) { - break; - } - rangeToAdd = u; - ranges.erase(ranges.begin() + i); - } - } - else { - if (ranges.size() > 1) - assert(rangeToAdd.from > ranges.back().to); + void Add(const IntervalRange&range) { + const IntervalRange rangeToAdd = range; + if (ranges.size() > 1 && rangeToAdd.start <= ranges.back().end) { + throw ParquetException("Ranges must be added in order"); } ranges.push_back(rangeToAdd); } size_t RowCount() const { size_t cnt = 0; - for (const Range&range: ranges) { + for (const IntervalRange&range: ranges) { cnt += range.Count(); } return cnt; @@ -422,32 +406,32 @@ namespace parquet { bool IsValid() 
const { if (ranges.size() == 0) return true; - if (ranges[0].from < 0) { + if (ranges[0].start < 0) { return false; } for (size_t i = 1; i < ranges.size(); i++) { - if (ranges[i].from <= ranges[i - 1].to) { + if (ranges[i].start <= ranges[i - 1].end) { return false; } } return true; } - bool IsOverlapping(int64_t from, int64_t to) const { - const Range searchRange(from, to); + bool IsOverlapping(int64_t start, int64_t end) const { + const IntervalRange searchRange(start, end); return IsOverlapping(searchRange); } - bool IsOverlapping(const Range&searchRange) const { + bool IsOverlapping(const IntervalRange&searchRange) const { auto it = std::lower_bound( ranges.begin(), ranges.end(), searchRange, - [](const Range&r1, const Range&r2) { return r1.IsBefore(r2); }); + [](const IntervalRange&r1, const IntervalRange&r2) { return r1.IsBefore(r2); }); return it != ranges.end() && !(*it).IsAfter(searchRange); } - std::vector& GetRanges() { return ranges; } + std::vector& GetRanges() { return ranges; } - const std::vector& GetRanges() const { return ranges; } + const std::vector& GetRanges() const { return ranges; } // Split the ranges into N+1 parts at the given split point, where N = split_points.size() // The RowRows object itself is not modified @@ -472,12 +456,11 @@ namespace parquet { for (size_t i = 0 ; i < split_points.size(); ++i) { auto start = i == 0 ? 
0 : split_points[i - 1]; auto end = split_points[i] - 1; - spaces.Add({start, end}, false); + spaces.Add({start, end}); } - spaces.Add({split_points[split_points.size() - 1], std::numeric_limits::max()}, - false); + spaces.Add({split_points[split_points.size() - 1], std::numeric_limits::max()}); - for(Range space : spaces.GetRanges()) { + for(IntervalRange space : spaces.GetRanges()) { RowRanges intersection = RowRanges::Intersection(RowRanges(space), *this); result.push_back(intersection); } @@ -485,24 +468,27 @@ namespace parquet { return result; } - const Range& operator[](size_t index) const { - assert(index < ranges.size()); + const IntervalRange& operator[](size_t index) const { + // check index + if (index >= ranges.size() || index < 0) { + throw ParquetException("Index out of range"); + } return ranges[index]; } RowRanges shift(const int64_t offset) const { RowRanges result; - for (const Range&range: ranges) { - result.Add({range.from + offset, range.to + offset}); + for (const IntervalRange&range: ranges) { + result.Add({range.start + offset, range.end + offset}); } return result; } std::string ToString() const { std::string result = "["; - for (const Range&range: ranges) { + for (const IntervalRange&range: ranges) { result += - "(" + std::to_string(range.from) + ", " + std::to_string(range.to) + "), "; + "(" + std::to_string(range.start) + ", " + std::to_string(range.end) + "), "; } if (!ranges.empty()) { result = result.substr(0, result.size() - 2); @@ -511,12 +497,20 @@ namespace parquet { return result; } + /// The following APIs are to be implemented + /// Comment out for now to pass compile + +// // Returns a vector of PageLocations that must be read all to get values for all included in this range +// virtual std::vector PageIndexesToInclude(const std::vector& all_pages) = 0; +// class Iterator { +// virtual std::variant NextRange() = 0; +// }; +// virtual std::unique_ptr NewIterator() = 0; + private: - std::vector ranges; + std::vector ranges; }; - 
using RowRangesPtr = std::shared_ptr; - namespace internal { class PARQUET_EXPORT RecordSkipper { public: @@ -526,11 +520,11 @@ namespace parquet { RowRanges will_process_pages, skip_pages; for (auto&page: pages.GetRanges()) { if (!row_ranges.IsOverlapping(page)) { - skip_pages.Add(page, false); + skip_pages.Add(page); } } - /// Since the skipped pages will be slienly skipped without updating + /// Since the skipped pages will be silently skipped without updating /// current_rg_processed_records or records_read_, we need to pre-process the row /// ranges as if these skipped pages never existed adjust_ranges(skip_pages, row_ranges); @@ -542,31 +536,30 @@ namespace parquet { /// if return values is positive, it means to read N records /// if return values is negative, it means to skip N records /// if return values is 0, it means end of RG - int64_t advise_next(const int64_t current_rg_procesed) { + int64_t advise_next(const int64_t current_rg_processed) { if (row_ranges.GetRanges().size() == row_range_idx) { return 0; } - if (row_ranges[row_range_idx].to < current_rg_procesed) { + if (row_ranges[row_range_idx].end < current_rg_processed) { row_range_idx++; if (row_ranges.GetRanges().size() == row_range_idx) { // negative, skip the ramaining rows - return current_rg_procesed - total_rows_to_process; + return current_rg_processed - total_rows_to_process; } } - if (row_ranges[row_range_idx].from > current_rg_procesed) { + if (row_ranges[row_range_idx].start > current_rg_processed) { // negative, skip - return current_rg_procesed - row_ranges[row_range_idx].from; + return current_rg_processed - row_ranges[row_range_idx].start; } - const auto ret = row_ranges[row_range_idx].to - current_rg_procesed + 1; - assert(ret > 0); + const auto ret = row_ranges[row_range_idx].end - current_rg_processed + 1; return ret; } private: - void adjust_ranges(RowRanges&skip_pages, RowRanges&to_adjust) { + void adjust_ranges(RowRanges & skip_pages, RowRanges & to_adjust) { size_t 
skipped_rows = 0; auto iter = to_adjust.GetRanges().begin(); auto skip_iter = skip_pages.GetRanges().begin(); @@ -575,13 +568,13 @@ namespace parquet { skipped_rows += skip_iter->Count(); ++skip_iter; } - iter->from -= skipped_rows; - iter->to -= skipped_rows; + iter->start -= skipped_rows; + iter->end -= skipped_rows; ++iter; } } - /// Keep copy of ranges, because advise_next() will modify them + /// Keep copy of ranges, because adjust_ranges() will modify them RowRanges row_ranges; size_t row_range_idx = 0; diff --git a/cpp/src/parquet/range_reader_test.cc b/cpp/src/parquet/range_reader_test.cc index 6fcc35ec4fd4..b3127a8e346c 100644 --- a/cpp/src/parquet/range_reader_test.cc +++ b/cpp/src/parquet/range_reader_test.cc @@ -33,7 +33,7 @@ #include #include -using parquet::Range; +using parquet::IntervalRange; using parquet::RowRanges; std::string random_string(std::string::size_type length) { @@ -279,7 +279,7 @@ TEST_F(TestRecordBatchReaderWithRanges, TestRangesSplit) {} TEST_F(TestRecordBatchReaderWithRanges, SelectOnePageForEachRG) { std::unique_ptr rb_reader; - RowRanges rows{{Range{0, 9}, Range{40, 49}, Range{80, 89}, Range{90, 99}}}; + RowRanges rows{{IntervalRange{0, 9}, IntervalRange{40, 49}, IntervalRange{80, 89}, IntervalRange{90, 99}}}; const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader)); @@ -290,7 +290,7 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOnePageForEachRG) { TEST_F(TestRecordBatchReaderWithRanges, SelectSomePageForOneRG) { std::unique_ptr rb_reader; - RowRanges rows{{Range{0, 7}, Range{16, 23}}}; + RowRanges rows{{IntervalRange{0, 7}, IntervalRange{16, 23}}}; const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader)); @@ -301,7 +301,7 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectSomePageForOneRG) { TEST_F(TestRecordBatchReaderWithRanges, SelectAllRange) { std::unique_ptr rb_reader; - RowRanges 
rows{{Range{0, 29}, Range{30, 59}, Range{60, 89}, Range{90, 99}}}; + RowRanges rows{{IntervalRange{0, 29}, IntervalRange{30, 59}, IntervalRange{60, 89}, IntervalRange{90, 99}}}; const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader)); @@ -325,7 +325,7 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { // case 1: only care about RG 0 { std::unique_ptr rb_reader; - std::vector ranges; + std::vector ranges; for (int64_t i = 0; i < 30; i++) { if (i % 2 == 0) ranges.push_back({i, i}); } @@ -339,7 +339,7 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { // case 2: care about RG 0 and 2 { std::unique_ptr rb_reader; - std::vector ranges; + std::vector ranges; for (int64_t i = 0; i < 30; i++) { if (i % 2 == 0) ranges.push_back({i, i}); } @@ -359,7 +359,7 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { TEST_F(TestRecordBatchReaderWithRanges, InvalidRanges) { std::unique_ptr rb_reader; { - RowRanges rows{{Range{-1, 5}}}; + RowRanges rows{{IntervalRange{-1, 5}}}; const std::vector column_indices{0, 1, 2, 3, 4}; const auto status = arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader); @@ -370,7 +370,7 @@ TEST_F(TestRecordBatchReaderWithRanges, InvalidRanges) { } { - RowRanges rows{{Range{0, 4}, {2, 5}}}; + RowRanges rows{{IntervalRange{0, 4}, {2, 5}}}; const std::vector column_indices{0, 1, 2, 3, 4}; const auto status = arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader); @@ -381,7 +381,7 @@ TEST_F(TestRecordBatchReaderWithRanges, InvalidRanges) { } { // will treat as {0,99} - RowRanges rows{{Range{0, 100}}}; + RowRanges rows{{IntervalRange{0, 100}}}; const std::vector column_indices{0, 1, 2, 3, 4}; const auto status = arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader); @@ -430,7 +430,7 @@ TEST(TestRecordBatchReaderWithRangesBadCases, NoPageIndex) { ASSERT_OK_AND_ASSIGN(auto arrow_reader, 
reader_builder.Build()); std::unique_ptr rb_reader; - RowRanges rows{{Range{0, 29}}}; + RowRanges rows{{IntervalRange{0, 29}}}; std::vector column_indices{0, 1, 2, 3, 4}; auto status = arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader); ASSERT_NOT_OK(status); @@ -470,7 +470,7 @@ class TestRecordBatchReaderWithRangesWithNulls : public testing::Test { TEST_F(TestRecordBatchReaderWithRangesWithNulls, SelectOneRowSkipOneRow) { { std::unique_ptr rb_reader; - std::vector ranges; + std::vector ranges; for (int64_t i = 0; i < 30; i++) { if (i % 2 == 0) ranges.push_back({i, i}); } diff --git a/cpp/src/parquet/row_range_test.cc b/cpp/src/parquet/row_range_test.cc index 3766df5e0fb5..2e043f57a7b2 100644 --- a/cpp/src/parquet/row_range_test.cc +++ b/cpp/src/parquet/row_range_test.cc @@ -25,52 +25,52 @@ class RowRangesTest : public ::testing::Test { }; TEST_F(RowRangesTest, SplitAt_EmptySplitPoints_ReturnsOriginalRowRanges) { - rowRanges.Add(Range(0, 10)); + rowRanges.Add(IntervalRange(0, 10)); std::vector split_points; auto result = rowRanges.SplitAt(split_points); ASSERT_EQ(result.size(), 1); ASSERT_EQ(result[0].GetRanges().size(), 1); - ASSERT_EQ(result[0][0].from, 0); - ASSERT_EQ(result[0][0].to, 10); + ASSERT_EQ(result[0][0].start, 0); + ASSERT_EQ(result[0][0].end, 10); } TEST_F(RowRangesTest, SplitAt_SingleSplitPoint_ReturnsTwoRowRanges) { - rowRanges.Add(Range(0, 10)); + rowRanges.Add(IntervalRange(0, 10)); std::vector split_points = {5}; auto result = rowRanges.SplitAt(split_points); ASSERT_EQ(result.size(), 2); ASSERT_EQ(result[0].GetRanges().size(), 1); - ASSERT_EQ(result[0][0].from, 0); - ASSERT_EQ(result[0][0].to, 4); + ASSERT_EQ(result[0][0].start, 0); + ASSERT_EQ(result[0][0].end, 4); ASSERT_EQ(result[1].GetRanges().size(), 1); - ASSERT_EQ(result[1][0].from, 5); - ASSERT_EQ(result[1][0].to, 10); + ASSERT_EQ(result[1][0].start, 5); + ASSERT_EQ(result[1][0].end, 10); } TEST_F(RowRangesTest, SplitAt_MultipleSplitPoints_ReturnsMultipleRowRanges) { - 
rowRanges.Add(Range(0, 10)); + rowRanges.Add(IntervalRange(0, 10)); std::vector split_points = {3, 7}; auto result = rowRanges.SplitAt(split_points); ASSERT_EQ(result.size(), 3); ASSERT_EQ(result[0].GetRanges().size(), 1); - ASSERT_EQ(result[0][0].from, 0); - ASSERT_EQ(result[0][0].to, 2); + ASSERT_EQ(result[0][0].start, 0); + ASSERT_EQ(result[0][0].end, 2); ASSERT_EQ(result[1].GetRanges().size(), 1); - ASSERT_EQ(result[1][0].from, 3); - ASSERT_EQ(result[1][0].to, 6); + ASSERT_EQ(result[1][0].start, 3); + ASSERT_EQ(result[1][0].end, 6); ASSERT_EQ(result[2].GetRanges().size(), 1); - ASSERT_EQ(result[2][0].from, 7); - ASSERT_EQ(result[2][0].to, 10); + ASSERT_EQ(result[2][0].start, 7); + ASSERT_EQ(result[2][0].end, 10); } TEST_F(RowRangesTest, SplitAt_MultipleSplitPoints_ReturnWithEmptyRowRanges) { - rowRanges.Add(Range(11, 18)); + rowRanges.Add(IntervalRange(11, 18)); std::vector split_points = {5, 10, 15, 20}; auto result = rowRanges.SplitAt(split_points); @@ -79,23 +79,23 @@ TEST_F(RowRangesTest, SplitAt_MultipleSplitPoints_ReturnWithEmptyRowRanges) { ASSERT_EQ(result[0].GetRanges().size(), 0); ASSERT_EQ(result[1].GetRanges().size(), 0); ASSERT_EQ(result[2].GetRanges().size(), 1); - ASSERT_EQ(result[2][0].from, 11); - ASSERT_EQ(result[2][0].to, 14); + ASSERT_EQ(result[2][0].start, 11); + ASSERT_EQ(result[2][0].end, 14); ASSERT_EQ(result[3].GetRanges().size(), 1); - ASSERT_EQ(result[3][0].from, 15); - ASSERT_EQ(result[3][0].to, 18); + ASSERT_EQ(result[3][0].start, 15); + ASSERT_EQ(result[3][0].end, 18); ASSERT_EQ(result[4].GetRanges().size(), 0); } TEST_F(RowRangesTest, SplitAt_InvalidSplitPoint_ThrowsException) { - rowRanges.Add(Range(0, 10)); + rowRanges.Add(IntervalRange(0, 10)); std::vector split_points = {-1}; ASSERT_THROW(rowRanges.SplitAt(split_points), ParquetException); } TEST_F(RowRangesTest, SplitAt_UnorderedSplitPoints_ThrowsException) { - rowRanges.Add(Range(0, 10)); + rowRanges.Add(IntervalRange(0, 10)); std::vector split_points = {5, 3}; 
ASSERT_THROW(rowRanges.SplitAt(split_points), ParquetException); From ef8a7c8350f1c0e761915101a15ad9ff37d72c70 Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Tue, 2 Jan 2024 00:06:22 +0800 Subject: [PATCH 15/25] clean code --- cpp/src/parquet/column_reader.h | 1470 +++++++++++++++---------------- 1 file changed, 735 insertions(+), 735 deletions(-) diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 9b9393e4ecc2..7ebabf1f2095 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -32,743 +32,743 @@ #include "parquet/types.h" namespace arrow { - class Array; - class ChunkedArray; +class Array; +class ChunkedArray; - namespace bit_util { - class BitReader; - } // namespace bit_util +namespace bit_util { +class BitReader; +} // namespace bit_util - namespace util { - class RleDecoder; - } // namespace util -} // namespace arrow +namespace util { +class RleDecoder; +} // namespace util +} // namespace arrow namespace parquet { - class Decryptor; - class Page; - - // 16 MB is the default maximum page header size - static constexpr uint32_t kDefaultMaxPageHeaderSize = 16 * 1024 * 1024; - - // 16 KB is the default expected page header size - static constexpr uint32_t kDefaultPageHeaderSize = 16 * 1024; - - // \brief DataPageStats stores encoded statistics and number of values/rows for - // a page. - struct PARQUET_EXPORT DataPageStats { - DataPageStats(const EncodedStatistics* encoded_statistics, int32_t num_values, - std::optional num_rows) - : encoded_statistics(encoded_statistics), - num_values(num_values), - num_rows(num_rows) { +class Decryptor; +class Page; + +// 16 MB is the default maximum page header size +static constexpr uint32_t kDefaultMaxPageHeaderSize = 16 * 1024 * 1024; + +// 16 KB is the default expected page header size +static constexpr uint32_t kDefaultPageHeaderSize = 16 * 1024; + +// \brief DataPageStats stores encoded statistics and number of values/rows for +// a page. 
+struct PARQUET_EXPORT DataPageStats { + DataPageStats(const EncodedStatistics* encoded_statistics, int32_t num_values, + std::optional num_rows) + : encoded_statistics(encoded_statistics), + num_values(num_values), + num_rows(num_rows) {} + + // Encoded statistics extracted from the page header. + // Nullptr if there are no statistics in the page header. + const EncodedStatistics* encoded_statistics; + // Number of values stored in the page. Filled for both V1 and V2 data pages. + // For repeated fields, this can be greater than number of rows. For + // non-repeated fields, this will be the same as the number of rows. + int32_t num_values; + // Number of rows stored in the page. std::nullopt if not available. + std::optional num_rows; +}; + +class PARQUET_EXPORT LevelDecoder { + public: + LevelDecoder(); + ~LevelDecoder(); + + // Initialize the LevelDecoder state with new data + // and return the number of bytes consumed + int SetData(Encoding::type encoding, int16_t max_level, int num_buffered_values, + const uint8_t* data, int32_t data_size); + + void SetDataV2(int32_t num_bytes, int16_t max_level, int num_buffered_values, + const uint8_t* data); + + // Decodes a batch of levels into an array and returns the number of levels decoded + int Decode(int batch_size, int16_t* levels); + + private: + int bit_width_; + int num_values_remaining_; + Encoding::type encoding_; + std::unique_ptr<::arrow::util::RleDecoder> rle_decoder_; + std::unique_ptr<::arrow::bit_util::BitReader> bit_packed_decoder_; + int16_t max_level_; +}; + +struct CryptoContext { + CryptoContext(bool start_with_dictionary_page, int16_t rg_ordinal, int16_t col_ordinal, + std::shared_ptr meta, std::shared_ptr data) + : start_decrypt_with_dictionary_page(start_with_dictionary_page), + row_group_ordinal(rg_ordinal), + column_ordinal(col_ordinal), + meta_decryptor(std::move(meta)), + data_decryptor(std::move(data)) {} + CryptoContext() {} + + bool start_decrypt_with_dictionary_page = false; + int16_t 
row_group_ordinal = -1; + int16_t column_ordinal = -1; + std::shared_ptr meta_decryptor; + std::shared_ptr data_decryptor; +}; + +// Abstract page iterator interface. This way, we can feed column pages to the +// ColumnReader through whatever mechanism we choose +class PARQUET_EXPORT PageReader { + using DataPageFilter = std::function; + + public: + virtual ~PageReader() = default; + + static std::unique_ptr Open( + std::shared_ptr stream, int64_t total_num_values, + Compression::type codec, bool always_compressed = false, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), + const CryptoContext* ctx = NULLPTR); + static std::unique_ptr Open(std::shared_ptr stream, + int64_t total_num_values, + Compression::type codec, + const ReaderProperties& properties, + bool always_compressed = false, + const CryptoContext* ctx = NULLPTR); + + // If data_page_filter is present (not null), NextPage() will call the + // callback function exactly once per page in the order the pages appear in + // the column. If the callback function returns true the page will be + // skipped. The callback will be called only if the page type is DATA_PAGE or + // DATA_PAGE_V2. Dictionary pages will not be skipped. + // Caller is responsible for checking that statistics are correct using + // ApplicationVersion::HasCorrectStatistics(). + // \note API EXPERIMENTAL + void set_data_page_filter(DataPageFilter data_page_filter) { + data_page_filter_ = std::move(data_page_filter); + } + + // @returns: shared_ptr(nullptr) on EOS, std::shared_ptr + // containing new Page otherwise + // + // The returned Page may contain references that aren't guaranteed to live + // beyond the next call to NextPage(). + virtual std::shared_ptr NextPage() = 0; + + virtual void set_max_page_header_size(uint32_t size) = 0; + + protected: + // Callback that decides if we should skip a page or not. 
+ DataPageFilter data_page_filter_; +}; + +class PARQUET_EXPORT ColumnReader { + public: + virtual ~ColumnReader() = default; + + static std::shared_ptr Make( + const ColumnDescriptor* descr, std::unique_ptr pager, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); + + // Returns true if there are still values in this column. + virtual bool HasNext() = 0; + + virtual Type::type type() const = 0; + + virtual const ColumnDescriptor* descr() const = 0; + + // Get the encoding that can be exposed by this reader. If it returns + // dictionary encoding, then ReadBatchWithDictionary can be used to read data. + // + // \note API EXPERIMENTAL + virtual ExposedEncoding GetExposedEncoding() = 0; + + protected: + friend class RowGroupReader; + // Set the encoding that can be exposed by this reader. + // + // \note API EXPERIMENTAL + virtual void SetExposedEncoding(ExposedEncoding encoding) = 0; +}; + +// API to read values from a single column. This is a main client facing API. +template +class TypedColumnReader : public ColumnReader { + public: + typedef typename DType::c_type T; + + // Read a batch of repetition levels, definition levels, and values from the + // column. + // + // Since null values are not stored in the values, the number of values read + // may be less than the number of repetition and definition levels. With + // nested data this is almost certainly true. + // + // Set def_levels or rep_levels to nullptr if you want to skip reading them. + // This is only safe if you know through some other source that there are no + // undefined values. + // + // To fully exhaust a row group, you must read batches until the number of + // values read reaches the number of stored values according to the metadata. 
+ // + // This API is the same for both V1 and V2 of the DataPage + // + // @returns: actual number of levels read (see values_read for number of values read) + virtual int64_t ReadBatch(int64_t batch_size, int16_t* def_levels, int16_t* rep_levels, + T* values, int64_t* values_read) = 0; + + /// Read a batch of repetition levels, definition levels, and values from the + /// column and leave spaces for null entries on the lowest level in the values + /// buffer. + /// + /// In comparison to ReadBatch the length of repetition and definition levels + /// is the same as of the number of values read for max_definition_level == 1. + /// In the case of max_definition_level > 1, the repetition and definition + /// levels are larger than the values but the values include the null entries + /// with definition_level == (max_definition_level - 1). + /// + /// To fully exhaust a row group, you must read batches until the number of + /// values read reaches the number of stored values according to the metadata. + /// + /// @param batch_size the number of levels to read + /// @param[out] def_levels The Parquet definition levels, output has + /// the length levels_read. + /// @param[out] rep_levels The Parquet repetition levels, output has + /// the length levels_read. + /// @param[out] values The values in the lowest nested level including + /// spacing for nulls on the lowest levels; output has the length + /// values_read. + /// @param[out] valid_bits Memory allocated for a bitmap that indicates if + /// the row is null or on the maximum definition level. For performance + /// reasons the underlying buffer should be able to store 1 bit more than + /// required. If this requires an additional byte, this byte is only read + /// but never written to. + /// @param valid_bits_offset The offset in bits of the valid_bits where the + /// first relevant bit resides. + /// @param[out] levels_read The number of repetition/definition levels that were read. 
+ /// @param[out] values_read The number of values read, this includes all + /// non-null entries as well as all null-entries on the lowest level + /// (i.e. definition_level == max_definition_level - 1) + /// @param[out] null_count The number of nulls on the lowest levels. + /// (i.e. (values_read - null_count) is total number of non-null entries) + /// + /// \deprecated Since 4.0.0 + ARROW_DEPRECATED("Doesn't handle nesting correctly and unused outside of unit tests.") + virtual int64_t ReadBatchSpaced(int64_t batch_size, int16_t* def_levels, + int16_t* rep_levels, T* values, uint8_t* valid_bits, + int64_t valid_bits_offset, int64_t* levels_read, + int64_t* values_read, int64_t* null_count) = 0; + + // Skip reading values. This method will work for both repeated and + // non-repeated fields. Note that this method is skipping values and not + // records. This distinction is important for repeated fields, meaning that + // we are not skipping over the values to the next record. For example, + // consider the following two consecutive records containing one repeated field: + // {[1, 2, 3]}, {[4, 5]}. If we Skip(2), our next read value will be 3, which + // is inside the first record. + // Returns the number of values skipped. + virtual int64_t Skip(int64_t num_values_to_skip) = 0; + + // Read a batch of repetition levels, definition levels, and indices from the + // column. And read the dictionary if a dictionary page is encountered during + // reading pages. This API is similar to ReadBatch(), with ability to read + // dictionary and indices. It is only valid to call this method when the reader can + // expose dictionary encoding. (i.e., the reader's GetExposedEncoding() returns + // DICTIONARY). + // + // The dictionary is read along with the data page. When there's no data page, + // the dictionary won't be returned. + // + // @param batch_size The batch size to read + // @param[out] def_levels The Parquet definition levels. 
+ // @param[out] rep_levels The Parquet repetition levels. + // @param[out] indices The dictionary indices. + // @param[out] indices_read The number of indices read. + // @param[out] dict The pointer to dictionary values. It will return nullptr if + // there's no data page. Each column chunk only has one dictionary page. The dictionary + // is owned by the reader, so the caller is responsible for copying the dictionary + // values before the reader gets destroyed. + // @param[out] dict_len The dictionary length. It will return 0 if there's no data + // page. + // @returns: actual number of levels read (see indices_read for number of + // indices read + // + // \note API EXPERIMENTAL + virtual int64_t ReadBatchWithDictionary(int64_t batch_size, int16_t* def_levels, + int16_t* rep_levels, int32_t* indices, + int64_t* indices_read, const T** dict, + int32_t* dict_len) = 0; +}; + +// Represent a range to read. The range is inclusive on both ends. +struct IntervalRange { + static IntervalRange Intersection(const IntervalRange& left, + const IntervalRange& right) { + if (left.start <= right.start) { + if (left.end >= right.start) { + return {right.start, std::min(left.end, right.end)}; + } + } else if (right.end >= left.start) { + return {left.start, std::min(left.end, right.end)}; + } + return {-1, -1}; // Return a default Range object if no intersection range found + } + + IntervalRange(const int64_t start_, const int64_t end_) : start(start_), end(end_) { + if (start > end) { + throw ParquetException("Invalid range with start: " + std::to_string(start) + + " and end: " + std::to_string(end)); + } + } + + size_t Count() const { return end - start + 1; } + + bool IsBefore(const IntervalRange& other) const { return end < other.start; } + + bool IsAfter(const IntervalRange& other) const { return start > other.end; } + + bool IsOverlap(const IntervalRange& other) const { + return !IsBefore(other) && !IsAfter(other); + } + + bool IsValid() const { return start >= 0 && end 
>= 0 && end >= start; } + + std::string ToString() const { + return "[" + std::to_string(start) + ", " + std::to_string(end) + "]"; + } + + // inclusive + int64_t start; + // inclusive + int64_t end; +}; + +struct BitmapRange { + int64_t offset; + // zero added to, if there are less than 64 elements left in the column. + uint64_t bitmap; +}; + +struct End {}; + +// Represent a set of ranges to read. The ranges are sorted and non-overlapping. +class RowRanges { + public: + RowRanges() = default; + + explicit RowRanges(const IntervalRange& range) { ranges.push_back(range); } + + RowRanges(const std::vector& ranges) { this->ranges = ranges; } + + RowRanges(const RowRanges& other) { ranges = other.ranges; } + + RowRanges(RowRanges&& other) noexcept { ranges = std::move(other.ranges); } + + static RowRanges Intersection(const RowRanges& left, const RowRanges& right) { + RowRanges result; + + size_t rightIndex = 0; + for (const IntervalRange& l : left.ranges) { + for (size_t i = rightIndex, n = right.ranges.size(); i < n; ++i) { + const IntervalRange& r = right.ranges[i]; + if (l.IsBefore(r)) { + break; + } else if (l.IsAfter(r)) { + rightIndex = i + 1; + continue; } - - // Encoded statistics extracted from the page header. - // Nullptr if there are no statistics in the page header. - const EncodedStatistics* encoded_statistics; - // Number of values stored in the page. Filled for both V1 and V2 data pages. - // For repeated fields, this can be greater than number of rows. For - // non-repeated fields, this will be the same as the number of rows. - int32_t num_values; - // Number of rows stored in the page. std::nullopt if not available. 
- std::optional num_rows; - }; - - class PARQUET_EXPORT LevelDecoder { - public: - LevelDecoder(); - - ~LevelDecoder(); - - // Initialize the LevelDecoder state with new data - // and return the number of bytes consumed - int SetData(Encoding::type encoding, int16_t max_level, int num_buffered_values, - const uint8_t* data, int32_t data_size); - - void SetDataV2(int32_t num_bytes, int16_t max_level, int num_buffered_values, - const uint8_t* data); - - // Decodes a batch of levels into an array and returns the number of levels decoded - int Decode(int batch_size, int16_t* levels); - - private: - int bit_width_; - int num_values_remaining_; - Encoding::type encoding_; - std::unique_ptr<::arrow::util::RleDecoder> rle_decoder_; - std::unique_ptr<::arrow::bit_util::BitReader> bit_packed_decoder_; - int16_t max_level_; - }; - - struct CryptoContext { - CryptoContext(bool start_with_dictionary_page, int16_t rg_ordinal, int16_t col_ordinal, - std::shared_ptr meta, std::shared_ptr data) - : start_decrypt_with_dictionary_page(start_with_dictionary_page), - row_group_ordinal(rg_ordinal), - column_ordinal(col_ordinal), - meta_decryptor(std::move(meta)), - data_decryptor(std::move(data)) { - } - - CryptoContext() { - } - - bool start_decrypt_with_dictionary_page = false; - int16_t row_group_ordinal = -1; - int16_t column_ordinal = -1; - std::shared_ptr meta_decryptor; - std::shared_ptr data_decryptor; - }; - - // Abstract page iterator interface. 
This way, we can feed column pages to the - // ColumnReader through whatever mechanism we choose - class PARQUET_EXPORT PageReader { - using DataPageFilter = std::function; - - public: - virtual ~PageReader() = default; - - static std::unique_ptr Open( - std::shared_ptr stream, int64_t total_num_values, - Compression::type codec, bool always_compressed = false, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), - const CryptoContext* ctx = NULLPTR); - - static std::unique_ptr Open(std::shared_ptr stream, - int64_t total_num_values, - Compression::type codec, - const ReaderProperties&properties, - bool always_compressed = false, - const CryptoContext* ctx = NULLPTR); - - // If data_page_filter is present (not null), NextPage() will call the - // callback function exactly once per page in the order the pages appear in - // the column. If the callback function returns true the page will be - // skipped. The callback will be called only if the page type is DATA_PAGE or - // DATA_PAGE_V2. Dictionary pages will not be skipped. - // Caller is responsible for checking that statistics are correct using - // ApplicationVersion::HasCorrectStatistics(). - // \note API EXPERIMENTAL - void set_data_page_filter(DataPageFilter data_page_filter) { - data_page_filter_ = std::move(data_page_filter); - } - - // @returns: shared_ptr(nullptr) on EOS, std::shared_ptr - // containing new Page otherwise - // - // The returned Page may contain references that aren't guaranteed to live - // beyond the next call to NextPage(). - virtual std::shared_ptr NextPage() = 0; - - virtual void set_max_page_header_size(uint32_t size) = 0; - - protected: - // Callback that decides if we should skip a page or not. 
- DataPageFilter data_page_filter_; - }; - - class PARQUET_EXPORT ColumnReader { - public: - virtual ~ColumnReader() = default; - - static std::shared_ptr Make( - const ColumnDescriptor* descr, std::unique_ptr pager, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); - - // Returns true if there are still values in this column. - virtual bool HasNext() = 0; - - virtual Type::type type() const = 0; - - virtual const ColumnDescriptor* descr() const = 0; - - // Get the encoding that can be exposed by this reader. If it returns - // dictionary encoding, then ReadBatchWithDictionary can be used to read data. - // - // \note API EXPERIMENTAL - virtual ExposedEncoding GetExposedEncoding() = 0; - - protected: - friend class RowGroupReader; - // Set the encoding that can be exposed by this reader. - // - // \note API EXPERIMENTAL - virtual void SetExposedEncoding(ExposedEncoding encoding) = 0; - }; - - // API to read values from a single column. This is a main client facing API. - template - class TypedColumnReader : public ColumnReader { - public: - typedef typename DType::c_type T; - - // Read a batch of repetition levels, definition levels, and values from the - // column. - // - // Since null values are not stored in the values, the number of values read - // may be less than the number of repetition and definition levels. With - // nested data this is almost certainly true. - // - // Set def_levels or rep_levels to nullptr if you want to skip reading them. - // This is only safe if you know through some other source that there are no - // undefined values. - // - // To fully exhaust a row group, you must read batches until the number of - // values read reaches the number of stored values according to the metadata. 
- // - // This API is the same for both V1 and V2 of the DataPage - // - // @returns: actual number of levels read (see values_read for number of values read) - virtual int64_t ReadBatch(int64_t batch_size, int16_t* def_levels, int16_t* rep_levels, - T* values, int64_t* values_read) = 0; - - /// Read a batch of repetition levels, definition levels, and values from the - /// column and leave spaces for null entries on the lowest level in the values - /// buffer. - /// - /// In comparison to ReadBatch the length of repetition and definition levels - /// is the same as of the number of values read for max_definition_level == 1. - /// In the case of max_definition_level > 1, the repetition and definition - /// levels are larger than the values but the values include the null entries - /// with definition_level == (max_definition_level - 1). - /// - /// To fully exhaust a row group, you must read batches until the number of - /// values read reaches the number of stored values according to the metadata. - /// - /// @param batch_size the number of levels to read - /// @param[out] def_levels The Parquet definition levels, output has - /// the length levels_read. - /// @param[out] rep_levels The Parquet repetition levels, output has - /// the length levels_read. - /// @param[out] values The values in the lowest nested level including - /// spacing for nulls on the lowest levels; output has the length - /// values_read. - /// @param[out] valid_bits Memory allocated for a bitmap that indicates if - /// the row is null or on the maximum definition level. For performance - /// reasons the underlying buffer should be able to store 1 bit more than - /// required. If this requires an additional byte, this byte is only read - /// but never written to. - /// @param valid_bits_offset The offset in bits of the valid_bits where the - /// first relevant bit resides. - /// @param[out] levels_read The number of repetition/definition levels that were read. 
- /// @param[out] values_read The number of values read, this includes all - /// non-null entries as well as all null-entries on the lowest level - /// (i.e. definition_level == max_definition_level - 1) - /// @param[out] null_count The number of nulls on the lowest levels. - /// (i.e. (values_read - null_count) is total number of non-null entries) - /// - /// \deprecated Since 4.0.0 - ARROW_DEPRECATED("Doesn't handle nesting correctly and unused outside of unit tests.") - virtual int64_t ReadBatchSpaced(int64_t batch_size, int16_t* def_levels, - int16_t* rep_levels, T* values, uint8_t* valid_bits, - int64_t valid_bits_offset, int64_t* levels_read, - int64_t* values_read, int64_t* null_count) = 0; - - // Skip reading values. This method will work for both repeated and - // non-repeated fields. Note that this method is skipping values and not - // records. This distinction is important for repeated fields, meaning that - // we are not skipping over the values to the next record. For example, - // consider the following two consecutive records containing one repeated field: - // {[1, 2, 3]}, {[4, 5]}. If we Skip(2), our next read value will be 3, which - // is inside the first record. - // Returns the number of values skipped. - virtual int64_t Skip(int64_t num_values_to_skip) = 0; - - // Read a batch of repetition levels, definition levels, and indices from the - // column. And read the dictionary if a dictionary page is encountered during - // reading pages. This API is similar to ReadBatch(), with ability to read - // dictionary and indices. It is only valid to call this method when the reader can - // expose dictionary encoding. (i.e., the reader's GetExposedEncoding() returns - // DICTIONARY). - // - // The dictionary is read along with the data page. When there's no data page, - // the dictionary won't be returned. - // - // @param batch_size The batch size to read - // @param[out] def_levels The Parquet definition levels. 
- // @param[out] rep_levels The Parquet repetition levels. - // @param[out] indices The dictionary indices. - // @param[out] indices_read The number of indices read. - // @param[out] dict The pointer to dictionary values. It will return nullptr if - // there's no data page. Each column chunk only has one dictionary page. The dictionary - // is owned by the reader, so the caller is responsible for copying the dictionary - // values before the reader gets destroyed. - // @param[out] dict_len The dictionary length. It will return 0 if there's no data - // page. - // @returns: actual number of levels read (see indices_read for number of - // indices read - // - // \note API EXPERIMENTAL - virtual int64_t ReadBatchWithDictionary(int64_t batch_size, int16_t* def_levels, - int16_t* rep_levels, int32_t* indices, - int64_t* indices_read, const T** dict, - int32_t* dict_len) = 0; - }; - - // Represent a range to read. The range is inclusive on both ends. - struct IntervalRange { - static IntervalRange Intersection(const IntervalRange&left, const IntervalRange&right) { - if (left.start <= right.start) { - if (left.end >= right.start) { - return {right.start, std::min(left.end, right.end)}; - } - } - else if (right.end >= left.start) { - return {left.start, std::min(left.end, right.end)}; - } - return {-1, -1}; // Return a default Range object if no intersection range found - } - - IntervalRange(const int64_t start_, const int64_t end_) : start(start_), end(end_) { - if (start > end) { - throw ParquetException("Invalid range with start: " + std::to_string(start) - + " and end: " + std::to_string(end)); - } - } - - size_t Count() const { return end - start + 1; } - - bool IsBefore(const IntervalRange&other) const { return end < other.start; } - - bool IsAfter(const IntervalRange&other) const { return start > other.end; } - - bool IsOverlap(const IntervalRange&other) const { return !IsBefore(other) && !IsAfter(other); } - - bool IsValid() const { return start >= 0 && end >= 0 && 
end >= start; } - - std::string ToString() const { - return "[" + std::to_string(start) + ", " + std::to_string(end) + "]"; - } - - // inclusive - int64_t start; - // inclusive - int64_t end; - }; - - struct BitmapRange { - int64_t offset; - // zero added to, if there are less than 64 elements left in the column. - uint64_t bitmap; - }; - - struct End {}; - - // Represent a set of ranges to read. The ranges are sorted and non-overlapping. - class RowRanges { - public: - RowRanges() = default; - - explicit RowRanges(const IntervalRange&range) { ranges.push_back(range); } - - RowRanges(const std::vector&ranges) { this->ranges = ranges; } - - RowRanges(const RowRanges&other) { ranges = other.ranges; } - - RowRanges(RowRanges&&other) noexcept { ranges = std::move(other.ranges); } - - static RowRanges Intersection(const RowRanges& left, const RowRanges& right) { - RowRanges result; - - size_t rightIndex = 0; - for (const IntervalRange& l : left.ranges) { - for (size_t i = rightIndex, n = right.ranges.size(); i < n; ++i) { - const IntervalRange& r = right.ranges[i]; - if (l.IsBefore(r)) { - break; - } else if (l.IsAfter(r)) { - rightIndex = i + 1; - continue; - } - result.Add(IntervalRange::Intersection(l, r)); - } - } - - return result; - } - - void Add(const IntervalRange&range) { - const IntervalRange rangeToAdd = range; - if (ranges.size() > 1 && rangeToAdd.start <= ranges.back().end) { - throw ParquetException("Ranges must be added in order"); - } - ranges.push_back(rangeToAdd); - } - - size_t RowCount() const { - size_t cnt = 0; - for (const IntervalRange&range: ranges) { - cnt += range.Count(); - } - return cnt; - } - - bool IsValid() const { - if (ranges.size() == 0) return true; - if (ranges[0].start < 0) { - return false; - } - for (size_t i = 1; i < ranges.size(); i++) { - if (ranges[i].start <= ranges[i - 1].end) { - return false; - } - } - return true; - } - - bool IsOverlapping(int64_t start, int64_t end) const { - const IntervalRange searchRange(start, 
end); - return IsOverlapping(searchRange); - } - - bool IsOverlapping(const IntervalRange&searchRange) const { - auto it = std::lower_bound( - ranges.begin(), ranges.end(), searchRange, - [](const IntervalRange&r1, const IntervalRange&r2) { return r1.IsBefore(r2); }); - return it != ranges.end() && !(*it).IsAfter(searchRange); - } - - std::vector& GetRanges() { return ranges; } - - const std::vector& GetRanges() const { return ranges; } - - // Split the ranges into N+1 parts at the given split point, where N = split_points.size() - // The RowRows object itself is not modified - std::vector SplitAt(const std::vector&split_points) const { - if (split_points.size() == 0) { - return {*this}; - } - - std::vector result; - int64_t last_split_point = -1; - for (const int64_t split_point: split_points) { - if (split_point <= 0) { - throw ParquetException("Invalid split point " + std::to_string(split_point)); - } - if (split_point <= last_split_point) { - throw ParquetException("Split points must be in ascending order"); - } - last_split_point = split_point; - } - - RowRanges spaces; - for (size_t i = 0 ; i < split_points.size(); ++i) { - auto start = i == 0 ? 
0 : split_points[i - 1]; - auto end = split_points[i] - 1; - spaces.Add({start, end}); - } - spaces.Add({split_points[split_points.size() - 1], std::numeric_limits::max()}); - - for(IntervalRange space : spaces.GetRanges()) { - RowRanges intersection = RowRanges::Intersection(RowRanges(space), *this); - result.push_back(intersection); - } - - return result; - } - - const IntervalRange& operator[](size_t index) const { - // check index - if (index >= ranges.size() || index < 0) { - throw ParquetException("Index out of range"); - } - return ranges[index]; - } - - RowRanges shift(const int64_t offset) const { - RowRanges result; - for (const IntervalRange&range: ranges) { - result.Add({range.start + offset, range.end + offset}); - } - return result; - } - - std::string ToString() const { - std::string result = "["; - for (const IntervalRange&range: ranges) { - result += - "(" + std::to_string(range.start) + ", " + std::to_string(range.end) + "), "; - } - if (!ranges.empty()) { - result = result.substr(0, result.size() - 2); - } - result += "]"; - return result; - } - - /// The following APIs are to be implemented - /// Comment out for now to pass compile - -// // Returns a vector of PageLocations that must be read all to get values for all included in this range -// virtual std::vector PageIndexesToInclude(const std::vector& all_pages) = 0; -// class Iterator { -// virtual std::variant NextRange() = 0; -// }; -// virtual std::unique_ptr NewIterator() = 0; - - private: - std::vector ranges; - }; - - namespace internal { - class PARQUET_EXPORT RecordSkipper { - public: - RecordSkipper(RowRanges&pages, const RowRanges&row_ranges_) - : row_ranges(row_ranges_) { - // copy row_ranges - RowRanges will_process_pages, skip_pages; - for (auto&page: pages.GetRanges()) { - if (!row_ranges.IsOverlapping(page)) { - skip_pages.Add(page); - } - } - - /// Since the skipped pages will be silently skipped without updating - /// current_rg_processed_records or records_read_, we need to 
pre-process the row - /// ranges as if these skipped pages never existed - adjust_ranges(skip_pages, row_ranges); - - total_rows_to_process = pages.RowCount() - skip_pages.RowCount(); - } - - /// \brief Return the number of records to read or to skip - /// if return values is positive, it means to read N records - /// if return values is negative, it means to skip N records - /// if return values is 0, it means end of RG - int64_t advise_next(const int64_t current_rg_processed) { - if (row_ranges.GetRanges().size() == row_range_idx) { - return 0; - } - - if (row_ranges[row_range_idx].end < current_rg_processed) { - row_range_idx++; - if (row_ranges.GetRanges().size() == row_range_idx) { - // negative, skip the ramaining rows - return current_rg_processed - total_rows_to_process; - } - } - - if (row_ranges[row_range_idx].start > current_rg_processed) { - // negative, skip - return current_rg_processed - row_ranges[row_range_idx].start; - } - - const auto ret = row_ranges[row_range_idx].end - current_rg_processed + 1; - return ret; - } - - private: - void adjust_ranges(RowRanges & skip_pages, RowRanges & to_adjust) { - size_t skipped_rows = 0; - auto iter = to_adjust.GetRanges().begin(); - auto skip_iter = skip_pages.GetRanges().begin(); - while (iter != to_adjust.GetRanges().end()) { - while (skip_iter != skip_pages.GetRanges().end() && skip_iter->IsBefore(*iter)) { - skipped_rows += skip_iter->Count(); - ++skip_iter; - } - iter->start -= skipped_rows; - iter->end -= skipped_rows; - ++iter; - } - } - - /// Keep copy of ranges, because adjust_ranges() will modify them - RowRanges row_ranges; - - size_t row_range_idx = 0; - size_t total_rows_to_process = 0; - }; - - /// \brief Stateful column reader that delimits semantic records for both flat - /// and nested columns - /// - /// \note API EXPERIMENTAL - /// \since 1.3.0 - class PARQUET_EXPORT RecordReader { - public: - /// \brief Creates a record reader. 
- /// @param descr Column descriptor - /// @param leaf_info Level info, used to determine if a column is nullable or not - /// @param pool Memory pool to use for buffering values and rep/def levels - /// @param read_dictionary True if reading directly as Arrow dictionary-encoded - /// @param read_dense_for_nullable True if reading dense and not leaving space for null - /// values - static std::shared_ptr Make( - const ColumnDescriptor* descr, LevelInfo leaf_info, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), - bool read_dictionary = false, bool read_dense_for_nullable = false); - - virtual ~RecordReader() = default; - - /// \brief Attempt to read indicated number of records from column chunk - /// Note that for repeated fields, a record may have more than one value - /// and all of them are read. If read_dense_for_nullable() it will - /// not leave any space for null values. Otherwise, it will read spaced. - /// \return number of records read - virtual int64_t ReadRecords(int64_t num_records) = 0; - - /// \brief Attempt to skip indicated number of records from column chunk. - /// Note that for repeated fields, a record may have more than one value - /// and all of them are skipped. - /// \return number of records skipped - virtual int64_t SkipRecords(int64_t num_records) = 0; - - /// \brief Pre-allocate space for data. Results in better flat read performance - virtual void Reserve(int64_t num_values) = 0; - - /// \brief Clear consumed values and repetition/definition levels as the - /// result of calling ReadRecords - /// For FLBA and ByteArray types, call GetBuilderChunks() to reset them. - virtual void Reset() = 0; - - /// \brief Transfer filled values buffer to caller. A new one will be - /// allocated in subsequent ReadRecords calls - virtual std::shared_ptr ReleaseValues() = 0; - - /// \brief Transfer filled validity bitmap buffer to caller. 
A new one will - /// be allocated in subsequent ReadRecords calls - virtual std::shared_ptr ReleaseIsValid() = 0; - - /// \brief Return true if the record reader has more internal data yet to - /// process - virtual bool HasMoreData() const = 0; - - /// \brief Advance record reader to the next row group. Must be set before - /// any records could be read/skipped. - /// \param[in] reader obtained from RowGroupReader::GetColumnPageReader - virtual void SetPageReader(std::unique_ptr reader) = 0; - - /// \brief Returns the underlying column reader's descriptor. - virtual const ColumnDescriptor* descr() const = 0; - - virtual void DebugPrintState() = 0; - - /// \brief Decoded definition levels - int16_t* def_levels() const { - return reinterpret_cast(def_levels_->mutable_data()); - } - - /// \brief Decoded repetition levels - int16_t* rep_levels() const { - return reinterpret_cast(rep_levels_->mutable_data()); - } - - /// \brief Decoded values, including nulls, if any - /// FLBA and ByteArray types do not use this array and read into their own - /// builders. - uint8_t* values() const { return values_->mutable_data(); } - - /// \brief Number of values written, including space left for nulls if any. - /// If this Reader was constructed with read_dense_for_nullable(), there is no space for - /// nulls and null_count() will be 0. There is no read-ahead/buffering for values. For - /// FLBA and ByteArray types this value reflects the values written with the last - /// ReadRecords call since those readers will reset the values after each call. - int64_t values_written() const { return values_written_; } - - /// \brief Number of definition / repetition levels (from those that have - /// been decoded) that have been consumed inside the reader. - int64_t levels_position() const { return levels_position_; } - - /// \brief Number of definition / repetition levels that have been written - /// internally in the reader. 
This may be larger than values_written() because - /// for repeated fields we need to look at the levels in advance to figure out - /// the record boundaries. - int64_t levels_written() const { return levels_written_; } - - /// \brief Number of nulls in the leaf that we have read so far into the - /// values vector. This is only valid when !read_dense_for_nullable(). When - /// read_dense_for_nullable() it will always be 0. - int64_t null_count() const { return null_count_; } - - /// \brief True if the leaf values are nullable - bool nullable_values() const { return nullable_values_; } - - /// \brief True if reading directly as Arrow dictionary-encoded - bool read_dictionary() const { return read_dictionary_; } - - /// \brief True if reading dense for nullable columns. - bool read_dense_for_nullable() const { return read_dense_for_nullable_; } - - void reset_current_rg_processed_records() { current_rg_processed_records = 0; } - - void set_record_skipper(std::shared_ptr skipper_) { skipper = skipper_; } - - protected: - /// \brief Indicates if we can have nullable values. Note that repeated fields - /// may or may not be nullable. - bool nullable_values_; - - bool at_record_start_; - int64_t records_read_; - - int64_t current_rg_processed_records; // counting both read and skip records - - /// \brief Stores values. These values are populated based on each ReadRecords - /// call. No extra values are buffered for the next call. SkipRecords will not - /// add any value to this buffer. - std::shared_ptr<::arrow::ResizableBuffer> values_; - /// \brief False for BYTE_ARRAY, in which case we don't allocate the values - /// buffer and we directly read into builder classes. - bool uses_values_; - - /// \brief Values that we have read into 'values_' + 'null_count_'. - int64_t values_written_; - int64_t values_capacity_; - int64_t null_count_; - - /// \brief Each bit corresponds to one element in 'values_' and specifies if it - /// is null or not null. 
Not set if read_dense_for_nullable_ is true. - std::shared_ptr<::arrow::ResizableBuffer> valid_bits_; - - /// \brief Buffer for definition levels. May contain more levels than - /// is actually read. This is because we read levels ahead to - /// figure out record boundaries for repeated fields. - /// For flat required fields, 'def_levels_' and 'rep_levels_' are not - /// populated. For non-repeated fields 'rep_levels_' is not populated. - /// 'def_levels_' and 'rep_levels_' must be of the same size if present. - std::shared_ptr<::arrow::ResizableBuffer> def_levels_; - /// \brief Buffer for repetition levels. Only populated for repeated - /// fields. - std::shared_ptr<::arrow::ResizableBuffer> rep_levels_; - - /// \brief Number of definition / repetition levels that have been written - /// internally in the reader. This may be larger than values_written() since - /// for repeated fields we need to look at the levels in advance to figure out - /// the record boundaries. - int64_t levels_written_; - /// \brief Position of the next level that should be consumed. - int64_t levels_position_; - int64_t levels_capacity_; - - bool read_dictionary_ = false; - // If true, we will not leave any space for the null values in the values_ - // vector. - bool read_dense_for_nullable_ = false; - - std::shared_ptr skipper = NULLPTR; - }; - - class BinaryRecordReader : virtual public RecordReader { - public: - virtual std::vector> GetBuilderChunks() = 0; - }; - - /// \brief Read records directly to dictionary-encoded Arrow form (int32 - /// indices). 
Only valid for BYTE_ARRAY columns - class DictionaryRecordReader : virtual public RecordReader { - public: - virtual std::shared_ptr<::arrow::ChunkedArray> GetResult() = 0; - }; - } // namespace internal - - using BoolReader = TypedColumnReader; - using Int32Reader = TypedColumnReader; - using Int64Reader = TypedColumnReader; - using Int96Reader = TypedColumnReader; - using FloatReader = TypedColumnReader; - using DoubleReader = TypedColumnReader; - using ByteArrayReader = TypedColumnReader; - using FixedLenByteArrayReader = TypedColumnReader; -} // namespace parquet + result.Add(IntervalRange::Intersection(l, r)); + } + } + + return result; + } + + void Add(const IntervalRange& range) { + const IntervalRange rangeToAdd = range; + if (ranges.size() > 1 && rangeToAdd.start <= ranges.back().end) { + throw ParquetException("Ranges must be added in order"); + } + ranges.push_back(rangeToAdd); + } + + size_t RowCount() const { + size_t cnt = 0; + for (const IntervalRange& range : ranges) { + cnt += range.Count(); + } + return cnt; + } + + bool IsValid() const { + if (ranges.size() == 0) return true; + if (ranges[0].start < 0) { + return false; + } + for (size_t i = 1; i < ranges.size(); i++) { + if (ranges[i].start <= ranges[i - 1].end) { + return false; + } + } + return true; + } + + bool IsOverlapping(int64_t start, int64_t end) const { + const IntervalRange searchRange(start, end); + return IsOverlapping(searchRange); + } + + bool IsOverlapping(const IntervalRange& searchRange) const { + auto it = std::lower_bound( + ranges.begin(), ranges.end(), searchRange, + [](const IntervalRange& r1, const IntervalRange& r2) { return r1.IsBefore(r2); }); + return it != ranges.end() && !(*it).IsAfter(searchRange); + } + + std::vector& GetRanges() { return ranges; } + + const std::vector& GetRanges() const { return ranges; } + + // Split the ranges into N+1 parts at the given split point, where N = + // split_points.size() The RowRows object itself is not modified + std::vector 
SplitAt(const std::vector& split_points) const { + if (split_points.size() == 0) { + return {*this}; + } + + std::vector result; + int64_t last_split_point = -1; + for (const int64_t split_point : split_points) { + if (split_point <= 0) { + throw ParquetException("Invalid split point " + std::to_string(split_point)); + } + if (split_point <= last_split_point) { + throw ParquetException("Split points must be in ascending order"); + } + last_split_point = split_point; + } + + RowRanges spaces; + for (size_t i = 0; i < split_points.size(); ++i) { + auto start = i == 0 ? 0 : split_points[i - 1]; + auto end = split_points[i] - 1; + spaces.Add({start, end}); + } + spaces.Add( + {split_points[split_points.size() - 1], std::numeric_limits::max()}); + + for (IntervalRange space : spaces.GetRanges()) { + RowRanges intersection = RowRanges::Intersection(RowRanges(space), *this); + result.push_back(intersection); + } + + return result; + } + + const IntervalRange& operator[](size_t index) const { + // check index + if (index >= ranges.size() || index < 0) { + throw ParquetException("Index out of range"); + } + return ranges[index]; + } + + RowRanges shift(const int64_t offset) const { + RowRanges result; + for (const IntervalRange& range : ranges) { + result.Add({range.start + offset, range.end + offset}); + } + return result; + } + + std::string ToString() const { + std::string result = "["; + for (const IntervalRange& range : ranges) { + result += + "(" + std::to_string(range.start) + ", " + std::to_string(range.end) + "), "; + } + if (!ranges.empty()) { + result = result.substr(0, result.size() - 2); + } + result += "]"; + return result; + } + + /// The following APIs are to be implemented + /// Comment out for now to pass compile + + // // Returns a vector of PageLocations that must be read all to get values for + // all included in this range virtual std::vector + // PageIndexesToInclude(const std::vector& all_pages) = 0; class + // Iterator { + // virtual std::variant 
NextRange() = 0; + // }; + // virtual std::unique_ptr NewIterator() = 0; + + private: + std::vector ranges; +}; + +namespace internal { +class PARQUET_EXPORT RecordSkipper { + public: + RecordSkipper(RowRanges& pages, const RowRanges& row_ranges_) + : row_ranges(row_ranges_) { + // copy row_ranges + RowRanges will_process_pages, skip_pages; + for (auto& page : pages.GetRanges()) { + if (!row_ranges.IsOverlapping(page)) { + skip_pages.Add(page); + } + } + + /// Since the skipped pages will be silently skipped without updating + /// current_rg_processed_records or records_read_, we need to pre-process the row + /// ranges as if these skipped pages never existed + adjust_ranges(skip_pages, row_ranges); + + total_rows_to_process = pages.RowCount() - skip_pages.RowCount(); + } + + /// \brief Return the number of records to read or to skip + /// if return values is positive, it means to read N records + /// if return values is negative, it means to skip N records + /// if return values is 0, it means end of RG + int64_t advise_next(const int64_t current_rg_processed) { + if (row_ranges.GetRanges().size() == row_range_idx) { + return 0; + } + + if (row_ranges[row_range_idx].end < current_rg_processed) { + row_range_idx++; + if (row_ranges.GetRanges().size() == row_range_idx) { + // negative, skip the ramaining rows + return current_rg_processed - total_rows_to_process; + } + } + + if (row_ranges[row_range_idx].start > current_rg_processed) { + // negative, skip + return current_rg_processed - row_ranges[row_range_idx].start; + } + + const auto ret = row_ranges[row_range_idx].end - current_rg_processed + 1; + return ret; + } + + private: + void adjust_ranges(RowRanges& skip_pages, RowRanges& to_adjust) { + size_t skipped_rows = 0; + auto iter = to_adjust.GetRanges().begin(); + auto skip_iter = skip_pages.GetRanges().begin(); + while (iter != to_adjust.GetRanges().end()) { + while (skip_iter != skip_pages.GetRanges().end() && skip_iter->IsBefore(*iter)) { + skipped_rows += 
skip_iter->Count(); + ++skip_iter; + } + iter->start -= skipped_rows; + iter->end -= skipped_rows; + ++iter; + } + } + + /// Keep copy of ranges, because adjust_ranges() will modify them + RowRanges row_ranges; + + size_t row_range_idx = 0; + size_t total_rows_to_process = 0; +}; + +/// \brief Stateful column reader that delimits semantic records for both flat +/// and nested columns +/// +/// \note API EXPERIMENTAL +/// \since 1.3.0 +class PARQUET_EXPORT RecordReader { + public: + /// \brief Creates a record reader. + /// @param descr Column descriptor + /// @param leaf_info Level info, used to determine if a column is nullable or not + /// @param pool Memory pool to use for buffering values and rep/def levels + /// @param read_dictionary True if reading directly as Arrow dictionary-encoded + /// @param read_dense_for_nullable True if reading dense and not leaving space for null + /// values + static std::shared_ptr Make( + const ColumnDescriptor* descr, LevelInfo leaf_info, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), + bool read_dictionary = false, bool read_dense_for_nullable = false); + + virtual ~RecordReader() = default; + + /// \brief Attempt to read indicated number of records from column chunk + /// Note that for repeated fields, a record may have more than one value + /// and all of them are read. If read_dense_for_nullable() it will + /// not leave any space for null values. Otherwise, it will read spaced. + /// \return number of records read + virtual int64_t ReadRecords(int64_t num_records) = 0; + + /// \brief Attempt to skip indicated number of records from column chunk. + /// Note that for repeated fields, a record may have more than one value + /// and all of them are skipped. + /// \return number of records skipped + virtual int64_t SkipRecords(int64_t num_records) = 0; + + /// \brief Pre-allocate space for data. 
Results in better flat read performance + virtual void Reserve(int64_t num_values) = 0; + + /// \brief Clear consumed values and repetition/definition levels as the + /// result of calling ReadRecords + /// For FLBA and ByteArray types, call GetBuilderChunks() to reset them. + virtual void Reset() = 0; + + /// \brief Transfer filled values buffer to caller. A new one will be + /// allocated in subsequent ReadRecords calls + virtual std::shared_ptr ReleaseValues() = 0; + + /// \brief Transfer filled validity bitmap buffer to caller. A new one will + /// be allocated in subsequent ReadRecords calls + virtual std::shared_ptr ReleaseIsValid() = 0; + + /// \brief Return true if the record reader has more internal data yet to + /// process + virtual bool HasMoreData() const = 0; + + /// \brief Advance record reader to the next row group. Must be set before + /// any records could be read/skipped. + /// \param[in] reader obtained from RowGroupReader::GetColumnPageReader + virtual void SetPageReader(std::unique_ptr reader) = 0; + + /// \brief Returns the underlying column reader's descriptor. + virtual const ColumnDescriptor* descr() const = 0; + + virtual void DebugPrintState() = 0; + + /// \brief Decoded definition levels + int16_t* def_levels() const { + return reinterpret_cast(def_levels_->mutable_data()); + } + + /// \brief Decoded repetition levels + int16_t* rep_levels() const { + return reinterpret_cast(rep_levels_->mutable_data()); + } + + /// \brief Decoded values, including nulls, if any + /// FLBA and ByteArray types do not use this array and read into their own + /// builders. + uint8_t* values() const { return values_->mutable_data(); } + + /// \brief Number of values written, including space left for nulls if any. + /// If this Reader was constructed with read_dense_for_nullable(), there is no space for + /// nulls and null_count() will be 0. There is no read-ahead/buffering for values. 
For + /// FLBA and ByteArray types this value reflects the values written with the last + /// ReadRecords call since those readers will reset the values after each call. + int64_t values_written() const { return values_written_; } + + /// \brief Number of definition / repetition levels (from those that have + /// been decoded) that have been consumed inside the reader. + int64_t levels_position() const { return levels_position_; } + + /// \brief Number of definition / repetition levels that have been written + /// internally in the reader. This may be larger than values_written() because + /// for repeated fields we need to look at the levels in advance to figure out + /// the record boundaries. + int64_t levels_written() const { return levels_written_; } + + /// \brief Number of nulls in the leaf that we have read so far into the + /// values vector. This is only valid when !read_dense_for_nullable(). When + /// read_dense_for_nullable() it will always be 0. + int64_t null_count() const { return null_count_; } + + /// \brief True if the leaf values are nullable + bool nullable_values() const { return nullable_values_; } + + /// \brief True if reading directly as Arrow dictionary-encoded + bool read_dictionary() const { return read_dictionary_; } + + /// \brief True if reading dense for nullable columns. + bool read_dense_for_nullable() const { return read_dense_for_nullable_; } + + void reset_current_rg_processed_records() { current_rg_processed_records = 0; } + + void set_record_skipper(std::shared_ptr skipper_) { skipper = skipper_; } + + protected: + /// \brief Indicates if we can have nullable values. Note that repeated fields + /// may or may not be nullable. + bool nullable_values_; + + bool at_record_start_; + int64_t records_read_; + + int64_t current_rg_processed_records; // counting both read and skip records + + /// \brief Stores values. These values are populated based on each ReadRecords + /// call. No extra values are buffered for the next call. 
SkipRecords will not + /// add any value to this buffer. + std::shared_ptr<::arrow::ResizableBuffer> values_; + /// \brief False for BYTE_ARRAY, in which case we don't allocate the values + /// buffer and we directly read into builder classes. + bool uses_values_; + + /// \brief Values that we have read into 'values_' + 'null_count_'. + int64_t values_written_; + int64_t values_capacity_; + int64_t null_count_; + + /// \brief Each bit corresponds to one element in 'values_' and specifies if it + /// is null or not null. Not set if read_dense_for_nullable_ is true. + std::shared_ptr<::arrow::ResizableBuffer> valid_bits_; + + /// \brief Buffer for definition levels. May contain more levels than + /// is actually read. This is because we read levels ahead to + /// figure out record boundaries for repeated fields. + /// For flat required fields, 'def_levels_' and 'rep_levels_' are not + /// populated. For non-repeated fields 'rep_levels_' is not populated. + /// 'def_levels_' and 'rep_levels_' must be of the same size if present. + std::shared_ptr<::arrow::ResizableBuffer> def_levels_; + /// \brief Buffer for repetition levels. Only populated for repeated + /// fields. + std::shared_ptr<::arrow::ResizableBuffer> rep_levels_; + + /// \brief Number of definition / repetition levels that have been written + /// internally in the reader. This may be larger than values_written() since + /// for repeated fields we need to look at the levels in advance to figure out + /// the record boundaries. + int64_t levels_written_; + /// \brief Position of the next level that should be consumed. + int64_t levels_position_; + int64_t levels_capacity_; + + bool read_dictionary_ = false; + // If true, we will not leave any space for the null values in the values_ + // vector. 
+ bool read_dense_for_nullable_ = false; + + std::shared_ptr skipper = NULLPTR; +}; + +class BinaryRecordReader : virtual public RecordReader { + public: + virtual std::vector> GetBuilderChunks() = 0; +}; + +/// \brief Read records directly to dictionary-encoded Arrow form (int32 +/// indices). Only valid for BYTE_ARRAY columns +class DictionaryRecordReader : virtual public RecordReader { + public: + virtual std::shared_ptr<::arrow::ChunkedArray> GetResult() = 0; +}; + +} // namespace internal + +using BoolReader = TypedColumnReader; +using Int32Reader = TypedColumnReader; +using Int64Reader = TypedColumnReader; +using Int96Reader = TypedColumnReader; +using FloatReader = TypedColumnReader; +using DoubleReader = TypedColumnReader; +using ByteArrayReader = TypedColumnReader; +using FixedLenByteArrayReader = TypedColumnReader; + +} // namespace parquet From 7b5d4a627e67f212b274deecb6abab7687842513 Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Sun, 14 Jan 2024 22:38:46 +0800 Subject: [PATCH 16/25] RowRangesPageFilter refactored --- cpp/src/parquet/arrow/reader.cc | 112 +++++++------ cpp/src/parquet/arrow/reader.h | 3 +- cpp/src/parquet/arrow/reader_internal.h | 8 +- cpp/src/parquet/column_reader.cc | 2 +- cpp/src/parquet/column_reader.h | 199 ++++++++++++++---------- cpp/src/parquet/range_reader_test.cc | 24 +-- cpp/src/parquet/row_range_test.cc | 2 +- 7 files changed, 207 insertions(+), 143 deletions(-) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 92b746b8ad92..8289b63b475d 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -17,7 +17,7 @@ #include "parquet/arrow/reader.h" -#include +#include "parquet/page_index.h" #include #include @@ -74,7 +74,6 @@ using arrow::internal::Iota; // Help reduce verbosity using ParquetReader = parquet::ParquetFileReader; -using parquet::IntervalRange; using parquet::internal::RecordReader; namespace bit_util = arrow::bit_util; @@ -206,7 +205,7 @@ class 
FileReaderImpl : public FileReader { Status GetFieldReader( int i, const std::shared_ptr>& included_leaves, const std::vector& row_groups, - const std::shared_ptr> & row_ranges_map, + const std::shared_ptr> & row_ranges_per_rg, std::unique_ptr* out) { // Should be covered by GetRecordBatchReader checks but // manifest_.schema_fields is a separate variable so be extra careful. @@ -223,13 +222,13 @@ class FileReaderImpl : public FileReader { ctx->iterator_factory = SomeRowGroupsFactory(row_groups); ctx->filter_leaves = true; ctx->included_leaves = included_leaves; - ctx->row_ranges_map = row_ranges_map; + ctx->row_ranges_per_rg = row_ranges_per_rg; return GetReader(manifest_.schema_fields[i], ctx, out); } Status GetFieldReaders( const std::vector& column_indices, const std::vector& row_groups, - const std::shared_ptr> & row_ranges_map, + const std::shared_ptr> & row_ranges_per_rg, std::vector>* out, std::shared_ptr<::arrow::Schema>* out_schema) { // We only need to read schema fields which have columns indicated @@ -244,7 +243,7 @@ class FileReaderImpl : public FileReader { for (size_t i = 0; i < out->size(); ++i) { std::unique_ptr reader; RETURN_NOT_OK(GetFieldReader(field_indices[i], included_leaves, row_groups, - row_ranges_map, &reader)); + row_ranges_per_rg, &reader)); out_fields[i] = reader->field(); out->at(i) = std::move(reader); @@ -345,10 +344,10 @@ class FileReaderImpl : public FileReader { // This is a internal API owned by FileReaderImpl, not exposed in FileReader Status GetRecordBatchReaderWithRowRanges(const std::vector& row_group_indices, const std::vector& column_indices, - const std::shared_ptr> & row_ranges_map, + const std::shared_ptr> & row_ranges_per_rg, std::unique_ptr* out); - Status GetRecordBatchReader(const RowRanges& rows_to_return, + Status GetRecordBatchReader(const IntervalRanges& rows_to_return, const std::vector& column_indices, std::unique_ptr* out) override { const auto metadata = reader_->metadata(); @@ -358,7 +357,7 @@ class 
FileReaderImpl : public FileReader { rows_to_return.ToString()); } // check if the row ranges are within the row group boundaries - if (rows_to_return.RowCount() != 0 && rows_to_return.GetRanges().back().end >= metadata->num_rows()) { + if (rows_to_return.RowCount() != 0 && rows_to_return.LastRow() >= metadata->num_rows()) { return Status::Invalid("The provided row range " + rows_to_return.ToString() + " exceeds the number of rows in the file: " + std::to_string(metadata->num_rows())); @@ -371,20 +370,19 @@ class FileReaderImpl : public FileReader { split_points.push_back(rows_so_far); } // We'll assign a RowRanges for each RG, even if it's not required to return any rows - const std::vector splits = rows_to_return.SplitAt(split_points); - // Call row_ranges_map because array index is the row group index - const std::shared_ptr> row_ranges_map = - std::make_shared>(); + const std::vector splits = rows_to_return.SplitAt(split_points); + const std::shared_ptr> row_ranges_per_rg = + std::make_shared>(); rows_so_far = 0; std::vector row_group_indices; for (int i = 0 ; i < metadata->num_row_groups(); i++) { - row_ranges_map->push_back(splits[i].shift(-rows_so_far)); + row_ranges_per_rg->push_back(splits[i].shift(-rows_so_far)); rows_so_far += metadata->RowGroup(i)->num_rows(); - if (row_ranges_map->at(i).RowCount() > 0) + if (row_ranges_per_rg->at(i).RowCount() > 0) row_group_indices.push_back(i); } - return GetRecordBatchReaderWithRowRanges(row_group_indices, column_indices, row_ranges_map, out); + return GetRecordBatchReaderWithRowRanges(row_group_indices, column_indices, row_ranges_per_rg, out); } Status GetRecordBatchReader(const std::vector& row_group_indices, @@ -504,38 +502,59 @@ class RowGroupReaderImpl : public RowGroupReader { // ---------------------------------------------------------------------- // Column reader implementations -struct RowRangesPageFilter { - explicit RowRangesPageFilter(const RowRanges& row_ranges_, - const std::shared_ptr& page_ranges_) 
- : row_ranges(row_ranges_), page_ranges(page_ranges_) { +// Only support IntervalRange case for now +class RowRangesPageFilter { + public: + RowRangesPageFilter(const RowRanges& row_ranges, const std::shared_ptr& page_ranges) + : row_ranges_(row_ranges), page_ranges_(page_ranges) { + } - if (page_ranges == nullptr || page_ranges->GetRanges().size() == 0) { - throw ParquetException("Page ranges is empty"); - } + // To avoid error "std::function target must be copy-constructible", we must define copy constructor + RowRangesPageFilter(const RowRangesPageFilter& other) + : row_ranges_(other.row_ranges_), page_ranges_(other.page_ranges_) { } bool operator()(const DataPageStats& stats) { - ++page_range_idx; - IntervalRange current_page_range = (*page_ranges)[page_range_idx]; + if (!initted) { + row_ranges_itr_ = row_ranges_.NewIterator(); + page_ranges_itr_ = page_ranges_->NewIterator(); + + current_row_range_ = row_ranges_itr_->NextRange(); + + if (current_row_range_.index() != 0) { + throw ParquetException("RowRangesPageFilter expects first NextRange() to be a IntervalRange"); + } + initted = true; + } + + current_page_range_ = page_ranges_itr_->NextRange(); + if (current_page_range_.index() != 0) { + throw ParquetException("RowRangesPageFilter expects first NextRange() to be a IntervalRange"); + } - while (row_range_idx < row_ranges.GetRanges().size() && - current_page_range.IsAfter(row_ranges[row_range_idx])) { - row_range_idx++; + while (current_row_range_.index() == 0 && + std::get(current_page_range_).IsAfter( + std::get(current_row_range_))) { + current_row_range_ = row_ranges_itr_->NextRange(); } - if (row_range_idx >= row_ranges.GetRanges().size()) { + if (current_row_range_.index() != 0) { return true; } - return current_page_range.IsBefore(row_ranges[row_range_idx]); + return std::get(current_page_range_).IsBefore( + std::get(current_row_range_)); } - size_t row_range_idx = 0; - const RowRanges & row_ranges; - - int page_range_idx = -1; - const 
std::shared_ptr page_ranges; + private: + const RowRanges& row_ranges_; + const std::shared_ptr page_ranges_; + std::unique_ptr row_ranges_itr_ = NULLPTR; + std::unique_ptr page_ranges_itr_ = NULLPTR; + std::variant current_row_range_ = End(); + std::variant current_page_range_ = End(); + bool initted = false; }; // Leaf reader is for primitive arrays and primitive children of nested arrays @@ -600,8 +619,8 @@ class LeafReader : public ColumnReaderImpl { private: std::shared_ptr out_; - void checkAndGetPageRanges(const RowRanges & row_ranges, - std::shared_ptr& page_ranges) const { + void checkAndGetPageRanges(const IntervalRanges& row_ranges, + std::shared_ptr& page_ranges) const { // check offset exists const auto rg_pg_index_reader = ctx_->reader->GetPageIndexReader()->RowGroup(input_->current_row_group()); @@ -622,7 +641,7 @@ class LeafReader : public ColumnReaderImpl { } const auto page_locations = offset_index->page_locations(); - page_ranges = std::make_shared(); + page_ranges = std::make_shared(); for (size_t i = 0; i < page_locations.size() - 1; i++) { page_ranges->Add( {page_locations[i].first_row_index, page_locations[i + 1].first_row_index - 1}); @@ -634,8 +653,8 @@ class LeafReader : public ColumnReaderImpl { 1}); } - if (row_ranges.GetRanges().size() > 0) { - if (row_ranges.GetRanges().back().end > page_ranges->GetRanges().back().end) { + if (row_ranges.RowCount() > 0) { + if (row_ranges.LastRow() > page_ranges->LastRow()) { throw ParquetException( "The provided row range " + row_ranges.ToString() + " exceeds last page :" + page_ranges->GetRanges().back().ToString()); @@ -647,14 +666,17 @@ class LeafReader : public ColumnReaderImpl { std::unique_ptr page_reader = input_->NextChunk(); /// using page index to reduce cost - if (page_reader != nullptr && ctx_->row_ranges_map) { + if (page_reader != nullptr && ctx_->row_ranges_per_rg) { // reset skipper record_reader_->set_record_skipper(NULLPTR); - const auto & row_ranges = 
(*ctx_->row_ranges_map)[input_->current_row_group()]; + const auto & row_ranges = (*ctx_->row_ranges_per_rg)[input_->current_row_group()]; if (row_ranges.RowCount() != 0) { + // BitmapRange is not supported yet, the following implementations + // are based on ItervalRanges assumption !!! + // if specific row range is provided for this rg - std::shared_ptr page_ranges; + std::shared_ptr page_ranges; checkAndGetPageRanges(row_ranges, page_ranges); // part 1, skip decompressing & decoding unnecessary pages @@ -1142,7 +1164,7 @@ Status GetReader(const SchemaField& field, const std::shared_ptr& Status FileReaderImpl::GetRecordBatchReaderWithRowRanges( const std::vector& row_groups, const std::vector& column_indices, - const std::shared_ptr> & row_ranges_map, + const std::shared_ptr> & row_ranges_per_rg, std::unique_ptr* out) { RETURN_NOT_OK(BoundsCheck(row_groups, column_indices)); @@ -1156,7 +1178,7 @@ Status FileReaderImpl::GetRecordBatchReaderWithRowRanges( std::vector> readers; std::shared_ptr<::arrow::Schema> batch_schema; - RETURN_NOT_OK(GetFieldReaders(column_indices, row_groups, row_ranges_map, &readers, + RETURN_NOT_OK(GetFieldReaders(column_indices, row_groups, row_ranges_per_rg, &readers, &batch_schema)); if (readers.empty()) { diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h index 807be797aad6..b439f82789a0 100644 --- a/cpp/src/parquet/arrow/reader.h +++ b/cpp/src/parquet/arrow/reader.h @@ -197,7 +197,7 @@ class PARQUET_EXPORT FileReader { /// /// \returns error Status if either rows_to_return or column_indices /// contains an invalid index - virtual ::arrow::Status GetRecordBatchReader(const RowRanges& rows_to_return, + virtual ::arrow::Status GetRecordBatchReader(const IntervalRanges& rows_to_return, const std::vector& column_indices, std::unique_ptr<::arrow::RecordBatchReader>* out) = 0; /// \brief Return a RecordBatchReader of row groups selected from @@ -219,7 +219,6 @@ class PARQUET_EXPORT FileReader { 
std::shared_ptr<::arrow::RecordBatchReader>* out); ::arrow::Status GetRecordBatchReader(std::shared_ptr<::arrow::RecordBatchReader>* out); - /// \brief Return a generator of record batches. /// /// The FileReader must outlive the generator, so this requires that you pass in a diff --git a/cpp/src/parquet/arrow/reader_internal.h b/cpp/src/parquet/arrow/reader_internal.h index 4d98f8a7fe5c..f579e62f610f 100644 --- a/cpp/src/parquet/arrow/reader_internal.h +++ b/cpp/src/parquet/arrow/reader_internal.h @@ -76,7 +76,7 @@ class FileColumnIterator { } auto row_group_reader = reader_->RowGroup(row_groups_.front()); - current_rg = row_groups_.front(); + current_rg_ = row_groups_.front(); row_groups_.pop_front(); return row_group_reader->GetColumnPageReader(column_index_); } @@ -89,14 +89,14 @@ class FileColumnIterator { int column_index() const { return column_index_; } - int current_row_group() const { return current_rg; } + int current_row_group() const { return current_rg_; } protected: int column_index_; ParquetFileReader* reader_; const SchemaDescriptor* schema_; std::deque row_groups_; - int current_rg = 0; + int current_rg_ = 0; }; using FileColumnIteratorFactory = @@ -113,7 +113,7 @@ struct ReaderContext { FileColumnIteratorFactory iterator_factory; bool filter_leaves; std::shared_ptr> included_leaves; - std::shared_ptr> row_ranges_map; + std::shared_ptr> row_ranges_per_rg; bool IncludesLeaf(int leaf_index) const { if (this->filter_leaves) { diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index b517ee7c798e..56e0f0b99450 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -1998,7 +1998,7 @@ class TypedRecordReader : public TypedColumnReaderImpl, } while (true) { - const auto advise = skipper->advise_next(current_rg_processed_records); + const auto advise = skipper->AdviseNext(current_rg_processed_records); if (advise == 0) { return 0; } diff --git a/cpp/src/parquet/column_reader.h 
b/cpp/src/parquet/column_reader.h index 7ebabf1f2095..4d9770296d92 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -321,7 +321,13 @@ struct IntervalRange { } } - size_t Count() const { return end - start + 1; } + size_t Count() const { + if(!IsValid()) { + throw ParquetException("Invalid range with start: " + std::to_string(start) + + " and end: " + std::to_string(end)); + } + return end - start + 1; + } bool IsBefore(const IntervalRange& other) const { return end < other.start; } @@ -355,22 +361,89 @@ struct End {}; class RowRanges { public: RowRanges() = default; + virtual ~RowRanges() = default; + virtual size_t RowCount() const = 0; + virtual int64_t LastRow() const = 0; + virtual bool IsValid() const = 0; + + // Returns a vector of PageLocations that must be read all to get values for + // all included in this range virtual std::vector + // PageIndexesToInclude(const std::vector& all_pages) = 0; + + class Iterator { + public: + virtual std::variant NextRange() = 0; + virtual ~Iterator() = default; + }; + virtual std::unique_ptr NewIterator() const = 0; + +}; - explicit RowRanges(const IntervalRange& range) { ranges.push_back(range); } +class IntervalRanges : public RowRanges { + public: + IntervalRanges() = default; + + explicit IntervalRanges(const IntervalRange& range) { ranges_.push_back(range); } + + IntervalRanges(const std::vector& ranges) { this->ranges_ = ranges; } - RowRanges(const std::vector& ranges) { this->ranges = ranges; } + IntervalRanges(const IntervalRanges& other) { ranges_ = other.ranges_; } - RowRanges(const RowRanges& other) { ranges = other.ranges; } + IntervalRanges(IntervalRanges&& other) noexcept { ranges_ = std::move(other.ranges_); } - RowRanges(RowRanges&& other) noexcept { ranges = std::move(other.ranges); } + class IntervalRowRangesIterator : public Iterator { + public: + IntervalRowRangesIterator(const std::vector & ranges) : ranges_(ranges) {} + ~IntervalRowRangesIterator() override {} + + 
std::variant NextRange() override { + if(current_index_ >= ranges_.size()) + return End(); + + return ranges_[current_index_++]; + } - static RowRanges Intersection(const RowRanges& left, const RowRanges& right) { - RowRanges result; + private: + const std::vector & ranges_; + size_t current_index_ = 0; + }; + + std::unique_ptr NewIterator() const override { + return std::make_unique(ranges_); + } + + size_t RowCount() const override { + size_t cnt = 0; + for (const IntervalRange& range : ranges_) { + cnt += range.Count(); + } + return cnt; + } + + int64_t LastRow() const override { + return ranges_.back().end; + } + + bool IsValid() const override { + if (ranges_.size() == 0) return true; + if (ranges_[0].start < 0) { + return false; + } + for (size_t i = 1; i < ranges_.size(); i++) { + if (ranges_[i].start <= ranges_[i - 1].end) { + return false; + } + } + return true; + } + + static IntervalRanges Intersection(const IntervalRanges& left, const IntervalRanges& right) { + IntervalRanges result; size_t rightIndex = 0; - for (const IntervalRange& l : left.ranges) { - for (size_t i = rightIndex, n = right.ranges.size(); i < n; ++i) { - const IntervalRange& r = right.ranges[i]; + for (const IntervalRange& l : left.ranges_) { + for (size_t i = rightIndex, n = right.ranges_.size(); i < n; ++i) { + const IntervalRange& r = right.ranges_[i]; if (l.IsBefore(r)) { break; } else if (l.IsAfter(r)) { @@ -386,31 +459,10 @@ class RowRanges { void Add(const IntervalRange& range) { const IntervalRange rangeToAdd = range; - if (ranges.size() > 1 && rangeToAdd.start <= ranges.back().end) { + if (ranges_.size() > 1 && rangeToAdd.start <= ranges_.back().end) { throw ParquetException("Ranges must be added in order"); } - ranges.push_back(rangeToAdd); - } - - size_t RowCount() const { - size_t cnt = 0; - for (const IntervalRange& range : ranges) { - cnt += range.Count(); - } - return cnt; - } - - bool IsValid() const { - if (ranges.size() == 0) return true; - if (ranges[0].start < 0) { 
- return false; - } - for (size_t i = 1; i < ranges.size(); i++) { - if (ranges[i].start <= ranges[i - 1].end) { - return false; - } - } - return true; + ranges_.push_back(rangeToAdd); } bool IsOverlapping(int64_t start, int64_t end) const { @@ -420,23 +472,23 @@ class RowRanges { bool IsOverlapping(const IntervalRange& searchRange) const { auto it = std::lower_bound( - ranges.begin(), ranges.end(), searchRange, + ranges_.begin(), ranges_.end(), searchRange, [](const IntervalRange& r1, const IntervalRange& r2) { return r1.IsBefore(r2); }); - return it != ranges.end() && !(*it).IsAfter(searchRange); + return it != ranges_.end() && !(*it).IsAfter(searchRange); } - std::vector& GetRanges() { return ranges; } + std::vector& GetRanges() { return ranges_; } - const std::vector& GetRanges() const { return ranges; } + const std::vector& GetRanges() const { return ranges_; } // Split the ranges into N+1 parts at the given split point, where N = - // split_points.size() The RowRows object itself is not modified - std::vector SplitAt(const std::vector& split_points) const { + // split_points.size(). The RowRows object itself is not modified + std::vector SplitAt(const std::vector& split_points) const { if (split_points.size() == 0) { return {*this}; } - std::vector result; + std::vector result; int64_t last_split_point = -1; for (const int64_t split_point : split_points) { if (split_point <= 0) { @@ -448,7 +500,7 @@ class RowRanges { last_split_point = split_point; } - RowRanges spaces; + IntervalRanges spaces; for (size_t i = 0; i < split_points.size(); ++i) { auto start = i == 0 ? 
0 : split_points[i - 1]; auto end = split_points[i] - 1; @@ -458,7 +510,7 @@ class RowRanges { {split_points[split_points.size() - 1], std::numeric_limits::max()}); for (IntervalRange space : spaces.GetRanges()) { - RowRanges intersection = RowRanges::Intersection(RowRanges(space), *this); + IntervalRanges intersection = IntervalRanges::Intersection(IntervalRanges(space), *this); result.push_back(intersection); } @@ -467,15 +519,15 @@ class RowRanges { const IntervalRange& operator[](size_t index) const { // check index - if (index >= ranges.size() || index < 0) { + if (index >= ranges_.size() || index < 0) { throw ParquetException("Index out of range"); } - return ranges[index]; + return ranges_[index]; } - RowRanges shift(const int64_t offset) const { - RowRanges result; - for (const IntervalRange& range : ranges) { + IntervalRanges shift(const int64_t offset) const { + IntervalRanges result; + for (const IntervalRange& range : ranges_) { result.Add({range.start + offset, range.end + offset}); } return result; @@ -483,39 +535,30 @@ class RowRanges { std::string ToString() const { std::string result = "["; - for (const IntervalRange& range : ranges) { + for (const IntervalRange& range : ranges_) { result += "(" + std::to_string(range.start) + ", " + std::to_string(range.end) + "), "; } - if (!ranges.empty()) { + if (!ranges_.empty()) { result = result.substr(0, result.size() - 2); } result += "]"; return result; } - /// The following APIs are to be implemented - /// Comment out for now to pass compile - // // Returns a vector of PageLocations that must be read all to get values for - // all included in this range virtual std::vector - // PageIndexesToInclude(const std::vector& all_pages) = 0; class - // Iterator { - // virtual std::variant NextRange() = 0; - // }; - // virtual std::unique_ptr NewIterator() = 0; private: - std::vector ranges; + std::vector ranges_; }; namespace internal { class PARQUET_EXPORT RecordSkipper { public: - RecordSkipper(RowRanges& 
pages, const RowRanges& row_ranges_) - : row_ranges(row_ranges_) { + RecordSkipper(IntervalRanges& pages, const IntervalRanges& row_ranges) + : row_ranges_(row_ranges) { // copy row_ranges - RowRanges will_process_pages, skip_pages; + IntervalRanges skip_pages; for (auto& page : pages.GetRanges()) { if (!row_ranges.IsOverlapping(page)) { skip_pages.Add(page); @@ -525,39 +568,39 @@ class PARQUET_EXPORT RecordSkipper { /// Since the skipped pages will be silently skipped without updating /// current_rg_processed_records or records_read_, we need to pre-process the row /// ranges as if these skipped pages never existed - adjust_ranges(skip_pages, row_ranges); + AdjustRanges(skip_pages, row_ranges_); - total_rows_to_process = pages.RowCount() - skip_pages.RowCount(); + total_rows_to_process_ = pages.RowCount() - skip_pages.RowCount(); } /// \brief Return the number of records to read or to skip /// if return values is positive, it means to read N records /// if return values is negative, it means to skip N records /// if return values is 0, it means end of RG - int64_t advise_next(const int64_t current_rg_processed) { - if (row_ranges.GetRanges().size() == row_range_idx) { + int64_t AdviseNext(const int64_t current_rg_processed) { + if (row_ranges_.GetRanges().size() == row_range_idx_) { return 0; } - if (row_ranges[row_range_idx].end < current_rg_processed) { - row_range_idx++; - if (row_ranges.GetRanges().size() == row_range_idx) { + if (row_ranges_[row_range_idx_].end < current_rg_processed) { + row_range_idx_++; + if (row_ranges_.GetRanges().size() == row_range_idx_) { // negative, skip the ramaining rows - return current_rg_processed - total_rows_to_process; + return current_rg_processed - total_rows_to_process_; } } - if (row_ranges[row_range_idx].start > current_rg_processed) { + if (row_ranges_[row_range_idx_].start > current_rg_processed) { // negative, skip - return current_rg_processed - row_ranges[row_range_idx].start; + return current_rg_processed - 
row_ranges_[row_range_idx_].start; } - const auto ret = row_ranges[row_range_idx].end - current_rg_processed + 1; + const auto ret = row_ranges_[row_range_idx_].end - current_rg_processed + 1; return ret; } private: - void adjust_ranges(RowRanges& skip_pages, RowRanges& to_adjust) { + void AdjustRanges(IntervalRanges& skip_pages, IntervalRanges& to_adjust) { size_t skipped_rows = 0; auto iter = to_adjust.GetRanges().begin(); auto skip_iter = skip_pages.GetRanges().begin(); @@ -572,11 +615,11 @@ class PARQUET_EXPORT RecordSkipper { } } - /// Keep copy of ranges, because adjust_ranges() will modify them - RowRanges row_ranges; + /// Keep copy of ranges, because AdjustRanges() will modify them + IntervalRanges row_ranges_; - size_t row_range_idx = 0; - size_t total_rows_to_process = 0; + size_t row_range_idx_ = 0; + size_t total_rows_to_process_ = 0; }; /// \brief Stateful column reader that delimits semantic records for both flat diff --git a/cpp/src/parquet/range_reader_test.cc b/cpp/src/parquet/range_reader_test.cc index b3127a8e346c..cde60c583f50 100644 --- a/cpp/src/parquet/range_reader_test.cc +++ b/cpp/src/parquet/range_reader_test.cc @@ -34,7 +34,7 @@ #include using parquet::IntervalRange; -using parquet::RowRanges; +using parquet::IntervalRanges; std::string random_string(std::string::size_type length) { static auto& chrs = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; @@ -279,7 +279,7 @@ TEST_F(TestRecordBatchReaderWithRanges, TestRangesSplit) {} TEST_F(TestRecordBatchReaderWithRanges, SelectOnePageForEachRG) { std::unique_ptr rb_reader; - RowRanges rows{{IntervalRange{0, 9}, IntervalRange{40, 49}, IntervalRange{80, 89}, IntervalRange{90, 99}}}; + IntervalRanges rows{{IntervalRange{0, 9}, IntervalRange{40, 49}, IntervalRange{80, 89}, IntervalRange{90, 99}}}; const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader)); @@ -290,7 +290,7 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOnePageForEachRG) { 
TEST_F(TestRecordBatchReaderWithRanges, SelectSomePageForOneRG) { std::unique_ptr rb_reader; - RowRanges rows{{IntervalRange{0, 7}, IntervalRange{16, 23}}}; + IntervalRanges rows{{IntervalRange{0, 7}, IntervalRange{16, 23}}}; const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader)); @@ -301,7 +301,7 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectSomePageForOneRG) { TEST_F(TestRecordBatchReaderWithRanges, SelectAllRange) { std::unique_ptr rb_reader; - RowRanges rows{{IntervalRange{0, 29}, IntervalRange{30, 59}, IntervalRange{60, 89}, IntervalRange{90, 99}}}; + IntervalRanges rows{{IntervalRange{0, 29}, IntervalRange{30, 59}, IntervalRange{60, 89}, IntervalRange{90, 99}}}; const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader)); @@ -312,7 +312,7 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectAllRange) { TEST_F(TestRecordBatchReaderWithRanges, SelectEmptyRange) { std::unique_ptr rb_reader; - RowRanges rows{}; + IntervalRanges rows{}; const std::vector column_indices{0, 1, 2, 3, 4}; const auto status = @@ -330,7 +330,7 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { if (i % 2 == 0) ranges.push_back({i, i}); } const std::vector column_indices{0, 1, 2, 3, 4}; - ASSERT_OK(arrow_reader->GetRecordBatchReader(RowRanges(ranges), column_indices, + ASSERT_OK(arrow_reader->GetRecordBatchReader(IntervalRanges(ranges), column_indices, &rb_reader)); check_rb(std::move(rb_reader), 15, 210); // 0 + 2 + ... 
+ 28 = 210 @@ -348,7 +348,7 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { if (i % 2 == 0) ranges.push_back({i, i}); } const std::vector column_indices{0, 1, 2, 3, 4}; - ASSERT_OK(arrow_reader->GetRecordBatchReader(RowRanges(ranges), column_indices, + ASSERT_OK(arrow_reader->GetRecordBatchReader(IntervalRanges(ranges), column_indices, &rb_reader)); check_rb(std::move(rb_reader), 30, @@ -359,7 +359,7 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { TEST_F(TestRecordBatchReaderWithRanges, InvalidRanges) { std::unique_ptr rb_reader; { - RowRanges rows{{IntervalRange{-1, 5}}}; + IntervalRanges rows{{IntervalRange{-1, 5}}}; const std::vector column_indices{0, 1, 2, 3, 4}; const auto status = arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader); @@ -370,7 +370,7 @@ TEST_F(TestRecordBatchReaderWithRanges, InvalidRanges) { } { - RowRanges rows{{IntervalRange{0, 4}, {2, 5}}}; + IntervalRanges rows{{IntervalRange{0, 4}, {2, 5}}}; const std::vector column_indices{0, 1, 2, 3, 4}; const auto status = arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader); @@ -381,7 +381,7 @@ TEST_F(TestRecordBatchReaderWithRanges, InvalidRanges) { } { // will treat as {0,99} - RowRanges rows{{IntervalRange{0, 100}}}; + IntervalRanges rows{{IntervalRange{0, 100}}}; const std::vector column_indices{0, 1, 2, 3, 4}; const auto status = arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader); @@ -430,7 +430,7 @@ TEST(TestRecordBatchReaderWithRangesBadCases, NoPageIndex) { ASSERT_OK_AND_ASSIGN(auto arrow_reader, reader_builder.Build()); std::unique_ptr rb_reader; - RowRanges rows{{IntervalRange{0, 29}}}; + IntervalRanges rows{{IntervalRange{0, 29}}}; std::vector column_indices{0, 1, 2, 3, 4}; auto status = arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader); ASSERT_NOT_OK(status); @@ -479,7 +479,7 @@ TEST_F(TestRecordBatchReaderWithRangesWithNulls, SelectOneRowSkipOneRow) { if (i % 2 == 0) 
ranges.push_back({i, i}); } const std::vector column_indices{0, 1, 2, 3, 4}; - ASSERT_OK(arrow_reader->GetRecordBatchReader(RowRanges(ranges), column_indices, + ASSERT_OK(arrow_reader->GetRecordBatchReader(IntervalRanges(ranges), column_indices, &rb_reader)); // 0-9 is masked as null, so the ramaining is: diff --git a/cpp/src/parquet/row_range_test.cc b/cpp/src/parquet/row_range_test.cc index 2e043f57a7b2..81b38bc28268 100644 --- a/cpp/src/parquet/row_range_test.cc +++ b/cpp/src/parquet/row_range_test.cc @@ -21,7 +21,7 @@ using namespace parquet; class RowRangesTest : public ::testing::Test { protected: - RowRanges rowRanges; + IntervalRanges rowRanges; }; TEST_F(RowRangesTest, SplitAt_EmptySplitPoints_ReturnsOriginalRowRanges) { From 25f83f8850005961fa4b6529d61f3af63fb9eaa0 Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Sun, 14 Jan 2024 22:44:51 +0800 Subject: [PATCH 17/25] checkAndGetPageRanges refactored --- cpp/src/parquet/arrow/reader.cc | 7 +++---- cpp/src/parquet/column_reader.h | 8 ++++---- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 8289b63b475d..843e4f227659 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -619,7 +619,7 @@ class LeafReader : public ColumnReaderImpl { private: std::shared_ptr out_; - void checkAndGetPageRanges(const IntervalRanges& row_ranges, + void checkAndGetPageRanges(const RowRanges& row_ranges, std::shared_ptr& page_ranges) const { // check offset exists const auto rg_pg_index_reader = @@ -671,11 +671,10 @@ class LeafReader : public ColumnReaderImpl { record_reader_->set_record_skipper(NULLPTR); const auto & row_ranges = (*ctx_->row_ranges_per_rg)[input_->current_row_group()]; + // if specific row range is provided for this rg if (row_ranges.RowCount() != 0) { - // BitmapRange is not supported yet, the following implementations - // are based on ItervalRanges assumption !!! 
- // if specific row range is provided for this rg + // Use IntervalRanges to represent pages std::shared_ptr page_ranges; checkAndGetPageRanges(row_ranges, page_ranges); diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 4d9770296d92..35924581c8a5 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -340,7 +340,7 @@ struct IntervalRange { bool IsValid() const { return start >= 0 && end >= 0 && end >= start; } std::string ToString() const { - return "[" + std::to_string(start) + ", " + std::to_string(end) + "]"; + return "(" + std::to_string(start) + ", " + std::to_string(end) + ")"; } // inclusive @@ -365,6 +365,7 @@ class RowRanges { virtual size_t RowCount() const = 0; virtual int64_t LastRow() const = 0; virtual bool IsValid() const = 0; + virtual std::string ToString() const = 0; // Returns a vector of PageLocations that must be read all to get values for // all included in this range virtual std::vector @@ -533,11 +534,10 @@ class IntervalRanges : public RowRanges { return result; } - std::string ToString() const { + std::string ToString() const override { std::string result = "["; for (const IntervalRange& range : ranges_) { - result += - "(" + std::to_string(range.start) + ", " + std::to_string(range.end) + "), "; + result += range.ToString() + ", "; } if (!ranges_.empty()) { result = result.substr(0, result.size() - 2); From 2e43866ab4b6e0d2375096c2d47c8aaf2ea0a79e Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Sun, 14 Jan 2024 23:22:30 +0800 Subject: [PATCH 18/25] RecordSkipper refactored --- cpp/src/parquet/column_reader.h | 59 +++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 25 deletions(-) diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 35924581c8a5..b18ef38c7006 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -365,6 +365,7 @@ class RowRanges { virtual size_t RowCount() const = 0; virtual 
int64_t LastRow() const = 0; virtual bool IsValid() const = 0; + virtual bool IsOverlapping(const IntervalRange& searchRange) const = 0; virtual std::string ToString() const = 0; // Returns a vector of PageLocations that must be read all to get values for @@ -471,7 +472,7 @@ class IntervalRanges : public RowRanges { return IsOverlapping(searchRange); } - bool IsOverlapping(const IntervalRange& searchRange) const { + bool IsOverlapping(const IntervalRange& searchRange) const override { auto it = std::lower_bound( ranges_.begin(), ranges_.end(), searchRange, [](const IntervalRange& r1, const IntervalRange& r2) { return r1.IsBefore(r2); }); @@ -546,8 +547,6 @@ class IntervalRanges : public RowRanges { return result; } - - private: std::vector ranges_; }; @@ -555,12 +554,11 @@ class IntervalRanges : public RowRanges { namespace internal { class PARQUET_EXPORT RecordSkipper { public: - RecordSkipper(IntervalRanges& pages, const IntervalRanges& row_ranges) - : row_ranges_(row_ranges) { + RecordSkipper(IntervalRanges& pages, const RowRanges& orig_row_ranges) { // copy row_ranges IntervalRanges skip_pages; for (auto& page : pages.GetRanges()) { - if (!row_ranges.IsOverlapping(page)) { + if (!orig_row_ranges.IsOverlapping(page)) { skip_pages.Add(page); } } @@ -568,7 +566,9 @@ class PARQUET_EXPORT RecordSkipper { /// Since the skipped pages will be silently skipped without updating /// current_rg_processed_records or records_read_, we need to pre-process the row /// ranges as if these skipped pages never existed - AdjustRanges(skip_pages, row_ranges_); + AdjustRanges(skip_pages, orig_row_ranges, row_ranges_); + range_iter_ = row_ranges_->NewIterator(); + current_range_variant = range_iter_->NextRange(); total_rows_to_process_ = pages.RowCount() - skip_pages.RowCount(); } @@ -578,47 +578,56 @@ class PARQUET_EXPORT RecordSkipper { /// if return values is negative, it means to skip N records /// if return values is 0, it means end of RG int64_t AdviseNext(const int64_t 
current_rg_processed) { - if (row_ranges_.GetRanges().size() == row_range_idx_) { + if (current_range_variant.index() == 2) { return 0; } - if (row_ranges_[row_range_idx_].end < current_rg_processed) { - row_range_idx_++; - if (row_ranges_.GetRanges().size() == row_range_idx_) { + auto & current_range = std::get(current_range_variant); + + if (current_range.end < current_rg_processed) { + current_range_variant = range_iter_->NextRange(); + if (current_range_variant.index() == 2) { // negative, skip the ramaining rows return current_rg_processed - total_rows_to_process_; } } - if (row_ranges_[row_range_idx_].start > current_rg_processed) { + current_range = std::get(current_range_variant); + + if (current_range.start > current_rg_processed) { // negative, skip - return current_rg_processed - row_ranges_[row_range_idx_].start; + return current_rg_processed - current_range.start; } - const auto ret = row_ranges_[row_range_idx_].end - current_rg_processed + 1; + const auto ret = current_range.end - current_rg_processed + 1; return ret; } - private: - void AdjustRanges(IntervalRanges& skip_pages, IntervalRanges& to_adjust) { +private: + void AdjustRanges(IntervalRanges& skip_pages, const RowRanges& orig_row_ranges, std::unique_ptr& ret) { + std::unique_ptr temp = std::make_unique(); + size_t skipped_rows = 0; - auto iter = to_adjust.GetRanges().begin(); + const auto orig_range_iter = orig_row_ranges.NewIterator(); + auto orig_range_variant = orig_range_iter->NextRange(); auto skip_iter = skip_pages.GetRanges().begin(); - while (iter != to_adjust.GetRanges().end()) { - while (skip_iter != skip_pages.GetRanges().end() && skip_iter->IsBefore(*iter)) { + while (orig_range_variant.index() != 2) { + const auto & origin_range = std::get(orig_range_variant); + while (skip_iter != skip_pages.GetRanges().end() && skip_iter->IsBefore(origin_range)) { skipped_rows += skip_iter->Count(); ++skip_iter; } - iter->start -= skipped_rows; - iter->end -= skipped_rows; - ++iter; + + 
temp->Add(IntervalRange(origin_range.start - skipped_rows, origin_range.end - skipped_rows)); + orig_range_variant = orig_range_iter->NextRange(); } + ret = std::move(temp); } - /// Keep copy of ranges, because AdjustRanges() will modify them - IntervalRanges row_ranges_; + std::unique_ptr row_ranges_; + std::unique_ptr range_iter_; + std::variant current_range_variant = End(); - size_t row_range_idx_ = 0; size_t total_rows_to_process_ = 0; }; From cb0d67b27dfd0c01ac7ea04f147d3ec484034846 Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Mon, 15 Jan 2024 18:20:04 +0800 Subject: [PATCH 19/25] refactor complete --- cpp/src/parquet/arrow/reader.cc | 47 +++--- cpp/src/parquet/arrow/reader.h | 2 +- cpp/src/parquet/arrow/reader_internal.h | 2 +- cpp/src/parquet/column_reader.h | 165 +++++++++----------- cpp/src/parquet/row_range_test.cc | 193 ++++++++++++++++++------ 5 files changed, 239 insertions(+), 170 deletions(-) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 843e4f227659..49aeeb3f0f5e 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -205,7 +205,7 @@ class FileReaderImpl : public FileReader { Status GetFieldReader( int i, const std::shared_ptr>& included_leaves, const std::vector& row_groups, - const std::shared_ptr> & row_ranges_per_rg, + const std::shared_ptr>> & row_ranges_per_rg, std::unique_ptr* out) { // Should be covered by GetRecordBatchReader checks but // manifest_.schema_fields is a separate variable so be extra careful. 
@@ -222,13 +222,13 @@ class FileReaderImpl : public FileReader { ctx->iterator_factory = SomeRowGroupsFactory(row_groups); ctx->filter_leaves = true; ctx->included_leaves = included_leaves; - ctx->row_ranges_per_rg = row_ranges_per_rg; + ctx->row_ranges_per_rg = row_ranges_per_rg; // copy the shared pointer to extend its lifecycle return GetReader(manifest_.schema_fields[i], ctx, out); } Status GetFieldReaders( const std::vector& column_indices, const std::vector& row_groups, - const std::shared_ptr> & row_ranges_per_rg, + const std::shared_ptr>> & row_ranges_per_rg, std::vector>* out, std::shared_ptr<::arrow::Schema>* out_schema) { // We only need to read schema fields which have columns indicated @@ -344,10 +344,10 @@ class FileReaderImpl : public FileReader { // This is a internal API owned by FileReaderImpl, not exposed in FileReader Status GetRecordBatchReaderWithRowRanges(const std::vector& row_group_indices, const std::vector& column_indices, - const std::shared_ptr> & row_ranges_per_rg, + const std::shared_ptr>> & row_ranges_per_rg, std::unique_ptr* out); - Status GetRecordBatchReader(const IntervalRanges& rows_to_return, + Status GetRecordBatchReader(const RowRanges& rows_to_return, const std::vector& column_indices, std::unique_ptr* out) override { const auto metadata = reader_->metadata(); @@ -362,27 +362,24 @@ class FileReaderImpl : public FileReader { " exceeds the number of rows in the file: " + std::to_string(metadata->num_rows())); } + if (rows_to_return.RowCount() == 0) { + return GetRecordBatchReaderWithRowRanges({}, column_indices, {}, out); + } - std::vector split_points; - int64_t rows_so_far = 0; - for (int i = 0 ; i < metadata->num_row_groups() - 1; i++) { - rows_so_far += metadata->RowGroup(i)->num_rows(); - split_points.push_back(rows_so_far); + std::vector rows_per_rg; + for (int i = 0 ; i < metadata->num_row_groups(); i++) { + rows_per_rg.push_back( metadata->RowGroup(i)->num_rows()); } // We'll assign a RowRanges for each RG, even if 
it's not required to return any rows - const std::vector splits = rows_to_return.SplitAt(split_points); - const std::shared_ptr> row_ranges_per_rg = - std::make_shared>(); - rows_so_far = 0; + std::vector> row_ranges_per_rg = rows_to_return.SplitByRowGroups(rows_per_rg); std::vector row_group_indices; for (int i = 0 ; i < metadata->num_row_groups(); i++) { - row_ranges_per_rg->push_back(splits[i].shift(-rows_so_far)); - rows_so_far += metadata->RowGroup(i)->num_rows(); - if (row_ranges_per_rg->at(i).RowCount() > 0) + if (row_ranges_per_rg.at(i)->RowCount() > 0) row_group_indices.push_back(i); } - return GetRecordBatchReaderWithRowRanges(row_group_indices, column_indices, row_ranges_per_rg, out); + return GetRecordBatchReaderWithRowRanges(row_group_indices, column_indices, + std::make_shared>>(std::move(row_ranges_per_rg)), out); } Status GetRecordBatchReader(const std::vector& row_group_indices, @@ -502,7 +499,9 @@ class RowGroupReaderImpl : public RowGroupReader { // ---------------------------------------------------------------------- // Column reader implementations -// Only support IntervalRange case for now +// This class is used to skip decompressing & decoding unnecessary pages by comparing user-specified row_ranges +// and page_ranges from metadata. +// Only support IntervalRange case for now. 
class RowRangesPageFilter { public: RowRangesPageFilter(const RowRanges& row_ranges, const std::shared_ptr& page_ranges) @@ -672,20 +671,20 @@ class LeafReader : public ColumnReaderImpl { const auto & row_ranges = (*ctx_->row_ranges_per_rg)[input_->current_row_group()]; // if specific row range is provided for this rg - if (row_ranges.RowCount() != 0) { + if (row_ranges->RowCount() != 0) { // Use IntervalRanges to represent pages std::shared_ptr page_ranges; - checkAndGetPageRanges(row_ranges, page_ranges); + checkAndGetPageRanges(*row_ranges, page_ranges); // part 1, skip decompressing & decoding unnecessary pages page_reader->set_data_page_filter( - RowRangesPageFilter(row_ranges, page_ranges)); + RowRangesPageFilter(*row_ranges, page_ranges)); // part 2, skip unnecessary rows in necessary pages record_reader_->set_record_skipper( std::make_shared(*page_ranges, - row_ranges)); + *row_ranges)); } else { NextRowGroup(); return; @@ -1163,7 +1162,7 @@ Status GetReader(const SchemaField& field, const std::shared_ptr& Status FileReaderImpl::GetRecordBatchReaderWithRowRanges( const std::vector& row_groups, const std::vector& column_indices, - const std::shared_ptr> & row_ranges_per_rg, + const std::shared_ptr>> & row_ranges_per_rg, std::unique_ptr* out) { RETURN_NOT_OK(BoundsCheck(row_groups, column_indices)); diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h index b439f82789a0..98ea6f5c1a05 100644 --- a/cpp/src/parquet/arrow/reader.h +++ b/cpp/src/parquet/arrow/reader.h @@ -197,7 +197,7 @@ class PARQUET_EXPORT FileReader { /// /// \returns error Status if either rows_to_return or column_indices /// contains an invalid index - virtual ::arrow::Status GetRecordBatchReader(const IntervalRanges& rows_to_return, + virtual ::arrow::Status GetRecordBatchReader(const RowRanges& rows_to_return, const std::vector& column_indices, std::unique_ptr<::arrow::RecordBatchReader>* out) = 0; /// \brief Return a RecordBatchReader of row groups selected from 
diff --git a/cpp/src/parquet/arrow/reader_internal.h b/cpp/src/parquet/arrow/reader_internal.h index f579e62f610f..b30aef2691c1 100644 --- a/cpp/src/parquet/arrow/reader_internal.h +++ b/cpp/src/parquet/arrow/reader_internal.h @@ -113,7 +113,7 @@ struct ReaderContext { FileColumnIteratorFactory iterator_factory; bool filter_leaves; std::shared_ptr> included_leaves; - std::shared_ptr> row_ranges_per_rg; + std::shared_ptr>> row_ranges_per_rg; bool IncludesLeaf(int leaf_index) const { if (this->filter_leaves) { diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index b18ef38c7006..5b9a96b27df9 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -366,6 +366,10 @@ class RowRanges { virtual int64_t LastRow() const = 0; virtual bool IsValid() const = 0; virtual bool IsOverlapping(const IntervalRange& searchRange) const = 0; + // Given a RowRanges with rows accross all RGs, split it into N RowRanges, where N = number of RGs + // e.g.: suppose we have 2 RGs: [0-99] and [100-199], and user is interested in RowRanges [90-110], then + // this function will return 2 RowRanges: [90-99] and [0-10] + virtual std::vector> SplitByRowGroups(const std::vector& rows_per_rg) const = 0; virtual std::string ToString() const = 0; // Returns a vector of PageLocations that must be read all to get values for @@ -387,26 +391,20 @@ class IntervalRanges : public RowRanges { explicit IntervalRanges(const IntervalRange& range) { ranges_.push_back(range); } - IntervalRanges(const std::vector& ranges) { this->ranges_ = ranges; } - - IntervalRanges(const IntervalRanges& other) { ranges_ = other.ranges_; } - - IntervalRanges(IntervalRanges&& other) noexcept { ranges_ = std::move(other.ranges_); } - class IntervalRowRangesIterator : public Iterator { - public: - IntervalRowRangesIterator(const std::vector & ranges) : ranges_(ranges) {} + public: + IntervalRowRangesIterator(const std::vector& ranges) + : ranges_(ranges) {} 
~IntervalRowRangesIterator() override {} std::variant NextRange() override { - if(current_index_ >= ranges_.size()) - return End(); + if (current_index_ >= ranges_.size()) return End(); return ranges_[current_index_++]; } - private: - const std::vector & ranges_; + private: + const std::vector& ranges_; size_t current_index_ = 0; }; @@ -422,9 +420,7 @@ class IntervalRanges : public RowRanges { return cnt; } - int64_t LastRow() const override { - return ranges_.back().end; - } + int64_t LastRow() const override { return ranges_.back().end; } bool IsValid() const override { if (ranges_.size() == 0) return true; @@ -439,39 +435,6 @@ class IntervalRanges : public RowRanges { return true; } - static IntervalRanges Intersection(const IntervalRanges& left, const IntervalRanges& right) { - IntervalRanges result; - - size_t rightIndex = 0; - for (const IntervalRange& l : left.ranges_) { - for (size_t i = rightIndex, n = right.ranges_.size(); i < n; ++i) { - const IntervalRange& r = right.ranges_[i]; - if (l.IsBefore(r)) { - break; - } else if (l.IsAfter(r)) { - rightIndex = i + 1; - continue; - } - result.Add(IntervalRange::Intersection(l, r)); - } - } - - return result; - } - - void Add(const IntervalRange& range) { - const IntervalRange rangeToAdd = range; - if (ranges_.size() > 1 && rangeToAdd.start <= ranges_.back().end) { - throw ParquetException("Ranges must be added in order"); - } - ranges_.push_back(rangeToAdd); - } - - bool IsOverlapping(int64_t start, int64_t end) const { - const IntervalRange searchRange(start, end); - return IsOverlapping(searchRange); - } - bool IsOverlapping(const IntervalRange& searchRange) const override { auto it = std::lower_bound( ranges_.begin(), ranges_.end(), searchRange, @@ -479,79 +442,93 @@ class IntervalRanges : public RowRanges { return it != ranges_.end() && !(*it).IsAfter(searchRange); } - std::vector& GetRanges() { return ranges_; } - - const std::vector& GetRanges() const { return ranges_; } - - // Split the ranges into N+1 
parts at the given split point, where N = - // split_points.size(). The RowRows object itself is not modified - std::vector SplitAt(const std::vector& split_points) const { - if (split_points.size() == 0) { - return {*this}; + std::string ToString() const override { + std::string result = "["; + for (const IntervalRange& range : ranges_) { + result += range.ToString() + ", "; + } + if (!ranges_.empty()) { + result = result.substr(0, result.size() - 2); } + result += "]"; + return result; + } - std::vector result; - int64_t last_split_point = -1; - for (const int64_t split_point : split_points) { - if (split_point <= 0) { - throw ParquetException("Invalid split point " + std::to_string(split_point)); - } - if (split_point <= last_split_point) { - throw ParquetException("Split points must be in ascending order"); - } - last_split_point = split_point; + std::vector> SplitByRowGroups( + const std::vector& rows_per_rg) const override { + if (rows_per_rg.size() <= 1) { + std::unique_ptr single = + std::make_unique(*this); // return a copy of itself + auto ret = std::vector>(); + ret.push_back(std::move(single)); + return ret; } + std::vector> result; + IntervalRanges spaces; - for (size_t i = 0; i < split_points.size(); ++i) { - auto start = i == 0 ? 0 : split_points[i - 1]; - auto end = split_points[i] - 1; + int64_t rows_so_far = 0; + for (size_t i = 0; i < rows_per_rg.size(); ++i) { + auto start = rows_so_far; + rows_so_far += rows_per_rg[i]; + auto end = rows_so_far - 1; spaces.Add({start, end}); } - spaces.Add( - {split_points[split_points.size() - 1], std::numeric_limits::max()}); + // each RG's row range forms a space, we need to adjust RowRanges in each space to + // zero based. 
for (IntervalRange space : spaces.GetRanges()) { - IntervalRanges intersection = IntervalRanges::Intersection(IntervalRanges(space), *this); - result.push_back(intersection); + auto intersection = Intersection(IntervalRanges(space), *this); + + std::unique_ptr zero_based_ranges = + std::make_unique(); + for (const IntervalRange& range : intersection.GetRanges()) { + zero_based_ranges->Add({range.start - space.start, range.end - space.start}); + } + result.push_back(std::move(zero_based_ranges)); } return result; } - const IntervalRange& operator[](size_t index) const { - // check index - if (index >= ranges_.size() || index < 0) { - throw ParquetException("Index out of range"); - } - return ranges_[index]; - } - - IntervalRanges shift(const int64_t offset) const { + static IntervalRanges Intersection(const IntervalRanges& left, + const IntervalRanges& right) { IntervalRanges result; - for (const IntervalRange& range : ranges_) { - result.Add({range.start + offset, range.end + offset}); + + size_t rightIndex = 0; + for (const IntervalRange& l : left.ranges_) { + for (size_t i = rightIndex, n = right.ranges_.size(); i < n; ++i) { + const IntervalRange& r = right.ranges_[i]; + if (l.IsBefore(r)) { + break; + } else if (l.IsAfter(r)) { + rightIndex = i + 1; + continue; + } + result.Add(IntervalRange::Intersection(l, r)); + } } + return result; } - std::string ToString() const override { - std::string result = "["; - for (const IntervalRange& range : ranges_) { - result += range.ToString() + ", "; - } - if (!ranges_.empty()) { - result = result.substr(0, result.size() - 2); + void Add(const IntervalRange& range) { + const IntervalRange rangeToAdd = range; + if (ranges_.size() > 1 && rangeToAdd.start <= ranges_.back().end) { + throw ParquetException("Ranges must be added in order"); } - result += "]"; - return result; + ranges_.push_back(rangeToAdd); } + const std::vector& GetRanges() const { return ranges_; } + private: std::vector ranges_; }; namespace internal { + +// 
A RecordSkipper is used to skip uncessary rows within each pages. class PARQUET_EXPORT RecordSkipper { public: RecordSkipper(IntervalRanges& pages, const RowRanges& orig_row_ranges) { diff --git a/cpp/src/parquet/row_range_test.cc b/cpp/src/parquet/row_range_test.cc index 81b38bc28268..82ad60c6b3fe 100644 --- a/cpp/src/parquet/row_range_test.cc +++ b/cpp/src/parquet/row_range_test.cc @@ -24,79 +24,172 @@ class RowRangesTest : public ::testing::Test { IntervalRanges rowRanges; }; -TEST_F(RowRangesTest, SplitAt_EmptySplitPoints_ReturnsOriginalRowRanges) { +TEST_F(RowRangesTest, EmptyRG_ReturnsOriginalRowRanges) { rowRanges.Add(IntervalRange(0, 10)); - std::vector split_points; - - auto result = rowRanges.SplitAt(split_points); + std::vector rows_per_rg; + auto result = rowRanges.SplitByRowGroups(rows_per_rg); ASSERT_EQ(result.size(), 1); - ASSERT_EQ(result[0].GetRanges().size(), 1); - ASSERT_EQ(result[0][0].start, 0); - ASSERT_EQ(result[0][0].end, 10); + + auto iter = result[0]->NewIterator(); + auto range = std::get(iter->NextRange()); + ASSERT_EQ(range.start, 0); + ASSERT_EQ(range.end, 10); + ASSERT_EQ(iter->NextRange().index(), 2); } -TEST_F(RowRangesTest, SplitAt_SingleSplitPoint_ReturnsTwoRowRanges) { +TEST_F(RowRangesTest, SingleRG_ReturnsOriginalRowRanges2) { rowRanges.Add(IntervalRange(0, 10)); - std::vector split_points = {5}; + std::vector rows_per_rg = {11}; - auto result = rowRanges.SplitAt(split_points); + auto result = rowRanges.SplitByRowGroups(rows_per_rg); + ASSERT_EQ(result.size(), 1); - ASSERT_EQ(result.size(), 2); - ASSERT_EQ(result[0].GetRanges().size(), 1); - ASSERT_EQ(result[0][0].start, 0); - ASSERT_EQ(result[0][0].end, 4); - ASSERT_EQ(result[1].GetRanges().size(), 1); - ASSERT_EQ(result[1][0].start, 5); - ASSERT_EQ(result[1][0].end, 10); + auto iter = result[0]->NewIterator(); + auto range = std::get(iter->NextRange()); + ASSERT_EQ(range.start, 0); + ASSERT_EQ(range.end, 10); + ASSERT_EQ(iter->NextRange().index(), 2); } -TEST_F(RowRangesTest, 
SplitAt_MultipleSplitPoints_ReturnsMultipleRowRanges) { +TEST_F(RowRangesTest, ReturnsTwoRowRanges) { rowRanges.Add(IntervalRange(0, 10)); - std::vector split_points = {3, 7}; + std::vector rows_per_rg = {5, 6}; - auto result = rowRanges.SplitAt(split_points); + auto result = rowRanges.SplitByRowGroups(rows_per_rg); + ASSERT_EQ(result.size(), 2); + { + auto iter = result[0]->NewIterator(); + auto range = std::get(iter->NextRange()); + ASSERT_EQ(range.start, 0); + ASSERT_EQ(range.end, 4); + ASSERT_EQ(iter->NextRange().index(), 2); + } + { + auto iter = result[1]->NewIterator(); + auto range = std::get(iter->NextRange()); + ASSERT_EQ(range.start, 0); + ASSERT_EQ(range.end, 5); + ASSERT_EQ(iter->NextRange().index(), 2); + } +} +TEST_F(RowRangesTest, ReturnsMultipleRowRanges) { + rowRanges.Add(IntervalRange(0, 11)); + std::vector rows_per_rg = {3, 4, 100}; + + auto result = rowRanges.SplitByRowGroups(rows_per_rg); ASSERT_EQ(result.size(), 3); - ASSERT_EQ(result[0].GetRanges().size(), 1); - ASSERT_EQ(result[0][0].start, 0); - ASSERT_EQ(result[0][0].end, 2); - ASSERT_EQ(result[1].GetRanges().size(), 1); - ASSERT_EQ(result[1][0].start, 3); - ASSERT_EQ(result[1][0].end, 6); - ASSERT_EQ(result[2].GetRanges().size(), 1); - ASSERT_EQ(result[2][0].start, 7); - ASSERT_EQ(result[2][0].end, 10); + { + auto iter = result[0]->NewIterator(); + auto range = std::get(iter->NextRange()); + ASSERT_EQ(range.start, 0); + ASSERT_EQ(range.end, 2); + ASSERT_EQ(iter->NextRange().index(), 2); + } + { + auto iter = result[1]->NewIterator(); + auto range = std::get(iter->NextRange()); + ASSERT_EQ(range.start, 0); + ASSERT_EQ(range.end, 3); + ASSERT_EQ(iter->NextRange().index(), 2); + } + { + auto iter = result[2]->NewIterator(); + auto range = std::get(iter->NextRange()); + ASSERT_EQ(range.start, 0); + ASSERT_EQ(range.end, 4); + ASSERT_EQ(iter->NextRange().index(), 2); + } } -TEST_F(RowRangesTest, SplitAt_MultipleSplitPoints_ReturnWithEmptyRowRanges) { - rowRanges.Add(IntervalRange(11, 18)); - 
std::vector split_points = {5, 10, 15, 20}; +TEST_F(RowRangesTest, MultipleInputRange) { + rowRanges.Add(IntervalRange(0, 10)); + rowRanges.Add(IntervalRange(90, 111)); + rowRanges.Add(IntervalRange(191, 210)); - auto result = rowRanges.SplitAt(split_points); + std::vector rows_per_rg = {100, 100}; - ASSERT_EQ(result.size(), 5); - ASSERT_EQ(result[0].GetRanges().size(), 0); - ASSERT_EQ(result[1].GetRanges().size(), 0); - ASSERT_EQ(result[2].GetRanges().size(), 1); - ASSERT_EQ(result[2][0].start, 11); - ASSERT_EQ(result[2][0].end, 14); - ASSERT_EQ(result[3].GetRanges().size(), 1); - ASSERT_EQ(result[3][0].start, 15); - ASSERT_EQ(result[3][0].end, 18); - ASSERT_EQ(result[4].GetRanges().size(), 0); + auto result = rowRanges.SplitByRowGroups(rows_per_rg); + ASSERT_EQ(result.size(), 2); + { + auto iter = result[0]->NewIterator(); + auto range = std::get(iter->NextRange()); + ASSERT_EQ(range.start, 0); + ASSERT_EQ(range.end, 10); + + range = std::get(iter->NextRange()); + ASSERT_EQ(range.start, 90); + ASSERT_EQ(range.end, 99); + + ASSERT_EQ(iter->NextRange().index(), 2); + } + { + auto iter = result[1]->NewIterator(); + auto range = std::get(iter->NextRange()); + ASSERT_EQ(range.start, 0); + ASSERT_EQ(range.end, 11); + + range = std::get(iter->NextRange()); + ASSERT_EQ(range.start, 91); + ASSERT_EQ(range.end, 99); + + ASSERT_EQ(iter->NextRange().index(), 2); + } } -TEST_F(RowRangesTest, SplitAt_InvalidSplitPoint_ThrowsException) { - rowRanges.Add(IntervalRange(0, 10)); - std::vector split_points = {-1}; +TEST_F(RowRangesTest, MultipleSplitPoints_ReturnWithEmptyRowRanges) { + rowRanges.Add(IntervalRange(11, 18)); + std::vector rows_per_rg = {5, 5, 5, 5, 5}; - ASSERT_THROW(rowRanges.SplitAt(split_points), ParquetException); + auto result = rowRanges.SplitByRowGroups(rows_per_rg); + ASSERT_EQ(result.size(), 5); + { + auto iter = result[0]->NewIterator(); + ASSERT_EQ(iter->NextRange().index(), 2); + } + { + auto iter = result[1]->NewIterator(); + 
ASSERT_EQ(iter->NextRange().index(), 2); + } + { + auto iter = result[2]->NewIterator(); + auto range = std::get(iter->NextRange()); + ASSERT_EQ(range.start, 1); + ASSERT_EQ(range.end, 4); + ASSERT_EQ(iter->NextRange().index(), 2); + } + { + auto iter = result[3]->NewIterator(); + auto range = std::get(iter->NextRange()); + ASSERT_EQ(range.start, 0); + ASSERT_EQ(range.end, 3); + ASSERT_EQ(iter->NextRange().index(), 2); + } + { + auto iter = result[4]->NewIterator(); + ASSERT_EQ(iter->NextRange().index(), 2); + } } -TEST_F(RowRangesTest, SplitAt_UnorderedSplitPoints_ThrowsException) { +TEST_F(RowRangesTest, RangeExceedRG) { rowRanges.Add(IntervalRange(0, 10)); - std::vector split_points = {5, 3}; + std::vector rows_per_rg = {5, 3}; - ASSERT_THROW(rowRanges.SplitAt(split_points), ParquetException); + auto result = rowRanges.SplitByRowGroups(rows_per_rg); + ASSERT_EQ(result.size(), 2); + { + auto iter = result[0]->NewIterator(); + auto range = std::get(iter->NextRange()); + ASSERT_EQ(range.start, 0); + ASSERT_EQ(range.end, 4); + ASSERT_EQ(iter->NextRange().index(), 2); + } + { + auto iter = result[1]->NewIterator(); + auto range = std::get(iter->NextRange()); + ASSERT_EQ(range.start, 0); + ASSERT_EQ(range.end, 2); + ASSERT_EQ(iter->NextRange().index(), 2); + } } From 5805b976d5b920c4e27d8c984973a1442b32713d Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Mon, 15 Jan 2024 19:35:53 +0800 Subject: [PATCH 20/25] fix style --- cpp/src/parquet/arrow/reader.cc | 2 +- cpp/src/parquet/column_reader.cc | 10 +++++----- cpp/src/parquet/column_reader.h | 16 ++++++++++++---- 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 49aeeb3f0f5e..8782f9d84b2a 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -205,7 +205,7 @@ class FileReaderImpl : public FileReader { Status GetFieldReader( int i, const std::shared_ptr>& included_leaves, const std::vector& row_groups, - 
const std::shared_ptr>> & row_ranges_per_rg, + const std::shared_ptr>>& row_ranges_per_rg, std::unique_ptr* out) { // Should be covered by GetRecordBatchReader checks but // manifest_.schema_fields is a separate variable so be extra careful. diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 56e0f0b99450..763274ed74a2 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -1637,7 +1637,7 @@ class TypedRecordReader : public TypedColumnReaderImpl, int64_t skipped_records = 0; if (this->max_rep_level_ == 0 && this->max_def_level_ == 0) { skipped_records = this->Skip(num_records); - current_rg_processed_records += skipped_records; + current_rg_processed_records_ += skipped_records; return skipped_records; } if (this->max_rep_level_ == 0) { @@ -1656,7 +1656,7 @@ class TypedRecordReader : public TypedColumnReaderImpl, skipped_records += this->SkipRecordsRepeated(num_records); } - current_rg_processed_records += skipped_records; + current_rg_processed_records_ += skipped_records; return skipped_records; } @@ -1988,17 +1988,17 @@ class TypedRecordReader : public TypedColumnReaderImpl, this->ConsumeBufferedValues(values_to_read); } - current_rg_processed_records += records_read; + current_rg_processed_records_ += records_read; return records_read; } int64_t ReadRecordDataWithSkipCheck(const int64_t num_records) { - if (!skipper) { + if (!skipper_) { return ReadRecordData(num_records); } while (true) { - const auto advise = skipper->AdviseNext(current_rg_processed_records); + const auto advise = skipper_->AdviseNext(current_rg_processed_records_); if (advise == 0) { return 0; } diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 5b9a96b27df9..cae7a1336590 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -32,6 +32,7 @@ #include "parquet/types.h" namespace arrow { + class Array; class ChunkedArray; @@ -42,9 +43,11 @@ class BitReader; 
namespace util { class RleDecoder; } // namespace util + } // namespace arrow namespace parquet { + class Decryptor; class Page; @@ -427,6 +430,11 @@ class IntervalRanges : public RowRanges { if (ranges_[0].start < 0) { return false; } + for (size_t i = 0; i < ranges_.size(); i++) { + if (!ranges_[i].IsValid()) { + return false; + } + } for (size_t i = 1; i < ranges_.size(); i++) { if (ranges_[i].start <= ranges_[i - 1].end) { return false; @@ -718,9 +726,9 @@ class PARQUET_EXPORT RecordReader { /// \brief True if reading dense for nullable columns. bool read_dense_for_nullable() const { return read_dense_for_nullable_; } - void reset_current_rg_processed_records() { current_rg_processed_records = 0; } + void reset_current_rg_processed_records() { current_rg_processed_records_ = 0; } - void set_record_skipper(std::shared_ptr skipper_) { skipper = skipper_; } + void set_record_skipper(std::shared_ptr skipper_) { skipper_ = skipper_; } protected: /// \brief Indicates if we can have nullable values. Note that repeated fields @@ -730,7 +738,7 @@ class PARQUET_EXPORT RecordReader { bool at_record_start_; int64_t records_read_; - int64_t current_rg_processed_records; // counting both read and skip records + int64_t current_rg_processed_records_; // counting both read and skip records /// \brief Stores values. These values are populated based on each ReadRecords /// call. No extra values are buffered for the next call. SkipRecords will not @@ -774,7 +782,7 @@ class PARQUET_EXPORT RecordReader { // vector. 
bool read_dense_for_nullable_ = false; - std::shared_ptr skipper = NULLPTR; + std::shared_ptr skipper_ = NULLPTR; }; class BinaryRecordReader : virtual public RecordReader { From 639d94a221d6cb1d5ee4e311918043643f5de12e Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Mon, 15 Jan 2024 20:04:39 +0800 Subject: [PATCH 21/25] separete definition --- cpp/src/parquet/column_reader.cc | 141 +++++++++++++++++++++++++++ cpp/src/parquet/column_reader.h | 161 +++++-------------------------- 2 files changed, 165 insertions(+), 137 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 763274ed74a2..954b94ad47e0 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -1319,6 +1319,147 @@ std::shared_ptr ColumnReader::Make(const ColumnDescriptor* descr, return std::shared_ptr(nullptr); } +// ---------------------------------------------------------------------- +// RowRanges and ins implementations + +IntervalRanges::IntervalRanges() = default; + +IntervalRanges::IntervalRanges(const IntervalRange& range) { ranges_.push_back(range); } + +IntervalRanges::IntervalRanges(const std::vector& ranges) { + this->ranges_ = ranges; +} + +std::unique_ptr IntervalRanges::NewIterator() const { + return std::make_unique(ranges_); +} + +size_t IntervalRanges::RowCount() const { + size_t cnt = 0; + for (const IntervalRange& range : ranges_) { + cnt += range.Count(); + } + return cnt; +} + +int64_t IntervalRanges::LastRow() const { return ranges_.back().end; } + +bool IntervalRanges::IsValid() const { + if (ranges_.size() == 0) return true; + if (ranges_[0].start < 0) { + return false; + } + for (size_t i = 0; i < ranges_.size(); i++) { + if (!ranges_[i].IsValid()) { + return false; + } + } + for (size_t i = 1; i < ranges_.size(); i++) { + if (ranges_[i].start <= ranges_[i - 1].end) { + return false; + } + } + return true; +} + +bool IntervalRanges::IsOverlapping(const IntervalRange& searchRange) const { + auto it = 
std::lower_bound( + ranges_.begin(), ranges_.end(), searchRange, + [](const IntervalRange& r1, const IntervalRange& r2) { return r1.IsBefore(r2); }); + return it != ranges_.end() && !(*it).IsAfter(searchRange); +} + +std::string IntervalRanges::ToString() const { + std::string result = "["; + for (const IntervalRange& range : ranges_) { + result += range.ToString() + ", "; + } + if (!ranges_.empty()) { + result = result.substr(0, result.size() - 2); + } + result += "]"; + return result; +} + +std::vector> IntervalRanges::SplitByRowGroups( + const std::vector& rows_per_rg) const { + if (rows_per_rg.size() <= 1) { + std::unique_ptr single = + std::make_unique(*this); // return a copy of itself + auto ret = std::vector>(); + ret.push_back(std::move(single)); + return ret; + } + + std::vector> result; + + IntervalRanges spaces; + int64_t rows_so_far = 0; + for (size_t i = 0; i < rows_per_rg.size(); ++i) { + auto start = rows_so_far; + rows_so_far += rows_per_rg[i]; + auto end = rows_so_far - 1; + spaces.Add({start, end}); + } + + // each RG's row range forms a space, we need to adjust RowRanges in each space to + // zero based. 
+ for (IntervalRange space : spaces.GetRanges()) { + auto intersection = Intersection(IntervalRanges(space), *this); + + std::unique_ptr zero_based_ranges = + std::make_unique(); + for (const IntervalRange& range : intersection.GetRanges()) { + zero_based_ranges->Add({range.start - space.start, range.end - space.start}); + } + result.push_back(std::move(zero_based_ranges)); + } + + return result; +} + +IntervalRanges IntervalRanges::Intersection(const IntervalRanges& left, + const IntervalRanges& right) { + IntervalRanges result; + + size_t rightIndex = 0; + for (const IntervalRange& l : left.ranges_) { + for (size_t i = rightIndex, n = right.ranges_.size(); i < n; ++i) { + const IntervalRange& r = right.ranges_[i]; + if (l.IsBefore(r)) { + break; + } else if (l.IsAfter(r)) { + rightIndex = i + 1; + continue; + } + result.Add(IntervalRange::Intersection(l, r)); + } + } + + return result; +} + +void IntervalRanges::Add(const IntervalRange& range) { + const IntervalRange rangeToAdd = range; + if (ranges_.size() > 1 && rangeToAdd.start <= ranges_.back().end) { + throw ParquetException("Ranges must be added in order"); + } + ranges_.push_back(rangeToAdd); +} + +const std::vector& IntervalRanges::GetRanges() const { return ranges_; } + +IntervalRowRangesIterator::IntervalRowRangesIterator( + const std::vector& ranges) + : ranges_(ranges) {} +IntervalRowRangesIterator::~IntervalRowRangesIterator() {} + +std::variant IntervalRowRangesIterator::NextRange() { + if (current_index_ >= ranges_.size()) return End(); + + return ranges_[current_index_++]; +} + // ---------------------------------------------------------------------- // RecordReader diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index cae7a1336590..1a921e6c26df 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -390,148 +390,35 @@ class RowRanges { class IntervalRanges : public RowRanges { public: - IntervalRanges() = default; - - explicit 
IntervalRanges(const IntervalRange& range) { ranges_.push_back(range); } - - class IntervalRowRangesIterator : public Iterator { - public: - IntervalRowRangesIterator(const std::vector& ranges) - : ranges_(ranges) {} - ~IntervalRowRangesIterator() override {} - - std::variant NextRange() override { - if (current_index_ >= ranges_.size()) return End(); - - return ranges_[current_index_++]; - } - - private: - const std::vector& ranges_; - size_t current_index_ = 0; - }; - - std::unique_ptr NewIterator() const override { - return std::make_unique(ranges_); - } - - size_t RowCount() const override { - size_t cnt = 0; - for (const IntervalRange& range : ranges_) { - cnt += range.Count(); - } - return cnt; - } - - int64_t LastRow() const override { return ranges_.back().end; } - - bool IsValid() const override { - if (ranges_.size() == 0) return true; - if (ranges_[0].start < 0) { - return false; - } - for (size_t i = 0; i < ranges_.size(); i++) { - if (!ranges_[i].IsValid()) { - return false; - } - } - for (size_t i = 1; i < ranges_.size(); i++) { - if (ranges_[i].start <= ranges_[i - 1].end) { - return false; - } - } - return true; - } - - bool IsOverlapping(const IntervalRange& searchRange) const override { - auto it = std::lower_bound( - ranges_.begin(), ranges_.end(), searchRange, - [](const IntervalRange& r1, const IntervalRange& r2) { return r1.IsBefore(r2); }); - return it != ranges_.end() && !(*it).IsAfter(searchRange); - } - - std::string ToString() const override { - std::string result = "["; - for (const IntervalRange& range : ranges_) { - result += range.ToString() + ", "; - } - if (!ranges_.empty()) { - result = result.substr(0, result.size() - 2); - } - result += "]"; - return result; - } - + IntervalRanges(); + explicit IntervalRanges(const IntervalRange& range); + explicit IntervalRanges(const std::vector& ranges); + std::unique_ptr NewIterator() const override; + size_t RowCount() const override; + int64_t LastRow() const override; + bool IsValid() 
const override; + bool IsOverlapping(const IntervalRange& searchRange) const override; + std::string ToString() const override; std::vector> SplitByRowGroups( - const std::vector& rows_per_rg) const override { - if (rows_per_rg.size() <= 1) { - std::unique_ptr single = - std::make_unique(*this); // return a copy of itself - auto ret = std::vector>(); - ret.push_back(std::move(single)); - return ret; - } - - std::vector> result; - - IntervalRanges spaces; - int64_t rows_so_far = 0; - for (size_t i = 0; i < rows_per_rg.size(); ++i) { - auto start = rows_so_far; - rows_so_far += rows_per_rg[i]; - auto end = rows_so_far - 1; - spaces.Add({start, end}); - } - - // each RG's row range forms a space, we need to adjust RowRanges in each space to - // zero based. - for (IntervalRange space : spaces.GetRanges()) { - auto intersection = Intersection(IntervalRanges(space), *this); - - std::unique_ptr zero_based_ranges = - std::make_unique(); - for (const IntervalRange& range : intersection.GetRanges()) { - zero_based_ranges->Add({range.start - space.start, range.end - space.start}); - } - result.push_back(std::move(zero_based_ranges)); - } - - return result; - } - + const std::vector& rows_per_rg) const override; static IntervalRanges Intersection(const IntervalRanges& left, - const IntervalRanges& right) { - IntervalRanges result; - - size_t rightIndex = 0; - for (const IntervalRange& l : left.ranges_) { - for (size_t i = rightIndex, n = right.ranges_.size(); i < n; ++i) { - const IntervalRange& r = right.ranges_[i]; - if (l.IsBefore(r)) { - break; - } else if (l.IsAfter(r)) { - rightIndex = i + 1; - continue; - } - result.Add(IntervalRange::Intersection(l, r)); - } - } + const IntervalRanges& right); + void Add(const IntervalRange& range); + const std::vector& GetRanges() const; - return result; - } - - void Add(const IntervalRange& range) { - const IntervalRange rangeToAdd = range; - if (ranges_.size() > 1 && rangeToAdd.start <= ranges_.back().end) { - throw 
ParquetException("Ranges must be added in order"); - } - ranges_.push_back(rangeToAdd); - } + private: + std::vector ranges_; +}; - const std::vector& GetRanges() const { return ranges_; } +class IntervalRowRangesIterator : public RowRanges::Iterator { + public: + IntervalRowRangesIterator(const std::vector& ranges); + ~IntervalRowRangesIterator() override; + std::variant NextRange() override; private: - std::vector ranges_; + const std::vector& ranges_; + size_t current_index_; }; namespace internal { @@ -728,7 +615,7 @@ class PARQUET_EXPORT RecordReader { void reset_current_rg_processed_records() { current_rg_processed_records_ = 0; } - void set_record_skipper(std::shared_ptr skipper_) { skipper_ = skipper_; } + void set_record_skipper(const std::shared_ptr& skipper) { skipper_ = skipper; } protected: /// \brief Indicates if we can have nullable values. Note that repeated fields From 8f5a88a481142ea54fae90c7306d17fac80c2200 Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Mon, 15 Jan 2024 20:16:35 +0800 Subject: [PATCH 22/25] separete definition 2 --- cpp/src/parquet/column_reader.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 1a921e6c26df..7d75caba05cb 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -347,9 +347,9 @@ struct IntervalRange { } // inclusive - int64_t start; + int64_t start = -1; // inclusive - int64_t end; + int64_t end = -1; }; struct BitmapRange { @@ -418,7 +418,7 @@ class IntervalRowRangesIterator : public RowRanges::Iterator { private: const std::vector& ranges_; - size_t current_index_; + size_t current_index_ = 0; }; namespace internal { From 09286d7537b575331bda06e5c42f2efb16fc7a4d Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Mon, 15 Jan 2024 20:29:01 +0800 Subject: [PATCH 23/25] separete definition 3 --- cpp/src/parquet/arrow/reader.cc | 1 - cpp/src/parquet/column_reader.cc | 67 
+++++++++++++++++++++++++++++ cpp/src/parquet/column_reader.h | 74 ++++---------------------------- 3 files changed, 76 insertions(+), 66 deletions(-) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 8782f9d84b2a..e471696a401d 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -1398,7 +1398,6 @@ Status FileReaderImpl::GetColumn(int i, FileColumnIteratorFactory iterator_facto ctx->pool = pool_; ctx->iterator_factory = iterator_factory; ctx->filter_leaves = false; - std::unique_ptr result; RETURN_NOT_OK(GetReader(manifest_.schema_fields[i], ctx, &result)); *out = std::move(result); diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 954b94ad47e0..4ba8243f696e 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -2447,5 +2447,72 @@ std::shared_ptr RecordReader::Make(const ColumnDescriptor* descr, return nullptr; } +RecordSkipper::RecordSkipper(IntervalRanges& pages, const RowRanges& orig_row_ranges) { + // copy row_ranges + IntervalRanges skip_pages; + for (auto& page : pages.GetRanges()) { + if (!orig_row_ranges.IsOverlapping(page)) { + skip_pages.Add(page); + } + } + + AdjustRanges(skip_pages, orig_row_ranges, row_ranges_); + range_iter_ = row_ranges_->NewIterator(); + current_range_variant = range_iter_->NextRange(); + + total_rows_to_process_ = pages.RowCount() - skip_pages.RowCount(); +} + + +int64_t RecordSkipper::AdviseNext(const int64_t current_rg_processed) { + if (current_range_variant.index() == 2) { + return 0; + } + + auto& current_range = std::get(current_range_variant); + + if (current_range.end < current_rg_processed) { + current_range_variant = range_iter_->NextRange(); + if (current_range_variant.index() == 2) { + // negative, skip the ramaining rows + return current_rg_processed - total_rows_to_process_; + } + } + + current_range = std::get(current_range_variant); + + if (current_range.start > 
current_rg_processed) { + // negative, skip + return current_rg_processed - current_range.start; + } + + const auto ret = current_range.end - current_rg_processed + 1; + return ret; +} + +void RecordSkipper::AdjustRanges(IntervalRanges& skip_pages, + const RowRanges& orig_row_ranges, + std::unique_ptr& ret) { + std::unique_ptr temp = std::make_unique(); + + size_t skipped_rows = 0; + const auto orig_range_iter = orig_row_ranges.NewIterator(); + auto orig_range_variant = orig_range_iter->NextRange(); + auto skip_iter = skip_pages.GetRanges().begin(); + while (orig_range_variant.index() != 2) { + const auto& origin_range = std::get(orig_range_variant); + while (skip_iter != skip_pages.GetRanges().end() && + skip_iter->IsBefore(origin_range)) { + skipped_rows += skip_iter->Count(); + ++skip_iter; + } + + temp->Add(IntervalRange(origin_range.start - skipped_rows, + origin_range.end - skipped_rows)); + orig_range_variant = orig_range_iter->NextRange(); + } + ret = std::move(temp); +} + } // namespace internal } // namespace parquet diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 7d75caba05cb..34fb43b28eed 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -426,75 +426,19 @@ namespace internal { // A RecordSkipper is used to skip uncessary rows within each pages. 
class PARQUET_EXPORT RecordSkipper { public: - RecordSkipper(IntervalRanges& pages, const RowRanges& orig_row_ranges) { - // copy row_ranges - IntervalRanges skip_pages; - for (auto& page : pages.GetRanges()) { - if (!orig_row_ranges.IsOverlapping(page)) { - skip_pages.Add(page); - } - } - - /// Since the skipped pages will be silently skipped without updating - /// current_rg_processed_records or records_read_, we need to pre-process the row - /// ranges as if these skipped pages never existed - AdjustRanges(skip_pages, orig_row_ranges, row_ranges_); - range_iter_ = row_ranges_->NewIterator(); - current_range_variant = range_iter_->NextRange(); - - total_rows_to_process_ = pages.RowCount() - skip_pages.RowCount(); - } - - /// \brief Return the number of records to read or to skip + RecordSkipper(IntervalRanges& pages, const RowRanges& orig_row_ranges); + /// Return the number of records to read or to skip /// if return values is positive, it means to read N records /// if return values is negative, it means to skip N records /// if return values is 0, it means end of RG - int64_t AdviseNext(const int64_t current_rg_processed) { - if (current_range_variant.index() == 2) { - return 0; - } - - auto & current_range = std::get(current_range_variant); - - if (current_range.end < current_rg_processed) { - current_range_variant = range_iter_->NextRange(); - if (current_range_variant.index() == 2) { - // negative, skip the ramaining rows - return current_rg_processed - total_rows_to_process_; - } - } + int64_t AdviseNext(const int64_t current_rg_processed); - current_range = std::get(current_range_variant); - - if (current_range.start > current_rg_processed) { - // negative, skip - return current_rg_processed - current_range.start; - } - - const auto ret = current_range.end - current_rg_processed + 1; - return ret; - } - -private: - void AdjustRanges(IntervalRanges& skip_pages, const RowRanges& orig_row_ranges, std::unique_ptr& ret) { - std::unique_ptr temp = 
std::make_unique(); - - size_t skipped_rows = 0; - const auto orig_range_iter = orig_row_ranges.NewIterator(); - auto orig_range_variant = orig_range_iter->NextRange(); - auto skip_iter = skip_pages.GetRanges().begin(); - while (orig_range_variant.index() != 2) { - const auto & origin_range = std::get(orig_range_variant); - while (skip_iter != skip_pages.GetRanges().end() && skip_iter->IsBefore(origin_range)) { - skipped_rows += skip_iter->Count(); - ++skip_iter; - } - - temp->Add(IntervalRange(origin_range.start - skipped_rows, origin_range.end - skipped_rows)); - orig_range_variant = orig_range_iter->NextRange(); - } - ret = std::move(temp); - } + private: + /// Since the skipped pages will be silently skipped without updating + /// current_rg_processed_records or records_read_, we need to pre-process the row + /// ranges as if these skipped pages never existed + static void AdjustRanges(IntervalRanges& skip_pages, const RowRanges& orig_row_ranges, + std::unique_ptr& ret); std::unique_ptr row_ranges_; std::unique_ptr range_iter_; From b75abdf1e00eafa60f215cfea75fb8b8fb55837e Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Mon, 15 Jan 2024 20:35:01 +0800 Subject: [PATCH 24/25] minor --- cpp/src/parquet/column_reader.h | 2 +- cpp/src/parquet/row_range_test.cc | 34 +++++++++++++++---------------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 34fb43b28eed..cf11c8975dc5 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -569,7 +569,7 @@ class PARQUET_EXPORT RecordReader { bool at_record_start_; int64_t records_read_; - int64_t current_rg_processed_records_; // counting both read and skip records + int64_t current_rg_processed_records_ = 0; // counting both read and skip records /// \brief Stores values. These values are populated based on each ReadRecords /// call. No extra values are buffered for the next call. 
SkipRecords will not diff --git a/cpp/src/parquet/row_range_test.cc b/cpp/src/parquet/row_range_test.cc index 82ad60c6b3fe..44327baab04c 100644 --- a/cpp/src/parquet/row_range_test.cc +++ b/cpp/src/parquet/row_range_test.cc @@ -21,14 +21,14 @@ using namespace parquet; class RowRangesTest : public ::testing::Test { protected: - IntervalRanges rowRanges; + IntervalRanges row_ranges; }; TEST_F(RowRangesTest, EmptyRG_ReturnsOriginalRowRanges) { - rowRanges.Add(IntervalRange(0, 10)); + row_ranges.Add(IntervalRange(0, 10)); std::vector rows_per_rg; - auto result = rowRanges.SplitByRowGroups(rows_per_rg); + auto result = row_ranges.SplitByRowGroups(rows_per_rg); ASSERT_EQ(result.size(), 1); auto iter = result[0]->NewIterator(); @@ -39,10 +39,10 @@ TEST_F(RowRangesTest, EmptyRG_ReturnsOriginalRowRanges) { } TEST_F(RowRangesTest, SingleRG_ReturnsOriginalRowRanges2) { - rowRanges.Add(IntervalRange(0, 10)); + row_ranges.Add(IntervalRange(0, 10)); std::vector rows_per_rg = {11}; - auto result = rowRanges.SplitByRowGroups(rows_per_rg); + auto result = row_ranges.SplitByRowGroups(rows_per_rg); ASSERT_EQ(result.size(), 1); auto iter = result[0]->NewIterator(); @@ -53,10 +53,10 @@ TEST_F(RowRangesTest, SingleRG_ReturnsOriginalRowRanges2) { } TEST_F(RowRangesTest, ReturnsTwoRowRanges) { - rowRanges.Add(IntervalRange(0, 10)); + row_ranges.Add(IntervalRange(0, 10)); std::vector rows_per_rg = {5, 6}; - auto result = rowRanges.SplitByRowGroups(rows_per_rg); + auto result = row_ranges.SplitByRowGroups(rows_per_rg); ASSERT_EQ(result.size(), 2); { auto iter = result[0]->NewIterator(); @@ -75,10 +75,10 @@ TEST_F(RowRangesTest, ReturnsTwoRowRanges) { } TEST_F(RowRangesTest, ReturnsMultipleRowRanges) { - rowRanges.Add(IntervalRange(0, 11)); + row_ranges.Add(IntervalRange(0, 11)); std::vector rows_per_rg = {3, 4, 100}; - auto result = rowRanges.SplitByRowGroups(rows_per_rg); + auto result = row_ranges.SplitByRowGroups(rows_per_rg); ASSERT_EQ(result.size(), 3); { auto iter = 
result[0]->NewIterator(); @@ -104,13 +104,13 @@ TEST_F(RowRangesTest, ReturnsMultipleRowRanges) { } TEST_F(RowRangesTest, MultipleInputRange) { - rowRanges.Add(IntervalRange(0, 10)); - rowRanges.Add(IntervalRange(90, 111)); - rowRanges.Add(IntervalRange(191, 210)); + row_ranges.Add(IntervalRange(0, 10)); + row_ranges.Add(IntervalRange(90, 111)); + row_ranges.Add(IntervalRange(191, 210)); std::vector rows_per_rg = {100, 100}; - auto result = rowRanges.SplitByRowGroups(rows_per_rg); + auto result = row_ranges.SplitByRowGroups(rows_per_rg); ASSERT_EQ(result.size(), 2); { auto iter = result[0]->NewIterator(); @@ -139,10 +139,10 @@ TEST_F(RowRangesTest, MultipleInputRange) { } TEST_F(RowRangesTest, MultipleSplitPoints_ReturnWithEmptyRowRanges) { - rowRanges.Add(IntervalRange(11, 18)); + row_ranges.Add(IntervalRange(11, 18)); std::vector rows_per_rg = {5, 5, 5, 5, 5}; - auto result = rowRanges.SplitByRowGroups(rows_per_rg); + auto result = row_ranges.SplitByRowGroups(rows_per_rg); ASSERT_EQ(result.size(), 5); { auto iter = result[0]->NewIterator(); @@ -173,10 +173,10 @@ TEST_F(RowRangesTest, MultipleSplitPoints_ReturnWithEmptyRowRanges) { } TEST_F(RowRangesTest, RangeExceedRG) { - rowRanges.Add(IntervalRange(0, 10)); + row_ranges.Add(IntervalRange(0, 10)); std::vector rows_per_rg = {5, 3}; - auto result = rowRanges.SplitByRowGroups(rows_per_rg); + auto result = row_ranges.SplitByRowGroups(rows_per_rg); ASSERT_EQ(result.size(), 2); { auto iter = result[0]->NewIterator(); From e361c66f59b22534bb20d9cfa4c94aa67bc050a5 Mon Sep 17 00:00:00 2001 From: Hongbin Ma Date: Mon, 22 Jan 2024 16:16:26 +0800 Subject: [PATCH 25/25] fix comments --- cpp/src/parquet/CMakeLists.txt | 1 + cpp/src/parquet/arrow/reader.cc | 105 ++++++++------- cpp/src/parquet/arrow/reader.h | 9 +- cpp/src/parquet/column_reader.cc | 150 +-------------------- cpp/src/parquet/column_reader.h | 128 +----------------- cpp/src/parquet/range_reader_test.cc | 65 +++++---- cpp/src/parquet/row_range.cc | 190 
+++++++++++++++++++++++++++ cpp/src/parquet/row_range.h | 156 ++++++++++++++++++++++ cpp/src/parquet/row_range_test.cc | 17 +-- 9 files changed, 461 insertions(+), 360 deletions(-) create mode 100644 cpp/src/parquet/row_range.cc create mode 100644 cpp/src/parquet/row_range.h diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 9f9a7f2336aa..7d12d87e5d9c 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -156,6 +156,7 @@ set(PARQUET_SRCS arrow/writer.cc bloom_filter.cc bloom_filter_reader.cc + row_range.cc column_reader.cc column_scanner.cc column_writer.cc diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index e471696a401d..cb15145b8a78 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -222,13 +222,14 @@ class FileReaderImpl : public FileReader { ctx->iterator_factory = SomeRowGroupsFactory(row_groups); ctx->filter_leaves = true; ctx->included_leaves = included_leaves; - ctx->row_ranges_per_rg = row_ranges_per_rg; // copy the shared pointer to extend its lifecycle + ctx->row_ranges_per_rg = + row_ranges_per_rg; // copy the shared pointer to extend its lifecycle return GetReader(manifest_.schema_fields[i], ctx, out); } Status GetFieldReaders( const std::vector& column_indices, const std::vector& row_groups, - const std::shared_ptr>> & row_ranges_per_rg, + const std::shared_ptr>>& row_ranges_per_rg, std::vector>* out, std::shared_ptr<::arrow::Schema>* out_schema) { // We only need to read schema fields which have columns indicated @@ -342,44 +343,43 @@ class FileReaderImpl : public FileReader { } // This is a internal API owned by FileReaderImpl, not exposed in FileReader - Status GetRecordBatchReaderWithRowRanges(const std::vector& row_group_indices, - const std::vector& column_indices, - const std::shared_ptr>> & row_ranges_per_rg, - std::unique_ptr* out); + Status GetRecordBatchReaderWithRowRanges( + const std::vector& row_group_indices, 
const std::vector& column_indices, + const std::shared_ptr>>& row_ranges_per_rg, + std::unique_ptr* out); Status GetRecordBatchReader(const RowRanges& rows_to_return, const std::vector& column_indices, std::unique_ptr* out) override { const auto metadata = reader_->metadata(); - // check if the row ranges are valid - if (!rows_to_return.IsValid()) { - return Status::Invalid("The provided row range is invalid, keep it monotone and non-interleaving: " + - rows_to_return.ToString()); - } // check if the row ranges are within the row group boundaries - if (rows_to_return.RowCount() != 0 && rows_to_return.LastRow() >= metadata->num_rows()) { + if (rows_to_return.num_rows() != 0 && + rows_to_return.last_row() >= metadata->num_rows()) { return Status::Invalid("The provided row range " + rows_to_return.ToString() + " exceeds the number of rows in the file: " + std::to_string(metadata->num_rows())); } - if (rows_to_return.RowCount() == 0) { + if (rows_to_return.num_rows() == 0) { return GetRecordBatchReaderWithRowRanges({}, column_indices, {}, out); } std::vector rows_per_rg; - for (int i = 0 ; i < metadata->num_row_groups(); i++) { - rows_per_rg.push_back( metadata->RowGroup(i)->num_rows()); + for (int i = 0; i < metadata->num_row_groups(); i++) { + rows_per_rg.push_back(metadata->RowGroup(i)->num_rows()); } // We'll assign a RowRanges for each RG, even if it's not required to return any rows - std::vector> row_ranges_per_rg = rows_to_return.SplitByRowGroups(rows_per_rg); + std::vector> row_ranges_per_rg = + rows_to_return.SplitByRowRange(rows_per_rg); std::vector row_group_indices; - for (int i = 0 ; i < metadata->num_row_groups(); i++) { - if (row_ranges_per_rg.at(i)->RowCount() > 0) - row_group_indices.push_back(i); + for (int i = 0; i < metadata->num_row_groups(); i++) { + if (row_ranges_per_rg.at(i)->num_rows() > 0) row_group_indices.push_back(i); } - return GetRecordBatchReaderWithRowRanges(row_group_indices, column_indices, - 
std::make_shared>>(std::move(row_ranges_per_rg)), out); + return GetRecordBatchReaderWithRowRanges( + row_group_indices, column_indices, + std::make_shared>>( + std::move(row_ranges_per_rg)), + out); } Status GetRecordBatchReader(const std::vector& row_group_indices, @@ -390,13 +390,13 @@ class FileReaderImpl : public FileReader { Status GetRecordBatchReader(const std::vector& row_group_indices, std::unique_ptr* out) override { - return GetRecordBatchReaderWithRowRanges(row_group_indices, - Iota(reader_->metadata()->num_columns()), {}, out); + return GetRecordBatchReaderWithRowRanges( + row_group_indices, Iota(reader_->metadata()->num_columns()), {}, out); } Status GetRecordBatchReader(std::unique_ptr* out) override { - return GetRecordBatchReaderWithRowRanges(Iota(num_row_groups()), - Iota(reader_->metadata()->num_columns()), {}, out); + return GetRecordBatchReaderWithRowRanges( + Iota(num_row_groups()), Iota(reader_->metadata()->num_columns()), {}, out); } ::arrow::Result<::arrow::AsyncGenerator>> @@ -499,22 +499,21 @@ class RowGroupReaderImpl : public RowGroupReader { // ---------------------------------------------------------------------- // Column reader implementations -// This class is used to skip decompressing & decoding unnecessary pages by comparing user-specified row_ranges -// and page_ranges from metadata. -// Only support IntervalRange case for now. +// This class is used to skip decompressing & decoding unnecessary pages by comparing +// user-specified row_ranges and page_ranges from metadata. Only support IntervalRange +// case for now. 
class RowRangesPageFilter { public: - RowRangesPageFilter(const RowRanges& row_ranges, const std::shared_ptr& page_ranges) - : row_ranges_(row_ranges), page_ranges_(page_ranges) { - } + RowRangesPageFilter(const RowRanges& row_ranges, + const std::shared_ptr& page_ranges) + : row_ranges_(row_ranges), page_ranges_(page_ranges) {} - // To avoid error "std::function target must be copy-constructible", we must define copy constructor + // To avoid error "std::function target must be copy-constructible", we must define copy + // constructor RowRangesPageFilter(const RowRangesPageFilter& other) - : row_ranges_(other.row_ranges_), page_ranges_(other.page_ranges_) { - } + : row_ranges_(other.row_ranges_), page_ranges_(other.page_ranges_) {} bool operator()(const DataPageStats& stats) { - if (!initted) { row_ranges_itr_ = row_ranges_.NewIterator(); page_ranges_itr_ = page_ranges_->NewIterator(); @@ -522,19 +521,21 @@ class RowRangesPageFilter { current_row_range_ = row_ranges_itr_->NextRange(); if (current_row_range_.index() != 0) { - throw ParquetException("RowRangesPageFilter expects first NextRange() to be a IntervalRange"); + throw ParquetException( + "RowRangesPageFilter expects first NextRange() to be a IntervalRange"); } initted = true; } current_page_range_ = page_ranges_itr_->NextRange(); if (current_page_range_.index() != 0) { - throw ParquetException("RowRangesPageFilter expects first NextRange() to be a IntervalRange"); + throw ParquetException( + "RowRangesPageFilter expects first NextRange() to be a IntervalRange"); } while (current_row_range_.index() == 0 && - std::get(current_page_range_).IsAfter( - std::get(current_row_range_))) { + IntervalRangeUtils::IsAfter(std::get(current_page_range_), + std::get(current_row_range_))) { current_row_range_ = row_ranges_itr_->NextRange(); } @@ -542,8 +543,8 @@ class RowRangesPageFilter { return true; } - return std::get(current_page_range_).IsBefore( - std::get(current_row_range_)); + return 
IntervalRangeUtils::IsBefore(std::get(current_page_range_), + std::get(current_row_range_)); } private: @@ -652,11 +653,11 @@ class LeafReader : public ColumnReaderImpl { 1}); } - if (row_ranges.RowCount() > 0) { - if (row_ranges.LastRow() > page_ranges->LastRow()) { + if (row_ranges.num_rows() > 0) { + if (row_ranges.last_row() > page_ranges->last_row()) { throw ParquetException( - "The provided row range " + row_ranges.ToString() + - " exceeds last page :" + page_ranges->GetRanges().back().ToString()); + "The provided row range " + row_ranges.ToString() + " exceeds last page :" + + IntervalRangeUtils::ToString(page_ranges->GetRanges().back())); } } } @@ -667,23 +668,21 @@ class LeafReader : public ColumnReaderImpl { /// using page index to reduce cost if (page_reader != nullptr && ctx_->row_ranges_per_rg) { // reset skipper - record_reader_->set_record_skipper(NULLPTR); + record_reader_->reset_record_skipper(); - const auto & row_ranges = (*ctx_->row_ranges_per_rg)[input_->current_row_group()]; + const auto& row_ranges = (*ctx_->row_ranges_per_rg)[input_->current_row_group()]; // if specific row range is provided for this rg - if (row_ranges->RowCount() != 0) { - + if (row_ranges->num_rows() != 0) { // Use IntervalRanges to represent pages std::shared_ptr page_ranges; checkAndGetPageRanges(*row_ranges, page_ranges); // part 1, skip decompressing & decoding unnecessary pages - page_reader->set_data_page_filter( - RowRangesPageFilter(*row_ranges, page_ranges)); + page_reader->set_data_page_filter(RowRangesPageFilter(*row_ranges, page_ranges)); // part 2, skip unnecessary rows in necessary pages record_reader_->set_record_skipper( - std::make_shared(*page_ranges, + std::make_unique(*page_ranges, *row_ranges)); } else { NextRowGroup(); @@ -1162,7 +1161,7 @@ Status GetReader(const SchemaField& field, const std::shared_ptr& Status FileReaderImpl::GetRecordBatchReaderWithRowRanges( const std::vector& row_groups, const std::vector& column_indices, - const 
std::shared_ptr>> & row_ranges_per_rg, + const std::shared_ptr>>& row_ranges_per_rg, std::unique_ptr* out) { RETURN_NOT_OK(BoundsCheck(row_groups, column_indices)); diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h index 98ea6f5c1a05..1bcf04ee867e 100644 --- a/cpp/src/parquet/arrow/reader.h +++ b/cpp/src/parquet/arrow/reader.h @@ -191,14 +191,15 @@ class PARQUET_EXPORT FileReader { /// \brief Return a RecordBatchReader of row groups selected from /// rows_to_return, whose columns are selected by column_indices. /// - /// Notice that rows_to_return is file based, it not only decides which row groups to read, - /// but also which rows to read in each row group. + /// Notice that rows_to_return is file based, it not only decides which row groups to + /// read, but also which rows to read in each row group. /// /// /// \returns error Status if either rows_to_return or column_indices /// contains an invalid index - virtual ::arrow::Status GetRecordBatchReader(const RowRanges& rows_to_return, - const std::vector& column_indices, std::unique_ptr<::arrow::RecordBatchReader>* out) = 0; + virtual ::arrow::Status GetRecordBatchReader( + const RowRanges& rows_to_return, const std::vector& column_indices, + std::unique_ptr<::arrow::RecordBatchReader>* out) = 0; /// \brief Return a RecordBatchReader of row groups selected from /// row_group_indices, whose columns are selected by column_indices. 
diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 4ba8243f696e..76fad7a75486 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -1319,147 +1319,6 @@ std::shared_ptr ColumnReader::Make(const ColumnDescriptor* descr, return std::shared_ptr(nullptr); } -// ---------------------------------------------------------------------- -// RowRanges and ins implementations - -IntervalRanges::IntervalRanges() = default; - -IntervalRanges::IntervalRanges(const IntervalRange& range) { ranges_.push_back(range); } - -IntervalRanges::IntervalRanges(const std::vector& ranges) { - this->ranges_ = ranges; -} - -std::unique_ptr IntervalRanges::NewIterator() const { - return std::make_unique(ranges_); -} - -size_t IntervalRanges::RowCount() const { - size_t cnt = 0; - for (const IntervalRange& range : ranges_) { - cnt += range.Count(); - } - return cnt; -} - -int64_t IntervalRanges::LastRow() const { return ranges_.back().end; } - -bool IntervalRanges::IsValid() const { - if (ranges_.size() == 0) return true; - if (ranges_[0].start < 0) { - return false; - } - for (size_t i = 0; i < ranges_.size(); i++) { - if (!ranges_[i].IsValid()) { - return false; - } - } - for (size_t i = 1; i < ranges_.size(); i++) { - if (ranges_[i].start <= ranges_[i - 1].end) { - return false; - } - } - return true; -} - -bool IntervalRanges::IsOverlapping(const IntervalRange& searchRange) const { - auto it = std::lower_bound( - ranges_.begin(), ranges_.end(), searchRange, - [](const IntervalRange& r1, const IntervalRange& r2) { return r1.IsBefore(r2); }); - return it != ranges_.end() && !(*it).IsAfter(searchRange); -} - -std::string IntervalRanges::ToString() const { - std::string result = "["; - for (const IntervalRange& range : ranges_) { - result += range.ToString() + ", "; - } - if (!ranges_.empty()) { - result = result.substr(0, result.size() - 2); - } - result += "]"; - return result; -} - -std::vector> 
IntervalRanges::SplitByRowGroups( - const std::vector& rows_per_rg) const { - if (rows_per_rg.size() <= 1) { - std::unique_ptr single = - std::make_unique(*this); // return a copy of itself - auto ret = std::vector>(); - ret.push_back(std::move(single)); - return ret; - } - - std::vector> result; - - IntervalRanges spaces; - int64_t rows_so_far = 0; - for (size_t i = 0; i < rows_per_rg.size(); ++i) { - auto start = rows_so_far; - rows_so_far += rows_per_rg[i]; - auto end = rows_so_far - 1; - spaces.Add({start, end}); - } - - // each RG's row range forms a space, we need to adjust RowRanges in each space to - // zero based. - for (IntervalRange space : spaces.GetRanges()) { - auto intersection = Intersection(IntervalRanges(space), *this); - - std::unique_ptr zero_based_ranges = - std::make_unique(); - for (const IntervalRange& range : intersection.GetRanges()) { - zero_based_ranges->Add({range.start - space.start, range.end - space.start}); - } - result.push_back(std::move(zero_based_ranges)); - } - - return result; -} - -IntervalRanges IntervalRanges::Intersection(const IntervalRanges& left, - const IntervalRanges& right) { - IntervalRanges result; - - size_t rightIndex = 0; - for (const IntervalRange& l : left.ranges_) { - for (size_t i = rightIndex, n = right.ranges_.size(); i < n; ++i) { - const IntervalRange& r = right.ranges_[i]; - if (l.IsBefore(r)) { - break; - } else if (l.IsAfter(r)) { - rightIndex = i + 1; - continue; - } - result.Add(IntervalRange::Intersection(l, r)); - } - } - - return result; -} - -void IntervalRanges::Add(const IntervalRange& range) { - const IntervalRange rangeToAdd = range; - if (ranges_.size() > 1 && rangeToAdd.start <= ranges_.back().end) { - throw ParquetException("Ranges must be added in order"); - } - ranges_.push_back(rangeToAdd); -} - -const std::vector& IntervalRanges::GetRanges() const { return ranges_; } - -IntervalRowRangesIterator::IntervalRowRangesIterator( - const std::vector& ranges) - : ranges_(ranges) {} 
-IntervalRowRangesIterator::~IntervalRowRangesIterator() {} - -std::variant IntervalRowRangesIterator::NextRange() { - if (current_index_ >= ranges_.size()) return End(); - - return ranges_[current_index_++]; -} - // ---------------------------------------------------------------------- // RecordReader @@ -2451,7 +2310,7 @@ RecordSkipper::RecordSkipper(IntervalRanges& pages, const RowRanges& orig_row_ra // copy row_ranges IntervalRanges skip_pages; for (auto& page : pages.GetRanges()) { - if (!orig_row_ranges.IsOverlapping(page)) { + if (!orig_row_ranges.IsOverlapping(page.start, page.end)) { skip_pages.Add(page); } } @@ -2460,10 +2319,9 @@ RecordSkipper::RecordSkipper(IntervalRanges& pages, const RowRanges& orig_row_ra range_iter_ = row_ranges_->NewIterator(); current_range_variant = range_iter_->NextRange(); - total_rows_to_process_ = pages.RowCount() - skip_pages.RowCount(); + total_rows_to_process_ = pages.num_rows() - skip_pages.num_rows(); } - int64_t RecordSkipper::AdviseNext(const int64_t current_rg_processed) { if (current_range_variant.index() == 2) { return 0; @@ -2502,8 +2360,8 @@ void RecordSkipper::AdjustRanges(IntervalRanges& skip_pages, while (orig_range_variant.index() != 2) { const auto& origin_range = std::get(orig_range_variant); while (skip_iter != skip_pages.GetRanges().end() && - skip_iter->IsBefore(origin_range)) { - skipped_rows += skip_iter->Count(); + IntervalRangeUtils::IsBefore(*skip_iter, origin_range)) { + skipped_rows += IntervalRangeUtils::Count(*skip_iter); ++skip_iter; } diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index cf11c8975dc5..f41995a0138f 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -22,12 +22,12 @@ #include #include -#include "page_index.h" #include "parquet/exception.h" #include "parquet/level_conversion.h" #include "parquet/metadata.h" #include "parquet/platform.h" #include "parquet/properties.h" +#include "parquet/row_range.h" #include 
"parquet/schema.h" #include "parquet/types.h" @@ -303,124 +303,6 @@ class TypedColumnReader : public ColumnReader { int32_t* dict_len) = 0; }; -// Represent a range to read. The range is inclusive on both ends. -struct IntervalRange { - static IntervalRange Intersection(const IntervalRange& left, - const IntervalRange& right) { - if (left.start <= right.start) { - if (left.end >= right.start) { - return {right.start, std::min(left.end, right.end)}; - } - } else if (right.end >= left.start) { - return {left.start, std::min(left.end, right.end)}; - } - return {-1, -1}; // Return a default Range object if no intersection range found - } - - IntervalRange(const int64_t start_, const int64_t end_) : start(start_), end(end_) { - if (start > end) { - throw ParquetException("Invalid range with start: " + std::to_string(start) + - " and end: " + std::to_string(end)); - } - } - - size_t Count() const { - if(!IsValid()) { - throw ParquetException("Invalid range with start: " + std::to_string(start) + - " and end: " + std::to_string(end)); - } - return end - start + 1; - } - - bool IsBefore(const IntervalRange& other) const { return end < other.start; } - - bool IsAfter(const IntervalRange& other) const { return start > other.end; } - - bool IsOverlap(const IntervalRange& other) const { - return !IsBefore(other) && !IsAfter(other); - } - - bool IsValid() const { return start >= 0 && end >= 0 && end >= start; } - - std::string ToString() const { - return "(" + std::to_string(start) + ", " + std::to_string(end) + ")"; - } - - // inclusive - int64_t start = -1; - // inclusive - int64_t end = -1; -}; - -struct BitmapRange { - int64_t offset; - // zero added to, if there are less than 64 elements left in the column. - uint64_t bitmap; -}; - -struct End {}; - -// Represent a set of ranges to read. The ranges are sorted and non-overlapping. 
-class RowRanges { - public: - RowRanges() = default; - virtual ~RowRanges() = default; - virtual size_t RowCount() const = 0; - virtual int64_t LastRow() const = 0; - virtual bool IsValid() const = 0; - virtual bool IsOverlapping(const IntervalRange& searchRange) const = 0; - // Given a RowRanges with rows accross all RGs, split it into N RowRanges, where N = number of RGs - // e.g.: suppose we have 2 RGs: [0-99] and [100-199], and user is interested in RowRanges [90-110], then - // this function will return 2 RowRanges: [90-99] and [0-10] - virtual std::vector> SplitByRowGroups(const std::vector& rows_per_rg) const = 0; - virtual std::string ToString() const = 0; - - // Returns a vector of PageLocations that must be read all to get values for - // all included in this range virtual std::vector - // PageIndexesToInclude(const std::vector& all_pages) = 0; - - class Iterator { - public: - virtual std::variant NextRange() = 0; - virtual ~Iterator() = default; - }; - virtual std::unique_ptr NewIterator() const = 0; - -}; - -class IntervalRanges : public RowRanges { - public: - IntervalRanges(); - explicit IntervalRanges(const IntervalRange& range); - explicit IntervalRanges(const std::vector& ranges); - std::unique_ptr NewIterator() const override; - size_t RowCount() const override; - int64_t LastRow() const override; - bool IsValid() const override; - bool IsOverlapping(const IntervalRange& searchRange) const override; - std::string ToString() const override; - std::vector> SplitByRowGroups( - const std::vector& rows_per_rg) const override; - static IntervalRanges Intersection(const IntervalRanges& left, - const IntervalRanges& right); - void Add(const IntervalRange& range); - const std::vector& GetRanges() const; - - private: - std::vector ranges_; -}; - -class IntervalRowRangesIterator : public RowRanges::Iterator { - public: - IntervalRowRangesIterator(const std::vector& ranges); - ~IntervalRowRangesIterator() override; - std::variant NextRange() override; - - 
private: - const std::vector& ranges_; - size_t current_index_ = 0; -}; - namespace internal { // A RecordSkipper is used to skip uncessary rows within each pages. @@ -559,7 +441,11 @@ class PARQUET_EXPORT RecordReader { void reset_current_rg_processed_records() { current_rg_processed_records_ = 0; } - void set_record_skipper(const std::shared_ptr& skipper) { skipper_ = skipper; } + void set_record_skipper(std::unique_ptr skipper) { + skipper_ = std::move(skipper); + } + + void reset_record_skipper() { skipper_.reset(); } protected: /// \brief Indicates if we can have nullable values. Note that repeated fields @@ -613,7 +499,7 @@ class PARQUET_EXPORT RecordReader { // vector. bool read_dense_for_nullable_ = false; - std::shared_ptr skipper_ = NULLPTR; + std::unique_ptr skipper_ = NULLPTR; }; class BinaryRecordReader : virtual public RecordReader { diff --git a/cpp/src/parquet/range_reader_test.cc b/cpp/src/parquet/range_reader_test.cc index cde60c583f50..04510143e54c 100644 --- a/cpp/src/parquet/range_reader_test.cc +++ b/cpp/src/parquet/range_reader_test.cc @@ -39,7 +39,7 @@ using parquet::IntervalRanges; std::string random_string(std::string::size_type length) { static auto& chrs = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; - static std::mt19937 rg{std::random_device{}()}; + static std::mt19937 rg = std::mt19937(std::random_device()()); static std::uniform_int_distribution pick(0, sizeof(chrs) - 2); std::string s; @@ -240,10 +240,18 @@ void check_rb(std::unique_ptr rb_reader, } ASSERT_EQ(expected_rows, total_rows); - if (checking_col("a", column_names)) ASSERT_EQ(expected_sum * 2, sum_a); - if (checking_col("b", column_names)) ASSERT_EQ(expected_sum * 3, sum_b); - if (checking_col("c", column_names)) ASSERT_EQ(expected_sum, sum_c); - if (checking_col("d", column_names)) ASSERT_EQ(expected_sum, sum_d); + if (checking_col("a", column_names)) { + ASSERT_EQ(expected_sum * 2, sum_a); + } + if (checking_col("b", column_names)) { + ASSERT_EQ(expected_sum * 3, sum_b); + } + if 
(checking_col("c", column_names)) { + ASSERT_EQ(expected_sum, sum_c); + } + if (checking_col("d", column_names)) { + ASSERT_EQ(expected_sum, sum_d); + } } class TestRecordBatchReaderWithRanges : public testing::Test { @@ -279,7 +287,7 @@ TEST_F(TestRecordBatchReaderWithRanges, TestRangesSplit) {} TEST_F(TestRecordBatchReaderWithRanges, SelectOnePageForEachRG) { std::unique_ptr rb_reader; - IntervalRanges rows{{IntervalRange{0, 9}, IntervalRange{40, 49}, IntervalRange{80, 89}, IntervalRange{90, 99}}}; + IntervalRanges rows{{{0, 9}, {40, 49}, {80, 89}, {90, 99}}}; const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader)); @@ -301,7 +309,8 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectSomePageForOneRG) { TEST_F(TestRecordBatchReaderWithRanges, SelectAllRange) { std::unique_ptr rb_reader; - IntervalRanges rows{{IntervalRange{0, 29}, IntervalRange{30, 59}, IntervalRange{60, 89}, IntervalRange{90, 99}}}; + IntervalRanges rows{{IntervalRange{0, 29}, IntervalRange{30, 59}, IntervalRange{60, 89}, + IntervalRange{90, 99}}}; const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader)); @@ -341,11 +350,15 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { std::unique_ptr rb_reader; std::vector ranges; for (int64_t i = 0; i < 30; i++) { - if (i % 2 == 0) ranges.push_back({i, i}); + if (i % 2 == 0) { + ranges.push_back({i, i}); + } } for (int64_t i = 60; i < 90; i++) { - if (i % 2 == 0) ranges.push_back({i, i}); + if (i % 2 == 0) { + ranges.push_back({i, i}); + } } const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader(IntervalRanges(ranges), column_indices, @@ -359,25 +372,17 @@ TEST_F(TestRecordBatchReaderWithRanges, SelectOneRowSkipOneRow) { TEST_F(TestRecordBatchReaderWithRanges, InvalidRanges) { std::unique_ptr rb_reader; { - IntervalRanges rows{{IntervalRange{-1, 5}}}; - const 
std::vector column_indices{0, 1, 2, 3, 4}; - const auto status = - arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader); - ASSERT_NOT_OK(status); - EXPECT_TRUE(status.message().find("The provided row range is invalid, keep it " - "monotone and non-interleaving: [(-1, 5)]") != - std::string::npos); + auto create_ranges = []() -> IntervalRanges { + return IntervalRanges{{IntervalRange{-1, 5}}}; + }; + EXPECT_THROW(create_ranges(), parquet::ParquetException); } { - IntervalRanges rows{{IntervalRange{0, 4}, {2, 5}}}; - const std::vector column_indices{0, 1, 2, 3, 4}; - const auto status = - arrow_reader->GetRecordBatchReader(rows, column_indices, &rb_reader); - ASSERT_NOT_OK(status); - EXPECT_TRUE( - status.message().find("The provided row range is invalid, keep it monotone and " - "non-interleaving: [(0, 4), (2, 5)]") != std::string::npos); + auto create_ranges = []() -> IntervalRanges { + return IntervalRanges{{{0, 4}, {2, 5}}}; + }; + EXPECT_THROW(create_ranges(), parquet::ParquetException); } { // will treat as {0,99} @@ -472,11 +477,15 @@ TEST_F(TestRecordBatchReaderWithRangesWithNulls, SelectOneRowSkipOneRow) { std::unique_ptr rb_reader; std::vector ranges; for (int64_t i = 0; i < 30; i++) { - if (i % 2 == 0) ranges.push_back({i, i}); + if (i % 2 == 0) { + ranges.push_back({i, i}); + } } for (int64_t i = 60; i < 90; i++) { - if (i % 2 == 0) ranges.push_back({i, i}); + if (i % 2 == 0) { + ranges.push_back({i, i}); + } } const std::vector column_indices{0, 1, 2, 3, 4}; ASSERT_OK(arrow_reader->GetRecordBatchReader(IntervalRanges(ranges), column_indices, @@ -486,4 +495,4 @@ TEST_F(TestRecordBatchReaderWithRangesWithNulls, SelectOneRowSkipOneRow) { // (10 + 12 + ... + 28) + (60 + 62 ... 
+ 88) = 1320 check_rb(std::move(rb_reader), 30, 1300); } -} \ No newline at end of file +} diff --git a/cpp/src/parquet/row_range.cc b/cpp/src/parquet/row_range.cc new file mode 100644 index 000000000000..fa996a198f43 --- /dev/null +++ b/cpp/src/parquet/row_range.cc @@ -0,0 +1,190 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+#include "parquet/row_range.h"
+
+#include <algorithm>
+
+#include "parquet/exception.h"
+
+namespace parquet {
+// ----------------------------------------------------------------------
+// RowRanges and its implementations
+bool IsValid(const std::vector<IntervalRange>& ranges) {
+  if (ranges.size() == 0) return true;
+  if (ranges[0].start < 0) {
+    return false;
+  }
+  for (size_t i = 0; i < ranges.size(); i++) {
+    if (!IntervalRangeUtils::IsValid(ranges[i])) {
+      return false;
+    }
+  }
+  for (size_t i = 1; i < ranges.size(); i++) {
+    if (ranges[i].start <= ranges[i - 1].end) {
+      return false;
+    }
+  }
+  return true;
+}
+
+IntervalRanges::IntervalRanges() = default;
+
+IntervalRanges::IntervalRanges(const IntervalRange& range) {
+  ranges_.push_back(range);
+  if (!IsValid(ranges_)) {
+    throw ParquetException("Invalid range with start: " + std::to_string(range.start) +
+                           " and end: " + std::to_string(range.end) +
+                           ", keep it monotone and non-interleaving");
+  }
+}
+
+IntervalRanges::IntervalRanges(const std::vector<IntervalRange>& ranges) {
+  this->ranges_ = ranges;
+  if (!IsValid(ranges_)) {
+    throw ParquetException("Invalid ranges: " + this->IntervalRanges::ToString() +
+                           ", keep it monotone and non-interleaving");
+  }
+}
+
+std::unique_ptr<RowRanges::Iterator> IntervalRanges::NewIterator() const {
+  return std::make_unique<IntervalRowRangesIterator>(ranges_);
+}
+
+size_t IntervalRanges::num_rows() const {
+  size_t cnt = 0;
+  for (const IntervalRange& range : ranges_) {
+    cnt += IntervalRangeUtils::Count(range);
+  }
+  return cnt;
+}
+
+int64_t IntervalRanges::first_row() const {
+  if (ranges_.empty()) {
+    throw ParquetException("first_row() called on empty IntervalRanges");
+  }
+  return ranges_.front().start;
+}
+
+int64_t IntervalRanges::last_row() const {
+  if (ranges_.empty()) {
+    throw ParquetException("last_row() called on empty IntervalRanges");
+  }
+  return ranges_.back().end;
+}
+
+bool IntervalRanges::IsOverlapping(const int64_t start, const int64_t end) const {
+  auto searchRange = IntervalRange{start, end};
+  auto it =
std::lower_bound(ranges_.begin(), ranges_.end(), searchRange, + [](const IntervalRange& r1, const IntervalRange& r2) { + return IntervalRangeUtils::IsBefore(r1, r2); + }); + return it != ranges_.end() && !IntervalRangeUtils::IsAfter(*it, searchRange); +} + +std::string IntervalRanges::ToString() const { + std::string result = "["; + for (const IntervalRange& range : ranges_) { + result += IntervalRangeUtils::ToString(range) + ", "; + } + if (!ranges_.empty()) { + result = result.substr(0, result.size() - 2); + } + result += "]"; + return result; +} + +std::vector> IntervalRanges::SplitByRowRange( + const std::vector& num_rows_per_sub_ranges) const { + if (num_rows_per_sub_ranges.size() <= 1) { + std::unique_ptr single = + std::make_unique(*this); // return a copy of itself + auto ret = std::vector>(); + ret.push_back(std::move(single)); + return ret; + } + + std::vector> result; + + IntervalRanges spaces; + int64_t rows_so_far = 0; + for (size_t i = 0; i < num_rows_per_sub_ranges.size(); ++i) { + auto start = rows_so_far; + rows_so_far += num_rows_per_sub_ranges[i]; + auto end = rows_so_far - 1; + spaces.Add({start, end}); + } + + // each RG's row range forms a space, we need to adjust RowRanges in each space to + // zero based. 
+ for (IntervalRange space : spaces.GetRanges()) { + auto intersection = Intersection(IntervalRanges(space), *this); + + std::unique_ptr zero_based_ranges = + std::make_unique(); + for (const IntervalRange& range : intersection.GetRanges()) { + zero_based_ranges->Add({range.start - space.start, range.end - space.start}); + } + result.push_back(std::move(zero_based_ranges)); + } + + return result; +} + +IntervalRanges IntervalRanges::Intersection(const IntervalRanges& left, + const IntervalRanges& right) { + IntervalRanges result; + + size_t rightIndex = 0; + for (const IntervalRange& l : left.ranges_) { + for (size_t i = rightIndex, n = right.ranges_.size(); i < n; ++i) { + const IntervalRange& r = right.ranges_[i]; + if (IntervalRangeUtils::IsBefore(l, r)) { + break; + } else if (IntervalRangeUtils::IsAfter(l, r)) { + rightIndex = i + 1; + continue; + } + result.Add(IntervalRangeUtils::Intersection(l, r)); + } + } + + return result; +} + +void IntervalRanges::Add(const IntervalRange& range) { + const IntervalRange rangeToAdd = range; + if (ranges_.size() > 1 && rangeToAdd.start <= ranges_.back().end) { + throw ParquetException("Ranges must be added in order"); + } + ranges_.push_back(rangeToAdd); +} + +const std::vector& IntervalRanges::GetRanges() const { return ranges_; } + +IntervalRowRangesIterator::IntervalRowRangesIterator( + const std::vector& ranges) + : ranges_(ranges) {} + +IntervalRowRangesIterator::~IntervalRowRangesIterator() {} + +std::variant IntervalRowRangesIterator::NextRange() { + if (current_index_ >= ranges_.size()) return End(); + + return ranges_[current_index_++]; +} +} // namespace parquet diff --git a/cpp/src/parquet/row_range.h b/cpp/src/parquet/row_range.h new file mode 100644 index 000000000000..4e7c2631eb6a --- /dev/null +++ b/cpp/src/parquet/row_range.h @@ -0,0 +1,156 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This module defines row-range abstractions (RowRanges / IntervalRanges) used
+// to select subsets of rows to read from a Parquet file.
+
+#pragma once
+#include
+
+#include "parquet/exception.h"
+
+namespace parquet {
+
+// Represent a range to read. The range is inclusive on both ends.
+struct IntervalRange {
+  IntervalRange(const int64_t start_, const int64_t end_) : start(start_), end(end_) {
+    if (start > end) {
+      throw ParquetException("Invalid range with start: " + std::to_string(start) +
+                             " bigger than end: " + std::to_string(end));
+    }
+  }
+
+  // inclusive
+  int64_t start = -1;
+  // inclusive
+  int64_t end = -1;
+};
+
+class IntervalRangeUtils {
+ public:
+  static IntervalRange Intersection(const IntervalRange& left,
+                                    const IntervalRange& right) {
+    if (left.start <= right.start) {
+      if (left.end >= right.start) {
+        return {right.start, std::min(left.end, right.end)};
+      }
+    } else if (right.end >= left.start) {
+      return {left.start, std::min(left.end, right.end)};
+    }
+    return {-1, -1};  // Return a default Range object if no intersection range found
+  }
+
+  static std::string ToString(const IntervalRange& range) {
+    return "(" + std::to_string(range.start) + ", " + std::to_string(range.end) + ")";
+  }
+
+  static bool IsValid(const IntervalRange& range) {
+    return
range.start >= 0 && range.end >= 0 && range.end >= range.start; + } + + static size_t Count(const IntervalRange& range) { + if (!IsValid(range)) { + throw ParquetException("Invalid range: " + ToString(range)); + } + return range.end - range.start + 1; + } + + static bool IsBefore(const IntervalRange& self, const IntervalRange& other) { + return self.end < other.start; + } + + static bool IsAfter(const IntervalRange& self, const IntervalRange& other) { + return self.start > other.end; + } + + static bool IsOverlap(const IntervalRange& self, const IntervalRange& other) { + return !IsBefore(self, other) && !IsAfter(self, other); + } +}; + +struct BitmapRange { + int64_t offset; + // zero added to, if there are less than 64 elements left in the column. + uint64_t bitmap; +}; + +struct End {}; + +// Represent a set of ranges to read. The ranges are sorted and non-overlapping. +class RowRanges { + public: + virtual ~RowRanges() = default; + /// \brief Total number of rows in the row ranges. + virtual size_t num_rows() const = 0; + /// \brief First row in the ranges + virtual int64_t first_row() const = 0; + /// \brief Last row in the ranges + virtual int64_t last_row() const = 0; + /// \brief Whether the given range from start to end overlaps with the row ranges. + virtual bool IsOverlapping(int64_t start, int64_t end) const = 0; + /// \brief Split the row ranges into sub row ranges according to the + /// specified number of rows per sub row ranges. A typical use case is + /// to convert file based RowRanges to row group based RowRanges. + /// + /// \param num_rows_per_sub_ranges number of rows per sub row range. 
+ virtual std::vector> SplitByRowRange( + const std::vector& num_rows_per_sub_ranges) const = 0; + /// \brief Readable string representation + virtual std::string ToString() const = 0; + + class Iterator { + public: + virtual std::variant NextRange() = 0; + virtual ~Iterator() = default; + }; + /// \brief Create an iterator to iterate over the ranges + virtual std::unique_ptr NewIterator() const = 0; +}; + +class IntervalRanges : public RowRanges { + public: + IntervalRanges(); + explicit IntervalRanges(const IntervalRange& range); + explicit IntervalRanges(const std::vector& ranges); + std::unique_ptr NewIterator() const override; + size_t num_rows() const override; + int64_t first_row() const override; + int64_t last_row() const override; + bool IsOverlapping(int64_t start, int64_t end) const override; + std::string ToString() const override; + std::vector> SplitByRowRange( + const std::vector& num_rows_per_sub_ranges) const override; + static IntervalRanges Intersection(const IntervalRanges& left, + const IntervalRanges& right); + void Add(const IntervalRange& range); + const std::vector& GetRanges() const; + + private: + std::vector ranges_; +}; + +class IntervalRowRangesIterator : public RowRanges::Iterator { + public: + explicit IntervalRowRangesIterator(const std::vector& ranges); + ~IntervalRowRangesIterator() override; + std::variant NextRange() override; + + private: + const std::vector& ranges_; + size_t current_index_ = 0; +}; +} // namespace parquet diff --git a/cpp/src/parquet/row_range_test.cc b/cpp/src/parquet/row_range_test.cc index 44327baab04c..bf0563211b8e 100644 --- a/cpp/src/parquet/row_range_test.cc +++ b/cpp/src/parquet/row_range_test.cc @@ -17,7 +17,8 @@ #include #include "parquet/column_reader.h" -using namespace parquet; +using parquet::IntervalRange; +using parquet::IntervalRanges; class RowRangesTest : public ::testing::Test { protected: @@ -28,7 +29,7 @@ TEST_F(RowRangesTest, EmptyRG_ReturnsOriginalRowRanges) { 
row_ranges.Add(IntervalRange(0, 10)); std::vector rows_per_rg; - auto result = row_ranges.SplitByRowGroups(rows_per_rg); + auto result = row_ranges.SplitByRowRange(rows_per_rg); ASSERT_EQ(result.size(), 1); auto iter = result[0]->NewIterator(); @@ -42,7 +43,7 @@ TEST_F(RowRangesTest, SingleRG_ReturnsOriginalRowRanges2) { row_ranges.Add(IntervalRange(0, 10)); std::vector rows_per_rg = {11}; - auto result = row_ranges.SplitByRowGroups(rows_per_rg); + auto result = row_ranges.SplitByRowRange(rows_per_rg); ASSERT_EQ(result.size(), 1); auto iter = result[0]->NewIterator(); @@ -56,7 +57,7 @@ TEST_F(RowRangesTest, ReturnsTwoRowRanges) { row_ranges.Add(IntervalRange(0, 10)); std::vector rows_per_rg = {5, 6}; - auto result = row_ranges.SplitByRowGroups(rows_per_rg); + auto result = row_ranges.SplitByRowRange(rows_per_rg); ASSERT_EQ(result.size(), 2); { auto iter = result[0]->NewIterator(); @@ -78,7 +79,7 @@ TEST_F(RowRangesTest, ReturnsMultipleRowRanges) { row_ranges.Add(IntervalRange(0, 11)); std::vector rows_per_rg = {3, 4, 100}; - auto result = row_ranges.SplitByRowGroups(rows_per_rg); + auto result = row_ranges.SplitByRowRange(rows_per_rg); ASSERT_EQ(result.size(), 3); { auto iter = result[0]->NewIterator(); @@ -110,7 +111,7 @@ TEST_F(RowRangesTest, MultipleInputRange) { std::vector rows_per_rg = {100, 100}; - auto result = row_ranges.SplitByRowGroups(rows_per_rg); + auto result = row_ranges.SplitByRowRange(rows_per_rg); ASSERT_EQ(result.size(), 2); { auto iter = result[0]->NewIterator(); @@ -142,7 +143,7 @@ TEST_F(RowRangesTest, MultipleSplitPoints_ReturnWithEmptyRowRanges) { row_ranges.Add(IntervalRange(11, 18)); std::vector rows_per_rg = {5, 5, 5, 5, 5}; - auto result = row_ranges.SplitByRowGroups(rows_per_rg); + auto result = row_ranges.SplitByRowRange(rows_per_rg); ASSERT_EQ(result.size(), 5); { auto iter = result[0]->NewIterator(); @@ -176,7 +177,7 @@ TEST_F(RowRangesTest, RangeExceedRG) { row_ranges.Add(IntervalRange(0, 10)); std::vector rows_per_rg = {5, 3}; - 
auto result = row_ranges.SplitByRowGroups(rows_per_rg); + auto result = row_ranges.SplitByRowRange(rows_per_rg); ASSERT_EQ(result.size(), 2); { auto iter = result[0]->NewIterator();