ARROW-11843: [C++] Provide async Parquet reader #9620
Changes from all commits
@@ -333,6 +333,53 @@ Result<std::unique_ptr<parquet::arrow::FileReader>> ParquetFileFormat::GetReader

  return std::move(arrow_reader);
}

Future<std::shared_ptr<parquet::arrow::FileReader>> ParquetFileFormat::GetReaderAsync(
    const FileSource& source, const std::shared_ptr<ScanOptions>& options) const {
  ARROW_ASSIGN_OR_RAISE(
      auto parquet_scan_options,
      GetFragmentScanOptions<ParquetFragmentScanOptions>(kParquetTypeName, options.get(),
                                                         default_fragment_scan_options));
  auto properties =
      MakeReaderProperties(*this, parquet_scan_options.get(), options->pool);
  ARROW_ASSIGN_OR_RAISE(auto input, source.Open());
  // TODO(ARROW-12259): workaround since we have Future<(move-only type)>
  auto reader_fut =
      parquet::ParquetFileReader::OpenAsync(std::move(input), std::move(properties));
  auto path = source.path();
  auto self = checked_pointer_cast<const ParquetFileFormat>(shared_from_this());
  return reader_fut.Then(
      [=](const std::unique_ptr<parquet::ParquetFileReader>&) mutable
          -> Result<std::shared_ptr<parquet::arrow::FileReader>> {
        ARROW_ASSIGN_OR_RAISE(std::unique_ptr<parquet::ParquetFileReader> reader,
                              reader_fut.MoveResult());
        std::shared_ptr<parquet::FileMetaData> metadata = reader->metadata();
        auto arrow_properties = MakeArrowReaderProperties(*self, *metadata);
        arrow_properties.set_batch_size(options->batch_size);
        // Must be set here since the sync ScanTask handles pre-buffering itself
        arrow_properties.set_pre_buffer(
            parquet_scan_options->arrow_reader_properties->pre_buffer());
        arrow_properties.set_cache_options(
            parquet_scan_options->arrow_reader_properties->cache_options());
        arrow_properties.set_io_context(
            parquet_scan_options->arrow_reader_properties->io_context());
        // TODO: ARROW-12597 will let us enable parallel conversion
        if (!options->use_threads) {
          arrow_properties.set_use_threads(
              parquet_scan_options->enable_parallel_column_conversion);
        }
        std::unique_ptr<parquet::arrow::FileReader> arrow_reader;
        RETURN_NOT_OK(parquet::arrow::FileReader::Make(options->pool, std::move(reader),
                                                       std::move(arrow_properties),
                                                       &arrow_reader));
        return std::move(arrow_reader);
      },
      [path](
          const Status& status) -> Result<std::shared_ptr<parquet::arrow::FileReader>> {
        return status.WithMessage("Could not open Parquet input source '", path,
                                  "': ", status.message());
      });
}

Result<ScanTaskIterator> ParquetFileFormat::ScanFile(
    const std::shared_ptr<ScanOptions>& options,
    const std::shared_ptr<FileFragment>& fragment) const {
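One detail worth noting in the hunk above: `Then` is given two continuations. The first runs on success; the second runs on failure and rewrites the `Status` so the error names the file that could not be opened. A reduced sketch of that shape, where `Widget`, `LoadWidgetAsync`, and `OpenWidget` are hypothetical stand-ins (not Arrow APIs), while `Then` and `Status::WithMessage` are used exactly as in the diff:

```cpp
// Reduced sketch of Future::Then with separate success and error
// continuations, mirroring the shape of GetReaderAsync above.
// `Widget`, `LoadWidgetAsync`, and `OpenWidget` are hypothetical stand-ins.
#include <memory>
#include <string>

#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/util/future.h"

struct Widget {};
arrow::Future<std::shared_ptr<Widget>> LoadWidgetAsync(const std::string& path);

arrow::Future<std::shared_ptr<Widget>> OpenWidget(std::string path) {
  auto fut = LoadWidgetAsync(path);
  return fut.Then(
      // Success continuation: may transform the value or raise an error.
      [](const std::shared_ptr<Widget>& w)
          -> arrow::Result<std::shared_ptr<Widget>> { return w; },
      // Error continuation: rewrite the failing Status but keep its message.
      [path](const arrow::Status& status)
          -> arrow::Result<std::shared_ptr<Widget>> {
        return status.WithMessage("Could not open widget '", path,
                                  "': ", status.message());
      });
}
```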
@@ -390,6 +437,47 @@ Result<ScanTaskIterator> ParquetFileFormat::ScanFile(

  return MakeVectorIterator(std::move(tasks));
}

Result<RecordBatchGenerator> ParquetFileFormat::ScanBatchesAsync(
    const std::shared_ptr<ScanOptions>& options,
    const std::shared_ptr<FileFragment>& file) const {
  auto parquet_fragment = checked_pointer_cast<ParquetFileFragment>(file);
  std::vector<int> row_groups;
  bool pre_filtered = false;
  // If RowGroup metadata is cached completely we can pre-filter RowGroups before
  // opening a FileReader, potentially avoiding IO altogether if all RowGroups are
  // excluded due to prior statistics knowledge. In the case where a RowGroup doesn't
  // have statistics metadata, it will not be excluded.
  if (parquet_fragment->metadata() != nullptr) {
    ARROW_ASSIGN_OR_RAISE(row_groups, parquet_fragment->FilterRowGroups(options->filter));
    pre_filtered = true;
    if (row_groups.empty()) return MakeEmptyGenerator<std::shared_ptr<RecordBatch>>();
  }
  // Open the reader and pay the real IO cost.
  auto make_generator =
      [=](const std::shared_ptr<parquet::arrow::FileReader>& reader) mutable
      -> Result<RecordBatchGenerator> {
    // Ensure that parquet_fragment has FileMetaData
    RETURN_NOT_OK(parquet_fragment->EnsureCompleteMetadata(reader.get()));
    if (!pre_filtered) {
      // Row groups were not already filtered; do this now
      ARROW_ASSIGN_OR_RAISE(row_groups,
                            parquet_fragment->FilterRowGroups(options->filter));
      if (row_groups.empty()) return MakeEmptyGenerator<std::shared_ptr<RecordBatch>>();
    }
    auto column_projection = InferColumnProjection(*reader, *options);
    ARROW_ASSIGN_OR_RAISE(
        auto parquet_scan_options,
        GetFragmentScanOptions<ParquetFragmentScanOptions>(
            kParquetTypeName, options.get(), default_fragment_scan_options));
    ARROW_ASSIGN_OR_RAISE(auto generator, reader->GetRecordBatchGenerator(
                                              reader, row_groups, column_projection,
                                              internal::GetCpuThreadPool()));
    return MakeReadaheadGenerator(std::move(generator), options->batch_readahead);
  };
  return MakeFromFuture(GetReaderAsync(parquet_fragment->source(), options)
                            .Then(std::move(make_generator)));
}

Future<util::optional<int64_t>> ParquetFileFormat::CountRows(
    const std::shared_ptr<FileFragment>& file, compute::Expression predicate,
    const std::shared_ptr<ScanOptions>& options) {
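For context on the return type: `RecordBatchGenerator` is an async generator, a callable that yields one `Future` per record batch, with a null batch signaling the end of iteration (the new test below relies on the same convention). A minimal consumption sketch; `DrainBatches` is a hypothetical helper that blocks on each future purely for simplicity:

```cpp
// Minimal sketch: drain an async record-batch generator by blocking on each
// future in turn. Assumes Arrow's AsyncGenerator<T> alias (a
// std::function<Future<T>()>) from arrow/util/async_generator.h.
#include <memory>

#include "arrow/record_batch.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/util/async_generator.h"

arrow::Status DrainBatches(
    arrow::AsyncGenerator<std::shared_ptr<arrow::RecordBatch>> gen) {
  while (true) {
    // Each call kicks off (or continues) the asynchronous read of one batch.
    auto fut = gen();
    ARROW_ASSIGN_OR_RAISE(auto batch, fut.result());  // block for this sketch
    if (batch == nullptr) break;  // end of iteration
    // ... process `batch` ...
  }
  return arrow::Status::OK();
}
```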
@@ -2287,10 +2287,9 @@ TEST(TestArrowReadWrite, WaitCoalescedReads) {
   ASSERT_OK(builder.Open(std::make_shared<BufferReader>(buffer)));
   ASSERT_OK(builder.properties(properties)->Build(&reader));
   // Pre-buffer data and wait for I/O to complete.
-  ASSERT_OK(reader->parquet_reader()
-                ->PreBuffer({0}, {0, 1, 2, 3, 4}, ::arrow::io::IOContext(),
-                            ::arrow::io::CacheOptions::Defaults())
-                .status());
+  reader->parquet_reader()->PreBuffer({0}, {0, 1, 2, 3, 4}, ::arrow::io::IOContext(),
+                                      ::arrow::io::CacheOptions::Defaults());
+  ASSERT_OK(reader->parquet_reader()->WhenBuffered({0}, {0, 1, 2, 3, 4}).status());

   std::shared_ptr<::arrow::RecordBatchReader> rb_reader;
   ASSERT_OK_NO_THROW(reader->GetRecordBatchReader({0}, {0, 1, 2, 3, 4}, &rb_reader));
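The hunk above tracks an API split: `PreBuffer` no longer returns a future to wait on; it only launches the coalesced background reads, and a separate `WhenBuffered` call returns a `Future` that completes once the requested row groups and columns are cached. A sketch of the resulting call sequence; `PreBufferAndWait` is a hypothetical helper and the indices mirror the test:

```cpp
// Sketch of the split pre-buffering sequence: PreBuffer() launches the
// coalesced reads, WhenBuffered() exposes their completion as a Future.
#include "arrow/io/caching.h"
#include "arrow/io/interfaces.h"
#include "parquet/arrow/reader.h"

arrow::Status PreBufferAndWait(parquet::arrow::FileReader* reader) {
  reader->parquet_reader()->PreBuffer({0}, {0, 1, 2, 3, 4},
                                      ::arrow::io::IOContext(),
                                      ::arrow::io::CacheOptions::Defaults());
  // Block until the I/O finishes; a caller could instead attach a
  // continuation to the returned Future.
  return reader->parquet_reader()->WhenBuffered({0}, {0, 1, 2, 3, 4}).status();
}
```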
@@ -2331,6 +2330,66 @@ TEST(TestArrowReadWrite, GetRecordBatchReaderNoColumns) {
   ASSERT_EQ(actual_batch->num_rows(), num_rows);
 }

 TEST(TestArrowReadWrite, GetRecordBatchGenerator) {
   ArrowReaderProperties properties = default_arrow_reader_properties();
   const int num_rows = 1024;
   const int row_group_size = 512;
   const int num_columns = 2;

   std::shared_ptr<Table> table;
   ASSERT_NO_FATAL_FAILURE(MakeDoubleTable(num_columns, num_rows, 1, &table));

   std::shared_ptr<Buffer> buffer;
   ASSERT_NO_FATAL_FAILURE(WriteTableToBuffer(table, row_group_size,
                                              default_arrow_writer_properties(), &buffer));

   std::shared_ptr<FileReader> reader;
   {
     std::unique_ptr<FileReader> unique_reader;
     FileReaderBuilder builder;
     ASSERT_OK(builder.Open(std::make_shared<BufferReader>(buffer)));
     ASSERT_OK(builder.properties(properties)->Build(&unique_reader));
     reader = std::move(unique_reader);
   }

   auto check_batches = [](const std::shared_ptr<::arrow::RecordBatch>& batch,
                           int num_columns, int num_rows) {
     ASSERT_NE(batch, nullptr);
     ASSERT_EQ(batch->num_columns(), num_columns);
     ASSERT_EQ(batch->num_rows(), num_rows);
   };
   {
     ASSERT_OK_AND_ASSIGN(auto batch_generator,
                          reader->GetRecordBatchGenerator(reader, {0, 1}, {0, 1}));
     auto fut1 = batch_generator();
     auto fut2 = batch_generator();
     auto fut3 = batch_generator();
     ASSERT_OK_AND_ASSIGN(auto batch1, fut1.result());
     ASSERT_OK_AND_ASSIGN(auto batch2, fut2.result());
     ASSERT_OK_AND_ASSIGN(auto batch3, fut3.result());
     ASSERT_EQ(batch3, nullptr);
     check_batches(batch1, num_columns, row_group_size);
     check_batches(batch2, num_columns, row_group_size);
     ASSERT_OK_AND_ASSIGN(auto actual, ::arrow::Table::FromRecordBatches(
                                           batch1->schema(), {batch1, batch2}));
     AssertTablesEqual(*table, *actual, /*same_chunk_layout=*/false);
   }
   {
     // No columns case
     ASSERT_OK_AND_ASSIGN(auto batch_generator,
                          reader->GetRecordBatchGenerator(reader, {0, 1}, {}));
     auto fut1 = batch_generator();
     auto fut2 = batch_generator();
     auto fut3 = batch_generator();
     ASSERT_OK_AND_ASSIGN(auto batch1, fut1.result());
     ASSERT_OK_AND_ASSIGN(auto batch2, fut2.result());
     ASSERT_OK_AND_ASSIGN(auto batch3, fut3.result());
     ASSERT_EQ(batch3, nullptr);
     check_batches(batch1, 0, row_group_size);
     check_batches(batch2, 0, row_group_size);
   }
 }

 TEST(TestArrowReadWrite, ScanContents) {
   const int num_columns = 20;
   const int num_rows = 1000;
@@ -2700,7 +2759,7 @@ TEST(ArrowReadWrite, Decimal256) {

   auto type = ::arrow::decimal256(8, 4);

   const char* json = R"(["1.0000", null, "-1.2345", "-1000.5678",
                          "-9999.9999", "9999.9999"])";
   auto array = ::arrow::ArrayFromJSON(type, json);
   auto table = ::arrow::Table::Make(::arrow::schema({field("root", type)}), {array});
I'm a little bit surprised this works at all, honestly. Futures aren't really compatible with move-only types; I thought you would have gotten a compile error. This reminded me to create ARROW-12559. As long as this works I think you're ok: there is only one callback being added to `reader_fut`, and you don't access the value that gets passed in as an arg here.
In this case it works since Future internally heap-allocates its implementation, and yes, this is rather iffy, but works since this is the only callback. I'll add a TODO referencing ARROW-12259.
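Reduced to its essence, the workaround discussed here looks like the sketch below; `Thing` and `OpenThingAsync` are hypothetical stand-ins, while `Then`, `MoveResult`, and the single-callback caveat are exactly as in the diff:

```cpp
// Sketch of the ARROW-12259 workaround for Future<move-only type>.
// Then() hands its callback the stored value by const reference, which
// cannot transfer ownership of a move-only type. Capturing the future
// itself and calling MoveResult() inside the callback steals the value
// out of the heap-allocated shared state instead. This is only safe
// while a single callback is attached to the future.
#include <memory>

#include "arrow/result.h"
#include "arrow/util/future.h"

struct Thing {};  // hypothetical payload type
arrow::Future<std::unique_ptr<Thing>> OpenThingAsync();  // hypothetical

arrow::Future<std::shared_ptr<Thing>> OpenShared() {
  auto fut = OpenThingAsync();
  return fut.Then(
      [fut](const std::unique_ptr<Thing>&) mutable
          -> arrow::Result<std::shared_ptr<Thing>> {
        // Ignore the const-ref argument; take the value from the future.
        ARROW_ASSIGN_OR_RAISE(std::unique_ptr<Thing> thing, fut.MoveResult());
        return std::shared_ptr<Thing>(std::move(thing));
      });
}
```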