From b40fe56dd9fb204e0f68d00fe1b9b74adde088cf Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 24 Aug 2020 15:17:29 +0200 Subject: [PATCH 1/3] ARROW-9827: [C++][Dataset] Skip parsing RowGroup metadata statistics when there is no filter --- cpp/src/arrow/dataset/file_parquet.cc | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 0f939d7da71..7c5a855b629 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -385,12 +385,14 @@ Result ParquetFileFormat::ScanFile(std::shared_ptrsource(), options.get(), context.get())); if (!parquet_fragment->HasCompleteMetadata()) { - // row groups were not already filtered; do this now - RETURN_NOT_OK(parquet_fragment->EnsureCompleteMetadata(reader.get())); - ARROW_ASSIGN_OR_RAISE(row_groups, - parquet_fragment->FilterRowGroups(*options->filter)); - if (row_groups.empty()) { - return MakeEmptyIterator>(); + // row groups were not already filtered; do this now (if there is a filter) + if (!options->filter->Equals(true)) { + RETURN_NOT_OK(parquet_fragment->EnsureCompleteMetadata(reader.get())); + ARROW_ASSIGN_OR_RAISE(row_groups, + parquet_fragment->FilterRowGroups(*options->filter)); + if (row_groups.empty()) { + return MakeEmptyIterator>(); + } } } From 105ddbe312cd5741df06befc946eba160999c943 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 25 Aug 2020 10:54:48 +0200 Subject: [PATCH 2/3] still need to get vector of row groups if no filter is specified --- cpp/src/arrow/dataset/file_parquet.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 7c5a855b629..f5168f2f742 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -393,6 +393,15 @@ Result ParquetFileFormat::ScanFile(std::shared_ptr>(); } + } else { + row_groups 
= parquet_fragment->row_groups(); + if (row_groups.empty()) { + // empty vector represents all row groups + std::shared_ptr<parquet::FileMetaData> metadata = + reader->parquet_reader()->metadata(); + int num_row_groups = metadata->num_row_groups(); + row_groups = RowGroupInfo::FromCount(num_row_groups); + } } } From 99d3aec6f2b6a51abe424701367e25dbcec7fe22 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 8 Sep 2020 14:19:27 +0200 Subject: [PATCH 3/3] add comment --- cpp/src/arrow/dataset/file_parquet.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index f5168f2f742..d1969ed87af 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -394,6 +394,8 @@ Result ParquetFileFormat::ScanFile(std::shared_ptr>(); } } else { + // since we are not scanning this fragment with a filter, don't bother loading + // statistics row_groups = parquet_fragment->row_groups(); if (row_groups.empty()) { // empty vector represents all row groups