From ec05176f5c5f4fd66da034a6212abc2f647616a2 Mon Sep 17 00:00:00 2001 From: Qi Chen Date: Mon, 30 Dec 2024 22:48:53 +0800 Subject: [PATCH] =?UTF-8?q?[fix](parquet-reader)=20Fixed=20the=20issue=20o?= =?UTF-8?q?f=20excessive=20scanning=20data=20in=20=20late=20materializatio?= =?UTF-8?q?n=E2=80=8C=20case=20of=20parquet=20reader=20(#46121)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What problem does this PR solve? Related PR: #40641 Problem Summary: [Fix](parquet-reader) Fixed the issue of excessive data scanning in the late materialization case of the parquet reader, introduced by #40641, in scenarios with particularly high filtering rates. --- .../vec/exec/format/parquet/vparquet_group_reader.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp index d662c174d9bfdb..a63e098c97c7e4 100644 --- a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp @@ -520,16 +520,18 @@ Status RowGroupReader::_do_lazy_read(Block* block, size_t batch_size, size_t* re Block::erase_useless_column(block, origin_column_num); if (!pre_eof) { - if (pre_raw_read_rows >= config::doris_scanner_row_num) { - break; - } // If continuous batches are skipped, we can cache them to skip a whole page _cached_filtered_rows += pre_read_rows; + if (pre_raw_read_rows >= config::doris_scanner_row_num) { + *read_rows = 0; + _convert_dict_cols_to_string_cols(block); + return Status::OK(); + } } else { // pre_eof // If select_vector_ptr->filter_all() and pre_eof, we can skip whole row group. *read_rows = 0; *batch_eof = true; - _lazy_read_filtered_rows += pre_read_rows; + _lazy_read_filtered_rows += (pre_read_rows + _cached_filtered_rows); _convert_dict_cols_to_string_cols(block); return Status::OK(); }