From 336c71a9988f85050563383abcf7d3120b9f241e Mon Sep 17 00:00:00 2001 From: daidai Date: Mon, 14 Apr 2025 09:22:04 +0800 Subject: [PATCH] [fix](parquet)fix when hive_parquet_use_column_names=false && read partition tb cause be core. (#49966) ### What problem does this PR solve? Related PR: #38432 Problem Summary: When you query a Hive Parquet-format partitioned table with `set hive_parquet_use_column_names = false`, the BE may crash with: ``` *** SIGABRT unknown detail explain (@0x2f59de) received by PID 3103198 (TID 3110278 OR 0x7f51c8e63640) from PID 3103198; stack trace: *** 0# doris::signal::(anonymous namespace)::FailureSignalHandler(int, siginfo_t*, void*) at /home/zcp/repo_center/doris_master/doris/be/src/common/signal_handler.h:421 1# 0x00007F55DFB45520 in /lib/x86_64-linux-gnu/libc.so.6 2# pthread_kill at ./nptl/pthread_kill.c:89 3# raise at ../sysdeps/posix/raise.c:27 4# abort at ./stdlib/abort.c:81 5# __gnu_cxx::__verbose_terminate_handler() [clone .cold] at ../../../../libstdc++-v3/libsupc++/vterminate.cc:75 6# __cxxabiv1::__terminate(void (*)()) at ../../../../libstdc++-v3/libsupc++/eh_terminate.cc:48 7# 0x000055C8BD4E2041 in /mnt/disk1/doris-clusters/doris-master/output/be/lib/doris_be 8# 0x000055C8BD4E2194 in /mnt/disk1/doris-clusters/doris-master/output/be/lib/doris_be 9# 0x000055C8BD4E2586 in /mnt/disk1/doris-clusters/doris-master/output/be/lib/doris_be 10# std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_assign(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) at /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/basic_string.tcc:265 11# doris::vectorized::ParquetReader::get_next_block(doris::vectorized::Block*, unsigned long*, bool*) at /home/zcp/repo_center/doris_master/doris/be/src/vec/exec/format/parquet/vparquet_reader.cpp:586 ``` The reason is that when `get_next_block` restores the block's column names after reading, it indexes `_column_names` by block position, which can cause an out-of-bounds access. The fix saves the block's original column names before they are replaced and restores from that saved copy instead. 
--- .../exec/format/parquet/vparquet_reader.cpp | 3 +- .../test_external_catalog_hive_partition.out | 96 +++++++++++++++++++ ...est_external_catalog_hive_partition.groovy | 9 ++ 3 files changed, 107 insertions(+), 1 deletion(-) diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_reader.cpp index 1748138a5777de..ae978453f8a552 100644 --- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp @@ -558,6 +558,7 @@ Status ParquetReader::get_next_block(Block* block, size_t* read_rows, bool* eof) return Status::OK(); } + std::vector original_block_column_name = block->get_names(); if (!_hive_use_column_names) { for (auto i = 0; i < block->get_names().size(); i++) { auto& col = block->get_by_position(i); @@ -581,7 +582,7 @@ Status ParquetReader::get_next_block(Block* block, size_t* read_rows, bool* eof) if (!_hive_use_column_names) { for (auto i = 0; i < block->columns(); i++) { - block->get_by_position(i).name = (*_column_names)[i]; + block->get_by_position(i).name = original_block_column_name[i]; } block->initialize_index_by_name(); } diff --git a/regression-test/data/external_table_p0/hive/test_external_catalog_hive_partition.out b/regression-test/data/external_table_p0/hive/test_external_catalog_hive_partition.out index aa1e48a439d1e9..0402feef40e6b5 100644 --- a/regression-test/data/external_table_p0/hive/test_external_catalog_hive_partition.out +++ b/regression-test/data/external_table_p0/hive/test_external_catalog_hive_partition.out @@ -143,3 +143,99 @@ -- !q06 -- 2023-01-03T00:00 100 0.3 test3 +-- !q01 -- +0.1 test1 2023-01-01T00:00 \N +0.2 test2 2023-01-02T00:00 \N +0.3 test3 2023-01-03T00:00 100 + +-- !q02 -- +0.1 test1 2023-01-01T00:00 \N +0.2 test2 2023-01-02T00:00 \N + +-- !q03 -- +0.3 test3 2023-01-03T00:00 100 + +-- !q04 -- +2023-01-01T00:00 \N 0.1 test1 +2023-01-02T00:00 \N 0.2 test2 +2023-01-03T00:00 100 0.3 test3 + +-- !q05 -- +2023-01-01T00:00 \N 
0.1 test1 +2023-01-02T00:00 \N 0.2 test2 + +-- !q06 -- +2023-01-03T00:00 100 0.3 test3 + +-- !q01 -- +0.1 test1 2023-01-01T00:00 \N +0.2 test2 2023-01-02T00:00 \N +0.3 test3 2023-01-03T00:00 100 + +-- !q02 -- +0.1 test1 2023-01-01T00:00 \N +0.2 test2 2023-01-02T00:00 \N + +-- !q03 -- +0.3 test3 2023-01-03T00:00 100 + +-- !q04 -- +2023-01-01T00:00 \N 0.1 test1 +2023-01-02T00:00 \N 0.2 test2 +2023-01-03T00:00 100 0.3 test3 + +-- !q05 -- +2023-01-01T00:00 \N 0.1 test1 +2023-01-02T00:00 \N 0.2 test2 + +-- !q06 -- +2023-01-03T00:00 100 0.3 test3 + +-- !q01 -- +0.1 test1 2023-01-01T00:00 \N +0.2 test2 2023-01-02T00:00 \N +0.3 test3 2023-01-03T00:00 100 + +-- !q02 -- +0.1 test1 2023-01-01T00:00 \N +0.2 test2 2023-01-02T00:00 \N + +-- !q03 -- +0.3 test3 2023-01-03T00:00 100 + +-- !q04 -- +2023-01-01T00:00 \N 0.1 test1 +2023-01-02T00:00 \N 0.2 test2 +2023-01-03T00:00 100 0.3 test3 + +-- !q05 -- +2023-01-01T00:00 \N 0.1 test1 +2023-01-02T00:00 \N 0.2 test2 + +-- !q06 -- +2023-01-03T00:00 100 0.3 test3 + +-- !q01 -- +0.1 test1 2023-01-01T00:00 \N +0.2 test2 2023-01-02T00:00 \N +0.3 test3 2023-01-03T00:00 100 + +-- !q02 -- +0.1 test1 2023-01-01T00:00 \N +0.2 test2 2023-01-02T00:00 \N + +-- !q03 -- +0.3 test3 2023-01-03T00:00 100 + +-- !q04 -- +2023-01-01T00:00 \N 0.1 test1 +2023-01-02T00:00 \N 0.2 test2 +2023-01-03T00:00 100 0.3 test3 + +-- !q05 -- +2023-01-01T00:00 \N 0.1 test1 +2023-01-02T00:00 \N 0.2 test2 + +-- !q06 -- +2023-01-03T00:00 100 0.3 test3 + diff --git a/regression-test/suites/external_table_p0/hive/test_external_catalog_hive_partition.groovy b/regression-test/suites/external_table_p0/hive/test_external_catalog_hive_partition.groovy index 32b80f5650da8f..d34467c4c56356 100644 --- a/regression-test/suites/external_table_p0/hive/test_external_catalog_hive_partition.groovy +++ b/regression-test/suites/external_table_p0/hive/test_external_catalog_hive_partition.groovy @@ -65,9 +65,18 @@ suite("test_external_catalog_hive_partition", "p0,external,hive,external_docker, 
qt_q06 """ select * from multi_catalog.text_partitioned_columns where t_int is not null order by t_float """ } sql """ use `multi_catalog`; """ + sql """ set hive_parquet_use_column_names = true; """ + sql """ set hive_orc_use_column_names = true""" + q01_parquet() q01_orc() q01_text() + + sql """ set hive_parquet_use_column_names = false; """ + sql """ set hive_orc_use_column_names = false""" + q01_parquet() + q01_orc() + } }