From 040043799002cd184f4dbd94107ae2b3bb405e04 Mon Sep 17 00:00:00 2001 From: daidai Date: Thu, 4 Sep 2025 10:21:17 +0800 Subject: [PATCH 1/2] branch-3.1: [fix](load) fix ingestion load error case that causes BE core. (#55500) Related PR: #45937 Problem Summary: Fix the ingestion load error cases and the BE core dump in the parquet reader (parse_thrift_footer): a short footer read (bytes_read < PARQUET_FOOTER_SIZE) is now reported as its own Corruption error instead of falling into the invalid-magic branch, whose error message reads the magic bytes and triggered the heap-buffer-overflow below. ==8898==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x62f0020603fc at pc 0x55f634e64ded bp 0x7fba0d03c410 sp 0x7fba0d03bbd8 READ of size 4 at 0x62f0020603fc thread T768 (PUSH-9699) #0 0x55f634e64dec in __asan_memcpy (/mnt/hdd01/ci/doris-deploy-branch-3.1-local/be/lib/doris_be+0x39a24dec) (BuildId: 9b04e7f7d3075dac) #1 0x55f634eca93f in std::char_traits::copy(char*, char const*, unsigned long) /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/char_traits.h:409:33 #2 0x55f634eca93f in std::__cxx11::basic_string, std::allocator>::_S_copy(char*, char const*, unsigned long) /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/basic_string.h:351:4 #3 0x55f634eca93f in std::__cxx11::basic_string, std::allocator>::_S_copy_chars(char*, char const*, char const*) /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/basic_string.h:398:9 #4 0x55f634eca93f in void std::__cxx11::basic_string, std::allocator>::_M_construct(char const*, char const*, std::forward_iterator_tag) /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/basic_string.tcc:225:6 #5 0x55f654a4f74d in void std::__cxx11::basic_string, std::allocator>::_M_construct_aux(char const*, char const*, std::__false_type) /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/basic_string.h:247:11 #6 0x55f654a4f74d in void std::__cxx11::basic_string, std::allocator>::_M_construct(char const*, char const*) 
/var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/basic_string.h:266:4 #7 0x55f654a4f74d in std::__cxx11::basic_string, std::allocator>::basic_string(char const*, unsigned long, std::allocator const&) /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/basic_string.h:513:9 #8 0x55f654a4f74d in doris::vectorized::parse_thrift_footer(std::shared_ptr, doris::vectorized::FileMetaData**, unsigned long*, doris::io::IOContext*) /home/zcp/repo_center/doris_branch-3.1/doris/be/src/vec/exec/format/parquet/parquet_thrift_util.h:55:17 --- .../vec/exec/format/parquet/parquet_thrift_util.h | 7 +++++-- .../test_ingestion_load_alter_partition.out | 7 +++++++ .../ingestion_load/test_ingestion_load.groovy | 10 +++++----- .../test_ingestion_load_alter_column.groovy | 11 +++++------ .../test_ingestion_load_alter_partition.groovy | 13 +++++++------ .../test_ingestion_load_drop_table.groovy | 3 +-- .../test_ingestion_load_multi_table.groovy | 2 +- .../test_ingestion_load_with_inverted_index.groovy | 2 +- .../test_ingestion_load_with_partition.groovy | 13 ++++++------- 9 files changed, 38 insertions(+), 30 deletions(-) create mode 100644 regression-test/data/load_p0/ingestion_load/test_ingestion_load_alter_partition.out diff --git a/be/src/vec/exec/format/parquet/parquet_thrift_util.h b/be/src/vec/exec/format/parquet/parquet_thrift_util.h index 346bbe60c02491..dc29195ccb7c54 100644 --- a/be/src/vec/exec/format/parquet/parquet_thrift_util.h +++ b/be/src/vec/exec/format/parquet/parquet_thrift_util.h @@ -47,8 +47,11 @@ static Status parse_thrift_footer(io::FileReaderSPtr file, // validate magic uint8_t* magic_ptr = footer.data() + bytes_read - 4; - if (bytes_read < PARQUET_FOOTER_SIZE || - memcmp(magic_ptr, PARQUET_VERSION_NUMBER, sizeof(PARQUET_VERSION_NUMBER)) != 0) { + if (bytes_read < PARQUET_FOOTER_SIZE) { + return Status::Corruption( + "Read parquet file footer fail, bytes read: {}, file size: {}, path: 
{}", + bytes_read, file_size, file->path().native()); + } else if (memcmp(magic_ptr, PARQUET_VERSION_NUMBER, sizeof(PARQUET_VERSION_NUMBER)) != 0) { return Status::Corruption( "Invalid magic number in parquet file, bytes read: {}, file size: {}, path: {}, " "read magic: {}", diff --git a/regression-test/data/load_p0/ingestion_load/test_ingestion_load_alter_partition.out b/regression-test/data/load_p0/ingestion_load/test_ingestion_load_alter_partition.out new file mode 100644 index 00000000000000..37d0553e58c3c5 --- /dev/null +++ b/regression-test/data/load_p0/ingestion_load/test_ingestion_load_alter_partition.out @@ -0,0 +1,7 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select -- +2024-09-01 5 +2024-09-02 1 +2024-09-03 1 +2024-09-04 3 + diff --git a/regression-test/suites/load_p0/ingestion_load/test_ingestion_load.groovy b/regression-test/suites/load_p0/ingestion_load/test_ingestion_load.groovy index a6e213bba89f2a..267e54b4960c50 100644 --- a/regression-test/suites/load_p0/ingestion_load/test_ingestion_load.groovy +++ b/regression-test/suites/load_p0/ingestion_load/test_ingestion_load.groovy @@ -21,7 +21,7 @@ import java.nio.file.StandardCopyOption suite('test_ingestion_load', 'p0') { - def testIngestLoadJob = { testTable, loadLabel, String dataFile -> + def testIngestLoadJob = { testTable, loadLabel, String dataFile , filesize -> sql "TRUNCATE TABLE ${testTable}" @@ -85,7 +85,7 @@ suite('test_ingestion_load', 'p0') { "msg": "", "appId": "", "dppResult": "${dppResult}", - "filePathToSize": "{\\"${etlResultFilePath}\\": 81758}", + "filePathToSize": "{\\"${etlResultFilePath}\\": ${filesize}}", "hadoopProperties": "{\\"fs.defaultFS\\":\\"${getHdfsFs()}\\",\\"hadoop.username\\":\\"${getHdfsUser()}\\",\\"hadoop.password\\":\\"${getHdfsPasswd()}\\"}" } }""" @@ -156,7 +156,7 @@ suite('test_ingestion_load', 'p0') { def label = "test_ingestion_load" - testIngestLoadJob.call(tableName, label, context.config.dataPath 
+ '/load_p0/ingestion_load/data.parquet') + testIngestLoadJob.call(tableName, label, context.config.dataPath + '/load_p0/ingestion_load/data.parquet',5745) tableName = 'tbl_test_spark_load_unique_mor' @@ -189,7 +189,7 @@ suite('test_ingestion_load', 'p0') { label = "test_ingestion_load_unique_mor" - testIngestLoadJob.call(tableName, label, context.config.dataPath + '/load_p0/ingestion_load/data.parquet') + testIngestLoadJob.call(tableName, label, context.config.dataPath + '/load_p0/ingestion_load/data.parquet',5745) tableName = 'tbl_test_spark_load_agg' @@ -215,7 +215,7 @@ suite('test_ingestion_load', 'p0') { label = "test_ingestion_load_agg" - testIngestLoadJob.call(tableName, label, context.config.dataPath + '/load_p0/ingestion_load/data1.parquet') + testIngestLoadJob.call(tableName, label, context.config.dataPath + '/load_p0/ingestion_load/data1.parquet',4057) } diff --git a/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_alter_column.groovy b/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_alter_column.groovy index 4a56663d6291ed..8c93b45fd70ed9 100644 --- a/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_alter_column.groovy +++ b/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_alter_column.groovy @@ -85,7 +85,7 @@ suite('test_ingestion_load_alter_column', 'p0') { "msg": "", "appId": "", "dppResult": "${dppResult}", - "filePathToSize": "{\\"${etlResultFilePath}\\": 81758}", + "filePathToSize": "{\\"${etlResultFilePath}\\": 5745}", "hadoopProperties": "{\\"fs.defaultFS\\":\\"${getHdfsFs()}\\",\\"hadoop.username\\":\\"${getHdfsUser()}\\",\\"hadoop.password\\":\\"${getHdfsPasswd()}\\"}" } }""" @@ -112,7 +112,7 @@ suite('test_ingestion_load_alter_column', 'p0') { while (max_try_milli_secs) { result = sql "show load where label = '${loadLabel}'" if (result[0][2] == "CANCELLED") { - msg = result[0][7] + def msg = result[0][7] logger.info("err msg: " + msg) assertTrue((result[0][7] =~ /schema of 
index \[\d+\] has changed/).find()) break @@ -134,6 +134,8 @@ suite('test_ingestion_load_alter_column', 'p0') { try { + sql "DROP TABLE if exists ${tableName1}" + sql "DROP TABLE if exists ${tableName2}" sql """ CREATE TABLE IF NOT EXISTS ${tableName1} ( c_int int(11) NULL, @@ -199,10 +201,7 @@ suite('test_ingestion_load_alter_column', 'p0') { }) } finally { - sql "DROP TABLE ${tableName1}" - sql "DROP TABLE ${tableName2}" - } } -} \ No newline at end of file +} diff --git a/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_alter_partition.groovy b/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_alter_partition.groovy index de91935710294b..68c4c6c0b178cc 100644 --- a/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_alter_partition.groovy +++ b/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_alter_partition.groovy @@ -123,8 +123,8 @@ suite('test_ingestion_load_alter_partition', 'p0') { qt_select "select c1, count(*) from ${testTable} group by c1 order by c1" break } else if (result[0][2] == "CANCELLED") { - msg = result[0][7] - logger.info("err msg: " + msg) + def msg2 = result[0][7] + logger.info("err msg: " + msg2) assertTrue((result[0][7] =~ /partition does not exist/).find()) break } else { @@ -146,6 +146,10 @@ suite('test_ingestion_load_alter_partition', 'p0') { try { + sql "DROP TABLE if exists ${tableName1}" + sql "DROP TABLE if exists ${tableName2}" + sql "DROP TABLE if exists ${tableName3}" + sql """ CREATE TABLE IF NOT EXISTS ${tableName1} ( c0 int not null, @@ -214,11 +218,8 @@ suite('test_ingestion_load_alter_partition', 'p0') { }) } finally { - sql "DROP TABLE ${tableName1}" - sql "DROP TABLE ${tableName2}" - sql "DROP TABLE ${tableName3}" } } -} \ No newline at end of file +} diff --git a/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_drop_table.groovy b/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_drop_table.groovy index 
4f245c3d535b15..870b0a4220a3c3 100644 --- a/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_drop_table.groovy +++ b/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_drop_table.groovy @@ -85,7 +85,7 @@ suite('test_ingestion_load_drop_table', 'p0') { "msg": "", "appId": "", "dppResult": "${dppResult}", - "filePathToSize": "{\\"${etlResultFilePath}\\": 81758}", + "filePathToSize": "{\\"${etlResultFilePath}\\": 5745}", "hadoopProperties": "{\\"fs.defaultFS\\":\\"${getHdfsFs()}\\",\\"hadoop.username\\":\\"${getHdfsUser()}\\",\\"hadoop.password\\":\\"${getHdfsPasswd()}\\"}" } }""" @@ -188,7 +188,6 @@ suite('test_ingestion_load_drop_table', 'p0') { }) } finally { - sql "DROP TABLE ${tableName}" } } diff --git a/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_multi_table.groovy b/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_multi_table.groovy index 67455d8c692cd3..c51608d7af7ccd 100644 --- a/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_multi_table.groovy +++ b/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_multi_table.groovy @@ -103,7 +103,7 @@ suite('test_ingestion_load_multi_table', 'p0') { "msg": "", "appId": "", "dppResult": "${dppResult}", - "filePathToSize": "{\\"${etlResultFilePath1}\\": 81758, \\"${etlResultFilePath2}\\": 81758}", + "filePathToSize": "{\\"${etlResultFilePath1}\\": 5745, \\"${etlResultFilePath2}\\": 5745}", "hadoopProperties": "{\\"fs.defaultFS\\":\\"${getHdfsFs()}\\",\\"hadoop.username\\":\\"${getHdfsUser()}\\",\\"hadoop.password\\":\\"${getHdfsPasswd()}\\"}" } }""" diff --git a/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_with_inverted_index.groovy b/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_with_inverted_index.groovy index 7eed4bfdc58342..121518db92d600 100644 --- a/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_with_inverted_index.groovy +++ 
b/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_with_inverted_index.groovy @@ -85,7 +85,7 @@ suite('test_ingestion_load_with_inverted_index', 'p0') { "msg": "", "appId": "", "dppResult": "${dppResult}", - "filePathToSize": "{\\"${etlResultFilePath}\\": 81758}", + "filePathToSize": "{\\"${etlResultFilePath}\\": 5745}", "hadoopProperties": "{\\"fs.defaultFS\\":\\"${getHdfsFs()}\\",\\"hadoop.username\\":\\"${getHdfsUser()}\\",\\"hadoop.password\\":\\"${getHdfsPasswd()}\\"}" } }""" diff --git a/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_with_partition.groovy b/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_with_partition.groovy index 97ebb7a0761067..82ce9d478c0115 100644 --- a/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_with_partition.groovy +++ b/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_with_partition.groovy @@ -71,7 +71,7 @@ suite('test_ingestion_load_with_partition', 'p0') { } } - etlResultFilePaths = [] + def etlResultFilePaths = [] for(int i=0; i < dataFiles.size(); i++) { Files.copy(Paths.get(dataFiles[i]), Paths.get(context.config.dataPath + "/load_p0/ingestion_load/${resultFileNames[i]}"), StandardCopyOption.REPLACE_EXISTING) @@ -115,7 +115,7 @@ suite('test_ingestion_load_with_partition', 'p0') { max_try_milli_secs = 120000 while (max_try_milli_secs) { - result = sql "show load where label = '${loadLabel}'" + def result = sql "show load where label = '${loadLabel}'" if (result[0][2] == "FINISHED") { sql "sync" qt_select "select c1, count(*) from ${testTable} group by c1 order by c1" @@ -132,9 +132,8 @@ suite('test_ingestion_load_with_partition', 'p0') { } if (enableHdfs()) { - - tableName = 'tbl_test_spark_load_partition' - + def tableName = 'tbl_test_spark_load_with_partition' + sql "DROP TABLE if exists ${tableName}" sql """ CREATE TABLE IF NOT EXISTS ${tableName} ( c0 int not null, @@ -151,10 +150,10 @@ suite('test_ingestion_load_with_partition', 
'p0') { ) """ - def label = "test_ingestion_load_partition" + def label = "test_ingestion_load_with_partition__" testIngestLoadJob.call(tableName, label, [context.config.dataPath + '/load_p0/ingestion_load/data2-0.parquet', context.config.dataPath + '/load_p0/ingestion_load/data2-1.parquet',context.config.dataPath + '/load_p0/ingestion_load/data2-2.parquet',context.config.dataPath + '/load_p0/ingestion_load/data2-3.parquet']) } -} \ No newline at end of file +} From 0899a493fd3b0819344abc830d2d756d892e44e5 Mon Sep 17 00:00:00 2001 From: morningman Date: Thu, 11 Sep 2025 10:58:37 -0700 Subject: [PATCH 2/2] fix --- .../ingestion_load/test_ingestion_load_alter_column.groovy | 1 + 1 file changed, 1 insertion(+) diff --git a/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_alter_column.groovy b/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_alter_column.groovy index 9037d4c715b465..11baee6fe15971 100644 --- a/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_alter_column.groovy +++ b/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_alter_column.groovy @@ -202,6 +202,7 @@ suite('test_ingestion_load_alter_column', 'p0,external') { } finally { + } } }