Skip to content

Conversation

@XLPE
Copy link
Contributor

@XLPE XLPE commented Jun 27, 2025

What problem does this PR solve?

Issue Number: close #xxx

Related PR: #48623 #43262

Problem Summary:
A coredump may occur in the BE when manually canceling a running warm-up task, or when the BE restarts during warm-up and creates a new warm-up task.

*** SIGSEGV address not mapped to object (@0x20) received by PID 3550867 (TID 3559352 OR 0x7f33659ce700) from PID 32; stack trace: ***
 0# doris::signal::(anonymous namespace)::FailureSignalHandler(int, siginfo_t*, void*) at /doris/be/src/common/signal_handler.h:421
 1# PosixSignals::chained_handler(int, siginfo*, void*) [clone .part.0] in /usr/local/jdk-17.0.2/lib/server/libjvm.so
 2# JVM_handle_linux_signal in /usr/local/jdk-17.0.2/lib/server/libjvm.so
 3# 0x00007F3649AC4400 in /lib64/libc.so.6
 4# std::_Sp_counted_base<(__gnu_cxx::_Lock_policy)2>::_M_release() at /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/shared_ptr_base.h:165
 5# std::__shared_count<(__gnu_cxx::_Lock_policy)2>::~__shared_count() at /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/shared_ptr_base.h:703
 6# std::__shared_ptr<doris::JobMeta, (__gnu_cxx::_Lock_policy)2>::~__shared_ptr() at /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/shared_ptr_base.h:1149
 7# std::shared_ptr<doris::JobMeta>::~shared_ptr() at /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/shared_ptr.h:122
 8# void std::destroy_at<std::shared_ptr<doris::JobMeta> >(std::shared_ptr<doris::JobMeta>*) at /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/stl_construct.h:89
 9# void std::_Destroy<std::shared_ptr<doris::JobMeta> >(std::shared_ptr<doris::JobMeta>*) at /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/stl_construct.h:142
10# void std::_Destroy_aux<false>::__destroy<std::shared_ptr<doris::JobMeta>*>(std::shared_ptr<doris::JobMeta>*, std::shared_ptr<doris::JobMeta>*) at /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/stl_construct.h:151
11# void std::_Destroy<std::shared_ptr<doris::JobMeta>*>(std::shared_ptr<doris::JobMeta>*, std::shared_ptr<doris::JobMeta>*) at /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/stl_construct.h:186
12# void std::_Destroy<std::shared_ptr<doris::JobMeta>*, std::shared_ptr<doris::JobMeta> >(std::shared_ptr<doris::JobMeta>*, std::shared_ptr<doris::JobMeta>*, std::allocator<std::shared_ptr<doris::JobMeta> >&) at /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/alloc_traits.h:747
13# std::deque<std::shared_ptr<doris::JobMeta>, std::allocator<std::shared_ptr<doris::JobMeta> > >::_M_destroy_data_aux(std::_Deque_iterator<std::shared_ptr<doris::JobMeta>, std::shared_ptr<doris::JobMeta>&, std::shared_ptr<doris::JobMeta>*>, std::_Deque_iterator<std::shared_ptr<doris::JobMeta>, std::shared_ptr<doris::JobMeta>&, std::shared_ptr<doris::JobMeta>*>) at /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/deque.tcc:874
14# std::deque<std::shared_ptr<doris::JobMeta>, std::allocator<std::shared_ptr<doris::JobMeta> > >::_M_destroy_data(std::_Deque_iterator<std::shared_ptr<doris::JobMeta>, std::shared_ptr<doris::JobMeta>&, std::shared_ptr<doris::JobMeta>*>, std::_Deque_iterator<std::shared_ptr<doris::JobMeta>, std::shared_ptr<doris::JobMeta>&, std::shared_ptr<doris::JobMeta>*>, std::allocator<std::shared_ptr<doris::JobMeta> > const&) at /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/stl_deque.h:2049
15# std::deque<std::shared_ptr<doris::JobMeta>, std::allocator<std::shared_ptr<doris::JobMeta> > >::_M_erase_at_end(std::_Deque_iterator<std::shared_ptr<doris::JobMeta>, std::shared_ptr<doris::JobMeta>&, std::shared_ptr<doris::JobMeta>*>) at /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/stl_deque.h:2065
16# std::deque<std::shared_ptr<doris::JobMeta>, std::allocator<std::shared_ptr<doris::JobMeta> > >::clear() at /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/stl_deque.h:1793
17# doris::CloudWarmUpManager::clear_job(long) at /doris/be/src/cloud/cloud_warm_up_manager.cpp:270
18# doris::CloudBackendService::warm_up_tablets(doris::TWarmUpTabletsResponse&, doris::TWarmUpTabletsRequest const&) at /doris/be/src/cloud/cloud_backend_service.cpp:143
19# doris::BackendServiceProcessor::process_warm_up_tablets(int, apache::thrift::protocol::TProtocol*, apache::thrift::protocol::TProtocol*, void*) at /doris/gensrc/build/gen_cpp/BackendService.cpp:8155
20# doris::BackendServiceProcessor::dispatchCall(apache::thrift::protocol::TProtocol*, apache::thrift::protocol::TProtocol*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, int, void*) at /doris/gensrc/build/gen_cpp/BackendService.cpp:6887
21# apache::thrift::TDispatchProcessor::process(std::shared_ptr<apache::thrift::protocol::TProtocol>, std::shared_ptr<apache::thrift::protocol::TProtocol>, void*) at /root/installed/include/thrift/TDispatchProcessor.h:121
22# apache::thrift::server::TConnectedClient::run() in /data03/doris-master-cloud-02/be/lib/doris_be
23# apache::thrift::server::TThreadedServer::TConnectedClientRunner::run() in /data03/doris-master-cloud-02/be/lib/doris_be
24# apache::thrift::concurrency::Thread::threadMain(std::shared_ptr<apache::thrift::concurrency::Thread>) in /data03/doris-master-cloud-02/be/lib/doris_be
25# void std::__invoke_impl<void, void (*)(std::shared_ptr<apache::thrift::concurrency::Thread>), std::shared_ptr<apache::thrift::concurrency::Thread> >(std::__invoke_other, void (*&&)(std::shared_ptr<apache::thrift::concurrency::Thread>), std::shared_ptr<apache::thrift::concurrency::Thread>&&) in /data03/doris-master-cloud-02/be/lib/doris_be
26# std::__invoke_result<void (*)(std::shared_ptr<apache::thrift::concurrency::Thread>), std::shared_ptr<apache::thrift::concurrency::Thread> >::type std::__invoke<void (*)(std::shared_ptr<apache::thrift::concurrency::Thread>), std::shared_ptr<apache::thrift::concurrency::Thread> >(void (*&&)(std::shared_ptr<apache::thrift::concurrency::Thread>), std::shared_ptr<apache::thrift::concurrency::Thread>&&) in /data03/doris-master-cloud-02/be/lib/doris_be
27# void std::thread::_Invoker<std::tuple<void (*)(std::shared_ptr<apache::thrift::concurrency::Thread>), std::shared_ptr<apache::thrift::concurrency::Thread> > >::_M_invoke<0ul, 1ul>(std::_Index_tuple<0ul, 1ul>) in /data03/doris-master-cloud-02/be/lib/doris_be
28# std::thread::_Invoker<std::tuple<void (*)(std::shared_ptr<apache::thrift::concurrency::Thread>), std::shared_ptr<apache::thrift::concurrency::Thread> > >::operator()() in /data03/doris-master-cloud-02/be/lib/doris_be
29# std::thread::_State_impl<std::thread::_Invoker<std::tuple<void (*)(std::shared_ptr<apache::thrift::concurrency::Thread>), std::shared_ptr<apache::thrift::concurrency::Thread> > > >::_M_run() in /data03/doris-master-cloud-02/be/lib/doris_be
30# execute_native_thread_routine at ../../../../../libstdc++-v3/src/c++11/thread.cc:84
31# start_thread in /lib64/libpthread.so.0
32# clone in /lib64/libc.so.6


The coredump is caused by a data race on _pending_job_metas: the clear_job function clears the deque while another thread simultaneously calls _pending_job_metas.pop_front(). This concurrent modification is undefined behavior and can corrupt the deque's internal state, so any later access to _pending_job_metas may trigger a coredump.

Release note

None

Check List (For Author)

  • Test

    • Regression test
    • Unit Test
    • Manual test (add detailed scripts or steps below)
      Cancel a running warm-up task, or restart the BE during warm-up and then create a new warm-up task.
    • No need to test or manual test. Explain why:
      • This is a refactor/code format and no logic has been changed.
      • Previous test can cover this change.
      • No code files have been changed.
      • Other reason
  • Behavior changed:

    • No.
    • Yes.
  • Does this need documentation?

    • No.
    • Yes.

Check List (For Reviewer who merge this PR)

  • Confirm the release note
  • Confirm test cases
  • Confirm document
  • Add branch pick label

@hello-stephen
Copy link
Contributor

Thank you for your contribution to Apache Doris.
Don't know what should be done next? See How to process your PR.

Please clearly describe your PR:

  1. What problem was fixed (it's best to include specific error reporting information). How it was fixed.
  2. Which behaviors were modified. What was the previous behavior, what is it now, why was it modified, and what possible impacts might there be.
  3. What features were added. Why was this function added?
  4. Which code was refactored and why was this part of the code refactored?
  5. Which functions were optimized and what is the difference before and after the optimization?

@XLPE
Copy link
Contributor Author

XLPE commented Jun 27, 2025

run buildall

@XLPE
Copy link
Contributor Author

XLPE commented Jun 27, 2025

@freemandealer could you please review this?

@doris-robot
Copy link

BE UT Coverage Report

Increment line coverage 0.00% (0/3) 🎉

Increment coverage report
Complete coverage report

Category Coverage
Function Coverage 57.02% (15364/26943)
Line Coverage 46.12% (139408/302295)
Region Coverage 45.44% (70630/155433)
Branch Coverage 40.20% (37307/92804)

@XLPE
Copy link
Contributor Author

XLPE commented Jun 27, 2025

run buildall

@doris-robot
Copy link

TPC-H: Total hot run time: 34316 ms
machine: 'aliyun_ecs.c7a.8xlarge_32C64G'
scripts: https://github.com/apache/doris/tree/master/tools/tpch-tools
Tpch sf100 test result on commit f9ab15ec62c6e0bae8b8c535d4beca3c12cf81b0, data reload: false

------ Round 1 ----------------------------------
q1	17589	5328	5326	5326
q2	1928	298	181	181
q3	10302	1319	717	717
q4	10223	1020	510	510
q5	7642	2400	2355	2355
q6	209	163	135	135
q7	879	737	600	600
q8	9314	1263	1110	1110
q9	7000	5173	5186	5173
q10	6891	2382	2019	2019
q11	482	284	274	274
q12	348	352	215	215
q13	17762	3623	3064	3064
q14	228	230	238	230
q15	535	484	488	484
q16	416	421	388	388
q17	618	881	382	382
q18	7954	7203	7057	7057
q19	1214	947	558	558
q20	335	356	225	225
q21	4058	2583	2340	2340
q22	1005	994	973	973
Total cold run time: 106932 ms
Total hot run time: 34316 ms

----- Round 2, with runtime_filter_mode=off -----
q1	5215	5170	5149	5149
q2	254	336	218	218
q3	2231	2659	2335	2335
q4	1346	1788	1290	1290
q5	4222	4479	4470	4470
q6	210	169	127	127
q7	2012	1928	1759	1759
q8	2603	2496	2519	2496
q9	7104	7231	7223	7223
q10	3175	3299	2847	2847
q11	580	505	494	494
q12	667	754	608	608
q13	3484	3945	3288	3288
q14	273	303	291	291
q15	545	490	475	475
q16	439	481	420	420
q17	1172	1542	1387	1387
q18	7809	7626	7443	7443
q19	835	903	1062	903
q20	1973	2051	1911	1911
q21	5020	4316	4256	4256
q22	1046	1000	991	991
Total cold run time: 52215 ms
Total hot run time: 50381 ms

@doris-robot
Copy link

TPC-DS: Total hot run time: 185702 ms
machine: 'aliyun_ecs.c7a.8xlarge_32C64G'
scripts: https://github.com/apache/doris/tree/master/tools/tpcds-tools
TPC-DS sf100 test result on commit f9ab15ec62c6e0bae8b8c535d4beca3c12cf81b0, data reload: false

query1	1003	381	382	381
query2	6526	1688	1640	1640
query3	6752	214	212	212
query4	26644	24132	23197	23197
query5	4323	573	437	437
query6	295	236	201	201
query7	4625	493	291	291
query8	276	221	226	221
query9	8599	2686	2688	2686
query10	478	319	265	265
query11	15487	15110	14905	14905
query12	158	103	100	100
query13	1640	531	399	399
query14	8520	5721	5716	5716
query15	200	195	168	168
query16	7121	612	464	464
query17	925	690	579	579
query18	1986	399	327	327
query19	189	187	152	152
query20	118	112	112	112
query21	208	122	116	116
query22	3924	4242	4184	4184
query23	34092	33295	33128	33128
query24	8429	2387	2386	2386
query25	540	478	394	394
query26	1242	265	149	149
query27	2767	504	342	342
query28	4347	2151	2142	2142
query29	774	572	479	479
query30	286	215	190	190
query31	905	840	772	772
query32	73	70	64	64
query33	581	371	352	352
query34	796	841	508	508
query35	779	819	750	750
query36	963	970	875	875
query37	109	99	72	72
query38	4071	4070	4027	4027
query39	1466	1414	1421	1414
query40	210	115	104	104
query41	55	57	53	53
query42	128	117	111	111
query43	485	507	474	474
query44	1350	817	812	812
query45	179	172	161	161
query46	835	1018	637	637
query47	1742	1814	1707	1707
query48	388	433	308	308
query49	736	487	381	381
query50	638	712	421	421
query51	4130	4116	4156	4116
query52	110	106	98	98
query53	228	256	190	190
query54	588	579	512	512
query55	85	79	82	79
query56	293	312	273	273
query57	1180	1170	1109	1109
query58	272	259	247	247
query59	2532	2660	2531	2531
query60	346	338	325	325
query61	156	151	148	148
query62	831	714	661	661
query63	228	192	188	188
query64	4453	1111	749	749
query65	4303	4197	4176	4176
query66	1216	411	310	310
query67	15845	15819	15446	15446
query68	8401	897	534	534
query69	479	305	264	264
query70	1168	1123	1104	1104
query71	452	328	303	303
query72	5940	4767	4962	4767
query73	730	653	351	351
query74	9211	9120	8899	8899
query75	3920	3207	2745	2745
query76	3641	1146	717	717
query77	805	375	309	309
query78	10229	10230	9395	9395
query79	1909	813	589	589
query80	638	584	430	430
query81	481	257	224	224
query82	428	131	99	99
query83	246	266	238	238
query84	303	107	81	81
query85	811	435	316	316
query86	333	317	302	302
query87	4434	4457	4373	4373
query88	3425	2293	2302	2293
query89	385	313	285	285
query90	1917	222	212	212
query91	146	140	111	111
query92	77	59	54	54
query93	1177	973	598	598
query94	672	393	291	291
query95	374	303	287	287
query96	487	585	285	285
query97	2727	2754	2638	2638
query98	235	210	207	207
query99	1620	1375	1277	1277
Total cold run time: 273560 ms
Total hot run time: 185702 ms

@doris-robot
Copy link

ClickBench: Total hot run time: 29.39 s
machine: 'aliyun_ecs.c7a.8xlarge_32C64G'
scripts: https://github.com/apache/doris/tree/master/tools/clickbench-tools
ClickBench test result on commit f9ab15ec62c6e0bae8b8c535d4beca3c12cf81b0, data reload: false

query1	0.05	0.04	0.03
query2	0.07	0.05	0.04
query3	0.26	0.07	0.08
query4	1.61	0.11	0.10
query5	0.44	0.43	0.44
query6	1.16	0.65	0.66
query7	0.03	0.02	0.02
query8	0.05	0.04	0.04
query9	0.60	0.52	0.52
query10	0.57	0.58	0.57
query11	0.16	0.11	0.11
query12	0.14	0.11	0.12
query13	0.64	0.61	0.62
query14	0.79	0.81	0.81
query15	0.89	0.88	0.88
query16	0.38	0.41	0.39
query17	1.09	1.04	1.04
query18	0.22	0.21	0.21
query19	1.97	1.82	1.84
query20	0.01	0.02	0.02
query21	15.40	0.90	0.53
query22	0.76	1.05	0.95
query23	14.71	1.34	0.63
query24	6.89	1.55	0.34
query25	0.49	0.24	0.09
query26	0.56	0.17	0.14
query27	0.06	0.05	0.05
query28	9.56	0.93	0.44
query29	12.55	3.95	3.29
query30	0.25	0.09	0.08
query31	2.84	0.58	0.39
query32	3.28	0.57	0.47
query33	3.09	3.08	3.11
query34	16.13	5.48	4.78
query35	4.85	4.86	4.84
query36	0.68	0.52	0.49
query37	0.09	0.07	0.07
query38	0.06	0.04	0.04
query39	0.03	0.03	0.02
query40	0.17	0.14	0.13
query41	0.08	0.02	0.02
query42	0.03	0.03	0.02
query43	0.03	0.04	0.03
Total cold run time: 103.72 s
Total hot run time: 29.39 s

@doris-robot
Copy link

BE UT Coverage Report

Increment line coverage 0.00% (0/3) 🎉

Increment coverage report
Complete coverage report

Category Coverage
Function Coverage 57.02% (15364/26945)
Line Coverage 46.11% (139406/302339)
Region Coverage 45.43% (70621/155446)
Branch Coverage 40.20% (37308/92812)

@github-actions
Copy link
Contributor

We're closing this PR because it hasn't been updated in a while.
This isn't a judgement on the merit of the PR in any way. It's just a way of keeping the PR queue manageable.
If you'd like to revive this PR, please reopen it and feel free to ask a maintainer to remove the Stale tag!

@github-actions github-actions bot added the Stale label Dec 25, 2025
@github-actions github-actions bot closed this Jan 4, 2026
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants