From 374d09fd9f9d31f19405485010e8900982bcf2ee Mon Sep 17 00:00:00 2001 From: zhengyu Date: Tue, 4 Mar 2025 23:33:43 +0800 Subject: [PATCH] [fix](cloud) fix filecache warmup crash due to spurious wakeup (#48623) *** SIGSEGV address not mapped to object (@0x30) received by PID 1379 (TID 1888 OR 0x7fa73d4c8700) from PID 48; stack trace: *** 0# doris::signal::(anonymous namespace)::FailureSignalHandler(int, siginfo_t*, void*) at /home/zcp/repo_center/doris_release/doris/be/src/common/signal_handler.h:421 1# PosixSignals::chained_handler(int, siginfo*, void*) [clone .part.0] in /opt/jdk-17/lib/server/libjvm.so 2# JVM_handle_linux_signal in /opt/jdk-17/lib/server/libjvm.so 3# 0x00007FAB4C4ED400 in /lib64/libc.so.6 4# doris::CloudWarmUpManager::handle_jobs() at /home/zcp/repo_center/doris_release/doris/be/src/cloud/cloud_warm_up_manager.cpp:73 The waiting thread may wake up while _pending_job_metas is still empty, i.e. a spurious wakeup --- be/src/cloud/cloud_warm_up_manager.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/be/src/cloud/cloud_warm_up_manager.cpp b/be/src/cloud/cloud_warm_up_manager.cpp index 06d6df11dc4cc3..58b5711d997b4a 100644 --- a/be/src/cloud/cloud_warm_up_manager.cpp +++ b/be/src/cloud/cloud_warm_up_manager.cpp @@ -66,10 +66,17 @@ void CloudWarmUpManager::handle_jobs() { std::shared_ptr cur_job = nullptr; { std::unique_lock lock(_mtx); - _cond.wait(lock, [this]() { return _closed || !_pending_job_metas.empty(); }); + while (!_closed && _pending_job_metas.empty()) { + _cond.wait(lock); + } if (_closed) break; cur_job = _pending_job_metas.front(); } + + if (!cur_job) { + LOG_WARNING("Warm up job is null"); + continue; + } for (int64_t tablet_id : cur_job->tablet_ids) { if (_cur_job_id == 0) { // The job is canceled break;