From 9c9cee84134a83646fc2d172e3fa8ef5905cd58e Mon Sep 17 00:00:00 2001 From: zhiqiang-hhhh Date: Thu, 11 Jul 2024 16:14:24 +0800 Subject: [PATCH 1/2] FIX --- be/src/runtime/fragment_mgr.cpp | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/be/src/runtime/fragment_mgr.cpp b/be/src/runtime/fragment_mgr.cpp index 0b6394d2190a25..a1359fa7ddd69a 100644 --- a/be/src/runtime/fragment_mgr.cpp +++ b/be/src/runtime/fragment_mgr.cpp @@ -39,6 +39,7 @@ #include #include +#include #include #include "common/status.h" @@ -896,11 +897,31 @@ void FragmentMgr::cancel_worker() { print_id(q_ctx->query_id())); } } else { - LOG_WARNING( - "Could not find target coordinator {}:{} of query {}, going to " - "cancel it.", - q_ctx->coord_addr.hostname, q_ctx->coord_addr.port, - print_id(q_ctx->query_id())); + // In some rear cases, the rpc port of follower is not updated in time, + // then the port of this follower will be zero, but acutally it is still running, + // and be has already received the query from follower. + // So we need to check if host is in running_fes. + bool fe_host_is_standing = std::any_of( + running_fes.begin(), running_fes.end(), + [&q_ctx](const auto& fe) { + return fe.first.hostname == q_ctx->coord_addr.hostname; + }); + if (fe_host_is_standing) { + LOG_WARNING( + "Coordinator {}:{} is not found, but its host is still " + "running, " + "not going to cancel it.", + q_ctx->coord_addr.hostname, q_ctx->coord_addr.port, + print_id(q_ctx->query_id())); + continue; + } else { + LOG_WARNING( + "Could not find target coordinator {}:{} of query {}, " + "going to " + "cancel it.", + q_ctx->coord_addr.hostname, q_ctx->coord_addr.port, + print_id(q_ctx->query_id())); + } } } // Coordinator of this query has already dead or query context has been released. From 097d51f891a07a955c7b139eed8161f8d67aaf61 Mon Sep 17 00:00:00 2001 From: zhiqiang-hhhh Date: Thu, 11 Jul 2024 20:01:10 +0800 Subject: [PATCH 2/2] FIX --- be/src/runtime/fragment_mgr.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/be/src/runtime/fragment_mgr.cpp b/be/src/runtime/fragment_mgr.cpp index a1359fa7ddd69a..5389bf2b7ec862 100644 --- a/be/src/runtime/fragment_mgr.cpp +++ b/be/src/runtime/fragment_mgr.cpp @@ -904,13 +904,14 @@ void FragmentMgr::cancel_worker() { bool fe_host_is_standing = std::any_of( running_fes.begin(), running_fes.end(), [&q_ctx](const auto& fe) { - return fe.first.hostname == q_ctx->coord_addr.hostname; + return fe.first.hostname == q_ctx->coord_addr.hostname && + fe.first.port == 0; }); if (fe_host_is_standing) { LOG_WARNING( "Coordinator {}:{} is not found, but its host is still " - "running, " - "not going to cancel it.", + "running with an unstable brpc port, not going to cancel " + "it.", q_ctx->coord_addr.hostname, q_ctx->coord_addr.port, print_id(q_ctx->query_id())); continue;