From 9e9c197241e51faf888998ebf5981c86feb5e644 Mon Sep 17 00:00:00 2001 From: wangbo Date: Mon, 9 Sep 2024 10:55:02 +0800 Subject: [PATCH 1/4] [Improvement]Add more tip when drop workload group failed (#40468) ## Proposed changes Add more tips to tell the user what to do when dropping a workload group fails. --- .../doris/resource/workloadgroup/WorkloadGroupMgr.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/resource/workloadgroup/WorkloadGroupMgr.java b/fe/fe-core/src/main/java/org/apache/doris/resource/workloadgroup/WorkloadGroupMgr.java index 81a08ad76fecf8..a7ffddbf74ae53 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/resource/workloadgroup/WorkloadGroupMgr.java +++ b/fe/fe-core/src/main/java/org/apache/doris/resource/workloadgroup/WorkloadGroupMgr.java @@ -446,7 +446,10 @@ public void dropWorkloadGroup(DropWorkloadGroupStmt stmt) throws DdlException { // user need to reset user property first Pair ret = Env.getCurrentEnv().getAuth().isWorkloadGroupInUse(workloadGroupName); if (ret.first) { - throw new DdlException("workload group " + workloadGroupName + " is set for user " + ret.second); + throw new DdlException("workload group " + workloadGroupName + " is set for user " + ret.second + + ", you can reset the user's property(eg, " + + "set property for " + ret.second + " 'default_workload_group'='xxx'; ), " + + "then you can drop the group."); } // A group with related policies should not be deleted. From 1e7b2e6c97706b3eee852b21421812fce2622d7d Mon Sep 17 00:00:00 2001 From: wangbo Date: Tue, 10 Sep 2024 11:06:11 +0800 Subject: [PATCH 2/4] [Fix]only publish topic to alive be (#40535) ## Proposed changes Fix a strange core stack that occurs when a BE has not started correctly and the FE sends it a publish topic request. 
--- .../apache/doris/common/publish/TopicPublisherThread.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/publish/TopicPublisherThread.java b/fe/fe-core/src/main/java/org/apache/doris/common/publish/TopicPublisherThread.java index 2407e3a2516b71..86bece8845abd9 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/publish/TopicPublisherThread.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/publish/TopicPublisherThread.java @@ -79,10 +79,13 @@ protected void runAfterCatalogReady() { Collection nodesToPublish = clusterInfoService.getIdToBackend().values(); AckResponseHandler handler = new AckResponseHandler(nodesToPublish); for (Backend be : nodesToPublish) { - executor.submit(new TopicPublishWorker(request, be, handler)); + if (be.isAlive()) { + executor.submit(new TopicPublishWorker(request, be, handler)); + } } try { int timeoutMs = Config.publish_topic_info_interval_ms / 3 * 2; + timeoutMs = timeoutMs <= 0 ? 3000 : timeoutMs; if (!handler.awaitAllInMs(timeoutMs)) { Backend[] backends = handler.pendingNodes(); if (backends.length > 0) { From 52ff29494def59edef6e8b012b70f69b65ab8eb6 Mon Sep 17 00:00:00 2001 From: wangbo Date: Mon, 23 Sep 2024 14:40:12 +0800 Subject: [PATCH 3/4] [Fix]Fix publish may wait timeout because of dead BE (#40763) ```AckResponseHandler``` should only accept alive BEs; otherwise publish may wait until it times out if a dead BE exists. 
--- .../common/publish/TopicPublisherThread.java | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/publish/TopicPublisherThread.java b/fe/fe-core/src/main/java/org/apache/doris/common/publish/TopicPublisherThread.java index 86bece8845abd9..f59693e68ca6be 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/publish/TopicPublisherThread.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/publish/TopicPublisherThread.java @@ -35,7 +35,6 @@ import java.util.ArrayList; import java.util.Arrays; -import java.util.Collection; import java.util.Iterator; import java.util.List; import java.util.Map; @@ -76,12 +75,24 @@ protected void runAfterCatalogReady() { // because it may means workload group/policy is dropped // step 2: publish topic info to all be - Collection nodesToPublish = clusterInfoService.getIdToBackend().values(); + List nodesToPublish = new ArrayList<>(); + try { + for (Backend be : clusterInfoService.getIdToBackend().values()) { + if (be.isAlive()) { + nodesToPublish.add(be); + } + } + } catch (Exception e) { + LOG.warn("get backends failed", e); + return; + } + if (nodesToPublish.isEmpty()) { + LOG.info("no alive backend, skip publish topic"); + return; + } AckResponseHandler handler = new AckResponseHandler(nodesToPublish); for (Backend be : nodesToPublish) { - if (be.isAlive()) { - executor.submit(new TopicPublishWorker(request, be, handler)); - } + executor.submit(new TopicPublishWorker(request, be, handler)); } try { int timeoutMs = Config.publish_topic_info_interval_ms / 3 * 2; From adcdb0a272fe096f39d70ec3577d2843e3952090 Mon Sep 17 00:00:00 2001 From: wangbo Date: Wed, 18 Sep 2024 18:54:27 +0800 Subject: [PATCH 4/4] [Fix]Fix thread num not reset 0 when fetch failed (#40855) --- be/src/util/doris_metrics.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/be/src/util/doris_metrics.cpp b/be/src/util/doris_metrics.cpp index 
165dfd632b9a31..4d68cc6e1f7f9a 100644 --- a/be/src/util/doris_metrics.cpp +++ b/be/src/util/doris_metrics.cpp @@ -335,7 +335,7 @@ void DorisMetrics::_update_process_thread_num() { std::filesystem::directory_iterator dict_iter("/proc/self/task/", ec); if (ec) { LOG(WARNING) << "failed to count thread num: " << ec.message(); - process_fd_num_used->set_value(0); + process_thread_num->set_value(0); return; } int64_t count =