From 6fb7c29a6616ffd21407d184faadec783510c4a6 Mon Sep 17 00:00:00 2001 From: deardeng <565620795@qq.com> Date: Fri, 13 Sep 2024 10:34:31 +0800 Subject: [PATCH 1/2] [Fix](cloud) Fix cluster status inconsistent with bes and add config disable auto --- .../java/org/apache/doris/common/Config.java | 9 +++++++ .../cloud/catalog/CloudClusterChecker.java | 12 ++++++--- .../cloud/system/CloudSystemInfoService.java | 20 +++++++++++--- .../org/apache/doris/system/HeartbeatMgr.java | 4 ++- .../multi_cluster/test_auto_start.groovy | 26 ++++++++++++++++++- 5 files changed, 62 insertions(+), 9 deletions(-) diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java index 97f9cdc70b516a..c94c5d79a74d1c 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java +++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java @@ -3042,6 +3042,15 @@ public static int metaServiceRpcRetryTimes() { @ConfField(mutable = true, description = {"存算分离模式下,当tablet分布的be异常,是否立即映射tablet到新的be上,默认true"}) public static boolean enable_immediate_be_assign = true; + @ConfField(mutable = true, description = {"存算分离模式下是否启用自动启停功能,默认true", + "Whether to enable the automatic start-stop feature in cloud model, default is true."}) + public static boolean enable_auto_start_for_cloud_cluster = true; + + @ConfField(mutable = true, description = {"存算分离模式下自动启停等待cluster唤醒退避重试次数,默认300次大约5分钟", + "The automatic start-stop wait time for cluster wake-up backoff retry count in the cloud " + + "model is set to 300 times, which is approximately 5 minutes by default."}) + public static int auto_start_wait_to_resume_times = 300; + // ATTN: DONOT add any config not related to cloud mode here // ATTN: DONOT add any config not related to cloud mode here // ATTN: DONOT add any config not related to cloud mode here diff --git a/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudClusterChecker.java b/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudClusterChecker.java index 567dc4b31242d4..fd7645bff2901e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudClusterChecker.java +++ b/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudClusterChecker.java @@ -219,7 +219,13 @@ private void checkDiffNode(Map remoteClusterIdToPB, if (LOG.isDebugEnabled()) { LOG.debug("current cluster status {} {}", currentClusterStatus, newClusterStatus); } - if (!currentClusterStatus.equals(newClusterStatus)) { + boolean needChange = false; + Set clusterStatusInMem = cloudSystemInfoService.getClusterStatus(currentBes); + if (clusterStatusInMem.size() != 1) { + LOG.warn("cluster {}, multi be nodes cluster status inconsistent, fix it {}", cid, clusterStatusInMem); + needChange = true; + } + if (!currentClusterStatus.equals(newClusterStatus) || needChange) { // cluster's status changed LOG.info("cluster_status corresponding to cluster_id has been changed," + " cluster_id : {} , current_cluster_status : {}, new_cluster_status :{}", @@ -426,8 +432,8 @@ private void checkCloudFes() { } return nodeMap; }); - LOG.info("diffFrontends nodes: {}, current: {}, toAdd: {}, toDel: {}", - expectedFes, currentFes, toAdd, toDel); + LOG.info("diffFrontends nodes: {}, current: {}, toAdd: {}, toDel: {}, enable auto start: {}", + expectedFes, currentFes, toAdd, toDel, Config.enable_auto_start_for_cloud_cluster); if (toAdd.isEmpty() && toDel.isEmpty()) { if (LOG.isDebugEnabled()) { LOG.debug("runAfterCatalogReady getObserverFes nothing todo"); diff --git a/fe/fe-core/src/main/java/org/apache/doris/cloud/system/CloudSystemInfoService.java b/fe/fe-core/src/main/java/org/apache/doris/cloud/system/CloudSystemInfoService.java index 606f52369e5f7c..0c020a435e516c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/cloud/system/CloudSystemInfoService.java +++ b/fe/fe-core/src/main/java/org/apache/doris/cloud/system/CloudSystemInfoService.java @@ -55,6 +55,7 @@ import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicLong; @@ -567,10 +568,18 @@ public String getCloudStatusById(final String clusterId) { } } + public Set getClusterStatus(List backends) { + // ATTN: found bug, In the same cluster, the cluster status in the tags of BE nodes is inconsistent. + // Using a set to collect the cluster statuses from the BE nodes. + return backends.stream().map(Backend::getCloudClusterStatus).collect(Collectors.toSet()); + } + public String getCloudStatusByIdNoLock(final String clusterId) { - return clusterIdToBackend.getOrDefault(clusterId, new ArrayList<>()) - .stream().map(Backend::getCloudClusterStatus).findFirst() - .orElse(String.valueOf(Cloud.ClusterStatus.UNKNOWN)); + List bes = clusterIdToBackend.getOrDefault(clusterId, new ArrayList<>()); + Optional hasNormal = bes.stream().map(Backend::getCloudClusterStatus) + .filter(status -> status.equals(String.valueOf(Cloud.ClusterStatus.NORMAL))).findAny(); + return hasNormal.orElseGet(() -> bes.stream().map(Backend::getCloudClusterStatus).findFirst() + .orElse(String.valueOf(Cloud.ClusterStatus.UNKNOWN))); } public void updateClusterNameToId(final String newName, @@ -949,6 +958,9 @@ public String waitForAutoStart(String clusterName) throws DdlException { if (Config.isNotCloudMode()) { return null; } + if (!Config.enable_auto_start_for_cloud_cluster) { + return null; + } clusterName = getClusterNameAutoStart(clusterName); if (Strings.isNullOrEmpty(clusterName)) { LOG.warn("auto start in cloud mode, but clusterName empty {}", clusterName); @@ -999,7 +1011,7 @@ public String waitForAutoStart(String clusterName) throws DdlException { } } // wait 5 mins - int retryTimes = 5 * 60; + int retryTimes = Config.auto_start_wait_to_resume_times < 0 ? 300 : Config.auto_start_wait_to_resume_times; int retryTime = 0; StopWatch stopWatch = new StopWatch(); stopWatch.start(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/system/HeartbeatMgr.java b/fe/fe-core/src/main/java/org/apache/doris/system/HeartbeatMgr.java index 5dd8dd9fca1ca0..ee85a2422399c7 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/system/HeartbeatMgr.java +++ b/fe/fe-core/src/main/java/org/apache/doris/system/HeartbeatMgr.java @@ -99,7 +99,9 @@ public void setMaster(int clusterId, String token, long epoch) { // Set cloud_instance_id and meta_service_endpoint even if there are empty // Be can knowns that fe is working in cloud mode. // Set the cloud instance ID for cloud deployment identification - tMasterInfo.setCloudInstanceId(Config.cloud_instance_id); + if (!Strings.isNullOrEmpty(Config.cloud_instance_id)) { + tMasterInfo.setCloudInstanceId(Config.cloud_instance_id); + } // Set the endpoint for the metadata service in cloud mode tMasterInfo.setMetaServiceEndpoint(Config.meta_service_endpoint); } diff --git a/regression-test/suites/cloud_p0/multi_cluster/test_auto_start.groovy b/regression-test/suites/cloud_p0/multi_cluster/test_auto_start.groovy index 2ce9a9d8f4b531..b5fa9ab4bce122 100644 --- a/regression-test/suites/cloud_p0/multi_cluster/test_auto_start.groovy +++ b/regression-test/suites/cloud_p0/multi_cluster/test_auto_start.groovy @@ -22,7 +22,7 @@ import org.awaitility.Awaitility; import org.apache.doris.regression.util.Http import static java.util.concurrent.TimeUnit.SECONDS; -suite('test_auto_start_in_cloud', 'multi_cluster') { +suite('test_auto_start_in_cloud', 'multi_cluster, docker') { if (!isCloudMode()) { return; } @@ -168,5 +168,29 @@ suite('test_auto_start_in_cloud', 'multi_cluster') { future1.get() future2.get() + + tag = getCloudBeTagByName(clusterName) + logger.info("tag check = {}", tag) + jsonObject = jsonSlurper.parseText(tag) + String cluster_status = jsonObject.cloud_cluster_status + assertEquals("NORMAL", cluster_status) + + // add 1 nodes, check it status NORMAL + cluster.addBackend(1, null) + dockerAwaitUntil(5) { + def result = sql """SHOW BACKENDS""" + result.size() == 4 + } + + def bes = sql_return_maparray "SHOW BACKENDS" + bes.each { + tag = it.Tag + if (!tag.contains(clusterName)) { + return + } + jsonObject = jsonSlurper.parseText(tag) + String cluster_status = jsonObject.cloud_cluster_status + assertEquals("NORMAL", cluster_status) + } } } From 2c035bb3c5038ecb47b5bfd744c104aa4cd74fce Mon Sep 17 00:00:00 2001 From: deardeng <565620795@qq.com> Date: Fri, 13 Sep 2024 20:15:05 +0800 Subject: [PATCH 2/2] fix --- .../apache/doris/cloud/catalog/CloudClusterChecker.java | 8 +++++++- .../apache/doris/cloud/system/CloudSystemInfoService.java | 2 +- .../src/main/java/org/apache/doris/system/Backend.java | 2 +- .../suites/cloud_p0/multi_cluster/test_auto_start.groovy | 4 ++-- 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudClusterChecker.java b/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudClusterChecker.java index fd7645bff2901e..0dfcf322a0cbb9 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudClusterChecker.java +++ b/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudClusterChecker.java @@ -220,7 +220,13 @@ private void checkDiffNode(Map remoteClusterIdToPB, LOG.debug("current cluster status {} {}", currentClusterStatus, newClusterStatus); } boolean needChange = false; - Set clusterStatusInMem = cloudSystemInfoService.getClusterStatus(currentBes); + // ATTN: found bug, In the same cluster, the cluster status in the tags of BE nodes is inconsistent. + // Using a set to collect the cluster statuses from the BE nodes. + Set clusterStatusInMem = new HashSet<>(); + for (Backend backend : currentBes) { + String beClusterStatus = backend.getTagMap().get(Tag.CLOUD_CLUSTER_STATUS); + clusterStatusInMem.add(beClusterStatus == null ? "NOT_SET" : beClusterStatus); + } if (clusterStatusInMem.size() != 1) { LOG.warn("cluster {}, multi be nodes cluster status inconsistent, fix it {}", cid, clusterStatusInMem); needChange = true; diff --git a/fe/fe-core/src/main/java/org/apache/doris/cloud/system/CloudSystemInfoService.java b/fe/fe-core/src/main/java/org/apache/doris/cloud/system/CloudSystemInfoService.java index 0c020a435e516c..a91892870d67a3 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/cloud/system/CloudSystemInfoService.java +++ b/fe/fe-core/src/main/java/org/apache/doris/cloud/system/CloudSystemInfoService.java @@ -579,7 +579,7 @@ public String getCloudStatusByIdNoLock(final String clusterId) { Optional hasNormal = bes.stream().map(Backend::getCloudClusterStatus) .filter(status -> status.equals(String.valueOf(Cloud.ClusterStatus.NORMAL))).findAny(); return hasNormal.orElseGet(() -> bes.stream().map(Backend::getCloudClusterStatus).findFirst() - .orElse(String.valueOf(Cloud.ClusterStatus.UNKNOWN))); + .orElse(String.valueOf(Cloud.ClusterStatus.NORMAL))); } public void updateClusterNameToId(final String newName, diff --git a/fe/fe-core/src/main/java/org/apache/doris/system/Backend.java b/fe/fe-core/src/main/java/org/apache/doris/system/Backend.java index 876e6ca40b4ef2..01bf800e97ebb8 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/system/Backend.java +++ b/fe/fe-core/src/main/java/org/apache/doris/system/Backend.java @@ -188,7 +188,7 @@ public Backend(long id, String host, int heartbeatPort) { } public String getCloudClusterStatus() { - return tagMap.getOrDefault(Tag.CLOUD_CLUSTER_STATUS, String.valueOf(Cloud.ClusterStatus.UNKNOWN)); + return tagMap.getOrDefault(Tag.CLOUD_CLUSTER_STATUS, String.valueOf(Cloud.ClusterStatus.NORMAL)); } public void setCloudClusterStatus(final String clusterStatus) { diff --git a/regression-test/suites/cloud_p0/multi_cluster/test_auto_start.groovy b/regression-test/suites/cloud_p0/multi_cluster/test_auto_start.groovy index b5fa9ab4bce122..d6db6364d38af7 100644 --- a/regression-test/suites/cloud_p0/multi_cluster/test_auto_start.groovy +++ b/regression-test/suites/cloud_p0/multi_cluster/test_auto_start.groovy @@ -178,7 +178,7 @@ suite('test_auto_start_in_cloud', 'multi_cluster, docker') { // add 1 nodes, check it status NORMAL cluster.addBackend(1, null) dockerAwaitUntil(5) { - def result = sql """SHOW BACKENDS""" + result = sql """SHOW BACKENDS""" result.size() == 4 } @@ -189,7 +189,7 @@ suite('test_auto_start_in_cloud', 'multi_cluster, docker') { return } jsonObject = jsonSlurper.parseText(tag) - String cluster_status = jsonObject.cloud_cluster_status + cluster_status = jsonObject.cloud_cluster_status assertEquals("NORMAL", cluster_status) } }