From 96d71a802a44d046cdb0672e00c224abdf9d8c47 Mon Sep 17 00:00:00 2001 From: Anastasia Kostryukova Date: Thu, 24 Apr 2025 11:27:06 +0300 Subject: [PATCH 1/7] Change default hdds.scm.safemode.min.datanode to 3 --- .../src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java index 20f21ce5caf4..4f134b1de8a3 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java @@ -80,7 +80,7 @@ public final class HddsConfigKeys { public static final boolean HDDS_SCM_SAFEMODE_ENABLED_DEFAULT = true; public static final String HDDS_SCM_SAFEMODE_MIN_DATANODE = "hdds.scm.safemode.min.datanode"; - public static final int HDDS_SCM_SAFEMODE_MIN_DATANODE_DEFAULT = 1; + public static final int HDDS_SCM_SAFEMODE_MIN_DATANODE_DEFAULT = 3; public static final String HDDS_SCM_WAIT_TIME_AFTER_SAFE_MODE_EXIT = From d23c395313702ee1b521f908229aed5564935a73 Mon Sep 17 00:00:00 2001 From: Anastasia Kostryukova Date: Fri, 25 Apr 2025 15:46:00 +0300 Subject: [PATCH 2/7] Add warn log, fix doc --- .../src/main/resources/ozone-default.xml | 2 +- .../concept/StorageContainerManager.md | 4 ++-- .../scm/safemode/DataNodeSafeModeRule.java | 15 ++++++++++++- .../safemode/TestDataNodeSafeModeRule.java | 21 +++++++++++++++++++ 4 files changed, 38 insertions(+), 4 deletions(-) diff --git a/hadoop-hdds/common/src/main/resources/ozone-default.xml b/hadoop-hdds/common/src/main/resources/ozone-default.xml index ccd04eb8404c..2c53d1a287a2 100644 --- a/hadoop-hdds/common/src/main/resources/ozone-default.xml +++ b/hadoop-hdds/common/src/main/resources/ozone-default.xml @@ -1637,7 +1637,7 @@ hdds.scm.safemode.min.datanode - 1 + 3 HDDS,SCM,OPERATION Minimum DataNodes which should be registered to get SCM out of safe mode. diff --git a/hadoop-hdds/docs/content/concept/StorageContainerManager.md b/hadoop-hdds/docs/content/concept/StorageContainerManager.md index 3b2f4d8f9d73..48d509016a68 100644 --- a/hadoop-hdds/docs/content/concept/StorageContainerManager.md +++ b/hadoop-hdds/docs/content/concept/StorageContainerManager.md @@ -93,6 +93,6 @@ key | default | description ----|---------|------------ ozone.scm.container.size | 5GB | Default container size used by Ozone ozone.scm.block.size | 256MB | The default size of a data block. -hdds.scm.safemode.min.datanode | 1 | Minimum number of datanodes to start the real work. +hdds.scm.safemode.min.datanode | 3 | Minimum number of datanodes to start the real work. ozone.scm.http-address | 0.0.0.0:9876 | HTTP address of the SCM server -ozone.metadata.dirs | none | Directory to store persisted data (RocksDB). \ No newline at end of file +ozone.metadata.dirs | none | Directory to store persisted data (RocksDB). diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/DataNodeSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/DataNodeSafeModeRule.java index 081a9d02308d..813720522523 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/DataNodeSafeModeRule.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/DataNodeSafeModeRule.java @@ -27,6 +27,8 @@ import org.apache.hadoop.hdds.scm.server.SCMDatanodeProtocolServer.NodeRegistrationContainerReport; import org.apache.hadoop.hdds.server.events.EventQueue; import org.apache.hadoop.hdds.server.events.TypedEvent; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; /** * Class defining Safe mode exit criteria according to number of DataNodes @@ -36,6 +38,7 @@ public class DataNodeSafeModeRule extends SafeModeExitRule { private static final String NAME = "DataNodeSafeModeRule"; + private static final Logger LOG = LogManager.getLogger(NAME); // Min DataNodes required to exit safe mode. private int requiredDns; @@ -66,7 +69,17 @@ protected boolean validate() { if (validateBasedOnReportProcessing()) { return registeredDns >= requiredDns; } - return nodeManager.getNodes(NodeStatus.inServiceHealthy()).size() >= requiredDns; + + int healthyCount = nodeManager.getNodes(NodeStatus.inServiceHealthy()).size(); + int healthyReadOnlyCount = nodeManager.getNodes(NodeStatus.inServiceHealthyReadOnly()).size(); + int staleCount = nodeManager.getNodes(NodeStatus.inServiceStale()).size(); + + if (healthyCount + healthyReadOnlyCount + staleCount == 1) { + LOG.warn("Only one Datanode is available in the cluster. " + + "Consider setting 'hdds.scm.safemode.min.datanode=1' in the configuration."); + } + + return healthyCount >= requiredDns; } @Override diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestDataNodeSafeModeRule.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestDataNodeSafeModeRule.java index 847da184fab0..06e8dd2bd846 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestDataNodeSafeModeRule.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestDataNodeSafeModeRule.java @@ -153,6 +153,8 @@ public void testDataNodeSafeModeRuleWithNodeManager() throws Exception { rule.setValidateBasedOnReportProcessing(false); when(nodeManager.getNodes(NodeStatus.inServiceHealthy())).thenReturn(new ArrayList<>()); + when(nodeManager.getNodes(NodeStatus.inServiceHealthyReadOnly())).thenReturn(new ArrayList<>()); + when(nodeManager.getNodes(NodeStatus.inServiceStale())).thenReturn(new ArrayList<>()); assertFalse(rule.validate()); @@ -169,4 +171,23 @@ public void testDataNodeSafeModeRuleWithNodeManager() throws Exception { verify(nodeManager, times(2)).getNodes(NodeStatus.inServiceHealthy()); } + + @Test + public void testDataNodeSafeModeRuleWithSingleDataNode() throws Exception { + int requiredDns = 2; + setup(requiredDns); + + rule.setValidateBasedOnReportProcessing(false); + + List singleHealthy = new ArrayList<>(); + singleHealthy.add(MockDatanodeDetails.randomDatanodeDetails()); + + when(nodeManager.getNodes(NodeStatus.inServiceHealthy())).thenReturn(singleHealthy); + when(nodeManager.getNodes(NodeStatus.inServiceHealthyReadOnly())).thenReturn(new ArrayList<>()); + when(nodeManager.getNodes(NodeStatus.inServiceStale())).thenReturn(new ArrayList<>()); + + assertFalse(rule.validate()); + + verify(nodeManager, times(1)).getNodes(NodeStatus.inServiceHealthy()); + } } From f4b32185e24d9f9abe75cd1f23a5b3294878d158 Mon Sep 17 00:00:00 2001 From: Anastasia Kostryukova Date: Fri, 25 Apr 2025 21:26:38 +0300 Subject: [PATCH 3/7] Fix doc --- hadoop-hdds/docs/content/concept/StorageContainerManager.zh.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hadoop-hdds/docs/content/concept/StorageContainerManager.zh.md b/hadoop-hdds/docs/content/concept/StorageContainerManager.zh.md index 7adecdee0290..0aca2055d800 100644 --- a/hadoop-hdds/docs/content/concept/StorageContainerManager.zh.md +++ b/hadoop-hdds/docs/content/concept/StorageContainerManager.zh.md @@ -81,6 +81,6 @@ SCM 负责创建 Ozone 集群。当通过 `init` 命令启动 SCM 时,SCM 将 ----|---------|------------ ozone.scm.container.size | 5GB | Ozone 使用的默认容器的大小 ozone.scm.block.size | 256MB | 数据块的默认大小 -hdds.scm.safemode.min.datanode | 1 | 能够启动实际工作所需的最小数据节点数 +hdds.scm.safemode.min.datanode | 3 | 能够启动实际工作所需的最小数据节点数 ozone.scm.http-address | 0.0.0.0:9876 | SCM 服务端使用的 HTTP 地址 ozone.metadata.dirs | none | 存储持久化数据的目录(RocksDB) From 0be4df6776993fec406283ddbe19a7bff64d7b3d Mon Sep 17 00:00:00 2001 From: Anastasia Kostryukova Date: Mon, 28 Apr 2025 10:42:07 +0300 Subject: [PATCH 4/7] Fix tests and config --- .../hadoop/hdds/scm/safemode/DataNodeSafeModeRule.java | 6 +++--- .../hdds/scm/safemode/TestHealthyPipelineSafeModeRule.java | 1 + .../hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java | 1 + .../dist/src/main/compose/ozonescripts/docker-config | 1 + 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/DataNodeSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/DataNodeSafeModeRule.java index 813720522523..a94a67d6846e 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/DataNodeSafeModeRule.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/DataNodeSafeModeRule.java @@ -27,8 +27,8 @@ import org.apache.hadoop.hdds.scm.server.SCMDatanodeProtocolServer.NodeRegistrationContainerReport; import org.apache.hadoop.hdds.server.events.EventQueue; import org.apache.hadoop.hdds.server.events.TypedEvent; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Class defining Safe mode exit criteria according to number of DataNodes @@ -38,7 +38,7 @@ public class DataNodeSafeModeRule extends SafeModeExitRule { private static final String NAME = "DataNodeSafeModeRule"; - private static final Logger LOG = LogManager.getLogger(NAME); + private static final Logger LOG = LoggerFactory.getLogger(DataNodeSafeModeRule.class); // Min DataNodes required to exit safe mode. private int requiredDns; diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestHealthyPipelineSafeModeRule.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestHealthyPipelineSafeModeRule.java index aa71edcc68f9..1d2061daf9bc 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestHealthyPipelineSafeModeRule.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestHealthyPipelineSafeModeRule.java @@ -73,6 +73,7 @@ public void testHealthyPipelineSafeModeRuleWithNoPipelines() ContainerManager containerManager = mock(ContainerManager.class); when(containerManager.getContainers()).thenReturn(containers); config.set(HddsConfigKeys.OZONE_METADATA_DIRS, tempFile.getPath()); + config.setInt(HddsConfigKeys.HDDS_SCM_SAFEMODE_MIN_DATANODE, 0); // enable pipeline check config.setBoolean( HddsConfigKeys.HDDS_SCM_SAFEMODE_PIPELINE_CREATION, false); diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java index 7a43792eb5db..38952b1d9477 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java @@ -105,6 +105,7 @@ public void setUp() throws IOException { config.setBoolean(HddsConfigKeys.HDDS_SCM_SAFEMODE_PIPELINE_CREATION, false); config.set(HddsConfigKeys.OZONE_METADATA_DIRS, tempDir.getAbsolutePath()); + config.setInt(HddsConfigKeys.HDDS_SCM_SAFEMODE_MIN_DATANODE, 1); scmMetadataStore = new SCMMetadataStoreImpl(config); } diff --git a/hadoop-ozone/dist/src/main/compose/ozonescripts/docker-config b/hadoop-ozone/dist/src/main/compose/ozonescripts/docker-config index adfaeb287d0d..6c0262870aac 100644 --- a/hadoop-ozone/dist/src/main/compose/ozonescripts/docker-config +++ b/hadoop-ozone/dist/src/main/compose/ozonescripts/docker-config @@ -24,6 +24,7 @@ OZONE-SITE.XML_ozone.scm.block.client.address=scm OZONE-SITE.XML_ozone.metadata.dirs=/data/metadata OZONE-SITE.XML_ozone.scm.client.address=scm OZONE-SITE.XML_ozone.server.default.replication=1 +OZONE-SITE.XML_hdds.scm.safemode.min.datanode=1 OZONE-SITE.XML_hdds.datanode.dir=/data/hdds OZONE-SITE.XML_hdds.datanode.volume.min.free.space=100MB OZONE-SITE.XML_hdds.scmclient.max.retry.timeout=30s From f683f37a5dbb20a2b114997875e1ed5dd314fad6 Mon Sep 17 00:00:00 2001 From: Anastasia Kostryukova Date: Wed, 21 May 2025 11:26:25 +0300 Subject: [PATCH 5/7] Delete warn message, revert changes --- .../hdds/scm/safemode/DataNodeSafeModeRule.java | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/DataNodeSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/DataNodeSafeModeRule.java index a94a67d6846e..57c83fa546e2 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/DataNodeSafeModeRule.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/DataNodeSafeModeRule.java @@ -69,17 +69,7 @@ protected boolean validate() { if (validateBasedOnReportProcessing()) { return registeredDns >= requiredDns; } - - int healthyCount = nodeManager.getNodes(NodeStatus.inServiceHealthy()).size(); - int healthyReadOnlyCount = nodeManager.getNodes(NodeStatus.inServiceHealthyReadOnly()).size(); - int staleCount = nodeManager.getNodes(NodeStatus.inServiceStale()).size(); - - if (healthyCount + healthyReadOnlyCount + staleCount == 1) { - LOG.warn("Only one Datanode is available in the cluster. " + - "Consider setting 'hdds.scm.safemode.min.datanode=1' in the configuration."); - } - - return healthyCount >= requiredDns; + return nodeManager.getNodes(NodeStatus.inServiceHealthy()).size() >= requiredDns; } @Override From 3efcf2608191669a459a95faf55cf7e2b2a187e5 Mon Sep 17 00:00:00 2001 From: Anastasia Kostryukova Date: Wed, 21 May 2025 11:28:18 +0300 Subject: [PATCH 6/7] Clean up --- .../apache/hadoop/hdds/scm/safemode/DataNodeSafeModeRule.java | 3 --- 1 file changed, 3 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/DataNodeSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/DataNodeSafeModeRule.java index 57c83fa546e2..081a9d02308d 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/DataNodeSafeModeRule.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/DataNodeSafeModeRule.java @@ -27,8 +27,6 @@ import org.apache.hadoop.hdds.scm.server.SCMDatanodeProtocolServer.NodeRegistrationContainerReport; import org.apache.hadoop.hdds.server.events.EventQueue; import org.apache.hadoop.hdds.server.events.TypedEvent; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; /** * Class defining Safe mode exit criteria according to number of DataNodes @@ -38,7 +36,6 @@ public class DataNodeSafeModeRule extends SafeModeExitRule { private static final String NAME = "DataNodeSafeModeRule"; - private static final Logger LOG = LoggerFactory.getLogger(DataNodeSafeModeRule.class); // Min DataNodes required to exit safe mode. private int requiredDns; From e53e08da6f561f579c71ba403cd95cb13d49b3f1 Mon Sep 17 00:00:00 2001 From: Anastasia Kostryukova Date: Wed, 21 May 2025 11:32:17 +0300 Subject: [PATCH 7/7] Delete unused test, revert changes --- .../safemode/TestDataNodeSafeModeRule.java | 21 ------------------- 1 file changed, 21 deletions(-) diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestDataNodeSafeModeRule.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestDataNodeSafeModeRule.java index 06e8dd2bd846..847da184fab0 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestDataNodeSafeModeRule.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestDataNodeSafeModeRule.java @@ -153,8 +153,6 @@ public void testDataNodeSafeModeRuleWithNodeManager() throws Exception { rule.setValidateBasedOnReportProcessing(false); when(nodeManager.getNodes(NodeStatus.inServiceHealthy())).thenReturn(new ArrayList<>()); - when(nodeManager.getNodes(NodeStatus.inServiceHealthyReadOnly())).thenReturn(new ArrayList<>()); - when(nodeManager.getNodes(NodeStatus.inServiceStale())).thenReturn(new ArrayList<>()); assertFalse(rule.validate()); @@ -171,23 +169,4 @@ public void testDataNodeSafeModeRuleWithNodeManager() throws Exception { verify(nodeManager, times(2)).getNodes(NodeStatus.inServiceHealthy()); } - - @Test - public void testDataNodeSafeModeRuleWithSingleDataNode() throws Exception { - int requiredDns = 2; - setup(requiredDns); - - rule.setValidateBasedOnReportProcessing(false); - - List singleHealthy = new ArrayList<>(); - singleHealthy.add(MockDatanodeDetails.randomDatanodeDetails()); - - when(nodeManager.getNodes(NodeStatus.inServiceHealthy())).thenReturn(singleHealthy); - when(nodeManager.getNodes(NodeStatus.inServiceHealthyReadOnly())).thenReturn(new ArrayList<>()); - when(nodeManager.getNodes(NodeStatus.inServiceStale())).thenReturn(new ArrayList<>()); - - assertFalse(rule.validate()); - - verify(nodeManager, times(1)).getNodes(NodeStatus.inServiceHealthy()); - } }