From dc5bf15c39907aea28ba16d36e7a6da1ba29bba9 Mon Sep 17 00:00:00 2001 From: Nandakumar Vadivelu Date: Sun, 2 Feb 2025 10:37:42 +0530 Subject: [PATCH 1/4] HDDS-12031. Enable Ratis by default on an upgraded cluster during SCM start-up. --- .../scm/server/StorageContainerManager.java | 90 +++++++++---------- 1 file changed, 41 insertions(+), 49 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java index b6c41c7dcf0f..ea7d029fdbce 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java @@ -385,6 +385,13 @@ private StorageContainerManager(OzoneConfiguration conf, "failure.", ResultCodes.SCM_NOT_INITIALIZED); } + // Initialize Ratis if needed. + // This is for the clusters which got upgraded from older version of Ozone. + // We enable Ratis by default. 
+ if (!scmStorageConfig.isSCMHAEnabled()) { + initializeRatis(conf, scmStorageConfig); + } + threadNamePrefix = getScmNodeDetails().threadNamePrefix(); primaryScmNodeId = scmStorageConfig.getPrimaryScmNodeId(); @@ -1255,15 +1262,12 @@ public static boolean scmInit(OzoneConfiguration conf, StorageState state = scmStorageConfig.getState(); final SCMHANodeDetails haDetails = SCMHANodeDetails.loadSCMHAConfig(conf, scmStorageConfig); - String primordialSCM = SCMHAUtils.getPrimordialSCM(conf); final String selfNodeId = haDetails.getLocalNodeDetails().getNodeId(); final String selfHostName = haDetails.getLocalNodeDetails().getHostName(); - if (primordialSCM != null && SCMHAUtils.isSCMHAEnabled(conf) - && !SCMHAUtils.isPrimordialSCM(conf, selfNodeId, selfHostName)) { - LOG.info( - "SCM init command can only be executed in Primordial SCM {}, " - + "self id {} " - + "Ignoring it.", primordialSCM, selfNodeId); + if (!SCMHAUtils.isPrimordialSCM(conf, selfNodeId, selfHostName)) { + final String primordialSCM = SCMHAUtils.getPrimordialSCM(conf); + LOG.info("SCM init command can only be executed on Primordial SCM. " + + "Primordial SCM ID: {}. Self ID: {}.", primordialSCM, selfNodeId); return true; } if (state != StorageState.INITIALIZED) { @@ -1294,15 +1298,7 @@ public static boolean scmInit(OzoneConfiguration conf, scmStorageConfig.setPrimaryScmNodeId(scmStorageConfig.getScmId()); scmStorageConfig.initialize(); - if (SCMHAUtils.isSCMHAEnabled(conf)) { - SCMRatisServerImpl.initialize(scmStorageConfig.getClusterID(), - scmStorageConfig.getScmId(), haDetails.getLocalNodeDetails(), - conf); - scmStorageConfig = new SCMStorageConfig(conf); - scmStorageConfig.setSCMHAFlag(true); - // Do force initialize to persist SCM_HA flag. - scmStorageConfig.forceInitialize(); - } + initializeRatis(conf, scmStorageConfig); LOG.info("SCM initialization succeeded. 
Current cluster id for sd={}" + "; cid={}; layoutVersion={}; scmId={}", @@ -1326,48 +1322,44 @@ public static boolean scmInit(OzoneConfiguration conf, layoutVersionManager.close(); } - clusterId = scmStorageConfig.getClusterID(); - final boolean isSCMHAEnabled = scmStorageConfig.isSCMHAEnabled(); - // Initialize security if security is enabled later. initializeSecurityIfNeeded(conf, scmStorageConfig, selfHostName, true); - if (SCMHAUtils.isSCMHAEnabled(conf) && !isSCMHAEnabled) { - SCMRatisServerImpl.initialize(scmStorageConfig.getClusterID(), - scmStorageConfig.getScmId(), haDetails.getLocalNodeDetails(), - conf); - scmStorageConfig.setSCMHAFlag(true); - scmStorageConfig.setPrimaryScmNodeId(scmStorageConfig.getScmId()); - scmStorageConfig.forceInitialize(); - - /* - * Since Ratis is initialized on an existing cluster, we have to - * trigger Ratis snapshot so that this SCM can send the latest scm.db - * to the bootstrapping SCMs later. - */ - - try { - SCMHAUtils.setRatisEnabled(true); - StorageContainerManager scm = createSCM(conf); - scm.start(); - scm.getScmHAManager().getRatisServer().triggerSnapshot(); - scm.stop(); - scm.join(); - } catch (AuthenticationException e) { - throw new IOException(e); - } - LOG.info("Enabled SCM HA"); - } - LOG.info("SCM already initialized. 
Reusing existing cluster id for sd={}" + ";cid={}; layoutVersion={}; HAEnabled={}", - scmStorageConfig.getStorageDir(), clusterId, - scmStorageConfig.getLayoutVersion(), - scmStorageConfig.isSCMHAEnabled()); + scmStorageConfig.getStorageDir(), scmStorageConfig.getClusterID(), + scmStorageConfig.getLayoutVersion(), scmStorageConfig.isSCMHAEnabled()); return true; } } + private static void initializeRatis(OzoneConfiguration conf, SCMStorageConfig storageConfig) + throws IOException { + final SCMHANodeDetails haDetails = SCMHANodeDetails.loadSCMHAConfig(conf, storageConfig); + SCMRatisServerImpl.initialize(storageConfig.getClusterID(), + storageConfig.getScmId(), haDetails.getLocalNodeDetails(), conf); + storageConfig.setSCMHAFlag(true); + storageConfig.setPrimaryScmNodeId(storageConfig.getScmId()); + storageConfig.forceInitialize(); + LOG.info("Enabled Ratis!"); + + /* + * Since Ratis can be initialized on an existing cluster, we have to + * trigger Ratis snapshot so that this SCM can send the latest scm.db + * to the bootstrapping SCMs later. 
+ */ + try { + SCMHAUtils.setRatisEnabled(true); + StorageContainerManager scm = createSCM(conf); + scm.start(); + scm.getScmHAManager().getRatisServer().triggerSnapshot(); + scm.stop(); + scm.join(); + } catch (AuthenticationException e) { + throw new IOException(e); + } + } + private static InetSocketAddress getScmAddress(SCMHANodeDetails haDetails, ConfigurationSource conf) throws IOException { List scmNodeInfoList = SCMNodeInfo.buildNodeInfo( From 2c278b15eb0826a1648ae5140735fa6a02d94354 Mon Sep 17 00:00:00 2001 From: Nandakumar Vadivelu Date: Thu, 6 Feb 2025 19:58:20 +0530 Subject: [PATCH 2/4] fixed unit test failures --- .../scm/server/StorageContainerManager.java | 54 +++++++++++-------- .../hadoop/ozone/MiniOzoneHAClusterImpl.java | 12 +++-- 2 files changed, 39 insertions(+), 27 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java index ea7d029fdbce..0bbcc88ac0e1 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java @@ -253,7 +253,7 @@ public final class StorageContainerManager extends ServiceRuntimeInfoImpl private PipelineManager pipelineManager; private ContainerManager containerManager; private BlockManager scmBlockManager; - private final SCMStorageConfig scmStorageConfig; + private SCMStorageConfig scmStorageConfig; private NodeDecommissionManager scmDecommissionManager; private WritableContainerFactory writableContainerFactory; private FinalizationManager finalizationManager; @@ -389,7 +389,9 @@ private StorageContainerManager(OzoneConfiguration conf, // This is for the clusters which got upgraded from older version of Ozone. // We enable Ratis by default. 
if (!scmStorageConfig.isSCMHAEnabled()) { - initializeRatis(conf, scmStorageConfig); + initializeRatis(conf); + // Since we have initialized Ratis, we have to reload StorageConfig + scmStorageConfig = new SCMStorageConfig(conf); } threadNamePrefix = getScmNodeDetails().threadNamePrefix(); @@ -1262,10 +1264,11 @@ public static boolean scmInit(OzoneConfiguration conf, StorageState state = scmStorageConfig.getState(); final SCMHANodeDetails haDetails = SCMHANodeDetails.loadSCMHAConfig(conf, scmStorageConfig); + final String primordialSCM = SCMHAUtils.getPrimordialSCM(conf); final String selfNodeId = haDetails.getLocalNodeDetails().getNodeId(); final String selfHostName = haDetails.getLocalNodeDetails().getHostName(); - if (!SCMHAUtils.isPrimordialSCM(conf, selfNodeId, selfHostName)) { - final String primordialSCM = SCMHAUtils.getPrimordialSCM(conf); + if (primordialSCM != null && + !SCMHAUtils.isPrimordialSCM(conf, selfNodeId, selfHostName)) { LOG.info("SCM init command can only be executed on Primordial SCM. " + "Primordial SCM ID: {}. Self ID: {}.", primordialSCM, selfNodeId); return true; @@ -1297,8 +1300,7 @@ public static boolean scmInit(OzoneConfiguration conf, scmStorageConfig.setPrimaryScmNodeId(scmStorageConfig.getScmId()); scmStorageConfig.initialize(); - - initializeRatis(conf, scmStorageConfig); + initializeRatis(conf); LOG.info("SCM initialization succeeded. Current cluster id for sd={}" + "; cid={}; layoutVersion={}; scmId={}", @@ -1325,6 +1327,27 @@ public static boolean scmInit(OzoneConfiguration conf, // Initialize security if security is enabled later. initializeSecurityIfNeeded(conf, scmStorageConfig, selfHostName, true); + // Enable Ratis if it's not already enabled. + if (!scmStorageConfig.isSCMHAEnabled()) { + initializeRatis(conf); + + /* + * Since Ratis can be initialized on an existing cluster, we have to + * trigger Ratis snapshot so that this SCM can send the latest scm.db + * to the bootstrapping SCMs later. 
+ */ + try { + SCMHAUtils.setRatisEnabled(true); + StorageContainerManager scm = createSCM(conf); + scm.start(); + scm.getScmHAManager().getRatisServer().triggerSnapshot(); + scm.stop(); + scm.join(); + } catch (AuthenticationException e) { + throw new IOException(e); + } + } + LOG.info("SCM already initialized. Reusing existing cluster id for sd={}" + ";cid={}; layoutVersion={}; HAEnabled={}", scmStorageConfig.getStorageDir(), scmStorageConfig.getClusterID(), @@ -1333,8 +1356,9 @@ public static boolean scmInit(OzoneConfiguration conf, } } - private static void initializeRatis(OzoneConfiguration conf, SCMStorageConfig storageConfig) + private static void initializeRatis(OzoneConfiguration conf) throws IOException { + final SCMStorageConfig storageConfig = new SCMStorageConfig(conf); final SCMHANodeDetails haDetails = SCMHANodeDetails.loadSCMHAConfig(conf, storageConfig); SCMRatisServerImpl.initialize(storageConfig.getClusterID(), storageConfig.getScmId(), haDetails.getLocalNodeDetails(), conf); @@ -1342,22 +1366,6 @@ private static void initializeRatis(OzoneConfiguration conf, SCMStorageConfig st storageConfig.setPrimaryScmNodeId(storageConfig.getScmId()); storageConfig.forceInitialize(); LOG.info("Enabled Ratis!"); - - /* - * Since Ratis can be initialized on an existing cluster, we have to - * trigger Ratis snapshot so that this SCM can send the latest scm.db - * to the bootstrapping SCMs later. 
- */ - try { - SCMHAUtils.setRatisEnabled(true); - StorageContainerManager scm = createSCM(conf); - scm.start(); - scm.getScmHAManager().getRatisServer().triggerSnapshot(); - scm.stop(); - scm.join(); - } catch (AuthenticationException e) { - throw new IOException(e); - } } private static InetSocketAddress getScmAddress(SCMHANodeDetails haDetails, diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java index 152693302824..6ef0febf3ab8 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneHAClusterImpl.java @@ -538,7 +538,7 @@ protected OMHAService createOMService() throws IOException, } /** - * Start OM service with multiple OMs. + * Start SCM service with multiple SCMs. */ protected SCMHAService createSCMService() throws IOException, AuthenticationException { @@ -616,10 +616,10 @@ protected SCMHAService createSCMService() * Initialize HA related configurations. */ private void initSCMHAConfig() { - // Set configurations required for starting OM HA service, because that + // Set configurations required for starting SCM HA service, because that // is the serviceID being passed to start Ozone HA cluster. - // Here setting internal service and OZONE_OM_SERVICE_IDS_KEY, in this - // way in OM start it uses internal service id to find it's service id. + // Here setting internal service and OZONE_SCM_SERVICE_IDS_KEY, in this + // way in SCM start it uses internal service id to find its service id. 
conf.set(ScmConfigKeys.OZONE_SCM_SERVICE_IDS_KEY, scmServiceId); conf.set(ScmConfigKeys.OZONE_SCM_DEFAULT_SERVICE_ID, scmServiceId); String scmNodesKey = ConfUtils.addKeySuffixes( @@ -629,6 +629,10 @@ private void initSCMHAConfig() { for (int i = 1; i <= numOfSCMs; i++) { String scmNodeId = SCM_NODE_ID_PREFIX + i; + + if (i == 1) { + conf.set(ScmConfigKeys.OZONE_SCM_PRIMORDIAL_NODE_ID_KEY, scmNodeId); + } scmNodesKeyValue.append(",").append(scmNodeId); String scmAddrKey = ConfUtils.addKeySuffixes( From 53f45e008c9ac537e964ccda080df2aa52b0297e Mon Sep 17 00:00:00 2001 From: Nandakumar Vadivelu Date: Fri, 7 Feb 2025 16:14:37 +0530 Subject: [PATCH 3/4] Fixed TestScmStartupSlvLessThanMlv failure. --- .../hadoop/ozone/upgrade/UpgradeTestUtils.java | 13 +++++++++++++ .../scm/upgrade/TestScmStartupSlvLessThanMlv.java | 14 +++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/hadoop-hdds/common/src/test/java/org/apache/hadoop/ozone/upgrade/UpgradeTestUtils.java b/hadoop-hdds/common/src/test/java/org/apache/hadoop/ozone/upgrade/UpgradeTestUtils.java index d6ea2ec8b858..3ebf4ee9e0ad 100644 --- a/hadoop-hdds/common/src/test/java/org/apache/hadoop/ozone/upgrade/UpgradeTestUtils.java +++ b/hadoop-hdds/common/src/test/java/org/apache/hadoop/ozone/upgrade/UpgradeTestUtils.java @@ -18,6 +18,7 @@ */ package org.apache.hadoop.ozone.upgrade; +import jakarta.annotation.Nullable; import org.apache.hadoop.ozone.upgrade.InjectedUpgradeFinalizationExecutor.UpgradeTestInjectionPoints; import org.apache.hadoop.hdds.protocol.proto.HddsProtos; import org.apache.hadoop.ozone.common.StorageInfo; @@ -25,6 +26,7 @@ import java.io.File; import java.io.IOException; +import java.util.Properties; import java.util.UUID; import java.util.concurrent.CountDownLatch; @@ -40,6 +42,12 @@ private UpgradeTestUtils() { } */ public static File createVersionFile(File parentDir, HddsProtos.NodeType nodeType, int mlv) throws IOException { + return createVersionFile(parentDir, nodeType, mlv, 
null); + } + + public static File createVersionFile(File parentDir, + HddsProtos.NodeType nodeType, int mlv, + @Nullable Properties properties) throws IOException { final String versionFileName = "VERSION"; @@ -49,6 +57,11 @@ public static File createVersionFile(File parentDir, System.currentTimeMillis(), mlv); + if (properties != null) { + properties.forEach((key, value) -> + info.setProperty((String) key, (String) value)); + } + File versionFile = new File(parentDir, versionFileName); info.writeTo(versionFile); diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/upgrade/TestScmStartupSlvLessThanMlv.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/upgrade/TestScmStartupSlvLessThanMlv.java index dc26b89cf6d4..cab89438758e 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/upgrade/TestScmStartupSlvLessThanMlv.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/upgrade/TestScmStartupSlvLessThanMlv.java @@ -17,6 +17,7 @@ */ package org.apache.hadoop.hdds.scm.upgrade; +import org.apache.hadoop.hdds.HddsConfigKeys; import org.apache.hadoop.hdds.conf.OzoneConfiguration; import org.apache.hadoop.hdds.protocol.proto.HddsProtos; import org.apache.hadoop.hdds.scm.ScmConfigKeys; @@ -30,7 +31,10 @@ import java.io.File; import java.io.IOException; import java.nio.file.Path; +import java.util.Properties; +import static org.apache.hadoop.ozone.OzoneConsts.SCM_HA; +import static org.apache.hadoop.ozone.OzoneConsts.SCM_ID; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -53,6 +57,8 @@ public void testStartupSlvLessThanMlv(@TempDir Path tempDir) OzoneConfiguration conf = new OzoneConfiguration(); conf.set(ScmConfigKeys.OZONE_SCM_DB_DIRS, tempDir.toAbsolutePath().toString()); + conf.set(HddsConfigKeys.OZONE_METADATA_DIRS, + 
tempDir.toAbsolutePath().toString()); // Set metadata layout version larger then software layout version. int largestSlv = 0; @@ -61,9 +67,15 @@ public void testStartupSlvLessThanMlv(@TempDir Path tempDir) } int mlv = largestSlv + 1; + Properties properties = new Properties(); + properties.setProperty(SCM_ID, "scm"); + properties.setProperty(SCM_HA, "true"); + // Create version file with MLV > SLV, which should fail the SCM // construction. - UpgradeTestUtils.createVersionFile(scmSubdir, HddsProtos.NodeType.SCM, mlv); + UpgradeTestUtils.createVersionFile(scmSubdir, HddsProtos.NodeType.SCM, mlv, + properties); + Throwable t = assertThrows(IOException.class, () -> new StorageContainerManager(conf)); From 9ce14e6c3c6f8612d2d9dbe86575b5c1733cb793 Mon Sep 17 00:00:00 2001 From: Nandakumar Vadivelu Date: Sat, 8 Feb 2025 12:14:57 +0530 Subject: [PATCH 4/4] Addressed review comments. --- .../hdds/scm/server/StorageContainerManager.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java index 0589b58dceb0..5b4d4e96a427 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java @@ -387,9 +387,8 @@ private StorageContainerManager(OzoneConfiguration conf, // This is for the clusters which got upgraded from older version of Ozone. // We enable Ratis by default. 
if (!scmStorageConfig.isSCMHAEnabled()) { - initializeRatis(conf); // Since we have initialized Ratis, we have to reload StorageConfig - scmStorageConfig = new SCMStorageConfig(conf); + scmStorageConfig = initializeRatis(conf); } threadNamePrefix = getScmNodeDetails().threadNamePrefix(); @@ -1298,7 +1297,7 @@ public static boolean scmInit(OzoneConfiguration conf, scmStorageConfig.setPrimaryScmNodeId(scmStorageConfig.getScmId()); scmStorageConfig.initialize(); - initializeRatis(conf); + scmStorageConfig = initializeRatis(conf); LOG.info("SCM initialization succeeded. Current cluster id for sd={}" + "; cid={}; layoutVersion={}; scmId={}", @@ -1316,7 +1315,7 @@ public static boolean scmInit(OzoneConfiguration conf, // Enable Ratis if it's not already enabled. if (!scmStorageConfig.isSCMHAEnabled()) { - initializeRatis(conf); + scmStorageConfig = initializeRatis(conf); /* * Since Ratis can be initialized on an existing cluster, we have to @@ -1343,7 +1342,7 @@ public static boolean scmInit(OzoneConfiguration conf, } } - private static void initializeRatis(OzoneConfiguration conf) + private static SCMStorageConfig initializeRatis(OzoneConfiguration conf) throws IOException { final SCMStorageConfig storageConfig = new SCMStorageConfig(conf); final SCMHANodeDetails haDetails = SCMHANodeDetails.loadSCMHAConfig(conf, storageConfig); @@ -1353,6 +1352,7 @@ private static void initializeRatis(OzoneConfiguration conf) storageConfig.setPrimaryScmNodeId(storageConfig.getScmId()); storageConfig.forceInitialize(); LOG.info("Enabled Ratis!"); + return storageConfig; } private static InetSocketAddress getScmAddress(SCMHANodeDetails haDetails,