From 5bb5b47b0623698076841e8585426bbd81ddf942 Mon Sep 17 00:00:00 2001 From: Bharat Viswanadham Date: Mon, 17 May 2021 15:31:16 +0530 Subject: [PATCH 1/7] HDDS-5233. SCM subsequent init failed when previous scm start/init failed. --- .../scm/server/StorageContainerManager.java | 25 ++++++++++++++----- 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java index fdfc4ec9c260..ab660339cea1 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java @@ -946,19 +946,32 @@ public static boolean scmInit(OzoneConfiguration conf, scmStorageConfig.setClusterId(clusterId); } - if (SCMHAUtils.isSCMHAEnabled(conf)) { - SCMRatisServerImpl.initialize(scmStorageConfig.getClusterID(), - scmStorageConfig.getScmId(), haDetails.getLocalNodeDetails(), - conf); - scmStorageConfig.setSCMHAFlag(true); - } if (OzoneSecurityUtil.isSecurityEnabled(conf)) { HASecurityUtils.initializeSecurity(scmStorageConfig, conf, getScmAddress(haDetails, conf), true); } + + // Do not move these code lines. If SCM version file is created file + // the subsequent scm init should use the groupID from version file. + // So, initialize() should happen before ratis server initialize. In + // this way,we do not leave ratis storage directory with multiple raft + // Groups in failure scenario. + + // The order of init should be + // 1. SCM storage config initialize to create version file. + // 2. Initialize Ratis server. + scmStorageConfig.setPrimaryScmNodeId(scmStorageConfig.getScmId()); scmStorageConfig.initialize(); + + if (SCMHAUtils.isSCMHAEnabled(conf)) { + SCMRatisServerImpl.initialize(scmStorageConfig.getClusterID(), + scmStorageConfig.getScmId(), haDetails.getLocalNodeDetails(), + conf); + scmStorageConfig.setSCMHAFlag(true); + } + LOG.info("SCM initialization succeeded. Current cluster id for sd={}" + "; cid={}; layoutVersion={}; scmId={}", scmStorageConfig.getStorageDir(), scmStorageConfig.getClusterID(), From de7ac253e49c75fe433763bc96cc9cb6b7c6efda Mon Sep 17 00:00:00 2001 From: Bharat Viswanadham Date: Mon, 17 May 2021 15:40:08 +0530 Subject: [PATCH 2/7] fix code comment --- .../apache/hadoop/hdds/scm/server/StorageContainerManager.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java index ab660339cea1..0f3add746fda 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java @@ -952,7 +952,7 @@ public static boolean scmInit(OzoneConfiguration conf, getScmAddress(haDetails, conf), true); } - // Do not move these code lines. If SCM version file is created file + // Do not move these code lines. If SCM version file is created, // the subsequent scm init should use the groupID from version file. // So, initialize() should happen before ratis server initialize. In // this way,we do not leave ratis storage directory with multiple raft From c4cd528c826f617a28b67a620a9c9bf8d2f379c3 Mon Sep 17 00:00:00 2001 From: Bharat Viswanadham Date: Mon, 17 May 2021 15:40:53 +0530 Subject: [PATCH 3/7] fix code comment --- .../hadoop/hdds/scm/server/StorageContainerManager.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java index 0f3add746fda..a9d2c30b97a2 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java @@ -954,9 +954,9 @@ public static boolean scmInit(OzoneConfiguration conf, // Do not move these code lines. If SCM version file is created, // the subsequent scm init should use the groupID from version file. - // So, initialize() should happen before ratis server initialize. In - // this way,we do not leave ratis storage directory with multiple raft - // Groups in failure scenario. + // So, scmStorageConfig#initialize() should happen before ratis server + // initialize. In this way,we do not leave ratis storage directory + // with multiple raft Groups in failure scenario. // The order of init should be // 1. SCM storage config initialize to create version file. From 67d13fd4f3c089445f9d31bf887eac07b8f5a6c4 Mon Sep 17 00:00:00 2001 From: Bharat Viswanadham Date: Tue, 18 May 2021 12:09:30 +0530 Subject: [PATCH 4/7] fix code comments --- .../hadoop/hdds/scm/server/StorageContainerManager.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java index a9d2c30b97a2..2368d6ec7075 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java @@ -952,11 +952,12 @@ public static boolean scmInit(OzoneConfiguration conf, getScmAddress(haDetails, conf), true); } - // Do not move these code lines. If SCM version file is created, - // the subsequent scm init should use the groupID from version file. + // Ensure scmRatisServer#initialize() is called post scm storage + // config initialization.. If SCM version file is created, + // the subsequent scm init should use the clusterID from version file. // So, scmStorageConfig#initialize() should happen before ratis server // initialize. In this way,we do not leave ratis storage directory - // with multiple raft Groups in failure scenario. + // with multiple raft group directories in failure scenario. // The order of init should be // 1. SCM storage config initialize to create version file. From 0915d96a84bc0423223e94c7d3989100975fa302 Mon Sep 17 00:00:00 2001 From: Bharat Viswanadham Date: Tue, 18 May 2021 12:56:54 +0530 Subject: [PATCH 5/7] use registry for SM creation --- .../hdds/scm/ha/SCMRatisServerImpl.java | 15 ++++++++++----- .../hadoop/hdds/scm/ha/SCMStateMachine.java | 19 ++++++------------- .../scm/server/StorageContainerManager.java | 2 ++ 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMRatisServerImpl.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMRatisServerImpl.java index ea563f2cda6d..25c9b6ff39f5 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMRatisServerImpl.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMRatisServerImpl.java @@ -82,7 +82,6 @@ public class SCMRatisServerImpl implements SCMRatisServer { final StorageContainerManager scm, final SCMHADBTransactionBuffer buffer) throws IOException { this.scm = scm; - this.stateMachine = new SCMStateMachine(scm, this, buffer); final RaftGroupId groupId = buildRaftGroupId(scm.getClusterId()); LOG.info("starting Raft server for scm:{}", scm.getScmId()); // During SCM startup, the bootstrapped node will be started just with @@ -100,9 +99,14 @@ public class SCMRatisServerImpl implements SCMRatisServer { Parameters parameters = createSCMServerTlsParameters(grpcTlsConfig); this.server = newRaftServer(scm.getScmId(), conf) - .setStateMachine(stateMachine) + .setStateMachineRegistry( + (gId) -> new SCMStateMachine(scm, buffer)) .setGroup(RaftGroup.valueOf(groupId)) .setParameters(parameters).build(); + + this.stateMachine = + (SCMStateMachine) server.getDivision(groupId).getStateMachine(); + this.division = server.getDivision(groupId); } @@ -111,7 +115,9 @@ public static void initialize(String clusterId, String scmId, final RaftGroup group = buildRaftGroup(details, scmId, clusterId); RaftServer server = null; try { - server = newRaftServer(scmId, conf).setGroup(group).build(); + server = newRaftServer(scmId, conf).setGroup(group) + .setStateMachineRegistry((groupId -> new SCMStateMachine())) + .build(); server.start(); waitForLeaderToBeReady(server, conf, group); } finally { @@ -158,8 +164,7 @@ private static RaftServer.Builder newRaftServer(final String scmId, final RaftProperties serverProperties = RatisUtil.newRaftProperties(haConf, conf); return RaftServer.newBuilder().setServerId(RaftPeerId.getRaftPeerId(scmId)) - .setProperties(serverProperties) - .setStateMachine(new SCMStateMachine()); + .setProperties(serverProperties); } @Override diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMStateMachine.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMStateMachine.java index 968b6f605453..6fdbc5deac79 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMStateMachine.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMStateMachine.java @@ -37,7 +37,6 @@ import org.apache.hadoop.hdds.utils.db.DBCheckpoint; import org.apache.hadoop.util.concurrent.HadoopExecutors; import org.apache.ratis.proto.RaftProtos; -import org.apache.hadoop.hdds.scm.exceptions.SCMException; import org.apache.hadoop.util.Time; import org.apache.ratis.protocol.Message; import org.apache.ratis.protocol.RaftGroupId; @@ -59,8 +58,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.hadoop.hdds.scm.exceptions.SCMException.ResultCodes.SCM_NOT_INITIALIZED; - /** * The SCMStateMachine is the state machine for SCMRatisServer. It is * responsible for applying ratis committed transactions to @@ -84,20 +81,16 @@ public class SCMStateMachine extends BaseStateMachine { private DBCheckpoint installingDBCheckpoint = null; public SCMStateMachine(final StorageContainerManager scm, - final SCMRatisServer ratisServer, SCMHADBTransactionBuffer buffer) - throws SCMException { + SCMHADBTransactionBuffer buffer) { this.scm = scm; this.handlers = new EnumMap<>(RequestType.class); this.transactionBuffer = buffer; TransactionInfo latestTrxInfo = this.transactionBuffer.getLatestTrxInfo(); - if (!latestTrxInfo.isDefault() && - !updateLastAppliedTermIndex(latestTrxInfo.getTerm(), - latestTrxInfo.getTransactionIndex())) { - throw new SCMException( - String.format("Failed to update LastAppliedTermIndex " + - "in StateMachine to term:{} index:{}", - latestTrxInfo.getTerm(), latestTrxInfo.getTransactionIndex() - ), SCM_NOT_INITIALIZED); + if (!latestTrxInfo.isDefault()) { + updateLastAppliedTermIndex(latestTrxInfo.getTerm(), + latestTrxInfo.getTransactionIndex()); + LOG.info("Updated lastAppliedTermIndex {} with transactionInfo term and" + + "Index", latestTrxInfo); } this.installSnapshotExecutor = HadoopExecutors.newSingleThreadExecutor(); isInitialized = true; diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java index 2368d6ec7075..f0f42c9548d4 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java @@ -971,6 +971,8 @@ public static boolean scmInit(OzoneConfiguration conf, scmStorageConfig.getScmId(), haDetails.getLocalNodeDetails(), conf); scmStorageConfig.setSCMHAFlag(true); + // Do force initialize to persist SCM_HA flag. + scmStorageConfig.forceInitialize(); } LOG.info("SCM initialization succeeded. Current cluster id for sd={}" From 630ff9ea96da6abbdc9558d3c23c988f10de3b7b Mon Sep 17 00:00:00 2001 From: Bharat Viswanadham Date: Tue, 18 May 2021 13:34:23 +0530 Subject: [PATCH 6/7] fix cs --- .../java/org/apache/hadoop/hdds/scm/ha/SCMRatisServerImpl.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMRatisServerImpl.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMRatisServerImpl.java index 25c9b6ff39f5..063c007c0e89 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMRatisServerImpl.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMRatisServerImpl.java @@ -99,8 +99,7 @@ public class SCMRatisServerImpl implements SCMRatisServer { Parameters parameters = createSCMServerTlsParameters(grpcTlsConfig); this.server = newRaftServer(scm.getScmId(), conf) - .setStateMachineRegistry( - (gId) -> new SCMStateMachine(scm, buffer)) + .setStateMachineRegistry((gId) -> new SCMStateMachine(scm, buffer)) .setGroup(RaftGroup.valueOf(groupId)) .setParameters(parameters).build(); From 953f9b268d4b9b46e6f950c51d0d7f6da633a66c Mon Sep 17 00:00:00 2001 From: Bharat Viswanadham Date: Wed, 19 May 2021 16:28:33 +0530 Subject: [PATCH 7/7] fix test failure --- .../scm/server/StorageContainerManager.java | 1 + .../ozone/TestStorageContainerManager.java | 25 +++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java index f0f42c9548d4..09529d5a7a92 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java @@ -970,6 +970,7 @@ public static boolean scmInit(OzoneConfiguration conf, SCMRatisServerImpl.initialize(scmStorageConfig.getClusterID(), scmStorageConfig.getScmId(), haDetails.getLocalNodeDetails(), conf); + scmStorageConfig = new SCMStorageConfig(conf); scmStorageConfig.setSCMHAFlag(true); // Do force initialize to persist SCM_HA flag. scmStorageConfig.forceInitialize(); diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/TestStorageContainerManager.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/TestStorageContainerManager.java index 667042839f98..f91fc66e6cc1 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/TestStorageContainerManager.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/TestStorageContainerManager.java @@ -529,6 +529,31 @@ public void testSCMReinitialization() throws Exception { } } + + @Test + public void testSCMReinitializationWithHAUpgrade() throws Exception { + OzoneConfiguration conf = new OzoneConfiguration(); + final String path = GenericTestUtils.getTempPath( + UUID.randomUUID().toString()); + Path scmPath = Paths.get(path, "scm-meta"); + conf.set(HddsConfigKeys.OZONE_METADATA_DIRS, scmPath.toString()); + //This will set the cluster id in the version file + final UUID clusterId = UUID.randomUUID(); + // This will initialize SCM + + StorageContainerManager.scmInit(conf, clusterId.toString()); + SCMStorageConfig scmStore = new SCMStorageConfig(conf); + Assert.assertEquals(clusterId.toString(), scmStore.getClusterID()); + Assert.assertFalse(scmStore.isSCMHAEnabled()); + + conf.setBoolean(ScmConfigKeys.OZONE_SCM_HA_ENABLE_KEY, true); + StorageContainerManager.scmInit(conf, clusterId.toString()); + scmStore = new SCMStorageConfig(conf); + Assert.assertTrue(scmStore.isSCMHAEnabled()); + validateRatisGroupExists(conf, clusterId.toString()); + + } + @VisibleForTesting public static void validateRatisGroupExists(OzoneConfiguration conf, String clusterId) throws IOException {