From 779c86c30aec0527545ff43fcb3b6ab17a1e1aac Mon Sep 17 00:00:00 2001
From: S O'Donnell
Date: Tue, 27 Apr 2021 17:33:45 +0100
Subject: [PATCH] HDDS-5153. Decommissioning a dead node should complete immediately

When startDecommission or startMaintenance is called for an IN_SERVICE
node whose health is DEAD, do not hand the node to the admin monitor.
Move it directly to DECOMMISSIONED or IN_MAINTENANCE instead. The
maintenance end time is recorded on both the live and the dead path.

---
 .../scm/node/NodeDecommissionManager.java     | 36 ++++++++++++----
 .../scm/node/TestNodeDecommissionManager.java | 41 +++++++++++++++++++
 2 files changed, 69 insertions(+), 8 deletions(-)

diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java
index 33c969700643..8462ac721869 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java
@@ -20,6 +20,7 @@
 import com.google.common.util.concurrent.ThreadFactoryBuilder;
 import org.apache.hadoop.hdds.conf.OzoneConfiguration;
 import org.apache.hadoop.hdds.protocol.DatanodeDetails;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
 import org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState;
 import org.apache.hadoop.hdds.scm.DatanodeAdminError;
 import org.apache.hadoop.hdds.scm.ScmConfigKeys;
@@ -268,11 +269,20 @@ public synchronized void startDecommission(DatanodeDetails dn)
       throws NodeNotFoundException, InvalidNodeStateException {
     NodeStatus nodeStatus = getNodeStatus(dn);
     NodeOperationalState opState = nodeStatus.getOperationalState();
+    HddsProtos.NodeState health = nodeStatus.getHealth();
     if (opState == NodeOperationalState.IN_SERVICE) {
-      LOG.info("Starting Decommission for node {}", dn);
-      nodeManager.setNodeOperationalState(
-          dn, NodeOperationalState.DECOMMISSIONING);
-      monitor.startMonitoring(dn);
+      if (health != HddsProtos.NodeState.DEAD) {
+        LOG.info("Starting Decommission for node {}", dn);
+        nodeManager.setNodeOperationalState(
+            dn, NodeOperationalState.DECOMMISSIONING);
+        monitor.startMonitoring(dn);
+      } else {
+        // A DEAD node is not handed to the monitor; it goes straight to
+        // the final DECOMMISSIONED state.
+        LOG.info("{} is dead. Moving to decommissioned immediately", dn);
+        nodeManager.setNodeOperationalState(
+            dn, NodeOperationalState.DECOMMISSIONED);
+      }
     } else if (nodeStatus.isDecommission()) {
       LOG.info("Start Decommission called on node {} in state {}. Nothing to "+
           "do.", dn, opState);
@@ -354,11 +364,21 @@ public synchronized void startMaintenance(DatanodeDetails dn, int endInHours)
       maintenanceEnd =
           (System.currentTimeMillis() / 1000L) + (endInHours * 60L * 60L);
     }
+    HddsProtos.NodeState health = nodeStatus.getHealth();
     if (opState == NodeOperationalState.IN_SERVICE) {
-      nodeManager.setNodeOperationalState(
-          dn, NodeOperationalState.ENTERING_MAINTENANCE, maintenanceEnd);
-      monitor.startMonitoring(dn);
-      LOG.info("Starting Maintenance for node {}", dn);
+      if (health != HddsProtos.NodeState.DEAD) {
+        nodeManager.setNodeOperationalState(
+            dn, NodeOperationalState.ENTERING_MAINTENANCE, maintenanceEnd);
+        monitor.startMonitoring(dn);
+        LOG.info("Starting Maintenance for node {}", dn);
+      } else {
+        // A DEAD node is not handed to the monitor; it goes straight to
+        // IN_MAINTENANCE. The requested end time is still recorded, as it
+        // is on the live-node path above.
+        LOG.info("{} is dead. Moving to maintenance immediately", dn);
+        nodeManager.setNodeOperationalState(
+            dn, NodeOperationalState.IN_MAINTENANCE, maintenanceEnd);
+      }
     } else if (nodeStatus.isMaintenance()) {
       LOG.info("Starting Maintenance called on node {} with state {}. "+
           "Nothing to do.", dn, opState);
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestNodeDecommissionManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestNodeDecommissionManager.java
index 665c3f77d362..5b840511c1fe 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestNodeDecommissionManager.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestNodeDecommissionManager.java
@@ -24,6 +24,7 @@
 import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
 import org.apache.hadoop.hdds.scm.TestUtils;
 import org.apache.hadoop.hdds.scm.DatanodeAdminError;
+import org.apache.hadoop.hdds.scm.container.SimpleMockNodeManager;
 import org.apache.hadoop.hdds.scm.ha.SCMContext;
 import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException;
 import org.apache.hadoop.hdds.scm.server.StorageContainerManager;
@@ -176,6 +177,26 @@ public void testNodesCanBeDecommissionedAndRecommissioned()
         nodeManager.getNodeStatus(dns.get(10)).getOperationalState());
   }
 
+  /**
+   * A node registered as in-service and dead must be DECOMMISSIONED as
+   * soon as startDecommission returns.
+   */
+  @Test
+  public void testDeadNodeDecommissionsImmediately()
+      throws NodeNotFoundException, InvalidNodeStateException {
+    List<DatanodeDetails> dns = generateDatanodes();
+    DatanodeDetails dn = dns.get(1);
+
+    SimpleMockNodeManager mockNM = new SimpleMockNodeManager();
+    mockNM.register(dn, NodeStatus.inServiceDead());
+    NodeDecommissionManager decomMgr = new NodeDecommissionManager(conf, mockNM,
+        null, SCMContext.emptyContext(), new EventQueue(), null);
+
+    decomMgr.startDecommission(dn);
+    assertEquals(HddsProtos.NodeOperationalState.DECOMMISSIONED,
+        mockNM.getNodeStatus(dn).getOperationalState());
+  }
+
   @Test
   public void testNodesCanBePutIntoMaintenanceAndRecommissioned()
       throws InvalidHostStringException, NodeNotFoundException {
@@ -219,6 +240,26 @@ public void testNodesCanBePutIntoMaintenanceAndRecommissioned()
         nodeManager.getNodeStatus(dns.get(10)).getOperationalState());
   }
 
+  /**
+   * A node registered as in-service and dead must be IN_MAINTENANCE as
+   * soon as startMaintenance returns.
+   */
+  @Test
+  public void testDeadNodeGoesToMaintenanceImmediately()
+      throws NodeNotFoundException, InvalidNodeStateException {
+    List<DatanodeDetails> dns = generateDatanodes();
+    DatanodeDetails dn = dns.get(1);
+
+    SimpleMockNodeManager mockNM = new SimpleMockNodeManager();
+    mockNM.register(dn, NodeStatus.inServiceDead());
+    NodeDecommissionManager decomMgr = new NodeDecommissionManager(conf, mockNM,
+        null, SCMContext.emptyContext(), new EventQueue(), null);
+
+    decomMgr.startMaintenance(dn, 0);
+    assertEquals(HddsProtos.NodeOperationalState.IN_MAINTENANCE,
+        mockNM.getNodeStatus(dn).getOperationalState());
+  }
+
   @Test
   public void testNodesCannotTransitionFromDecomToMaint() throws Exception {
     List<DatanodeDetails> dns = generateDatanodes();