From 416ff6c7174f563cd6842931f04f5c4315798bda Mon Sep 17 00:00:00 2001 From: Nanda kumar Date: Tue, 21 Apr 2020 01:23:43 +0530 Subject: [PATCH 1/6] HDDS-3418. Enable TestNodeFailure test cases. --- .../hdds/scm/pipeline/TestNodeFailure.java | 110 ++++++++---------- 1 file changed, 49 insertions(+), 61 deletions(-) diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/pipeline/TestNodeFailure.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/pipeline/TestNodeFailure.java index 35d1774c0e8f..33b2268ca5fa 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/pipeline/TestNodeFailure.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/pipeline/TestNodeFailure.java @@ -18,42 +18,32 @@ package org.apache.hadoop.hdds.scm.pipeline; +import org.apache.hadoop.hdds.HddsConfigKeys; import org.apache.hadoop.hdds.conf.OzoneConfiguration; -import org.apache.hadoop.hdds.protocol.DatanodeDetails; +import org.apache.hadoop.hdds.protocol.proto.HddsProtos; import org.apache.hadoop.hdds.ratis.RatisHelper; -import org.apache.hadoop.hdds.scm.ScmConfigKeys; -import org.apache.hadoop.hdds.scm.container.ContainerManager; import org.apache.hadoop.hdds.scm.server.StorageContainerManager; import org.apache.hadoop.ozone.MiniOzoneCluster; import org.apache.hadoop.hdds.conf.DatanodeRatisServerConfig; +import org.apache.hadoop.test.GenericTestUtils; import org.junit.AfterClass; import org.junit.Assert; import org.junit.BeforeClass; -import org.junit.Ignore; import org.junit.Test; import java.io.IOException; +import java.util.List; import java.util.concurrent.TimeUnit; -import java.util.concurrent.TimeoutException; - -import static org.apache.hadoop.hdds.protocol.proto.HddsProtos - .ReplicationFactor.THREE; -import static org.apache.hadoop.hdds.protocol.proto.HddsProtos - .ReplicationType.RATIS; /** * Test Node failure detection and handling in Ratis. */ -@Ignore public class TestNodeFailure { private static MiniOzoneCluster cluster; - private static OzoneConfiguration conf; - private static Pipeline ratisPipelineOne; - private static Pipeline ratisPipelineTwo; - private static ContainerManager containerManager; + private static List ratisPipelines; private static PipelineManager pipelineManager; - private static long timeForFailure; + private static int timeForFailure; /** * Create a MiniDFSCluster for testing. @@ -62,7 +52,7 @@ public class TestNodeFailure { */ @BeforeClass public static void init() throws Exception { - conf = new OzoneConfiguration(); + final OzoneConfiguration conf = new OzoneConfiguration(); conf.setTimeDuration( RatisHelper.HDDS_DATANODE_RATIS_SERVER_PREFIX_KEY + "." + DatanodeRatisServerConfig.RATIS_FOLLOWER_SLOWNESS_TIMEOUT_KEY, @@ -71,28 +61,22 @@ public static void init() throws Exception { RatisHelper.HDDS_DATANODE_RATIS_SERVER_PREFIX_KEY + "." + DatanodeRatisServerConfig.RATIS_SERVER_NO_LEADER_TIMEOUT_KEY, 10, TimeUnit.SECONDS); - conf.setTimeDuration( - ScmConfigKeys.OZONE_SCM_CONTAINER_CREATION_LEASE_TIMEOUT, - 10, TimeUnit.SECONDS); + conf.set(HddsConfigKeys.HDDS_PIPELINE_REPORT_INTERVAL, "2s"); + cluster = MiniOzoneCluster.newBuilder(conf) .setNumDatanodes(6) .setHbInterval(1000) .setHbProcessorInterval(1000) .build(); cluster.waitForClusterToBeReady(); - StorageContainerManager scm = cluster.getStorageContainerManager(); - containerManager = scm.getContainerManager(); + + final StorageContainerManager scm = cluster.getStorageContainerManager(); pipelineManager = scm.getPipelineManager(); - ratisPipelineOne = pipelineManager.getPipeline( - containerManager.allocateContainer( - RATIS, THREE, "testOwner").getPipelineID()); - ratisPipelineTwo = pipelineManager.getPipeline( - containerManager.allocateContainer( - RATIS, THREE, "testOwner").getPipelineID()); - // At this stage, there should be 2 pipeline one with 1 open container each. - // Try closing the both the pipelines, one with a closed container and - // the other with an open container. - timeForFailure = conf.getObject(DatanodeRatisServerConfig.class) + ratisPipelines = pipelineManager.getPipelines( + HddsProtos.ReplicationType.RATIS, + HddsProtos.ReplicationFactor.THREE); + + timeForFailure = (int) conf.getObject(DatanodeRatisServerConfig.class) .getFollowerSlownessTimeout(); } @@ -106,35 +90,39 @@ public static void shutdown() { } } - @Ignore - // Enable this after we implement teardown pipeline logic once a datanode - // dies. - @Test(timeout = 300_000L) - public void testPipelineFail() throws InterruptedException, IOException, - TimeoutException { - Assert.assertEquals(ratisPipelineOne.getPipelineState(), - Pipeline.PipelineState.OPEN); - Pipeline pipelineToFail = ratisPipelineOne; - DatanodeDetails dnToFail = pipelineToFail.getFirstNode(); - cluster.shutdownHddsDatanode(dnToFail); - - // wait for sufficient time for the callback to be triggered - Thread.sleep(3 * timeForFailure); + @Test + public void testPipelineFail() { + ratisPipelines.forEach(pipeline -> { + try { + waitForPipelineCreation(pipeline.getId()); + cluster.shutdownHddsDatanode(pipeline.getFirstNode()); + GenericTestUtils.waitFor(() -> { + try { + return pipelineManager.getPipeline(pipeline.getId()) + .getPipelineState().equals(Pipeline.PipelineState.CLOSED); + } catch (PipelineNotFoundException ex) { + return true; + } + }, timeForFailure / 2, timeForFailure * 3); + } catch (Exception e) { + Assert.fail("Test Failed: " + e.getMessage()); + } + }); + } - Assert.assertEquals(Pipeline.PipelineState.CLOSED, - pipelineManager.getPipeline(ratisPipelineOne.getId()) - .getPipelineState()); - Assert.assertEquals(Pipeline.PipelineState.OPEN, - pipelineManager.getPipeline(ratisPipelineTwo.getId()) - .getPipelineState()); - // Now restart the datanode and make sure that a new pipeline is created. - cluster.setWaitForClusterToBeReadyTimeout(300000); - cluster.restartHddsDatanode(dnToFail, true); - Pipeline ratisPipelineThree = pipelineManager.getPipeline( - containerManager.allocateContainer( - RATIS, THREE, "testOwner").getPipelineID()); - //Assert that new container is not created from the ratis 2 pipeline - Assert.assertNotEquals(ratisPipelineThree.getId(), - ratisPipelineTwo.getId()); + /** + * Waits until the Pipeline is marked as OPEN. + * @param pipelineID Id of the pipeline + */ + private void waitForPipelineCreation(final PipelineID pipelineID) + throws Exception { + GenericTestUtils.waitFor(() -> { + try { + return pipelineManager.getPipeline(pipelineID) + .getPipelineState().equals(Pipeline.PipelineState.OPEN); + } catch (PipelineNotFoundException ex) { + return false; + } + }, 1000, 1000 * 60); } } \ No newline at end of file From b7bf6262962a9705f34a710b1fd6a0e7ce827466 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elek=20M=C3=A1rton?= Date: Fri, 24 Apr 2020 10:21:15 +0200 Subject: [PATCH 2/6] retrigger build with empty commit From 85d91465c53f065d66e3ce6681e45e86a7e2d5fc Mon Sep 17 00:00:00 2001 From: "Doroszlai, Attila" Date: Fri, 24 Apr 2020 13:32:36 +0200 Subject: [PATCH 3/6] trigger new CI check From d7222266e6636dade9082fbfe7b162612f018824 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elek=20M=C3=A1rton?= Date: Fri, 24 Apr 2020 16:42:11 +0200 Subject: [PATCH 4/6] retrigger build with empty commit From 1eef08465b97978078e50bafc785ffee69bbad8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elek=20M=C3=A1rton?= Date: Tue, 28 Apr 2020 16:56:13 +0200 Subject: [PATCH 5/6] retrigger build with empty commit From 8c3d199f128609909e3f24f2756cd1df6867b488 Mon Sep 17 00:00:00 2001 From: "Doroszlai, Attila" Date: Thu, 30 Apr 2020 11:26:21 +0200 Subject: [PATCH 6/6] trigger new CI check