From 6648965b5ce224cff535fd3bf633fcd251c54167 Mon Sep 17 00:00:00 2001 From: peterxcli Date: Wed, 12 Mar 2025 21:10:51 +0800 Subject: [PATCH 1/3] HDDS-12535. Intermittent failure in TestContainerReportHandling --- .../container/TestContainerReportHandling.java | 15 +++++++++++++-- .../TestContainerReportHandlingWithHA.java | 17 +++++++++++++++-- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/TestContainerReportHandling.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/TestContainerReportHandling.java index d7eb78ad1619..7bb3cf1f2062 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/TestContainerReportHandling.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/TestContainerReportHandling.java @@ -53,14 +53,12 @@ import org.apache.hadoop.ozone.om.helpers.OmKeyLocationInfo; import org.apache.hadoop.ozone.om.helpers.OmKeyLocationInfoGroup; import org.apache.ozone.test.GenericTestUtils; -import org.apache.ozone.test.tag.Flaky; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.EnumSource; /** * Tests for container report handling. */ -@Flaky("HDDS-12535") public class TestContainerReportHandling { private static final String VOLUME = "vol1"; private static final String BUCKET = "bucket1"; @@ -97,6 +95,19 @@ void testDeletingOrDeletedContainerTransitionsToClosedWhenNonEmptyReplicaIsRepor ContainerID containerID = ContainerID.valueOf(keyLocation.getContainerID()); waitForContainerClose(cluster, containerID.getId()); + // also wait till the container is closed in SCM + GenericTestUtils.waitFor(() -> { + ContainerManager containerManager = cluster.getStorageContainerManager().getContainerManager(); + try { + if (containerManager.getContainer(containerID).getState() != HddsProtos.LifeCycleState.CLOSED) { + return false; + } + } catch (ContainerNotFoundException e) { + return false; + } + return true; + }, 2000, 20000); + // move the container to DELETING ContainerManager containerManager = cluster.getStorageContainerManager().getContainerManager(); containerManager.updateContainerState(containerID, HddsProtos.LifeCycleEvent.DELETE); diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/TestContainerReportHandlingWithHA.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/TestContainerReportHandlingWithHA.java index 9517fd9e4598..24438c35dce8 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/TestContainerReportHandlingWithHA.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/TestContainerReportHandlingWithHA.java @@ -54,14 +54,12 @@ import org.apache.hadoop.ozone.om.helpers.OmKeyLocationInfo; import org.apache.hadoop.ozone.om.helpers.OmKeyLocationInfoGroup; import org.apache.ozone.test.GenericTestUtils; -import org.apache.ozone.test.tag.Flaky; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.EnumSource; /** * Tests for container report handling with SCM High Availability. */ -@Flaky("HDDS-12535") public class TestContainerReportHandlingWithHA { private static final String VOLUME = "vol1"; private static final String BUCKET = "bucket1"; @@ -99,6 +97,21 @@ void testDeletingOrDeletedContainerTransitionsToClosedWhenNonEmptyReplicaIsRepor ContainerID containerID = ContainerID.valueOf(keyLocation.getContainerID()); waitForContainerClose(cluster, containerID.getId()); + // also wait till the container is closed in all SCMs + GenericTestUtils.waitFor(() -> { + for (int i = 0; i < numSCM; i++) { + ContainerManager containerManager = cluster.getStorageContainerManager(i).getContainerManager(); + try { + if (containerManager.getContainer(containerID).getState() != HddsProtos.LifeCycleState.CLOSED) { + return false; + } + } catch (ContainerNotFoundException e) { + return false; + } + } + return true; + }, 2000, 20000); + // move the container to DELETING ContainerManager containerManager = cluster.getScmLeader().getContainerManager(); containerManager.updateContainerState(containerID, HddsProtos.LifeCycleEvent.DELETE); From d1bd3bd9a455f08e0a1797155ad5edf3e9fc5735 Mon Sep 17 00:00:00 2001 From: peterxcli Date: Thu, 13 Mar 2025 02:58:06 +0000 Subject: [PATCH 2/3] Create waitForContainerStateInSCM to reduce duplication --- .../TestContainerReportHandling.java | 25 ++---------- .../TestContainerReportHandlingWithHA.java | 38 ++----------------- .../hadoop/ozone/container/TestHelper.java | 17 +++++++++ 3 files changed, 23 insertions(+), 57 deletions(-) diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/TestContainerReportHandling.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/TestContainerReportHandling.java index 7bb3cf1f2062..1aed5b76d3d4 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/TestContainerReportHandling.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/TestContainerReportHandling.java @@ -23,11 +23,11 @@ import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_DEADNODE_INTERVAL; import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_STALENODE_INTERVAL; import static org.apache.hadoop.ozone.container.TestHelper.waitForContainerClose; +import static org.apache.hadoop.ozone.container.TestHelper.waitForContainerStateInSCM; import static org.assertj.core.api.Assertions.assertThat; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; -import static org.junit.jupiter.api.Assertions.fail; import java.io.IOException; import java.io.OutputStream; @@ -41,7 +41,6 @@ import org.apache.hadoop.hdds.protocol.proto.HddsProtos; import org.apache.hadoop.hdds.scm.container.ContainerID; import org.apache.hadoop.hdds.scm.container.ContainerManager; -import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException; import org.apache.hadoop.ozone.HddsDatanodeService; import org.apache.hadoop.ozone.MiniOzoneCluster; import org.apache.hadoop.ozone.client.ObjectStore; @@ -52,7 +51,6 @@ import org.apache.hadoop.ozone.om.helpers.OmKeyInfo; import org.apache.hadoop.ozone.om.helpers.OmKeyLocationInfo; import org.apache.hadoop.ozone.om.helpers.OmKeyLocationInfoGroup; -import org.apache.ozone.test.GenericTestUtils; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.EnumSource; @@ -96,17 +94,7 @@ void testDeletingOrDeletedContainerTransitionsToClosedWhenNonEmptyReplicaIsRepor waitForContainerClose(cluster, containerID.getId()); // also wait till the container is closed in SCM - GenericTestUtils.waitFor(() -> { - ContainerManager containerManager = cluster.getStorageContainerManager().getContainerManager(); - try { - if (containerManager.getContainer(containerID).getState() != HddsProtos.LifeCycleState.CLOSED) { - return false; - } - } catch (ContainerNotFoundException e) { - return false; - } - return true; - }, 2000, 20000); + waitForContainerStateInSCM(cluster.getStorageContainerManager(), containerID, HddsProtos.LifeCycleState.CLOSED); // move the container to DELETING ContainerManager containerManager = cluster.getStorageContainerManager().getContainerManager(); @@ -122,14 +110,7 @@ void testDeletingOrDeletedContainerTransitionsToClosedWhenNonEmptyReplicaIsRepor // restart a DN and wait for the container to get CLOSED. HddsDatanodeService dn = cluster.getHddsDatanode(keyLocation.getPipeline().getFirstNode()); cluster.restartHddsDatanode(dn.getDatanodeDetails(), false); - GenericTestUtils.waitFor(() -> { - try { - return containerManager.getContainer(containerID).getState() == HddsProtos.LifeCycleState.CLOSED; - } catch (ContainerNotFoundException e) { - fail(e); - } - return false; - }, 2000, 20000); + waitForContainerStateInSCM(cluster.getStorageContainerManager(), containerID, HddsProtos.LifeCycleState.CLOSED); assertEquals(HddsProtos.LifeCycleState.CLOSED, containerManager.getContainer(containerID).getState()); } diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/TestContainerReportHandlingWithHA.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/TestContainerReportHandlingWithHA.java index 24438c35dce8..9139071a5489 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/TestContainerReportHandlingWithHA.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/TestContainerReportHandlingWithHA.java @@ -23,11 +23,11 @@ import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_DEADNODE_INTERVAL; import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_STALENODE_INTERVAL; import static org.apache.hadoop.ozone.container.TestHelper.waitForContainerClose; +import static org.apache.hadoop.ozone.container.TestHelper.waitForContainerStateInSCM; import static org.assertj.core.api.Assertions.assertThat; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; -import static org.junit.jupiter.api.Assertions.fail; import java.io.IOException; import java.io.OutputStream; @@ -41,7 +41,6 @@ import org.apache.hadoop.hdds.protocol.proto.HddsProtos; import org.apache.hadoop.hdds.scm.container.ContainerID; import org.apache.hadoop.hdds.scm.container.ContainerManager; -import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException; import org.apache.hadoop.ozone.HddsDatanodeService; import org.apache.hadoop.ozone.MiniOzoneCluster; import org.apache.hadoop.ozone.MiniOzoneHAClusterImpl; @@ -53,7 +52,6 @@ import org.apache.hadoop.ozone.om.helpers.OmKeyInfo; import org.apache.hadoop.ozone.om.helpers.OmKeyLocationInfo; import org.apache.hadoop.ozone.om.helpers.OmKeyLocationInfoGroup; -import org.apache.ozone.test.GenericTestUtils; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.EnumSource; @@ -96,21 +94,7 @@ void testDeletingOrDeletedContainerTransitionsToClosedWhenNonEmptyReplicaIsRepor OmKeyLocationInfo keyLocation = keyLocations.get(0); ContainerID containerID = ContainerID.valueOf(keyLocation.getContainerID()); waitForContainerClose(cluster, containerID.getId()); - - // also wait till the container is closed in all SCMs - GenericTestUtils.waitFor(() -> { - for (int i = 0; i < numSCM; i++) { - ContainerManager containerManager = cluster.getStorageContainerManager(i).getContainerManager(); - try { - if (containerManager.getContainer(containerID).getState() != HddsProtos.LifeCycleState.CLOSED) { - return false; - } - } catch (ContainerNotFoundException e) { - return false; - } - } - return true; - }, 2000, 20000); + waitForContainerStateInSCM(cluster.getScmLeader(), containerID, HddsProtos.LifeCycleState.CLOSED); // move the container to DELETING ContainerManager containerManager = cluster.getScmLeader().getContainerManager(); @@ -126,23 +110,7 @@ void testDeletingOrDeletedContainerTransitionsToClosedWhenNonEmptyReplicaIsRepor // restart a DN and wait for the container to get CLOSED in all SCMs HddsDatanodeService dn = cluster.getHddsDatanode(keyLocation.getPipeline().getFirstNode()); cluster.restartHddsDatanode(dn.getDatanodeDetails(), false); - ContainerManager[] array = new ContainerManager[numSCM]; - for (int i = 0; i < numSCM; i++) { - array[i] = cluster.getStorageContainerManager(i).getContainerManager(); - } - GenericTestUtils.waitFor(() -> { - try { - for (ContainerManager manager : array) { - if (manager.getContainer(containerID).getState() != HddsProtos.LifeCycleState.CLOSED) { - return false; - } - } - return true; - } catch (ContainerNotFoundException e) { - fail(e); - } - return false; - }, 2000, 20000); + waitForContainerStateInSCM(cluster.getScmLeader(), containerID, HddsProtos.LifeCycleState.CLOSED); assertEquals(HddsProtos.LifeCycleState.CLOSED, containerManager.getContainer(containerID).getState()); } diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/TestHelper.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/TestHelper.java index 071386858623..c454facea070 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/TestHelper.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/TestHelper.java @@ -24,6 +24,7 @@ import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertInstanceOf; import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.fail; import java.io.IOException; import java.security.MessageDigest; @@ -455,4 +456,20 @@ public static void setConfig(OzoneConfiguration conf, String key, String value) conf.set(key, value); } } + + public static void waitForContainerStateInSCM(StorageContainerManager scm, + ContainerID containerID, HddsProtos.LifeCycleState expectedState) + throws TimeoutException, InterruptedException { + ContainerManager containerManager = scm.getContainerManager(); + GenericTestUtils.waitFor(() -> { + try { + return containerManager.getContainer(containerID).getState() == expectedState; + } catch (ContainerNotFoundException e) { + LOG.error("Container {} not found while waiting for state {}", + containerID, expectedState, e); + fail("Container " + containerID + " not found while waiting for state " + expectedState + ": " + e); + return false; + } + }, 2000, 20000); + } } From d21a7ec524b485eb6518cd4eef5c5ae96d97e64b Mon Sep 17 00:00:00 2001 From: peterxcli Date: Thu, 13 Mar 2025 16:57:29 +0000 Subject: [PATCH 3/3] Addressed review: Check container status in all scm --- .../TestContainerReportHandlingWithHA.java | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/TestContainerReportHandlingWithHA.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/TestContainerReportHandlingWithHA.java index 9139071a5489..357945a3fa5f 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/TestContainerReportHandlingWithHA.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/TestContainerReportHandlingWithHA.java @@ -35,12 +35,14 @@ import java.nio.file.Paths; import java.util.List; import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.hdds.client.RatisReplicationConfig; import org.apache.hadoop.hdds.conf.OzoneConfiguration; import org.apache.hadoop.hdds.protocol.proto.HddsProtos; import org.apache.hadoop.hdds.scm.container.ContainerID; import org.apache.hadoop.hdds.scm.container.ContainerManager; +import org.apache.hadoop.hdds.scm.server.StorageContainerManager; import org.apache.hadoop.ozone.HddsDatanodeService; import org.apache.hadoop.ozone.MiniOzoneCluster; import org.apache.hadoop.ozone.MiniOzoneHAClusterImpl; @@ -94,7 +96,8 @@ void testDeletingOrDeletedContainerTransitionsToClosedWhenNonEmptyReplicaIsRepor OmKeyLocationInfo keyLocation = keyLocations.get(0); ContainerID containerID = ContainerID.valueOf(keyLocation.getContainerID()); waitForContainerClose(cluster, containerID.getId()); - waitForContainerStateInSCM(cluster.getScmLeader(), containerID, HddsProtos.LifeCycleState.CLOSED); + + waitForContainerStateInAllSCMs(cluster, containerID, HddsProtos.LifeCycleState.CLOSED); // move the container to DELETING ContainerManager containerManager = cluster.getScmLeader().getContainerManager(); @@ -110,7 +113,8 @@ void testDeletingOrDeletedContainerTransitionsToClosedWhenNonEmptyReplicaIsRepor // restart a DN and wait for the container to get CLOSED in all SCMs HddsDatanodeService dn = cluster.getHddsDatanode(keyLocation.getPipeline().getFirstNode()); cluster.restartHddsDatanode(dn.getDatanodeDetails(), false); - waitForContainerStateInSCM(cluster.getScmLeader(), containerID, HddsProtos.LifeCycleState.CLOSED); + + waitForContainerStateInAllSCMs(cluster, containerID, HddsProtos.LifeCycleState.CLOSED); assertEquals(HddsProtos.LifeCycleState.CLOSED, containerManager.getContainer(containerID).getState()); } @@ -158,4 +162,12 @@ private void createTestData(OzoneClient client) throws IOException { } } + private static void waitForContainerStateInAllSCMs(MiniOzoneHAClusterImpl cluster, ContainerID containerID, + HddsProtos.LifeCycleState desiredState) + throws TimeoutException, InterruptedException { + for (StorageContainerManager scm : cluster.getStorageContainerManagersList()) { + waitForContainerStateInSCM(scm, containerID, desiredState); + } + } + }