From 329d531a98aeb463c3cc3f3bf11c6c9203748c19 Mon Sep 17 00:00:00 2001
From: Siyao Meng <50227127+smengcl@users.noreply.github.com>
Date: Fri, 14 Feb 2025 14:44:27 -0800
Subject: [PATCH 1/9] HDDS-12150. Abnormal container states should not crash
the SCM ContainerReportHandler thread
---
.../AbstractContainerReportHandler.java | 38 +++++++-
.../container/TestContainerReportHandler.java | 94 +++++++++++++++++--
2 files changed, 119 insertions(+), 13 deletions(-)
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/AbstractContainerReportHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/AbstractContainerReportHandler.java
index 34682b85bccd..589ae0a8d697 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/AbstractContainerReportHandler.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/AbstractContainerReportHandler.java
@@ -291,9 +291,6 @@ private boolean updateContainerState(final DatanodeDetails datanode,
}
if (replica.getState() == State.CLOSED) {
- Preconditions.checkArgument(replica.getBlockCommitSequenceId()
- == container.getSequenceId());
-
/*
For an EC container, only the first index and the parity indexes are
guaranteed to have block data. So, update the container's state in SCM
@@ -312,6 +309,9 @@ private boolean updateContainerState(final DatanodeDetails datanode,
logger.info("Moving container {} to CLOSED state, datanode {} " +
"reported CLOSED replica with index {}.", containerId, datanode,
replica.getReplicaIndex());
+ if (!verifyBcsId(replica.getBlockCommitSequenceId(), container.getSequenceId(), datanode, containerId)) {
+ return true; // ignored = true
+ }
containerManager.updateContainerState(containerId,
LifeCycleEvent.CLOSE);
}
@@ -336,8 +336,9 @@ private boolean updateContainerState(final DatanodeDetails datanode,
if (replica.getState() == State.CLOSED) {
logger.info("Moving container {} to CLOSED state, datanode {} " +
"reported CLOSED replica.", containerId, datanode);
- Preconditions.checkArgument(replica.getBlockCommitSequenceId()
- == container.getSequenceId());
+ if (!verifyBcsId(replica.getBlockCommitSequenceId(), container.getSequenceId(), datanode, containerId)) {
+ return true; // ignored = true
+ }
containerManager.updateContainerState(containerId,
LifeCycleEvent.FORCE_CLOSE);
}
@@ -376,6 +377,33 @@ private boolean updateContainerState(final DatanodeDetails datanode,
return ignored;
}
+ /**
+ * Helper method to verify that the replica's bcsId matches the container's in SCM.
+ * Throws IOException if the bcsIds do not match.
+ *
+ * @param replicaBcsId Replica bcsId
+ * @param containerBcsId Container bcsId in SCM
+ * @param datanode DatanodeDetails for logging
+ * @param containerId ContainerID for logging
+ * @return true if verification has passed, false otherwise
+ * @throws IOException Thrown when bcsIds do not match
+ */
+ private boolean verifyBcsId(long replicaBcsId, long containerBcsId,
+ DatanodeDetails datanode, ContainerID containerId) throws IOException {
+
+ if (replicaBcsId != containerBcsId) {
+ final String errMsg = "Unexpected bcsId for container " + containerId +
+ " from datanode " + datanode + ". replica's: " + replicaBcsId +
+ ", SCM's: " + containerBcsId +
+ ". Ignoring container report for " + containerId;
+
+ logger.error(errMsg);
+ return false;
+ } else {
+ return true;
+ }
+ }
+
private void updateContainerReplica(final DatanodeDetails datanodeDetails,
final ContainerID containerId,
final ContainerReplicaProto replicaProto)
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java
index 0a863bc8165a..6c298e864279 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java
@@ -169,7 +169,7 @@ private void testReplicaIndexUpdate(ContainerInfo container,
Map expectedReplicaMap) {
final ContainerReportsProto containerReport = getContainerReportsProto(
container.containerID(), ContainerReplicaProto.State.CLOSED,
- dn.getUuidString(), 2000000000L, 100000000L, replicaIndex);
+ dn.getUuidString(), 2000000000L, 100000000L, 10000L, replicaIndex);
final ContainerReportFromDatanode containerReportFromDatanode =
new ContainerReportFromDatanode(dn, containerReport);
final ContainerReportHandler reportHandler = new ContainerReportHandler(
@@ -604,7 +604,7 @@ private void createAndHandleContainerReport(ContainerID containerID,
@Test
public void testClosingToQuasiClosed()
- throws NodeNotFoundException, IOException, TimeoutException {
+ throws NodeNotFoundException, IOException {
/*
* The container is in CLOSING state and all the replicas are in
* OPEN/CLOSING state.
@@ -671,7 +671,7 @@ public void testClosingToQuasiClosed()
@Test
public void testQuasiClosedToClosed()
- throws NodeNotFoundException, IOException, TimeoutException {
+ throws NodeNotFoundException, IOException {
/*
* The container is in QUASI_CLOSED state.
* - One of the replica is in QUASI_CLOSED state
@@ -740,6 +740,70 @@ public void testQuasiClosedToClosed()
assertEquals(LifeCycleState.CLOSED, containerManager.getContainer(containerOne.containerID()).getState());
}
+ @Test
+ public void testQuasiClosedToClosedAttemptWithMismatchingBCSID()
+ throws NodeNotFoundException, IOException {
+ /*
+ * Negative test. When a replica with a (lower) mismatching bcsId gets reported,
+ * expect the ContainerReportHandler thread to not throw uncaught exception
+ * (which could lead to ContainerReportHandler thread crash before HDDS-12150)
+ */
+
+ final ContainerReportHandler reportHandler = new ContainerReportHandler(nodeManager, containerManager);
+ final Iterator nodeIterator = nodeManager.getNodes(
+ NodeStatus.inServiceHealthy()).iterator();
+
+ final DatanodeDetails datanodeOne = nodeIterator.next();
+ final DatanodeDetails datanodeTwo = nodeIterator.next();
+ final DatanodeDetails datanodeThree = nodeIterator.next();
+
+ final ContainerInfo containerOne = getContainer(LifeCycleState.QUASI_CLOSED);
+ final ContainerInfo containerTwo = getContainer(LifeCycleState.CLOSED);
+
+ final Set containerIDSet = Stream.of(
+ containerOne.containerID(), containerTwo.containerID())
+ .collect(Collectors.toSet());
+ final Set containerOneReplicas = getReplicas(
+ containerOne.containerID(), ContainerReplicaProto.State.QUASI_CLOSED,
+ 10000L, // sequenceId
+ datanodeOne);
+ containerOneReplicas.addAll(getReplicas(
+ containerOne.containerID(), ContainerReplicaProto.State.CLOSING,
+ 10000L,
+ datanodeTwo, datanodeThree));
+ final Set containerTwoReplicas = getReplicas(
+ containerTwo.containerID(), ContainerReplicaProto.State.CLOSED,
+ 10000L,
+ datanodeOne, datanodeTwo, datanodeThree);
+
+ nodeManager.setContainers(datanodeOne, containerIDSet);
+ nodeManager.setContainers(datanodeTwo, containerIDSet);
+ nodeManager.setContainers(datanodeThree, containerIDSet);
+
+ containerStateManager.addContainer(containerOne.getProtobuf());
+ containerStateManager.addContainer(containerTwo.getProtobuf());
+
+ containerOneReplicas.forEach(r ->
+ containerStateManager.updateContainerReplica(
+ containerTwo.containerID(), r));
+
+ containerTwoReplicas.forEach(r ->
+ containerStateManager.updateContainerReplica(
+ containerTwo.containerID(), r));
+
+
+ final ContainerReportsProto containerReport = getContainerReportsProto(
+ containerOne.containerID(), ContainerReplicaProto.State.CLOSED,
+ datanodeOne.getUuidString(),
+ 2000L);
+
+ final ContainerReportFromDatanode containerReportFromDatanode =
+ new ContainerReportFromDatanode(datanodeOne, containerReport);
+ reportHandler.onMessage(containerReportFromDatanode, publisher);
+
+ assertEquals(LifeCycleState.QUASI_CLOSED, containerManager.getContainer(containerOne.containerID()).getState());
+ }
+
@Test
public void openContainerKeyAndBytesUsedUpdatedToMinimumOfAllReplicas()
throws IOException, TimeoutException {
@@ -1095,7 +1159,7 @@ private ContainerReportFromDatanode getContainerReportFromDatanode(
DatanodeDetails dn, long bytesUsed, long keyCount, int replicaIndex) {
ContainerReportsProto containerReport = getContainerReportsProto(
containerId, state, dn.getUuidString(), bytesUsed, keyCount,
- replicaIndex);
+ 10000L, replicaIndex);
return new ContainerReportFromDatanode(dn, containerReport);
}
@@ -1104,20 +1168,34 @@ protected static ContainerReportsProto getContainerReportsProto(
final ContainerID containerId, final ContainerReplicaProto.State state,
final String originNodeId) {
return getContainerReportsProto(containerId, state, originNodeId,
- 2000000000L, 100000000L, 0);
+ 2000000000L, 100000000L, 10000L, 0);
+ }
+
+ protected static ContainerReportsProto getContainerReportsProto(
+ final ContainerID containerId, final ContainerReplicaProto.State state,
+ final String originNodeId, final long bcsId) {
+ return getContainerReportsProto(containerId, state, originNodeId,
+ 2000000000L, 100000000L, bcsId, 0);
}
protected static ContainerReportsProto getContainerReportsProto(
final ContainerID containerId, final ContainerReplicaProto.State state,
final String originNodeId, int replicaIndex) {
return getContainerReportsProto(containerId, state, originNodeId,
- 2000000000L, 100000000L, replicaIndex);
+ 2000000000L, 100000000L, 10000L, replicaIndex);
+ }
+
+ protected static ContainerReportsProto getContainerReportsProto(
+ final ContainerID containerId, final ContainerReplicaProto.State state,
+ final String originNodeId, final long bcsId, int replicaIndex) {
+ return getContainerReportsProto(containerId, state, originNodeId,
+ 2000000000L, 100000000L, bcsId, replicaIndex);
}
protected static ContainerReportsProto getContainerReportsProto(
final ContainerID containerId, final ContainerReplicaProto.State state,
final String originNodeId, final long usedBytes, final long keyCount,
- final int replicaIndex) {
+ final long bcsId, final int replicaIndex) {
final ContainerReportsProto.Builder crBuilder =
ContainerReportsProto.newBuilder();
final ContainerReplicaProto replicaProto =
@@ -1133,7 +1211,7 @@ protected static ContainerReportsProto getContainerReportsProto(
.setWriteCount(100000000L)
.setReadBytes(2000000000L)
.setWriteBytes(2000000000L)
- .setBlockCommitSequenceId(10000L)
+ .setBlockCommitSequenceId(bcsId)
.setDeleteTransactionId(0)
.setReplicaIndex(replicaIndex)
.build();
From 22b4b787e8f1b53ab8795061deac2dde20f3ac3d Mon Sep 17 00:00:00 2001
From: Siyao Meng <50227127+smengcl@users.noreply.github.com>
Date: Fri, 14 Feb 2025 18:15:57 -0800
Subject: [PATCH 2/9] Clean up test case.
---
.../container/TestContainerReportHandler.java | 74 ++++++++-----------
1 file changed, 30 insertions(+), 44 deletions(-)
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java
index 6c298e864279..568364539da9 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java
@@ -67,6 +67,7 @@
import static org.apache.hadoop.hdds.protocol.MockDatanodeDetails.randomDatanodeDetails;
import static org.apache.hadoop.hdds.scm.HddsTestUtils.getContainerReports;
+import static org.junit.jupiter.api.Assertions.fail;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
import static org.mockito.Mockito.doAnswer;
@@ -669,6 +670,11 @@ public void testClosingToQuasiClosed()
assertEquals(LifeCycleState.QUASI_CLOSED, containerManager.getContainer(containerOne.containerID()).getState());
}
+ @Test
+ public void testClosingToQuasiClosedWithMismatchingBCSID()
+ throws NodeNotFoundException, IOException {
+ }
+
@Test
public void testQuasiClosedToClosed()
throws NodeNotFoundException, IOException {
@@ -748,60 +754,40 @@ public void testQuasiClosedToClosedAttemptWithMismatchingBCSID()
* expect the ContainerReportHandler thread to not throw uncaught exception
* (which could lead to ContainerReportHandler thread crash before HDDS-12150)
*/
+ final ContainerReportHandler reportHandler =
+ new ContainerReportHandler(nodeManager, containerManager);
+ final Iterator nodeIterator =
+ nodeManager.getNodes(NodeStatus.inServiceHealthy()).iterator();
- final ContainerReportHandler reportHandler = new ContainerReportHandler(nodeManager, containerManager);
- final Iterator nodeIterator = nodeManager.getNodes(
- NodeStatus.inServiceHealthy()).iterator();
-
- final DatanodeDetails datanodeOne = nodeIterator.next();
- final DatanodeDetails datanodeTwo = nodeIterator.next();
- final DatanodeDetails datanodeThree = nodeIterator.next();
-
- final ContainerInfo containerOne = getContainer(LifeCycleState.QUASI_CLOSED);
- final ContainerInfo containerTwo = getContainer(LifeCycleState.CLOSED);
-
- final Set containerIDSet = Stream.of(
- containerOne.containerID(), containerTwo.containerID())
- .collect(Collectors.toSet());
- final Set containerOneReplicas = getReplicas(
- containerOne.containerID(), ContainerReplicaProto.State.QUASI_CLOSED,
- 10000L, // sequenceId
- datanodeOne);
- containerOneReplicas.addAll(getReplicas(
- containerOne.containerID(), ContainerReplicaProto.State.CLOSING,
- 10000L,
- datanodeTwo, datanodeThree));
- final Set containerTwoReplicas = getReplicas(
- containerTwo.containerID(), ContainerReplicaProto.State.CLOSED,
- 10000L,
- datanodeOne, datanodeTwo, datanodeThree);
-
- nodeManager.setContainers(datanodeOne, containerIDSet);
- nodeManager.setContainers(datanodeTwo, containerIDSet);
- nodeManager.setContainers(datanodeThree, containerIDSet);
+ final DatanodeDetails dn1 = nodeIterator.next();
+ final DatanodeDetails dn2 = nodeIterator.next();
+ final DatanodeDetails dn3 = nodeIterator.next();
- containerStateManager.addContainer(containerOne.getProtobuf());
- containerStateManager.addContainer(containerTwo.getProtobuf());
-
- containerOneReplicas.forEach(r ->
- containerStateManager.updateContainerReplica(
- containerTwo.containerID(), r));
+ final ContainerInfo container1 = getContainer(LifeCycleState.QUASI_CLOSED);
- containerTwoReplicas.forEach(r ->
- containerStateManager.updateContainerReplica(
- containerTwo.containerID(), r));
+ nodeManager.addContainer(dn1, container1.containerID());
+ nodeManager.addContainer(dn2, container1.containerID());
+ nodeManager.addContainer(dn3, container1.containerID());
+ containerStateManager.addContainer(container1.getProtobuf());
final ContainerReportsProto containerReport = getContainerReportsProto(
- containerOne.containerID(), ContainerReplicaProto.State.CLOSED,
- datanodeOne.getUuidString(),
+ container1.containerID(), ContainerReplicaProto.State.CLOSED,
+ dn1.getUuidString(),
2000L);
final ContainerReportFromDatanode containerReportFromDatanode =
- new ContainerReportFromDatanode(datanodeOne, containerReport);
- reportHandler.onMessage(containerReportFromDatanode, publisher);
+ new ContainerReportFromDatanode(dn1, containerReport);
- assertEquals(LifeCycleState.QUASI_CLOSED, containerManager.getContainer(containerOne.containerID()).getState());
+ // Handler should NOT throw IllegalArgumentException
+ try {
+ reportHandler.onMessage(containerReportFromDatanode, publisher);
+ } catch (IllegalArgumentException iaEx) {
+ fail("Handler should not throw IllegalArgumentException: " + iaEx.getMessage());
+ }
+
+ // Because the container report is ignored, the container remains in QUASI_CLOSED for SCM
+ assertEquals(LifeCycleState.QUASI_CLOSED, containerManager.getContainer(container1.containerID()).getState());
}
@Test
From 7a7d5b986b0db890d529f0aa1ef64b3e5d9aadc3 Mon Sep 17 00:00:00 2001
From: Siyao Meng <50227127+smengcl@users.noreply.github.com>
Date: Tue, 25 Feb 2025 11:45:40 -0800
Subject: [PATCH 3/9] Address Nanda's comments.
---
.../AbstractContainerReportHandler.java | 24 ++++++++++++-------
.../container/TestContainerReportHandler.java | 3 ++-
2 files changed, 17 insertions(+), 10 deletions(-)
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/AbstractContainerReportHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/AbstractContainerReportHandler.java
index 589ae0a8d697..988494a0135e 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/AbstractContainerReportHandler.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/AbstractContainerReportHandler.java
@@ -306,12 +306,15 @@ private boolean updateContainerState(final DatanodeDetails datanode,
}
}
- logger.info("Moving container {} to CLOSED state, datanode {} " +
- "reported CLOSED replica with index {}.", containerId, datanode,
- replica.getReplicaIndex());
if (!verifyBcsId(replica.getBlockCommitSequenceId(), container.getSequenceId(), datanode, containerId)) {
- return true; // ignored = true
+ logger.warn("Ignored moving container {} from CLOSING to CLOSED state because replica bcsId ({}) " +
+ "reported by datanode {} does not match sequenceId ({{}}).",
+ containerId, replica.getBlockCommitSequenceId(), datanode, container.getSequenceId());
+ return true;
}
+ logger.info("Moving container {} to CLOSED state, datanode {} " +
+ "reported CLOSED replica with index {}.", containerId, datanode,
+ replica.getReplicaIndex());
containerManager.updateContainerState(containerId,
LifeCycleEvent.CLOSE);
}
@@ -334,11 +337,15 @@ private boolean updateContainerState(final DatanodeDetails datanode,
*
*/
if (replica.getState() == State.CLOSED) {
- logger.info("Moving container {} to CLOSED state, datanode {} " +
- "reported CLOSED replica.", containerId, datanode);
if (!verifyBcsId(replica.getBlockCommitSequenceId(), container.getSequenceId(), datanode, containerId)) {
- return true; // ignored = true
+ logger.warn("Ignored moving container {} from QUASI_CLOSED to CLOSED state because replica bcsId ({}) " +
+ "reported by datanode {} does not match sequenceId ({{}}).",
+ containerId, replica.getBlockCommitSequenceId(), datanode, container.getSequenceId());
+ return true;
}
+ logger.info("Moving container {} to CLOSED state, datanode {} " +
+ "reported CLOSED replica with index {}.", containerId, datanode,
+ replica.getReplicaIndex());
containerManager.updateContainerState(containerId,
LifeCycleEvent.FORCE_CLOSE);
}
@@ -386,10 +393,9 @@ private boolean updateContainerState(final DatanodeDetails datanode,
* @param datanode DatanodeDetails for logging
* @param containerId ContainerID for logging
* @return true if verification has passed, false otherwise
- * @throws IOException Thrown when bcsIds do not match
*/
private boolean verifyBcsId(long replicaBcsId, long containerBcsId,
- DatanodeDetails datanode, ContainerID containerId) throws IOException {
+ DatanodeDetails datanode, ContainerID containerId) {
if (replicaBcsId != containerBcsId) {
final String errMsg = "Unexpected bcsId for container " + containerId +
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java
index 568364539da9..033e62e89664 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java
@@ -763,6 +763,7 @@ public void testQuasiClosedToClosedAttemptWithMismatchingBCSID()
final DatanodeDetails dn2 = nodeIterator.next();
final DatanodeDetails dn3 = nodeIterator.next();
+ // sequenceId 10000L set here
final ContainerInfo container1 = getContainer(LifeCycleState.QUASI_CLOSED);
nodeManager.addContainer(dn1, container1.containerID());
@@ -771,11 +772,11 @@ public void testQuasiClosedToClosedAttemptWithMismatchingBCSID()
containerStateManager.addContainer(container1.getProtobuf());
+ // Generate container report with replica in CLOSED state with lower bcsId
final ContainerReportsProto containerReport = getContainerReportsProto(
container1.containerID(), ContainerReplicaProto.State.CLOSED,
dn1.getUuidString(),
2000L);
-
final ContainerReportFromDatanode containerReportFromDatanode =
new ContainerReportFromDatanode(dn1, containerReport);
From 2a9b681ffb3204201583781c68cd89a72cd4e51b Mon Sep 17 00:00:00 2001
From: Siyao Meng <50227127+smengcl@users.noreply.github.com>
Date: Tue, 25 Feb 2025 13:37:37 -0800
Subject: [PATCH 4/9] Address Ritesh's comment.
---
.../hdds/scm/container/AbstractContainerReportHandler.java | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/AbstractContainerReportHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/AbstractContainerReportHandler.java
index 61a8107eea72..353b96de1298 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/AbstractContainerReportHandler.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/AbstractContainerReportHandler.java
@@ -308,7 +308,7 @@ private boolean updateContainerState(final DatanodeDetails datanode,
containerId, replica.getBlockCommitSequenceId(), datanode, container.getSequenceId());
return true;
}
- logger.info("Moving container {} to CLOSED state, datanode {} " +
+ logger.info("Moving container {} from CLOSING to CLOSED state, datanode {} " +
"reported CLOSED replica with index {}.", containerId, datanode,
replica.getReplicaIndex());
containerManager.updateContainerState(containerId,
@@ -339,7 +339,7 @@ private boolean updateContainerState(final DatanodeDetails datanode,
containerId, replica.getBlockCommitSequenceId(), datanode, container.getSequenceId());
return true;
}
- logger.info("Moving container {} to CLOSED state, datanode {} " +
+ logger.info("Moving container {} from QUASI_CLOSED to CLOSED state, datanode {} " +
"reported CLOSED replica with index {}.", containerId, datanode,
replica.getReplicaIndex());
containerManager.updateContainerState(containerId,
From 5e34049aa5219b66b3702999f76ad1d50a136fb4 Mon Sep 17 00:00:00 2001
From: Siyao Meng <50227127+smengcl@users.noreply.github.com>
Date: Tue, 25 Feb 2025 13:45:45 -0800
Subject: [PATCH 5/9] Add UT `testClosingToQuasiClosedWithMismatchingBCSID`.
(Could parameterize this)
---
.../container/TestContainerReportHandler.java | 41 +++++++++++++++++++
1 file changed, 41 insertions(+)
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java
index edf30f7810a6..e419e5279d80 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java
@@ -670,6 +670,47 @@ public void testClosingToQuasiClosed()
@Test
public void testClosingToQuasiClosedWithMismatchingBCSID()
throws NodeNotFoundException, IOException {
+ /*
+ * Negative test. When a replica with a (lower) mismatching bcsId gets reported,
+ * expect the ContainerReportHandler thread to not throw uncaught exception
+ * (which could lead to ContainerReportHandler thread crash before HDDS-12150)
+ */
+ final ContainerReportHandler reportHandler =
+ new ContainerReportHandler(nodeManager, containerManager);
+ final Iterator nodeIterator =
+ nodeManager.getNodes(NodeStatus.inServiceHealthy()).iterator();
+
+ final DatanodeDetails dn1 = nodeIterator.next();
+ final DatanodeDetails dn2 = nodeIterator.next();
+ final DatanodeDetails dn3 = nodeIterator.next();
+
+ // sequenceId 10000L set here
+ final ContainerInfo container1 = getContainer(LifeCycleState.CLOSING);
+
+ nodeManager.addContainer(dn1, container1.containerID());
+ nodeManager.addContainer(dn2, container1.containerID());
+ nodeManager.addContainer(dn3, container1.containerID());
+
+ containerStateManager.addContainer(container1.getProtobuf());
+
+ // Generate container report with replica in CLOSED state with lower bcsId
+ final ContainerReportsProto containerReport = getContainerReportsProto(
+ container1.containerID(), ContainerReplicaProto.State.CLOSED,
+ dn1.getUuidString(),
+ 2000L);
+ final ContainerReportFromDatanode containerReportFromDatanode =
+ new ContainerReportFromDatanode(dn1, containerReport);
+
+ // Handler should NOT throw IllegalArgumentException
+ try {
+ reportHandler.onMessage(containerReportFromDatanode, publisher);
+ } catch (IllegalArgumentException iaEx) {
+ fail("Handler should not throw IllegalArgumentException: " + iaEx.getMessage());
+ }
+
+ // Because the container report is ignored, the container remains in QUASI_CLOSED for SCM
+ assertEquals(LifeCycleState.CLOSING, containerManager.getContainer(container1.containerID()).getState());
+
}
@Test
From 4333d4f465668e2365138642c415d9f78c059eb9 Mon Sep 17 00:00:00 2001
From: Siyao Meng <50227127+smengcl@users.noreply.github.com>
Date: Tue, 25 Feb 2025 15:24:12 -0800
Subject: [PATCH 6/9] Refactor added UTs: parameterized and dedup'ed.
---
.../container/TestContainerReportHandler.java | 59 +++----------------
1 file changed, 8 insertions(+), 51 deletions(-)
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java
index e419e5279d80..c0ae1cd6de77 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java
@@ -75,6 +75,8 @@
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.EnumSource;
/**
* Test the behaviour of the ContainerReportHandler.
@@ -667,52 +669,6 @@ public void testClosingToQuasiClosed()
assertEquals(LifeCycleState.QUASI_CLOSED, containerManager.getContainer(containerOne.containerID()).getState());
}
- @Test
- public void testClosingToQuasiClosedWithMismatchingBCSID()
- throws NodeNotFoundException, IOException {
- /*
- * Negative test. When a replica with a (lower) mismatching bcsId gets reported,
- * expect the ContainerReportHandler thread to not throw uncaught exception
- * (which could lead to ContainerReportHandler thread crash before HDDS-12150)
- */
- final ContainerReportHandler reportHandler =
- new ContainerReportHandler(nodeManager, containerManager);
- final Iterator nodeIterator =
- nodeManager.getNodes(NodeStatus.inServiceHealthy()).iterator();
-
- final DatanodeDetails dn1 = nodeIterator.next();
- final DatanodeDetails dn2 = nodeIterator.next();
- final DatanodeDetails dn3 = nodeIterator.next();
-
- // sequenceId 10000L set here
- final ContainerInfo container1 = getContainer(LifeCycleState.CLOSING);
-
- nodeManager.addContainer(dn1, container1.containerID());
- nodeManager.addContainer(dn2, container1.containerID());
- nodeManager.addContainer(dn3, container1.containerID());
-
- containerStateManager.addContainer(container1.getProtobuf());
-
- // Generate container report with replica in CLOSED state with lower bcsId
- final ContainerReportsProto containerReport = getContainerReportsProto(
- container1.containerID(), ContainerReplicaProto.State.CLOSED,
- dn1.getUuidString(),
- 2000L);
- final ContainerReportFromDatanode containerReportFromDatanode =
- new ContainerReportFromDatanode(dn1, containerReport);
-
- // Handler should NOT throw IllegalArgumentException
- try {
- reportHandler.onMessage(containerReportFromDatanode, publisher);
- } catch (IllegalArgumentException iaEx) {
- fail("Handler should not throw IllegalArgumentException: " + iaEx.getMessage());
- }
-
- // Because the container report is ignored, the container remains in QUASI_CLOSED for SCM
- assertEquals(LifeCycleState.CLOSING, containerManager.getContainer(container1.containerID()).getState());
-
- }
-
@Test
public void testQuasiClosedToClosed()
throws NodeNotFoundException, IOException {
@@ -784,8 +740,9 @@ public void testQuasiClosedToClosed()
assertEquals(LifeCycleState.CLOSED, containerManager.getContainer(containerOne.containerID()).getState());
}
- @Test
- public void testQuasiClosedToClosedAttemptWithMismatchingBCSID()
+ @ParameterizedTest
+ @EnumSource(value = LifeCycleState.class, names = {"CLOSING", "QUASI_CLOSED"})
+ public void testContainerStateTransitionToQuasiClosedWithMismatchingBCSID(LifeCycleState lcState)
throws NodeNotFoundException, IOException {
/*
* Negative test. When a replica with a (lower) mismatching bcsId gets reported,
@@ -802,7 +759,7 @@ public void testQuasiClosedToClosedAttemptWithMismatchingBCSID()
final DatanodeDetails dn3 = nodeIterator.next();
// sequenceId 10000L set here
- final ContainerInfo container1 = getContainer(LifeCycleState.QUASI_CLOSED);
+ final ContainerInfo container1 = getContainer(lcState);
nodeManager.addContainer(dn1, container1.containerID());
nodeManager.addContainer(dn2, container1.containerID());
@@ -825,8 +782,8 @@ public void testQuasiClosedToClosedAttemptWithMismatchingBCSID()
fail("Handler should not throw IllegalArgumentException: " + iaEx.getMessage());
}
- // Because the container report is ignored, the container remains in QUASI_CLOSED for SCM
- assertEquals(LifeCycleState.QUASI_CLOSED, containerManager.getContainer(container1.containerID()).getState());
+ // Because the container report is ignored, the container remains in the same previous state in SCM
+ assertEquals(lcState, containerManager.getContainer(container1.containerID()).getState());
}
@Test
From 70b56b7bdc210be9b2ecd30214f651870c0530f6 Mon Sep 17 00:00:00 2001
From: Siyao Meng <50227127+smengcl@users.noreply.github.com>
Date: Tue, 25 Feb 2025 15:26:45 -0800
Subject: [PATCH 7/9] Correct UT name
---
.../hadoop/hdds/scm/container/TestContainerReportHandler.java | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java
index c0ae1cd6de77..a6afd310c37c 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java
@@ -742,7 +742,7 @@ public void testQuasiClosedToClosed()
@ParameterizedTest
@EnumSource(value = LifeCycleState.class, names = {"CLOSING", "QUASI_CLOSED"})
- public void testContainerStateTransitionToQuasiClosedWithMismatchingBCSID(LifeCycleState lcState)
+ public void testContainerStateTransitionToClosedWithMismatchingBCSID(LifeCycleState lcState)
throws NodeNotFoundException, IOException {
/*
* Negative test. When a replica with a (lower) mismatching bcsId gets reported,
From 5a657543a50215fdecf7a64733743a0c07a89d7d Mon Sep 17 00:00:00 2001
From: Siyao Meng <50227127+smengcl@users.noreply.github.com>
Date: Tue, 25 Feb 2025 15:42:13 -0800
Subject: [PATCH 8/9] Clean up
---
.../scm/container/AbstractContainerReportHandler.java | 4 ++--
.../hdds/scm/container/TestContainerReportHandler.java | 8 ++++----
.../container/TestIncrementalContainerReportHandler.java | 4 ++--
3 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/AbstractContainerReportHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/AbstractContainerReportHandler.java
index 353b96de1298..aeb96cbf3196 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/AbstractContainerReportHandler.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/AbstractContainerReportHandler.java
@@ -304,7 +304,7 @@ private boolean updateContainerState(final DatanodeDetails datanode,
if (!verifyBcsId(replica.getBlockCommitSequenceId(), container.getSequenceId(), datanode, containerId)) {
logger.warn("Ignored moving container {} from CLOSING to CLOSED state because replica bcsId ({}) " +
- "reported by datanode {} does not match sequenceId ({{}}).",
+ "reported by datanode {} does not match sequenceId ({}).",
containerId, replica.getBlockCommitSequenceId(), datanode, container.getSequenceId());
return true;
}
@@ -335,7 +335,7 @@ private boolean updateContainerState(final DatanodeDetails datanode,
if (replica.getState() == State.CLOSED) {
if (!verifyBcsId(replica.getBlockCommitSequenceId(), container.getSequenceId(), datanode, containerId)) {
logger.warn("Ignored moving container {} from QUASI_CLOSED to CLOSED state because replica bcsId ({}) " +
- "reported by datanode {} does not match sequenceId ({{}}).",
+ "reported by datanode {} does not match sequenceId ({}).",
containerId, replica.getBlockCommitSequenceId(), datanode, container.getSequenceId());
return true;
}
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java
index a6afd310c37c..f1a9091a991c 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestContainerReportHandler.java
@@ -746,8 +746,8 @@ public void testContainerStateTransitionToClosedWithMismatchingBCSID(LifeCycleSt
throws NodeNotFoundException, IOException {
/*
* Negative test. When a replica with a (lower) mismatching bcsId gets reported,
- * expect the ContainerReportHandler thread to not throw uncaught exception
- * (which could lead to ContainerReportHandler thread crash before HDDS-12150)
+ * expect the ContainerReportHandler thread to not throw uncaught exception.
+ * (That exception lead to ContainerReportHandler thread crash before HDDS-12150.)
*/
final ContainerReportHandler reportHandler =
new ContainerReportHandler(nodeManager, containerManager);
@@ -758,7 +758,7 @@ public void testContainerStateTransitionToClosedWithMismatchingBCSID(LifeCycleSt
final DatanodeDetails dn2 = nodeIterator.next();
final DatanodeDetails dn3 = nodeIterator.next();
- // sequenceId 10000L set here
+ // Initial sequenceId 10000L is set here
final ContainerInfo container1 = getContainer(lcState);
nodeManager.addContainer(dn1, container1.containerID());
@@ -767,7 +767,7 @@ public void testContainerStateTransitionToClosedWithMismatchingBCSID(LifeCycleSt
containerStateManager.addContainer(container1.getProtobuf());
- // Generate container report with replica in CLOSED state with lower bcsId
+ // Generate container report with replica in CLOSED state with intentional lower bcsId
final ContainerReportsProto containerReport = getContainerReportsProto(
container1.containerID(), ContainerReplicaProto.State.CLOSED,
dn1.getUuidString(),
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestIncrementalContainerReportHandler.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestIncrementalContainerReportHandler.java
index a4872c1f7456..64a08339a2be 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestIncrementalContainerReportHandler.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestIncrementalContainerReportHandler.java
@@ -339,7 +339,7 @@ private List setupECContainerForTesting(
}
@Test
- public void testClosingToQuasiClosed() throws IOException, TimeoutException {
+ public void testClosingToQuasiClosed() throws IOException {
final IncrementalContainerReportHandler reportHandler =
new IncrementalContainerReportHandler(
nodeManager, containerManager, scmContext);
@@ -372,7 +372,7 @@ public void testClosingToQuasiClosed() throws IOException, TimeoutException {
}
@Test
- public void testQuasiClosedToClosed() throws IOException, TimeoutException {
+ public void testQuasiClosedToClosed() throws IOException {
final IncrementalContainerReportHandler reportHandler =
new IncrementalContainerReportHandler(
nodeManager, containerManager, scmContext);
From f2288ffceb11c76071430b99f064fcebb5454583 Mon Sep 17 00:00:00 2001
From: Siyao Meng <50227127+smengcl@users.noreply.github.com>
Date: Tue, 25 Feb 2025 15:50:49 -0800
Subject: [PATCH 9/9] Add parameterized UT in
`TestIncrementalContainerReportHandler` as well.
---
...TestIncrementalContainerReportHandler.java | 80 +++++++++++++++++--
1 file changed, 74 insertions(+), 6 deletions(-)
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestIncrementalContainerReportHandler.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestIncrementalContainerReportHandler.java
index 64a08339a2be..fa92fa1f7749 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestIncrementalContainerReportHandler.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestIncrementalContainerReportHandler.java
@@ -27,6 +27,7 @@
import static org.apache.hadoop.hdds.upgrade.HDDSLayoutVersionManager.maxLayoutVersion;
import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.fail;
import static org.mockito.Mockito.any;
import static org.mockito.Mockito.doAnswer;
import static org.mockito.Mockito.mock;
@@ -87,6 +88,8 @@
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.EnumSource;
/**
* Test cases to verify the functionality of IncrementalContainerReportHandler.
@@ -407,6 +410,59 @@ public void testQuasiClosedToClosed() throws IOException {
assertEquals(LifeCycleState.CLOSED, containerManager.getContainer(container.containerID()).getState());
}
+ @ParameterizedTest
+ @EnumSource(value = LifeCycleState.class, names = {"CLOSING", "QUASI_CLOSED"})
+ public void testContainerStateTransitionToClosedWithMismatchingBCSID(LifeCycleState lcState) throws IOException {
+ /*
+ * Negative test. When a replica with a (lower) mismatching bcsId gets reported,
+ * expect the ContainerReportHandler thread to not throw uncaught exception.
+ * (That exception lead to ContainerReportHandler thread crash before HDDS-12150.)
+ */
+ final IncrementalContainerReportHandler reportHandler =
+ new IncrementalContainerReportHandler(nodeManager, containerManager, scmContext);
+
+ // Initial sequenceId 10000L is set here
+ final ContainerInfo container = getContainer(lcState);
+ final DatanodeDetails datanodeOne = randomDatanodeDetails();
+ final DatanodeDetails datanodeTwo = randomDatanodeDetails();
+ final DatanodeDetails datanodeThree = randomDatanodeDetails();
+ nodeManager.register(datanodeOne, null, null);
+ nodeManager.register(datanodeTwo, null, null);
+ nodeManager.register(datanodeThree, null, null);
+
+ final Set containerReplicas = getReplicas(
+ container.containerID(),
+ ContainerReplicaProto.State.CLOSING,
+ datanodeOne, datanodeTwo);
+ containerReplicas.addAll(getReplicas(
+ container.containerID(),
+ ContainerReplicaProto.State.QUASI_CLOSED,
+ datanodeThree));
+
+ containerStateManager.addContainer(container.getProtobuf());
+ containerReplicas.forEach(r -> containerStateManager.updateContainerReplica(
+ container.containerID(), r));
+
+ // Generate incremental container report with replica in CLOSED state with intentional lower bcsId
+ final IncrementalContainerReportProto containerReport =
+ getIncrementalContainerReportProto(container.containerID(),
+ CLOSED, datanodeThree.getUuidString(), false, 0,
+ 2000L);
+ final IncrementalContainerReportFromDatanode icr =
+ new IncrementalContainerReportFromDatanode(
+ datanodeOne, containerReport);
+
+ // Handler should NOT throw IllegalArgumentException
+ try {
+ reportHandler.onMessage(icr, publisher);
+ } catch (IllegalArgumentException iaEx) {
+ fail("Handler should not throw IllegalArgumentException: " + iaEx.getMessage());
+ }
+
+ // Because the container report is ignored, the container remains in the same previous state in SCM
+ assertEquals(lcState, containerManager.getContainer(container.containerID()).getState());
+ }
+
@Test
public void testOpenWithUnhealthyReplica() throws IOException {
final IncrementalContainerReportHandler reportHandler =
@@ -580,11 +636,23 @@ public void testICRFCRRace() throws IOException, NodeNotFoundException,
private static IncrementalContainerReportProto
getIncrementalContainerReportProto(
- final ContainerID containerId,
- final ContainerReplicaProto.State state,
- final String originNodeId,
- final boolean hasReplicaIndex,
- final int replicaIndex) {
+ final ContainerID containerId,
+ final ContainerReplicaProto.State state,
+ final String originNodeId,
+ final boolean hasReplicaIndex,
+ final int replicaIndex) {
+ return getIncrementalContainerReportProto(containerId, state, originNodeId,
+ hasReplicaIndex, replicaIndex, 10000L);
+ }
+
+ private static IncrementalContainerReportProto
+ getIncrementalContainerReportProto(
+ final ContainerID containerId,
+ final ContainerReplicaProto.State state,
+ final String originNodeId,
+ final boolean hasReplicaIndex,
+ final int replicaIndex,
+ final long bcsId) {
final ContainerReplicaProto.Builder replicaProto =
ContainerReplicaProto.newBuilder()
.setContainerID(containerId.getId())
@@ -598,7 +666,7 @@ public void testICRFCRRace() throws IOException, NodeNotFoundException,
.setWriteCount(100000000L)
.setReadBytes(2000000000L)
.setWriteBytes(2000000000L)
- .setBlockCommitSequenceId(10000L)
+ .setBlockCommitSequenceId(bcsId)
.setDeleteTransactionId(0);
if (hasReplicaIndex) {
replicaProto.setReplicaIndex(replicaIndex);