From f04e3393392b12589eeb1717c3ab0e2a0b8689f3 Mon Sep 17 00:00:00 2001 From: Aswin Shakil Balasubramanian Date: Tue, 21 Jan 2025 16:31:16 -0800 Subject: [PATCH 1/4] HDDS-11345. Add metrics specific to reconciliation tasks. --- .../checksum/ReconcileContainerTask.java | 4 +- .../TestReplicationSupervisor.java | 80 +++++++++++++------ 2 files changed, 59 insertions(+), 25 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ReconcileContainerTask.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ReconcileContainerTask.java index 5d949e90b194..f09258fdf6b5 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ReconcileContainerTask.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/checksum/ReconcileContainerTask.java @@ -70,12 +70,12 @@ protected Object getCommandForDebug() { } @Override - protected String getMetricName() { + public String getMetricName() { return "ContainerReconciliations"; } @Override - protected String getMetricDescriptionSegment() { + public String getMetricDescriptionSegment() { return "Container Reconciliations"; } diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestReplicationSupervisor.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestReplicationSupervisor.java index 315e0c0253b4..4c476f33ea48 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestReplicationSupervisor.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestReplicationSupervisor.java @@ -26,6 +26,7 @@ import java.util.ArrayList; import java.time.Instant; import java.time.ZoneId; +import java.util.Collections; import java.util.List; import java.util.SortedMap; import java.util.UUID; @@ -50,6 +51,8 @@ import org.apache.hadoop.hdds.security.symmetric.SecretKeySignerClient; import org.apache.hadoop.hdds.security.x509.certificate.client.CertificateClient; import org.apache.hadoop.metrics2.impl.MetricsCollectorImpl; +import org.apache.hadoop.ozone.container.checksum.DNContainerOperationClient; +import org.apache.hadoop.ozone.container.checksum.ReconcileContainerTask; import org.apache.hadoop.ozone.container.common.impl.ContainerLayoutVersion; import org.apache.hadoop.ozone.container.common.impl.ContainerSet; import org.apache.hadoop.ozone.container.common.statemachine.DatanodeConfiguration; @@ -66,6 +69,7 @@ import org.apache.hadoop.ozone.container.keyvalue.KeyValueContainerData; import org.apache.hadoop.ozone.container.ozoneimpl.ContainerController; +import org.apache.hadoop.ozone.protocol.commands.ReconcileContainerCommand; import org.apache.hadoop.ozone.protocol.commands.ReconstructECContainersCommand; import org.apache.hadoop.ozone.protocol.commands.ReplicateContainerCommand; import org.apache.ozone.test.GenericTestUtils; @@ -93,6 +97,7 @@ import static org.mockito.Mockito.any; import static org.mockito.Mockito.anyList; import static org.mockito.Mockito.anyLong; +import static org.mockito.Mockito.doThrow; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -125,6 +130,8 @@ public class TestReplicationSupervisor { private StateContext context; private TestClock clock; private DatanodeDetails datanode; + private DNContainerOperationClient mockClient; + private ContainerController mockController; @BeforeEach public void setUp() throws Exception { @@ -137,6 +144,8 @@ public void setUp() throws Exception { stateMachine, ""); context.setTermOfLeaderSCM(CURRENT_TERM); datanode = MockDatanodeDetails.randomDatanodeDetails(); + mockClient = mock(DNContainerOperationClient.class); + mockController = mock(ContainerController.class); when(stateMachine.getDatanodeDetails()).thenReturn(datanode); } @@ -419,9 +428,11 @@ public void testMultipleReplication(ContainerLayoutVersion layout, try { //WHEN replicationSupervisor.addTask(createTask(1L)); + replicationSupervisor.addTask(createReconciliationTask(1L)); ecReconstructionSupervisor.addTask(createECTaskWithCoordinator(2L)); replicationSupervisor.addTask(createTask(1L)); replicationSupervisor.addTask(createTask(3L)); + replicationSupervisor.addTask(createReconciliationTask(3L)); ecReconstructionSupervisor.addTask(createECTaskWithCoordinator(4L)); SimpleContainerDownloader moc = mock(SimpleContainerDownloader.class); @@ -442,61 +453,75 @@ public void testMultipleReplication(ContainerLayoutVersion layout, ReplicateContainerCommand cmd1 = createCommand(6L); cmd1.setDeadline(clock.millis() + 10000); - ReplicationTask task1 = new ReplicationTask(cmd1, replicatorRef.get()); + ReplicationTask replicationTask = new ReplicationTask(cmd1, replicatorRef.get()); + ReconcileContainerTask reconciliationTask = createReconciliationTask(6L); clock.fastForward(15000); - replicationSupervisor.addTask(task1); + replicationSupervisor.addTask(replicationTask); + replicationSupervisor.addTask(reconciliationTask); + doThrow(IOException.class).when(mockController).reconcileContainer(any(), anyLong(), any()); + replicationSupervisor.addTask(createReconciliationTask(7L)); ReconstructECContainersCommand cmd2 = createReconstructionCmd(7L); cmd2.setDeadline(clock.millis() + 10000); - ECReconstructionCoordinatorTask task2 = new ECReconstructionCoordinatorTask( + ECReconstructionCoordinatorTask ecTask = new ECReconstructionCoordinatorTask( ecReplicatorRef.get(), new ECReconstructionCommandInfo(cmd2)); clock.fastForward(15000); - ecReconstructionSupervisor.addTask(task2); + ecReconstructionSupervisor.addTask(ecTask); ecReconstructionSupervisor.addTask(createECTask(8L)); ecReconstructionSupervisor.addTask(createECTask(9L)); //THEN - assertEquals(2, replicationSupervisor.getReplicationSuccessCount()); + assertEquals(4, replicationSupervisor.getReplicationSuccessCount()); assertEquals(2, replicationSupervisor.getReplicationSuccessCount( - task1.getMetricName())); - assertEquals(1, replicationSupervisor.getReplicationFailureCount()); + replicationTask.getMetricName())); + assertEquals(2, replicationSupervisor.getReplicationSuccessCount( + reconciliationTask.getMetricName())); + assertEquals(2, replicationSupervisor.getReplicationFailureCount()); + assertEquals(1, replicationSupervisor.getReplicationFailureCount( + replicationTask.getMetricName())); assertEquals(1, replicationSupervisor.getReplicationFailureCount( - task1.getMetricName())); + reconciliationTask.getMetricName())); assertEquals(1, replicationSupervisor.getReplicationSkippedCount()); assertEquals(1, replicationSupervisor.getReplicationSkippedCount( - task1.getMetricName())); - assertEquals(1, replicationSupervisor.getReplicationTimeoutCount()); + replicationTask.getMetricName())); + assertEquals(2, replicationSupervisor.getReplicationTimeoutCount()); + assertEquals(1, replicationSupervisor.getReplicationTimeoutCount( + replicationTask.getMetricName())); assertEquals(1, replicationSupervisor.getReplicationTimeoutCount( - task1.getMetricName())); - assertEquals(5, replicationSupervisor.getReplicationRequestCount()); + reconciliationTask.getMetricName())); + assertEquals(9, replicationSupervisor.getReplicationRequestCount()); assertEquals(5, replicationSupervisor.getReplicationRequestCount( - task1.getMetricName())); + replicationTask.getMetricName())); + assertEquals(4, replicationSupervisor.getReplicationRequestCount( + reconciliationTask.getMetricName())); assertEquals(0, replicationSupervisor.getReplicationRequestCount( - task2.getMetricName())); + ecTask.getMetricName())); assertEquals(2, ecReconstructionSupervisor.getReplicationSuccessCount()); assertEquals(2, ecReconstructionSupervisor.getReplicationSuccessCount( - task2.getMetricName())); + ecTask.getMetricName())); assertEquals(1, ecReconstructionSupervisor.getReplicationTimeoutCount()); assertEquals(1, ecReconstructionSupervisor.getReplicationTimeoutCount( - task2.getMetricName())); + ecTask.getMetricName())); assertEquals(2, ecReconstructionSupervisor.getReplicationFailureCount()); assertEquals(2, ecReconstructionSupervisor.getReplicationFailureCount( - task2.getMetricName())); + ecTask.getMetricName())); assertEquals(5, ecReconstructionSupervisor.getReplicationRequestCount()); assertEquals(5, ecReconstructionSupervisor.getReplicationRequestCount( - task2.getMetricName())); + ecTask.getMetricName())); assertEquals(0, ecReconstructionSupervisor.getReplicationRequestCount( - task1.getMetricName())); + replicationTask.getMetricName())); assertTrue(replicationSupervisor.getReplicationRequestTotalTime( - task1.getMetricName()) > 0); + replicationTask.getMetricName()) > 0); + assertTrue(replicationSupervisor.getReplicationRequestTotalTime( + reconciliationTask.getMetricName()) > 0); assertTrue(ecReconstructionSupervisor.getReplicationRequestTotalTime( - task2.getMetricName()) > 0); + ecTask.getMetricName()) > 0); assertTrue(replicationSupervisor.getReplicationRequestAvgTime( - task1.getMetricName()) > 0); + replicationTask.getMetricName()) > 0); assertTrue(ecReconstructionSupervisor.getReplicationRequestAvgTime( - task2.getMetricName()) > 0); + ecTask.getMetricName()) > 0); MetricsCollectorImpl replicationMetricsCollector = new MetricsCollectorImpl(); replicationMetrics.getMetrics(replicationMetricsCollector, true); @@ -691,6 +716,15 @@ private ReplicationTask createTask(long containerId) { return new ReplicationTask(cmd, replicatorRef.get()); } + private ReconcileContainerTask createReconciliationTask(long containerId) { + ReconcileContainerCommand reconcileContainerCommand = + new ReconcileContainerCommand(containerId, Collections.singleton(datanode)); + reconcileContainerCommand.setTerm(CURRENT_TERM); + reconcileContainerCommand.setDeadline(clock.millis() + 10000); + return new ReconcileContainerTask(mockController, mockClient, + reconcileContainerCommand); + } + private ECReconstructionCoordinatorTask createECTask(long containerId) { return new ECReconstructionCoordinatorTask(null, createReconstructionCmdInfo(containerId)); From 8649992c686bc95963439ecd6514109e857bd900 Mon Sep 17 00:00:00 2001 From: Aswin Shakil Balasubramanian Date: Wed, 22 Jan 2025 14:06:52 -0800 Subject: [PATCH 2/4] HDDS-11345. Add ReconcileContainerCommandHandler metrics and address review comments. --- .../ReconcileContainerCommandHandler.java | 34 +++--- .../common/utils/ContainerLogger.java | 9 ++ .../container/keyvalue/KeyValueHandler.java | 5 + .../TestReconcileContainerCommandHandler.java | 18 ++- .../TestReplicationSupervisor.java | 110 +++++++++++------- 5 files changed, 121 insertions(+), 55 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/ReconcileContainerCommandHandler.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/ReconcileContainerCommandHandler.java index 99185a7e10b3..8a290d674422 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/ReconcileContainerCommandHandler.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/ReconcileContainerCommandHandler.java @@ -28,28 +28,28 @@ import org.apache.hadoop.ozone.protocol.commands.ReconcileContainerCommand; import org.apache.hadoop.ozone.protocol.commands.SCMCommand; -import java.util.concurrent.atomic.AtomicLong; - /** * Handles commands from SCM to reconcile a container replica on this datanode with the replicas on its peers. */ public class ReconcileContainerCommandHandler implements CommandHandler { private final ReplicationSupervisor supervisor; - private final AtomicLong invocationCount; private final DNContainerOperationClient dnClient; + private String metricsName; public ReconcileContainerCommandHandler(ReplicationSupervisor supervisor, DNContainerOperationClient dnClient) { this.supervisor = supervisor; this.dnClient = dnClient; - this.invocationCount = new AtomicLong(0); } @Override public void handle(SCMCommand command, OzoneContainer container, StateContext context, SCMConnectionManager connectionManager) { - invocationCount.incrementAndGet(); ReconcileContainerCommand reconcileCommand = (ReconcileContainerCommand) command; - supervisor.addTask(new ReconcileContainerTask(container.getController(), dnClient, reconcileCommand)); + ReconcileContainerTask task = new ReconcileContainerTask(container.getController(), dnClient, reconcileCommand); + if (metricsName == null) { + metricsName = task.getMetricName(); + } + supervisor.addTask(task); } @Override @@ -58,24 +58,30 @@ public SCMCommandProto.Type getCommandType() { } @Override - public int getInvocationCount() { - return (int)invocationCount.get(); + public int getQueuedCount() { + return this.metricsName == null ? 0 : (int) this.supervisor + .getReplicationQueuedCount(metricsName); } - // Uses ReplicationSupervisor for these metrics. + @Override + public int getInvocationCount() { + return this.metricsName == null ? 0 : (int) this.supervisor + .getReplicationRequestCount(metricsName); + } @Override public long getAverageRunTime() { - return 0; + return this.metricsName == null ? 0 : (int) this.supervisor + .getReplicationRequestAvgTime(metricsName); } @Override public long getTotalRunTime() { - return 0; + return this.metricsName == null ? 0 : this.supervisor + .getReplicationRequestTotalTime(metricsName); } - @Override - public int getQueuedCount() { - return 0; + public String getMetricsName() { + return this.metricsName; } } diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/utils/ContainerLogger.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/utils/ContainerLogger.java index 92940b01940c..6841ab120495 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/utils/ContainerLogger.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/utils/ContainerLogger.java @@ -146,6 +146,15 @@ public static void logRecovered(ContainerData containerData) { LOG.info(getMessage(containerData)); } + /** + * Logged when a container is reconciled. + * + * @param containerData The container that was reconciled on this datanode. + */ + public static void logContainerReconciled(ContainerData containerData) { + LOG.info(getMessage(containerData, "Container reconciled")); + } + private static String getMessage(ContainerData containerData, String message) { return String.join(FIELD_SEPARATOR, getMessage(containerData), message); diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java index 06987f635619..77ec84d5b4ef 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java @@ -1449,6 +1449,10 @@ public void reconcileContainer(DNContainerOperationClient dnClient, Container Set peers) throws IOException { // TODO Just a deterministic placeholder hash for testing until actual implementation is finished. ContainerData data = container.getContainerData(); + if (container.getContainerState() == State.DELETED) { + throw new IOException("Container #" + container.getContainerData().getContainerID() + + " is deleted, cannot reconcile"); + } long id = data.getContainerID(); ByteBuffer byteBuffer = ByteBuffer.allocate(Long.BYTES) .putLong(id) @@ -1459,6 +1463,7 @@ public void reconcileContainer(DNContainerOperationClient dnClient, Container long dataChecksum = checksumImpl.getValue(); LOG.info("Generated data checksum of container {} for testing: {}", id, dataChecksum); data.setDataChecksum(dataChecksum); + ContainerLogger.logContainerReconciled(container.getContainerData()); sendICR(container); } diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java index f27ed097d2f7..9785e07e95de 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java @@ -74,6 +74,7 @@ public class TestReconcileContainerCommandHandler { private OzoneContainer ozoneContainer; private StateContext context; private ReconcileContainerCommandHandler subject; + private ReplicationSupervisor mockSupervisor; public void init(ContainerLayoutVersion layout, IncrementalReportSender icrSender) throws Exception { @@ -81,7 +82,8 @@ public void init(ContainerLayoutVersion layout, IncrementalReportSender { ((ReconcileContainerTask)invocation.getArguments()[0]).runTask(); return null; @@ -90,6 +92,7 @@ public void init(ContainerLayoutVersion layout, IncrementalReportSender { }); + init(layout, c -> { + }); assertEquals(0, subject.getInvocationCount()); @@ -150,7 +154,17 @@ public void testReconcileContainerCommandMetrics(ContainerLayoutVersion layout) ReconcileContainerCommand cmd = new ReconcileContainerCommand(id, Collections.emptySet()); subject.handle(cmd, ozoneContainer, context, null); } + + when(mockSupervisor.getReplicationRequestCount(subject.getMetricsName())).thenReturn(3L); + when(mockSupervisor.getReplicationRequestTotalTime(subject.getMetricsName())).thenReturn(10L); + when(mockSupervisor.getReplicationRequestAvgTime(subject.getMetricsName())).thenReturn(3L); + when(mockSupervisor.getReplicationQueuedCount(subject.getMetricsName())).thenReturn(1L); + + assertEquals(subject.getMetricsName(), "ContainerReconciliations"); assertEquals(NUM_CONTAINERS, subject.getInvocationCount()); + assertEquals(subject.getQueuedCount(), 1); + assertEquals(subject.getTotalRunTime(), 10); + assertEquals(subject.getAverageRunTime(), 3); } private void verifyAllContainerReports(Map reportsSent) { diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestReplicationSupervisor.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestReplicationSupervisor.java index 4c476f33ea48..798c2e658a9b 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestReplicationSupervisor.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestReplicationSupervisor.java @@ -428,11 +428,9 @@ public void testMultipleReplication(ContainerLayoutVersion layout, try { //WHEN replicationSupervisor.addTask(createTask(1L)); - replicationSupervisor.addTask(createReconciliationTask(1L)); ecReconstructionSupervisor.addTask(createECTaskWithCoordinator(2L)); replicationSupervisor.addTask(createTask(1L)); replicationSupervisor.addTask(createTask(3L)); - replicationSupervisor.addTask(createReconciliationTask(3L)); ecReconstructionSupervisor.addTask(createECTaskWithCoordinator(4L)); SimpleContainerDownloader moc = mock(SimpleContainerDownloader.class); @@ -453,75 +451,61 @@ public void testMultipleReplication(ContainerLayoutVersion layout, ReplicateContainerCommand cmd1 = createCommand(6L); cmd1.setDeadline(clock.millis() + 10000); - ReplicationTask replicationTask = new ReplicationTask(cmd1, replicatorRef.get()); - ReconcileContainerTask reconciliationTask = createReconciliationTask(6L); + ReplicationTask task1 = new ReplicationTask(cmd1, replicatorRef.get()); clock.fastForward(15000); - replicationSupervisor.addTask(replicationTask); - replicationSupervisor.addTask(reconciliationTask); - doThrow(IOException.class).when(mockController).reconcileContainer(any(), anyLong(), any()); - replicationSupervisor.addTask(createReconciliationTask(7L)); + replicationSupervisor.addTask(task1); ReconstructECContainersCommand cmd2 = createReconstructionCmd(7L); cmd2.setDeadline(clock.millis() + 10000); - ECReconstructionCoordinatorTask ecTask = new ECReconstructionCoordinatorTask( + ECReconstructionCoordinatorTask task2 = new ECReconstructionCoordinatorTask( ecReplicatorRef.get(), new ECReconstructionCommandInfo(cmd2)); clock.fastForward(15000); - ecReconstructionSupervisor.addTask(ecTask); + ecReconstructionSupervisor.addTask(task2); ecReconstructionSupervisor.addTask(createECTask(8L)); ecReconstructionSupervisor.addTask(createECTask(9L)); //THEN - assertEquals(4, replicationSupervisor.getReplicationSuccessCount()); - assertEquals(2, replicationSupervisor.getReplicationSuccessCount( - replicationTask.getMetricName())); + assertEquals(2, replicationSupervisor.getReplicationSuccessCount()); assertEquals(2, replicationSupervisor.getReplicationSuccessCount( - reconciliationTask.getMetricName())); - assertEquals(2, replicationSupervisor.getReplicationFailureCount()); - assertEquals(1, replicationSupervisor.getReplicationFailureCount( - replicationTask.getMetricName())); + task1.getMetricName())); + assertEquals(1, replicationSupervisor.getReplicationFailureCount()); assertEquals(1, replicationSupervisor.getReplicationFailureCount( - reconciliationTask.getMetricName())); + task1.getMetricName())); assertEquals(1, replicationSupervisor.getReplicationSkippedCount()); assertEquals(1, replicationSupervisor.getReplicationSkippedCount( - replicationTask.getMetricName())); - assertEquals(2, replicationSupervisor.getReplicationTimeoutCount()); - assertEquals(1, replicationSupervisor.getReplicationTimeoutCount( - replicationTask.getMetricName())); + task1.getMetricName())); + assertEquals(1, replicationSupervisor.getReplicationTimeoutCount()); assertEquals(1, replicationSupervisor.getReplicationTimeoutCount( - reconciliationTask.getMetricName())); - assertEquals(9, replicationSupervisor.getReplicationRequestCount()); + task1.getMetricName())); + assertEquals(5, replicationSupervisor.getReplicationRequestCount()); assertEquals(5, replicationSupervisor.getReplicationRequestCount( - replicationTask.getMetricName())); - assertEquals(4, replicationSupervisor.getReplicationRequestCount( - reconciliationTask.getMetricName())); + task1.getMetricName())); assertEquals(0, replicationSupervisor.getReplicationRequestCount( - ecTask.getMetricName())); + task2.getMetricName())); assertEquals(2, ecReconstructionSupervisor.getReplicationSuccessCount()); assertEquals(2, ecReconstructionSupervisor.getReplicationSuccessCount( - ecTask.getMetricName())); + task2.getMetricName())); assertEquals(1, ecReconstructionSupervisor.getReplicationTimeoutCount()); assertEquals(1, ecReconstructionSupervisor.getReplicationTimeoutCount( - ecTask.getMetricName())); + task2.getMetricName())); assertEquals(2, ecReconstructionSupervisor.getReplicationFailureCount()); assertEquals(2, ecReconstructionSupervisor.getReplicationFailureCount( - ecTask.getMetricName())); + task2.getMetricName())); assertEquals(5, ecReconstructionSupervisor.getReplicationRequestCount()); assertEquals(5, ecReconstructionSupervisor.getReplicationRequestCount( - ecTask.getMetricName())); + task2.getMetricName())); assertEquals(0, ecReconstructionSupervisor.getReplicationRequestCount( - replicationTask.getMetricName())); + task1.getMetricName())); assertTrue(replicationSupervisor.getReplicationRequestTotalTime( - replicationTask.getMetricName()) > 0); - assertTrue(replicationSupervisor.getReplicationRequestTotalTime( - reconciliationTask.getMetricName()) > 0); + task1.getMetricName()) > 0); assertTrue(ecReconstructionSupervisor.getReplicationRequestTotalTime( - ecTask.getMetricName()) > 0); + task2.getMetricName()) > 0); assertTrue(replicationSupervisor.getReplicationRequestAvgTime( - replicationTask.getMetricName()) > 0); + task1.getMetricName()) > 0); assertTrue(ecReconstructionSupervisor.getReplicationRequestAvgTime( - ecTask.getMetricName()) > 0); + task2.getMetricName()) > 0); MetricsCollectorImpl replicationMetricsCollector = new MetricsCollectorImpl(); replicationMetrics.getMetrics(replicationMetricsCollector, true); @@ -538,6 +522,54 @@ public void testMultipleReplication(ContainerLayoutVersion layout, } } + @ContainerLayoutTestInfo.ContainerTest + public void testReconciliationTaskMetrics(ContainerLayoutVersion layout) throws IOException { + this.layoutVersion = layout; + // GIVEN + ReplicationSupervisor replicationSupervisor = + supervisorWithReplicator(FakeReplicator::new); + ReplicationSupervisorMetrics replicationMetrics = + ReplicationSupervisorMetrics.create(replicationSupervisor); + + try { + //WHEN + replicationSupervisor.addTask(createReconciliationTask(1L)); + replicationSupervisor.addTask(createReconciliationTask(2L)); + + ReconcileContainerTask reconciliationTask = createReconciliationTask(6L); + clock.fastForward(15000); + replicationSupervisor.addTask(reconciliationTask); + doThrow(IOException.class).when(mockController).reconcileContainer(any(), anyLong(), any()); + replicationSupervisor.addTask(createReconciliationTask(7L)); + + //THEN + assertEquals(2, replicationSupervisor.getReplicationSuccessCount()); + + assertEquals(2, replicationSupervisor.getReplicationSuccessCount( + reconciliationTask.getMetricName())); + assertEquals(1, replicationSupervisor.getReplicationFailureCount()); + assertEquals(1, replicationSupervisor.getReplicationFailureCount( + reconciliationTask.getMetricName())); + assertEquals(1, replicationSupervisor.getReplicationTimeoutCount()); + assertEquals(1, replicationSupervisor.getReplicationTimeoutCount( + reconciliationTask.getMetricName())); + assertEquals(4, replicationSupervisor.getReplicationRequestCount()); + assertEquals(4, replicationSupervisor.getReplicationRequestCount( + reconciliationTask.getMetricName())); + + + assertTrue(replicationSupervisor.getReplicationRequestTotalTime( + reconciliationTask.getMetricName()) > 0); + + MetricsCollectorImpl replicationMetricsCollector = new MetricsCollectorImpl(); + replicationMetrics.getMetrics(replicationMetricsCollector, true); + assertEquals(1, replicationMetricsCollector.getRecords().size()); + } finally { + replicationMetrics.unRegister(); + replicationSupervisor.stop(); + } + } + @ContainerLayoutTestInfo.ContainerTest public void testPriorityOrdering(ContainerLayoutVersion layout) throws InterruptedException { From 6fe3bdc4517dae57657acd483857d4640e54babf Mon Sep 17 00:00:00 2001 From: Aswin Shakil Balasubramanian Date: Fri, 24 Jan 2025 11:33:09 -0800 Subject: [PATCH 3/4] HDDS-11345. Remove container log. --- .../ozone/container/common/utils/ContainerLogger.java | 9 --------- .../hadoop/ozone/container/keyvalue/KeyValueHandler.java | 5 ----- .../container/replication/ReplicationSupervisor.java | 4 ++-- .../TestReconcileContainerCommandHandler.java | 3 +-- .../container/replication/TestReplicationSupervisor.java | 2 ++ 5 files changed, 5 insertions(+), 18 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/utils/ContainerLogger.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/utils/ContainerLogger.java index 6841ab120495..92940b01940c 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/utils/ContainerLogger.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/utils/ContainerLogger.java @@ -146,15 +146,6 @@ public static void logRecovered(ContainerData containerData) { LOG.info(getMessage(containerData)); } - /** - * Logged when a container is reconciled. - * - * @param containerData The container that was reconciled on this datanode. - */ - public static void logContainerReconciled(ContainerData containerData) { - LOG.info(getMessage(containerData, "Container reconciled")); - } - private static String getMessage(ContainerData containerData, String message) { return String.join(FIELD_SEPARATOR, getMessage(containerData), message); diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java index 77ec84d5b4ef..06987f635619 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java @@ -1449,10 +1449,6 @@ public void reconcileContainer(DNContainerOperationClient dnClient, Container Set peers) throws IOException { // TODO Just a deterministic placeholder hash for testing until actual implementation is finished. ContainerData data = container.getContainerData(); - if (container.getContainerState() == State.DELETED) { - throw new IOException("Container #" + container.getContainerData().getContainerID() + - " is deleted, cannot reconcile"); - } long id = data.getContainerID(); ByteBuffer byteBuffer = ByteBuffer.allocate(Long.BYTES) .putLong(id) @@ -1463,7 +1459,6 @@ public void reconcileContainer(DNContainerOperationClient dnClient, Container long dataChecksum = checksumImpl.getValue(); LOG.info("Generated data checksum of container {} for testing: {}", id, dataChecksum); data.setDataChecksum(dataChecksum); - ContainerLogger.logContainerReconciled(container.getContainerData()); sendICR(container); } diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/ReplicationSupervisor.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/ReplicationSupervisor.java index 9513cac84efe..9a1621be32ff 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/ReplicationSupervisor.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/ReplicationSupervisor.java @@ -537,11 +537,11 @@ public long getReplicationQueuedCount(String metricsName) { public long getReplicationRequestAvgTime(String metricsName) { MutableRate rate = opsLatencyMs.get(metricsName); - return rate != null ? (long) rate.lastStat().mean() : 0; + return rate != null ? (long) Math.ceil(rate.lastStat().mean()) : 0; } public long getReplicationRequestTotalTime(String metricsName) { MutableRate rate = opsLatencyMs.get(metricsName); - return rate != null ? (long) rate.lastStat().total() : 0; + return rate != null ? (long) Math.ceil(rate.lastStat().total()) : 0; } } diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java index 9785e07e95de..c94dfc9cc497 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java @@ -144,8 +144,7 @@ public void testReconcileContainerCommandReports(ContainerLayoutVersion layout) */ @ContainerLayoutTestInfo.ContainerTest public void testReconcileContainerCommandMetrics(ContainerLayoutVersion layout) throws Exception { - init(layout, c -> { - }); + init(layout, c -> { }); assertEquals(0, subject.getInvocationCount()); diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestReplicationSupervisor.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestReplicationSupervisor.java index 798c2e658a9b..bfae06b56a69 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestReplicationSupervisor.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestReplicationSupervisor.java @@ -560,6 +560,8 @@ public void testReconciliationTaskMetrics(ContainerLayoutVersion layout) throws assertTrue(replicationSupervisor.getReplicationRequestTotalTime( reconciliationTask.getMetricName()) > 0); + assertTrue(replicationSupervisor.getReplicationRequestAvgTime( + reconciliationTask.getMetricName()) > 0); MetricsCollectorImpl replicationMetricsCollector = new MetricsCollectorImpl(); replicationMetrics.getMetrics(replicationMetricsCollector, true); From 7b2b1e2231697ad43f845ae260a8c93a19a3cd13 Mon Sep 17 00:00:00 2001 From: Aswin Shakil Balasubramanian Date: Fri, 24 Jan 2025 11:36:32 -0800 Subject: [PATCH 4/4] HDDS-11345. Remove whitespace. --- .../commandhandler/TestReconcileContainerCommandHandler.java | 3 --- 1 file changed, 3 deletions(-) diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java index c94dfc9cc497..fbc0f9714a59 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestReconcileContainerCommandHandler.java @@ -83,7 +83,6 @@ public void init(ContainerLayoutVersion layout, IncrementalReportSender { ((ReconcileContainerTask)invocation.getArguments()[0]).runTask(); return null; @@ -92,7 +91,6 @@ public void init(ContainerLayoutVersion layout, IncrementalReportSender