From 2b5d7b9cbf5e8611d5597aed2e43921bb27caa5e Mon Sep 17 00:00:00 2001 From: Siddhant Sangwan Date: Thu, 15 May 2025 14:42:23 +0530 Subject: [PATCH 1/5] trigger heartbeat immediately + throttling logic --- .../container/common/impl/HddsDispatcher.java | 54 +++++++++++++++++-- 1 file changed, 50 insertions(+), 4 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/HddsDispatcher.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/HddsDispatcher.java index c4e03e453349..ed3e0170dc66 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/HddsDispatcher.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/HddsDispatcher.java @@ -32,6 +32,7 @@ import java.util.Set; import java.util.TreeMap; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; import org.apache.hadoop.hdds.HddsConfigKeys; import org.apache.hadoop.hdds.HddsUtils; import org.apache.hadoop.hdds.client.BlockID; @@ -44,6 +45,7 @@ import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerType; import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.Result; import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.Type; +import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos; import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerAction; import org.apache.hadoop.hdds.scm.container.common.helpers.ContainerNotOpenException; import org.apache.hadoop.hdds.scm.container.common.helpers.InvalidContainerStateException; @@ -110,6 +112,8 @@ public class HddsDispatcher implements ContainerDispatcher, Auditor { private ContainerMetrics metrics; private final TokenVerifier tokenVerifier; private long slowOpThresholdNs; + private AtomicLong fullVolumeLastHeartbeatTriggerMs; + private long fullVolumeHeartbeatThrottleIntervalMs; /** * Constructs an OzoneContainer that receives calls from @@ -130,6 +134,10 @@ public HddsDispatcher(ConfigurationSource config, ContainerSet contSet, this.tokenVerifier = tokenVerifier != null ? tokenVerifier : new NoopTokenVerifier(); this.slowOpThresholdNs = getSlowOpThresholdMs(conf) * 1000000; + fullVolumeLastHeartbeatTriggerMs = new AtomicLong(-1); + long heartbeatInterval = + config.getTimeDuration("hdds.heartbeat.interval", 30000, TimeUnit.MILLISECONDS); + fullVolumeHeartbeatThrottleIntervalMs = Math.min(heartbeatInterval, 30000); protocolMetrics = new ProtocolMessageMetrics<>( @@ -335,7 +343,15 @@ && getMissingContainerSet().contains(containerID)) { // Small performance optimization. We check if the operation is of type // write before trying to send CloseContainerAction. if (!HddsUtils.isReadOnly(msg)) { - sendCloseContainerActionIfNeeded(container); + boolean isFull = isVolumeFull(container); + sendCloseContainerActionIfNeeded(container, isFull); + if (isFull) { + try { + handleFullVolume(container.getContainerData().getVolume()); + } catch (StorageContainerException e) { + ContainerUtils.logAndReturnError(LOG, e, msg); + } + } } Handler handler = getHandler(containerType); if (handler == null) { @@ -403,7 +419,7 @@ && getMissingContainerSet().contains(containerID)) { // in any case, the in memory state of the container should be unhealthy Preconditions.checkArgument( container.getContainerData().getState() == State.UNHEALTHY); - sendCloseContainerActionIfNeeded(container); + sendCloseContainerActionIfNeeded(container, isVolumeFull(container)); } if (cmdType == Type.CreateContainer && result == Result.SUCCESS && dispatcherContext != null) { @@ -435,6 +451,36 @@ && getMissingContainerSet().contains(containerID)) { } } + /** + * If the volume is full, we need to inform SCM about the latest volume usage stats and send the close container + * action for this container immediately. {@link HddsDispatcher#sendCloseContainerActionIfNeeded(Container, boolean)} + * just adds the action to the heartbeat. Here, we get the latest storage statistics for this node, add them to the + * heartbeat, and then send the heartbeat (including container close action) immediately. + * @param volume the volume being written to + */ + private void handleFullVolume(HddsVolume volume) throws StorageContainerException { + long current = System.currentTimeMillis(); + long last = fullVolumeLastHeartbeatTriggerMs.get(); + boolean isFirstTrigger = last == -1; + boolean allowedToTrigger = (current - fullVolumeHeartbeatThrottleIntervalMs) >= last; + if (isFirstTrigger || allowedToTrigger) { + if (fullVolumeLastHeartbeatTriggerMs.compareAndSet(last, current)) { + StorageContainerDatanodeProtocolProtos.NodeReportProto nodeReport; + try { + nodeReport = context.getParent().getContainer().getNodeReport(); + context.refreshFullReport(nodeReport); + context.getParent().triggerHeartbeat(); + } catch (IOException e) { + String volumePath = volume.getVolumeRootDir(); + StorageLocationReport volumeReport = volume.getReport(); + String error = String.format( + "Failed to create node report when handling full volume %s. Volume Report: %s", volumePath, volumeReport); + throw new StorageContainerException(error, e, Result.IO_EXCEPTION); + } + } + } + } + private long getSlowOpThresholdMs(ConfigurationSource config) { return config.getTimeDuration( HddsConfigKeys.HDDS_DATANODE_SLOW_OP_WARNING_THRESHOLD_KEY, @@ -578,9 +624,9 @@ public void validateContainerCommand( * marked unhealthy we send Close ContainerAction to SCM. * @param container current state of container */ - private void sendCloseContainerActionIfNeeded(Container container) { + private void sendCloseContainerActionIfNeeded(Container container, boolean isVolumeFull) { // We have to find a more efficient way to close a container. - boolean isSpaceFull = isContainerFull(container) || isVolumeFull(container); + boolean isSpaceFull = isContainerFull(container) || isVolumeFull; boolean shouldClose = isSpaceFull || isContainerUnhealthy(container); if (shouldClose) { ContainerData containerData = container.getContainerData(); From 865689ff4fad763420cb158219534fa0153b7df2 Mon Sep 17 00:00:00 2001 From: Siddhant Sangwan Date: Tue, 20 May 2025 16:16:56 +0530 Subject: [PATCH 2/5] add tests --- .../container/common/impl/HddsDispatcher.java | 1 + .../common/impl/TestHddsDispatcher.java | 26 +++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/HddsDispatcher.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/HddsDispatcher.java index ed3e0170dc66..ef89837cee5e 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/HddsDispatcher.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/HddsDispatcher.java @@ -470,6 +470,7 @@ private void handleFullVolume(HddsVolume volume) throws StorageContainerExceptio nodeReport = context.getParent().getContainer().getNodeReport(); context.refreshFullReport(nodeReport); context.getParent().triggerHeartbeat(); + LOG.info("Triggering heartbeat for full volume {}, with node report: {}.", volume, nodeReport); } catch (IOException e) { String volumePath = volume.getVolumeRootDir(); StorageLocationReport volumeReport = volume.getReport(); diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/impl/TestHddsDispatcher.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/impl/TestHddsDispatcher.java index 6afcadb809b9..087a955c6f83 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/impl/TestHddsDispatcher.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/impl/TestHddsDispatcher.java @@ -28,6 +28,7 @@ import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.ArgumentMatchers.eq; import static org.mockito.Mockito.any; import static org.mockito.Mockito.doReturn; import static org.mockito.Mockito.mock; @@ -63,6 +64,7 @@ import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerCommandResponseProto; import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerType; import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.WriteChunkRequestProto; +import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos; import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerAction; import org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException; import org.apache.hadoop.hdds.security.token.TokenVerifier; @@ -79,6 +81,7 @@ import org.apache.hadoop.ozone.container.common.interfaces.VolumeChoosingPolicy; import org.apache.hadoop.ozone.container.common.report.IncrementalReportSender; import org.apache.hadoop.ozone.container.common.statemachine.DatanodeConfiguration; +import org.apache.hadoop.ozone.container.common.statemachine.DatanodeStateMachine; import org.apache.hadoop.ozone.container.common.statemachine.StateContext; import org.apache.hadoop.ozone.container.common.transport.server.ratis.DispatcherContext; import org.apache.hadoop.ozone.container.common.transport.server.ratis.DispatcherContext.Op; @@ -93,6 +96,7 @@ import org.apache.hadoop.ozone.container.keyvalue.ContainerLayoutTestInfo; import org.apache.hadoop.ozone.container.keyvalue.KeyValueContainer; import org.apache.hadoop.ozone.container.keyvalue.KeyValueContainerData; +import org.apache.hadoop.ozone.container.ozoneimpl.OzoneContainer; import org.apache.hadoop.security.token.Token; import org.apache.ozone.test.GenericTestUtils.LogCapturer; import org.apache.ratis.thirdparty.com.google.protobuf.ByteString; @@ -177,6 +181,9 @@ public void testContainerCloseActionWhenFull( verify(context, times(1)) .addContainerActionIfAbsent(any(ContainerAction.class)); + // since the volume is not full, context.refreshFullReport(NodeReportProto) should not be called + verify(context, times(0)).refreshFullReport(any()); + } finally { volumeSet.shutdown(); ContainerMetrics.remove(); @@ -276,6 +283,16 @@ public void testContainerCloseActionWhenVolumeFull( UUID scmId = UUID.randomUUID(); ContainerSet containerSet = newContainerSet(); StateContext context = ContainerTestUtils.getMockContext(dd, conf); + + // empty report object for testing that an immediate heartbeat is triggered + StorageContainerDatanodeProtocolProtos.NodeReportProto.Builder nrb + = StorageContainerDatanodeProtocolProtos. + NodeReportProto.newBuilder(); + StorageContainerDatanodeProtocolProtos.NodeReportProto reportProto = nrb.build(); + DatanodeStateMachine stateMachine = context.getParent(); + OzoneContainer ozoneContainer = mock(OzoneContainer.class); + doReturn(ozoneContainer).when(stateMachine).getContainer(); + doReturn(reportProto).when(ozoneContainer).getNodeReport(); // create a 50 byte container // available (160) > 100 (min free space) + 50 (container size) KeyValueContainerData containerData = new KeyValueContainerData(1L, @@ -308,6 +325,15 @@ public void testContainerCloseActionWhenVolumeFull( response.getResult()); verify(context, times(1)) .addContainerActionIfAbsent(any(ContainerAction.class)); + // verify that node report is refreshed and heartbeat is triggered + verify(context, times(1)).refreshFullReport(eq(reportProto)); + verify(stateMachine, times(1)).triggerHeartbeat(); + + // the volume is past the min free space boundary but this time the heartbeat should not be triggered because + // of throttling + hddsDispatcher.dispatch(getWriteChunkRequest(dd.getUuidString(), 1L, 2L), null); + verify(context, times(1)).refreshFullReport(eq(reportProto)); // was called once before + verify(stateMachine, times(1)).triggerHeartbeat(); // was called once before // try creating another container now as the volume used has crossed // threshold From 2437d5e4344afa7acc4368387a58af87ac9f74ec Mon Sep 17 00:00:00 2001 From: Siddhant Sangwan Date: Thu, 29 May 2025 13:49:36 +0530 Subject: [PATCH 3/5] use HddsServerUtil.getScmHeartbeatInterval() --- .../hadoop/ozone/container/common/impl/HddsDispatcher.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/HddsDispatcher.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/HddsDispatcher.java index ef89837cee5e..8faef9b74e7d 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/HddsDispatcher.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/HddsDispatcher.java @@ -53,6 +53,7 @@ import org.apache.hadoop.hdds.security.token.NoopTokenVerifier; import org.apache.hadoop.hdds.security.token.TokenVerifier; import org.apache.hadoop.hdds.server.OzoneProtocolMessageDispatcher; +import org.apache.hadoop.hdds.utils.HddsServerUtil; import org.apache.hadoop.hdds.utils.ProtocolMessageMetrics; import org.apache.hadoop.ozone.audit.AuditAction; import org.apache.hadoop.ozone.audit.AuditEventStatus; @@ -135,8 +136,7 @@ public HddsDispatcher(ConfigurationSource config, ContainerSet contSet, : new NoopTokenVerifier(); this.slowOpThresholdNs = getSlowOpThresholdMs(conf) * 1000000; fullVolumeLastHeartbeatTriggerMs = new AtomicLong(-1); - long heartbeatInterval = - config.getTimeDuration("hdds.heartbeat.interval", 30000, TimeUnit.MILLISECONDS); + long heartbeatInterval = HddsServerUtil.getScmHeartbeatInterval(conf); fullVolumeHeartbeatThrottleIntervalMs = Math.min(heartbeatInterval, 30000); protocolMetrics = From 4fd1252487c31ff67c6cf632a8bf7b0d4775f83b Mon Sep 17 00:00:00 2001 From: Siddhant Sangwan Date: Fri, 30 May 2025 16:05:40 +0530 Subject: [PATCH 4/5] use node report interval, log error, add logging test --- .../container/common/impl/HddsDispatcher.java | 10 ++- .../common/impl/TestHddsDispatcher.java | 90 +++++++++++++++++++ 2 files changed, 96 insertions(+), 4 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/HddsDispatcher.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/HddsDispatcher.java index 8faef9b74e7d..9e4ff07f1e7a 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/HddsDispatcher.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/HddsDispatcher.java @@ -17,6 +17,8 @@ package org.apache.hadoop.ozone.container.common.impl; +import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_NODE_REPORT_INTERVAL; +import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_NODE_REPORT_INTERVAL_DEFAULT; import static org.apache.hadoop.hdds.scm.protocolPB.ContainerCommandResponseBuilders.malformedRequest; import static org.apache.hadoop.hdds.scm.protocolPB.ContainerCommandResponseBuilders.unsupportedRequest; import static org.apache.hadoop.ozone.audit.AuditLogger.PerformanceStringBuilder; @@ -53,7 +55,6 @@ import org.apache.hadoop.hdds.security.token.NoopTokenVerifier; import org.apache.hadoop.hdds.security.token.TokenVerifier; import org.apache.hadoop.hdds.server.OzoneProtocolMessageDispatcher; -import org.apache.hadoop.hdds.utils.HddsServerUtil; import org.apache.hadoop.hdds.utils.ProtocolMessageMetrics; import org.apache.hadoop.ozone.audit.AuditAction; import org.apache.hadoop.ozone.audit.AuditEventStatus; @@ -136,8 +137,9 @@ public HddsDispatcher(ConfigurationSource config, ContainerSet contSet, : new NoopTokenVerifier(); this.slowOpThresholdNs = getSlowOpThresholdMs(conf) * 1000000; fullVolumeLastHeartbeatTriggerMs = new AtomicLong(-1); - long heartbeatInterval = HddsServerUtil.getScmHeartbeatInterval(conf); - fullVolumeHeartbeatThrottleIntervalMs = Math.min(heartbeatInterval, 30000); + long nodeReportInterval = conf.getTimeDuration(HDDS_NODE_REPORT_INTERVAL, HDDS_NODE_REPORT_INTERVAL_DEFAULT, + TimeUnit.MILLISECONDS); + fullVolumeHeartbeatThrottleIntervalMs = Math.min(nodeReportInterval, 60000); // min of interval and 1 minute protocolMetrics = new ProtocolMessageMetrics<>( @@ -349,7 +351,7 @@ && getMissingContainerSet().contains(containerID)) { try { handleFullVolume(container.getContainerData().getVolume()); } catch (StorageContainerException e) { - ContainerUtils.logAndReturnError(LOG, e, msg); + LOG.warn("Failed to handle full volume while handling request: {}", msg, e); } } } diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/impl/TestHddsDispatcher.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/impl/TestHddsDispatcher.java index 087a955c6f83..62360d793414 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/impl/TestHddsDispatcher.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/impl/TestHddsDispatcher.java @@ -31,6 +31,7 @@ import static org.mockito.ArgumentMatchers.eq; import static org.mockito.Mockito.any; import static org.mockito.Mockito.doReturn; +import static org.mockito.Mockito.doThrow; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.spy; import static org.mockito.Mockito.times; @@ -355,6 +356,95 @@ public void testContainerCloseActionWhenVolumeFull( } } + /** + * Tests that we log any exception properly along with volume and request details when handling a full volume. + */ + @ContainerLayoutTestInfo.ContainerTest + public void testExceptionHandlingWhenVolumeFull(ContainerLayoutVersion layoutVersion) throws IOException { + /* + SETTING UP FULL VOLUME SCENARIO AND MOCKS, SAME AS OTHER TESTS + */ + String testDirPath = testDir.getPath(); + OzoneConfiguration conf = new OzoneConfiguration(); + conf.setStorageSize(DatanodeConfiguration.HDDS_DATANODE_VOLUME_MIN_FREE_SPACE, + 100.0, StorageUnit.BYTES); + DatanodeDetails dd = randomDatanodeDetails(); + + HddsVolume.Builder volumeBuilder = + new HddsVolume.Builder(testDirPath).datanodeUuid(dd.getUuidString()) + .conf(conf).usageCheckFactory(MockSpaceUsageCheckFactory.NONE); + // state of cluster : available (160) > 100 ,datanode volume + // utilisation threshold not yet reached. container creates are successful. + AtomicLong usedSpace = new AtomicLong(340); + SpaceUsageSource spaceUsage = MockSpaceUsageSource.of(500, usedSpace); + + SpaceUsageCheckFactory factory = MockSpaceUsageCheckFactory.of( + spaceUsage, Duration.ZERO, inMemory(new AtomicLong(0))); + volumeBuilder.usageCheckFactory(factory); + MutableVolumeSet volumeSet = mock(MutableVolumeSet.class); + when(volumeSet.getVolumesList()) + .thenReturn(Collections.singletonList(volumeBuilder.build())); + try { + UUID scmId = UUID.randomUUID(); + ContainerSet containerSet = newContainerSet(); + StateContext context = ContainerTestUtils.getMockContext(dd, conf); + + /* + MOCK TO THROW AN EXCEPTION WHEN getNodeReport() IS CALLED + */ + DatanodeStateMachine stateMachine = context.getParent(); + OzoneContainer ozoneContainer = mock(OzoneContainer.class); + doReturn(ozoneContainer).when(stateMachine).getContainer(); + doThrow(new IOException()).when(ozoneContainer).getNodeReport(); + // create a 50 byte container + // available (160) > 100 (min free space) + 50 (container size) + KeyValueContainerData containerData = new KeyValueContainerData(1L, + layoutVersion, + 50, UUID.randomUUID().toString(), + dd.getUuidString()); + Container container = new KeyValueContainer(containerData, conf); + StorageVolumeUtil.getHddsVolumesList(volumeSet.getVolumesList()) + .forEach(hddsVolume -> hddsVolume.setDbParentDir(tempDir.toFile())); + container.create(volumeSet, new RoundRobinVolumeChoosingPolicy(), + scmId.toString()); + containerSet.addContainer(container); + ContainerMetrics metrics = ContainerMetrics.create(conf); + Map handlers = Maps.newHashMap(); + for (ContainerType containerType : ContainerType.values()) { + handlers.put(containerType, + Handler.getHandlerForContainerType(containerType, conf, + context.getParent().getDatanodeDetails().getUuidString(), + containerSet, volumeSet, volumeChoosingPolicy, metrics, NO_OP_ICR_SENDER)); + } + HddsDispatcher hddsDispatcher = new HddsDispatcher( + conf, containerSet, volumeSet, handlers, context, metrics, null); + hddsDispatcher.setClusterId(scmId.toString()); + /* + CAPTURE LOGS TO ASSERT THAT THE EXCEPTION WAS LOGGED PROPERLY + */ + LogCapturer logCapturer = LogCapturer.captureLogs(HddsDispatcher.LOG); + containerData.getVolume().getVolumeUsage() + .ifPresent(usage -> usage.incrementUsedSpace(50)); + usedSpace.addAndGet(50); + ContainerCommandResponseProto response = hddsDispatcher + .dispatch(getWriteChunkRequest(dd.getUuidString(), 1L, 1L), null); + logCapturer.stopCapturing(); + + assertEquals(ContainerProtos.Result.SUCCESS, + response.getResult()); + verify(context, times(1)) + .addContainerActionIfAbsent(any(ContainerAction.class)); + /* + getNodeReport() SHOULD BE CALLED, AND LOG CAPTURE SHOULD CONTAIN THE EXCEPTION + */ + verify(ozoneContainer, times(1)).getNodeReport(); + assertTrue(logCapturer.getOutput().contains("Failed to handle full volume while handling request")); + } finally { + volumeSet.shutdown(); + ContainerMetrics.remove(); + } + } + @Test public void testCreateContainerWithWriteChunk() throws IOException { String testDirPath = testDir.getPath(); From 31bdb20afc7592231fb631cf073d7ee5c69cedde Mon Sep 17 00:00:00 2001 From: Siddhant Sangwan Date: Fri, 30 May 2025 19:17:34 +0530 Subject: [PATCH 5/5] extract out method, log instead of throwing exception --- .../container/common/impl/HddsDispatcher.java | 29 +++++++++---------- .../common/impl/TestHddsDispatcher.java | 2 +- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/HddsDispatcher.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/HddsDispatcher.java index 9e4ff07f1e7a..4eb945188873 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/HddsDispatcher.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/HddsDispatcher.java @@ -47,8 +47,8 @@ import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerType; import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.Result; import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.Type; -import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos; import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerAction; +import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.NodeReportProto; import org.apache.hadoop.hdds.scm.container.common.helpers.ContainerNotOpenException; import org.apache.hadoop.hdds.scm.container.common.helpers.InvalidContainerStateException; import org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException; @@ -348,11 +348,7 @@ && getMissingContainerSet().contains(containerID)) { boolean isFull = isVolumeFull(container); sendCloseContainerActionIfNeeded(container, isFull); if (isFull) { - try { - handleFullVolume(container.getContainerData().getVolume()); - } catch (StorageContainerException e) { - LOG.warn("Failed to handle full volume while handling request: {}", msg, e); - } + handleFullVolume(container.getContainerData().getVolume(), msg); } } Handler handler = getHandler(containerType); @@ -460,30 +456,33 @@ && getMissingContainerSet().contains(containerID)) { * heartbeat, and then send the heartbeat (including container close action) immediately. * @param volume the volume being written to */ - private void handleFullVolume(HddsVolume volume) throws StorageContainerException { + private void handleFullVolume(HddsVolume volume, ContainerCommandRequestProto request) { long current = System.currentTimeMillis(); long last = fullVolumeLastHeartbeatTriggerMs.get(); boolean isFirstTrigger = last == -1; boolean allowedToTrigger = (current - fullVolumeHeartbeatThrottleIntervalMs) >= last; if (isFirstTrigger || allowedToTrigger) { if (fullVolumeLastHeartbeatTriggerMs.compareAndSet(last, current)) { - StorageContainerDatanodeProtocolProtos.NodeReportProto nodeReport; try { - nodeReport = context.getParent().getContainer().getNodeReport(); - context.refreshFullReport(nodeReport); - context.getParent().triggerHeartbeat(); - LOG.info("Triggering heartbeat for full volume {}, with node report: {}.", volume, nodeReport); + NodeReportProto nodeReport = triggerHeartbeatWithNodeReport(); + LOG.info("Triggered heartbeat for full volume {}, with node report: {}.", volume, nodeReport); } catch (IOException e) { String volumePath = volume.getVolumeRootDir(); StorageLocationReport volumeReport = volume.getReport(); - String error = String.format( - "Failed to create node report when handling full volume %s. Volume Report: %s", volumePath, volumeReport); - throw new StorageContainerException(error, e, Result.IO_EXCEPTION); + LOG.warn("Failed to create node report when handling full volume at path {} for request {}. Volume Report:" + + " {}", volumePath, request, volumeReport, e); } } } } + private NodeReportProto triggerHeartbeatWithNodeReport() throws IOException { + NodeReportProto nodeReport = context.getParent().getContainer().getNodeReport(); + context.refreshFullReport(nodeReport); + context.getParent().triggerHeartbeat(); + return nodeReport; + } + private long getSlowOpThresholdMs(ConfigurationSource config) { return config.getTimeDuration( HddsConfigKeys.HDDS_DATANODE_SLOW_OP_WARNING_THRESHOLD_KEY, diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/impl/TestHddsDispatcher.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/impl/TestHddsDispatcher.java index 62360d793414..408f8fb771d0 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/impl/TestHddsDispatcher.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/impl/TestHddsDispatcher.java @@ -438,7 +438,7 @@ MOCK TO THROW AN EXCEPTION WHEN getNodeReport() IS CALLED getNodeReport() SHOULD BE CALLED, AND LOG CAPTURE SHOULD CONTAIN THE EXCEPTION */ verify(ozoneContainer, times(1)).getNodeReport(); - assertTrue(logCapturer.getOutput().contains("Failed to handle full volume while handling request")); + assertTrue(logCapturer.getOutput().contains("Failed to create node report when handling full volume")); } finally { volumeSet.shutdown(); ContainerMetrics.remove();