From e84b54911783dbfb89160ce75b79b8c7de6a0ee8 Mon Sep 17 00:00:00 2001 From: Aryan Gupta Date: Mon, 28 Apr 2025 17:19:47 +0530 Subject: [PATCH 1/3] HDDS-12087. TransactionToDNsCommitMap too large causes GC to pause for a long time. --- .../java/org/apache/hadoop/hdds/HddsConfigKeys.java | 4 ++++ .../apache/hadoop/hdds/scm/block/DeletedBlockLog.java | 2 ++ .../hadoop/hdds/scm/block/DeletedBlockLogImpl.java | 5 +++++ .../hdds/scm/block/SCMBlockDeletingService.java | 11 +++++++++++ .../SCMDeletedBlockTransactionStatusManager.java | 4 ++++ 5 files changed, 26 insertions(+) diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java index c9d94f0b91e7..559881717005 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java @@ -74,6 +74,10 @@ public final class HddsConfigKeys { public static final String HDDS_CONTAINER_CLOSE_THRESHOLD = "hdds.container.close.threshold"; public static final float HDDS_CONTAINER_CLOSE_THRESHOLD_DEFAULT = 0.9f; + + public static final String HDDS_SCM_TXN_DN_COMMIT_MAP_SIZE = "hdds.scm.txn.dn.commit.map.size"; + + public static final int HDDS_SCM_TXN_DN_COMMIT_MAP_SIZE_DEFAULT = 5000000; public static final String HDDS_SCM_SAFEMODE_ENABLED = "hdds.scm.safemode.enabled"; diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/DeletedBlockLog.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/DeletedBlockLog.java index 1952bf7f2fb2..1ac97dae3bcd 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/DeletedBlockLog.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/DeletedBlockLog.java @@ -142,4 +142,6 @@ void addTransactions(Map> containerBlocksMap) * @param deletedBlocksTXTable delete transaction table */ void 
reinitialize(Table deletedBlocksTXTable); + + int getTransactionToDNsCommitMapSize(); } diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/DeletedBlockLogImpl.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/DeletedBlockLogImpl.java index bc675d0ff988..b57d3dcf2d19 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/DeletedBlockLogImpl.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/DeletedBlockLogImpl.java @@ -488,6 +488,11 @@ public void recordTransactionCreated(UUID dnId, long scmCmdId, .recordTransactionCreated(dnId, scmCmdId, dnTxSet); } + @Override + public int getTransactionToDNsCommitMapSize() { + return getSCMDeletedBlockTransactionStatusManager().getTransactionToDNsCommitMapSize(); + } + @Override public void onDatanodeDead(UUID dnId) { getSCMDeletedBlockTransactionStatusManager().onDatanodeDead(dnId); diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/SCMBlockDeletingService.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/SCMBlockDeletingService.java index 31c3eea14b67..f3304bbfba3b 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/SCMBlockDeletingService.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/SCMBlockDeletingService.java @@ -17,6 +17,8 @@ package org.apache.hadoop.hdds.scm.block; +import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_TXN_DN_COMMIT_MAP_SIZE; +import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_TXN_DN_COMMIT_MAP_SIZE_DEFAULT; import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_BLOCK_DELETING_SERVICE_TIMEOUT; import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_BLOCK_DELETING_SERVICE_TIMEOUT_DEFAULT; @@ -93,6 +95,7 @@ public class SCMBlockDeletingService extends BackgroundService private final long safemodeExitRunDelayMillis; private final long 
deleteBlocksPendingCommandLimit; private final Clock clock; + private final int transactionToDNsCommitMapSize; @SuppressWarnings("parameternumber") public SCMBlockDeletingService(DeletedBlockLog deletedBlockLog, @@ -115,6 +118,8 @@ public SCMBlockDeletingService(DeletedBlockLog deletedBlockLog, DatanodeConfiguration dnConf = conf.getObject(DatanodeConfiguration.class); this.deleteBlocksPendingCommandLimit = dnConf.getBlockDeleteQueueLimit(); + this.transactionToDNsCommitMapSize = + conf.getInt(HDDS_SCM_TXN_DN_COMMIT_MAP_SIZE, HDDS_SCM_TXN_DN_COMMIT_MAP_SIZE_DEFAULT); this.clock = clock; this.deletedBlockLog = deletedBlockLog; this.nodeManager = nodeManager; @@ -167,6 +172,12 @@ public EmptyTaskResult call() throws Exception { final Set included = getDatanodesWithinCommandLimit(datanodes); int blockDeletionLimit = getBlockDeleteTXNum(); + int txnToDNsCommitMapSize = deletedBlockLog.getTransactionToDNsCommitMapSize(); + if (txnToDNsCommitMapSize >= transactionToDNsCommitMapSize) { + LOG.warn("Skipping block deletion as transactionToDNsCommitMap size = {}, exceeds threshold {}", + txnToDNsCommitMapSize, transactionToDNsCommitMapSize); + return EmptyTaskResult.newResult(); + } DatanodeDeletedBlockTransactions transactions = deletedBlockLog.getTransactions(blockDeletionLimit, included); diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/SCMDeletedBlockTransactionStatusManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/SCMDeletedBlockTransactionStatusManager.java index 4a4aa3b607fc..4be3a1a57461 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/SCMDeletedBlockTransactionStatusManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/SCMDeletedBlockTransactionStatusManager.java @@ -572,4 +572,8 @@ private boolean isTransactionFailed(DeleteBlockTransactionResult result) { } return false; } + + public int getTransactionToDNsCommitMapSize() { + 
return transactionToDNsCommitMap.size(); + } } From 51277a77316bf1828d7f080c8c440e5c57481cd1 Mon Sep 17 00:00:00 2001 From: Aryan Gupta Date: Mon, 28 Apr 2025 20:45:49 +0530 Subject: [PATCH 2/3] Addressed review comments. --- .../org/apache/hadoop/hdds/HddsConfigKeys.java | 4 ---- .../java/org/apache/hadoop/hdds/scm/ScmConfig.java | 14 ++++++++++++++ .../hdds/scm/block/SCMBlockDeletingService.java | 11 ++++------- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java index 559881717005..c9d94f0b91e7 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java @@ -74,10 +74,6 @@ public final class HddsConfigKeys { public static final String HDDS_CONTAINER_CLOSE_THRESHOLD = "hdds.container.close.threshold"; public static final float HDDS_CONTAINER_CLOSE_THRESHOLD_DEFAULT = 0.9f; - - public static final String HDDS_SCM_TXN_DN_COMMIT_MAP_SIZE = "hdds.scm.txn.dn.commit.map.size"; - - public static final int HDDS_SCM_TXN_DN_COMMIT_MAP_SIZE_DEFAULT = 5000000; public static final String HDDS_SCM_SAFEMODE_ENABLED = "hdds.scm.safemode.enabled"; diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfig.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfig.java index db5fde5efe65..4812684dbb44 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfig.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfig.java @@ -139,6 +139,20 @@ public class ScmConfig extends ReconfigurableConfig { ) private int defaultLayoutVersionOnInit = -1; + @Config(key = "hdds.scm.block.deletion.txn.dn.commit.map.limit", + defaultValue = "5000000", + type = ConfigType.INT, + tags = { ConfigTag.SCM }, + description = + " This value 
indicates the size of the transactionToDNsCommitMap after which" + + " we will skip one round of scm block deleting interval." + ) + private int transactionToDNsCommitMapLimit = 5000000; + + public int getTransactionToDNsCommitMapLimit() { + return transactionToDNsCommitMapLimit; + } + public Duration getBlockDeletionInterval() { return blockDeletionInterval; } diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/SCMBlockDeletingService.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/SCMBlockDeletingService.java index f3304bbfba3b..6d80dbf0b52d 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/SCMBlockDeletingService.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/SCMBlockDeletingService.java @@ -17,8 +17,6 @@ package org.apache.hadoop.hdds.scm.block; -import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_TXN_DN_COMMIT_MAP_SIZE; -import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_TXN_DN_COMMIT_MAP_SIZE_DEFAULT; import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_BLOCK_DELETING_SERVICE_TIMEOUT; import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_BLOCK_DELETING_SERVICE_TIMEOUT_DEFAULT; @@ -95,7 +93,7 @@ public class SCMBlockDeletingService extends BackgroundService private final long safemodeExitRunDelayMillis; private final long deleteBlocksPendingCommandLimit; private final Clock clock; - private final int transactionToDNsCommitMapSize; + private final int transactionToDNsCommitMapLimit; @SuppressWarnings("parameternumber") public SCMBlockDeletingService(DeletedBlockLog deletedBlockLog, @@ -118,8 +116,7 @@ public SCMBlockDeletingService(DeletedBlockLog deletedBlockLog, DatanodeConfiguration dnConf = conf.getObject(DatanodeConfiguration.class); this.deleteBlocksPendingCommandLimit = dnConf.getBlockDeleteQueueLimit(); - this.transactionToDNsCommitMapSize = - conf.getInt(HDDS_SCM_TXN_DN_COMMIT_MAP_SIZE, 
HDDS_SCM_TXN_DN_COMMIT_MAP_SIZE_DEFAULT); + this.transactionToDNsCommitMapLimit = scmConfig.getTransactionToDNsCommitMapLimit(); this.clock = clock; this.deletedBlockLog = deletedBlockLog; this.nodeManager = nodeManager; @@ -173,9 +170,9 @@ public EmptyTaskResult call() throws Exception { getDatanodesWithinCommandLimit(datanodes); int blockDeletionLimit = getBlockDeleteTXNum(); int txnToDNsCommitMapSize = deletedBlockLog.getTransactionToDNsCommitMapSize(); - if (txnToDNsCommitMapSize >= transactionToDNsCommitMapSize) { + if (txnToDNsCommitMapSize >= transactionToDNsCommitMapLimit) { LOG.warn("Skipping block deletion as transactionToDNsCommitMap size = {}, exceeds threshold {}", - txnToDNsCommitMapSize, transactionToDNsCommitMapSize); + txnToDNsCommitMapSize, transactionToDNsCommitMapLimit); return EmptyTaskResult.newResult(); } DatanodeDeletedBlockTransactions transactions = From a823d16d3dc999617ab84d66e2b143fe3e77a135 Mon Sep 17 00:00:00 2001 From: Aryan Gupta Date: Sun, 4 May 2025 21:19:50 +0530 Subject: [PATCH 3/3] Remove unused config. --- .../java/org/apache/hadoop/hdds/scm/ScmConfig.java | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfig.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfig.java index d8dc68662d31..dbab95efd4bd 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfig.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfig.java @@ -128,17 +128,6 @@ public class ScmConfig extends ReconfigurableConfig { ) private Duration blockDeletionInterval = Duration.ofSeconds(60); - @Config(key = "init.default.layout.version", - defaultValue = "-1", - type = ConfigType.INT, - tags = { ConfigTag.SCM, ConfigTag.UPGRADE }, - description = - " Default Layout Version to init the SCM with. Intended to be used " + - "in tests to finalize from an older version of SCM to the " + - "latest. 
By default, SCM init uses the highest layout version." - ) - private int defaultLayoutVersionOnInit = -1; - @Config(key = "hdds.scm.block.deletion.txn.dn.commit.map.limit", defaultValue = "5000000", type = ConfigType.INT,