From d0a4d492b46f42290218134821c3396905217a10 Mon Sep 17 00:00:00 2001 From: Siyao Meng <50227127+smengcl@users.noreply.github.com> Date: Fri, 14 Apr 2023 07:57:55 -0700 Subject: [PATCH 1/7] HDDS-7935. [Snapshot] LRU Cache entries may get evicted/closed during long running processes Change-Id: I1402062eb264e7a4a27014e3cd9f1ba91b6a18bd --- .../apache/hadoop/ozone/OzoneConfigKeys.java | 1 + .../apache/hadoop/ozone/om/OmSnapshot.java | 8 ++ .../hadoop/ozone/om/OmSnapshotManager.java | 94 ++++++++++++------- .../snapshot/OMSnapshotDeleteRequest.java | 5 +- .../om/snapshot/SnapshotDiffManager.java | 5 + 5 files changed, 76 insertions(+), 37 deletions(-) diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/OzoneConfigKeys.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/OzoneConfigKeys.java index 1983afbd3b92..6e4c31180bdd 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/OzoneConfigKeys.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/OzoneConfigKeys.java @@ -600,6 +600,7 @@ public final class OzoneConfigKeys { "org.apache.hadoop.ozone.om.TrashPolicyOzone"; + // TODO: [SNAPSHOT] Document this in ozone-default.xml public static final String OZONE_OM_SNAPSHOT_CACHE_MAX_SIZE = "ozone.om.snapshot.cache.max.size"; public static final int OZONE_OM_SNAPSHOT_CACHE_MAX_SIZE_DEFAULT = 10; diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshot.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshot.java index 655a13d51d98..b8754ca39661 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshot.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshot.java @@ -260,6 +260,14 @@ public void close() throws IOException { omMetadataManager.getStore().close(); } + @Override + protected void finalize() throws IOException { + // Close the DB if it hasn't been closed when GC'ed by JVM + if (!omMetadataManager.getStore().isClosed()) { + omMetadataManager.getStore().close(); + } + } + @VisibleForTesting public OMMetadataManager getMetadataManager() { return omMetadataManager; diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java index e8219eb4236b..2448d6639507 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java @@ -122,6 +122,7 @@ public final class OmSnapshotManager implements AutoCloseable { // TODO: [SNAPSHOT] create config for max allowed page size. private final int maxPageSize = 1000; + private final int softCacheSize; public OmSnapshotManager(OzoneManager ozoneManager) { this.options = new ManagedDBOptions(); @@ -161,44 +162,24 @@ public OmSnapshotManager(OzoneManager ozoneManager) { .getStore() .getRocksDBCheckpointDiffer(); - // size of lru cache - int cacheSize = ozoneManager.getConfiguration().getInt( + // snapshot cache size, soft-limit + this.softCacheSize = ozoneManager.getConfiguration().getInt( OzoneConfigKeys.OZONE_OM_SNAPSHOT_CACHE_MAX_SIZE, OzoneConfigKeys.OZONE_OM_SNAPSHOT_CACHE_MAX_SIZE_DEFAULT); CacheLoader loader; loader = new CacheLoader() { - @Override - // load the snapshot into the cache if not already there + @Override @Nonnull public OmSnapshot load(@Nonnull String snapshotTableKey) throws IOException { - SnapshotInfo snapshotInfo; // see if the snapshot exists - snapshotInfo = getSnapshotInfo(snapshotTableKey); + SnapshotInfo snapshotInfo = getSnapshotInfo(snapshotTableKey); // Block snapshot from loading when it is no longer active e.g. DELETED, // unless this is called from SnapshotDeletingService. - // TODO: [SNAPSHOT] However, snapshotCache.get() from other requests - // (not from SDS) would be able to piggyback off of this because - // snapshot still in cache won't trigger loader again. - // This needs proper addressal in e.g. HDDS-7935 - // by introducing another cache just for SDS. - // While the snapshotCache would host ACTIVE snapshots only. - if (!snapshotInfo.getSnapshotStatus().equals( - SnapshotStatus.SNAPSHOT_ACTIVE)) { - if (isCalledFromSnapshotDeletingService()) { - LOG.debug("Permitting {} to load snapshot {} in status: {}", - SnapshotDeletingService.class.getSimpleName(), - snapshotInfo.getTableKey(), - snapshotInfo.getSnapshotStatus()); - } else { - throw new OMException("Unable to load snapshot. " + - "Snapshot with table key '" + snapshotTableKey + - "' is no longer active", FILE_NOT_FOUND); - } - } + checkSnapshotActive(snapshotInfo); CacheValue cacheValue = ozoneManager.getMetadataManager().getSnapshotInfoTable() @@ -217,7 +198,7 @@ public OmSnapshot load(@Nonnull String snapshotTableKey) snapshotMetadataManager = new OmMetadataManagerImpl(conf, snapshotInfo.getCheckpointDirName(), isSnapshotInCache); } catch (IOException e) { - LOG.error("Failed to retrieve snapshot: {}, {}", snapshotTableKey, e); + LOG.error("Failed to retrieve snapshot: {}", snapshotTableKey, e); throw e; } @@ -239,19 +220,29 @@ public OmSnapshot load(@Nonnull String snapshotTableKey) RemovalListener removalListener = notification -> { try { - // close snapshot's rocksdb on eviction - LOG.debug("Closing snapshot: {}", notification.getKey()); - // TODO: [SNAPSHOT] HDDS-7935.Close only when refcount reaches zero? - notification.getValue().close(); + final String snapshotTableKey = notification.getKey(); + final OmSnapshot omSnapshot = notification.getValue(); + if (omSnapshot != null) { + // close snapshot's rocksdb on eviction + LOG.debug("Closing OmSnapshot '{}'", snapshotTableKey); + omSnapshot.close(); + } else { + // Assuming the value becomes null when weak ref is GC'ed by JVM. + LOG.debug("OmSnapshot '{}' was already garbage collected by JVM.", + snapshotTableKey); + } } catch (IOException e) { - LOG.error("Failed to close snapshot: {} {}", - notification.getKey(), e); + LOG.error("Failed to close snapshot: {}", notification.getKey(), e); } }; // init LRU cache snapshotCache = CacheBuilder.newBuilder() - .maximumSize(cacheSize) + // Indicating OmSnapshot instances are weakly referenced from the cache. + // If no thread is holding a strong reference to an OmSnapshot instance + // (e.g. SnapDiff), the instance will be garbage collected by JVM at + // some point. + .weakValues() .removalListener(removalListener) .build(loader); @@ -260,6 +251,30 @@ public OmSnapshot load(@Nonnull String snapshotTableKey) columnFamilyOptions); } + /** + * Throws OMException FILE_NOT_FOUND if snapshot is not in active status. + * @param snapshotTableKey snapshot table key + */ + public void checkSnapshotActive(String snapshotTableKey) throws IOException { + checkSnapshotActive(getSnapshotInfo(snapshotTableKey)); + } + + private void checkSnapshotActive(SnapshotInfo snapInfo) throws OMException { + + if (!snapInfo.getSnapshotStatus().equals(SnapshotStatus.SNAPSHOT_ACTIVE)) { + if (isCalledFromSnapshotDeletingService()) { + LOG.debug("Permitting {} to load snapshot {} even in status: {}", + SnapshotDeletingService.class.getSimpleName(), + snapInfo.getTableKey(), + snapInfo.getSnapshotStatus()); + } else { + throw new OMException("Unable to load snapshot. " + + "Snapshot with table key '" + snapInfo.getTableKey() + + "' is no longer active", FILE_NOT_FOUND); + } + } + } + /** * Helper method to check whether the loader is called from * SnapshotDeletingTask (return true) or not (return false). @@ -434,6 +449,15 @@ public IOmMetadataReader checkForSnapshot(String volumeName, String snapshotTableKey = SnapshotInfo.getTableKey(volumeName, bucketName, snapshotName); + // Block FS API reads when snapshot is not active. + checkSnapshotActive(snapshotTableKey); + + // Warn if actual cache size exceeds the soft limit already. + if (snapshotCache.size() > softCacheSize) { + LOG.warn("Snapshot cache size ({}) exceeds configured soft-limit ({}).", + snapshotCache.size(), softCacheSize); + } + // retrieve the snapshot from the cache try { return snapshotCache.get(snapshotTableKey); @@ -489,6 +513,10 @@ public SnapshotDiffResponse getSnapshotDiffReport(final String volume, final String fsKey = SnapshotInfo.getTableKey(volume, bucket, fromSnapshot); final String tsKey = SnapshotInfo.getTableKey(volume, bucket, toSnapshot); try { + // Block SnapDiff if either one of the snapshot is not active. + checkSnapshotActive(fsKey); + checkSnapshotActive(tsKey); + final OmSnapshot fs = snapshotCache.get(fsKey); final OmSnapshot ts = snapshotCache.get(tsKey); return snapshotDiffManager.getSnapshotDiffReport(volume, bucket, fs, ts, diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/snapshot/OMSnapshotDeleteRequest.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/snapshot/OMSnapshotDeleteRequest.java index 7af3f1486fd3..a6f52dc9ee42 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/snapshot/OMSnapshotDeleteRequest.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/snapshot/OMSnapshotDeleteRequest.java @@ -184,10 +184,7 @@ public OMClientResponse validateAndUpdateCache(OzoneManager ozoneManager, omClientResponse = new OMSnapshotDeleteResponse( omResponse.build(), tableKey, snapshotInfo); - // Evict the snapshot entry from cache, and close the snapshot DB - // Nothing happens if the key doesn't exist in cache (snapshot not loaded) - ozoneManager.getOmSnapshotManager().getSnapshotCache() - .invalidate(tableKey); + // No longer need to invalidate the snapshot cache here. } catch (IOException ex) { exception = ex; diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotDiffManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotDiffManager.java index 60e7ea7fd0d5..6a5971747f04 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotDiffManager.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotDiffManager.java @@ -927,6 +927,11 @@ private synchronized void loadJobOnStartUp(final String jobKey, String fsKey = SnapshotInfo.getTableKey(volume, bucket, fromSnapshot); String tsKey = SnapshotInfo.getTableKey(volume, bucket, toSnapshot); + + // Block SnapDiff if either one of the snapshots is not active + ozoneManager.getOmSnapshotManager().checkSnapshotActive(fsKey); + ozoneManager.getOmSnapshotManager().checkSnapshotActive(tsKey); + try { submitSnapDiffJob(jobKey, jobId, volume, bucket, snapshotCache.get(fsKey), snapshotCache.get(tsKey), fsInfo, tsInfo, forceFullDiff); From b31f20c44f984280bf558482e08bfeb0ecda7a86 Mon Sep 17 00:00:00 2001 From: Siyao Meng <50227127+smengcl@users.noreply.github.com> Date: Thu, 20 Apr 2023 15:48:19 -0700 Subject: [PATCH 2/7] Use `softValues` rather than `weakValues`; fix finalize(); fix UT. Change-Id: I1e8b244d48e34b33588e0ca7a64ad0dbc25d5123 --- .../java/org/apache/hadoop/ozone/om/OmSnapshot.java | 6 ++++-- .../org/apache/hadoop/ozone/om/OmSnapshotManager.java | 10 +++++----- .../apache/hadoop/ozone/om/TestOmSnapshotManager.java | 5 +++++ 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshot.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshot.java index b8754ca39661..069bca7839a9 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshot.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshot.java @@ -261,11 +261,13 @@ public void close() throws IOException { } @Override - protected void finalize() throws IOException { - // Close the DB if it hasn't been closed when GC'ed by JVM + protected void finalize() throws Throwable { + // Close the DB if it hasn't been closed when this OmSnapshot instance is + // garbage-collected by the JVM to avoid handle leaks. if (!omMetadataManager.getStore().isClosed()) { omMetadataManager.getStore().close(); } + super.finalize(); } @VisibleForTesting diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java index 2448d6639507..f85f2076ab2f 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java @@ -227,7 +227,7 @@ public OmSnapshot load(@Nonnull String snapshotTableKey) LOG.debug("Closing OmSnapshot '{}'", snapshotTableKey); omSnapshot.close(); } else { - // Assuming the value becomes null when weak ref is GC'ed by JVM. + // Assuming the value becomes null when soft ref is GC'ed by JVM. LOG.debug("OmSnapshot '{}' was already garbage collected by JVM.", snapshotTableKey); } @@ -238,11 +238,11 @@ public OmSnapshot load(@Nonnull String snapshotTableKey) // init LRU cache snapshotCache = CacheBuilder.newBuilder() - // Indicating OmSnapshot instances are weakly referenced from the cache. + // Indicating OmSnapshot instances are softly referenced from the cache. // If no thread is holding a strong reference to an OmSnapshot instance - // (e.g. SnapDiff), the instance will be garbage collected by JVM at - // some point. - .weakValues() + // (e.g. SnapDiff), the instance could be garbage collected by JVM at + // its discretion. + .softValues() .removalListener(removalListener) .build(loader); diff --git a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/TestOmSnapshotManager.java b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/TestOmSnapshotManager.java index 583e381d25ba..34cf6a38c2ea 100644 --- a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/TestOmSnapshotManager.java +++ b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/TestOmSnapshotManager.java @@ -123,6 +123,11 @@ public void testCloseOnEviction() throws IOException { .checkForSnapshot(second.getVolumeName(), second.getBucketName(), getSnapshotPrefix(second.getName())); + // As a workaround, invalidate all cache entries in order to trigger + // instances close in this test case, since JVM GC most likely would not + // have triggered and closed the instances yet at this point. + omSnapshotManager.getSnapshotCache().invalidateAll(); + // confirm store was closed verify(firstSnapshotStore, timeout(3000).times(1)).close(); } From 45e9ba426bee6112c483eafd79924ad613158641 Mon Sep 17 00:00:00 2001 From: Siyao Meng <50227127+smengcl@users.noreply.github.com> Date: Thu, 20 Apr 2023 16:11:30 -0700 Subject: [PATCH 3/7] Re-apply changes in OmSnapshotManager. Change-Id: I653dfc6373862ae39e2e71a77b8998439dcf57b3 --- .../hadoop/ozone/om/OmSnapshotManager.java | 97 ++++++++++++------- .../snapshot/OMSnapshotDeleteRequest.java | 2 +- 2 files changed, 64 insertions(+), 35 deletions(-) diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java index 754b5fa41ddc..85ec72da17be 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java @@ -149,6 +149,8 @@ public final class OmSnapshotManager implements AutoCloseable { // TODO: [SNAPSHOT] create config for max allowed page size. private final int maxPageSize = 1000; + // Soft limit of the snapshot cache size. + private final int softCacheSize; public OmSnapshotManager(OzoneManager ozoneManager) { this.options = new ManagedDBOptions(); @@ -194,29 +196,38 @@ public OmSnapshotManager(OzoneManager ozoneManager) { .getStore() .getRocksDBCheckpointDiffer(); - // size of lru cache - int cacheSize = ozoneManager.getConfiguration().getInt( + // snapshot cache size, soft-limit + this.softCacheSize = ozoneManager.getConfiguration().getInt( OzoneConfigKeys.OZONE_OM_SNAPSHOT_CACHE_MAX_SIZE, OzoneConfigKeys.OZONE_OM_SNAPSHOT_CACHE_MAX_SIZE_DEFAULT); CacheLoader loader = createCacheLoader(); - RemovalListener removalListener - = notification -> { + RemovalListener removalListener = notification -> { try { - // close snapshot's rocksdb on eviction - LOG.debug("Closing snapshot: {}", notification.getKey()); - // TODO: [SNAPSHOT] HDDS-7935.Close only when refcount reaches zero? - notification.getValue().close(); + final String snapshotTableKey = notification.getKey(); + final OmSnapshot omSnapshot = notification.getValue(); + if (omSnapshot != null) { + // close snapshot's rocksdb on eviction + LOG.debug("Closing OmSnapshot '{}'", snapshotTableKey); + omSnapshot.close(); + } else { + // Assuming the value becomes null when soft ref is GC'ed by JVM. + LOG.debug("OmSnapshot '{}' was already garbage collected by JVM.", + snapshotTableKey); + } } catch (IOException e) { - LOG.error("Failed to close snapshot: {} {}", - notification.getKey(), e); + LOG.error("Failed to close snapshot: {}", notification.getKey(), e); } }; // init LRU cache - snapshotCache = CacheBuilder.newBuilder() - .maximumSize(cacheSize) + this.snapshotCache = CacheBuilder.newBuilder() + // Indicating OmSnapshot instances are softly referenced from the cache. + // If no thread is holding a strong reference to an OmSnapshot instance + // (e.g. SnapDiff), the instance could be garbage collected by JVM at + // its discretion. + .softValues() .removalListener(removalListener) .build(loader); @@ -241,6 +252,30 @@ public OmSnapshotManager(OzoneManager ozoneManager) { this.snapshotDiffCleanupService.start(); } + /** + * Throws OMException FILE_NOT_FOUND if snapshot is not in active status. + * @param snapshotTableKey snapshot table key + */ + public void checkSnapshotActive(String snapshotTableKey) throws IOException { + checkSnapshotActive(getSnapshotInfo(snapshotTableKey)); + } + + private void checkSnapshotActive(SnapshotInfo snapInfo) throws OMException { + + if (!snapInfo.getSnapshotStatus().equals(SnapshotStatus.SNAPSHOT_ACTIVE)) { + if (isCalledFromSnapshotDeletingService()) { + LOG.debug("Permitting {} to load snapshot {} even in status: {}", + SnapshotDeletingService.class.getSimpleName(), + snapInfo.getTableKey(), + snapInfo.getSnapshotStatus()); + } else { + throw new OMException("Unable to load snapshot. " + + "Snapshot with table key '" + snapInfo.getTableKey() + + "' is no longer active", FILE_NOT_FOUND); + } + } + } + private CacheLoader createCacheLoader() { return new CacheLoader() { @Override @@ -249,31 +284,12 @@ private CacheLoader createCacheLoader() { @Nonnull public OmSnapshot load(@Nonnull String snapshotTableKey) throws IOException { - SnapshotInfo snapshotInfo; // see if the snapshot exists - snapshotInfo = getSnapshotInfo(snapshotTableKey); + SnapshotInfo snapshotInfo = getSnapshotInfo(snapshotTableKey); // Block snapshot from loading when it is no longer active e.g. DELETED, // unless this is called from SnapshotDeletingService. - // TODO: [SNAPSHOT] However, snapshotCache.get() from other requests - // (not from SDS) would be able to piggyback off of this because - // snapshot still in cache won't trigger loader again. - // This needs proper addressal in e.g. HDDS-7935 - // by introducing another cache just for SDS. - // While the snapshotCache would host ACTIVE snapshots only. - if (!snapshotInfo.getSnapshotStatus().equals( - SnapshotStatus.SNAPSHOT_ACTIVE)) { - if (isCalledFromSnapshotDeletingService()) { - LOG.debug("Permitting {} to load snapshot {} in status: {}", - SnapshotDeletingService.class.getSimpleName(), - snapshotInfo.getTableKey(), - snapshotInfo.getSnapshotStatus()); - } else { - throw new OMException("Unable to load snapshot. " + - "Snapshot with table key '" + snapshotTableKey + - "' is no longer active", FILE_NOT_FOUND); - } - } + checkSnapshotActive(snapshotInfo); CacheValue cacheValue = ozoneManager.getMetadataManager().getSnapshotInfoTable() @@ -292,7 +308,7 @@ public OmSnapshot load(@Nonnull String snapshotTableKey) snapshotMetadataManager = new OmMetadataManagerImpl(conf, snapshotInfo.getCheckpointDirName(), isSnapshotInCache); } catch (IOException e) { - LOG.error("Failed to retrieve snapshot: {}, {}", snapshotTableKey, e); + LOG.error("Failed to retrieve snapshot: {}", snapshotTableKey, e); throw e; } @@ -500,6 +516,15 @@ public IOmMetadataReader checkForSnapshot(String volumeName, String snapshotTableKey = SnapshotInfo.getTableKey(volumeName, bucketName, snapshotName); + // Block FS API reads when snapshot is not active. + checkSnapshotActive(snapshotTableKey); + + // Warn if actual cache size exceeds the soft limit already. + if (snapshotCache.size() > softCacheSize) { + LOG.warn("Snapshot cache size ({}) exceeds configured soft-limit ({}).", + snapshotCache.size(), softCacheSize); + } + // retrieve the snapshot from the cache try { return snapshotCache.get(snapshotTableKey); @@ -555,6 +580,10 @@ public SnapshotDiffResponse getSnapshotDiffReport(final String volume, final String fsKey = SnapshotInfo.getTableKey(volume, bucket, fromSnapshot); final String tsKey = SnapshotInfo.getTableKey(volume, bucket, toSnapshot); try { + // Block SnapDiff if either one of the snapshot is not active. + checkSnapshotActive(fsKey); + checkSnapshotActive(tsKey); + final OmSnapshot fs = snapshotCache.get(fsKey); final OmSnapshot ts = snapshotCache.get(tsKey); return snapshotDiffManager.getSnapshotDiffReport(volume, bucket, fs, ts, diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/snapshot/OMSnapshotDeleteRequest.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/snapshot/OMSnapshotDeleteRequest.java index a6f52dc9ee42..eaaf63cf645b 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/snapshot/OMSnapshotDeleteRequest.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/snapshot/OMSnapshotDeleteRequest.java @@ -184,7 +184,7 @@ public OMClientResponse validateAndUpdateCache(OzoneManager ozoneManager, omClientResponse = new OMSnapshotDeleteResponse( omResponse.build(), tableKey, snapshotInfo); - // No longer need to invalidate the snapshot cache here. + // No longer need to invalidate the entry in the snapshot cache here. } catch (IOException ex) { exception = ex; From 9d4283b5a890e899fc03ab1045c95b7cb131f0e9 Mon Sep 17 00:00:00 2001 From: Siyao Meng <50227127+smengcl@users.noreply.github.com> Date: Thu, 20 Apr 2023 16:32:18 -0700 Subject: [PATCH 4/7] Fix UT: exception class change. Change-Id: I915f770e4864647d324264640b3d8300508df97e --- .../ozone/om/TestOmSnapshotFileSystem.java | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOmSnapshotFileSystem.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOmSnapshotFileSystem.java index cac8181743c3..b5f62a67a0e9 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOmSnapshotFileSystem.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOmSnapshotFileSystem.java @@ -39,7 +39,6 @@ import org.apache.hadoop.ozone.client.OzoneVolume; import org.apache.hadoop.ozone.client.io.OzoneInputStream; import org.apache.hadoop.ozone.client.io.OzoneOutputStream; -import org.apache.hadoop.ozone.om.exceptions.OMException; import org.apache.hadoop.ozone.om.helpers.BucketLayout; import org.apache.hadoop.ozone.om.helpers.OmKeyArgs; import org.apache.hadoop.ozone.om.helpers.OpenKeySession; @@ -59,6 +58,7 @@ import org.slf4j.LoggerFactory; import java.io.File; +import java.io.FileNotFoundException; import java.io.IOException; import java.net.URI; import java.nio.ByteBuffer; @@ -411,15 +411,19 @@ public void testBlockSnapshotFSAccessAfterDeletion() throws Exception { deleteSnapshot(snapshotName); // Can't access keys in snapshot anymore with FS API. Should throw exception - final String errorMsg = "no longer active"; - LambdaTestUtils.intercept(OMException.class, errorMsg, + final String errorMsg1 = "no longer active"; + LambdaTestUtils.intercept(FileNotFoundException.class, errorMsg1, () -> o3fs.listStatus(snapshotRoot)); - LambdaTestUtils.intercept(OMException.class, errorMsg, + LambdaTestUtils.intercept(FileNotFoundException.class, errorMsg1, () -> o3fs.listStatus(snapshotParent)); - LambdaTestUtils.intercept(OMException.class, errorMsg, + // Note: Different error message due to inconsistent FNFE client-side + // handling in BasicOzoneClientAdapterImpl#getFileStatus + // TODO: Reconciliation? + final String errorMsg2 = "No such file or directory"; + LambdaTestUtils.intercept(FileNotFoundException.class, errorMsg2, () -> o3fs.getFileStatus(snapshotKey1)); - LambdaTestUtils.intercept(OMException.class, errorMsg, + LambdaTestUtils.intercept(FileNotFoundException.class, errorMsg2, () -> o3fs.getFileStatus(snapshotKey2)); } From 76c0c8310f79d80fc2dba0a60da59d8119cd2d0f Mon Sep 17 00:00:00 2001 From: Siyao Meng <50227127+smengcl@users.noreply.github.com> Date: Wed, 26 Apr 2023 15:49:15 -0700 Subject: [PATCH 5/7] Fix checkstyle caused during merge Change-Id: Icb08433e935f14b119eb76e4740038491eea2ac5 --- .../main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java index af6983938063..0572e9aa7498 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java @@ -150,8 +150,6 @@ public final class OmSnapshotManager implements AutoCloseable { private final CodecRegistry codecRegistry; private final SnapshotDiffCleanupService snapshotDiffCleanupService; - // TODO: [SNAPSHOT] create config for max allowed page size. - private final int maxPageSize = 1000; // Soft limit of the snapshot cache size. private final int softCacheSize; From 53d330d7f12ed439870e5c8114e5d983a43e9022 Mon Sep 17 00:00:00 2001 From: Siyao Meng <50227127+smengcl@users.noreply.github.com> Date: Thu, 27 Apr 2023 10:59:47 -0700 Subject: [PATCH 6/7] Address comments --- .../apache/hadoop/ozone/om/OmSnapshot.java | 9 ++- .../hadoop/ozone/om/OmSnapshotManager.java | 78 ++++--------------- .../om/snapshot/SnapshotDiffManager.java | 5 +- .../ozone/om/snapshot/SnapshotUtils.java | 50 ++++++++++++ 4 files changed, 75 insertions(+), 67 deletions(-) diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshot.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshot.java index 069bca7839a9..8d026326bb48 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshot.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshot.java @@ -262,10 +262,13 @@ public void close() throws IOException { @Override protected void finalize() throws Throwable { - // Close the DB if it hasn't been closed when this OmSnapshot instance is - // garbage-collected by the JVM to avoid handle leaks. + // Verify that the DB handle has been closed, log warning otherwise + // https://softwareengineering.stackexchange.com/a/288724 if (!omMetadataManager.getStore().isClosed()) { - omMetadataManager.getStore().close(); + LOG.warn("{} is not closed properly. snapshotName: {}", + // Print hash code for debugging + omMetadataManager.getStore().toString(), + snapshotName); } super.finalize(); } diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java index 0572e9aa7498..743458201883 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java @@ -57,8 +57,6 @@ import org.apache.hadoop.ozone.om.exceptions.OMException; import org.apache.hadoop.ozone.om.helpers.RepeatedOmKeyInfo; import org.apache.hadoop.ozone.om.helpers.SnapshotInfo; -import org.apache.hadoop.ozone.om.helpers.SnapshotInfo.SnapshotStatus; -import org.apache.hadoop.ozone.om.service.SnapshotDeletingService; import org.apache.hadoop.ozone.om.service.SnapshotDiffCleanupService; import org.apache.hadoop.ozone.om.snapshot.SnapshotDiffJob; import org.apache.hadoop.ozone.om.snapshot.SnapshotDiffManager; @@ -84,8 +82,8 @@ import static org.apache.hadoop.ozone.OzoneConsts.OM_SNAPSHOT_DIFF_DB_NAME; import static org.apache.hadoop.ozone.OzoneConsts.OM_SNAPSHOT_INDICATOR; import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_SNAPSHOT_DIFF_DB_DIR; -import static org.apache.hadoop.ozone.om.exceptions.OMException.ResultCodes.FILE_NOT_FOUND; import static org.apache.hadoop.ozone.om.exceptions.OMException.ResultCodes.INVALID_KEY_NAME; +import static org.apache.hadoop.ozone.om.snapshot.SnapshotUtils.checkSnapshotActive; import static org.apache.hadoop.ozone.om.snapshot.SnapshotUtils.dropColumnFamilyHandle; /** @@ -210,15 +208,19 @@ public OmSnapshotManager(OzoneManager ozoneManager) { final OmSnapshot omSnapshot = notification.getValue(); if (omSnapshot != null) { // close snapshot's rocksdb on eviction - LOG.debug("Closing OmSnapshot '{}'", snapshotTableKey); + LOG.debug("Closing OmSnapshot '{}' due to {}", + snapshotTableKey, notification.getCause()); omSnapshot.close(); } else { - // Assuming the value becomes null when soft ref is GC'ed by JVM. - LOG.debug("OmSnapshot '{}' was already garbage collected by JVM.", - snapshotTableKey); + // Cache value would never be null in RemovalListener. + + // When a soft reference is GC'ed by the JVM, this RemovalListener + // would be called. But the cache value should still exist at that + // point. Thus in-theory this condition won't be triggered by JVM GC + throw new IllegalStateException("Unexpected: OmSnapshot is null"); } } catch (IOException e) { - LOG.error("Failed to close snapshot: {}", notification.getKey(), e); + LOG.error("Failed to close OmSnapshot: {}", notification.getKey(), e); } }; @@ -253,30 +255,6 @@ public OmSnapshotManager(OzoneManager ozoneManager) { this.snapshotDiffCleanupService.start(); } - /** - * Throws OMException FILE_NOT_FOUND if snapshot is not in active status. - * @param snapshotTableKey snapshot table key - */ - public void checkSnapshotActive(String snapshotTableKey) throws IOException { - checkSnapshotActive(getSnapshotInfo(snapshotTableKey)); - } - - private void checkSnapshotActive(SnapshotInfo snapInfo) throws OMException { - - if (!snapInfo.getSnapshotStatus().equals(SnapshotStatus.SNAPSHOT_ACTIVE)) { - if (isCalledFromSnapshotDeletingService()) { - LOG.debug("Permitting {} to load snapshot {} even in status: {}", - SnapshotDeletingService.class.getSimpleName(), - snapInfo.getTableKey(), - snapInfo.getSnapshotStatus()); - } else { - throw new OMException("Unable to load snapshot. " + - "Snapshot with table key '" + snapInfo.getTableKey() + - "' is no longer active", FILE_NOT_FOUND); - } - } - } - private CacheLoader createCacheLoader() { return new CacheLoader() { @Override @@ -309,7 +287,7 @@ public OmSnapshot load(@Nonnull String snapshotTableKey) snapshotMetadataManager = new OmMetadataManagerImpl(conf, snapshotInfo.getCheckpointDirName(), isSnapshotInCache); } catch (IOException e) { - LOG.error("Failed to retrieve snapshot: {}", snapshotTableKey, e); + LOG.error("Failed to retrieve snapshot: {}", snapshotTableKey); throw e; } @@ -343,24 +321,6 @@ private CodecRegistry createCodecRegistryForSnapDiff() { return registry; } - /** - * Helper method to check whether the loader is called from - * SnapshotDeletingTask (return true) or not (return false). - */ - private boolean isCalledFromSnapshotDeletingService() { - - StackTraceElement[] stackTrace = Thread.currentThread().getStackTrace(); - for (StackTraceElement elem : stackTrace) { - // Allow as long as loader is called from SDS. e.g. SnapshotDeletingTask - if (elem.getClassName().startsWith( - SnapshotDeletingService.class.getName())) { - return true; - } - } - - return false; - } - /** * Get snapshot instance LRU cache. * @return LoadingCache @@ -518,7 +478,7 @@ public IOmMetadataReader checkForSnapshot(String volumeName, bucketName, snapshotName); // Block FS API reads when snapshot is not active. - checkSnapshotActive(snapshotTableKey); + checkSnapshotActive(ozoneManager, snapshotTableKey); // Warn if actual cache size exceeds the soft limit already. if (snapshotCache.size() > softCacheSize) { @@ -584,10 +544,6 @@ public SnapshotDiffResponse getSnapshotDiffReport(final String volume, final String fsKey = SnapshotInfo.getTableKey(volume, bucket, fromSnapshot); final String tsKey = SnapshotInfo.getTableKey(volume, bucket, toSnapshot); try { - // Block SnapDiff if either one of the snapshot is not active. - checkSnapshotActive(fsKey); - checkSnapshotActive(tsKey); - final OmSnapshot fs = snapshotCache.get(fsKey); final OmSnapshot ts = snapshotCache.get(tsKey); return snapshotDiffManager.getSnapshotDiffReport(volume, bucket, fs, ts, @@ -600,12 +556,10 @@ public SnapshotDiffResponse getSnapshotDiffReport(final String volume, private void verifySnapshotInfoForSnapDiff(final SnapshotInfo fromSnapshot, final SnapshotInfo toSnapshot) throws IOException { - if ((fromSnapshot.getSnapshotStatus() != SnapshotStatus.SNAPSHOT_ACTIVE) || - (toSnapshot.getSnapshotStatus() != SnapshotStatus.SNAPSHOT_ACTIVE)) { - // TODO: [SNAPSHOT] Throw custom snapshot exception. - throw new IOException("Cannot generate snapshot diff for non-active " + - "snapshots."); - } + // Block SnapDiff if either of the snapshots is not active. + checkSnapshotActive(fromSnapshot); + checkSnapshotActive(toSnapshot); + // Check snapshot creation time if (fromSnapshot.getCreationTime() > toSnapshot.getCreationTime()) { throw new IOException("fromSnapshot:" + fromSnapshot.getName() + " should be older than to toSnapshot:" + toSnapshot.getName()); diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotDiffManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotDiffManager.java index 81c260b3cb24..9e280aef2363 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotDiffManager.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotDiffManager.java @@ -85,6 +85,7 @@ import static org.apache.hadoop.ozone.OzoneConsts.OM_KEY_PREFIX; import static org.apache.hadoop.ozone.om.OmSnapshotManager.DELIMITER; +import static org.apache.hadoop.ozone.om.snapshot.SnapshotUtils.checkSnapshotActive; import static org.apache.hadoop.ozone.om.snapshot.SnapshotUtils.dropColumnFamilyHandle; import static org.apache.hadoop.ozone.OzoneConsts.OZONE_URI_DELIMITER; import static org.apache.hadoop.ozone.om.snapshot.SnapshotUtils.getSnapshotInfo; @@ -1057,8 +1058,8 @@ private synchronized void loadJobOnStartUp(final String jobKey, String tsKey = SnapshotInfo.getTableKey(volume, bucket, toSnapshot); // Block SnapDiff if either one of the snapshots is not active - ozoneManager.getOmSnapshotManager().checkSnapshotActive(fsKey); - ozoneManager.getOmSnapshotManager().checkSnapshotActive(tsKey); + checkSnapshotActive(ozoneManager, fsKey); + checkSnapshotActive(ozoneManager, tsKey); try { submitSnapDiffJob(jobKey, jobId, volume, bucket, snapshotCache.get(fsKey), diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotUtils.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotUtils.java index a03f817de366..358f1ed1c03c 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotUtils.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotUtils.java @@ -23,11 +23,14 @@ import org.apache.hadoop.ozone.om.OzoneManager; import org.apache.hadoop.ozone.om.exceptions.OMException; import org.apache.hadoop.ozone.om.helpers.SnapshotInfo; +import org.apache.hadoop.ozone.om.helpers.SnapshotInfo.SnapshotStatus; +import org.apache.hadoop.ozone.om.service.SnapshotDeletingService; import org.rocksdb.ColumnFamilyHandle; import org.rocksdb.RocksDBException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import static org.apache.hadoop.ozone.om.exceptions.OMException.ResultCodes.FILE_NOT_FOUND; import static org.apache.hadoop.ozone.om.exceptions.OMException.ResultCodes.KEY_NOT_FOUND; /** @@ -83,4 +86,51 @@ public static void dropColumnFamilyHandle( throw new RuntimeException(exception); } } + + + /** + * Throws OMException FILE_NOT_FOUND if snapshot is not in active status. + * @param snapshotTableKey snapshot table key + */ + public static void checkSnapshotActive(OzoneManager ozoneManager, + String snapshotTableKey) + throws IOException { + checkSnapshotActive(getSnapshotInfo(ozoneManager, snapshotTableKey)); + } + + public static void checkSnapshotActive(SnapshotInfo snapInfo) + throws OMException { + + if (snapInfo.getSnapshotStatus() != SnapshotStatus.SNAPSHOT_ACTIVE) { + if (isCalledFromSnapshotDeletingService()) { + LOG.debug("Permitting {} to load snapshot {} even in status: {}", + SnapshotDeletingService.class.getSimpleName(), + snapInfo.getTableKey(), + snapInfo.getSnapshotStatus()); + } else { + throw new OMException("Unable to load snapshot. " + + "Snapshot with table key '" + snapInfo.getTableKey() + + "' is no longer active", FILE_NOT_FOUND); + } + } + } + + /** + * Helper method to check whether the loader is called from + * SnapshotDeletingTask (return true) or not (return false). + */ + private static boolean isCalledFromSnapshotDeletingService() { + + StackTraceElement[] stackTrace = Thread.currentThread().getStackTrace(); + for (StackTraceElement elem : stackTrace) { + // Allow as long as loader is called from SDS. e.g. SnapshotDeletingTask + if (elem.getClassName().startsWith( + SnapshotDeletingService.class.getName())) { + return true; + } + } + + return false; + } + } From 7fabe86a2df1b50c639f9b48ee55e520049ac228 Mon Sep 17 00:00:00 2001 From: Siyao Meng <50227127+smengcl@users.noreply.github.com> Date: Tue, 2 May 2023 11:27:58 -0700 Subject: [PATCH 7/7] Adjust description in ozone-default.xml. Change-Id: I6720da0c4740bf626b9737272e8f35b0621f1cb8 --- hadoop-hdds/common/src/main/resources/ozone-default.xml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/hadoop-hdds/common/src/main/resources/ozone-default.xml b/hadoop-hdds/common/src/main/resources/ozone-default.xml index ca61077eaece..132fdb0a4c9b 100644 --- a/hadoop-hdds/common/src/main/resources/ozone-default.xml +++ b/hadoop-hdds/common/src/main/resources/ozone-default.xml @@ -3678,7 +3678,12 @@ 10 OZONE, OM - Maximum number of entries allowed in the snapshot cache. + Size of the OM Snapshot LRU cache. + + This is a soft limit of open OM Snapshot RocksDB instances that will be + held. The actual number of cached instance could exceed this limit if + more than this number of snapshot instances are still in-use by snapDiff + or other tasks.