diff --git a/hadoop-hdds/common/src/main/resources/ozone-default.xml b/hadoop-hdds/common/src/main/resources/ozone-default.xml
index 97c06a900c14..132fdb0a4c9b 100644
--- a/hadoop-hdds/common/src/main/resources/ozone-default.xml
+++ b/hadoop-hdds/common/src/main/resources/ozone-default.xml
@@ -3541,15 +3541,6 @@
However this parameter allows the namespace to support non-S3 compatible characters.
-
- ozone.om.snapshot.cache.max.size
- 10
- OZONE, OM
-
- Size of the OM Snapshot LRU cache. This is the maximum number of open OM Snapshot RocksDb instances
- that will be held in memory at any time.
-
-
ozone.om.container.location.cache.size
@@ -3687,7 +3678,12 @@
10
OZONE, OM
- Maximum number of entries allowed in the snapshot cache.
+ Size of the OM Snapshot LRU cache.
+
+ This is a soft limit of open OM Snapshot RocksDB instances that will be
+ held. The actual number of cached instance could exceed this limit if
+ more than this number of snapshot instances are still in-use by snapDiff
+ or other tasks.
diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOmSnapshotFileSystem.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOmSnapshotFileSystem.java
index cac8181743c3..b5f62a67a0e9 100644
--- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOmSnapshotFileSystem.java
+++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOmSnapshotFileSystem.java
@@ -39,7 +39,6 @@
import org.apache.hadoop.ozone.client.OzoneVolume;
import org.apache.hadoop.ozone.client.io.OzoneInputStream;
import org.apache.hadoop.ozone.client.io.OzoneOutputStream;
-import org.apache.hadoop.ozone.om.exceptions.OMException;
import org.apache.hadoop.ozone.om.helpers.BucketLayout;
import org.apache.hadoop.ozone.om.helpers.OmKeyArgs;
import org.apache.hadoop.ozone.om.helpers.OpenKeySession;
@@ -59,6 +58,7 @@
import org.slf4j.LoggerFactory;
import java.io.File;
+import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URI;
import java.nio.ByteBuffer;
@@ -411,15 +411,19 @@ public void testBlockSnapshotFSAccessAfterDeletion() throws Exception {
deleteSnapshot(snapshotName);
// Can't access keys in snapshot anymore with FS API. Should throw exception
- final String errorMsg = "no longer active";
- LambdaTestUtils.intercept(OMException.class, errorMsg,
+ final String errorMsg1 = "no longer active";
+ LambdaTestUtils.intercept(FileNotFoundException.class, errorMsg1,
() -> o3fs.listStatus(snapshotRoot));
- LambdaTestUtils.intercept(OMException.class, errorMsg,
+ LambdaTestUtils.intercept(FileNotFoundException.class, errorMsg1,
() -> o3fs.listStatus(snapshotParent));
- LambdaTestUtils.intercept(OMException.class, errorMsg,
+ // Note: Different error message due to inconsistent FNFE client-side
+ // handling in BasicOzoneClientAdapterImpl#getFileStatus
+ // TODO: Reconciliation?
+ final String errorMsg2 = "No such file or directory";
+ LambdaTestUtils.intercept(FileNotFoundException.class, errorMsg2,
() -> o3fs.getFileStatus(snapshotKey1));
- LambdaTestUtils.intercept(OMException.class, errorMsg,
+ LambdaTestUtils.intercept(FileNotFoundException.class, errorMsg2,
() -> o3fs.getFileStatus(snapshotKey2));
}
diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshot.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshot.java
index 655a13d51d98..8d026326bb48 100644
--- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshot.java
+++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshot.java
@@ -260,6 +260,19 @@ public void close() throws IOException {
omMetadataManager.getStore().close();
}
+ @Override
+ protected void finalize() throws Throwable {
+ // Verify that the DB handle has been closed, log warning otherwise
+ // https://softwareengineering.stackexchange.com/a/288724
+ if (!omMetadataManager.getStore().isClosed()) {
+ LOG.warn("{} is not closed properly. snapshotName: {}",
+ // Print hash code for debugging
+ omMetadataManager.getStore().toString(),
+ snapshotName);
+ }
+ super.finalize();
+ }
+
@VisibleForTesting
public OMMetadataManager getMetadataManager() {
return omMetadataManager;
diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java
index f75623a8ad88..9ccf4c4ded85 100644
--- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java
+++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java
@@ -56,8 +56,6 @@
import org.apache.hadoop.ozone.om.exceptions.OMException;
import org.apache.hadoop.ozone.om.helpers.RepeatedOmKeyInfo;
import org.apache.hadoop.ozone.om.helpers.SnapshotInfo;
-import org.apache.hadoop.ozone.om.helpers.SnapshotInfo.SnapshotStatus;
-import org.apache.hadoop.ozone.om.service.SnapshotDeletingService;
import org.apache.hadoop.ozone.om.service.SnapshotDiffCleanupService;
import org.apache.hadoop.ozone.om.snapshot.SnapshotDiffJob;
import org.apache.hadoop.ozone.om.snapshot.SnapshotDiffManager;
@@ -89,8 +87,8 @@
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_SNAPSHOT_DIFF_DB_DIR;
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_SNAPSHOT_DIFF_REPORT_MAX_PAGE_SIZE;
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_SNAPSHOT_DIFF_REPORT_MAX_PAGE_SIZE_DEFAULT;
-import static org.apache.hadoop.ozone.om.exceptions.OMException.ResultCodes.FILE_NOT_FOUND;
import static org.apache.hadoop.ozone.om.exceptions.OMException.ResultCodes.INVALID_KEY_NAME;
+import static org.apache.hadoop.ozone.om.snapshot.SnapshotUtils.checkSnapshotActive;
import static org.apache.hadoop.ozone.om.snapshot.SnapshotUtils.dropColumnFamilyHandle;
/**
@@ -156,6 +154,9 @@ public final class OmSnapshotManager implements AutoCloseable {
private final int maxPageSize;
+ // Soft limit of the snapshot cache size.
+ private final int softCacheSize;
+
public OmSnapshotManager(OzoneManager ozoneManager) {
this.options = new ManagedDBOptions();
this.options.setCreateIfMissing(true);
@@ -204,29 +205,42 @@ public OmSnapshotManager(OzoneManager ozoneManager) {
.getStore()
.getRocksDBCheckpointDiffer();
- // size of lru cache
- int cacheSize = ozoneManager.getConfiguration().getInt(
+ // snapshot cache size, soft-limit
+ this.softCacheSize = ozoneManager.getConfiguration().getInt(
OZONE_OM_SNAPSHOT_CACHE_MAX_SIZE,
OZONE_OM_SNAPSHOT_CACHE_MAX_SIZE_DEFAULT);
CacheLoader loader = createCacheLoader();
- RemovalListener removalListener
- = notification -> {
- try {
- // close snapshot's rocksdb on eviction
- LOG.debug("Closing snapshot: {}", notification.getKey());
- // TODO: [SNAPSHOT] HDDS-7935.Close only when refcount reaches zero?
- notification.getValue().close();
- } catch (IOException e) {
- LOG.error("Failed to close snapshot: {} {}",
- notification.getKey(), e);
- }
- };
+ RemovalListener removalListener = notification -> {
+ try {
+ final String snapshotTableKey = notification.getKey();
+ final OmSnapshot omSnapshot = notification.getValue();
+ if (omSnapshot != null) {
+ // close snapshot's rocksdb on eviction
+ LOG.debug("Closing OmSnapshot '{}' due to {}",
+ snapshotTableKey, notification.getCause());
+ omSnapshot.close();
+ } else {
+ // Cache value would never be null in RemovalListener.
+
+ // When a soft reference is GC'ed by the JVM, this RemovalListener
+ // would be called. But the cache value should still exist at that
+ // point. Thus in-theory this condition won't be triggered by JVM GC
+ throw new IllegalStateException("Unexpected: OmSnapshot is null");
+ }
+ } catch (IOException e) {
+ LOG.error("Failed to close OmSnapshot: {}", notification.getKey(), e);
+ }
+ };
// init LRU cache
- snapshotCache = CacheBuilder.newBuilder()
- .maximumSize(cacheSize)
+ this.snapshotCache = CacheBuilder.newBuilder()
+ // Indicating OmSnapshot instances are softly referenced from the cache.
+ // If no thread is holding a strong reference to an OmSnapshot instance
+ // (e.g. SnapDiff), the instance could be garbage collected by JVM at
+ // its discretion.
+ .softValues()
.removalListener(removalListener)
.build(loader);
@@ -265,31 +279,12 @@ private CacheLoader createCacheLoader() {
@Nonnull
public OmSnapshot load(@Nonnull String snapshotTableKey)
throws IOException {
- SnapshotInfo snapshotInfo;
// see if the snapshot exists
- snapshotInfo = getSnapshotInfo(snapshotTableKey);
+ SnapshotInfo snapshotInfo = getSnapshotInfo(snapshotTableKey);
// Block snapshot from loading when it is no longer active e.g. DELETED,
// unless this is called from SnapshotDeletingService.
- // TODO: [SNAPSHOT] However, snapshotCache.get() from other requests
- // (not from SDS) would be able to piggyback off of this because
- // snapshot still in cache won't trigger loader again.
- // This needs proper addressal in e.g. HDDS-7935
- // by introducing another cache just for SDS.
- // While the snapshotCache would host ACTIVE snapshots only.
- if (!snapshotInfo.getSnapshotStatus().equals(
- SnapshotStatus.SNAPSHOT_ACTIVE)) {
- if (isCalledFromSnapshotDeletingService()) {
- LOG.debug("Permitting {} to load snapshot {} in status: {}",
- SnapshotDeletingService.class.getSimpleName(),
- snapshotInfo.getTableKey(),
- snapshotInfo.getSnapshotStatus());
- } else {
- throw new OMException("Unable to load snapshot. " +
- "Snapshot with table key '" + snapshotTableKey +
- "' is no longer active", FILE_NOT_FOUND);
- }
- }
+ checkSnapshotActive(snapshotInfo);
CacheValue cacheValue =
ozoneManager.getMetadataManager().getSnapshotInfoTable()
@@ -308,7 +303,7 @@ public OmSnapshot load(@Nonnull String snapshotTableKey)
snapshotMetadataManager = new OmMetadataManagerImpl(conf,
snapshotInfo.getCheckpointDirName(), isSnapshotInCache);
} catch (IOException e) {
- LOG.error("Failed to retrieve snapshot: {}, {}", snapshotTableKey, e);
+ LOG.error("Failed to retrieve snapshot: {}", snapshotTableKey);
throw e;
}
@@ -342,24 +337,6 @@ private CodecRegistry createCodecRegistryForSnapDiff() {
return registry;
}
- /**
- * Helper method to check whether the loader is called from
- * SnapshotDeletingTask (return true) or not (return false).
- */
- private boolean isCalledFromSnapshotDeletingService() {
-
- StackTraceElement[] stackTrace = Thread.currentThread().getStackTrace();
- for (StackTraceElement elem : stackTrace) {
- // Allow as long as loader is called from SDS. e.g. SnapshotDeletingTask
- if (elem.getClassName().startsWith(
- SnapshotDeletingService.class.getName())) {
- return true;
- }
- }
-
- return false;
- }
-
/**
* Get snapshot instance LRU cache.
* @return LoadingCache
@@ -449,7 +426,7 @@ private static void deleteKeysInSnapshotScopeFromDTableInternal(
try (TableIterator>
- keyIter = omMetadataManager.getDeletedTable().iterator()) {
+ keyIter = omMetadataManager.getDeletedTable().iterator()) {
keyIter.seek(beginKey);
// Continue only when there are entries of snapshot (bucket) scope
@@ -516,6 +493,15 @@ public IOmMetadataReader checkForSnapshot(String volumeName,
String snapshotTableKey = SnapshotInfo.getTableKey(volumeName,
bucketName, snapshotName);
+ // Block FS API reads when snapshot is not active.
+ checkSnapshotActive(ozoneManager, snapshotTableKey);
+
+ // Warn if actual cache size exceeds the soft limit already.
+ if (snapshotCache.size() > softCacheSize) {
+ LOG.warn("Snapshot cache size ({}) exceeds configured soft-limit ({}).",
+ snapshotCache.size(), softCacheSize);
+ }
+
// retrieve the snapshot from the cache
try {
return snapshotCache.get(snapshotTableKey);
@@ -537,7 +523,7 @@ public static String getSnapshotPrefix(String snapshotName) {
}
public static String getSnapshotPath(OzoneConfiguration conf,
- SnapshotInfo snapshotInfo) {
+ SnapshotInfo snapshotInfo) {
return OMStorage.getOmDbDir(conf) +
OM_KEY_PREFIX + OM_SNAPSHOT_CHECKPOINT_DIR + OM_KEY_PREFIX +
OM_DB_NAME + snapshotInfo.getCheckpointDirName();
@@ -574,7 +560,7 @@ public SnapshotDiffResponse getSnapshotDiffReport(final String volume,
final OmSnapshot fs = snapshotCache.get(fsKey);
final OmSnapshot ts = snapshotCache.get(tsKey);
return snapshotDiffManager.getSnapshotDiffReport(volume, bucket, fs, ts,
- fsInfo, tsInfo, index, pageSize, forceFullDiff);
+ fsInfo, tsInfo, index, pageSize, forceFullDiff);
} catch (ExecutionException e) {
throw new IOException(e.getCause());
}
@@ -583,12 +569,10 @@ public SnapshotDiffResponse getSnapshotDiffReport(final String volume,
private void verifySnapshotInfoForSnapDiff(final SnapshotInfo fromSnapshot,
final SnapshotInfo toSnapshot)
throws IOException {
- if ((fromSnapshot.getSnapshotStatus() != SnapshotStatus.SNAPSHOT_ACTIVE) ||
- (toSnapshot.getSnapshotStatus() != SnapshotStatus.SNAPSHOT_ACTIVE)) {
- // TODO: [SNAPSHOT] Throw custom snapshot exception.
- throw new IOException("Cannot generate snapshot diff for non-active " +
- "snapshots.");
- }
+ // Block SnapDiff if either of the snapshots is not active.
+ checkSnapshotActive(fromSnapshot);
+ checkSnapshotActive(toSnapshot);
+ // Check snapshot creation time
if (fromSnapshot.getCreationTime() > toSnapshot.getCreationTime()) {
throw new IOException("fromSnapshot:" + fromSnapshot.getName() +
" should be older than to toSnapshot:" + toSnapshot.getName());
diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/snapshot/OMSnapshotDeleteRequest.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/snapshot/OMSnapshotDeleteRequest.java
index 7af3f1486fd3..eaaf63cf645b 100644
--- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/snapshot/OMSnapshotDeleteRequest.java
+++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/snapshot/OMSnapshotDeleteRequest.java
@@ -184,10 +184,7 @@ public OMClientResponse validateAndUpdateCache(OzoneManager ozoneManager,
omClientResponse = new OMSnapshotDeleteResponse(
omResponse.build(), tableKey, snapshotInfo);
- // Evict the snapshot entry from cache, and close the snapshot DB
- // Nothing happens if the key doesn't exist in cache (snapshot not loaded)
- ozoneManager.getOmSnapshotManager().getSnapshotCache()
- .invalidate(tableKey);
+ // No longer need to invalidate the entry in the snapshot cache here.
} catch (IOException ex) {
exception = ex;
diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotDiffManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotDiffManager.java
index ebd23dff9a95..7801fce7a10d 100644
--- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotDiffManager.java
+++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotDiffManager.java
@@ -96,6 +96,7 @@
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_SNAPSHOT_FORCE_FULL_DIFF;
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_SNAPSHOT_FORCE_FULL_DIFF_DEFAULT;
import static org.apache.hadoop.ozone.om.OmSnapshotManager.DELIMITER;
+import static org.apache.hadoop.ozone.om.snapshot.SnapshotUtils.checkSnapshotActive;
import static org.apache.hadoop.ozone.om.snapshot.SnapshotUtils.dropColumnFamilyHandle;
import static org.apache.hadoop.ozone.OzoneConsts.OZONE_URI_DELIMITER;
import static org.apache.hadoop.ozone.om.snapshot.SnapshotUtils.getSnapshotInfo;
@@ -1147,6 +1148,11 @@ private synchronized void loadJobOnStartUp(final String jobKey,
String fsKey = SnapshotInfo.getTableKey(volume, bucket, fromSnapshot);
String tsKey = SnapshotInfo.getTableKey(volume, bucket, toSnapshot);
+
+ // Block SnapDiff if either one of the snapshots is not active
+ checkSnapshotActive(ozoneManager, fsKey);
+ checkSnapshotActive(ozoneManager, tsKey);
+
try {
submitSnapDiffJob(jobKey, jobId, volume, bucket, snapshotCache.get(fsKey),
snapshotCache.get(tsKey), fsInfo, tsInfo, forceFullDiff);
diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotUtils.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotUtils.java
index a03f817de366..358f1ed1c03c 100644
--- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotUtils.java
+++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotUtils.java
@@ -23,11 +23,14 @@
import org.apache.hadoop.ozone.om.OzoneManager;
import org.apache.hadoop.ozone.om.exceptions.OMException;
import org.apache.hadoop.ozone.om.helpers.SnapshotInfo;
+import org.apache.hadoop.ozone.om.helpers.SnapshotInfo.SnapshotStatus;
+import org.apache.hadoop.ozone.om.service.SnapshotDeletingService;
import org.rocksdb.ColumnFamilyHandle;
import org.rocksdb.RocksDBException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import static org.apache.hadoop.ozone.om.exceptions.OMException.ResultCodes.FILE_NOT_FOUND;
import static org.apache.hadoop.ozone.om.exceptions.OMException.ResultCodes.KEY_NOT_FOUND;
/**
@@ -83,4 +86,51 @@ public static void dropColumnFamilyHandle(
throw new RuntimeException(exception);
}
}
+
+
+ /**
+ * Throws OMException FILE_NOT_FOUND if snapshot is not in active status.
+ * @param snapshotTableKey snapshot table key
+ */
+ public static void checkSnapshotActive(OzoneManager ozoneManager,
+ String snapshotTableKey)
+ throws IOException {
+ checkSnapshotActive(getSnapshotInfo(ozoneManager, snapshotTableKey));
+ }
+
+ public static void checkSnapshotActive(SnapshotInfo snapInfo)
+ throws OMException {
+
+ if (snapInfo.getSnapshotStatus() != SnapshotStatus.SNAPSHOT_ACTIVE) {
+ if (isCalledFromSnapshotDeletingService()) {
+ LOG.debug("Permitting {} to load snapshot {} even in status: {}",
+ SnapshotDeletingService.class.getSimpleName(),
+ snapInfo.getTableKey(),
+ snapInfo.getSnapshotStatus());
+ } else {
+ throw new OMException("Unable to load snapshot. " +
+ "Snapshot with table key '" + snapInfo.getTableKey() +
+ "' is no longer active", FILE_NOT_FOUND);
+ }
+ }
+ }
+
+ /**
+ * Helper method to check whether the loader is called from
+ * SnapshotDeletingTask (return true) or not (return false).
+ */
+ private static boolean isCalledFromSnapshotDeletingService() {
+
+ StackTraceElement[] stackTrace = Thread.currentThread().getStackTrace();
+ for (StackTraceElement elem : stackTrace) {
+ // Allow as long as loader is called from SDS. e.g. SnapshotDeletingTask
+ if (elem.getClassName().startsWith(
+ SnapshotDeletingService.class.getName())) {
+ return true;
+ }
+ }
+
+ return false;
+ }
+
}
diff --git a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/TestOmSnapshotManager.java b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/TestOmSnapshotManager.java
index c3162784b051..541d0a5c969e 100644
--- a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/TestOmSnapshotManager.java
+++ b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/TestOmSnapshotManager.java
@@ -122,6 +122,11 @@ public void testCloseOnEviction() throws IOException {
.checkForSnapshot(second.getVolumeName(),
second.getBucketName(), getSnapshotPrefix(second.getName()));
+ // As a workaround, invalidate all cache entries in order to trigger
+ // instances close in this test case, since JVM GC most likely would not
+ // have triggered and closed the instances yet at this point.
+ omSnapshotManager.getSnapshotCache().invalidateAll();
+
// confirm store was closed
verify(firstSnapshotStore, timeout(3000).times(1)).close();
}