diff --git a/hadoop-hdds/common/src/main/resources/ozone-default.xml b/hadoop-hdds/common/src/main/resources/ozone-default.xml index 97c06a900c14..132fdb0a4c9b 100644 --- a/hadoop-hdds/common/src/main/resources/ozone-default.xml +++ b/hadoop-hdds/common/src/main/resources/ozone-default.xml @@ -3541,15 +3541,6 @@ However this parameter allows the namespace to support non-S3 compatible characters. - - ozone.om.snapshot.cache.max.size - 10 - OZONE, OM - - Size of the OM Snapshot LRU cache. This is the maximum number of open OM Snapshot RocksDb instances - that will be held in memory at any time. - - ozone.om.container.location.cache.size @@ -3687,7 +3678,12 @@ 10 OZONE, OM - Maximum number of entries allowed in the snapshot cache. + Size of the OM Snapshot LRU cache. + + This is a soft limit of open OM Snapshot RocksDB instances that will be + held. The actual number of cached instance could exceed this limit if + more than this number of snapshot instances are still in-use by snapDiff + or other tasks. diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOmSnapshotFileSystem.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOmSnapshotFileSystem.java index cac8181743c3..b5f62a67a0e9 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOmSnapshotFileSystem.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOmSnapshotFileSystem.java @@ -39,7 +39,6 @@ import org.apache.hadoop.ozone.client.OzoneVolume; import org.apache.hadoop.ozone.client.io.OzoneInputStream; import org.apache.hadoop.ozone.client.io.OzoneOutputStream; -import org.apache.hadoop.ozone.om.exceptions.OMException; import org.apache.hadoop.ozone.om.helpers.BucketLayout; import org.apache.hadoop.ozone.om.helpers.OmKeyArgs; import org.apache.hadoop.ozone.om.helpers.OpenKeySession; @@ -59,6 +58,7 @@ import org.slf4j.LoggerFactory; import java.io.File; +import java.io.FileNotFoundException; import java.io.IOException; import java.net.URI; import java.nio.ByteBuffer; @@ -411,15 +411,19 @@ public void testBlockSnapshotFSAccessAfterDeletion() throws Exception { deleteSnapshot(snapshotName); // Can't access keys in snapshot anymore with FS API. Should throw exception - final String errorMsg = "no longer active"; - LambdaTestUtils.intercept(OMException.class, errorMsg, + final String errorMsg1 = "no longer active"; + LambdaTestUtils.intercept(FileNotFoundException.class, errorMsg1, () -> o3fs.listStatus(snapshotRoot)); - LambdaTestUtils.intercept(OMException.class, errorMsg, + LambdaTestUtils.intercept(FileNotFoundException.class, errorMsg1, () -> o3fs.listStatus(snapshotParent)); - LambdaTestUtils.intercept(OMException.class, errorMsg, + // Note: Different error message due to inconsistent FNFE client-side + // handling in BasicOzoneClientAdapterImpl#getFileStatus + // TODO: Reconciliation? + final String errorMsg2 = "No such file or directory"; + LambdaTestUtils.intercept(FileNotFoundException.class, errorMsg2, () -> o3fs.getFileStatus(snapshotKey1)); - LambdaTestUtils.intercept(OMException.class, errorMsg, + LambdaTestUtils.intercept(FileNotFoundException.class, errorMsg2, () -> o3fs.getFileStatus(snapshotKey2)); } diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshot.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshot.java index 655a13d51d98..8d026326bb48 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshot.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshot.java @@ -260,6 +260,19 @@ public void close() throws IOException { omMetadataManager.getStore().close(); } + @Override + protected void finalize() throws Throwable { + // Verify that the DB handle has been closed, log warning otherwise + // https://softwareengineering.stackexchange.com/a/288724 + if (!omMetadataManager.getStore().isClosed()) { + LOG.warn("{} is not closed properly. snapshotName: {}", + // Print hash code for debugging + omMetadataManager.getStore().toString(), + snapshotName); + } + super.finalize(); + } + @VisibleForTesting public OMMetadataManager getMetadataManager() { return omMetadataManager; diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java index f75623a8ad88..9ccf4c4ded85 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmSnapshotManager.java @@ -56,8 +56,6 @@ import org.apache.hadoop.ozone.om.exceptions.OMException; import org.apache.hadoop.ozone.om.helpers.RepeatedOmKeyInfo; import org.apache.hadoop.ozone.om.helpers.SnapshotInfo; -import org.apache.hadoop.ozone.om.helpers.SnapshotInfo.SnapshotStatus; -import org.apache.hadoop.ozone.om.service.SnapshotDeletingService; import org.apache.hadoop.ozone.om.service.SnapshotDiffCleanupService; import org.apache.hadoop.ozone.om.snapshot.SnapshotDiffJob; import org.apache.hadoop.ozone.om.snapshot.SnapshotDiffManager; @@ -89,8 +87,8 @@ import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_SNAPSHOT_DIFF_DB_DIR; import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_SNAPSHOT_DIFF_REPORT_MAX_PAGE_SIZE; import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_SNAPSHOT_DIFF_REPORT_MAX_PAGE_SIZE_DEFAULT; -import static org.apache.hadoop.ozone.om.exceptions.OMException.ResultCodes.FILE_NOT_FOUND; import static org.apache.hadoop.ozone.om.exceptions.OMException.ResultCodes.INVALID_KEY_NAME; +import static org.apache.hadoop.ozone.om.snapshot.SnapshotUtils.checkSnapshotActive; import static org.apache.hadoop.ozone.om.snapshot.SnapshotUtils.dropColumnFamilyHandle; /** @@ -156,6 +154,9 @@ public final class OmSnapshotManager implements AutoCloseable { private final int maxPageSize; + // Soft limit of the snapshot cache size. + private final int softCacheSize; + public OmSnapshotManager(OzoneManager ozoneManager) { this.options = new ManagedDBOptions(); this.options.setCreateIfMissing(true); @@ -204,29 +205,42 @@ public OmSnapshotManager(OzoneManager ozoneManager) { .getStore() .getRocksDBCheckpointDiffer(); - // size of lru cache - int cacheSize = ozoneManager.getConfiguration().getInt( + // snapshot cache size, soft-limit + this.softCacheSize = ozoneManager.getConfiguration().getInt( OZONE_OM_SNAPSHOT_CACHE_MAX_SIZE, OZONE_OM_SNAPSHOT_CACHE_MAX_SIZE_DEFAULT); CacheLoader loader = createCacheLoader(); - RemovalListener removalListener - = notification -> { - try { - // close snapshot's rocksdb on eviction - LOG.debug("Closing snapshot: {}", notification.getKey()); - // TODO: [SNAPSHOT] HDDS-7935.Close only when refcount reaches zero? - notification.getValue().close(); - } catch (IOException e) { - LOG.error("Failed to close snapshot: {} {}", - notification.getKey(), e); - } - }; + RemovalListener removalListener = notification -> { + try { + final String snapshotTableKey = notification.getKey(); + final OmSnapshot omSnapshot = notification.getValue(); + if (omSnapshot != null) { + // close snapshot's rocksdb on eviction + LOG.debug("Closing OmSnapshot '{}' due to {}", + snapshotTableKey, notification.getCause()); + omSnapshot.close(); + } else { + // Cache value would never be null in RemovalListener. + + // When a soft reference is GC'ed by the JVM, this RemovalListener + // would be called. But the cache value should still exist at that + // point. Thus in-theory this condition won't be triggered by JVM GC + throw new IllegalStateException("Unexpected: OmSnapshot is null"); + } + } catch (IOException e) { + LOG.error("Failed to close OmSnapshot: {}", notification.getKey(), e); + } + }; // init LRU cache - snapshotCache = CacheBuilder.newBuilder() - .maximumSize(cacheSize) + this.snapshotCache = CacheBuilder.newBuilder() + // Indicating OmSnapshot instances are softly referenced from the cache. + // If no thread is holding a strong reference to an OmSnapshot instance + // (e.g. SnapDiff), the instance could be garbage collected by JVM at + // its discretion. + .softValues() .removalListener(removalListener) .build(loader); @@ -265,31 +279,12 @@ private CacheLoader createCacheLoader() { @Nonnull public OmSnapshot load(@Nonnull String snapshotTableKey) throws IOException { - SnapshotInfo snapshotInfo; // see if the snapshot exists - snapshotInfo = getSnapshotInfo(snapshotTableKey); + SnapshotInfo snapshotInfo = getSnapshotInfo(snapshotTableKey); // Block snapshot from loading when it is no longer active e.g. DELETED, // unless this is called from SnapshotDeletingService. - // TODO: [SNAPSHOT] However, snapshotCache.get() from other requests - // (not from SDS) would be able to piggyback off of this because - // snapshot still in cache won't trigger loader again. - // This needs proper addressal in e.g. HDDS-7935 - // by introducing another cache just for SDS. - // While the snapshotCache would host ACTIVE snapshots only. - if (!snapshotInfo.getSnapshotStatus().equals( - SnapshotStatus.SNAPSHOT_ACTIVE)) { - if (isCalledFromSnapshotDeletingService()) { - LOG.debug("Permitting {} to load snapshot {} in status: {}", - SnapshotDeletingService.class.getSimpleName(), - snapshotInfo.getTableKey(), - snapshotInfo.getSnapshotStatus()); - } else { - throw new OMException("Unable to load snapshot. " + - "Snapshot with table key '" + snapshotTableKey + - "' is no longer active", FILE_NOT_FOUND); - } - } + checkSnapshotActive(snapshotInfo); CacheValue cacheValue = ozoneManager.getMetadataManager().getSnapshotInfoTable() @@ -308,7 +303,7 @@ public OmSnapshot load(@Nonnull String snapshotTableKey) snapshotMetadataManager = new OmMetadataManagerImpl(conf, snapshotInfo.getCheckpointDirName(), isSnapshotInCache); } catch (IOException e) { - LOG.error("Failed to retrieve snapshot: {}, {}", snapshotTableKey, e); + LOG.error("Failed to retrieve snapshot: {}", snapshotTableKey); throw e; } @@ -342,24 +337,6 @@ private CodecRegistry createCodecRegistryForSnapDiff() { return registry; } - /** - * Helper method to check whether the loader is called from - * SnapshotDeletingTask (return true) or not (return false). - */ - private boolean isCalledFromSnapshotDeletingService() { - - StackTraceElement[] stackTrace = Thread.currentThread().getStackTrace(); - for (StackTraceElement elem : stackTrace) { - // Allow as long as loader is called from SDS. e.g. SnapshotDeletingTask - if (elem.getClassName().startsWith( - SnapshotDeletingService.class.getName())) { - return true; - } - } - - return false; - } - /** * Get snapshot instance LRU cache. * @return LoadingCache @@ -449,7 +426,7 @@ private static void deleteKeysInSnapshotScopeFromDTableInternal( try (TableIterator> - keyIter = omMetadataManager.getDeletedTable().iterator()) { + keyIter = omMetadataManager.getDeletedTable().iterator()) { keyIter.seek(beginKey); // Continue only when there are entries of snapshot (bucket) scope @@ -516,6 +493,15 @@ public IOmMetadataReader checkForSnapshot(String volumeName, String snapshotTableKey = SnapshotInfo.getTableKey(volumeName, bucketName, snapshotName); + // Block FS API reads when snapshot is not active. + checkSnapshotActive(ozoneManager, snapshotTableKey); + + // Warn if actual cache size exceeds the soft limit already. + if (snapshotCache.size() > softCacheSize) { + LOG.warn("Snapshot cache size ({}) exceeds configured soft-limit ({}).", + snapshotCache.size(), softCacheSize); + } + // retrieve the snapshot from the cache try { return snapshotCache.get(snapshotTableKey); @@ -537,7 +523,7 @@ public static String getSnapshotPrefix(String snapshotName) { } public static String getSnapshotPath(OzoneConfiguration conf, - SnapshotInfo snapshotInfo) { + SnapshotInfo snapshotInfo) { return OMStorage.getOmDbDir(conf) + OM_KEY_PREFIX + OM_SNAPSHOT_CHECKPOINT_DIR + OM_KEY_PREFIX + OM_DB_NAME + snapshotInfo.getCheckpointDirName(); @@ -574,7 +560,7 @@ public SnapshotDiffResponse getSnapshotDiffReport(final String volume, final OmSnapshot fs = snapshotCache.get(fsKey); final OmSnapshot ts = snapshotCache.get(tsKey); return snapshotDiffManager.getSnapshotDiffReport(volume, bucket, fs, ts, - fsInfo, tsInfo, index, pageSize, forceFullDiff); + fsInfo, tsInfo, index, pageSize, forceFullDiff); } catch (ExecutionException e) { throw new IOException(e.getCause()); } @@ -583,12 +569,10 @@ public SnapshotDiffResponse getSnapshotDiffReport(final String volume, private void verifySnapshotInfoForSnapDiff(final SnapshotInfo fromSnapshot, final SnapshotInfo toSnapshot) throws IOException { - if ((fromSnapshot.getSnapshotStatus() != SnapshotStatus.SNAPSHOT_ACTIVE) || - (toSnapshot.getSnapshotStatus() != SnapshotStatus.SNAPSHOT_ACTIVE)) { - // TODO: [SNAPSHOT] Throw custom snapshot exception. - throw new IOException("Cannot generate snapshot diff for non-active " + - "snapshots."); - } + // Block SnapDiff if either of the snapshots is not active. + checkSnapshotActive(fromSnapshot); + checkSnapshotActive(toSnapshot); + // Check snapshot creation time if (fromSnapshot.getCreationTime() > toSnapshot.getCreationTime()) { throw new IOException("fromSnapshot:" + fromSnapshot.getName() + " should be older than to toSnapshot:" + toSnapshot.getName()); diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/snapshot/OMSnapshotDeleteRequest.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/snapshot/OMSnapshotDeleteRequest.java index 7af3f1486fd3..eaaf63cf645b 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/snapshot/OMSnapshotDeleteRequest.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/snapshot/OMSnapshotDeleteRequest.java @@ -184,10 +184,7 @@ public OMClientResponse validateAndUpdateCache(OzoneManager ozoneManager, omClientResponse = new OMSnapshotDeleteResponse( omResponse.build(), tableKey, snapshotInfo); - // Evict the snapshot entry from cache, and close the snapshot DB - // Nothing happens if the key doesn't exist in cache (snapshot not loaded) - ozoneManager.getOmSnapshotManager().getSnapshotCache() - .invalidate(tableKey); + // No longer need to invalidate the entry in the snapshot cache here. } catch (IOException ex) { exception = ex; diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotDiffManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotDiffManager.java index ebd23dff9a95..7801fce7a10d 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotDiffManager.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotDiffManager.java @@ -96,6 +96,7 @@ import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_SNAPSHOT_FORCE_FULL_DIFF; import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_SNAPSHOT_FORCE_FULL_DIFF_DEFAULT; import static org.apache.hadoop.ozone.om.OmSnapshotManager.DELIMITER; +import static org.apache.hadoop.ozone.om.snapshot.SnapshotUtils.checkSnapshotActive; import static org.apache.hadoop.ozone.om.snapshot.SnapshotUtils.dropColumnFamilyHandle; import static org.apache.hadoop.ozone.OzoneConsts.OZONE_URI_DELIMITER; import static org.apache.hadoop.ozone.om.snapshot.SnapshotUtils.getSnapshotInfo; @@ -1147,6 +1148,11 @@ private synchronized void loadJobOnStartUp(final String jobKey, String fsKey = SnapshotInfo.getTableKey(volume, bucket, fromSnapshot); String tsKey = SnapshotInfo.getTableKey(volume, bucket, toSnapshot); + + // Block SnapDiff if either one of the snapshots is not active + checkSnapshotActive(ozoneManager, fsKey); + checkSnapshotActive(ozoneManager, tsKey); + try { submitSnapDiffJob(jobKey, jobId, volume, bucket, snapshotCache.get(fsKey), snapshotCache.get(tsKey), fsInfo, tsInfo, forceFullDiff); diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotUtils.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotUtils.java index a03f817de366..358f1ed1c03c 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotUtils.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotUtils.java @@ -23,11 +23,14 @@ import org.apache.hadoop.ozone.om.OzoneManager; import org.apache.hadoop.ozone.om.exceptions.OMException; import org.apache.hadoop.ozone.om.helpers.SnapshotInfo; +import org.apache.hadoop.ozone.om.helpers.SnapshotInfo.SnapshotStatus; +import org.apache.hadoop.ozone.om.service.SnapshotDeletingService; import org.rocksdb.ColumnFamilyHandle; import org.rocksdb.RocksDBException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import static org.apache.hadoop.ozone.om.exceptions.OMException.ResultCodes.FILE_NOT_FOUND; import static org.apache.hadoop.ozone.om.exceptions.OMException.ResultCodes.KEY_NOT_FOUND; /** @@ -83,4 +86,51 @@ public static void dropColumnFamilyHandle( throw new RuntimeException(exception); } } + + + /** + * Throws OMException FILE_NOT_FOUND if snapshot is not in active status. + * @param snapshotTableKey snapshot table key + */ + public static void checkSnapshotActive(OzoneManager ozoneManager, + String snapshotTableKey) + throws IOException { + checkSnapshotActive(getSnapshotInfo(ozoneManager, snapshotTableKey)); + } + + public static void checkSnapshotActive(SnapshotInfo snapInfo) + throws OMException { + + if (snapInfo.getSnapshotStatus() != SnapshotStatus.SNAPSHOT_ACTIVE) { + if (isCalledFromSnapshotDeletingService()) { + LOG.debug("Permitting {} to load snapshot {} even in status: {}", + SnapshotDeletingService.class.getSimpleName(), + snapInfo.getTableKey(), + snapInfo.getSnapshotStatus()); + } else { + throw new OMException("Unable to load snapshot. " + + "Snapshot with table key '" + snapInfo.getTableKey() + + "' is no longer active", FILE_NOT_FOUND); + } + } + } + + /** + * Helper method to check whether the loader is called from + * SnapshotDeletingTask (return true) or not (return false). + */ + private static boolean isCalledFromSnapshotDeletingService() { + + StackTraceElement[] stackTrace = Thread.currentThread().getStackTrace(); + for (StackTraceElement elem : stackTrace) { + // Allow as long as loader is called from SDS. e.g. SnapshotDeletingTask + if (elem.getClassName().startsWith( + SnapshotDeletingService.class.getName())) { + return true; + } + } + + return false; + } + } diff --git a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/TestOmSnapshotManager.java b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/TestOmSnapshotManager.java index c3162784b051..541d0a5c969e 100644 --- a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/TestOmSnapshotManager.java +++ b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/TestOmSnapshotManager.java @@ -122,6 +122,11 @@ public void testCloseOnEviction() throws IOException { .checkForSnapshot(second.getVolumeName(), second.getBucketName(), getSnapshotPrefix(second.getName())); + // As a workaround, invalidate all cache entries in order to trigger + // instances close in this test case, since JVM GC most likely would not + // have triggered and closed the instances yet at this point. + omSnapshotManager.getSnapshotCache().invalidateAll(); + // confirm store was closed verify(firstSnapshotStore, timeout(3000).times(1)).close(); }