From 7ffaae95076fc3ad9315355efeabb00868b503fb Mon Sep 17 00:00:00 2001 From: Hanisha Koneru Date: Thu, 25 Jun 2020 13:07:09 -0700 Subject: [PATCH 1/9] HDDS-3741. Reload old OM state if Install Snapshot from Leader fails --- .../apache/hadoop/ozone/om/OzoneManager.java | 106 ++++++++++-------- 1 file changed, 62 insertions(+), 44 deletions(-) diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java index 0905d81887b1..d3fb81355936 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java @@ -217,6 +217,7 @@ import org.apache.ratis.proto.RaftProtos.RaftPeerRole; import org.apache.ratis.server.protocol.TermIndex; +import org.apache.ratis.util.ExitUtils; import org.apache.ratis.util.FileUtils; import org.apache.ratis.util.LifeCycle; import org.bouncycastle.pkcs.PKCS10CertificationRequest; @@ -3083,32 +3084,15 @@ public TermIndex installSnapshot(String leaderId) { } DBCheckpoint omDBcheckpoint = getDBCheckpointFromLeader(leaderId); - Path newDBlocation = omDBcheckpoint.getCheckpointLocation(); + Path newDBLocation = omDBcheckpoint.getCheckpointLocation(); LOG.info("Downloaded checkpoint from Leader {}, in to the location {}", - leaderId, newDBlocation); + leaderId, newDBLocation); - // Check if current ratis log index is smaller than the downloaded - // checkpoint transaction index. If yes, proceed by stopping the ratis - // server so that the OM state can be re-initialized. If no, then do not - // proceed with installSnapshot. + OMTransactionInfo omTransactionInfo = getTransactionInfoFromDB( + newDBLocation); - OMTransactionInfo omTransactionInfo = null; - - Path dbDir = newDBlocation.getParent(); - if (dbDir == null) { - LOG.error("Incorrect DB location path {} received from checkpoint.", - newDBlocation); - return null; - } - - try { - omTransactionInfo = - OzoneManagerRatisUtils.getTransactionInfoFromDownloadedSnapshot( - configuration, dbDir); - } catch (Exception ex) { - LOG.error("Failed during opening downloaded snapshot from " + - "{} to obtain transaction index", newDBlocation, ex); + if (omTransactionInfo == null) { return null; } @@ -3124,45 +3108,61 @@ public TermIndex installSnapshot(String leaderId) { } catch (Exception e) { LOG.error("Failed to stop/ pause the services. Cannot proceed with " + "installing the new checkpoint.", e); + + // During stopServices, if KeyManager was stopped successfully and + // OMMetadataManager stop failed, we should restart the KeyManager. + keyManager.start(configuration); + return null; } - //TODO: un-pause SM if any failures and retry? + File dbBackup; + TermIndex termIndex = omRatisServer.getLastAppliedTermIndex(); + long currentTerm = termIndex.getTerm(); + long lastAppliedIndex = termIndex.getIndex(); + boolean loadSuccess = false; - long lastAppliedIndex = omRatisServer.getLastAppliedTermIndex().getIndex(); + try { + // Check if current applied log index is smaller than the downloaded + // checkpoint transaction index. If yes, proceed by stopping the ratis + // server so that the OM state can be re-initialized. If no then do not + // proceed with installSnapshot. + boolean canProceed = OzoneManagerRatisUtils.verifyTransactionInfo( + omTransactionInfo, lastAppliedIndex, leaderId, newDBLocation); + if (!canProceed) { + return null; + } - boolean canProceed = - OzoneManagerRatisUtils.verifyTransactionInfo(omTransactionInfo, - lastAppliedIndex, leaderId, newDBlocation); + try { + dbBackup = replaceOMDBWithCheckpoint(lastAppliedIndex, oldDBLocation, + newDBLocation); + } catch (Exception e) { + LOG.error("OM DB checkpoint replacement with new downloaded " + + "checkpoint failed.", e); + return null; + } - // If downloaded DB has transaction info less than current one, return. - if (!canProceed) { - return null; + loadSuccess = true; + } finally { + if (!loadSuccess) { + omRatisServer.getOmStateMachine().unpause(lastAppliedIndex, + currentTerm); + } } long leaderIndex = omTransactionInfo.getTransactionIndex(); long leaderTerm = omTransactionInfo.getCurrentTerm(); - - File dbBackup; - try { - dbBackup = replaceOMDBWithCheckpoint(lastAppliedIndex, oldDBLocation, - newDBlocation); - } catch (Exception e) { - LOG.error("OM DB checkpoint replacement with new downloaded checkpoint " + - "failed.", e); - return null; - } - // Reload the OM DB store with the new checkpoint. // Restart (unpause) the state machine and update its last applied index // to the installed checkpoint's snapshot index. try { reloadOMState(leaderIndex, leaderTerm); omRatisServer.getOmStateMachine().unpause(leaderIndex, leaderTerm); - } catch (IOException e) { - LOG.error("Failed to reload OM state with new DB checkpoint.", e); - return null; + } catch (IOException ex) { + String errorMsg = "Failed to reload OM state and instantiate services " + + "after installing Snapshot from Leader."; + ExitUtils.terminate(1, errorMsg, ex, LOG); } // Delete the backup DB @@ -3203,6 +3203,24 @@ void stopServices() throws Exception { metadataManager.stop(); } + private OMTransactionInfo getTransactionInfoFromDB(Path dbPath) { + Path dbDir = dbPath.getParent(); + if (dbDir == null) { + LOG.error("Incorrect DB location path {} received from checkpoint.", + dbPath); + return null; + } + + try { + return OzoneManagerRatisUtils.getTransactionInfoFromDownloadedSnapshot( + configuration, dbDir); + } catch (Exception ex) { + LOG.error("Install Snapshot failed during opening downloaded snapshot " + + "from {} to obtain transaction index", dbPath, ex); + return null; + } + } + /** * Replace the current OM DB with the new DB checkpoint. * From be58f0b9a46137d3f064d729e11589ba386e66dc Mon Sep 17 00:00:00 2001 From: Hanisha Koneru Date: Tue, 30 Jun 2020 12:12:54 -0700 Subject: [PATCH 2/9] review fixes --- .../org/apache/hadoop/ozone/OzoneConsts.java | 3 + .../ozone/om/OmMetadataManagerImpl.java | 17 +++ .../apache/hadoop/ozone/om/OzoneManager.java | 109 ++++++++++-------- .../ratis/utils/OzoneManagerRatisUtils.java | 5 +- 4 files changed, 86 insertions(+), 48 deletions(-) diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/OzoneConsts.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/OzoneConsts.java index e340b3231491..4b380948abd9 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/OzoneConsts.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/OzoneConsts.java @@ -366,4 +366,7 @@ private OzoneConsts() { public static final String CONTAINER_DB_TYPE_ROCKSDB = "RocksDB"; public static final String CONTAINER_DB_TYPE_LEVELDB = "LevelDB"; + + // An on-disk transient marker file used when replacing DB with checkpoint + public static final String DB_TRANSIENT_MARKER = "dbInconsistentMarker"; } diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmMetadataManagerImpl.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmMetadataManagerImpl.java index d48c6fa9a36f..a6cd17487201 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmMetadataManagerImpl.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmMetadataManagerImpl.java @@ -18,6 +18,7 @@ import java.io.File; import java.io.IOException; +import java.nio.file.Files; import java.nio.file.Paths; import java.util.ArrayList; import java.util.HashMap; @@ -77,9 +78,11 @@ import org.apache.commons.lang3.StringUtils; import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_OPEN_KEY_EXPIRE_THRESHOLD_SECONDS; import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_OPEN_KEY_EXPIRE_THRESHOLD_SECONDS_DEFAULT; +import static org.apache.hadoop.ozone.OzoneConsts.DB_TRANSIENT_MARKER; import static org.apache.hadoop.ozone.OzoneConsts.OM_DB_NAME; import static org.apache.hadoop.ozone.OzoneConsts.OM_KEY_PREFIX; +import org.apache.ratis.util.ExitUtils; import org.eclipse.jetty.util.StringUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -249,6 +252,20 @@ public void start(OzoneConfiguration configuration) throws IOException { if (store == null) { File metaDir = OMStorage.getOmDbDir(configuration); + // Check if there is a DB Inconsistent Marker in the metaDir. This + // marker indicates that the DB is in an inconsistent state and hence + // the OM process should be terminated. + File markerFile = new File(metaDir, DB_TRANSIENT_MARKER); + if (Files.exists(markerFile.toPath())) { + LOG.error("File {} marks that OM DB is in an inconsistent state."); + // Note - The marker file should be deleted only after fixing the DB. + // In an HA setup, this can be done by replacing this DB with a + // checkpoint from another OM. + String errorMsg = "Cannot load OM DB as it is in an inconsistent " + + "state."; + ExitUtils.terminate(1, errorMsg, LOG); + } + RocksDBConfiguration rocksDBConfiguration = configuration.getObject(RocksDBConfiguration.class); diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java index d3fb81355936..2c981985c52b 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java @@ -194,6 +194,7 @@ import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_KEY_PREALLOCATION_BLOCKS_MAX_DEFAULT; import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_SCM_BLOCK_SIZE; import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_SCM_BLOCK_SIZE_DEFAULT; +import static org.apache.hadoop.ozone.OzoneConsts.DB_TRANSIENT_MARKER; import static org.apache.hadoop.ozone.OzoneConsts.OM_METRICS_FILE; import static org.apache.hadoop.ozone.OzoneConsts.OM_METRICS_TEMP_FILE; import static org.apache.hadoop.ozone.OzoneConsts.RPC_PORT; @@ -3089,10 +3090,11 @@ public TermIndex installSnapshot(String leaderId) { LOG.info("Downloaded checkpoint from Leader {}, in to the location {}", leaderId, newDBLocation); - OMTransactionInfo omTransactionInfo = getTransactionInfoFromDB( + OMTransactionInfo omTransactionInfo = getTrxnInfoFromCheckpoint( newDBLocation); if (omTransactionInfo == null) { + LOG.error("Failed to install snapshot from {}."); return null; } @@ -3116,65 +3118,64 @@ public TermIndex installSnapshot(String leaderId) { return null; } - File dbBackup; + File dbBackup = null; TermIndex termIndex = omRatisServer.getLastAppliedTermIndex(); - long currentTerm = termIndex.getTerm(); + long term = termIndex.getTerm(); long lastAppliedIndex = termIndex.getIndex(); - boolean loadSuccess = false; - try { - // Check if current applied log index is smaller than the downloaded - // checkpoint transaction index. If yes, proceed by stopping the ratis - // server so that the OM state can be re-initialized. If no then do not - // proceed with installSnapshot. - boolean canProceed = OzoneManagerRatisUtils.verifyTransactionInfo( - omTransactionInfo, lastAppliedIndex, leaderId, newDBLocation); - if (!canProceed) { - return null; - } - - try { - dbBackup = replaceOMDBWithCheckpoint(lastAppliedIndex, oldDBLocation, - newDBLocation); - } catch (Exception e) { - LOG.error("OM DB checkpoint replacement with new downloaded " + - "checkpoint failed.", e); - return null; - } - - loadSuccess = true; - } finally { - if (!loadSuccess) { - omRatisServer.getOmStateMachine().unpause(lastAppliedIndex, - currentTerm); - } + // Check if current applied log index is smaller than the downloaded + // checkpoint transaction index. If yes, proceed by stopping the ratis + // server so that the OM state can be re-initialized. If no then do not + // proceed with installSnapshot. + boolean canProceed = OzoneManagerRatisUtils.verifyTransactionInfo( + omTransactionInfo, lastAppliedIndex, leaderId, newDBLocation); + if (!canProceed) { + return null; } - long leaderIndex = omTransactionInfo.getTransactionIndex(); - long leaderTerm = omTransactionInfo.getCurrentTerm(); + try { + dbBackup = replaceOMDBWithCheckpoint(lastAppliedIndex, oldDBLocation, + newDBLocation); + term = omTransactionInfo.getCurrentTerm(); + lastAppliedIndex = omTransactionInfo.getTransactionIndex(); + LOG.info("Replaced DB with checkpoint from OM: {}, term: {}, index: {}", + leaderId, term, lastAppliedIndex); + } catch (Exception e) { + LOG.error("Failed to install Snapshot from {} as OM failed to " + + "replace DB with downloaded checkpoint. Reloading old OM state.", e); + } // Reload the OM DB store with the new checkpoint. // Restart (unpause) the state machine and update its last applied index // to the installed checkpoint's snapshot index. try { - reloadOMState(leaderIndex, leaderTerm); - omRatisServer.getOmStateMachine().unpause(leaderIndex, leaderTerm); + reloadOMState(lastAppliedIndex, term); + omRatisServer.getOmStateMachine().unpause(lastAppliedIndex, term); + LOG.info("Reloaded OM state with Term: {} and Index: {}", term, + lastAppliedIndex); } catch (IOException ex) { - String errorMsg = "Failed to reload OM state and instantiate services " + - "after installing Snapshot from Leader."; + String errorMsg = "Failed to reload OM state and instantiate services."; ExitUtils.terminate(1, errorMsg, ex, LOG); } // Delete the backup DB try { - FileUtils.deleteFully(dbBackup); + if (dbBackup != null) { + FileUtils.deleteFully(dbBackup); + } } catch (IOException e) { LOG.error("Failed to delete the backup of the original DB {}", dbBackup); } + if (lastAppliedIndex != omTransactionInfo.getTransactionIndex()) { + // Install Snapshot failed and old state was reloaded. Return null to + // Ratis to indicate that installation failed. + return null; + } + // TODO: We should only return the snpashotIndex to the leader. // Should be fixed after RATIS-586 - TermIndex newTermIndex = TermIndex.newTermIndex(leaderTerm, leaderIndex); + TermIndex newTermIndex = TermIndex.newTermIndex(term, lastAppliedIndex); return newTermIndex; } @@ -3203,7 +3204,7 @@ void stopServices() throws Exception { metadataManager.stop(); } - private OMTransactionInfo getTransactionInfoFromDB(Path dbPath) { + private OMTransactionInfo getTrxnInfoFromCheckpoint(Path dbPath) { Path dbDir = dbPath.getParent(); if (dbDir == null) { LOG.error("Incorrect DB location path {} received from checkpoint.", @@ -3212,11 +3213,11 @@ private OMTransactionInfo getTransactionInfoFromDB(Path dbPath) { } try { - return OzoneManagerRatisUtils.getTransactionInfoFromDownloadedSnapshot( + return OzoneManagerRatisUtils.getTransactionInfoFromDB( configuration, dbDir); } catch (Exception ex) { - LOG.error("Install Snapshot failed during opening downloaded snapshot " + - "from {} to obtain transaction index", dbPath, ex); + LOG.error("Failed to open downloaded checkpoint {} and read transaction" + + " info", dbPath, ex); return null; } } @@ -3226,16 +3227,17 @@ private OMTransactionInfo getTransactionInfoFromDB(Path dbPath) { * * @param lastAppliedIndex the last applied index in the current OM DB. * @param checkpointPath path to the new DB checkpoint - * @return location of the backup of the original DB + * @return location of backup of the original DB * @throws Exception */ File replaceOMDBWithCheckpoint(long lastAppliedIndex, File oldDB, - Path checkpointPath) throws Exception { + Path checkpointPath) throws IOException { // Take a backup of the current DB String dbBackupName = OzoneConsts.OM_DB_BACKUP_PREFIX + lastAppliedIndex + "_" + System.currentTimeMillis(); - File dbBackup = new File(oldDB.getParentFile(), dbBackupName); + File dbDir = oldDB.getParentFile(); + File dbBackup = new File(dbDir, dbBackupName); try { Files.move(oldDB.toPath(), dbBackup.toPath()); @@ -3246,13 +3248,28 @@ File replaceOMDBWithCheckpoint(long lastAppliedIndex, File oldDB, } // Move the new DB checkpoint into the om metadata dir + Path markerFile = new File(dbDir, DB_TRANSIENT_MARKER).toPath(); try { + // Create a Transient Marker file. This file will be deleted if the + // checkpoint DB is successfully moved to the old DB location or if the + // old DB backup is reset to its location. If not, then the OM DB is in + // an inconsistent state and this marker file will fail OM from + // starting up. + Files.createFile(markerFile); Files.move(checkpointPath, oldDB.toPath()); + Files.deleteIfExists(markerFile); } catch (IOException e) { LOG.error("Failed to move downloaded DB checkpoint {} to metadata " + "directory {}. Resetting to original DB.", checkpointPath, oldDB.toPath()); - Files.move(dbBackup.toPath(), oldDB.toPath()); + try { + Files.move(dbBackup.toPath(), oldDB.toPath()); + Files.deleteIfExists(markerFile); + } catch (IOException ex) { + String errorMsg = "Failed to reset to original DB. OM is in an " + + "inconsistent state."; + ExitUtils.terminate(1, errorMsg, ex, LOG); + } throw e; } return dbBackup; diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/utils/OzoneManagerRatisUtils.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/utils/OzoneManagerRatisUtils.java index 4aaaf13f6a4d..531b2e295089 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/utils/OzoneManagerRatisUtils.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/utils/OzoneManagerRatisUtils.java @@ -228,12 +228,13 @@ public static Status exceptionToResponseStatus(IOException exception) { } /** - * Obtain Transaction info from downloaded snapshot DB. + * Obtain Transaction info from DB. * @param tempConfig + * @param dbDir path to DB * @return OMTransactionInfo * @throws Exception */ - public static OMTransactionInfo getTransactionInfoFromDownloadedSnapshot( + public static OMTransactionInfo getTransactionInfoFromDB( OzoneConfiguration tempConfig, Path dbDir) throws Exception { DBStore dbStore = OmMetadataManagerImpl.loadDB(tempConfig, dbDir.toFile()); From fd1a90822b44032b450f194150505db337517fe9 Mon Sep 17 00:00:00 2001 From: Hanisha Koneru Date: Fri, 10 Jul 2020 11:46:36 -0700 Subject: [PATCH 3/9] fix can't proceed case --- .../apache/hadoop/ozone/om/OzoneManager.java | 31 +++++++++++-------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java index 2c981985c52b..37b099971f18 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java @@ -3129,20 +3129,25 @@ public TermIndex installSnapshot(String leaderId) { // proceed with installSnapshot. boolean canProceed = OzoneManagerRatisUtils.verifyTransactionInfo( omTransactionInfo, lastAppliedIndex, leaderId, newDBLocation); - if (!canProceed) { - return null; - } - try { - dbBackup = replaceOMDBWithCheckpoint(lastAppliedIndex, oldDBLocation, - newDBLocation); - term = omTransactionInfo.getCurrentTerm(); - lastAppliedIndex = omTransactionInfo.getTransactionIndex(); - LOG.info("Replaced DB with checkpoint from OM: {}, term: {}, index: {}", - leaderId, term, lastAppliedIndex); - } catch (Exception e) { - LOG.error("Failed to install Snapshot from {} as OM failed to " + - "replace DB with downloaded checkpoint. Reloading old OM state.", e); + if (canProceed) { + try { + dbBackup = replaceOMDBWithCheckpoint(lastAppliedIndex, oldDBLocation, + newDBLocation); + term = omTransactionInfo.getCurrentTerm(); + lastAppliedIndex = omTransactionInfo.getTransactionIndex(); + LOG.info("Replaced DB with checkpoint from OM: {}, term: {}, index: {}", + leaderId, term, lastAppliedIndex); + } catch (Exception e) { + LOG.error("Failed to install Snapshot from {} as OM failed to replace" + + " DB with downloaded checkpoint. Reloading old OM state.", e); + } + } else { + LOG.warn("Cannot proceed with InstallSnapshot as OM is at termIndex: " + + "({},{}) and checkpoint has lower termIndex: ({},{}). Reloading old" + + " state of OM.", term, lastAppliedIndex, + omTransactionInfo.getCurrentTerm(), + omTransactionInfo.getTransactionIndex()); } // Reload the OM DB store with the new checkpoint. From a0ff810022f815591f7713ac3be81a92f0b95785 Mon Sep 17 00:00:00 2001 From: Hanisha Koneru Date: Fri, 10 Jul 2020 14:47:39 -0700 Subject: [PATCH 4/9] Refactor code to add unit test --- .../hadoop/ozone/om/TestOMRatisSnapshots.java | 147 ++++++++++++------ .../ozone/om/OmMetadataManagerImpl.java | 8 +- .../apache/hadoop/ozone/om/OzoneManager.java | 74 +++++---- .../ozone/om/ratis/OMTransactionInfo.java | 21 ++- .../om/ratis/OzoneManagerStateMachine.java | 6 +- .../ratis/utils/OzoneManagerRatisUtils.java | 7 +- .../OzoneManagerSnapshotProvider.java | 10 +- .../ozone/om/TestOmMetadataManager.java | 2 +- ...eManagerDoubleBufferWithDummyResponse.java | 2 +- ...zoneManagerDoubleBufferWithOMResponse.java | 2 +- 10 files changed, 172 insertions(+), 107 deletions(-) diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOMRatisSnapshots.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOMRatisSnapshots.java index 0cfbea4ef9c2..dd66594078ad 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOMRatisSnapshots.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOMRatisSnapshots.java @@ -16,7 +16,6 @@ */ package org.apache.hadoop.ozone.om; -import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.List; @@ -34,8 +33,11 @@ import org.apache.hadoop.ozone.om.ratis.OMTransactionInfo; import org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer; -import org.apache.commons.lang3.RandomStringUtils; import static org.apache.hadoop.ozone.om.TestOzoneManagerHAWithData.createKey; +import static org.junit.Assert.assertTrue; + +import org.apache.commons.lang3.RandomStringUtils; +import org.apache.hadoop.test.GenericTestUtils; import org.apache.ratis.server.protocol.TermIndex; import org.junit.After; import org.junit.Assert; @@ -45,6 +47,7 @@ import org.junit.Test; import org.junit.rules.ExpectedException; import org.junit.rules.Timeout; +import org.slf4j.event.Level; /** * Tests the Ratis snaphsots feature in OM. @@ -59,6 +62,10 @@ public class TestOMRatisSnapshots { private String scmId; private String omServiceId; private int numOfOMs = 3; + private OzoneBucket ozoneBucket; + private String volumeName; + private String bucketName; + private static final long SNAPSHOT_THRESHOLD = 50; private static final int LOG_PURGE_GAP = 50; @@ -95,6 +102,20 @@ public void init() throws Exception { cluster.waitForClusterToBeReady(); objectStore = OzoneClientFactory.getRpcClient(omServiceId, conf) .getObjectStore(); + + volumeName = "volume" + RandomStringUtils.randomNumeric(5); + bucketName = "bucket" + RandomStringUtils.randomNumeric(5); + + VolumeArgs createVolumeArgs = VolumeArgs.newBuilder() + .setOwner("user" + RandomStringUtils.randomNumeric(5)) + .setAdmin("admin" + RandomStringUtils.randomNumeric(5)) + .build(); + + objectStore.createVolume(volumeName, createVolumeArgs); + OzoneVolume retVolumeinfo = objectStore.getVolume(volumeName); + + retVolumeinfo.createBucket(bucketName); + ozoneBucket = retVolumeinfo.getBucket(bucketName); } /** @@ -125,37 +146,13 @@ public void testInstallSnapshot() throws Exception { OzoneManager followerOM = cluster.getOzoneManager(followerNodeId); // Do some transactions so that the log index increases - String userName = "user" + RandomStringUtils.randomNumeric(5); - String adminName = "admin" + RandomStringUtils.randomNumeric(5); - String volumeName = "volume" + RandomStringUtils.randomNumeric(5); - String bucketName = "bucket" + RandomStringUtils.randomNumeric(5); - - VolumeArgs createVolumeArgs = VolumeArgs.newBuilder() - .setOwner(userName) - .setAdmin(adminName) - .build(); - - objectStore.createVolume(volumeName, createVolumeArgs); - OzoneVolume retVolumeinfo = objectStore.getVolume(volumeName); - - retVolumeinfo.createBucket(bucketName); - OzoneBucket ozoneBucket = retVolumeinfo.getBucket(bucketName); - - long leaderOMappliedLogIndex = - leaderRatisServer.getLastAppliedTermIndex().getIndex(); - - List keys = new ArrayList<>(); - while (leaderOMappliedLogIndex < 2000) { - keys.add(createKey(ozoneBucket)); - leaderOMappliedLogIndex = - leaderRatisServer.getLastAppliedTermIndex().getIndex(); - } + List keys = writeKeysToIncreaseLogIndex(leaderRatisServer, 200); // Get the latest db checkpoint from the leader OM. OMTransactionInfo omTransactionInfo = OMTransactionInfo.readTransactionInfo(leaderOM.getMetadataManager()); TermIndex leaderOMTermIndex = - TermIndex.newTermIndex(omTransactionInfo.getCurrentTerm(), + TermIndex.newTermIndex(omTransactionInfo.getTerm(), omTransactionInfo.getTransactionIndex()); long leaderOMSnaphsotIndex = leaderOMTermIndex.getIndex(); long leaderOMSnapshotTermIndex = leaderOMTermIndex.getTerm(); @@ -169,30 +166,20 @@ public void testInstallSnapshot() throws Exception { // The recently started OM should be lagging behind the leader OM. long followerOMLastAppliedIndex = followerOM.getOmRatisServer().getLastAppliedTermIndex().getIndex(); - Assert.assertTrue( + assertTrue( followerOMLastAppliedIndex < leaderOMSnaphsotIndex); // Install leader OM's db checkpoint on the lagging OM. - File oldDbLocation = followerOM.getMetadataManager().getStore() - .getDbLocation(); - followerOM.getOmRatisServer().getOmStateMachine().pause(); - followerOM.getMetadataManager().getStore().close(); - followerOM.replaceOMDBWithCheckpoint(leaderOMSnaphsotIndex, oldDbLocation, - leaderDbCheckpoint.getCheckpointLocation()); - - // Reload the follower OM with new DB checkpoint from the leader OM. - followerOM.reloadOMState(leaderOMSnaphsotIndex, leaderOMSnapshotTermIndex); - followerOM.getOmRatisServer().getOmStateMachine().unpause( - leaderOMSnaphsotIndex, leaderOMSnapshotTermIndex); - - // After the new checkpoint is loaded and state machine is unpaused, the - // follower OM lastAppliedIndex must match the snapshot index of the - // checkpoint. + followerOM.installCheckpoint(leaderOMNodeId, leaderDbCheckpoint); + + // After the new checkpoint is installed, the follower OM + // lastAppliedIndex must >= the snapshot index of the checkpoint. It + // could be great than snapshot index if there is any conf entry from ratis. followerOMLastAppliedIndex = followerOM.getOmRatisServer() .getLastAppliedTermIndex().getIndex(); - Assert.assertEquals(leaderOMSnaphsotIndex, followerOMLastAppliedIndex); - Assert.assertEquals(leaderOMSnapshotTermIndex, - followerOM.getOmRatisServer().getLastAppliedTermIndex().getTerm()); + assertTrue(followerOMLastAppliedIndex >= leaderOMSnaphsotIndex); + assertTrue(followerOM.getOmRatisServer().getLastAppliedTermIndex() + .getTerm() >= leaderOMSnapshotTermIndex); // Verify that the follower OM's DB contains the transactions which were // made while it was inactive. @@ -206,4 +193,70 @@ public void testInstallSnapshot() throws Exception { followerOMMetaMngr.getOzoneKey(volumeName, bucketName, key))); } } + + @Test + public void testInstallOldCheckpointFailure() throws Exception { + // Get the leader OM + String leaderOMNodeId = OmFailoverProxyUtil + .getFailoverProxyProvider(objectStore.getClientProxy()) + .getCurrentProxyOMNodeId(); + + OzoneManager leaderOM = cluster.getOzoneManager(leaderOMNodeId); + + // Find the inactive OM and start it + String followerNodeId = leaderOM.getPeerNodes().get(0).getOMNodeId(); + if (cluster.isOMActive(followerNodeId)) { + followerNodeId = leaderOM.getPeerNodes().get(1).getOMNodeId(); + } + cluster.startInactiveOM(followerNodeId); + + OzoneManager followerOM = cluster.getOzoneManager(followerNodeId); + OzoneManagerRatisServer followerRatisServer = followerOM.getOmRatisServer(); + + // Do some transactions so that the log index increases on follower OM + writeKeysToIncreaseLogIndex(followerRatisServer, 100); + + TermIndex leaderCheckpointTermIndex = leaderOM.getOmRatisServer() + .getLastAppliedTermIndex(); + DBCheckpoint leaderDbCheckpoint = leaderOM.getMetadataManager().getStore() + .getCheckpoint(false); + + // Do some more transactions to increase the log index further on + // follower OM such that it is more than the checkpoint index taken on + // leader OM. + writeKeysToIncreaseLogIndex(followerOM.getOmRatisServer(), + leaderCheckpointTermIndex.getIndex() + 100); + + GenericTestUtils.setLogLevel(OzoneManager.LOG, Level.INFO); + GenericTestUtils.LogCapturer logCapture = + GenericTestUtils.LogCapturer.captureLogs(OzoneManager.LOG); + + // Install the old checkpoint on the follower OM. This should fail as the + // followerOM is already ahead of that transactionLogIndex and the OM + // state should be reloaded. + TermIndex followerTermIndex = followerRatisServer.getLastAppliedTermIndex(); + TermIndex newTermIndex = followerOM.installCheckpoint( + leaderOMNodeId, leaderDbCheckpoint); + + String errorMsg = "Cannot proceed with InstallSnapshot as OM is at " + + "TermIndex " + followerTermIndex + " and checkpoint has lower " + + "TermIndex"; + Assert.assertTrue(logCapture.getOutput().contains(errorMsg)); + Assert.assertNull("OM installed checkpoint even though checkpoint " + + "logIndex is less than it's lastAppliedIndex", newTermIndex); + Assert.assertEquals(followerTermIndex, followerRatisServer.getLastAppliedTermIndex()); + } + + private List writeKeysToIncreaseLogIndex( + OzoneManagerRatisServer omRatisServer, long targetLogIndex) + throws IOException, InterruptedException { + List keys = new ArrayList<>(); + long logIndex = omRatisServer.getLastAppliedTermIndex().getIndex(); + while (logIndex < targetLogIndex) { + keys.add(createKey(ozoneBucket)); + Thread.sleep(100); + logIndex = omRatisServer.getLastAppliedTermIndex().getIndex(); + } + return keys; + } } diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmMetadataManagerImpl.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmMetadataManagerImpl.java index a6cd17487201..e2c47d075c2c 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmMetadataManagerImpl.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmMetadataManagerImpl.java @@ -290,10 +290,16 @@ public void start(OzoneConfiguration configuration) throws IOException { public static DBStore loadDB(OzoneConfiguration configuration, File metaDir) throws IOException { + return loadDB(configuration, metaDir, OM_DB_NAME); + } + + public static DBStore loadDB(OzoneConfiguration configuration, File metaDir, + String dbName) + throws IOException { RocksDBConfiguration rocksDBConfiguration = configuration.getObject(RocksDBConfiguration.class); DBStoreBuilder dbStoreBuilder = DBStoreBuilder.newBuilder(configuration, - rocksDBConfiguration).setName(OM_DB_NAME) + rocksDBConfiguration).setName(dbName) .setPath(Paths.get(metaDir.getPath())); DBStore dbStore = addOMTablesAndCodecs(dbStoreBuilder).build(); return dbStore; diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java index 37b099971f18..e3a7bf0d743e 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java @@ -3068,36 +3068,44 @@ public List getAcl(OzoneObj obj) throws IOException { /** * Download and install latest checkpoint from leader OM. - * If the download checkpoints snapshot index is greater than this OM's - * last applied transaction index, then re-initialize the OM state via this - * checkpoint. Before re-initializing OM state, the OM Ratis server should - * be stopped so that no new transactions can be applied. * * @param leaderId peerNodeID of the leader OM - * @return If checkpoint is installed, return the corresponding termIndex. - * Otherwise, return null. + * @return If checkpoint is installed successfully, return the + * corresponding termIndex. Otherwise, return null. */ - public TermIndex installSnapshot(String leaderId) { + public TermIndex installSnapshotFromLeader(String leaderId) { if (omSnapshotProvider == null) { LOG.error("OM Snapshot Provider is not configured as there are no peer " + "nodes."); return null; } - DBCheckpoint omDBcheckpoint = getDBCheckpointFromLeader(leaderId); - Path newDBLocation = omDBcheckpoint.getCheckpointLocation(); + DBCheckpoint omDBCheckpoint = getDBCheckpointFromLeader(leaderId); + LOG.info("Downloaded checkpoint from Leader {} to the location {}", + leaderId, omDBCheckpoint.getCheckpointLocation()); - LOG.info("Downloaded checkpoint from Leader {}, in to the location {}", - leaderId, newDBLocation); + TermIndex termIndex = null; + try { + termIndex = installCheckpoint(leaderId, omDBCheckpoint); + } catch (Exception ex) { + LOG.error("Failed to install snapshot from Leader OM: {}", ex); + } + return termIndex; + } + + /** + * Install checkpoint. If the checkpoints snapshot index is greater than + * OM's last applied transaction index, then re-initialize the OM + * state via this checkpoint. Before re-initializing OM state, the OM Ratis + * server should be stopped so that no new transactions can be applied. + */ + public TermIndex installCheckpoint(String leaderId, + DBCheckpoint omDBCheckpoint) throws Exception { + Path newDBLocation = omDBCheckpoint.getCheckpointLocation(); OMTransactionInfo omTransactionInfo = getTrxnInfoFromCheckpoint( newDBLocation); - if (omTransactionInfo == null) { - LOG.error("Failed to install snapshot from {}."); - return null; - } - File oldDBLocation = metadataManager.getStore().getDbLocation(); try { // Stop Background services @@ -3109,13 +3117,11 @@ public TermIndex installSnapshot(String leaderId) { omRatisServer.getOmStateMachine().pause(); } catch (Exception e) { LOG.error("Failed to stop/ pause the services. Cannot proceed with " + - "installing the new checkpoint.", e); - + "installing the new checkpoint."); // During stopServices, if KeyManager was stopped successfully and // OMMetadataManager stop failed, we should restart the KeyManager. keyManager.start(configuration); - - return null; + throw e; } File dbBackup = null; @@ -3134,7 +3140,7 @@ public TermIndex installSnapshot(String leaderId) { try { dbBackup = replaceOMDBWithCheckpoint(lastAppliedIndex, oldDBLocation, newDBLocation); - term = omTransactionInfo.getCurrentTerm(); + term = omTransactionInfo.getTerm(); lastAppliedIndex = omTransactionInfo.getTransactionIndex(); LOG.info("Replaced DB with checkpoint from OM: {}, term: {}, index: {}", leaderId, term, lastAppliedIndex); @@ -3143,11 +3149,9 @@ public TermIndex installSnapshot(String leaderId) { " DB with downloaded checkpoint. Reloading old OM state.", e); } } else { - LOG.warn("Cannot proceed with InstallSnapshot as OM is at termIndex: " + - "({},{}) and checkpoint has lower termIndex: ({},{}). Reloading old" + - " state of OM.", term, lastAppliedIndex, - omTransactionInfo.getCurrentTerm(), - omTransactionInfo.getTransactionIndex()); + LOG.warn("Cannot proceed with InstallSnapshot as OM is at TermIndex {} " + + "and checkpoint has lower TermIndex {}. Reloading old state of OM.", + termIndex, omTransactionInfo.getTermIndex()); } // Reload the OM DB store with the new checkpoint. @@ -3209,22 +3213,16 @@ void stopServices() throws Exception { metadataManager.stop(); } - private OMTransactionInfo getTrxnInfoFromCheckpoint(Path dbPath) { + private OMTransactionInfo getTrxnInfoFromCheckpoint(Path dbPath) + throws Exception { Path dbDir = dbPath.getParent(); + String dbName = dbPath.getFileName().toString(); if (dbDir == null) { - LOG.error("Incorrect DB location path {} received from checkpoint.", - dbPath); - return null; + throw new IOException("Checkpoint {} does not have proper DB location"); } - try { - return OzoneManagerRatisUtils.getTransactionInfoFromDB( - configuration, dbDir); - } catch (Exception ex) { - LOG.error("Failed to open downloaded checkpoint {} and read transaction" + - " info", dbPath, ex); - return null; - } + return OzoneManagerRatisUtils.getTransactionInfoFromDB(configuration, + dbDir, dbName); } /** diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OMTransactionInfo.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OMTransactionInfo.java index 24417515ef13..28c8c3a91f27 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OMTransactionInfo.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OMTransactionInfo.java @@ -24,6 +24,7 @@ import java.io.IOException; import java.util.Objects; +import org.apache.ratis.server.protocol.TermIndex; import static org.apache.hadoop.ozone.OzoneConsts.TRANSACTION_INFO_KEY; import static org.apache.hadoop.ozone.OzoneConsts.TRANSACTION_INFO_SPLIT_KEY; @@ -33,7 +34,7 @@ */ public final class OMTransactionInfo { - private long currentTerm; // term associated with the ratis log index. + private long term; // term associated with the ratis log index. // Transaction index corresponds to ratis log index private long transactionIndex; @@ -43,12 +44,12 @@ private OMTransactionInfo(String transactionInfo) { Preconditions.checkState(tInfo.length==2, "Incorrect TransactionInfo value"); - currentTerm = Long.parseLong(tInfo[0]); + term = Long.parseLong(tInfo[0]); transactionIndex = Long.parseLong(tInfo[1]); } private OMTransactionInfo(long currentTerm, long transactionIndex) { - this.currentTerm = currentTerm; + this.term = currentTerm; this.transactionIndex = transactionIndex; } @@ -56,8 +57,8 @@ private OMTransactionInfo(long currentTerm, long transactionIndex) { * Get current term. * @return currentTerm */ - public long getCurrentTerm() { - return currentTerm; + public long getTerm() { + return term; } /** @@ -68,6 +69,10 @@ public long getTransactionIndex() { return transactionIndex; } + public TermIndex getTermIndex() { + return TermIndex.newTermIndex(term, transactionIndex); + } + /** * Generate String form of transaction info which need to be persisted in OM * DB finally in byte array. @@ -75,7 +80,7 @@ public long getTransactionIndex() { */ private String generateTransactionInfo() { StringBuilder stringBuilder = new StringBuilder(); - stringBuilder.append(currentTerm); + stringBuilder.append(term); stringBuilder.append(TRANSACTION_INFO_SPLIT_KEY); stringBuilder.append(transactionIndex); @@ -109,13 +114,13 @@ public boolean equals(Object o) { return false; } OMTransactionInfo that = (OMTransactionInfo) o; - return currentTerm == that.currentTerm && + return term == that.term && transactionIndex == that.transactionIndex; } @Override public int hashCode() { - return Objects.hash(currentTerm, transactionIndex); + return Objects.hash(term, transactionIndex); } /** diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerStateMachine.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerStateMachine.java index c042fcb7eedd..3f7429ab7dd0 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerStateMachine.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerStateMachine.java @@ -379,7 +379,7 @@ public CompletableFuture notifyInstallSnapshotFromLeader( } CompletableFuture future = CompletableFuture.supplyAsync( - () -> ozoneManager.installSnapshot(leaderNodeId), + () -> ozoneManager.installSnapshotFromLeader(leaderNodeId), installSnapshotExecutor); return future; } @@ -521,9 +521,9 @@ public void loadSnapshotInfoFromDB() throws IOException { ozoneManager.getMetadataManager()); if (omTransactionInfo != null) { setLastAppliedTermIndex(TermIndex.newTermIndex( - omTransactionInfo.getCurrentTerm(), + omTransactionInfo.getTerm(), omTransactionInfo.getTransactionIndex())); - snapshotInfo.updateTermIndex(omTransactionInfo.getCurrentTerm(), + snapshotInfo.updateTermIndex(omTransactionInfo.getTerm(), omTransactionInfo.getTransactionIndex()); } LOG.info("LastAppliedIndex is set from TransactionInfo from OM DB as {}", diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/utils/OzoneManagerRatisUtils.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/utils/OzoneManagerRatisUtils.java index 531b2e295089..93c12f2bce62 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/utils/OzoneManagerRatisUtils.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/utils/OzoneManagerRatisUtils.java @@ -235,9 +235,10 @@ public static Status exceptionToResponseStatus(IOException exception) { * @throws Exception */ public static OMTransactionInfo getTransactionInfoFromDB( - OzoneConfiguration tempConfig, Path dbDir) throws Exception { - DBStore dbStore = - OmMetadataManagerImpl.loadDB(tempConfig, dbDir.toFile()); + OzoneConfiguration tempConfig, Path dbDir, String dbName) + throws Exception { + DBStore dbStore = OmMetadataManagerImpl.loadDB(tempConfig, dbDir.toFile(), + dbName); Table transactionInfoTable = dbStore.getTable(TRANSACTION_INFO_TABLE, diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/OzoneManagerSnapshotProvider.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/OzoneManagerSnapshotProvider.java index 1c78251abb92..a11c60b9435d 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/OzoneManagerSnapshotProvider.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/OzoneManagerSnapshotProvider.java @@ -113,8 +113,10 @@ public OzoneManagerSnapshotProvider(MutableConfigurationSource conf, public DBCheckpoint getOzoneManagerDBSnapshot(String leaderOMNodeID) throws IOException { String snapshotTime = Long.toString(System.currentTimeMillis()); - String snapshotFileName = Paths.get(omSnapshotDir.getAbsolutePath(), - snapshotTime, OM_DB_NAME).toFile().getAbsolutePath(); + String snapshotFileName = OM_DB_NAME + "-" + leaderOMNodeID + + "-" + snapshotTime; + String snapshotFilePath = Paths.get(omSnapshotDir.getAbsolutePath(), + snapshotFileName).toFile().getAbsolutePath(); File targetFile = new File(snapshotFileName + ".tar.gz"); String omCheckpointUrl = peerNodesMap.get(leaderOMNodeID) @@ -141,11 +143,11 @@ public DBCheckpoint getOzoneManagerDBSnapshot(String leaderOMNodeID) }); // Untar the checkpoint file. - Path untarredDbDir = Paths.get(snapshotFileName); + Path untarredDbDir = Paths.get(snapshotFilePath); FileUtil.unTar(targetFile, untarredDbDir.toFile()); FileUtils.deleteQuietly(targetFile); - LOG.info("Sucessfully downloaded latest checkpoint from leader OM: {}", + LOG.info("Successfully downloaded latest checkpoint from leader OM: {}", leaderOMNodeID); RocksDBCheckpoint omCheckpoint = new RocksDBCheckpoint(untarredDbDir); diff --git a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/TestOmMetadataManager.java b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/TestOmMetadataManager.java index 054c97f396c7..6226c5bbc9f1 100644 --- a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/TestOmMetadataManager.java +++ b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/TestOmMetadataManager.java @@ -76,7 +76,7 @@ public void testTransactionTable() throws Exception { OMTransactionInfo omTransactionInfo = omMetadataManager.getTransactionInfoTable().get(TRANSACTION_INFO_KEY); - Assert.assertEquals(3, omTransactionInfo.getCurrentTerm()); + Assert.assertEquals(3, omTransactionInfo.getTerm()); Assert.assertEquals(250, omTransactionInfo.getTransactionIndex()); diff --git a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/ratis/TestOzoneManagerDoubleBufferWithDummyResponse.java b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/ratis/TestOzoneManagerDoubleBufferWithDummyResponse.java index 7b86006b9379..372679b2b3eb 100644 --- a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/ratis/TestOzoneManagerDoubleBufferWithDummyResponse.java +++ b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/ratis/TestOzoneManagerDoubleBufferWithDummyResponse.java @@ -139,7 +139,7 @@ public void testDoubleBufferWithDummyResponse() throws Exception { Assert.assertEquals(lastAppliedIndex, omTransactionInfo.getTransactionIndex()); - Assert.assertEquals(term, omTransactionInfo.getCurrentTerm()); + Assert.assertEquals(term, omTransactionInfo.getTerm()); } /** diff --git a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/ratis/TestOzoneManagerDoubleBufferWithOMResponse.java b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/ratis/TestOzoneManagerDoubleBufferWithOMResponse.java index b3693415b183..260e2cd17c10 100644 --- a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/ratis/TestOzoneManagerDoubleBufferWithOMResponse.java +++ b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/ratis/TestOzoneManagerDoubleBufferWithOMResponse.java @@ -202,7 +202,7 @@ public void testDoubleBufferWithMixOfTransactions() throws Exception { Assert.assertEquals(lastAppliedIndex, omTransactionInfo.getTransactionIndex()); - Assert.assertEquals(term, omTransactionInfo.getCurrentTerm()); + Assert.assertEquals(term, omTransactionInfo.getTerm()); } /** From a4c9bf67ab22aa6fb92c34eecfe33cdf85606fa6 Mon Sep 17 00:00:00 2001 From: Hanisha Koneru Date: Fri, 10 Jul 2020 15:43:03 -0700 Subject: [PATCH 5/9] unit test for testing system exit on installing corrupted DB --- .../apache/hadoop/ozone/util/ExitManager.java | 16 +++++ .../hadoop/ozone/om/TestOMRatisSnapshots.java | 66 +++++++++++++++++++ .../ozone/om/OmMetadataManagerImpl.java | 3 +- .../apache/hadoop/ozone/om/OzoneManager.java | 36 ++++++---- 4 files changed, 108 insertions(+), 13 deletions(-) create mode 100644 hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/util/ExitManager.java diff --git a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/util/ExitManager.java b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/util/ExitManager.java new file mode 100644 index 000000000000..7f42224a295c --- /dev/null +++ b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/util/ExitManager.java @@ -0,0 +1,16 @@ +package org.apache.hadoop.ozone.util; + +import org.apache.ratis.util.ExitUtils; +import org.slf4j.Logger; + +/** + * An Exit Manager used to shutdown service in case of unrecoverable error. + * This class will be helpful to test exit functionality. + */ +public class ExitManager { + + public void exitSystem(int status, String message, Throwable throwable, + Logger log) { + ExitUtils.terminate(1, message, throwable, log); + } +} diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOMRatisSnapshots.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOMRatisSnapshots.java index dd66594078ad..079e4142111f 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOMRatisSnapshots.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOMRatisSnapshots.java @@ -16,7 +16,9 @@ */ package org.apache.hadoop.ozone.om; +import java.io.File; import java.io.IOException; +import java.nio.file.Path; import java.util.ArrayList; import java.util.List; import java.util.UUID; @@ -37,6 +39,7 @@ import static org.junit.Assert.assertTrue; import org.apache.commons.lang3.RandomStringUtils; +import org.apache.hadoop.ozone.util.ExitManager; import org.apache.hadoop.test.GenericTestUtils; import org.apache.ratis.server.protocol.TermIndex; import org.junit.After; @@ -47,6 +50,7 @@ import org.junit.Test; import org.junit.rules.ExpectedException; import org.junit.rules.Timeout; +import org.slf4j.Logger; import org.slf4j.event.Level; /** @@ -247,6 +251,60 @@ public void testInstallOldCheckpointFailure() throws Exception { Assert.assertEquals(followerTermIndex, followerRatisServer.getLastAppliedTermIndex()); } + @Test + public void testInstallCorruptedCheckpointFailure() throws Exception { + // Get the leader OM + String leaderOMNodeId = OmFailoverProxyUtil + .getFailoverProxyProvider(objectStore.getClientProxy()) + .getCurrentProxyOMNodeId(); + + OzoneManager leaderOM = cluster.getOzoneManager(leaderOMNodeId); + OzoneManagerRatisServer leaderRatisServer = leaderOM.getOmRatisServer(); + + // Find the inactive OM + String followerNodeId = leaderOM.getPeerNodes().get(0).getOMNodeId(); + if (cluster.isOMActive(followerNodeId)) { + followerNodeId = leaderOM.getPeerNodes().get(1).getOMNodeId(); + } + OzoneManager followerOM = cluster.getOzoneManager(followerNodeId); + OzoneManagerRatisServer followerRatisServer = followerOM.getOmRatisServer(); + + // Do some transactions so that the log index increases + writeKeysToIncreaseLogIndex(leaderRatisServer, 100); + + DBCheckpoint leaderDbCheckpoint = leaderOM.getMetadataManager().getStore() + .getCheckpoint(false); + Path leaderCheckpointLocation = leaderDbCheckpoint.getCheckpointLocation(); + OMTransactionInfo leaderCheckpointTrxnInfo = leaderOM + .getTrxnInfoFromCheckpoint(leaderCheckpointLocation); + + // Corrupt the leader checkpoint and install that on the OM. The + // operation should fail and OM should shutdown. + boolean delete = true; + for (File file : leaderCheckpointLocation.toFile() + .listFiles()) { + if (file.getName().contains(".sst")) { + if (delete) { + file.delete(); + delete = false; + } else { + delete = true; + } + } + } + + GenericTestUtils.setLogLevel(OzoneManager.LOG, Level.ERROR); + GenericTestUtils.LogCapturer logCapture = + GenericTestUtils.LogCapturer.captureLogs(OzoneManager.LOG); + followerOM.setExitManagerForTesting(new DummyExitManager()); + + followerOM.installCheckpoint(leaderOMNodeId, leaderCheckpointLocation, + leaderCheckpointTrxnInfo); + + Assert.assertTrue(logCapture.getOutput().contains("System Exit: " + + "Failed to reload OM state and instantiate services.")); + } + private List writeKeysToIncreaseLogIndex( OzoneManagerRatisServer omRatisServer, long targetLogIndex) throws IOException, InterruptedException { @@ -259,4 +317,12 @@ private List writeKeysToIncreaseLogIndex( } return keys; } + + private class DummyExitManager extends ExitManager { + @Override + public void exitSystem(int status, String message, Throwable throwable, + Logger log) { + log.error("System Exit: " + message, throwable); + } + } } diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmMetadataManagerImpl.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmMetadataManagerImpl.java index e2c47d075c2c..800a565a9843 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmMetadataManagerImpl.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmMetadataManagerImpl.java @@ -294,8 +294,7 @@ public static DBStore loadDB(OzoneConfiguration configuration, File metaDir) } public static DBStore loadDB(OzoneConfiguration configuration, File metaDir, - String dbName) - throws IOException { + String dbName) throws IOException { RocksDBConfiguration rocksDBConfiguration = configuration.getObject(RocksDBConfiguration.class); DBStoreBuilder dbStoreBuilder = DBStoreBuilder.newBuilder(configuration, diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java index e3a7bf0d743e..dabf2ace8ba6 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java @@ -155,6 +155,7 @@ import org.apache.hadoop.ozone.security.acl.OzoneObj.StoreType; import org.apache.hadoop.ozone.security.acl.OzoneObjInfo; import org.apache.hadoop.ozone.security.acl.RequestContext; +import org.apache.hadoop.ozone.util.ExitManager; import org.apache.hadoop.ozone.util.OzoneVersionInfo; import org.apache.hadoop.security.SecurityUtil; import org.apache.hadoop.security.UserGroupInformation; @@ -310,6 +311,8 @@ public final class OzoneManager extends ServiceRuntimeInfoImpl private boolean isNativeAuthorizerEnabled; + private ExitManager exitManager; + private enum State { INITIALIZED, RUNNING, @@ -3099,12 +3102,18 @@ public TermIndex installSnapshotFromLeader(String leaderId) { * state via this checkpoint. Before re-initializing OM state, the OM Ratis * server should be stopped so that no new transactions can be applied. */ - public TermIndex installCheckpoint(String leaderId, + TermIndex installCheckpoint(String leaderId, DBCheckpoint omDBCheckpoint) throws Exception { - Path newDBLocation = omDBCheckpoint.getCheckpointLocation(); + Path checkpointLocation = omDBCheckpoint.getCheckpointLocation(); OMTransactionInfo omTransactionInfo = getTrxnInfoFromCheckpoint( - newDBLocation); + checkpointLocation); + + return installCheckpoint(leaderId, checkpointLocation, omTransactionInfo); + } + + TermIndex installCheckpoint(String leaderId, Path checkpointLocation, + OMTransactionInfo checkpointTrxnInfo) throws Exception { File oldDBLocation = metadataManager.getStore().getDbLocation(); try { @@ -3134,14 +3143,14 @@ public TermIndex installCheckpoint(String leaderId, // server so that the OM state can be re-initialized. If no then do not // proceed with installSnapshot. boolean canProceed = OzoneManagerRatisUtils.verifyTransactionInfo( - omTransactionInfo, lastAppliedIndex, leaderId, newDBLocation); + checkpointTrxnInfo, lastAppliedIndex, leaderId, checkpointLocation); if (canProceed) { try { dbBackup = replaceOMDBWithCheckpoint(lastAppliedIndex, oldDBLocation, - newDBLocation); - term = omTransactionInfo.getTerm(); - lastAppliedIndex = omTransactionInfo.getTransactionIndex(); + checkpointLocation); + term = checkpointTrxnInfo.getTerm(); + lastAppliedIndex = checkpointTrxnInfo.getTransactionIndex(); LOG.info("Replaced DB with checkpoint from OM: {}, term: {}, index: {}", leaderId, term, lastAppliedIndex); } catch (Exception e) { @@ -3151,7 +3160,7 @@ public TermIndex installCheckpoint(String leaderId, } else { LOG.warn("Cannot proceed with InstallSnapshot as OM is at TermIndex {} " + "and checkpoint has lower TermIndex {}. Reloading old state of OM.", - termIndex, omTransactionInfo.getTermIndex()); + termIndex, checkpointTrxnInfo.getTermIndex()); } // Reload the OM DB store with the new checkpoint. @@ -3164,7 +3173,7 @@ public TermIndex installCheckpoint(String leaderId, lastAppliedIndex); } catch (IOException ex) { String errorMsg = "Failed to reload OM state and instantiate services."; - ExitUtils.terminate(1, errorMsg, ex, LOG); + exitManager.exitSystem(1, errorMsg, ex, LOG); } // Delete the backup DB @@ -3176,7 +3185,7 @@ public TermIndex installCheckpoint(String leaderId, LOG.error("Failed to delete the backup of the original DB {}", dbBackup); } - if (lastAppliedIndex != omTransactionInfo.getTransactionIndex()) { + if (lastAppliedIndex != checkpointTrxnInfo.getTransactionIndex()) { // Install Snapshot failed and old state was reloaded. Return null to // Ratis to indicate that installation failed. return null; @@ -3213,7 +3222,8 @@ void stopServices() throws Exception { metadataManager.stop(); } - private OMTransactionInfo getTrxnInfoFromCheckpoint(Path dbPath) + @VisibleForTesting + OMTransactionInfo getTrxnInfoFromCheckpoint(Path dbPath) throws Exception { Path dbDir = dbPath.getParent(); String dbName = dbPath.getFileName().toString(); @@ -3490,4 +3500,8 @@ private Pair resolveBucketLink( visited); } + @VisibleForTesting + void setExitManagerForTesting(ExitManager exitManagerForTesting) { + this.exitManager = exitManagerForTesting; + } } From bcadd13812a3aeb29f307c4f2ee32dd8910ef62e Mon Sep 17 00:00:00 2001 From: Hanisha Koneru Date: Fri, 10 Jul 2020 16:57:29 -0700 Subject: [PATCH 6/9] CI fixes --- .../apache/hadoop/ozone/util/ExitManager.java | 17 ++++++++++++ .../hadoop/ozone/om/TestOMRatisSnapshots.java | 16 +++++++----- .../TestOzoneManagerSnapshotProvider.java | 26 +++++++------------ .../apache/hadoop/ozone/om/OzoneManager.java | 26 ++++++------------- .../ratis/utils/OzoneManagerRatisUtils.java | 26 ++++++++++++++++--- 5 files changed, 66 insertions(+), 45 deletions(-) diff --git a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/util/ExitManager.java b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/util/ExitManager.java index 7f42224a295c..4a83c1d8c239 100644 --- a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/util/ExitManager.java +++ b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/util/ExitManager.java @@ -1,3 +1,20 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.hadoop.ozone.util; import org.apache.ratis.util.ExitUtils; diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOMRatisSnapshots.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOMRatisSnapshots.java index 079e4142111f..ef08abd89096 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOMRatisSnapshots.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOMRatisSnapshots.java @@ -23,6 +23,7 @@ import java.util.List; import java.util.UUID; +import org.apache.commons.lang3.RandomStringUtils; import org.apache.hadoop.hdds.conf.OzoneConfiguration; import org.apache.hadoop.hdds.utils.db.DBCheckpoint; import org.apache.hadoop.ozone.MiniOzoneCluster; @@ -34,14 +35,14 @@ import org.apache.hadoop.ozone.client.VolumeArgs; import org.apache.hadoop.ozone.om.ratis.OMTransactionInfo; import org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer; +import org.apache.hadoop.ozone.om.ratis.utils.OzoneManagerRatisUtils; +import org.apache.hadoop.ozone.util.ExitManager; +import org.apache.hadoop.test.GenericTestUtils; +import org.apache.ratis.server.protocol.TermIndex; import static org.apache.hadoop.ozone.om.TestOzoneManagerHAWithData.createKey; import static org.junit.Assert.assertTrue; -import org.apache.commons.lang3.RandomStringUtils; -import org.apache.hadoop.ozone.util.ExitManager; -import org.apache.hadoop.test.GenericTestUtils; -import org.apache.ratis.server.protocol.TermIndex; import org.junit.After; import org.junit.Assert; import org.junit.Before; @@ -248,7 +249,8 @@ public void testInstallOldCheckpointFailure() throws Exception { Assert.assertTrue(logCapture.getOutput().contains(errorMsg)); Assert.assertNull("OM installed checkpoint even though checkpoint " + "logIndex is less than it's lastAppliedIndex", newTermIndex); - Assert.assertEquals(followerTermIndex, followerRatisServer.getLastAppliedTermIndex()); + Assert.assertEquals(followerTermIndex, + followerRatisServer.getLastAppliedTermIndex()); } @Test @@ -275,8 +277,8 @@ public void testInstallCorruptedCheckpointFailure() throws Exception { DBCheckpoint leaderDbCheckpoint = leaderOM.getMetadataManager().getStore() .getCheckpoint(false); Path leaderCheckpointLocation = leaderDbCheckpoint.getCheckpointLocation(); - OMTransactionInfo leaderCheckpointTrxnInfo = leaderOM - .getTrxnInfoFromCheckpoint(leaderCheckpointLocation); + OMTransactionInfo leaderCheckpointTrxnInfo = OzoneManagerRatisUtils + .getTrxnInfoFromCheckpoint(conf, leaderCheckpointLocation); // Corrupt the leader checkpoint and install that on the OM. The // operation should fail and OM should shutdown. diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestOzoneManagerSnapshotProvider.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestOzoneManagerSnapshotProvider.java index d77f4d9d1341..844c859ac028 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestOzoneManagerSnapshotProvider.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestOzoneManagerSnapshotProvider.java @@ -20,6 +20,7 @@ import java.util.UUID; +import org.apache.commons.lang3.RandomStringUtils; import org.apache.hadoop.hdds.conf.OzoneConfiguration; import org.apache.hadoop.hdds.utils.db.DBCheckpoint; import org.apache.hadoop.ozone.MiniOzoneCluster; @@ -31,11 +32,10 @@ import org.apache.hadoop.ozone.client.VolumeArgs; import org.apache.hadoop.ozone.om.OMConfigKeys; import org.apache.hadoop.ozone.om.OmFailoverProxyUtil; -import org.apache.hadoop.ozone.om.OmMetadataManagerImpl; import org.apache.hadoop.ozone.om.OzoneManager; - -import org.apache.commons.lang3.RandomStringUtils; import org.apache.hadoop.ozone.om.ratis.OMTransactionInfo; +import org.apache.hadoop.ozone.om.ratis.utils.OzoneManagerRatisUtils; + import org.junit.After; import org.junit.Assert; import org.junit.Before; @@ -124,7 +124,7 @@ public void testDownloadCheckpoint() throws Exception { .getOzoneManagerDBSnapshot(leaderOMNodeId); long leaderSnapshotIndex = leaderOM.getRatisSnapshotIndex(); - long downloadedSnapshotIndex = getDownloadSnapshotIndex(omSnapshot); + long downloadedSnapshotIndex = getDownloadedSnapshotIndex(omSnapshot); // The snapshot index downloaded from leader OM should match the ratis // snapshot index on the leader OM @@ -133,21 +133,13 @@ public void testDownloadCheckpoint() throws Exception { leaderSnapshotIndex, downloadedSnapshotIndex); } - private long getDownloadSnapshotIndex(DBCheckpoint dbCheckpoint) + private long getDownloadedSnapshotIndex(DBCheckpoint dbCheckpoint) throws Exception { - OzoneConfiguration configuration = new OzoneConfiguration(conf); - configuration.set(OMConfigKeys.OZONE_OM_DB_DIRS, - dbCheckpoint.getCheckpointLocation().getParent().toString()); - - OmMetadataManagerImpl omMetadataManager = - new OmMetadataManagerImpl(configuration); - - long transactionIndex = - OMTransactionInfo.readTransactionInfo(omMetadataManager) - .getTransactionIndex(); - omMetadataManager.stop(); - return transactionIndex; + OMTransactionInfo trxnInfoFromCheckpoint = + OzoneManagerRatisUtils.getTrxnInfoFromCheckpoint(conf, + dbCheckpoint.getCheckpointLocation()); + return trxnInfoFromCheckpoint.getTransactionIndex(); } } \ No newline at end of file diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java index dabf2ace8ba6..8a49fa780b37 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java @@ -3102,14 +3102,17 @@ public TermIndex installSnapshotFromLeader(String leaderId) { * state via this checkpoint. Before re-initializing OM state, the OM Ratis * server should be stopped so that no new transactions can be applied. */ - TermIndex installCheckpoint(String leaderId, - DBCheckpoint omDBCheckpoint) throws Exception { + TermIndex installCheckpoint(String leaderId, DBCheckpoint omDBCheckpoint) + throws Exception { Path checkpointLocation = omDBCheckpoint.getCheckpointLocation(); - OMTransactionInfo omTransactionInfo = getTrxnInfoFromCheckpoint( - checkpointLocation); + OMTransactionInfo checkpointTrxnInfo = OzoneManagerRatisUtils + .getTrxnInfoFromCheckpoint(configuration, checkpointLocation); + + LOG.info("Installing checkpoint with OMTransactionInfo {}", + checkpointTrxnInfo); - return installCheckpoint(leaderId, checkpointLocation, omTransactionInfo); + return installCheckpoint(leaderId, checkpointLocation, checkpointTrxnInfo); } TermIndex installCheckpoint(String leaderId, Path checkpointLocation, @@ -3222,19 +3225,6 @@ void stopServices() throws Exception { metadataManager.stop(); } - @VisibleForTesting - OMTransactionInfo getTrxnInfoFromCheckpoint(Path dbPath) - throws Exception { - Path dbDir = dbPath.getParent(); - String dbName = dbPath.getFileName().toString(); - if (dbDir == null) { - throw new IOException("Checkpoint {} does not have proper DB location"); - } - - return OzoneManagerRatisUtils.getTransactionInfoFromDB(configuration, - dbDir, dbName); - } - /** * Replace the current OM DB with the new DB checkpoint. * diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/utils/OzoneManagerRatisUtils.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/utils/OzoneManagerRatisUtils.java index 93c12f2bce62..d67123d0871f 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/utils/OzoneManagerRatisUtils.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/utils/OzoneManagerRatisUtils.java @@ -227,6 +227,23 @@ public static Status exceptionToResponseStatus(IOException exception) { } } + /** + * Obtain OMTransactionInfo from Checkpoint. + */ + public static OMTransactionInfo getTrxnInfoFromCheckpoint( + OzoneConfiguration conf, Path dbPath) throws Exception { + Path dbDir = dbPath.getParent(); + String dbName = dbPath.getFileName().toString(); + if (dbDir == null) { + throw new IOException("Checkpoint {} does not have proper DB location"); + } + + OMTransactionInfo checkpointTrxnInfo = getTransactionInfoFromDB(conf, dbDir, + dbName); + + return checkpointTrxnInfo; + } + /** * Obtain Transaction info from DB. * @param tempConfig @@ -234,7 +251,7 @@ public static Status exceptionToResponseStatus(IOException exception) { * @return OMTransactionInfo * @throws Exception */ - public static OMTransactionInfo getTransactionInfoFromDB( + private static OMTransactionInfo getTransactionInfoFromDB( OzoneConfiguration tempConfig, Path dbDir, String dbName) throws Exception { DBStore dbStore = OmMetadataManagerImpl.loadDB(tempConfig, dbDir.toFile(), @@ -247,8 +264,11 @@ public static OMTransactionInfo getTransactionInfoFromDB( OMTransactionInfo omTransactionInfo = transactionInfoTable.get(TRANSACTION_INFO_KEY); dbStore.close(); - OzoneManager.LOG.info("Downloaded checkpoint with OMTransactionInfo {}", - omTransactionInfo); + + if (omTransactionInfo == null) { + throw new IOException("Failed to read OMTransactionInfo from DB " + + dbName + " at " + dbDir); + } return omTransactionInfo; } From e88121aa6e0844a91ca297ac3abae17ec1777b19 Mon Sep 17 00:00:00 2001 From: Hanisha Koneru Date: Wed, 15 Jul 2020 17:57:49 -0700 Subject: [PATCH 7/9] review + findbug fixes --- .../org/apache/hadoop/ozone/om/OmMetadataManagerImpl.java | 3 +-- .../hadoop/ozone/om/ratis/utils/OzoneManagerRatisUtils.java | 5 +++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmMetadataManagerImpl.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmMetadataManagerImpl.java index 800a565a9843..6c8b50595ca1 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmMetadataManagerImpl.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OmMetadataManagerImpl.java @@ -18,7 +18,6 @@ import java.io.File; import java.io.IOException; -import java.nio.file.Files; import java.nio.file.Paths; import java.util.ArrayList; import java.util.HashMap; @@ -256,7 +255,7 @@ public void start(OzoneConfiguration configuration) throws IOException { // marker indicates that the DB is in an inconsistent state and hence // the OM process should be terminated. File markerFile = new File(metaDir, DB_TRANSIENT_MARKER); - if (Files.exists(markerFile.toPath())) { + if (markerFile.exists()) { LOG.error("File {} marks that OM DB is in an inconsistent state."); // Note - The marker file should be deleted only after fixing the DB. // In an HA setup, this can be done by replacing this DB with a diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/utils/OzoneManagerRatisUtils.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/utils/OzoneManagerRatisUtils.java index d67123d0871f..37410598f1fb 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/utils/OzoneManagerRatisUtils.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/utils/OzoneManagerRatisUtils.java @@ -233,10 +233,11 @@ public static Status exceptionToResponseStatus(IOException exception) { public static OMTransactionInfo getTrxnInfoFromCheckpoint( OzoneConfiguration conf, Path dbPath) throws Exception { Path dbDir = dbPath.getParent(); - String dbName = dbPath.getFileName().toString(); if (dbDir == null) { - throw new IOException("Checkpoint {} does not have proper DB location"); + throw new IOException("Checkpoint " + dbPath + " does not have proper " + + "DB location"); } + String dbName = dbPath.getFileName().toString(); OMTransactionInfo checkpointTrxnInfo = getTransactionInfoFromDB(conf, dbDir, dbName); From a1ff4ad2231b10ef8a5209ee7a07a53d658ccb6d Mon Sep 17 00:00:00 2001 From: Hanisha Koneru Date: Thu, 16 Jul 2020 09:18:50 -0700 Subject: [PATCH 8/9] findbug fix --- .../om/ratis/utils/OzoneManagerRatisUtils.java | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/utils/OzoneManagerRatisUtils.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/utils/OzoneManagerRatisUtils.java index 37410598f1fb..dd03c40ee238 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/utils/OzoneManagerRatisUtils.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/utils/OzoneManagerRatisUtils.java @@ -233,16 +233,16 @@ public static Status exceptionToResponseStatus(IOException exception) { public static OMTransactionInfo getTrxnInfoFromCheckpoint( OzoneConfiguration conf, Path dbPath) throws Exception { Path dbDir = dbPath.getParent(); - if (dbDir == null) { + if (dbDir != null && dbPath.getFileName() != null) { + String dbName = dbPath.getFileName().toString(); + OMTransactionInfo checkpointTrxnInfo = getTransactionInfoFromDB(conf, dbDir, + dbName); + + return checkpointTrxnInfo; + } else { throw new IOException("Checkpoint " + dbPath + " does not have proper " + "DB location"); } - String dbName = dbPath.getFileName().toString(); - - OMTransactionInfo checkpointTrxnInfo = getTransactionInfoFromDB(conf, dbDir, - dbName); - - return checkpointTrxnInfo; } /** From b3c6113dc2b8ad1068b7da7c35365810ea06d5ad Mon Sep 17 00:00:00 2001 From: Hanisha Koneru Date: Fri, 17 Jul 2020 10:55:52 -0700 Subject: [PATCH 9/9] findbug and checkstyle fix --- .../om/ratis/utils/OzoneManagerRatisUtils.java | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/utils/OzoneManagerRatisUtils.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/utils/OzoneManagerRatisUtils.java index dd03c40ee238..ddb6841ae31e 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/utils/OzoneManagerRatisUtils.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/utils/OzoneManagerRatisUtils.java @@ -232,17 +232,17 @@ public static Status exceptionToResponseStatus(IOException exception) { */ public static OMTransactionInfo getTrxnInfoFromCheckpoint( OzoneConfiguration conf, Path dbPath) throws Exception { - Path dbDir = dbPath.getParent(); - if (dbDir != null && dbPath.getFileName() != null) { - String dbName = dbPath.getFileName().toString(); - OMTransactionInfo checkpointTrxnInfo = getTransactionInfoFromDB(conf, dbDir, - dbName); - return checkpointTrxnInfo; - } else { - throw new IOException("Checkpoint " + dbPath + " does not have proper " + - "DB location"); + if (dbPath != null) { + Path dbDir = dbPath.getParent(); + Path dbFile = dbPath.getFileName(); + if (dbDir != null && dbFile != null) { + return getTransactionInfoFromDB(conf, dbDir, dbFile.toString()); + } } + + throw new IOException("Checkpoint " + dbPath + " does not have proper " + + "DB location"); } /**