From 706625d67cc54fddd30da5f0b232f00ecf6cbeea Mon Sep 17 00:00:00 2001 From: wuhangze Date: Wed, 13 Apr 2022 17:55:43 +0800 Subject: [PATCH 1/6] load newly generated image file as soon as generated to check if it is valid. --- .../main/java/org/apache/doris/master/Checkpoint.java | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fe/fe-core/src/main/java/org/apache/doris/master/Checkpoint.java b/fe/fe-core/src/main/java/org/apache/doris/master/Checkpoint.java index 69efd618f85f99..a3c1ddf601762a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/master/Checkpoint.java +++ b/fe/fe-core/src/main/java/org/apache/doris/master/Checkpoint.java @@ -121,6 +121,17 @@ public synchronized void doCheckpoint() { catalog.fixBugAfterMetadataReplayed(false); catalog.saveImage(); replayedJournalId = catalog.getReplayedJournalId(); + + // destroy checkpoint catalog, reclaim memory + catalog = null; + Catalog.destroyCheckpoint(); + destroyStaticFieldForCkpt(); + + // Load image to verify if the newly generated image file is valid + // If success, do all the following jobs + // If failed, just return + catalog = Catalog.getCurrentCatalog(); + catalog.loadImage(imageDir); if (MetricRepo.isInit) { MetricRepo.COUNTER_IMAGE_WRITE_SUCCESS.increase(1L); } From 51ad74c8317e0f3442b1193d927ecb076d2dd960 Mon Sep 17 00:00:00 2001 From: Henry2SS <1021263336@qq.com> Date: Wed, 13 Apr 2022 23:02:39 +0800 Subject: [PATCH 2/6] delete the latest invalid image file --- .../org/apache/doris/master/Checkpoint.java | 17 +++++++++++++++++ .../org/apache/doris/persist/MetaCleaner.java | 13 +++++++++++++ 2 files changed, 30 insertions(+) diff --git a/fe/fe-core/src/main/java/org/apache/doris/master/Checkpoint.java b/fe/fe-core/src/main/java/org/apache/doris/master/Checkpoint.java index a3c1ddf601762a..2a9c2e473ac635 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/master/Checkpoint.java +++ b/fe/fe-core/src/main/java/org/apache/doris/master/Checkpoint.java @@ -111,6 +111,7 @@ public synchronized void doCheckpoint() { catalog = Catalog.getCurrentCatalog(); catalog.setEditLog(editLog); createStaticFieldForCkpt(); + boolean newImageGenerated = false; try { catalog.loadImage(imageDir); catalog.replayJournal(checkPointVersion); @@ -120,6 +121,7 @@ public synchronized void doCheckpoint() { } catalog.fixBugAfterMetadataReplayed(false); catalog.saveImage(); + newImageGenerated = true; replayedJournalId = catalog.getReplayedJournalId(); // destroy checkpoint catalog, reclaim memory @@ -137,6 +139,21 @@ public synchronized void doCheckpoint() { } LOG.info("checkpoint finished save image.{}", replayedJournalId); } catch (Throwable e) { + if (newImageGenerated) { + // delete the newest image file, cuz it is invalid + MetaCleaner cleaner = new MetaCleaner(Config.meta_dir + "/image"); + try { + cleaner.cleanTheLatestInvalidImageFile(); + if (MetricRepo.isInit) { + MetricRepo.COUNTER_IMAGE_CLEAN_SUCCESS.increase(1L); + } + } catch (Throwable ex) { + LOG.error("Master delete latest invalid image file failed.", ex); + if (MetricRepo.isInit) { + MetricRepo.COUNTER_IMAGE_CLEAN_FAILED.increase(1L); + } + } + } e.printStackTrace(); LOG.error("Exception when generate new image file", e); if (MetricRepo.isInit) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/persist/MetaCleaner.java b/fe/fe-core/src/main/java/org/apache/doris/persist/MetaCleaner.java index 2c2b17a9410c13..0cd658026abfd2 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/persist/MetaCleaner.java +++ b/fe/fe-core/src/main/java/org/apache/doris/persist/MetaCleaner.java @@ -67,6 +67,19 @@ public void clean() throws IOException { } } } + + public void cleanTheLatestInvalidImageFile() throws IOException { + Storage storage = new Storage(imageDir); + long currentVersion = storage.getImageSeq(); + File currentImage = storage.getImageFile(currentVersion); + if (currentImage.exists()) { + if (currentImage.delete()) { + LOG.info(currentImage.getAbsoluteFile() + " deleted."); + } else { + LOG.warn(currentImage.getAbsoluteFile() + " delete failed."); + } + } + } private String fileType(File file) throws IOException { String type = null; From b34c3ac3c4f53cae8060d52601510d766acbde0c Mon Sep 17 00:00:00 2001 From: Henry2SS <1021263336@qq.com> Date: Wed, 13 Apr 2022 23:22:56 +0800 Subject: [PATCH 3/6] fix --- .../java/org/apache/doris/persist/MetaCleaner.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/persist/MetaCleaner.java b/fe/fe-core/src/main/java/org/apache/doris/persist/MetaCleaner.java index 0cd658026abfd2..64e39fe06884d5 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/persist/MetaCleaner.java +++ b/fe/fe-core/src/main/java/org/apache/doris/persist/MetaCleaner.java @@ -70,13 +70,13 @@ public void clean() throws IOException { public void cleanTheLatestInvalidImageFile() throws IOException { Storage storage = new Storage(imageDir); - long currentVersion = storage.getImageSeq(); - File currentImage = storage.getImageFile(currentVersion); - if (currentImage.exists()) { - if (currentImage.delete()) { - LOG.info(currentImage.getAbsoluteFile() + " deleted."); + long latestVersion = storage.getImageSeq(); + File latestInvalidImage = storage.getImageFile(latestVersion); + if (latestInvalidImage.exists()) { + if (latestInvalidImage.delete()) { + LOG.info(latestInvalidImage.getAbsoluteFile() + " deleted."); } else { - LOG.warn(currentImage.getAbsoluteFile() + " delete failed."); + LOG.warn(latestInvalidImage.getAbsoluteFile() + " delete failed."); } } } From 3431f241e837167a83d0f348394ffae74388bb0a Mon Sep 17 00:00:00 2001 From: wuhangze Date: Thu, 14 Apr 2022 10:06:58 +0800 Subject: [PATCH 4/6] fix --- .../org/apache/doris/master/Checkpoint.java | 29 ++++++++++--------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/master/Checkpoint.java b/fe/fe-core/src/main/java/org/apache/doris/master/Checkpoint.java index 2a9c2e473ac635..c6d060ab8b38ba 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/master/Checkpoint.java +++ b/fe/fe-core/src/main/java/org/apache/doris/master/Checkpoint.java @@ -112,6 +112,7 @@ public synchronized void doCheckpoint() { catalog.setEditLog(editLog); createStaticFieldForCkpt(); boolean newImageGenerated = false; + boolean exceptionCaught = false; try { catalog.loadImage(imageDir); catalog.replayJournal(checkPointVersion); @@ -139,8 +140,21 @@ public synchronized void doCheckpoint() { } LOG.info("checkpoint finished save image.{}", replayedJournalId); } catch (Throwable e) { - if (newImageGenerated) { - // delete the newest image file, cuz it is invalid + exceptionCaught = true; + e.printStackTrace(); + LOG.error("Exception when generate new image file", e); + if (MetricRepo.isInit) { + MetricRepo.COUNTER_IMAGE_WRITE_FAILED.increase(1L); + } + return; + } finally { + // destroy checkpoint catalog, reclaim memory + catalog = null; + Catalog.destroyCheckpoint(); + destroyStaticFieldForCkpt(); + // if new image generated && exception caught, delete the latest image here + // delete the newest image file, cuz it is invalid + if (newImageGenerated && exceptionCaught) { MetaCleaner cleaner = new MetaCleaner(Config.meta_dir + "/image"); try { cleaner.cleanTheLatestInvalidImageFile(); @@ -154,17 +168,6 @@ public synchronized void doCheckpoint() { } } } - e.printStackTrace(); - LOG.error("Exception when generate new image file", e); - if (MetricRepo.isInit) { - MetricRepo.COUNTER_IMAGE_WRITE_FAILED.increase(1L); - } - return; - } finally { - // destroy checkpoint catalog, reclaim memory - catalog = null; - Catalog.destroyCheckpoint(); - destroyStaticFieldForCkpt(); } // push image file to all the other non master nodes From 5248293375fefa4d244b171ec1377451bf6e1715 Mon Sep 17 00:00:00 2001 From: wuhangze Date: Thu, 21 Apr 2022 20:19:14 +0800 Subject: [PATCH 5/6] get filePath from saveImage() to ensure deleting the correct file while exception happens --- .../src/main/java/org/apache/doris/catalog/Catalog.java | 4 +++- .../main/java/org/apache/doris/master/Checkpoint.java | 9 ++++----- .../main/java/org/apache/doris/persist/MetaCleaner.java | 6 ++---- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Catalog.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/Catalog.java index 3178f05e348d6c..36e2ee88bea429 100755 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Catalog.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Catalog.java @@ -1950,7 +1950,8 @@ public long loadSqlBlockRule(DataInputStream in, long checksum) throws IOExcepti } // Only called by checkpoint thread - public void saveImage() throws IOException { + // return the latest image file's absolute path + public String saveImage() throws IOException { // Write image.ckpt Storage storage = new Storage(this.imageDir); File curFile = storage.getImageFile(replayedJournalId.get()); @@ -1963,6 +1964,7 @@ public void saveImage() throws IOException { curFile.delete(); throw new IOException(); } + return curFile.getAbsolutePath(); } public void saveImage(File curFile, long replayedJournalId) throws IOException { diff --git a/fe/fe-core/src/main/java/org/apache/doris/master/Checkpoint.java b/fe/fe-core/src/main/java/org/apache/doris/master/Checkpoint.java index c6d060ab8b38ba..bfc411708e042a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/master/Checkpoint.java +++ b/fe/fe-core/src/main/java/org/apache/doris/master/Checkpoint.java @@ -111,8 +111,8 @@ public synchronized void doCheckpoint() { catalog = Catalog.getCurrentCatalog(); catalog.setEditLog(editLog); createStaticFieldForCkpt(); - boolean newImageGenerated = false; boolean exceptionCaught = false; + String latestImageFilePath = null; try { catalog.loadImage(imageDir); catalog.replayJournal(checkPointVersion); @@ -121,8 +121,7 @@ public synchronized void doCheckpoint() { checkPointVersion, catalog.getReplayedJournalId())); } catalog.fixBugAfterMetadataReplayed(false); - catalog.saveImage(); - newImageGenerated = true; + latestImageFilePath = catalog.saveImage(); replayedJournalId = catalog.getReplayedJournalId(); // destroy checkpoint catalog, reclaim memory @@ -154,10 +153,10 @@ public synchronized void doCheckpoint() { destroyStaticFieldForCkpt(); // if new image generated && exception caught, delete the latest image here // delete the newest image file, cuz it is invalid - if (newImageGenerated && exceptionCaught) { + if ((!Strings.isNullOrEmpty(latestImageFilePath)) && exceptionCaught) { MetaCleaner cleaner = new MetaCleaner(Config.meta_dir + "/image"); try { - cleaner.cleanTheLatestInvalidImageFile(); + cleaner.cleanTheLatestInvalidImageFile(latestImageFilePath); if (MetricRepo.isInit) { MetricRepo.COUNTER_IMAGE_CLEAN_SUCCESS.increase(1L); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/persist/MetaCleaner.java b/fe/fe-core/src/main/java/org/apache/doris/persist/MetaCleaner.java index 64e39fe06884d5..6c69ebe995c34e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/persist/MetaCleaner.java +++ b/fe/fe-core/src/main/java/org/apache/doris/persist/MetaCleaner.java @@ -68,10 +68,8 @@ public void clean() throws IOException { } } - public void cleanTheLatestInvalidImageFile() throws IOException { - Storage storage = new Storage(imageDir); - long latestVersion = storage.getImageSeq(); - File latestInvalidImage = storage.getImageFile(latestVersion); + public void cleanTheLatestInvalidImageFile(String path) throws IOException { + File latestInvalidImage = new File(path); if (latestInvalidImage.exists()) { if (latestInvalidImage.delete()) { LOG.info(latestInvalidImage.getAbsoluteFile() + " deleted."); From 33c21ff0445793620679add16b106425cea824fc Mon Sep 17 00:00:00 2001 From: wuhangze Date: Thu, 21 Apr 2022 20:23:13 +0800 Subject: [PATCH 6/6] fix --- .../src/main/java/org/apache/doris/master/Checkpoint.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fe/fe-core/src/main/java/org/apache/doris/master/Checkpoint.java b/fe/fe-core/src/main/java/org/apache/doris/master/Checkpoint.java index bfc411708e042a..be433fe4c786a0 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/master/Checkpoint.java +++ b/fe/fe-core/src/main/java/org/apache/doris/master/Checkpoint.java @@ -35,6 +35,8 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import com.google.common.base.Strings; + import java.io.IOException; import java.io.OutputStream; import java.net.HttpURLConnection;