From 264a5b9d7581c0ff7459874359e2bf01e3463b0d Mon Sep 17 00:00:00 2001 From: w41ter Date: Tue, 27 Aug 2024 09:00:47 +0000 Subject: [PATCH 1/2] [chore](backup) limit the involved tablets in a backup job to avoid FE OOM caused by saving too much metadata. Assuming the average tablet size is 50MB, the default value of 300000 can support 14TB of data per backup job. --- .../src/main/java/org/apache/doris/common/Config.java | 9 +++++++++ .../main/java/org/apache/doris/backup/BackupJob.java | 10 ++++++++++ 2 files changed, 19 insertions(+) diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java index 69d382c7a5c2e7..355683230d51ed 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java +++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java @@ -1532,6 +1532,15 @@ public class Config extends ConfigBase { @ConfField(mutable = true, masterOnly = true) public static int max_backup_restore_job_num_per_db = 10; + /** + * Control the max num of tablets per backup job involved. + */ + @ConfField(mutable = true, masterOnly = true, description = { + "用于控制每次 backup job 允许备份的 tablet 上限,以避免 OOM", + "Control the max num of tablets per backup job involved, to avoid OOM" + }) + public static int max_backup_tablets_per_job = 300000; + /** * whether to ignore table that not support type when backup, and not report exception. */ diff --git a/fe/fe-core/src/main/java/org/apache/doris/backup/BackupJob.java b/fe/fe-core/src/main/java/org/apache/doris/backup/BackupJob.java index 6f73334f0c2474..2f1e5c4b2fc6cd 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/backup/BackupJob.java +++ b/fe/fe-core/src/main/java/org/apache/doris/backup/BackupJob.java @@ -523,6 +523,16 @@ private void prepareAndSendSnapshotTask() { } } + // Limit the max num of tablets involved in a backup job, to avoid OOM. 
+ if (unfinishedTaskIds.size() > Config.max_backup_tablets_per_job) { + String msg = String.format("the num involved tablets %d exceeds the limit %d" + + "change config `max_backup_tablets_per_job` to change this limitation", + unfinishedTaskIds.size(), Config.max_backup_tablets_per_job); + LOG.warn(msg); + status = new Status(ErrCode.COMMON_ERROR, msg); + return; + } + backupMeta = new BackupMeta(copiedTables, copiedResources); // send tasks From 0886a9e63fd1e233690004792a3f6a492f8fd1b3 Mon Sep 17 00:00:00 2001 From: w41ter Date: Tue, 27 Aug 2024 11:26:09 +0000 Subject: [PATCH 2/2] address comment --- .../src/main/java/org/apache/doris/backup/BackupJob.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/backup/BackupJob.java b/fe/fe-core/src/main/java/org/apache/doris/backup/BackupJob.java index 2f1e5c4b2fc6cd..d5010293b4d6ce 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/backup/BackupJob.java +++ b/fe/fe-core/src/main/java/org/apache/doris/backup/BackupJob.java @@ -525,8 +525,9 @@ private void prepareAndSendSnapshotTask() { // Limit the max num of tablets involved in a backup job, to avoid OOM. if (unfinishedTaskIds.size() > Config.max_backup_tablets_per_job) { - String msg = String.format("the num involved tablets %d exceeds the limit %d" - + "change config `max_backup_tablets_per_job` to change this limitation", + String msg = String.format("the num involved tablets %d exceeds the limit %d, " + + "which might cause the FE OOM, change config `max_backup_tablets_per_job` " + + "to change this limitation", unfinishedTaskIds.size(), Config.max_backup_tablets_per_job); LOG.warn(msg); status = new Status(ErrCode.COMMON_ERROR, msg);