From d6b11ed06d883d19b66c961c0a18f6cd9c215dd0 Mon Sep 17 00:00:00 2001 From: morningman Date: Wed, 16 Jun 2021 20:43:09 +0800 Subject: [PATCH 1/3] [Enhance] Support show unrecoverable tablets The unrecoverable tablets are tablets for which none of the replicas are healthy. We should be able to find these tablets so that they can be handled by manual intervention. These tablets should not be added to the tablet scheduler. --- .../java/org/apache/doris/catalog/Tablet.java | 9 +++++-- .../org/apache/doris/clone/TabletChecker.java | 5 ++++ .../apache/doris/clone/TabletScheduler.java | 24 +++++++++-------- .../proc/IncompleteTabletsProcNode.java | 9 +++++-- .../doris/common/proc/StatisticProcDir.java | 27 +++++++++++++------ .../apache/doris/master/ReportHandler.java | 3 ++- 6 files changed, 53 insertions(+), 24 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Tablet.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/Tablet.java index 361d6ec12625bf..b7288dbe244f96 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Tablet.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Tablet.java @@ -62,6 +62,7 @@ public enum TabletStatus { COLOCATE_MISMATCH, // replicas do not all locate in right colocate backends set. COLOCATE_REDUNDANT, // replicas match the colocate backends set, but redundant. NEED_FURTHER_REPAIR, // one of replicas need a definite repair. + UNRECOVERABLE // non of replicas are healthy } @SerializedName(value = "id") @@ -455,7 +456,9 @@ public Pair getHealthStatusWithPriority( // 1. 
alive replicas are not enough int aliveBackendsNum = aliveBeIdsInCluster.size(); - if (alive < replicationNum && replicas.size() >= aliveBackendsNum + if (alive == 0) { + return Pair.create(TabletStatus.UNRECOVERABLE, Priority.VERY_HIGH); + } else if (alive < replicationNum && replicas.size() >= aliveBackendsNum && aliveBackendsNum >= replicationNum && replicationNum > 1) { // there is no enough backend for us to create a new replica, so we have to delete an existing replica, // so there can be available backend for us to create a new replica. @@ -473,7 +476,9 @@ public Pair getHealthStatusWithPriority( } // 2. version complete replicas are not enough - if (aliveAndVersionComplete < (replicationNum / 2) + 1) { + if (aliveAndVersionComplete == 0) { + return Pair.create(TabletStatus.UNRECOVERABLE, Priority.VERY_HIGH); + } else if (aliveAndVersionComplete < (replicationNum / 2) + 1) { return Pair.create(TabletStatus.VERSION_INCOMPLETE, TabletSchedCtx.Priority.HIGH); } else if (aliveAndVersionComplete < replicationNum) { return Pair.create(TabletStatus.VERSION_INCOMPLETE, TabletSchedCtx.Priority.NORMAL); diff --git a/fe/fe-core/src/main/java/org/apache/doris/clone/TabletChecker.java b/fe/fe-core/src/main/java/org/apache/doris/clone/TabletChecker.java index 4e375c10001fca..2c9666d1b67a89 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/clone/TabletChecker.java +++ b/fe/fe-core/src/main/java/org/apache/doris/clone/TabletChecker.java @@ -329,6 +329,11 @@ private LoopControlStatus handlePartitionTablet(Database db, OlapTable tbl, Part // Only set last status check time when status is healthy. 
tablet.setLastStatusCheckTime(startTime); continue; + } else if (statusWithPrio.first == TabletStatus.UNRECOVERABLE) { + // This tablet is not recoverable, do not set it into tablet scheduler + // all UNRECOVERABLE tablet can be seen from "show proc '/statistic'" + counter.unhealthyTabletNum++; + continue; } else if (isInPrios) { statusWithPrio.second = TabletSchedCtx.Priority.VERY_HIGH; prioPartIsHealthy = false; diff --git a/fe/fe-core/src/main/java/org/apache/doris/clone/TabletScheduler.java b/fe/fe-core/src/main/java/org/apache/doris/clone/TabletScheduler.java index 0794296781748b..9c4b2b4c5b822d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/clone/TabletScheduler.java +++ b/fe/fe-core/src/main/java/org/apache/doris/clone/TabletScheduler.java @@ -575,17 +575,19 @@ private void handleTabletByTypeAndStatus(TabletStatus status, TabletSchedCtx tab case FORCE_REDUNDANT: handleRedundantReplica(tabletCtx, true); break; - case REPLICA_MISSING_IN_CLUSTER: - handleReplicaClusterMigration(tabletCtx, batchTask); - break; - case COLOCATE_MISMATCH: - handleColocateMismatch(tabletCtx, batchTask); - break; - case COLOCATE_REDUNDANT: - handleColocateRedundant(tabletCtx); - break; - default: - break; + case REPLICA_MISSING_IN_CLUSTER: + handleReplicaClusterMigration(tabletCtx, batchTask); + break; + case COLOCATE_MISMATCH: + handleColocateMismatch(tabletCtx, batchTask); + break; + case COLOCATE_REDUNDANT: + handleColocateRedundant(tabletCtx); + break; + case UNRECOVERABLE: + throw new SchedException(Status.UNRECOVERABLE, "tablet is unrecoverable"); + default: + break; } } else { // balance diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/proc/IncompleteTabletsProcNode.java b/fe/fe-core/src/main/java/org/apache/doris/common/proc/IncompleteTabletsProcNode.java index b278c47cc1f5c5..4cdf5de7145805 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/proc/IncompleteTabletsProcNode.java +++ 
b/fe/fe-core/src/main/java/org/apache/doris/common/proc/IncompleteTabletsProcNode.java @@ -29,20 +29,23 @@ public class IncompleteTabletsProcNode implements ProcNodeInterface { public static final ImmutableList TITLE_NAMES = new ImmutableList.Builder() - .add("UnhealthyTablets").add("InconsistentTablets").add("CloningTablets") + .add("UnhealthyTablets").add("InconsistentTablets").add("CloningTablets").add("BadTablets") .build(); private static final Joiner JOINER = Joiner.on(","); Collection unhealthyTabletIds; Collection inconsistentTabletIds; Collection cloningTabletIds; + Collection unrecoverableTabletIds; public IncompleteTabletsProcNode(Collection unhealthyTabletIds, Collection inconsistentTabletIds, - Collection cloningTabletIds) { + Collection cloningTabletIds, + Collection unrecoverableTabletIds) { this.unhealthyTabletIds = unhealthyTabletIds; this.inconsistentTabletIds = inconsistentTabletIds; this.cloningTabletIds = cloningTabletIds; + this.unrecoverableTabletIds = unrecoverableTabletIds; } @Override @@ -56,9 +59,11 @@ public ProcResult fetchResult() throws AnalysisException { String incompleteTablets = JOINER.join(Arrays.asList(unhealthyTabletIds)); String inconsistentTablets = JOINER.join(Arrays.asList(inconsistentTabletIds)); String cloningTablets = JOINER.join(Arrays.asList(cloningTabletIds)); + String unrecoverableTablets = JOINER.join(Arrays.asList(unrecoverableTabletIds)); row.add(incompleteTablets); row.add(inconsistentTablets); row.add(cloningTablets); + row.add(unrecoverableTablets); result.addRow(row); diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/proc/StatisticProcDir.java b/fe/fe-core/src/main/java/org/apache/doris/common/proc/StatisticProcDir.java index 001f00c00c1a9c..769cb71e3b13fa 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/proc/StatisticProcDir.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/proc/StatisticProcDir.java @@ -17,10 +17,6 @@ package org.apache.doris.common.proc; -import 
com.google.common.base.Preconditions; -import com.google.common.collect.HashMultimap; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.Multimap; import org.apache.doris.catalog.Catalog; import org.apache.doris.catalog.Database; import org.apache.doris.catalog.MaterializedIndex; @@ -38,6 +34,12 @@ import org.apache.doris.system.SystemInfoService; import org.apache.doris.task.AgentTaskQueue; import org.apache.doris.thrift.TTaskType; + +import com.google.common.base.Preconditions; +import com.google.common.collect.HashMultimap; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Multimap; + import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -49,7 +51,7 @@ public class StatisticProcDir implements ProcDirInterface { public static final ImmutableList TITLE_NAMES = new ImmutableList.Builder() .add("DbId").add("DbName").add("TableNum").add("PartitionNum") .add("IndexNum").add("TabletNum").add("ReplicaNum").add("UnhealthyTabletNum") - .add("InconsistentTabletNum").add("CloningTabletNum") + .add("InconsistentTabletNum").add("CloningTabletNum").add("BadTabletNum") .build(); private static final Logger LOG = LogManager.getLogger(StatisticProcDir.class); @@ -61,12 +63,15 @@ public class StatisticProcDir implements ProcDirInterface { Multimap inconsistentTabletIds; // db id -> set(tablet id) Multimap cloningTabletIds; + // db id -> set(tablet id) + Multimap unrecoverableTabletIds; public StatisticProcDir(Catalog catalog) { this.catalog = catalog; unhealthyTabletIds = HashMultimap.create(); inconsistentTabletIds = HashMultimap.create(); cloningTabletIds = HashMultimap.create(); + unrecoverableTabletIds = HashMultimap.create(); } @Override @@ -140,8 +145,11 @@ public ProcResult fetchResult() throws AnalysisException { // here we treat REDUNDANT as HEALTHY, for user friendly. 
if (res.first != TabletStatus.HEALTHY && res.first != TabletStatus.REDUNDANT - && res.first != TabletStatus.COLOCATE_REDUNDANT && res.first != TabletStatus.NEED_FURTHER_REPAIR) { + && res.first != TabletStatus.COLOCATE_REDUNDANT && res.first != TabletStatus.NEED_FURTHER_REPAIR + && res.first != TabletStatus.UNRECOVERABLE) { unhealthyTabletIds.put(dbId, tablet.getId()); + } else if (res.first == TabletStatus.UNRECOVERABLE) { + unrecoverableTabletIds.put(dbId, tablet.getId()); } if (!tablet.isConsistent()) { @@ -166,6 +174,7 @@ public ProcResult fetchResult() throws AnalysisException { oneLine.add(unhealthyTabletIds.get(dbId).size()); oneLine.add(inconsistentTabletIds.get(dbId).size()); oneLine.add(cloningTabletIds.get(dbId).size()); + oneLine.add(unhealthyTabletIds.get(dbId).size()); lines.add(oneLine); @@ -195,6 +204,7 @@ public ProcResult fetchResult() throws AnalysisException { finalLine.add(unhealthyTabletIds.size()); finalLine.add(inconsistentTabletIds.size()); finalLine.add(cloningTabletIds.size()); + finalLine.add(unrecoverableTabletIds.size()); lines.add(finalLine); // add result @@ -224,7 +234,8 @@ public ProcNodeInterface lookup(String dbIdStr) throws AnalysisException { } return new IncompleteTabletsProcNode(unhealthyTabletIds.get(dbId), - inconsistentTabletIds.get(dbId), - cloningTabletIds.get(dbId)); + inconsistentTabletIds.get(dbId), + cloningTabletIds.get(dbId), + unrecoverableTabletIds.get(dbId)); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/master/ReportHandler.java b/fe/fe-core/src/main/java/org/apache/doris/master/ReportHandler.java index 1782b796cfd4b6..31a2dc4ab8c164 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/master/ReportHandler.java +++ b/fe/fe-core/src/main/java/org/apache/doris/master/ReportHandler.java @@ -1014,7 +1014,8 @@ private static void addReplica(long tabletId, TTabletInfo backendTabletInfo, lon db.getClusterName(), visibleVersion, visibleVersionHash, replicationNum, aliveBeIdsInCluster); - if (status.first 
== TabletStatus.VERSION_INCOMPLETE || status.first == TabletStatus.REPLICA_MISSING) { + if (status.first == TabletStatus.VERSION_INCOMPLETE || status.first == TabletStatus.REPLICA_MISSING + || status.first == TabletStatus.UNRECOVERABLE) { long lastFailedVersion = -1L; long lastFailedVersionHash = 0L; From 37092d0b65fa6fca634012b017fbd5ed8d4950d3 Mon Sep 17 00:00:00 2001 From: morningman Date: Wed, 16 Jun 2021 21:04:34 +0800 Subject: [PATCH 2/3] fix1 --- .../java/org/apache/doris/common/proc/StatisticProcDir.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/proc/StatisticProcDir.java b/fe/fe-core/src/main/java/org/apache/doris/common/proc/StatisticProcDir.java index 769cb71e3b13fa..596267cb4228ea 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/proc/StatisticProcDir.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/proc/StatisticProcDir.java @@ -174,7 +174,7 @@ public ProcResult fetchResult() throws AnalysisException { oneLine.add(unhealthyTabletIds.get(dbId).size()); oneLine.add(inconsistentTabletIds.get(dbId).size()); oneLine.add(cloningTabletIds.get(dbId).size()); - oneLine.add(unhealthyTabletIds.get(dbId).size()); + oneLine.add(unrecoverableTabletIds.get(dbId).size()); lines.add(oneLine); From dcc8320347e29d5595ae6621fc7a2fd00cf8af3b Mon Sep 17 00:00:00 2001 From: morningman Date: Wed, 16 Jun 2021 22:48:19 +0800 Subject: [PATCH 3/3] Fix show data bug and spring boot bug --- .../src/main/java/org/apache/doris/httpv2/HttpServer.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fe/fe-core/src/main/java/org/apache/doris/httpv2/HttpServer.java b/fe/fe-core/src/main/java/org/apache/doris/httpv2/HttpServer.java index 0e5782551ccbae..8f062ea2ba6b71 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/httpv2/HttpServer.java +++ b/fe/fe-core/src/main/java/org/apache/doris/httpv2/HttpServer.java @@ -65,6 +65,9 @@ public void start(String dorisHome) { 
properties.put("spring.http.encoding.force", true); properties.put("spring.servlet.multipart.max-file-size", this.maxFileSize); properties.put("spring.servlet.multipart.max-request-size", this.maxRequestSize); + // This is to disable the spring-boot-devtools restart feature. + // To avoid some unexpected behavior. + System.setProperty("spring.devtools.restart.enabled", "false"); properties.put("logging.config", dorisHome + "/conf/" + SpringLog4j2Config.SPRING_LOG_XML_FILE); new SpringApplicationBuilder() .sources(HttpServer.class)