diff --git a/fe/src/main/java/org/apache/doris/catalog/OlapTable.java b/fe/src/main/java/org/apache/doris/catalog/OlapTable.java index c0ac047eb2bf10..f70be9e6bcbecf 100644 --- a/fe/src/main/java/org/apache/doris/catalog/OlapTable.java +++ b/fe/src/main/java/org/apache/doris/catalog/OlapTable.java @@ -1224,7 +1224,7 @@ public boolean isStable(SystemInfoService infoService, TabletScheduler tabletSch Pair statusPair = tablet.getHealthStatusWithPriority( infoService, clusterName, visibleVersion, visibleVersionHash, replicationNum, - availableBackendsNum); + availableBackendsNum, false); if (statusPair.first != TabletStatus.HEALTHY) { LOG.info("table {} is not stable because tablet {} status is {}. replicas: {}", id, tablet.getId(), statusPair.first, tablet.getReplicas()); diff --git a/fe/src/main/java/org/apache/doris/catalog/Tablet.java b/fe/src/main/java/org/apache/doris/catalog/Tablet.java index d8eb69144e7760..9bddce2ce0ca9c 100644 --- a/fe/src/main/java/org/apache/doris/catalog/Tablet.java +++ b/fe/src/main/java/org/apache/doris/catalog/Tablet.java @@ -61,6 +61,7 @@ public enum TabletStatus { COLOCATE_MISMATCH, // replicas do not all locate in right colocate backends set. COLOCATE_REDUNDANT, // replicas match the colocate backends set, but redundant. NEED_FURTHER_REPAIR, // one of replicas need a definite repair. + NO_AVAILABLE_REPLICA, // without available replica. } @SerializedName(value = "id") @@ -395,7 +396,7 @@ public long getDataSize(boolean singleReplica) { return dataSize; } - /** + /* * A replica is healthy only if * 1. the backend is available * 2. replica version is caught up, and last failed version is -1 @@ -403,11 +404,15 @@ public long getDataSize(boolean singleReplica) { * A tablet is healthy only if * 1. healthy replica num is equal to replicationNum * 2. 
all healthy replicas are in right cluster + * returnNoAvlExplicit + * True: means special usage to show lostTabletIds in the systemInfo + * if aliveAndVersionComplete==0 return NO_AVAILABLE_REPLICA + * False: normally to get the schedule priority of the tablet */ public Pair getHealthStatusWithPriority( SystemInfoService systemInfoService, String clusterName, long visibleVersion, long visibleVersionHash, int replicationNum, - int availableBackendsNum) { + int availableBackendsNum, boolean returnNoAvlExplicit) { int alive = 0; int aliveAndVersionComplete = 0; @@ -452,6 +457,12 @@ public Pair getHealthStatusWithPriority( } } + // special usage to show lost tablet ids in system info, do not need to schedule. + if (returnNoAvlExplicit && aliveAndVersionComplete == 0){ + + return Pair.create(TabletStatus.NO_AVAILABLE_REPLICA, null); + } + // 1. alive replicas are not enough if (alive < replicationNum && replicas.size() >= availableBackendsNum && availableBackendsNum >= replicationNum && replicationNum > 1) { diff --git a/fe/src/main/java/org/apache/doris/clone/TabletChecker.java b/fe/src/main/java/org/apache/doris/clone/TabletChecker.java index 850fe04f8faccd..4a49757bdb2080 100644 --- a/fe/src/main/java/org/apache/doris/clone/TabletChecker.java +++ b/fe/src/main/java/org/apache/doris/clone/TabletChecker.java @@ -239,7 +239,7 @@ private void checkTablets() { partition.getVisibleVersion(), partition.getVisibleVersionHash(), olapTbl.getPartitionInfo().getReplicationNum(partition.getId()), - availableBackendsNum); + availableBackendsNum,false); if (statusWithPrio.first == TabletStatus.HEALTHY) { // Only set last status check time when status is healthy. 
diff --git a/fe/src/main/java/org/apache/doris/clone/TabletSchedCtx.java b/fe/src/main/java/org/apache/doris/clone/TabletSchedCtx.java index 059306fe5589d6..d3962a96e06538 100644 --- a/fe/src/main/java/org/apache/doris/clone/TabletSchedCtx.java +++ b/fe/src/main/java/org/apache/doris/clone/TabletSchedCtx.java @@ -803,7 +803,7 @@ public void finishCloneTask(CloneTask cloneTask, TFinishTaskRequest request) short replicationNum = olapTable.getPartitionInfo().getReplicationNum(partitionId); Pair pair = tablet.getHealthStatusWithPriority( infoService, db.getClusterName(), visibleVersion, visibleVersionHash, replicationNum, - availableBackendsNum); + availableBackendsNum,false); if (pair.first == TabletStatus.HEALTHY) { throw new SchedException(Status.FINISHED, "tablet is healthy"); } diff --git a/fe/src/main/java/org/apache/doris/clone/TabletScheduler.java b/fe/src/main/java/org/apache/doris/clone/TabletScheduler.java index 275fb8cd39cd97..f9deca2fddbc21 100644 --- a/fe/src/main/java/org/apache/doris/clone/TabletScheduler.java +++ b/fe/src/main/java/org/apache/doris/clone/TabletScheduler.java @@ -500,7 +500,7 @@ private void scheduleTablet(TabletSchedCtx tabletCtx, AgentBatchTask batchTask) partition.getVisibleVersion(), partition.getVisibleVersionHash(), tbl.getPartitionInfo().getReplicationNum(partition.getId()), - availableBackendsNum); + availableBackendsNum, false); } if (tabletCtx.getType() == TabletSchedCtx.Type.BALANCE && tableState != OlapTableState.NORMAL) { diff --git a/fe/src/main/java/org/apache/doris/common/proc/IncompleteTabletsProcNode.java b/fe/src/main/java/org/apache/doris/common/proc/IncompleteTabletsProcNode.java index b278c47cc1f5c5..a00a51d6e436e6 100644 --- a/fe/src/main/java/org/apache/doris/common/proc/IncompleteTabletsProcNode.java +++ b/fe/src/main/java/org/apache/doris/common/proc/IncompleteTabletsProcNode.java @@ -29,20 +29,24 @@ public class IncompleteTabletsProcNode implements ProcNodeInterface { public static final ImmutableList 
TITLE_NAMES = new ImmutableList.Builder() - .add("UnhealthyTablets").add("InconsistentTablets").add("CloningTablets") + .add("UnhealthyTablets").add("InconsistentTablets").add("CloningTablets").add("LostTablets") .build(); + private static final Joiner JOINER = Joiner.on(","); Collection unhealthyTabletIds; Collection inconsistentTabletIds; + Collection noAvlreplicaTabletIds; Collection cloningTabletIds; public IncompleteTabletsProcNode(Collection unhealthyTabletIds, Collection inconsistentTabletIds, - Collection cloningTabletIds) { + Collection cloningTabletIds, + Collection noAvlreplicaTabletIds) { this.unhealthyTabletIds = unhealthyTabletIds; this.inconsistentTabletIds = inconsistentTabletIds; this.cloningTabletIds = cloningTabletIds; + this.noAvlreplicaTabletIds = noAvlreplicaTabletIds; } @Override @@ -55,10 +59,12 @@ public ProcResult fetchResult() throws AnalysisException { String incompleteTablets = JOINER.join(Arrays.asList(unhealthyTabletIds)); String inconsistentTablets = JOINER.join(Arrays.asList(inconsistentTabletIds)); + String noAvlreplicaTablets = JOINER.join(Arrays.asList(noAvlreplicaTabletIds)); String cloningTablets = JOINER.join(Arrays.asList(cloningTabletIds)); row.add(incompleteTablets); row.add(inconsistentTablets); row.add(cloningTablets); + row.add(noAvlreplicaTablets); result.addRow(row); diff --git a/fe/src/main/java/org/apache/doris/common/proc/StatisticProcDir.java b/fe/src/main/java/org/apache/doris/common/proc/StatisticProcDir.java index e75a3fee69aca1..674813a3502c7a 100644 --- a/fe/src/main/java/org/apache/doris/common/proc/StatisticProcDir.java +++ b/fe/src/main/java/org/apache/doris/common/proc/StatisticProcDir.java @@ -52,6 +52,7 @@ public class StatisticProcDir implements ProcDirInterface { .add("DbId").add("DbName").add("TableNum").add("PartitionNum") .add("IndexNum").add("TabletNum").add("ReplicaNum").add("UnhealthyTabletNum") .add("InconsistentTabletNum").add("CloningTabletNum") + .add("LostTabletNum") // count of tablets 
without available replica. .build(); private static final Logger LOG = LogManager.getLogger(StatisticProcDir.class); @@ -63,12 +64,15 @@ public class StatisticProcDir implements ProcDirInterface { Multimap inconsistentTabletIds; // db id -> set(tablet id) Multimap cloningTabletIds; + // db id -> set(tablet id) + Multimap noAvlreplicaTabletIds; public StatisticProcDir(Catalog catalog) { this.catalog = catalog; unhealthyTabletIds = HashMultimap.create(); inconsistentTabletIds = HashMultimap.create(); cloningTabletIds = HashMultimap.create(); + noAvlreplicaTabletIds = HashMultimap.create(); } @Override @@ -96,6 +100,7 @@ public ProcResult fetchResult() throws AnalysisException { unhealthyTabletIds.clear(); inconsistentTabletIds.clear(); cloningTabletIds = AgentTaskQueue.getTabletIdsByType(TTaskType.CLONE); + noAvlreplicaTabletIds.clear(); List> lines = new ArrayList>(); for (Long dbId : dbIds) { if (dbId == 0) { @@ -137,7 +142,7 @@ public ProcResult fetchResult() throws AnalysisException { Pair res = tablet.getHealthStatusWithPriority( infoService, db.getClusterName(), partition.getVisibleVersion(), partition.getVisibleVersionHash(), - replicationNum, availableBackendsNum); + replicationNum, availableBackendsNum, true); // here we treat REDUNDANT as HEALTHY, for user friendly. 
if (res.first != TabletStatus.HEALTHY && res.first != TabletStatus.REDUNDANT @@ -145,6 +150,11 @@ public ProcResult fetchResult() throws AnalysisException { unhealthyTabletIds.put(dbId, tablet.getId()); } + // add tablet without available replica; + if (res.first == TabletStatus.NO_AVAILABLE_REPLICA) { + noAvlreplicaTabletIds.put(dbId, tablet.getId()); + } + if (!tablet.isConsistent()) { inconsistentTabletIds.put(dbId, tablet.getId()); } @@ -164,6 +174,7 @@ public ProcResult fetchResult() throws AnalysisException { oneLine.add(unhealthyTabletIds.get(dbId).size()); oneLine.add(inconsistentTabletIds.get(dbId).size()); oneLine.add(cloningTabletIds.get(dbId).size()); + oneLine.add(noAvlreplicaTabletIds.get(dbId).size()); lines.add(oneLine); @@ -193,6 +204,7 @@ public ProcResult fetchResult() throws AnalysisException { finalLine.add(unhealthyTabletIds.size()); finalLine.add(inconsistentTabletIds.size()); finalLine.add(cloningTabletIds.size()); + finalLine.add(noAvlreplicaTabletIds.size()); lines.add(finalLine); // add result @@ -223,6 +235,7 @@ public ProcNodeInterface lookup(String dbIdStr) throws AnalysisException { return new IncompleteTabletsProcNode(unhealthyTabletIds.get(dbId), inconsistentTabletIds.get(dbId), - cloningTabletIds.get(dbId)); + cloningTabletIds.get(dbId), + noAvlreplicaTabletIds.get(dbId)); } } diff --git a/fe/src/main/java/org/apache/doris/master/ReportHandler.java b/fe/src/main/java/org/apache/doris/master/ReportHandler.java index c8c1d187e61a9f..c9c9cc2f39b710 100644 --- a/fe/src/main/java/org/apache/doris/master/ReportHandler.java +++ b/fe/src/main/java/org/apache/doris/master/ReportHandler.java @@ -1030,7 +1030,7 @@ private static void addReplica(long tabletId, TTabletInfo backendTabletInfo, lon int availableBackendsNum = infoService.getClusterBackendIds(db.getClusterName(), true).size(); Pair status = tablet.getHealthStatusWithPriority(infoService, db.getClusterName(), visibleVersion, visibleVersionHash, - replicationNum, 
availableBackendsNum); + replicationNum, availableBackendsNum, false); if (status.first == TabletStatus.VERSION_INCOMPLETE || status.first == TabletStatus.REPLICA_MISSING) { long lastFailedVersion = -1L;