Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion fe/src/main/java/org/apache/doris/catalog/OlapTable.java
Original file line number Diff line number Diff line change
Expand Up @@ -1224,7 +1224,7 @@ public boolean isStable(SystemInfoService infoService, TabletScheduler tabletSch

Pair<TabletStatus, TabletSchedCtx.Priority> statusPair = tablet.getHealthStatusWithPriority(
infoService, clusterName, visibleVersion, visibleVersionHash, replicationNum,
availableBackendsNum);
availableBackendsNum, false);
if (statusPair.first != TabletStatus.HEALTHY) {
LOG.info("table {} is not stable because tablet {} status is {}. replicas: {}",
id, tablet.getId(), statusPair.first, tablet.getReplicas());
Expand Down
15 changes: 13 additions & 2 deletions fe/src/main/java/org/apache/doris/catalog/Tablet.java
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ public enum TabletStatus {
COLOCATE_MISMATCH, // replicas do not all locate in right colocate backends set.
COLOCATE_REDUNDANT, // replicas match the colocate backends set, but redundant.
NEED_FURTHER_REPAIR, // one of replicas need a definite repair.
NO_AVAILABLE_REPLICA, // without available replica.
}

@SerializedName(value = "id")
Expand Down Expand Up @@ -395,19 +396,23 @@ public long getDataSize(boolean singleReplica) {
return dataSize;
}

/**
/*
* A replica is healthy only if
* 1. the backend is available
* 2. replica version is caught up, and last failed version is -1
*
* A tablet is healthy only if
* 1. healthy replica num is equal to replicationNum
* 2. all healthy replicas are in right cluster
* returnNoAvlExplicit:
* true: special usage, to show lostTabletIds in the system info;
* returns NO_AVAILABLE_REPLICA (with a null priority) if aliveAndVersionComplete == 0
* false: normal usage, to get the schedule priority of the tablet
*/
public Pair<TabletStatus, TabletSchedCtx.Priority> getHealthStatusWithPriority(
SystemInfoService systemInfoService, String clusterName,
long visibleVersion, long visibleVersionHash, int replicationNum,
int availableBackendsNum) {
int availableBackendsNum, boolean returnNoAvlExplicit) {

int alive = 0;
int aliveAndVersionComplete = 0;
Expand Down Expand Up @@ -452,6 +457,12 @@ public Pair<TabletStatus, TabletSchedCtx.Priority> getHealthStatusWithPriority(
}
}

// Special usage: show lost tablet ids in system info; no scheduling is needed.
if (returnNoAvlExplicit && aliveAndVersionComplete == 0){

return Pair.create(TabletStatus.NO_AVAILABLE_REPLICA, null);
}

// 1. alive replicas are not enough
if (alive < replicationNum && replicas.size() >= availableBackendsNum
&& availableBackendsNum >= replicationNum && replicationNum > 1) {
Expand Down
2 changes: 1 addition & 1 deletion fe/src/main/java/org/apache/doris/clone/TabletChecker.java
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ private void checkTablets() {
partition.getVisibleVersion(),
partition.getVisibleVersionHash(),
olapTbl.getPartitionInfo().getReplicationNum(partition.getId()),
availableBackendsNum);
availableBackendsNum,false);

if (statusWithPrio.first == TabletStatus.HEALTHY) {
// Only set last status check time when status is healthy.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -803,7 +803,7 @@ public void finishCloneTask(CloneTask cloneTask, TFinishTaskRequest request)
short replicationNum = olapTable.getPartitionInfo().getReplicationNum(partitionId);
Pair<TabletStatus, TabletSchedCtx.Priority> pair = tablet.getHealthStatusWithPriority(
infoService, db.getClusterName(), visibleVersion, visibleVersionHash, replicationNum,
availableBackendsNum);
availableBackendsNum,false);
if (pair.first == TabletStatus.HEALTHY) {
throw new SchedException(Status.FINISHED, "tablet is healthy");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -500,7 +500,7 @@ private void scheduleTablet(TabletSchedCtx tabletCtx, AgentBatchTask batchTask)
partition.getVisibleVersion(),
partition.getVisibleVersionHash(),
tbl.getPartitionInfo().getReplicationNum(partition.getId()),
availableBackendsNum);
availableBackendsNum, false);
}

if (tabletCtx.getType() == TabletSchedCtx.Type.BALANCE && tableState != OlapTableState.NORMAL) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,20 +29,24 @@

public class IncompleteTabletsProcNode implements ProcNodeInterface {
public static final ImmutableList<String> TITLE_NAMES = new ImmutableList.Builder<String>()
.add("UnhealthyTablets").add("InconsistentTablets").add("CloningTablets")
.add("UnhealthyTablets").add("InconsistentTablets").add("CloningTablets").add("LostTablets")
.build();

private static final Joiner JOINER = Joiner.on(",");

Collection<Long> unhealthyTabletIds;
Collection<Long> inconsistentTabletIds;
Collection<Long> noAvlreplicaTabletIds;
Collection<Long> cloningTabletIds;

public IncompleteTabletsProcNode(Collection<Long> unhealthyTabletIds,
Collection<Long> inconsistentTabletIds,
Collection<Long> cloningTabletIds) {
Collection<Long> cloningTabletIds,
Collection<Long> noAvlreplicaTabletIds) {
this.unhealthyTabletIds = unhealthyTabletIds;
this.inconsistentTabletIds = inconsistentTabletIds;
this.cloningTabletIds = cloningTabletIds;
this.noAvlreplicaTabletIds = noAvlreplicaTabletIds;
}

@Override
Expand All @@ -55,10 +59,12 @@ public ProcResult fetchResult() throws AnalysisException {

String incompleteTablets = JOINER.join(Arrays.asList(unhealthyTabletIds));
String inconsistentTablets = JOINER.join(Arrays.asList(inconsistentTabletIds));
String noAvlreplicaTablets = JOINER.join(Arrays.asList(noAvlreplicaTabletIds));
String cloningTablets = JOINER.join(Arrays.asList(cloningTabletIds));
row.add(incompleteTablets);
row.add(inconsistentTablets);
row.add(cloningTablets);
row.add(noAvlreplicaTablets);

result.addRow(row);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ public class StatisticProcDir implements ProcDirInterface {
.add("DbId").add("DbName").add("TableNum").add("PartitionNum")
.add("IndexNum").add("TabletNum").add("ReplicaNum").add("UnhealthyTabletNum")
.add("InconsistentTabletNum").add("CloningTabletNum")
.add("LostTabletNum") // count of tablets without available replica.
.build();
private static final Logger LOG = LogManager.getLogger(StatisticProcDir.class);

Expand All @@ -63,12 +64,15 @@ public class StatisticProcDir implements ProcDirInterface {
Multimap<Long, Long> inconsistentTabletIds;
// db id -> set(tablet id)
Multimap<Long, Long> cloningTabletIds;
// db id -> set(tablet id)
Multimap<Long, Long> noAvlreplicaTabletIds;

public StatisticProcDir(Catalog catalog) {
this.catalog = catalog;
unhealthyTabletIds = HashMultimap.create();
inconsistentTabletIds = HashMultimap.create();
cloningTabletIds = HashMultimap.create();
noAvlreplicaTabletIds = HashMultimap.create();
}

@Override
Expand Down Expand Up @@ -96,6 +100,7 @@ public ProcResult fetchResult() throws AnalysisException {
unhealthyTabletIds.clear();
inconsistentTabletIds.clear();
cloningTabletIds = AgentTaskQueue.getTabletIdsByType(TTaskType.CLONE);
noAvlreplicaTabletIds.clear();
List<List<Comparable>> lines = new ArrayList<List<Comparable>>();
for (Long dbId : dbIds) {
if (dbId == 0) {
Expand Down Expand Up @@ -137,14 +142,19 @@ public ProcResult fetchResult() throws AnalysisException {
Pair<TabletStatus, Priority> res = tablet.getHealthStatusWithPriority(
infoService, db.getClusterName(),
partition.getVisibleVersion(), partition.getVisibleVersionHash(),
replicationNum, availableBackendsNum);
replicationNum, availableBackendsNum, true);

// here we treat REDUNDANT as HEALTHY, for user friendly.
if (res.first != TabletStatus.HEALTHY && res.first != TabletStatus.REDUNDANT
&& res.first != TabletStatus.COLOCATE_REDUNDANT && res.first != TabletStatus.NEED_FURTHER_REPAIR) {
unhealthyTabletIds.put(dbId, tablet.getId());
}

// record tablets that have no available replica.
if (res.first == TabletStatus.NO_AVAILABLE_REPLICA) {
noAvlreplicaTabletIds.put(dbId, tablet.getId());
}

if (!tablet.isConsistent()) {
inconsistentTabletIds.put(dbId, tablet.getId());
}
Expand All @@ -164,6 +174,7 @@ public ProcResult fetchResult() throws AnalysisException {
oneLine.add(unhealthyTabletIds.get(dbId).size());
oneLine.add(inconsistentTabletIds.get(dbId).size());
oneLine.add(cloningTabletIds.get(dbId).size());
oneLine.add(noAvlreplicaTabletIds.get(dbId).size());

lines.add(oneLine);

Expand Down Expand Up @@ -193,6 +204,7 @@ public ProcResult fetchResult() throws AnalysisException {
finalLine.add(unhealthyTabletIds.size());
finalLine.add(inconsistentTabletIds.size());
finalLine.add(cloningTabletIds.size());
finalLine.add(noAvlreplicaTabletIds.size());
lines.add(finalLine);

// add result
Expand Down Expand Up @@ -223,6 +235,7 @@ public ProcNodeInterface lookup(String dbIdStr) throws AnalysisException {

return new IncompleteTabletsProcNode(unhealthyTabletIds.get(dbId),
inconsistentTabletIds.get(dbId),
cloningTabletIds.get(dbId));
cloningTabletIds.get(dbId),
noAvlreplicaTabletIds.get(dbId));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -1030,7 +1030,7 @@ private static void addReplica(long tabletId, TTabletInfo backendTabletInfo, lon
int availableBackendsNum = infoService.getClusterBackendIds(db.getClusterName(), true).size();
Pair<TabletStatus, TabletSchedCtx.Priority> status = tablet.getHealthStatusWithPriority(infoService,
db.getClusterName(), visibleVersion, visibleVersionHash,
replicationNum, availableBackendsNum);
replicationNum, availableBackendsNum, false);

if (status.first == TabletStatus.VERSION_INCOMPLETE || status.first == TabletStatus.REPLICA_MISSING) {
long lastFailedVersion = -1L;
Expand Down