Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -632,7 +632,25 @@ private void handleReplicaVersionIncomplete(TabletSchedCtx tabletCtx, AgentBatch
throw new SchedException(Status.UNRECOVERABLE, "cluster does not exist");
}

tabletCtx.chooseDestReplicaForVersionIncomplete(backendsWorkingSlots);
try {
tabletCtx.chooseDestReplicaForVersionIncomplete(backendsWorkingSlots);
} catch (SchedException e) {
if (e.getMessage().equals("unable to choose dest replica")) {
// This situation may occur when the BE nodes where all replicas of a tablet are located are decommission,
// and this task is a VERSION_INCOMPLETE task.
// This will lead to failure to select a suitable dest replica.
// At this time, we try to convert this task to a REPLICA_MISSING task, and schedule it again.
LOG.debug("failed to find version incomplete replica for VERSION_INCOMPLETE task. tablet id: {}, "
+ "try to find a new backend", tabletCtx.getTabletId());
tabletCtx.releaseResource(this, true);
tabletCtx.setTabletStatus(TabletStatus.REPLICA_MISSING);
handleReplicaMissing(tabletCtx, batchTask);
LOG.debug("succeed to find new backend for VERSION_INCOMPLETE task. tablet id: {}", tabletCtx.getTabletId());
return;
} else {
throw e;
}
}
tabletCtx.chooseSrcReplicaForVersionIncomplete(backendsWorkingSlots);

// create clone task
Expand Down Expand Up @@ -660,18 +678,19 @@ private void handleReplicaRelocating(TabletSchedCtx tabletCtx, AgentBatchTask ba
stat.counterReplicaUnavailableErr.incrementAndGet();
try {
handleReplicaVersionIncomplete(tabletCtx, batchTask);
LOG.info("succeed to find version incomplete replica from tablet relocating. tablet id: {}",
LOG.debug("succeed to find version incomplete replica from tablet relocating. tablet id: {}",
tabletCtx.getTabletId());
} catch (SchedException e) {
if (e.getStatus() == Status.SCHEDULE_FAILED) {
LOG.info("failed to find version incomplete replica from tablet relocating. tablet id: {}, "
LOG.debug("failed to find version incomplete replica from tablet relocating. tablet id: {}, "
+ "try to find a new backend", tabletCtx.getTabletId());
// the dest or src slot may be taken after calling handleReplicaVersionIncomplete(),
// so we need to release these slots first.
// and reserve the tablet in TabletSchedCtx so that it can continue to be scheduled.
tabletCtx.releaseResource(this, true);
tabletCtx.setTabletStatus(TabletStatus.REPLICA_MISSING);
handleReplicaMissing(tabletCtx, batchTask);
LOG.info("succeed to find new backend for tablet relocating. tablet id: {}", tabletCtx.getTabletId());
LOG.debug("succeed to find new backend for tablet relocating. tablet id: {}", tabletCtx.getTabletId());
} else {
throw e;
}
Expand Down Expand Up @@ -1057,8 +1076,17 @@ private RootPathLoadStatistic chooseAvailableDestPath(TabletSchedCtx tabletCtx,
resultPaths, tabletCtx.getTabletStatus() != TabletStatus.REPLICA_RELOCATING
/* if REPLICA_RELOCATING, then it is not a supplement task */);
if (!st.ok()) {
LOG.debug("unable to find path for supplementing tablet: {}. {}", tabletCtx, st);
continue;
// This is to solve, when we decommission some BEs with SSD disks,
// if there are no SSD disks on the remaining BEs, it will be impossible to select a
// suitable destination path.
// In this case, we need to ignore the storage medium property and try to select the destination path again.
// Set `isSupplement` to true will ignore the storage medium property.
st = bes.isFit(tabletCtx.getTabletSize(), tabletCtx.getStorageMedium(),
resultPaths, true);
if (!st.ok()) {
LOG.debug("unable to find path for supplementing tablet: {}. {}", tabletCtx, st);
continue;
}
}

Preconditions.checkState(resultPaths.size() == 1);
Expand Down