[minor](clone) add more debug log for tablet scheduler (#19892)

Sometimes the tablet scheduler fails to schedule a tablet and leaves no information that helps with debugging.
This commit adds debug logs to that scheduling process.
No logic is changed.
This commit is contained in:
Mingyu Chen
2023-05-20 15:59:26 +08:00
committed by GitHub
parent 8b9813663d
commit 777bdce5a5
3 changed files with 68 additions and 7 deletions

View File

@ -303,6 +303,8 @@ public class BackendLoadStatistic {
RootPathLoadStatistic pathStatistic = pathStatistics.get(i);
// if this is a supplement task, ignore the storage medium
if (!isSupplement && pathStatistic.getStorageMedium() != medium) {
    // Keep the number of {} placeholders equal to the number of arguments:
    // the path's actual medium is the third argument and the requested medium the fourth.
    // The previous trailing "actual: {}" had no matching argument.
    LOG.debug("backend {} path {}'s storage medium {} is not {} storage medium",
            beId, pathStatistic.getPath(), pathStatistic.getStorageMedium(), medium);
    continue;
}

View File

@ -488,6 +488,7 @@ public class TabletSchedCtx implements Comparable<TabletSchedCtx> {
if (backend == null) {
// containsBE() is currently only used for choosing dest backend to do clone task.
// return true so that it won't choose this backend.
LOG.debug("desc backend {} does not exist, skip. tablet: {}", beId, tabletId);
return true;
}
String host = backend.getHost();
@ -495,13 +496,18 @@ public class TabletSchedCtx implements Comparable<TabletSchedCtx> {
Backend be = infoService.getBackend(replica.getBackendId());
if (be == null) {
// BE has been dropped, skip it
LOG.debug("replica's backend {} does not exist, skip. tablet: {}", replica.getBackendId(), tabletId);
continue;
}
if (!Config.allow_replica_on_same_host && !FeConstants.runningUnitTest && host.equals(be.getHost())) {
LOG.debug("replica's backend {} is on same host {}, skip. tablet: {}",
replica.getBackendId(), host, tabletId);
return true;
}
if (replica.getBackendId() == beId) {
LOG.debug("replica's backend {} is same as dest backend {}, skip. tablet: {}",
replica.getBackendId(), beId, tabletId);
return true;
}
}
@ -557,24 +563,34 @@ public class TabletSchedCtx implements Comparable<TabletSchedCtx> {
List<Replica> candidates = Lists.newArrayList();
for (Replica replica : tablet.getReplicas()) {
if (exceptBeId != -1 && replica.getBackendId() == exceptBeId) {
LOG.debug("replica's backend {} is same as except backend {}, skip. tablet: {}",
replica.getBackendId(), exceptBeId, tabletId);
continue;
}
if (replica.isBad() || replica.tooSlow()) {
LOG.debug("replica {} is bad({}) or too slow({}), skip. tablet: {}",
replica.getId(), replica.isBad(), replica.tooSlow(), tabletId);
continue;
}
Backend be = infoService.getBackend(replica.getBackendId());
if (be == null || !be.isAlive()) {
// backend which is in decommission can still be the source backend
LOG.debug("replica's backend {} does not exist or is not alive, skip. tablet: {}",
replica.getBackendId(), tabletId);
continue;
}
if (replica.getLastFailedVersion() > 0) {
LOG.debug("replica {} has failed version {}, skip. tablet: {}",
replica.getId(), replica.getLastFailedVersion(), tabletId);
continue;
}
if (!replica.checkVersionCatchUp(visibleVersion, false)) {
LOG.debug("replica {} version {} has not catch up to visible version {}, skip. tablet: {}",
replica.getId(), replica.getVersion(), visibleVersion, tabletId);
continue;
}
@ -591,14 +607,19 @@ public class TabletSchedCtx implements Comparable<TabletSchedCtx> {
for (Replica srcReplica : candidates) {
PathSlot slot = backendsWorkingSlots.get(srcReplica.getBackendId());
if (slot == null) {
LOG.debug("replica's backend {} does not have working slot, skip. tablet: {}",
srcReplica.getBackendId(), tabletId);
continue;
}
long srcPathHash = slot.takeSlot(srcReplica.getPathHash());
if (srcPathHash != -1) {
setSrc(srcReplica);
return;
if (srcPathHash == -1) {
LOG.debug("replica's backend {} does not have available slot, skip. tablet: {}",
srcReplica.getBackendId(), tabletId);
continue;
}
setSrc(srcReplica);
return;
}
throw new SchedException(Status.SCHEDULE_FAILED, "unable to find source slot");
}
@ -629,11 +650,15 @@ public class TabletSchedCtx implements Comparable<TabletSchedCtx> {
Replica chosenReplica = null;
for (Replica replica : tablet.getReplicas()) {
if (replica.isBad()) {
LOG.debug("replica {} is bad, skip. tablet: {}",
replica.getId(), tabletId);
continue;
}
Backend be = infoService.getBackend(replica.getBackendId());
if (be == null || !be.isScheduleAvailable()) {
LOG.debug("replica's backend {} does not exist or is not scheduler available, skip. tablet: {}",
replica.getBackendId(), tabletId);
continue;
}
@ -644,10 +669,14 @@ public class TabletSchedCtx implements Comparable<TabletSchedCtx> {
&& ((replica.getVersion() == visibleVersion)
|| replica.getVersion() > visibleVersion) && replica.getState() != ReplicaState.DECOMMISSION) {
// skip healthy replica
LOG.debug("replica {} version {} is healthy, visible version {}, replica state {}, skip. tablet: {}",
replica.getId(), replica.getVersion(), visibleVersion, replica.getState(), tabletId);
continue;
}
if (replica.needFurtherRepair()) {
LOG.debug("replica {} need further repair, choose it. tablet: {}",
replica.getId(), tabletId);
chosenReplica = replica;
break;
}

View File

@ -1311,11 +1311,15 @@ public class TabletScheduler extends MasterDaemon {
List<RootPathLoadStatistic> allFitPaths = Lists.newArrayList();
for (BackendLoadStatistic bes : beStatistics) {
if (!bes.isAvailable()) {
LOG.debug("backend {} is not available, skip. tablet: {}", bes.getBeId(), tabletCtx.getTabletId());
continue;
}
// exclude BE which already has replica of this tablet or another BE at same host has this replica
if (tabletCtx.filterDestBE(bes.getBeId())) {
LOG.debug("backend {} already has replica of this tablet or another BE "
+ "at same host has this replica, skip. tablet: {}",
bes.getBeId(), tabletCtx.getTabletId());
continue;
}
@ -1323,9 +1327,13 @@ public class TabletScheduler extends MasterDaemon {
// Else, check the tag.
if (forColocate) {
if (!tabletCtx.getColocateBackendsSet().contains(bes.getBeId())) {
LOG.debug("backend {} is not in colocate backend set, skip. tablet: {}",
bes.getBeId(), tabletCtx.getTabletId());
continue;
}
} else if (!bes.getTag().equals(tag)) {
LOG.debug("backend {}'s tag {} is not equal to tablet's tag {}, skip. tablet: {}",
bes.getBeId(), bes.getTag(), tag, tabletCtx.getTabletId());
continue;
}
@ -1334,6 +1342,7 @@ public class TabletScheduler extends MasterDaemon {
resultPaths, tabletCtx.getTabletStatus() != TabletStatus.REPLICA_RELOCATING
/* if REPLICA_RELOCATING, then it is not a supplement task */);
if (!st.ok()) {
LOG.debug("unable to find path for tablet: {}. {}", tabletCtx, st);
// This is to solve, when we decommission some BEs with SSD disks,
// if there are no SSD disks on the remaining BEs, it will be impossible to select a
// suitable destination path.
@ -1361,31 +1370,50 @@ public class TabletScheduler extends MasterDaemon {
// we try to find a path with specified media type, if not find, arbitrarily use one.
for (RootPathLoadStatistic rootPathLoadStatistic : allFitPaths) {
if (rootPathLoadStatistic.getStorageMedium() != tabletCtx.getStorageMedium()) {
LOG.debug("backend {}'s path {}'s storage medium {} "
+ "is not equal to tablet's storage medium {}, skip. tablet: {}",
rootPathLoadStatistic.getBeId(), rootPathLoadStatistic.getPathHash(),
rootPathLoadStatistic.getStorageMedium(), tabletCtx.getStorageMedium(),
tabletCtx.getTabletId());
continue;
}
PathSlot slot = backendsWorkingSlots.get(rootPathLoadStatistic.getBeId());
if (slot == null) {
LOG.debug("backend {}'s path {}'s slot is null, skip. tablet: {}",
rootPathLoadStatistic.getBeId(), rootPathLoadStatistic.getPathHash(),
tabletCtx.getTabletId());
continue;
}
long pathHash = slot.takeSlot(rootPathLoadStatistic.getPathHash());
if (pathHash != -1) {
return rootPathLoadStatistic;
if (pathHash == -1) {
LOG.debug("backend {}'s path {}'s slot is full, skip. tablet: {}",
rootPathLoadStatistic.getBeId(), rootPathLoadStatistic.getPathHash(),
tabletCtx.getTabletId());
continue;
}
return rootPathLoadStatistic;
}
// no root path with specified media type is found, get arbitrary one.
for (RootPathLoadStatistic rootPathLoadStatistic : allFitPaths) {
PathSlot slot = backendsWorkingSlots.get(rootPathLoadStatistic.getBeId());
if (slot == null) {
LOG.debug("backend {}'s path {}'s slot is null, skip. tablet: {}",
rootPathLoadStatistic.getBeId(), rootPathLoadStatistic.getPathHash(),
tabletCtx.getTabletId());
continue;
}
long pathHash = slot.takeSlot(rootPathLoadStatistic.getPathHash());
if (pathHash != -1) {
return rootPathLoadStatistic;
if (pathHash == -1) {
LOG.debug("backend {}'s path {}'s slot is full, skip. tablet: {}",
rootPathLoadStatistic.getBeId(), rootPathLoadStatistic.getPathHash(),
tabletCtx.getTabletId());
continue;
}
return rootPathLoadStatistic;
}
throw new SchedException(Status.SCHEDULE_FAILED, "unable to find dest path which can be fit in");
@ -1712,10 +1740,12 @@ public class TabletScheduler extends MasterDaemon {
Slot slot = pathSlots.get(pathHash);
if (slot == null) {
LOG.debug("path {} is not exist", pathHash);
return -1;
}
slot.rectify();
if (slot.available <= 0) {
LOG.debug("path {} has no available slot", pathHash);
return -1;
}
slot.available--;