[improvement](tablet clone) partition balance should invalidate the cached tablet move when scheduling fails (#25602)

This commit is contained in:
yujun
2023-10-20 19:35:16 +08:00
committed by GitHub
parent fded8ba824
commit f1b81fafd4
4 changed files with 34 additions and 5 deletions

View File

@ -92,6 +92,18 @@ public class MovesCacheMap {
return null;
}
/**
 * Removes the cached move entry for the given tablet.
 *
 * <p>The per-medium caches are looked up by the tablet's tag. When a cache exists for the
 * tablet's storage medium, only that cache is touched; otherwise the tablet id is
 * invalidated in every medium cache registered under the tag. Nothing happens when the
 * tag has no entry in {@code cacheMap}.
 *
 * @param tabletCtx the tablet whose cached move should be dropped
 */
public void invalidateTablet(TabletSchedCtx tabletCtx) {
    Map<TStorageMedium, MovesCache> movesByMedium = cacheMap.get(tabletCtx.getTag());
    if (movesByMedium == null) {
        // No caches registered for this tag: nothing to invalidate.
        return;
    }

    MovesCache mediumCache = movesByMedium.get(tabletCtx.getStorageMedium());
    if (mediumCache != null) {
        mediumCache.get().invalidate(tabletCtx.getTabletId());
        return;
    }

    // No cache for the tablet's medium: fall back to clearing the tablet from all of them.
    for (MovesCache candidate : movesByMedium.values()) {
        candidate.get().invalidate(tabletCtx.getTabletId());
    }
}
// For given tablet ctx, find it in cacheMap
public Pair<PartitionRebalancer.TabletMove, Long> getTabletMove(TabletSchedCtx tabletCtx) {
for (Map<TStorageMedium, MovesCache> mediumMap : cacheMap.values()) {

View File

@ -306,13 +306,18 @@ public class PartitionRebalancer extends Rebalancer {
// To be improved
}
/**
 * Invalidates the cached move of a tablet that the scheduler reported as failed.
 *
 * @param tabletCtx the tablet whose cached move should be dropped
 */
@Override
public void onTabletFailed(TabletSchedCtx tabletCtx) {
// Delegate to the moves cache, which removes the entry keyed by this tablet's id.
movesCacheMap.invalidateTablet(tabletCtx);
}
@Override
public Long getToDeleteReplicaId(TabletSchedCtx tabletCtx) {
// We don't invalidate the cached move here, cuz the redundant repair progress is just started.
// The move should be invalidated by TTL or Algo.CheckMoveCompleted()
Pair<TabletMove, Long> pair = movesCacheMap.getTabletMove(tabletCtx);
if (pair != null) {
Preconditions.checkState(pair.second != -1L);
//Preconditions.checkState(pair.second != -1L);
return pair.second;
} else {
return (long) -1;

View File

@ -100,6 +100,9 @@ public abstract class Rebalancer {
return -1L;
}
/**
 * Hook called with a tablet that was reported as failed.
 *
 * <p>The base implementation intentionally does nothing; subclasses may override it to
 * clean up any state they keep for the tablet.
 *
 * @param tabletCtx the tablet that failed
 */
public void onTabletFailed(TabletSchedCtx tabletCtx) {
}
/**
 * Replaces the load statistics held by this rebalancer.
 *
 * @param statisticMap the new load statistics, keyed by tag
 */
public void updateLoadStatistic(Map<Tag, LoadStatisticForTag> statisticMap) {
// The caller's map reference is stored directly; no copy is made.
this.statisticMap = statisticMap;
}

View File

@ -1234,13 +1234,15 @@ public class TabletScheduler extends MasterDaemon {
List<TabletSchedCtx> alternativeTablets = rebalancer.selectAlternativeTablets();
Collections.shuffle(alternativeTablets);
for (TabletSchedCtx tabletCtx : alternativeTablets) {
if (addTablet(tabletCtx, false) == AddResult.ADDED) {
if (needAddBalanceNum > 0 && addTablet(tabletCtx, false) == AddResult.ADDED) {
needAddBalanceNum--;
if (needAddBalanceNum <= 0) {
return;
}
} else {
rebalancer.onTabletFailed(tabletCtx);
}
}
if (needAddBalanceNum <= 0) {
return;
}
if (Config.disable_disk_balance) {
LOG.info("disk balance is disabled. skip selecting tablets for disk balance");
return;
@ -1448,6 +1450,13 @@ public class TabletScheduler extends MasterDaemon {
private void finalizeTabletCtx(TabletSchedCtx tabletCtx, TabletSchedCtx.State state, Status status, String reason) {
if (state == TabletSchedCtx.State.CANCELLED || state == TabletSchedCtx.State.UNEXPECTED) {
if (tabletCtx.getType() == TabletSchedCtx.Type.BALANCE
&& tabletCtx.getBalanceType() == TabletSchedCtx.BalanceType.BE_BALANCE) {
rebalancer.onTabletFailed(tabletCtx);
}
}
// use 2 steps to avoid nested database lock and synchronized.(releaseTabletCtx() may hold db lock)
// remove the tablet ctx, so that no other process can see it
removeTabletCtx(tabletCtx, reason);