diff --git a/fe/fe-core/src/main/java/org/apache/doris/clone/ColocateTableCheckerAndBalancer.java b/fe/fe-core/src/main/java/org/apache/doris/clone/ColocateTableCheckerAndBalancer.java index 37030dc088..142137e203 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/clone/ColocateTableCheckerAndBalancer.java +++ b/fe/fe-core/src/main/java/org/apache/doris/clone/ColocateTableCheckerAndBalancer.java @@ -427,6 +427,17 @@ public class ColocateTableCheckerAndBalancer extends MasterDaemon { continue; } + // Unavailable BEs have already been removed from backendWithReplicaNum, + // but the criteria getUnavailableBeIdsInGroup uses to judge a BE + // unavailable may be too loose. Under the default configuration + // (colocate_group_relocate_delay_second = 1800), a BE that has been + // out of contact for 20 minutes can still be selected as the dest BE, + // so check liveness explicitly here. + if (!destBe.isAlive()) { + LOG.info("{} is not alive, not suitable as a dest be", destBe); + continue; + } + for (int seqIndex : seqIndexes) { // the bucket index. // eg: 0 / 3 = 0, so that the bucket index of the 4th backend id in flatBackendsPerBucketSeq is 0. 
diff --git a/fe/fe-core/src/test/java/org/apache/doris/clone/ColocateTableCheckerAndBalancerTest.java b/fe/fe-core/src/test/java/org/apache/doris/clone/ColocateTableCheckerAndBalancerTest.java index a549bd7e08..54ea8a77e7 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/clone/ColocateTableCheckerAndBalancerTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/clone/ColocateTableCheckerAndBalancerTest.java @@ -74,6 +74,16 @@ public class ColocateTableCheckerAndBalancerTest { backend8 = new Backend(8L, "192.168.1.8", 9050); backend9 = new Backend(9L, "192.168.1.8", 9050); + backend1.setAlive(true); + backend2.setAlive(true); + backend3.setAlive(true); + backend4.setAlive(true); + backend5.setAlive(true); + backend6.setAlive(true); + backend7.setAlive(true); + backend8.setAlive(true); + backend9.setAlive(true); + mixLoadScores = Maps.newHashMap(); mixLoadScores.put(1L, 0.1); mixLoadScores.put(2L, 0.5);