Handle the situation when there is no enough backends for tablet repair (#1299)

If there are only 3 backends and replication num is 3. If one replica of a tablet is bad, there is no 4th backend for tablet repair. So we need to delete a bad replica first to make room for new replica.
2019-06-14 20:28:29 +08:00
parent c8d7b8e1c4
commit 5c2cf9f2ce
10 changed files with 130 additions and 50 deletions
--- a/docs/documentation/cn/administrator-guide/operation/tablet_repair_and_balance.md
+++ b/docs/documentation/cn/administrator-guide/operation/tablet_repair_and_balance.md
@ -69,8 +69,12 @@
 5. REDUNDANT

    副本冗余。健康副本都在对应 cluster 内，但数量大于期望副本数。或者有多余的 unavailable 副本。
+
+6. FORCE\_REDUNDANT
+
+    这是一个特殊状态。只会出现在当期望副本数大于等于可用节点数时，并且 Tablet 处于副本缺失状态时出现。这种情况下，需要先删除一个副本，以保证有可用节点用于创建新副本。
    
-6. HEALTHY
+7. HEALTHY

    健康分片，即条件[1-5]都不满足。
    
@ -107,6 +111,11 @@ TabletChecker 作为常驻的后台进程，会定期检查所有分片的状态
    6. 副本所在 cluster 不正确
    7. 副本所在 BE 节点负载高

+5. FORCE\_REDUNDANT
+
+    不同于 REDUNDANT，因为此时虽然 Tablet 有副本缺失，但是因为已经没有额外的可用节点用于创建新的副本了。所以此时必须先删除一个副本，以腾出一个可用节点用于创建新的副本。
+    删除副本的顺序同 REDUNDANT。
+
 ### 调度优先级

 TabletScheduler 里等待被调度的分片会根据状态不同，赋予不同的优先级。优先级高的分片将会被优先调度。目前有以下几种优先级。
@ -114,6 +123,7 @@ TabletScheduler 里等待被调度的分片会根据状态不同，赋予不同
 1. VERY\_HIGH

    * REDUNDANT。对于有副本冗余的分片，我们优先处理。虽然逻辑上来讲，副本冗余的紧急程度最低，但是因为这种情况处理起来最快且可以快速释放资源（比如磁盘空间等），所以我们优先处理。
+    * FORCE\_REDUNDANT。同上。

 2. HIGH

--- a/fe/src/main/java/org/apache/doris/catalog/OlapTable.java
+++ b/fe/src/main/java/org/apache/doris/catalog/OlapTable.java
@ -974,6 +974,7 @@ public class OlapTable extends Table {
    }

    public boolean isStable(SystemInfoService infoService, TabletScheduler tabletScheduler, String clusterName) {
+        int availableBackendsNum = infoService.getClusterBackendIds(clusterName, true).size();
        for (Partition partition : idToPartition.values()) {
            long visibleVersion = partition.getVisibleVersion();
            long visibleVersionHash = partition.getVisibleVersionHash();
@ -985,7 +986,8 @@ public class OlapTable extends Table {
                    }

                    Pair<TabletStatus, TabletSchedCtx.Priority> statusPair = tablet.getHealthStatusWithPriority(
-                            infoService, clusterName, visibleVersion, visibleVersionHash, replicationNum);
+                            infoService, clusterName, visibleVersion, visibleVersionHash, replicationNum,
+                            availableBackendsNum);
                    if (statusPair.first != TabletStatus.HEALTHY) {
                        return false;
                    }
--- a/fe/src/main/java/org/apache/doris/catalog/Tablet.java
+++ b/fe/src/main/java/org/apache/doris/catalog/Tablet.java
@ -54,6 +54,8 @@ public class Tablet extends MetaObject implements Writable {
        REPLICA_RELOCATING, // replica is healthy, but is under relocating (eg. BE is decommission)
        REDUNDANT, // too much replicas
        REPLICA_MISSING_IN_CLUSTER, // not enough healthy replicas in correct cluster
+        FORCE_REDUNDANT, // some replica is missing or bad, but there is no other backends for repair,
+                         // at least one replica has to be deleted first to make room for new replica.
    }

    private long id;
@ -371,13 +373,14 @@ public class Tablet extends MetaObject implements Writable {
     */
    public Pair<TabletStatus, TabletSchedCtx.Priority> getHealthStatusWithPriority(
            SystemInfoService systemInfoService, String clusterName,
-            long visibleVersion, long visibleVersionHash, int replicationNum) {
+            long visibleVersion, long visibleVersionHash, int replicationNum,
+            int availableBackendsNum) {

        int alive = 0;
        int aliveAndVersionComplete = 0;
        int stable = 0;
        int availableInCluster = 0;
-
+        
        for (Replica replica : replicas) {
            long backendId = replica.getBackendId();
            Backend backend = systemInfoService.getBackend(backendId);
@ -409,7 +412,18 @@ public class Tablet extends MetaObject implements Writable {
        }

        // 1. alive replicas are not enough
-        if (alive < (replicationNum / 2) + 1) {
+        if (alive < replicationNum && replicas.size() >= availableBackendsNum
+                && availableBackendsNum >= replicationNum && replicationNum > 1) {
+            // there is no enough backend for us to create a new replica, so we have to delete an existing replica,
+            // so there can be available backend for us to create a new replica.
+            // And if there is only one replica, we will not handle it(maybe need human interference)
+            // condition explain:
+            // 1. alive < replicationNum: replica is missing or bad
+            // 2. replicas.size() >= availableBackendsNum: the existing replicas occupies all available backends
+            // 3. availableBackendsNum >= replicationNum: make sure after deleting, there will be a backend for new replica.
+            // 4. replicationNum > 1: if replication num is set to 1, do not delete any replica, for safety reason
+            return Pair.create(TabletStatus.FORCE_REDUNDANT, TabletSchedCtx.Priority.VERY_HIGH);
+        } else if (alive < (replicationNum / 2) + 1) {
            return Pair.create(TabletStatus.REPLICA_MISSING, TabletSchedCtx.Priority.HIGH);
        } else if (alive < replicationNum) {
            return Pair.create(TabletStatus.REPLICA_MISSING, TabletSchedCtx.Priority.NORMAL);
--- a/fe/src/main/java/org/apache/doris/catalog/TabletInvertedIndex.java
+++ b/fe/src/main/java/org/apache/doris/catalog/TabletInvertedIndex.java
@ -151,12 +151,13 @@ public class TabletInvertedIndex {
                                if (needRecover(replica, tabletMeta.getOldSchemaHash(), backendTabletInfo)) {
                                    LOG.warn("replica {} of tablet {} on backend {} need recovery. "
                                            + "replica in FE: {}, report version {}-{}, report schema hash: {},"
-                                            + " is bad: {}",
+                                            + " is bad: {}, is version missing: {}",
                                            replica.getId(), tabletId, backendId, replica,
                                            backendTabletInfo.getVersion(),
                                            backendTabletInfo.getVersion_hash(),
                                            backendTabletInfo.getSchema_hash(),
-                                            backendTabletInfo.isSetUsed() ? backendTabletInfo.isUsed() : "unknown");
+                                            backendTabletInfo.isSetUsed() ? backendTabletInfo.isUsed() : "unknown",
+                                            backendTabletInfo.isSetVersion_miss() ? backendTabletInfo.isVersion_miss() : "unset");
                                    tabletRecoveryMap.put(tabletMeta.getDbId(), tabletId);
                                }

@ -342,6 +343,17 @@ public class TabletInvertedIndex {
     * because of some unrecoverable failure.
     */
    private boolean needRecover(Replica replicaInFe, int schemaHashInFe, TTabletInfo backendTabletInfo) {
+        if (replicaInFe.getState() != ReplicaState.NORMAL) {
+            // only normal replica need recover
+            // case:
+            // the replica's state is CLONE, which means this a newly created replica in clone process.
+            // and an old out-of-date replica reports here, and this report should not mark this replica as
+            // 'need recovery'.
+            // Other state such as ROLLUP/SCHEMA_CHANGE, the replica behavior is unknown, so for safety reason,
+            // also not mark this replica as 'need recovery'.
+            return false;
+        }
+
        if (backendTabletInfo.isSetUsed() && !backendTabletInfo.isUsed()) {
            // tablet is bad
            return true;
@ -372,6 +384,7 @@ public class TabletInvertedIndex {
            // even if backend version is less than fe's version, but if version_miss is false,
            // which means this may be a stale report.
            // so we only return true if version_miss is true.
+            LOG.info("replicaversion missing");
            return true;
        }
        return false;
--- a/fe/src/main/java/org/apache/doris/clone/TabletChecker.java
+++ b/fe/src/main/java/org/apache/doris/clone/TabletChecker.java
@ -191,6 +191,7 @@ public class TabletChecker extends Daemon {

            db.readLock();
            try {
+                int availableBackendsNum = infoService.getClusterBackendIds(db.getClusterName(), true).size();
                for (Table table : db.getTables()) {
                    if (!table.needSchedule()) {
                        continue;
@ -210,11 +211,12 @@ public class TabletChecker extends Daemon {
                                }
                                
                                Pair<TabletStatus, TabletSchedCtx.Priority> statusWithPrio = tablet.getHealthStatusWithPriority(
-                                    infoService,
-                                    db.getClusterName(),
-                                    partition.getVisibleVersion(),
-                                    partition.getVisibleVersionHash(),
-                                    olapTbl.getPartitionInfo().getReplicationNum(partition.getId()));
+                                        infoService,
+                                        db.getClusterName(),
+                                        partition.getVisibleVersion(),
+                                        partition.getVisibleVersionHash(),
+                                        olapTbl.getPartitionInfo().getReplicationNum(partition.getId()),
+                                        availableBackendsNum);

                                if (statusWithPrio.first == TabletStatus.HEALTHY) {
                                    // Only set last status check time when status is healthy.
--- a/fe/src/main/java/org/apache/doris/clone/TabletSchedCtx.java
+++ b/fe/src/main/java/org/apache/doris/clone/TabletSchedCtx.java
@ -752,9 +752,11 @@ public class TabletSchedCtx implements Comparable<TabletSchedCtx> {
                throw new SchedException(Status.UNRECOVERABLE, "tablet does not exist");
            }
            
+            int availableBackendsNum = infoService.getClusterBackendIds(db.getClusterName(), true).size();
            short replicationNum = olapTable.getPartitionInfo().getReplicationNum(partitionId);
            Pair<TabletStatus, TabletSchedCtx.Priority> pair = tablet.getHealthStatusWithPriority(
-                    infoService, db.getClusterName(), visibleVersion, visibleVersionHash, replicationNum);
+                    infoService, db.getClusterName(), visibleVersion, visibleVersionHash, replicationNum,
+                    availableBackendsNum);
            if (pair.first == TabletStatus.HEALTHY) {
                throw new SchedException(Status.FINISHED, "tablet is healthy");
            }
--- a/fe/src/main/java/org/apache/doris/clone/TabletScheduler.java
+++ b/fe/src/main/java/org/apache/doris/clone/TabletScheduler.java
@ -43,6 +43,7 @@ import org.apache.doris.task.AgentTask;
 import org.apache.doris.task.AgentTaskExecutor;
 import org.apache.doris.task.AgentTaskQueue;
 import org.apache.doris.task.CloneTask;
+import org.apache.doris.task.DropReplicaTask;
 import org.apache.doris.thrift.TFinishTaskRequest;

 import com.google.common.base.Preconditions;
@ -471,11 +472,13 @@ public class TabletScheduler extends Daemon {
            Tablet tablet = idx.getTablet(tabletCtx.getTabletId());
            Preconditions.checkNotNull(tablet);

+            int availableBackendsNum = infoService.getClusterBackendIds(db.getClusterName(), true).size();
            statusPair = tablet.getHealthStatusWithPriority(
                    infoService, tabletCtx.getCluster(),
                    partition.getVisibleVersion(),
                    partition.getVisibleVersionHash(),
-                    tbl.getPartitionInfo().getReplicationNum(partition.getId()));
+                    tbl.getPartitionInfo().getReplicationNum(partition.getId()),
+                    availableBackendsNum);

            if (tabletCtx.getType() == TabletSchedCtx.Type.BALANCE && tableState != OlapTableState.NORMAL) {
                // If table is under ALTER process, do not allow to do balance.
@ -494,9 +497,9 @@ public class TabletScheduler extends Daemon {
                throw new SchedException(Status.UNRECOVERABLE, "tablet is healthy");
            } else if (statusPair.first != TabletStatus.HEALTHY
                    && tabletCtx.getType() == TabletSchedCtx.Type.BALANCE) {
-                tabletCtx.releaseResource(this);
                // we select an unhealthy tablet to do balance, which is not right.
                // so here we change it to a REPAIR task, and also reset its priority
+                tabletCtx.releaseResource(this);
                tabletCtx.setType(TabletSchedCtx.Type.REPAIR);
                tabletCtx.setOrigPriority(statusPair.second);
                tabletCtx.setLastSchedTime(currentTime);
@ -531,7 +534,10 @@ public class TabletScheduler extends Daemon {
                    handleReplicaRelocating(tabletCtx, batchTask);
                    break;
                case REDUNDANT:
-                    handleRedundantReplica(tabletCtx);
+                    handleRedundantReplica(tabletCtx, false);
+                    break;
+                case FORCE_REDUNDANT:
+                    handleRedundantReplica(tabletCtx, true);
                    break;
                case REPLICA_MISSING_IN_CLUSTER:
                    handleReplicaClusterMigration(tabletCtx, batchTask);
@ -621,16 +627,16 @@ public class TabletScheduler extends Daemon {
     *  7. replica not in right cluster
     *  8. replica in higher load backend
     */
-    private void handleRedundantReplica(TabletSchedCtx tabletCtx) throws SchedException {
+    private void handleRedundantReplica(TabletSchedCtx tabletCtx, boolean force) throws SchedException {
        stat.counterReplicaRedundantErr.incrementAndGet();
-        if (deleteBackendDropped(tabletCtx)
-                || deleteBadReplica(tabletCtx)
-                || deleteBackendUnavailable(tabletCtx)
-                || deleteCloneReplica(tabletCtx)
-                || deleteReplicaWithFailedVersion(tabletCtx)
-                || deleteReplicaWithLowerVersion(tabletCtx)
-                || deleteReplicaNotInCluster(tabletCtx)
-                || deleteReplicaOnHighLoadBackend(tabletCtx)) {
+        if (deleteBackendDropped(tabletCtx, force)
+                || deleteBadReplica(tabletCtx, force)
+                || deleteBackendUnavailable(tabletCtx, force)
+                || deleteCloneReplica(tabletCtx, force)
+                || deleteReplicaWithFailedVersion(tabletCtx, force)
+                || deleteReplicaWithLowerVersion(tabletCtx, force)
+                || deleteReplicaNotInCluster(tabletCtx, force)
+                || deleteReplicaOnHighLoadBackend(tabletCtx, force)) {
            // if we delete at least one redundant replica, we still throw a SchedException with status FINISHED
            // to remove this tablet from the pendingTablets(consider it as finished)
            throw new SchedException(Status.FINISHED, "redundant replica is deleted");
@ -638,28 +644,28 @@ public class TabletScheduler extends Daemon {
        throw new SchedException(Status.SCHEDULE_FAILED, "unable to delete any redundant replicas");
    }

-    private boolean deleteBackendDropped(TabletSchedCtx tabletCtx) {
+    private boolean deleteBackendDropped(TabletSchedCtx tabletCtx, boolean force) {
        for (Replica replica : tabletCtx.getReplicas()) {
            long beId = replica.getBackendId();
            if (infoService.getBackend(beId) == null) {
-                deleteReplicaInternal(tabletCtx, replica, "backend dropped");
+                deleteReplicaInternal(tabletCtx, replica, "backend dropped", force);
                return true;
            }
        }
        return false;
    }

-    private boolean deleteBadReplica(TabletSchedCtx tabletCtx) {
+    private boolean deleteBadReplica(TabletSchedCtx tabletCtx, boolean force) {
        for (Replica replica : tabletCtx.getReplicas()) {
            if (replica.isBad()) {
-                deleteReplicaInternal(tabletCtx, replica, "replica is bad");
+                deleteReplicaInternal(tabletCtx, replica, "replica is bad", force);
                return true;
            }
        }
        return false;
    }

-    private boolean deleteBackendUnavailable(TabletSchedCtx tabletCtx) {
+    private boolean deleteBackendUnavailable(TabletSchedCtx tabletCtx, boolean force) {
        for (Replica replica : tabletCtx.getReplicas()) {
            Backend be = infoService.getBackend(replica.getBackendId());
            if (be == null) {
@ -667,44 +673,44 @@ public class TabletScheduler extends Daemon {
                continue;
            }
            if (!be.isAvailable()) {
-                deleteReplicaInternal(tabletCtx, replica, "backend unavailable");
+                deleteReplicaInternal(tabletCtx, replica, "backend unavailable", force);
                return true;
            }
        }
        return false;
    }

-    private boolean deleteCloneReplica(TabletSchedCtx tabletCtx) {
+    private boolean deleteCloneReplica(TabletSchedCtx tabletCtx, boolean force) {
        for (Replica replica : tabletCtx.getReplicas()) {
            if (replica.getState() == ReplicaState.CLONE) {
-                deleteReplicaInternal(tabletCtx, replica, "clone state");
+                deleteReplicaInternal(tabletCtx, replica, "clone state", force);
                return true;
            }
        }
        return false;
    }

-    private boolean deleteReplicaWithFailedVersion(TabletSchedCtx tabletCtx) {
+    private boolean deleteReplicaWithFailedVersion(TabletSchedCtx tabletCtx, boolean force) {
        for (Replica replica : tabletCtx.getReplicas()) {
            if (replica.getLastFailedVersion() > 0) {
-                deleteReplicaInternal(tabletCtx, replica, "version incomplete");
+                deleteReplicaInternal(tabletCtx, replica, "version incomplete", force);
                return true;
            }
        }
        return false;
    }

-    private boolean deleteReplicaWithLowerVersion(TabletSchedCtx tabletCtx) {
+    private boolean deleteReplicaWithLowerVersion(TabletSchedCtx tabletCtx, boolean force) {
        for (Replica replica : tabletCtx.getReplicas()) {
            if (!replica.checkVersionCatchUp(tabletCtx.getCommittedVersion(), tabletCtx.getCommittedVersionHash())) {
-                deleteReplicaInternal(tabletCtx, replica, "lower version");
+                deleteReplicaInternal(tabletCtx, replica, "lower version", force);
                return true;
            }
        }
        return false;
    }

-    private boolean deleteReplicaNotInCluster(TabletSchedCtx tabletCtx) {
+    private boolean deleteReplicaNotInCluster(TabletSchedCtx tabletCtx, boolean force) {
        for (Replica replica : tabletCtx.getReplicas()) {
            Backend be = infoService.getBackend(replica.getBackendId());
            if (be == null) {
@ -712,14 +718,14 @@ public class TabletScheduler extends Daemon {
                continue;
            }
            if (!be.getOwnerClusterName().equals(tabletCtx.getCluster())) {
-                deleteReplicaInternal(tabletCtx, replica, "not in cluster");
+                deleteReplicaInternal(tabletCtx, replica, "not in cluster", force);
                return true;
            }
        }
        return false;
    }

-    private boolean deleteReplicaOnHighLoadBackend(TabletSchedCtx tabletCtx) {
+    private boolean deleteReplicaOnHighLoadBackend(TabletSchedCtx tabletCtx, boolean force) {
        ClusterLoadStatistic statistic = statisticMap.get(tabletCtx.getCluster());
        if (statistic == null) {
            return false;
@ -739,17 +745,26 @@ public class TabletScheduler extends Daemon {
        }

        if (chosenReplica != null) {
-            deleteReplicaInternal(tabletCtx, chosenReplica, "high load");
+            deleteReplicaInternal(tabletCtx, chosenReplica, "high load", force);
            return true;
        }
        return false;
    }

-    private void deleteReplicaInternal(TabletSchedCtx tabletCtx, Replica replica, String reason) {
+    private void deleteReplicaInternal(TabletSchedCtx tabletCtx, Replica replica, String reason, boolean force) {
        // delete this replica from catalog.
        // it will also delete replica from tablet inverted index.
        tabletCtx.deleteReplica(replica);

+        if (force) {
+            // send the delete replica task.
+            // also this may not be necessary, but delete it will make things simpler.
+            // NOTICE: only delete the replica from meta may not work. sometimes we can depends on tablet report
+            // to delete these replicas, but in FORCE_REDUNDANT case, replica may be added to meta again in report
+            // process.
+            sendDeleteReplicaTask(replica.getBackendId(), tabletCtx.getTabletId(), tabletCtx.getSchemaHash());
+        }
+
        // write edit log
        ReplicaPersistInfo info = ReplicaPersistInfo.createForDelete(tabletCtx.getDbId(),
                                                                     tabletCtx.getTblId(),
@ -764,6 +779,14 @@ public class TabletScheduler extends Daemon {
                 tabletCtx.getTabletId(), replica.getBackendId(), reason);
    }

+    private void sendDeleteReplicaTask(long backendId, long tabletId, int schemaHash) {
+        DropReplicaTask task = new DropReplicaTask(backendId, tabletId, schemaHash);
+        AgentBatchTask batchTask = new AgentBatchTask();
+        batchTask.addTask(task);
+        AgentTaskExecutor.submit(batchTask);
+        LOG.info("send delete replica task for tablet {} in backend {}", tabletId, backendId);
+    }
+
    /*
     * Cluster migration, which means the tablet has enough healthy replicas,
     * but some replicas are not in right cluster.
--- a/fe/src/main/java/org/apache/doris/common/proc/StatisticProcDir.java
+++ b/fe/src/main/java/org/apache/doris/common/proc/StatisticProcDir.java
@ -97,6 +97,7 @@ public class StatisticProcDir implements ProcDirInterface {
            }

            ++totalDbNum;
+            int availableBackendsNum = infoService.getClusterBackendIds(db.getClusterName(), true).size();
            db.readLock();
            try {
                int dbTableNum = 0;
@ -125,7 +126,7 @@ public class StatisticProcDir implements ProcDirInterface {
                                Pair<TabletStatus, Priority> res = tablet.getHealthStatusWithPriority(
                                        infoService, db.getClusterName(),
                                        partition.getVisibleVersion(), partition.getVisibleVersionHash(),
-                                        replicationNum);
+                                        replicationNum, availableBackendsNum);

                                // here we treat REDUNDANT as HEALTHY, for user friendly.
                                if (res.first != TabletStatus.HEALTHY && res.first != TabletStatus.REDUNDANT) {
--- a/fe/src/main/java/org/apache/doris/master/ReportHandler.java
+++ b/fe/src/main/java/org/apache/doris/master/ReportHandler.java
@ -627,7 +627,8 @@ public class ReportHandler extends Daemon {
            TTablet backendTablet = backendTablets.get(tabletId);
            for (TTabletInfo backendTabletInfo : backendTablet.getTablet_infos()) {
                boolean needDelete = false;
-                if (!foundTabletsWithValidSchema.contains(tabletId)) {
+                if (!foundTabletsWithValidSchema.contains(tabletId)
+                        && isBackendReplicaHealthy(backendTabletInfo)) {
                    // if this tablet is not in meta. try adding it.
                    // if add failed. delete this tablet from backend.
                    try {
@ -669,6 +670,17 @@ public class ReportHandler extends Daemon {
        LOG.info("add {} replica(s) to meta. backend[{}]", addToMetaCounter, backendId);
    }

+    // replica is used and no version missing
+    private static boolean isBackendReplicaHealthy(TTabletInfo backendTabletInfo) {
+        if (backendTabletInfo.isSetUsed() && !backendTabletInfo.isUsed()) {
+            return false;
+        }
+        if (backendTabletInfo.isSetVersion_miss() && backendTabletInfo.isVersion_miss()) {
+            return false;
+        }
+        return true;
+    }
+
    private static void handleMigration(ListMultimap<TStorageMedium, Long> tabletMetaMigrationMap,
                                        long backendId) {
        TabletInvertedIndex invertedIndex = Catalog.getCurrentInvertedIndex();
@ -928,9 +940,10 @@ public class ReportHandler extends Daemon {
                return;
            }

+            int availableBackendsNum = infoService.getClusterBackendIds(db.getClusterName(), true).size();
            Pair<TabletStatus, TabletSchedCtx.Priority> status = tablet.getHealthStatusWithPriority(infoService,
                    db.getClusterName(), visibleVersion, visibleVersionHash,
-                    replicationNum);
+                    replicationNum, availableBackendsNum);
            
            if (status.first == TabletStatus.VERSION_INCOMPLETE || status.first == TabletStatus.REPLICA_MISSING) {
                long lastFailedVersion = -1L;
--- a/fe/src/main/java/org/apache/doris/system/SystemInfoService.java
+++ b/fe/src/main/java/org/apache/doris/system/SystemInfoService.java
@ -674,20 +674,20 @@ public class SystemInfoService {
    /**
     * get cluster's backend id list
     *
-     * @param name
+     * @param clusterName
     * @return
     */
-    public List<Long> getClusterBackendIds(String name, boolean needAlive) {
+    public List<Long> getClusterBackendIds(String clusterName, boolean needAlive) {
        final Map<Long, Backend> copiedBackends = Maps.newHashMap(idToBackendRef.get());
        final List<Long> ret = new ArrayList<Long>();

-        if (Strings.isNullOrEmpty(name)) {
+        if (Strings.isNullOrEmpty(clusterName)) {
            return null;
        }

        if (needAlive) {
            for (Backend backend : copiedBackends.values()) {
-                if (name.equals(backend.getOwnerClusterName())) {
+                if (clusterName.equals(backend.getOwnerClusterName())) {
                    if (backend != null && backend.isAlive()) {
                        ret.add(backend.getId());
                    }
@ -695,7 +695,7 @@ public class SystemInfoService {
            }
        } else {
            for (Backend backend : copiedBackends.values()) {
-                if (name.equals(backend.getOwnerClusterName())) {
+                if (clusterName.equals(backend.getOwnerClusterName())) {
                    ret.add(backend.getId());
                }
            }