Handle the situation when there is no enough backends for tablet repair (#1299)

If there are only 3 backends and replication num is 3. If one replica of a
tablet is bad, there is no 4th backend for tablet repair. So we need to delete
a bad replica first to make room for new replica.
This commit is contained in:
Mingyu Chen
2019-06-14 20:28:29 +08:00
committed by ZHAO Chun
parent c8d7b8e1c4
commit 5c2cf9f2ce
10 changed files with 130 additions and 50 deletions

View File

@ -69,8 +69,12 @@
5. REDUNDANT
副本冗余。健康副本都在对应 cluster 内,但数量大于期望副本数。或者有多余的 unavailable 副本。
6. FORCE\_REDUNDANT
这是一个特殊状态。只会出现在当期望副本数大于等于可用节点数时,并且 Tablet 处于副本缺失状态时出现。这种情况下,需要先删除一个副本,以保证有可用节点用于创建新副本。
6. HEALTHY
7. HEALTHY
健康分片,即条件[1-5]都不满足。
@ -107,6 +111,11 @@ TabletChecker 作为常驻的后台进程,会定期检查所有分片的状态
6. 副本所在 cluster 不正确
7. 副本所在 BE 节点负载高
5. FORCE\_REDUNDANT
不同于 REDUNDANT,因为此时虽然 Tablet 有副本缺失,但是因为已经没有额外的可用节点用于创建新的副本了。所以此时必须先删除一个副本,以腾出一个可用节点用于创建新的副本。
删除副本的顺序同 REDUNDANT。
### 调度优先级
TabletScheduler 里等待被调度的分片会根据状态不同,赋予不同的优先级。优先级高的分片将会被优先调度。目前有以下几种优先级。
@ -114,6 +123,7 @@ TabletScheduler 里等待被调度的分片会根据状态不同,赋予不同
1. VERY\_HIGH
* REDUNDANT。对于有副本冗余的分片,我们优先处理。虽然逻辑上来讲,副本冗余的紧急程度最低,但是因为这种情况处理起来最快且可以快速释放资源(比如磁盘空间等),所以我们优先处理。
* FORCE\_REDUNDANT。同上。
2. HIGH

View File

@ -974,6 +974,7 @@ public class OlapTable extends Table {
}
public boolean isStable(SystemInfoService infoService, TabletScheduler tabletScheduler, String clusterName) {
int availableBackendsNum = infoService.getClusterBackendIds(clusterName, true).size();
for (Partition partition : idToPartition.values()) {
long visibleVersion = partition.getVisibleVersion();
long visibleVersionHash = partition.getVisibleVersionHash();
@ -985,7 +986,8 @@ public class OlapTable extends Table {
}
Pair<TabletStatus, TabletSchedCtx.Priority> statusPair = tablet.getHealthStatusWithPriority(
infoService, clusterName, visibleVersion, visibleVersionHash, replicationNum);
infoService, clusterName, visibleVersion, visibleVersionHash, replicationNum,
availableBackendsNum);
if (statusPair.first != TabletStatus.HEALTHY) {
return false;
}

View File

@ -54,6 +54,8 @@ public class Tablet extends MetaObject implements Writable {
REPLICA_RELOCATING, // replica is healthy, but is under relocating (eg. BE is decommission)
REDUNDANT, // too much replicas
REPLICA_MISSING_IN_CLUSTER, // not enough healthy replicas in correct cluster
FORCE_REDUNDANT, // some replica is missing or bad, but there is no other backends for repair,
// at least one replica has to be deleted first to make room for new replica.
}
private long id;
@ -371,13 +373,14 @@ public class Tablet extends MetaObject implements Writable {
*/
public Pair<TabletStatus, TabletSchedCtx.Priority> getHealthStatusWithPriority(
SystemInfoService systemInfoService, String clusterName,
long visibleVersion, long visibleVersionHash, int replicationNum) {
long visibleVersion, long visibleVersionHash, int replicationNum,
int availableBackendsNum) {
int alive = 0;
int aliveAndVersionComplete = 0;
int stable = 0;
int availableInCluster = 0;
for (Replica replica : replicas) {
long backendId = replica.getBackendId();
Backend backend = systemInfoService.getBackend(backendId);
@ -409,7 +412,18 @@ public class Tablet extends MetaObject implements Writable {
}
// 1. alive replicas are not enough
if (alive < (replicationNum / 2) + 1) {
if (alive < replicationNum && replicas.size() >= availableBackendsNum
&& availableBackendsNum >= replicationNum && replicationNum > 1) {
// there is no enough backend for us to create a new replica, so we have to delete an existing replica,
// so there can be available backend for us to create a new replica.
// And if there is only one replica, we will not handle it(maybe need human interference)
// condition explain:
// 1. alive < replicationNum: replica is missing or bad
// 2. replicas.size() >= availableBackendsNum: the existing replicas occupies all available backends
// 3. availableBackendsNum >= replicationNum: make sure after deleting, there will be a backend for new replica.
// 4. replicationNum > 1: if replication num is set to 1, do not delete any replica, for safety reason
return Pair.create(TabletStatus.FORCE_REDUNDANT, TabletSchedCtx.Priority.VERY_HIGH);
} else if (alive < (replicationNum / 2) + 1) {
return Pair.create(TabletStatus.REPLICA_MISSING, TabletSchedCtx.Priority.HIGH);
} else if (alive < replicationNum) {
return Pair.create(TabletStatus.REPLICA_MISSING, TabletSchedCtx.Priority.NORMAL);

View File

@ -151,12 +151,13 @@ public class TabletInvertedIndex {
if (needRecover(replica, tabletMeta.getOldSchemaHash(), backendTabletInfo)) {
LOG.warn("replica {} of tablet {} on backend {} need recovery. "
+ "replica in FE: {}, report version {}-{}, report schema hash: {},"
+ " is bad: {}",
+ " is bad: {}, is version missing: {}",
replica.getId(), tabletId, backendId, replica,
backendTabletInfo.getVersion(),
backendTabletInfo.getVersion_hash(),
backendTabletInfo.getSchema_hash(),
backendTabletInfo.isSetUsed() ? backendTabletInfo.isUsed() : "unknown");
backendTabletInfo.isSetUsed() ? backendTabletInfo.isUsed() : "unknown",
backendTabletInfo.isSetVersion_miss() ? backendTabletInfo.isVersion_miss() : "unset");
tabletRecoveryMap.put(tabletMeta.getDbId(), tabletId);
}
@ -342,6 +343,17 @@ public class TabletInvertedIndex {
* because of some unrecoverable failure.
*/
private boolean needRecover(Replica replicaInFe, int schemaHashInFe, TTabletInfo backendTabletInfo) {
if (replicaInFe.getState() != ReplicaState.NORMAL) {
// only normal replica need recover
// case:
// the replica's state is CLONE, which means this a newly created replica in clone process.
// and an old out-of-date replica reports here, and this report should not mark this replica as
// 'need recovery'.
// Other state such as ROLLUP/SCHEMA_CHANGE, the replica behavior is unknown, so for safety reason,
// also not mark this replica as 'need recovery'.
return false;
}
if (backendTabletInfo.isSetUsed() && !backendTabletInfo.isUsed()) {
// tablet is bad
return true;
@ -372,6 +384,7 @@ public class TabletInvertedIndex {
// even if backend version is less than fe's version, but if version_miss is false,
// which means this may be a stale report.
// so we only return true if version_miss is true.
LOG.info("replicaversion missing");
return true;
}
return false;

View File

@ -191,6 +191,7 @@ public class TabletChecker extends Daemon {
db.readLock();
try {
int availableBackendsNum = infoService.getClusterBackendIds(db.getClusterName(), true).size();
for (Table table : db.getTables()) {
if (!table.needSchedule()) {
continue;
@ -210,11 +211,12 @@ public class TabletChecker extends Daemon {
}
Pair<TabletStatus, TabletSchedCtx.Priority> statusWithPrio = tablet.getHealthStatusWithPriority(
infoService,
db.getClusterName(),
partition.getVisibleVersion(),
partition.getVisibleVersionHash(),
olapTbl.getPartitionInfo().getReplicationNum(partition.getId()));
infoService,
db.getClusterName(),
partition.getVisibleVersion(),
partition.getVisibleVersionHash(),
olapTbl.getPartitionInfo().getReplicationNum(partition.getId()),
availableBackendsNum);
if (statusWithPrio.first == TabletStatus.HEALTHY) {
// Only set last status check time when status is healthy.

View File

@ -752,9 +752,11 @@ public class TabletSchedCtx implements Comparable<TabletSchedCtx> {
throw new SchedException(Status.UNRECOVERABLE, "tablet does not exist");
}
int availableBackendsNum = infoService.getClusterBackendIds(db.getClusterName(), true).size();
short replicationNum = olapTable.getPartitionInfo().getReplicationNum(partitionId);
Pair<TabletStatus, TabletSchedCtx.Priority> pair = tablet.getHealthStatusWithPriority(
infoService, db.getClusterName(), visibleVersion, visibleVersionHash, replicationNum);
infoService, db.getClusterName(), visibleVersion, visibleVersionHash, replicationNum,
availableBackendsNum);
if (pair.first == TabletStatus.HEALTHY) {
throw new SchedException(Status.FINISHED, "tablet is healthy");
}

View File

@ -43,6 +43,7 @@ import org.apache.doris.task.AgentTask;
import org.apache.doris.task.AgentTaskExecutor;
import org.apache.doris.task.AgentTaskQueue;
import org.apache.doris.task.CloneTask;
import org.apache.doris.task.DropReplicaTask;
import org.apache.doris.thrift.TFinishTaskRequest;
import com.google.common.base.Preconditions;
@ -471,11 +472,13 @@ public class TabletScheduler extends Daemon {
Tablet tablet = idx.getTablet(tabletCtx.getTabletId());
Preconditions.checkNotNull(tablet);
int availableBackendsNum = infoService.getClusterBackendIds(db.getClusterName(), true).size();
statusPair = tablet.getHealthStatusWithPriority(
infoService, tabletCtx.getCluster(),
partition.getVisibleVersion(),
partition.getVisibleVersionHash(),
tbl.getPartitionInfo().getReplicationNum(partition.getId()));
tbl.getPartitionInfo().getReplicationNum(partition.getId()),
availableBackendsNum);
if (tabletCtx.getType() == TabletSchedCtx.Type.BALANCE && tableState != OlapTableState.NORMAL) {
// If table is under ALTER process, do not allow to do balance.
@ -494,9 +497,9 @@ public class TabletScheduler extends Daemon {
throw new SchedException(Status.UNRECOVERABLE, "tablet is healthy");
} else if (statusPair.first != TabletStatus.HEALTHY
&& tabletCtx.getType() == TabletSchedCtx.Type.BALANCE) {
tabletCtx.releaseResource(this);
// we select an unhealthy tablet to do balance, which is not right.
// so here we change it to a REPAIR task, and also reset its priority
tabletCtx.releaseResource(this);
tabletCtx.setType(TabletSchedCtx.Type.REPAIR);
tabletCtx.setOrigPriority(statusPair.second);
tabletCtx.setLastSchedTime(currentTime);
@ -531,7 +534,10 @@ public class TabletScheduler extends Daemon {
handleReplicaRelocating(tabletCtx, batchTask);
break;
case REDUNDANT:
handleRedundantReplica(tabletCtx);
handleRedundantReplica(tabletCtx, false);
break;
case FORCE_REDUNDANT:
handleRedundantReplica(tabletCtx, true);
break;
case REPLICA_MISSING_IN_CLUSTER:
handleReplicaClusterMigration(tabletCtx, batchTask);
@ -621,16 +627,16 @@ public class TabletScheduler extends Daemon {
* 7. replica not in right cluster
* 8. replica in higher load backend
*/
private void handleRedundantReplica(TabletSchedCtx tabletCtx) throws SchedException {
private void handleRedundantReplica(TabletSchedCtx tabletCtx, boolean force) throws SchedException {
stat.counterReplicaRedundantErr.incrementAndGet();
if (deleteBackendDropped(tabletCtx)
|| deleteBadReplica(tabletCtx)
|| deleteBackendUnavailable(tabletCtx)
|| deleteCloneReplica(tabletCtx)
|| deleteReplicaWithFailedVersion(tabletCtx)
|| deleteReplicaWithLowerVersion(tabletCtx)
|| deleteReplicaNotInCluster(tabletCtx)
|| deleteReplicaOnHighLoadBackend(tabletCtx)) {
if (deleteBackendDropped(tabletCtx, force)
|| deleteBadReplica(tabletCtx, force)
|| deleteBackendUnavailable(tabletCtx, force)
|| deleteCloneReplica(tabletCtx, force)
|| deleteReplicaWithFailedVersion(tabletCtx, force)
|| deleteReplicaWithLowerVersion(tabletCtx, force)
|| deleteReplicaNotInCluster(tabletCtx, force)
|| deleteReplicaOnHighLoadBackend(tabletCtx, force)) {
// if we delete at least one redundant replica, we still throw a SchedException with status FINISHED
// to remove this tablet from the pendingTablets(consider it as finished)
throw new SchedException(Status.FINISHED, "redundant replica is deleted");
@ -638,28 +644,28 @@ public class TabletScheduler extends Daemon {
throw new SchedException(Status.SCHEDULE_FAILED, "unable to delete any redundant replicas");
}
private boolean deleteBackendDropped(TabletSchedCtx tabletCtx) {
private boolean deleteBackendDropped(TabletSchedCtx tabletCtx, boolean force) {
for (Replica replica : tabletCtx.getReplicas()) {
long beId = replica.getBackendId();
if (infoService.getBackend(beId) == null) {
deleteReplicaInternal(tabletCtx, replica, "backend dropped");
deleteReplicaInternal(tabletCtx, replica, "backend dropped", force);
return true;
}
}
return false;
}
private boolean deleteBadReplica(TabletSchedCtx tabletCtx) {
private boolean deleteBadReplica(TabletSchedCtx tabletCtx, boolean force) {
for (Replica replica : tabletCtx.getReplicas()) {
if (replica.isBad()) {
deleteReplicaInternal(tabletCtx, replica, "replica is bad");
deleteReplicaInternal(tabletCtx, replica, "replica is bad", force);
return true;
}
}
return false;
}
private boolean deleteBackendUnavailable(TabletSchedCtx tabletCtx) {
private boolean deleteBackendUnavailable(TabletSchedCtx tabletCtx, boolean force) {
for (Replica replica : tabletCtx.getReplicas()) {
Backend be = infoService.getBackend(replica.getBackendId());
if (be == null) {
@ -667,44 +673,44 @@ public class TabletScheduler extends Daemon {
continue;
}
if (!be.isAvailable()) {
deleteReplicaInternal(tabletCtx, replica, "backend unavailable");
deleteReplicaInternal(tabletCtx, replica, "backend unavailable", force);
return true;
}
}
return false;
}
private boolean deleteCloneReplica(TabletSchedCtx tabletCtx) {
private boolean deleteCloneReplica(TabletSchedCtx tabletCtx, boolean force) {
for (Replica replica : tabletCtx.getReplicas()) {
if (replica.getState() == ReplicaState.CLONE) {
deleteReplicaInternal(tabletCtx, replica, "clone state");
deleteReplicaInternal(tabletCtx, replica, "clone state", force);
return true;
}
}
return false;
}
private boolean deleteReplicaWithFailedVersion(TabletSchedCtx tabletCtx) {
private boolean deleteReplicaWithFailedVersion(TabletSchedCtx tabletCtx, boolean force) {
for (Replica replica : tabletCtx.getReplicas()) {
if (replica.getLastFailedVersion() > 0) {
deleteReplicaInternal(tabletCtx, replica, "version incomplete");
deleteReplicaInternal(tabletCtx, replica, "version incomplete", force);
return true;
}
}
return false;
}
private boolean deleteReplicaWithLowerVersion(TabletSchedCtx tabletCtx) {
private boolean deleteReplicaWithLowerVersion(TabletSchedCtx tabletCtx, boolean force) {
for (Replica replica : tabletCtx.getReplicas()) {
if (!replica.checkVersionCatchUp(tabletCtx.getCommittedVersion(), tabletCtx.getCommittedVersionHash())) {
deleteReplicaInternal(tabletCtx, replica, "lower version");
deleteReplicaInternal(tabletCtx, replica, "lower version", force);
return true;
}
}
return false;
}
private boolean deleteReplicaNotInCluster(TabletSchedCtx tabletCtx) {
private boolean deleteReplicaNotInCluster(TabletSchedCtx tabletCtx, boolean force) {
for (Replica replica : tabletCtx.getReplicas()) {
Backend be = infoService.getBackend(replica.getBackendId());
if (be == null) {
@ -712,14 +718,14 @@ public class TabletScheduler extends Daemon {
continue;
}
if (!be.getOwnerClusterName().equals(tabletCtx.getCluster())) {
deleteReplicaInternal(tabletCtx, replica, "not in cluster");
deleteReplicaInternal(tabletCtx, replica, "not in cluster", force);
return true;
}
}
return false;
}
private boolean deleteReplicaOnHighLoadBackend(TabletSchedCtx tabletCtx) {
private boolean deleteReplicaOnHighLoadBackend(TabletSchedCtx tabletCtx, boolean force) {
ClusterLoadStatistic statistic = statisticMap.get(tabletCtx.getCluster());
if (statistic == null) {
return false;
@ -739,17 +745,26 @@ public class TabletScheduler extends Daemon {
}
if (chosenReplica != null) {
deleteReplicaInternal(tabletCtx, chosenReplica, "high load");
deleteReplicaInternal(tabletCtx, chosenReplica, "high load", force);
return true;
}
return false;
}
private void deleteReplicaInternal(TabletSchedCtx tabletCtx, Replica replica, String reason) {
private void deleteReplicaInternal(TabletSchedCtx tabletCtx, Replica replica, String reason, boolean force) {
// delete this replica from catalog.
// it will also delete replica from tablet inverted index.
tabletCtx.deleteReplica(replica);
if (force) {
// send the delete replica task.
// also this may not be necessary, but delete it will make things simpler.
// NOTICE: only delete the replica from meta may not work. sometimes we can depends on tablet report
// to delete these replicas, but in FORCE_REDUNDANT case, replica may be added to meta again in report
// process.
sendDeleteReplicaTask(replica.getBackendId(), tabletCtx.getTabletId(), tabletCtx.getSchemaHash());
}
// write edit log
ReplicaPersistInfo info = ReplicaPersistInfo.createForDelete(tabletCtx.getDbId(),
tabletCtx.getTblId(),
@ -764,6 +779,14 @@ public class TabletScheduler extends Daemon {
tabletCtx.getTabletId(), replica.getBackendId(), reason);
}
private void sendDeleteReplicaTask(long backendId, long tabletId, int schemaHash) {
DropReplicaTask task = new DropReplicaTask(backendId, tabletId, schemaHash);
AgentBatchTask batchTask = new AgentBatchTask();
batchTask.addTask(task);
AgentTaskExecutor.submit(batchTask);
LOG.info("send delete replica task for tablet {} in backend {}", tabletId, backendId);
}
/*
* Cluster migration, which means the tablet has enough healthy replicas,
* but some replicas are not in right cluster.

View File

@ -97,6 +97,7 @@ public class StatisticProcDir implements ProcDirInterface {
}
++totalDbNum;
int availableBackendsNum = infoService.getClusterBackendIds(db.getClusterName(), true).size();
db.readLock();
try {
int dbTableNum = 0;
@ -125,7 +126,7 @@ public class StatisticProcDir implements ProcDirInterface {
Pair<TabletStatus, Priority> res = tablet.getHealthStatusWithPriority(
infoService, db.getClusterName(),
partition.getVisibleVersion(), partition.getVisibleVersionHash(),
replicationNum);
replicationNum, availableBackendsNum);
// here we treat REDUNDANT as HEALTHY, for user friendly.
if (res.first != TabletStatus.HEALTHY && res.first != TabletStatus.REDUNDANT) {

View File

@ -627,7 +627,8 @@ public class ReportHandler extends Daemon {
TTablet backendTablet = backendTablets.get(tabletId);
for (TTabletInfo backendTabletInfo : backendTablet.getTablet_infos()) {
boolean needDelete = false;
if (!foundTabletsWithValidSchema.contains(tabletId)) {
if (!foundTabletsWithValidSchema.contains(tabletId)
&& isBackendReplicaHealthy(backendTabletInfo)) {
// if this tablet is not in meta. try adding it.
// if add failed. delete this tablet from backend.
try {
@ -669,6 +670,17 @@ public class ReportHandler extends Daemon {
LOG.info("add {} replica(s) to meta. backend[{}]", addToMetaCounter, backendId);
}
// replica is used and no version missing
private static boolean isBackendReplicaHealthy(TTabletInfo backendTabletInfo) {
if (backendTabletInfo.isSetUsed() && !backendTabletInfo.isUsed()) {
return false;
}
if (backendTabletInfo.isSetVersion_miss() && backendTabletInfo.isVersion_miss()) {
return false;
}
return true;
}
private static void handleMigration(ListMultimap<TStorageMedium, Long> tabletMetaMigrationMap,
long backendId) {
TabletInvertedIndex invertedIndex = Catalog.getCurrentInvertedIndex();
@ -928,9 +940,10 @@ public class ReportHandler extends Daemon {
return;
}
int availableBackendsNum = infoService.getClusterBackendIds(db.getClusterName(), true).size();
Pair<TabletStatus, TabletSchedCtx.Priority> status = tablet.getHealthStatusWithPriority(infoService,
db.getClusterName(), visibleVersion, visibleVersionHash,
replicationNum);
replicationNum, availableBackendsNum);
if (status.first == TabletStatus.VERSION_INCOMPLETE || status.first == TabletStatus.REPLICA_MISSING) {
long lastFailedVersion = -1L;

View File

@ -674,20 +674,20 @@ public class SystemInfoService {
/**
* get cluster's backend id list
*
* @param name
* @param clusterName
* @return
*/
public List<Long> getClusterBackendIds(String name, boolean needAlive) {
public List<Long> getClusterBackendIds(String clusterName, boolean needAlive) {
final Map<Long, Backend> copiedBackends = Maps.newHashMap(idToBackendRef.get());
final List<Long> ret = new ArrayList<Long>();
if (Strings.isNullOrEmpty(name)) {
if (Strings.isNullOrEmpty(clusterName)) {
return null;
}
if (needAlive) {
for (Backend backend : copiedBackends.values()) {
if (name.equals(backend.getOwnerClusterName())) {
if (clusterName.equals(backend.getOwnerClusterName())) {
if (backend != null && backend.isAlive()) {
ret.add(backend.getId());
}
@ -695,7 +695,7 @@ public class SystemInfoService {
}
} else {
for (Backend backend : copiedBackends.values()) {
if (name.equals(backend.getOwnerClusterName())) {
if (clusterName.equals(backend.getOwnerClusterName())) {
ret.add(backend.getId());
}
}