[Enhance] Support show unrecoverable tablets (#6045)

* [Enhance] Support show unrecoverable tablets

Unrecoverable tablets are tablets for which none of the replicas are healthy.
We should be able to find these tablets so that they can be repaired through manual intervention.

And these tablets should not be added to the tablet scheduler.
This commit is contained in:
Mingyu Chen
2021-06-22 09:19:12 +08:00
committed by GitHub
parent 68bab73c35
commit abcd56c6c8
7 changed files with 56 additions and 24 deletions

View File

@ -62,6 +62,7 @@ public class Tablet extends MetaObject implements Writable {
COLOCATE_MISMATCH, // replicas do not all locate in right colocate backends set.
COLOCATE_REDUNDANT, // replicas match the colocate backends set, but redundant.
NEED_FURTHER_REPAIR, // one of replicas need a definite repair.
UNRECOVERABLE // none of the replicas are healthy
}
@SerializedName(value = "id")
@ -455,7 +456,9 @@ public class Tablet extends MetaObject implements Writable {
// 1. alive replicas are not enough
int aliveBackendsNum = aliveBeIdsInCluster.size();
if (alive < replicationNum && replicas.size() >= aliveBackendsNum
if (alive == 0) {
return Pair.create(TabletStatus.UNRECOVERABLE, Priority.VERY_HIGH);
} else if (alive < replicationNum && replicas.size() >= aliveBackendsNum
&& aliveBackendsNum >= replicationNum && replicationNum > 1) {
// there are not enough backends for us to create a new replica, so we have to delete an existing replica
// to free up a backend on which a new replica can be created.
@ -473,7 +476,9 @@ public class Tablet extends MetaObject implements Writable {
}
// 2. version complete replicas are not enough
if (aliveAndVersionComplete < (replicationNum / 2) + 1) {
if (aliveAndVersionComplete == 0) {
return Pair.create(TabletStatus.UNRECOVERABLE, Priority.VERY_HIGH);
} else if (aliveAndVersionComplete < (replicationNum / 2) + 1) {
return Pair.create(TabletStatus.VERSION_INCOMPLETE, TabletSchedCtx.Priority.HIGH);
} else if (aliveAndVersionComplete < replicationNum) {
return Pair.create(TabletStatus.VERSION_INCOMPLETE, TabletSchedCtx.Priority.NORMAL);

View File

@ -329,6 +329,11 @@ public class TabletChecker extends MasterDaemon {
// Only set last status check time when status is healthy.
tablet.setLastStatusCheckTime(startTime);
continue;
} else if (statusWithPrio.first == TabletStatus.UNRECOVERABLE) {
// This tablet is not recoverable, so do not add it to the tablet scheduler.
// All UNRECOVERABLE tablets can be seen via "show proc '/statistic'".
counter.unhealthyTabletNum++;
continue;
} else if (isInPrios) {
statusWithPrio.second = TabletSchedCtx.Priority.VERY_HIGH;
prioPartIsHealthy = false;

View File

@ -575,17 +575,19 @@ public class TabletScheduler extends MasterDaemon {
case FORCE_REDUNDANT:
handleRedundantReplica(tabletCtx, true);
break;
case REPLICA_MISSING_IN_CLUSTER:
handleReplicaClusterMigration(tabletCtx, batchTask);
break;
case COLOCATE_MISMATCH:
handleColocateMismatch(tabletCtx, batchTask);
break;
case COLOCATE_REDUNDANT:
handleColocateRedundant(tabletCtx);
break;
default:
break;
case REPLICA_MISSING_IN_CLUSTER:
handleReplicaClusterMigration(tabletCtx, batchTask);
break;
case COLOCATE_MISMATCH:
handleColocateMismatch(tabletCtx, batchTask);
break;
case COLOCATE_REDUNDANT:
handleColocateRedundant(tabletCtx);
break;
case UNRECOVERABLE:
throw new SchedException(Status.UNRECOVERABLE, "tablet is unrecoverable");
default:
break;
}
} else {
// balance

View File

@ -29,20 +29,23 @@ import java.util.List;
public class IncompleteTabletsProcNode implements ProcNodeInterface {
public static final ImmutableList<String> TITLE_NAMES = new ImmutableList.Builder<String>()
.add("UnhealthyTablets").add("InconsistentTablets").add("CloningTablets")
.add("UnhealthyTablets").add("InconsistentTablets").add("CloningTablets").add("BadTablets")
.build();
private static final Joiner JOINER = Joiner.on(",");
Collection<Long> unhealthyTabletIds;
Collection<Long> inconsistentTabletIds;
Collection<Long> cloningTabletIds;
Collection<Long> unrecoverableTabletIds;
public IncompleteTabletsProcNode(Collection<Long> unhealthyTabletIds,
Collection<Long> inconsistentTabletIds,
Collection<Long> cloningTabletIds) {
Collection<Long> cloningTabletIds,
Collection<Long> unrecoverableTabletIds) {
this.unhealthyTabletIds = unhealthyTabletIds;
this.inconsistentTabletIds = inconsistentTabletIds;
this.cloningTabletIds = cloningTabletIds;
this.unrecoverableTabletIds = unrecoverableTabletIds;
}
@Override
@ -56,9 +59,11 @@ public class IncompleteTabletsProcNode implements ProcNodeInterface {
String incompleteTablets = JOINER.join(Arrays.asList(unhealthyTabletIds));
String inconsistentTablets = JOINER.join(Arrays.asList(inconsistentTabletIds));
String cloningTablets = JOINER.join(Arrays.asList(cloningTabletIds));
String unrecoverableTablets = JOINER.join(Arrays.asList(unrecoverableTabletIds));
row.add(incompleteTablets);
row.add(inconsistentTablets);
row.add(cloningTablets);
row.add(unrecoverableTablets);
result.addRow(row);

View File

@ -17,10 +17,6 @@
package org.apache.doris.common.proc;
import com.google.common.base.Preconditions;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Multimap;
import org.apache.doris.catalog.Catalog;
import org.apache.doris.catalog.Database;
import org.apache.doris.catalog.MaterializedIndex;
@ -38,6 +34,12 @@ import org.apache.doris.common.util.ListComparator;
import org.apache.doris.system.SystemInfoService;
import org.apache.doris.task.AgentTaskQueue;
import org.apache.doris.thrift.TTaskType;
import com.google.common.base.Preconditions;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Multimap;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
@ -49,7 +51,7 @@ public class StatisticProcDir implements ProcDirInterface {
public static final ImmutableList<String> TITLE_NAMES = new ImmutableList.Builder<String>()
.add("DbId").add("DbName").add("TableNum").add("PartitionNum")
.add("IndexNum").add("TabletNum").add("ReplicaNum").add("UnhealthyTabletNum")
.add("InconsistentTabletNum").add("CloningTabletNum")
.add("InconsistentTabletNum").add("CloningTabletNum").add("BadTabletNum")
.build();
private static final Logger LOG = LogManager.getLogger(StatisticProcDir.class);
@ -61,12 +63,15 @@ public class StatisticProcDir implements ProcDirInterface {
Multimap<Long, Long> inconsistentTabletIds;
// db id -> set(tablet id)
Multimap<Long, Long> cloningTabletIds;
// db id -> set(tablet id)
Multimap<Long, Long> unrecoverableTabletIds;
public StatisticProcDir(Catalog catalog) {
this.catalog = catalog;
unhealthyTabletIds = HashMultimap.create();
inconsistentTabletIds = HashMultimap.create();
cloningTabletIds = HashMultimap.create();
unrecoverableTabletIds = HashMultimap.create();
}
@Override
@ -140,8 +145,11 @@ public class StatisticProcDir implements ProcDirInterface {
// here we treat REDUNDANT as HEALTHY, for user friendly.
if (res.first != TabletStatus.HEALTHY && res.first != TabletStatus.REDUNDANT
&& res.first != TabletStatus.COLOCATE_REDUNDANT && res.first != TabletStatus.NEED_FURTHER_REPAIR) {
&& res.first != TabletStatus.COLOCATE_REDUNDANT && res.first != TabletStatus.NEED_FURTHER_REPAIR
&& res.first != TabletStatus.UNRECOVERABLE) {
unhealthyTabletIds.put(dbId, tablet.getId());
} else if (res.first == TabletStatus.UNRECOVERABLE) {
unrecoverableTabletIds.put(dbId, tablet.getId());
}
if (!tablet.isConsistent()) {
@ -166,6 +174,7 @@ public class StatisticProcDir implements ProcDirInterface {
oneLine.add(unhealthyTabletIds.get(dbId).size());
oneLine.add(inconsistentTabletIds.get(dbId).size());
oneLine.add(cloningTabletIds.get(dbId).size());
oneLine.add(unrecoverableTabletIds.get(dbId).size());
lines.add(oneLine);
@ -195,6 +204,7 @@ public class StatisticProcDir implements ProcDirInterface {
finalLine.add(unhealthyTabletIds.size());
finalLine.add(inconsistentTabletIds.size());
finalLine.add(cloningTabletIds.size());
finalLine.add(unrecoverableTabletIds.size());
lines.add(finalLine);
// add result
@ -224,7 +234,8 @@ public class StatisticProcDir implements ProcDirInterface {
}
return new IncompleteTabletsProcNode(unhealthyTabletIds.get(dbId),
inconsistentTabletIds.get(dbId),
cloningTabletIds.get(dbId));
inconsistentTabletIds.get(dbId),
cloningTabletIds.get(dbId),
unrecoverableTabletIds.get(dbId));
}
}

View File

@ -65,6 +65,9 @@ public class HttpServer extends SpringBootServletInitializer {
properties.put("spring.http.encoding.force", true);
properties.put("spring.servlet.multipart.max-file-size", this.maxFileSize);
properties.put("spring.servlet.multipart.max-request-size", this.maxRequestSize);
// This is to disable the spring-boot-devtools restart feature.
// To avoid some unexpected behavior.
System.setProperty("spring.devtools.restart.enabled", "false");
properties.put("logging.config", dorisHome + "/conf/" + SpringLog4j2Config.SPRING_LOG_XML_FILE);
new SpringApplicationBuilder()
.sources(HttpServer.class)

View File

@ -1014,7 +1014,8 @@ public class ReportHandler extends Daemon {
db.getClusterName(), visibleVersion, visibleVersionHash,
replicationNum, aliveBeIdsInCluster);
if (status.first == TabletStatus.VERSION_INCOMPLETE || status.first == TabletStatus.REPLICA_MISSING) {
if (status.first == TabletStatus.VERSION_INCOMPLETE || status.first == TabletStatus.REPLICA_MISSING
|| status.first == TabletStatus.UNRECOVERABLE) {
long lastFailedVersion = -1L;
long lastFailedVersionHash = 0L;