[Enhance] Support show unrecoverable tablets (#6045)
* [Enhance] Support show unrecoverable tablets. Unrecoverable tablets are tablets for which none of the replicas are healthy. We should be able to identify these tablets so that they can be repaired through manual intervention. These tablets should not be added to the tablet scheduler.
This commit is contained in:
@ -62,6 +62,7 @@ public class Tablet extends MetaObject implements Writable {
|
||||
COLOCATE_MISMATCH, // replicas do not all locate in right colocate backends set.
|
||||
COLOCATE_REDUNDANT, // replicas match the colocate backends set, but redundant.
|
||||
NEED_FURTHER_REPAIR, // one of replicas need a definite repair.
|
||||
UNRECOVERABLE // non of replicas are healthy
|
||||
}
|
||||
|
||||
@SerializedName(value = "id")
|
||||
@ -455,7 +456,9 @@ public class Tablet extends MetaObject implements Writable {
|
||||
|
||||
// 1. alive replicas are not enough
|
||||
int aliveBackendsNum = aliveBeIdsInCluster.size();
|
||||
if (alive < replicationNum && replicas.size() >= aliveBackendsNum
|
||||
if (alive == 0) {
|
||||
return Pair.create(TabletStatus.UNRECOVERABLE, Priority.VERY_HIGH);
|
||||
} else if (alive < replicationNum && replicas.size() >= aliveBackendsNum
|
||||
&& aliveBackendsNum >= replicationNum && replicationNum > 1) {
|
||||
// there is no enough backend for us to create a new replica, so we have to delete an existing replica,
|
||||
// so there can be available backend for us to create a new replica.
|
||||
@ -473,7 +476,9 @@ public class Tablet extends MetaObject implements Writable {
|
||||
}
|
||||
|
||||
// 2. version complete replicas are not enough
|
||||
if (aliveAndVersionComplete < (replicationNum / 2) + 1) {
|
||||
if (aliveAndVersionComplete == 0) {
|
||||
return Pair.create(TabletStatus.UNRECOVERABLE, Priority.VERY_HIGH);
|
||||
} else if (aliveAndVersionComplete < (replicationNum / 2) + 1) {
|
||||
return Pair.create(TabletStatus.VERSION_INCOMPLETE, TabletSchedCtx.Priority.HIGH);
|
||||
} else if (aliveAndVersionComplete < replicationNum) {
|
||||
return Pair.create(TabletStatus.VERSION_INCOMPLETE, TabletSchedCtx.Priority.NORMAL);
|
||||
|
||||
@ -329,6 +329,11 @@ public class TabletChecker extends MasterDaemon {
|
||||
// Only set last status check time when status is healthy.
|
||||
tablet.setLastStatusCheckTime(startTime);
|
||||
continue;
|
||||
} else if (statusWithPrio.first == TabletStatus.UNRECOVERABLE) {
|
||||
// This tablet is not recoverable, do not set it into tablet scheduler
|
||||
// all UNRECOVERABLE tablet can be seen from "show proc '/statistic'"
|
||||
counter.unhealthyTabletNum++;
|
||||
continue;
|
||||
} else if (isInPrios) {
|
||||
statusWithPrio.second = TabletSchedCtx.Priority.VERY_HIGH;
|
||||
prioPartIsHealthy = false;
|
||||
|
||||
@ -575,17 +575,19 @@ public class TabletScheduler extends MasterDaemon {
|
||||
case FORCE_REDUNDANT:
|
||||
handleRedundantReplica(tabletCtx, true);
|
||||
break;
|
||||
case REPLICA_MISSING_IN_CLUSTER:
|
||||
handleReplicaClusterMigration(tabletCtx, batchTask);
|
||||
break;
|
||||
case COLOCATE_MISMATCH:
|
||||
handleColocateMismatch(tabletCtx, batchTask);
|
||||
break;
|
||||
case COLOCATE_REDUNDANT:
|
||||
handleColocateRedundant(tabletCtx);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
case REPLICA_MISSING_IN_CLUSTER:
|
||||
handleReplicaClusterMigration(tabletCtx, batchTask);
|
||||
break;
|
||||
case COLOCATE_MISMATCH:
|
||||
handleColocateMismatch(tabletCtx, batchTask);
|
||||
break;
|
||||
case COLOCATE_REDUNDANT:
|
||||
handleColocateRedundant(tabletCtx);
|
||||
break;
|
||||
case UNRECOVERABLE:
|
||||
throw new SchedException(Status.UNRECOVERABLE, "tablet is unrecoverable");
|
||||
default:
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// balance
|
||||
|
||||
@ -29,20 +29,23 @@ import java.util.List;
|
||||
|
||||
public class IncompleteTabletsProcNode implements ProcNodeInterface {
|
||||
public static final ImmutableList<String> TITLE_NAMES = new ImmutableList.Builder<String>()
|
||||
.add("UnhealthyTablets").add("InconsistentTablets").add("CloningTablets")
|
||||
.add("UnhealthyTablets").add("InconsistentTablets").add("CloningTablets").add("BadTablets")
|
||||
.build();
|
||||
private static final Joiner JOINER = Joiner.on(",");
|
||||
|
||||
Collection<Long> unhealthyTabletIds;
|
||||
Collection<Long> inconsistentTabletIds;
|
||||
Collection<Long> cloningTabletIds;
|
||||
Collection<Long> unrecoverableTabletIds;
|
||||
|
||||
public IncompleteTabletsProcNode(Collection<Long> unhealthyTabletIds,
|
||||
Collection<Long> inconsistentTabletIds,
|
||||
Collection<Long> cloningTabletIds) {
|
||||
Collection<Long> cloningTabletIds,
|
||||
Collection<Long> unrecoverableTabletIds) {
|
||||
this.unhealthyTabletIds = unhealthyTabletIds;
|
||||
this.inconsistentTabletIds = inconsistentTabletIds;
|
||||
this.cloningTabletIds = cloningTabletIds;
|
||||
this.unrecoverableTabletIds = unrecoverableTabletIds;
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -56,9 +59,11 @@ public class IncompleteTabletsProcNode implements ProcNodeInterface {
|
||||
String incompleteTablets = JOINER.join(Arrays.asList(unhealthyTabletIds));
|
||||
String inconsistentTablets = JOINER.join(Arrays.asList(inconsistentTabletIds));
|
||||
String cloningTablets = JOINER.join(Arrays.asList(cloningTabletIds));
|
||||
String unrecoverableTablets = JOINER.join(Arrays.asList(unrecoverableTabletIds));
|
||||
row.add(incompleteTablets);
|
||||
row.add(inconsistentTablets);
|
||||
row.add(cloningTablets);
|
||||
row.add(unrecoverableTablets);
|
||||
|
||||
result.addRow(row);
|
||||
|
||||
|
||||
@ -17,10 +17,6 @@
|
||||
|
||||
package org.apache.doris.common.proc;
|
||||
|
||||
import com.google.common.base.Preconditions;
|
||||
import com.google.common.collect.HashMultimap;
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import com.google.common.collect.Multimap;
|
||||
import org.apache.doris.catalog.Catalog;
|
||||
import org.apache.doris.catalog.Database;
|
||||
import org.apache.doris.catalog.MaterializedIndex;
|
||||
@ -38,6 +34,12 @@ import org.apache.doris.common.util.ListComparator;
|
||||
import org.apache.doris.system.SystemInfoService;
|
||||
import org.apache.doris.task.AgentTaskQueue;
|
||||
import org.apache.doris.thrift.TTaskType;
|
||||
|
||||
import com.google.common.base.Preconditions;
|
||||
import com.google.common.collect.HashMultimap;
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import com.google.common.collect.Multimap;
|
||||
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
|
||||
@ -49,7 +51,7 @@ public class StatisticProcDir implements ProcDirInterface {
|
||||
public static final ImmutableList<String> TITLE_NAMES = new ImmutableList.Builder<String>()
|
||||
.add("DbId").add("DbName").add("TableNum").add("PartitionNum")
|
||||
.add("IndexNum").add("TabletNum").add("ReplicaNum").add("UnhealthyTabletNum")
|
||||
.add("InconsistentTabletNum").add("CloningTabletNum")
|
||||
.add("InconsistentTabletNum").add("CloningTabletNum").add("BadTabletNum")
|
||||
.build();
|
||||
private static final Logger LOG = LogManager.getLogger(StatisticProcDir.class);
|
||||
|
||||
@ -61,12 +63,15 @@ public class StatisticProcDir implements ProcDirInterface {
|
||||
Multimap<Long, Long> inconsistentTabletIds;
|
||||
// db id -> set(tablet id)
|
||||
Multimap<Long, Long> cloningTabletIds;
|
||||
// db id -> set(tablet id)
|
||||
Multimap<Long, Long> unrecoverableTabletIds;
|
||||
|
||||
public StatisticProcDir(Catalog catalog) {
|
||||
this.catalog = catalog;
|
||||
unhealthyTabletIds = HashMultimap.create();
|
||||
inconsistentTabletIds = HashMultimap.create();
|
||||
cloningTabletIds = HashMultimap.create();
|
||||
unrecoverableTabletIds = HashMultimap.create();
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -140,8 +145,11 @@ public class StatisticProcDir implements ProcDirInterface {
|
||||
|
||||
// here we treat REDUNDANT as HEALTHY, for user friendly.
|
||||
if (res.first != TabletStatus.HEALTHY && res.first != TabletStatus.REDUNDANT
|
||||
&& res.first != TabletStatus.COLOCATE_REDUNDANT && res.first != TabletStatus.NEED_FURTHER_REPAIR) {
|
||||
&& res.first != TabletStatus.COLOCATE_REDUNDANT && res.first != TabletStatus.NEED_FURTHER_REPAIR
|
||||
&& res.first != TabletStatus.UNRECOVERABLE) {
|
||||
unhealthyTabletIds.put(dbId, tablet.getId());
|
||||
} else if (res.first == TabletStatus.UNRECOVERABLE) {
|
||||
unrecoverableTabletIds.put(dbId, tablet.getId());
|
||||
}
|
||||
|
||||
if (!tablet.isConsistent()) {
|
||||
@ -166,6 +174,7 @@ public class StatisticProcDir implements ProcDirInterface {
|
||||
oneLine.add(unhealthyTabletIds.get(dbId).size());
|
||||
oneLine.add(inconsistentTabletIds.get(dbId).size());
|
||||
oneLine.add(cloningTabletIds.get(dbId).size());
|
||||
oneLine.add(unrecoverableTabletIds.get(dbId).size());
|
||||
|
||||
lines.add(oneLine);
|
||||
|
||||
@ -195,6 +204,7 @@ public class StatisticProcDir implements ProcDirInterface {
|
||||
finalLine.add(unhealthyTabletIds.size());
|
||||
finalLine.add(inconsistentTabletIds.size());
|
||||
finalLine.add(cloningTabletIds.size());
|
||||
finalLine.add(unrecoverableTabletIds.size());
|
||||
lines.add(finalLine);
|
||||
|
||||
// add result
|
||||
@ -224,7 +234,8 @@ public class StatisticProcDir implements ProcDirInterface {
|
||||
}
|
||||
|
||||
return new IncompleteTabletsProcNode(unhealthyTabletIds.get(dbId),
|
||||
inconsistentTabletIds.get(dbId),
|
||||
cloningTabletIds.get(dbId));
|
||||
inconsistentTabletIds.get(dbId),
|
||||
cloningTabletIds.get(dbId),
|
||||
unrecoverableTabletIds.get(dbId));
|
||||
}
|
||||
}
|
||||
|
||||
@ -65,6 +65,9 @@ public class HttpServer extends SpringBootServletInitializer {
|
||||
properties.put("spring.http.encoding.force", true);
|
||||
properties.put("spring.servlet.multipart.max-file-size", this.maxFileSize);
|
||||
properties.put("spring.servlet.multipart.max-request-size", this.maxRequestSize);
|
||||
// This is to disable the spring-boot-devtools restart feature.
|
||||
// To avoid some unexpected behavior.
|
||||
System.setProperty("spring.devtools.restart.enabled", "false");
|
||||
properties.put("logging.config", dorisHome + "/conf/" + SpringLog4j2Config.SPRING_LOG_XML_FILE);
|
||||
new SpringApplicationBuilder()
|
||||
.sources(HttpServer.class)
|
||||
|
||||
@ -1014,7 +1014,8 @@ public class ReportHandler extends Daemon {
|
||||
db.getClusterName(), visibleVersion, visibleVersionHash,
|
||||
replicationNum, aliveBeIdsInCluster);
|
||||
|
||||
if (status.first == TabletStatus.VERSION_INCOMPLETE || status.first == TabletStatus.REPLICA_MISSING) {
|
||||
if (status.first == TabletStatus.VERSION_INCOMPLETE || status.first == TabletStatus.REPLICA_MISSING
|
||||
|| status.first == TabletStatus.UNRECOVERABLE) {
|
||||
long lastFailedVersion = -1L;
|
||||
long lastFailedVersionHash = 0L;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user