diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Tablet.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/Tablet.java index 361d6ec126..b7288dbe24 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Tablet.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Tablet.java @@ -62,6 +62,7 @@ public class Tablet extends MetaObject implements Writable { COLOCATE_MISMATCH, // replicas do not all locate in right colocate backends set. COLOCATE_REDUNDANT, // replicas match the colocate backends set, but redundant. NEED_FURTHER_REPAIR, // one of replicas need a definite repair. + UNRECOVERABLE // none of the replicas are healthy } @SerializedName(value = "id") @@ -455,7 +456,9 @@ public class Tablet extends MetaObject implements Writable { // 1. alive replicas are not enough int aliveBackendsNum = aliveBeIdsInCluster.size(); - if (alive < replicationNum && replicas.size() >= aliveBackendsNum + if (alive == 0) { + return Pair.create(TabletStatus.UNRECOVERABLE, Priority.VERY_HIGH); + } else if (alive < replicationNum && replicas.size() >= aliveBackendsNum && aliveBackendsNum >= replicationNum && replicationNum > 1) { // there is no enough backend for us to create a new replica, so we have to delete an existing replica, // so there can be available backend for us to create a new replica. @@ -473,7 +476,9 @@ public class Tablet extends MetaObject implements Writable { } // 2. 
version complete replicas are not enough - if (aliveAndVersionComplete < (replicationNum / 2) + 1) { + if (aliveAndVersionComplete == 0) { + return Pair.create(TabletStatus.UNRECOVERABLE, Priority.VERY_HIGH); + } else if (aliveAndVersionComplete < (replicationNum / 2) + 1) { return Pair.create(TabletStatus.VERSION_INCOMPLETE, TabletSchedCtx.Priority.HIGH); } else if (aliveAndVersionComplete < replicationNum) { return Pair.create(TabletStatus.VERSION_INCOMPLETE, TabletSchedCtx.Priority.NORMAL); diff --git a/fe/fe-core/src/main/java/org/apache/doris/clone/TabletChecker.java b/fe/fe-core/src/main/java/org/apache/doris/clone/TabletChecker.java index 4e375c1000..2c9666d1b6 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/clone/TabletChecker.java +++ b/fe/fe-core/src/main/java/org/apache/doris/clone/TabletChecker.java @@ -329,6 +329,11 @@ public class TabletChecker extends MasterDaemon { // Only set last status check time when status is healthy. tablet.setLastStatusCheckTime(startTime); continue; + } else if (statusWithPrio.first == TabletStatus.UNRECOVERABLE) { + // This tablet is not recoverable, do not set it into tablet scheduler + // all UNRECOVERABLE tablets can be seen from "show proc '/statistic'" + counter.unhealthyTabletNum++; + continue; } else if (isInPrios) { statusWithPrio.second = TabletSchedCtx.Priority.VERY_HIGH; prioPartIsHealthy = false; diff --git a/fe/fe-core/src/main/java/org/apache/doris/clone/TabletScheduler.java b/fe/fe-core/src/main/java/org/apache/doris/clone/TabletScheduler.java index 0794296781..9c4b2b4c5b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/clone/TabletScheduler.java +++ b/fe/fe-core/src/main/java/org/apache/doris/clone/TabletScheduler.java @@ -575,17 +575,19 @@ public class TabletScheduler extends MasterDaemon { case FORCE_REDUNDANT: handleRedundantReplica(tabletCtx, true); break; - case REPLICA_MISSING_IN_CLUSTER: - handleReplicaClusterMigration(tabletCtx, batchTask); - break; - case COLOCATE_MISMATCH: - 
handleColocateMismatch(tabletCtx, batchTask); - break; - case COLOCATE_REDUNDANT: - handleColocateRedundant(tabletCtx); - break; - default: - break; + case REPLICA_MISSING_IN_CLUSTER: + handleReplicaClusterMigration(tabletCtx, batchTask); + break; + case COLOCATE_MISMATCH: + handleColocateMismatch(tabletCtx, batchTask); + break; + case COLOCATE_REDUNDANT: + handleColocateRedundant(tabletCtx); + break; + case UNRECOVERABLE: + throw new SchedException(Status.UNRECOVERABLE, "tablet is unrecoverable"); + default: + break; } } else { // balance diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/proc/IncompleteTabletsProcNode.java b/fe/fe-core/src/main/java/org/apache/doris/common/proc/IncompleteTabletsProcNode.java index b278c47cc1..4cdf5de714 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/proc/IncompleteTabletsProcNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/proc/IncompleteTabletsProcNode.java @@ -29,20 +29,23 @@ import java.util.List; public class IncompleteTabletsProcNode implements ProcNodeInterface { public static final ImmutableList TITLE_NAMES = new ImmutableList.Builder() - .add("UnhealthyTablets").add("InconsistentTablets").add("CloningTablets") + .add("UnhealthyTablets").add("InconsistentTablets").add("CloningTablets").add("BadTablets") .build(); private static final Joiner JOINER = Joiner.on(","); Collection unhealthyTabletIds; Collection inconsistentTabletIds; Collection cloningTabletIds; + Collection unrecoverableTabletIds; public IncompleteTabletsProcNode(Collection unhealthyTabletIds, Collection inconsistentTabletIds, - Collection cloningTabletIds) { + Collection cloningTabletIds, + Collection unrecoverableTabletIds) { this.unhealthyTabletIds = unhealthyTabletIds; this.inconsistentTabletIds = inconsistentTabletIds; this.cloningTabletIds = cloningTabletIds; + this.unrecoverableTabletIds = unrecoverableTabletIds; } @Override @@ -56,9 +59,11 @@ public class IncompleteTabletsProcNode implements ProcNodeInterface { 
String incompleteTablets = JOINER.join(Arrays.asList(unhealthyTabletIds)); String inconsistentTablets = JOINER.join(Arrays.asList(inconsistentTabletIds)); String cloningTablets = JOINER.join(Arrays.asList(cloningTabletIds)); + String unrecoverableTablets = JOINER.join(Arrays.asList(unrecoverableTabletIds)); row.add(incompleteTablets); row.add(inconsistentTablets); row.add(cloningTablets); + row.add(unrecoverableTablets); result.addRow(row); diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/proc/StatisticProcDir.java b/fe/fe-core/src/main/java/org/apache/doris/common/proc/StatisticProcDir.java index 001f00c00c..596267cb42 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/proc/StatisticProcDir.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/proc/StatisticProcDir.java @@ -17,10 +17,6 @@ package org.apache.doris.common.proc; -import com.google.common.base.Preconditions; -import com.google.common.collect.HashMultimap; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.Multimap; import org.apache.doris.catalog.Catalog; import org.apache.doris.catalog.Database; import org.apache.doris.catalog.MaterializedIndex; @@ -38,6 +34,12 @@ import org.apache.doris.common.util.ListComparator; import org.apache.doris.system.SystemInfoService; import org.apache.doris.task.AgentTaskQueue; import org.apache.doris.thrift.TTaskType; + +import com.google.common.base.Preconditions; +import com.google.common.collect.HashMultimap; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Multimap; + import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -49,7 +51,7 @@ public class StatisticProcDir implements ProcDirInterface { public static final ImmutableList TITLE_NAMES = new ImmutableList.Builder() .add("DbId").add("DbName").add("TableNum").add("PartitionNum") .add("IndexNum").add("TabletNum").add("ReplicaNum").add("UnhealthyTabletNum") - 
.add("InconsistentTabletNum").add("CloningTabletNum") + .add("InconsistentTabletNum").add("CloningTabletNum").add("BadTabletNum") .build(); private static final Logger LOG = LogManager.getLogger(StatisticProcDir.class); @@ -61,12 +63,15 @@ public class StatisticProcDir implements ProcDirInterface { Multimap inconsistentTabletIds; // db id -> set(tablet id) Multimap cloningTabletIds; + // db id -> set(tablet id) + Multimap unrecoverableTabletIds; public StatisticProcDir(Catalog catalog) { this.catalog = catalog; unhealthyTabletIds = HashMultimap.create(); inconsistentTabletIds = HashMultimap.create(); cloningTabletIds = HashMultimap.create(); + unrecoverableTabletIds = HashMultimap.create(); } @Override @@ -140,8 +145,11 @@ public class StatisticProcDir implements ProcDirInterface { // here we treat REDUNDANT as HEALTHY, for user friendly. if (res.first != TabletStatus.HEALTHY && res.first != TabletStatus.REDUNDANT - && res.first != TabletStatus.COLOCATE_REDUNDANT && res.first != TabletStatus.NEED_FURTHER_REPAIR) { + && res.first != TabletStatus.COLOCATE_REDUNDANT && res.first != TabletStatus.NEED_FURTHER_REPAIR + && res.first != TabletStatus.UNRECOVERABLE) { unhealthyTabletIds.put(dbId, tablet.getId()); + } else if (res.first == TabletStatus.UNRECOVERABLE) { + unrecoverableTabletIds.put(dbId, tablet.getId()); } if (!tablet.isConsistent()) { @@ -166,6 +174,7 @@ public class StatisticProcDir implements ProcDirInterface { oneLine.add(unhealthyTabletIds.get(dbId).size()); oneLine.add(inconsistentTabletIds.get(dbId).size()); oneLine.add(cloningTabletIds.get(dbId).size()); + oneLine.add(unrecoverableTabletIds.get(dbId).size()); lines.add(oneLine); @@ -195,6 +204,7 @@ public class StatisticProcDir implements ProcDirInterface { finalLine.add(unhealthyTabletIds.size()); finalLine.add(inconsistentTabletIds.size()); finalLine.add(cloningTabletIds.size()); + finalLine.add(unrecoverableTabletIds.size()); lines.add(finalLine); // add result @@ -224,7 +234,8 @@ public class 
StatisticProcDir implements ProcDirInterface { } return new IncompleteTabletsProcNode(unhealthyTabletIds.get(dbId), - inconsistentTabletIds.get(dbId), - cloningTabletIds.get(dbId)); + inconsistentTabletIds.get(dbId), + cloningTabletIds.get(dbId), + unrecoverableTabletIds.get(dbId)); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/httpv2/HttpServer.java b/fe/fe-core/src/main/java/org/apache/doris/httpv2/HttpServer.java index 0e5782551c..8f062ea2ba 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/httpv2/HttpServer.java +++ b/fe/fe-core/src/main/java/org/apache/doris/httpv2/HttpServer.java @@ -65,6 +65,9 @@ public class HttpServer extends SpringBootServletInitializer { properties.put("spring.http.encoding.force", true); properties.put("spring.servlet.multipart.max-file-size", this.maxFileSize); properties.put("spring.servlet.multipart.max-request-size", this.maxRequestSize); + // This is to disable the spring-boot-devtools restart feature. + // To avoid some unexpected behavior. 
+ System.setProperty("spring.devtools.restart.enabled", "false"); properties.put("logging.config", dorisHome + "/conf/" + SpringLog4j2Config.SPRING_LOG_XML_FILE); new SpringApplicationBuilder() .sources(HttpServer.class) diff --git a/fe/fe-core/src/main/java/org/apache/doris/master/ReportHandler.java b/fe/fe-core/src/main/java/org/apache/doris/master/ReportHandler.java index 1782b796cf..31a2dc4ab8 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/master/ReportHandler.java +++ b/fe/fe-core/src/main/java/org/apache/doris/master/ReportHandler.java @@ -1014,7 +1014,8 @@ public class ReportHandler extends Daemon { db.getClusterName(), visibleVersion, visibleVersionHash, replicationNum, aliveBeIdsInCluster); - if (status.first == TabletStatus.VERSION_INCOMPLETE || status.first == TabletStatus.REPLICA_MISSING) { + if (status.first == TabletStatus.VERSION_INCOMPLETE || status.first == TabletStatus.REPLICA_MISSING + || status.first == TabletStatus.UNRECOVERABLE) { long lastFailedVersion = -1L; long lastFailedVersionHash = 0L;