[feature](diagnose) diagnose for cluster balance (#26085)

This commit is contained in:
yujun
2023-11-10 15:31:58 +08:00
committed by GitHub
parent 4ebb517af0
commit 0749d632c4
13 changed files with 656 additions and 72 deletions

View File

@ -245,7 +245,7 @@ public class BeLoadRebalancer extends Rebalancer {
clusterStat.getBackendStatisticByClass(lowBe, midBe, highBe, tabletCtx.getStorageMedium());
if (lowBe.isEmpty() && highBe.isEmpty()) {
throw new SchedException(Status.UNRECOVERABLE, "cluster is balance");
throw new SchedException(Status.UNRECOVERABLE, SubCode.DIAGNOSE_IGNORE, "cluster is balance");
}
// if all low backends is not available, return
@ -265,12 +265,14 @@ public class BeLoadRebalancer extends Rebalancer {
}
Backend be = infoService.getBackend(replica.getBackendId());
if (be == null) {
throw new SchedException(Status.UNRECOVERABLE, "backend is dropped: " + replica.getBackendId());
throw new SchedException(Status.UNRECOVERABLE, SubCode.DIAGNOSE_IGNORE,
"backend is dropped: " + replica.getBackendId());
}
hosts.add(be.getHost());
}
if (!hasHighReplica) {
throw new SchedException(Status.UNRECOVERABLE, "no replica on high load backend");
throw new SchedException(Status.UNRECOVERABLE, SubCode.DIAGNOSE_IGNORE,
"no replica on high load backend");
}
// select a replica as source
@ -288,7 +290,7 @@ public class BeLoadRebalancer extends Rebalancer {
}
}
if (!setSource) {
throw new SchedException(Status.UNRECOVERABLE, "unable to take src slot");
throw new SchedException(Status.UNRECOVERABLE, SubCode.DIAGNOSE_IGNORE, "unable to take src slot");
}
// Select a low load backend as destination.
@ -331,7 +333,7 @@ public class BeLoadRebalancer extends Rebalancer {
}
if (candidates.isEmpty()) {
throw new SchedException(Status.UNRECOVERABLE, "unable to find low backend");
throw new SchedException(Status.UNRECOVERABLE, SubCode.DIAGNOSE_IGNORE, "unable to find low backend");
}
List<BePathLoadStatPair> candFitPaths = Lists.newArrayList();

View File

@ -26,6 +26,7 @@ import org.apache.doris.catalog.Replica;
import org.apache.doris.catalog.TabletInvertedIndex;
import org.apache.doris.catalog.TabletMeta;
import org.apache.doris.clone.SchedException.Status;
import org.apache.doris.clone.SchedException.SubCode;
import org.apache.doris.clone.TabletSchedCtx.BalanceType;
import org.apache.doris.clone.TabletSchedCtx.Priority;
import org.apache.doris.clone.TabletScheduler.PathSlot;
@ -296,23 +297,26 @@ public class DiskRebalancer extends Rebalancer {
Replica replica = invertedIndex.getReplica(tabletCtx.getTabletId(), tabletCtx.getTempSrcBackendId());
// check src replica still there
if (replica == null || replica.getPathHash() != tabletCtx.getTempSrcPathHash()) {
throw new SchedException(Status.UNRECOVERABLE, "src replica may be rebalanced");
throw new SchedException(Status.UNRECOVERABLE, SubCode.DIAGNOSE_IGNORE, "src replica may be rebalanced");
}
// ignore empty replicas as they do not make disk more balance
if (replica.getDataSize() == 0) {
throw new SchedException(Status.UNRECOVERABLE, "size of src replica is zero");
throw new SchedException(Status.UNRECOVERABLE, SubCode.DIAGNOSE_IGNORE, "size of src replica is zero");
}
Database db = Env.getCurrentInternalCatalog().getDbOrException(tabletCtx.getDbId(),
s -> new SchedException(Status.UNRECOVERABLE, "db " + tabletCtx.getDbId() + " does not exist"));
s -> new SchedException(Status.UNRECOVERABLE, SubCode.DIAGNOSE_IGNORE,
"db " + tabletCtx.getDbId() + " does not exist"));
OlapTable tbl = (OlapTable) db.getTableOrException(tabletCtx.getTblId(),
s -> new SchedException(Status.UNRECOVERABLE, "tbl " + tabletCtx.getTblId() + " does not exist"));
s -> new SchedException(Status.UNRECOVERABLE, SubCode.DIAGNOSE_IGNORE,
"tbl " + tabletCtx.getTblId() + " does not exist"));
DataProperty dataProperty = tbl.getPartitionInfo().getDataProperty(tabletCtx.getPartitionId());
if (dataProperty == null) {
throw new SchedException(Status.UNRECOVERABLE, "data property is null");
}
String storagePolicy = dataProperty.getStoragePolicy();
if (!Strings.isNullOrEmpty(storagePolicy)) {
throw new SchedException(Status.UNRECOVERABLE, "disk balance not support for cooldown storage");
throw new SchedException(Status.UNRECOVERABLE, SubCode.DIAGNOSE_IGNORE,
"disk balance not support for cooldown storage");
}
// check src slot
@ -323,7 +327,7 @@ public class DiskRebalancer extends Rebalancer {
}
long pathHash = slot.takeBalanceSlot(replica.getPathHash());
if (pathHash == -1) {
throw new SchedException(Status.UNRECOVERABLE, "unable to take src slot");
throw new SchedException(Status.UNRECOVERABLE, SubCode.DIAGNOSE_IGNORE, "unable to take src slot");
}
// after take src slot, we can set src replica now
tabletCtx.setSrc(replica);
@ -343,7 +347,7 @@ public class DiskRebalancer extends Rebalancer {
if (pathHigh.contains(replica.getPathHash())) {
pathLow.addAll(pathMid);
} else if (!pathMid.contains(replica.getPathHash())) {
throw new SchedException(Status.UNRECOVERABLE, "src path is low load");
throw new SchedException(Status.UNRECOVERABLE, SubCode.DIAGNOSE_IGNORE, "src path is low load");
}
// check if this migration task can make the be's disks more balance.
List<RootPathLoadStatistic> availPaths = Lists.newArrayList();
@ -380,7 +384,7 @@ public class DiskRebalancer extends Rebalancer {
}
if (!setDest) {
throw new SchedException(Status.UNRECOVERABLE, "unable to find low load path");
throw new SchedException(Status.UNRECOVERABLE, SubCode.DIAGNOSE_IGNORE, "unable to find low load path");
}
}
}

View File

@ -20,6 +20,7 @@ package org.apache.doris.clone;
import org.apache.doris.catalog.Replica;
import org.apache.doris.catalog.TabletInvertedIndex;
import org.apache.doris.catalog.TabletMeta;
import org.apache.doris.clone.SchedException.SubCode;
import org.apache.doris.clone.TabletScheduler.PathSlot;
import org.apache.doris.common.Config;
import org.apache.doris.common.Pair;
@ -261,7 +262,7 @@ public class PartitionRebalancer extends Rebalancer {
if (slot.takeBalanceSlot(srcReplica.getPathHash()) != -1) {
tabletCtx.setSrc(srcReplica);
} else {
throw new SchedException(SchedException.Status.SCHEDULE_FAILED, SchedException.SubCode.WAITING_SLOT,
throw new SchedException(SchedException.Status.SCHEDULE_FAILED, SubCode.WAITING_SLOT,
"no slot for src replica " + srcReplica + ", pathHash " + srcReplica.getPathHash());
}
@ -279,7 +280,7 @@ public class PartitionRebalancer extends Rebalancer {
.map(RootPathLoadStatistic::getPathHash).collect(Collectors.toSet());
long pathHash = slot.takeAnAvailBalanceSlotFrom(availPath);
if (pathHash == -1) {
throw new SchedException(SchedException.Status.SCHEDULE_FAILED, SchedException.SubCode.WAITING_SLOT,
throw new SchedException(SchedException.Status.SCHEDULE_FAILED, SubCode.WAITING_SLOT,
"paths has no available balance slot: " + availPath);
}

View File

@ -31,6 +31,7 @@ public class SchedException extends Exception {
NONE,
WAITING_DECOMMISSION,
WAITING_SLOT,
DIAGNOSE_IGNORE, // proc '/diagnose/cluster_balance' will ignore this error
}
private Status status;

View File

@ -324,6 +324,10 @@ public class TabletSchedCtx implements Comparable<TabletSchedCtx> {
this.lastVisitedTime = lastVisitedTime;
}
public long getLastVisitedTime() {
return lastVisitedTime;
}
public void setFinishedTime(long finishedTime) {
this.finishedTime = finishedTime;
}
@ -424,6 +428,18 @@ public class TabletSchedCtx implements Comparable<TabletSchedCtx> {
this.errMsg = errMsg;
}
public String getErrMsg() {
return errMsg;
}
public SubCode getSchedFailedCode() {
return schedFailedCode;
}
public void setSchedFailedCode(SubCode code) {
schedFailedCode = code;
}
public CloneTask getCloneTask() {
return cloneTask;
}
@ -613,7 +629,7 @@ public class TabletSchedCtx implements Comparable<TabletSchedCtx> {
}
if (candidates.isEmpty()) {
throw new SchedException(Status.UNRECOVERABLE, "unable to find source replica");
throw new SchedException(Status.UNRECOVERABLE, "unable to find copy source replica");
}
// choose a replica which slot is available from candidates.
@ -637,7 +653,7 @@ public class TabletSchedCtx implements Comparable<TabletSchedCtx> {
return;
}
throw new SchedException(Status.SCHEDULE_FAILED, SubCode.WAITING_SLOT,
"unable to find source slot");
"waiting for source replica's slot");
}
/*
@ -718,7 +734,7 @@ public class TabletSchedCtx implements Comparable<TabletSchedCtx> {
if (candidates.isEmpty()) {
if (furtherRepairs.isEmpty()) {
throw new SchedException(Status.UNRECOVERABLE, "unable to choose dest replica");
throw new SchedException(Status.UNRECOVERABLE, "unable to choose copy dest replica");
}
boolean allCatchup = true;
@ -741,7 +757,7 @@ public class TabletSchedCtx implements Comparable<TabletSchedCtx> {
if (slot == null || !slot.hasAvailableSlot(replica.getPathHash())) {
if (!replica.needFurtherRepair()) {
throw new SchedException(Status.SCHEDULE_FAILED, SubCode.WAITING_SLOT,
"replica " + replica + " has not slot");
"dest replica " + replica + " has no slot");
}
continue;
@ -1063,20 +1079,23 @@ public class TabletSchedCtx implements Comparable<TabletSchedCtx> {
// 1. check the tablet status first
Database db = Env.getCurrentInternalCatalog().getDbOrException(dbId,
s -> new SchedException(Status.UNRECOVERABLE, "db " + dbId + " does not exist"));
s -> new SchedException(Status.UNRECOVERABLE, SubCode.DIAGNOSE_IGNORE,
"db " + dbId + " does not exist"));
OlapTable olapTable = (OlapTable) db.getTableOrException(tblId,
s -> new SchedException(Status.UNRECOVERABLE, "tbl " + tabletId + " does not exist"));
s -> new SchedException(Status.UNRECOVERABLE, SubCode.DIAGNOSE_IGNORE,
"tbl " + tabletId + " does not exist"));
olapTable.writeLockOrException(new SchedException(Status.UNRECOVERABLE, "table "
+ olapTable.getName() + " does not exist"));
try {
Partition partition = olapTable.getPartition(partitionId);
if (partition == null) {
throw new SchedException(Status.UNRECOVERABLE, "partition does not exist");
throw new SchedException(Status.UNRECOVERABLE, SubCode.DIAGNOSE_IGNORE,
"partition does not exist");
}
MaterializedIndex index = partition.getIndex(indexId);
if (index == null) {
throw new SchedException(Status.UNRECOVERABLE, "index does not exist");
throw new SchedException(Status.UNRECOVERABLE, SubCode.DIAGNOSE_IGNORE, "index does not exist");
}
if (schemaHash != olapTable.getSchemaHashByIndexId(indexId)) {
@ -1116,7 +1135,7 @@ public class TabletSchedCtx implements Comparable<TabletSchedCtx> {
// check if replica exist
Replica replica = tablet.getReplicaByBackendId(destBackendId);
if (replica == null) {
throw new SchedException(Status.UNRECOVERABLE,
throw new SchedException(Status.UNRECOVERABLE, SubCode.DIAGNOSE_IGNORE,
"replica does not exist. backend id: " + destBackendId);
}

View File

@ -393,6 +393,7 @@ public class TabletScheduler extends MasterDaemon {
throw new SchedException(Status.FINISHED, "tablet scheduler is disabled");
}
if (Config.disable_balance && tabletCtx.getType() == Type.BALANCE) {
tabletCtx.setSchedFailedCode(SubCode.DIAGNOSE_IGNORE);
finalizeTabletCtx(tabletCtx, TabletSchedCtx.State.CANCELLED, Status.UNRECOVERABLE,
"config disable balance");
continue;
@ -420,6 +421,7 @@ public class TabletScheduler extends MasterDaemon {
Preconditions.checkState(e.getStatus() == Status.UNRECOVERABLE, e.getStatus());
// discard
stat.counterTabletScheduledDiscard.incrementAndGet();
tabletCtx.setSchedFailedCode(e.getSubCode());
finalizeTabletCtx(tabletCtx, TabletSchedCtx.State.CANCELLED, e.getStatus(), e.getMessage());
}
continue;
@ -427,6 +429,7 @@ public class TabletScheduler extends MasterDaemon {
LOG.warn("got unexpected exception, discard this schedule. tablet: {}",
tabletCtx.getTabletId(), e);
stat.counterTabletScheduledFailed.incrementAndGet();
tabletCtx.setSchedFailedCode(SubCode.NONE);
finalizeTabletCtx(tabletCtx, TabletSchedCtx.State.UNEXPECTED, Status.UNRECOVERABLE, e.getMessage());
continue;
}
@ -475,11 +478,13 @@ public class TabletScheduler extends MasterDaemon {
Pair<TabletStatus, TabletSchedCtx.Priority> statusPair;
Database db = Env.getCurrentInternalCatalog().getDbOrException(tabletCtx.getDbId(),
s -> new SchedException(Status.UNRECOVERABLE, "db " + tabletCtx.getDbId() + " does not exist"));
s -> new SchedException(Status.UNRECOVERABLE, SubCode.DIAGNOSE_IGNORE,
"db " + tabletCtx.getDbId() + " does not exist"));
OlapTable tbl = (OlapTable) db.getTableOrException(tabletCtx.getTblId(),
s -> new SchedException(Status.UNRECOVERABLE, "tbl " + tabletCtx.getTblId() + " does not exist"));
tbl.writeLockOrException(new SchedException(Status.UNRECOVERABLE, "table "
+ tbl.getName() + " does not exist"));
s -> new SchedException(Status.UNRECOVERABLE, SubCode.DIAGNOSE_IGNORE,
"tbl " + tabletCtx.getTblId() + " does not exist"));
tbl.writeLockOrException(new SchedException(Status.UNRECOVERABLE,
"table " + tbl.getName() + " does not exist"));
try {
long tabletId = tabletCtx.getTabletId();
@ -489,12 +494,12 @@ public class TabletScheduler extends MasterDaemon {
Partition partition = tbl.getPartition(tabletCtx.getPartitionId());
if (partition == null) {
throw new SchedException(Status.UNRECOVERABLE, "partition does not exist");
throw new SchedException(Status.UNRECOVERABLE, SubCode.DIAGNOSE_IGNORE, "partition does not exist");
}
MaterializedIndex idx = partition.getIndex(tabletCtx.getIndexId());
if (idx == null) {
throw new SchedException(Status.UNRECOVERABLE, "index does not exist");
throw new SchedException(Status.UNRECOVERABLE, SubCode.DIAGNOSE_IGNORE, "index does not exist");
}
ReplicaAllocation replicaAlloc = null;
@ -503,7 +508,8 @@ public class TabletScheduler extends MasterDaemon {
if (isColocateTable) {
GroupId groupId = colocateTableIndex.getGroup(tbl.getId());
if (groupId == null) {
throw new SchedException(Status.UNRECOVERABLE, "colocate group does not exist");
throw new SchedException(Status.UNRECOVERABLE, SubCode.DIAGNOSE_IGNORE,
"colocate group does not exist");
}
ColocateGroupSchema groupSchema = colocateTableIndex.getGroupSchema(groupId);
if (groupSchema == null) {
@ -543,19 +549,20 @@ public class TabletScheduler extends MasterDaemon {
}
}
if (tabletCtx.getType() == TabletSchedCtx.Type.BALANCE && tableState != OlapTableState.NORMAL) {
// If table is under ALTER process, do not allow to do balance.
throw new SchedException(Status.UNRECOVERABLE, "table's state is not NORMAL");
}
if (tabletCtx.getType() == TabletSchedCtx.Type.BALANCE) {
if (tableState != OlapTableState.NORMAL) {
// If table is under ALTER process, do not allow to do balance.
throw new SchedException(Status.UNRECOVERABLE, SubCode.DIAGNOSE_IGNORE,
"table's state is not NORMAL");
}
try {
DatabaseTransactionMgr dbTransactionMgr
= Env.getCurrentGlobalTransactionMgr().getDatabaseTransactionMgr(db.getId());
for (TransactionState transactionState : dbTransactionMgr.getPreCommittedTxnList()) {
if (transactionState.getTableIdList().contains(tbl.getId())) {
// If table releate to transaction with precommitted status, do not allow to do balance.
throw new SchedException(Status.UNRECOVERABLE,
throw new SchedException(Status.UNRECOVERABLE, SubCode.DIAGNOSE_IGNORE,
"There exists PRECOMMITTED transaction related to table");
}
}
@ -572,18 +579,19 @@ public class TabletScheduler extends MasterDaemon {
// The WAITING_STABLE state is an exception. This state indicates that the table is
// executing an alter job, but the alter job is in a PENDING state and is waiting for
// the table to become stable. In this case, we allow the tablet repair to proceed.
throw new SchedException(Status.UNRECOVERABLE,
throw new SchedException(Status.UNRECOVERABLE, SubCode.DIAGNOSE_IGNORE,
"table is in alter process, but tablet status is " + statusPair.first.name());
}
tabletCtx.setTabletStatus(statusPair.first);
if (statusPair.first == TabletStatus.HEALTHY && tabletCtx.getType() == TabletSchedCtx.Type.REPAIR) {
throw new SchedException(Status.UNRECOVERABLE, "tablet is healthy");
throw new SchedException(Status.UNRECOVERABLE, SubCode.DIAGNOSE_IGNORE, "tablet is healthy");
} else if (statusPair.first != TabletStatus.HEALTHY
&& tabletCtx.getType() == TabletSchedCtx.Type.BALANCE) {
// we select an unhealthy tablet to do balance, which is not right.
// so here we stop this task.
throw new SchedException(Status.UNRECOVERABLE, "tablet is unhealthy when doing balance");
throw new SchedException(Status.UNRECOVERABLE, SubCode.DIAGNOSE_IGNORE,
"tablet is unhealthy when doing balance");
}
// for disk balance more accurately, we only schedule tablet when has lastly stat info about disk
@ -667,7 +675,7 @@ public class TabletScheduler extends MasterDaemon {
handleReplicaTooSlow(tabletCtx);
break;
case UNRECOVERABLE:
throw new SchedException(Status.UNRECOVERABLE, "tablet is unrecoverable");
throw new SchedException(Status.UNRECOVERABLE, SubCode.DIAGNOSE_IGNORE, "tablet is unrecoverable");
default:
break;
}
@ -1090,8 +1098,9 @@ public class TabletScheduler extends MasterDaemon {
if (!otherCatchup) {
LOG.info("can not delete only one replica, tabletId = {} replicaId = {}", tabletCtx.getTabletId(),
replica.getId());
throw new SchedException(Status.UNRECOVERABLE, "the only one latest replia can not be dropped, tabletId = "
+ tabletCtx.getTabletId() + ", replicaId = " + replica.getId());
throw new SchedException(Status.UNRECOVERABLE, SubCode.DIAGNOSE_IGNORE,
"the only one latest replia can not be dropped, tabletId = "
+ tabletCtx.getTabletId() + ", replicaId = " + replica.getId());
}
/*
@ -1450,7 +1459,7 @@ public class TabletScheduler extends MasterDaemon {
if (hasBePath) {
throw new SchedException(Status.SCHEDULE_FAILED, SubCode.WAITING_SLOT,
"unable to find dest path which can be fit in");
"waiting for dest replica slot");
} else {
throw new SchedException(Status.UNRECOVERABLE,
"unable to find dest path which can be fit in");
@ -1790,18 +1799,27 @@ public class TabletScheduler extends MasterDaemon {
}
public List<List<String>> getPendingTabletsInfo(int limit) {
List<TabletSchedCtx> tabletCtxs = getCopiedTablets(pendingTablets, limit);
return collectTabletCtx(tabletCtxs);
return collectTabletCtx(getPendingTablets(limit));
}
public List<TabletSchedCtx> getPendingTablets(int limit) {
return getCopiedTablets(pendingTablets, limit);
}
public List<List<String>> getRunningTabletsInfo(int limit) {
List<TabletSchedCtx> tabletCtxs = getCopiedTablets(runningTablets.values(), limit);
return collectTabletCtx(tabletCtxs);
return collectTabletCtx(getRunningTablets(limit));
}
public List<TabletSchedCtx> getRunningTablets(int limit) {
return getCopiedTablets(runningTablets.values(), limit);
}
public List<List<String>> getHistoryTabletsInfo(int limit) {
List<TabletSchedCtx> tabletCtxs = getCopiedTablets(schedHistory, limit);
return collectTabletCtx(tabletCtxs);
return collectTabletCtx(getHistoryTablets(limit));
}
public List<TabletSchedCtx> getHistoryTablets(int limit) {
return getCopiedTablets(schedHistory, limit);
}
private List<List<String>> collectTabletCtx(List<TabletSchedCtx> tabletCtxs) {

View File

@ -0,0 +1,307 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package org.apache.doris.common.proc;
import org.apache.doris.catalog.ColocateTableIndex.GroupId;
import org.apache.doris.catalog.Env;
import org.apache.doris.catalog.TabletInvertedIndex;
import org.apache.doris.clone.BackendLoadStatistic;
import org.apache.doris.clone.BackendLoadStatistic.Classification;
import org.apache.doris.clone.LoadStatisticForTag;
import org.apache.doris.clone.RootPathLoadStatistic;
import org.apache.doris.clone.SchedException.SubCode;
import org.apache.doris.clone.TabletSchedCtx;
import org.apache.doris.clone.TabletScheduler;
import org.apache.doris.common.Config;
import org.apache.doris.common.proc.DiagnoseProcDir.DiagnoseItem;
import org.apache.doris.common.proc.DiagnoseProcDir.DiagnoseStatus;
import org.apache.doris.common.proc.DiagnoseProcDir.SubProcDir;
import org.apache.doris.common.proc.TabletHealthProcDir.DBTabletStatistic;
import org.apache.doris.ha.FrontendNodeType;
import org.apache.doris.resource.Tag;
import org.apache.doris.system.SystemInfoService;
import org.apache.doris.thrift.TStorageMedium;
import com.google.common.collect.Lists;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/*
* show proc "/diagnose/cluster_balance";
*/
public class DiagnoseClusterBalanceProcDir extends SubProcDir {
@Override
public List<DiagnoseItem> getDiagnoseResult() {
long now = System.currentTimeMillis();
long minToMs = 60 * 1000L;
Env env = Env.getCurrentEnv();
TabletScheduler tabletScheduler = env.getTabletScheduler();
List<TabletSchedCtx> pendingTablets = tabletScheduler.getPendingTablets(1);
List<TabletSchedCtx> runningTablets = tabletScheduler.getRunningTablets(1);
List<TabletSchedCtx> historyTablets = tabletScheduler.getHistoryTablets(10000);
long historyLastVisitTime = historyTablets.stream()
.mapToLong(tablet -> Math.max(tablet.getCreateTime(), tablet.getLastVisitedTime()))
.max().orElse(-1);
boolean schedReady = env.getFrontends(null).stream().anyMatch(
fe -> fe.isAlive() && now >= fe.getLastStartupTime() + 1 * minToMs
&& (fe.getRole() == FrontendNodeType.MASTER || fe.getRole() == FrontendNodeType.FOLLOWER));
boolean schedRecent = !pendingTablets.isEmpty() || !runningTablets.isEmpty()
|| historyLastVisitTime >= now - 15 * minToMs;
List<DiagnoseItem> items = Lists.newArrayList();
items.add(diagnoseTabletHealth(schedReady, schedRecent));
DiagnoseItem baseBalance = diagnoseBaseBalance(schedReady, schedRecent);
items.add(baseBalance);
items.add(diagnoseDiskBalance(schedReady, schedRecent, baseBalance.status == DiagnoseStatus.OK));
items.add(diagnoseColocateRebalance(schedReady, schedRecent));
items.add(diagnoseHistorySched(historyTablets,
items.stream().allMatch(item -> item.status == DiagnoseStatus.OK)));
return items;
}
private DiagnoseItem diagnoseTabletHealth(boolean schedReady, boolean schedRecent) {
DiagnoseItem tabletHealth = new DiagnoseItem();
tabletHealth.name = "Tablet Health";
tabletHealth.status = DiagnoseStatus.OK;
Env env = Env.getCurrentEnv();
List<DBTabletStatistic> statistics = env.getInternalCatalog().getDbIds().parallelStream()
// skip information_schema database
.flatMap(id -> Stream.of(id == 0 ? null : env.getInternalCatalog().getDbNullable(id)))
.filter(Objects::nonNull).map(DBTabletStatistic::new)
// sort by dbName
.sorted(Comparator.comparing(db -> db.db.getFullName())).collect(Collectors.toList());
DBTabletStatistic total = statistics.stream().reduce(new DBTabletStatistic(), DBTabletStatistic::reduce);
if (total.tabletNum != total.healthyNum) {
tabletHealth.status = DiagnoseStatus.ERROR;
tabletHealth.content = String.format("healthy tablet num %s < total tablet num %s",
total.healthyNum, total.tabletNum);
tabletHealth.detailCmd = "show proc \"/cluster_health/tablet_health\";";
boolean changeWarning = total.unrecoverableNum == 0;
if (Config.disable_tablet_scheduler) {
tabletHealth.suggestion = "has disable tablet balance, ensure master fe config: "
+ "disable_tablet_scheduler = false";
} else if (!schedReady) {
tabletHealth.suggestion = "check all fe are ready, and then wait some minutes for "
+ "sheduler to migrate tablets";
} else if (schedRecent) {
tabletHealth.suggestion = "tablet is still scheduling, run 'show proc \"/cluster_balance\"'";
} else {
changeWarning = false;
}
if (changeWarning) {
tabletHealth.status = DiagnoseStatus.WARNING;
}
}
return tabletHealth;
}
private DiagnoseItem diagnoseBaseBalance(boolean schedReady, boolean schedRecent) {
SystemInfoService infoService = Env.getCurrentSystemInfo();
Map<Tag, LoadStatisticForTag> loadStatisticMap = Env.getCurrentEnv().getTabletScheduler().getStatisticMap();
DiagnoseItem baseBalance = new DiagnoseItem();
baseBalance.status = DiagnoseStatus.OK;
// check base balance
List<Long> availableBeIds = infoService.getAllBackendIds(true).stream()
.filter(beId -> infoService.checkBackendScheduleAvailable(beId))
.collect(Collectors.toList());
boolean isPartitionBal = Config.tablet_rebalancer_type.equalsIgnoreCase("partition");
if (isPartitionBal) {
TabletInvertedIndex invertedIndex = Env.getCurrentInvertedIndex();
baseBalance.name = "Partition Balance";
List<Integer> tabletNums = availableBeIds.stream()
.map(beId -> invertedIndex.getTabletNumByBackendId(beId))
.collect(Collectors.toList());
int minTabletNum = tabletNums.stream().mapToInt(v -> v).min().orElse(0);
int maxTabletNum = tabletNums.stream().mapToInt(v -> v).max().orElse(0);
if (maxTabletNum <= Math.max(minTabletNum * Config.diagnose_balance_max_tablet_num_ratio,
minTabletNum + Config.diagnose_balance_max_tablet_num_diff)) {
baseBalance.status = DiagnoseStatus.OK;
} else {
baseBalance.status = DiagnoseStatus.ERROR;
baseBalance.content = String.format("tablets not balance, be %s has %s tablets, be %s has %s tablets",
availableBeIds.get(availableBeIds.indexOf(minTabletNum)), minTabletNum,
availableBeIds.get(availableBeIds.indexOf(maxTabletNum)), maxTabletNum);
baseBalance.detailCmd = "show backends";
}
} else {
baseBalance.name = "BeLoad Balance";
baseBalance.status = DiagnoseStatus.OK;
OUTER1:
for (LoadStatisticForTag stat : loadStatisticMap.values()) {
for (TStorageMedium storageMedium : TStorageMedium.values()) {
List<Long> lowBEs = stat.getBackendLoadStatistics().stream()
.filter(be -> availableBeIds.contains(be.getBeId())
&& be.getClazz(storageMedium) == Classification.LOW)
.map(BackendLoadStatistic::getBeId)
.collect(Collectors.toList());
List<Long> highBEs = stat.getBackendLoadStatistics().stream()
.filter(be -> availableBeIds.contains(be.getBeId())
&& be.getClazz(storageMedium) == Classification.HIGH)
.map(BackendLoadStatistic::getBeId)
.collect(Collectors.toList());
if (!lowBEs.isEmpty() || !highBEs.isEmpty()) {
baseBalance.status = DiagnoseStatus.ERROR;
baseBalance.content = String.format("backend load not balance for tag %s, storage medium %s, "
+ "low load backends %s, high load backends %s",
stat.getTag(), storageMedium.name().toUpperCase(), lowBEs, highBEs);
baseBalance.detailCmd = String.format("show proc \"/cluster_balance/cluster_load_stat/%s/%s\"",
stat.getTag().toKey(), storageMedium.name().toUpperCase());
break OUTER1;
}
}
}
}
if (baseBalance.status != DiagnoseStatus.OK) {
if (Config.disable_tablet_scheduler || Config.disable_balance) {
baseBalance.suggestion = "has disable tablet balance, ensure master fe config: "
+ "disable_tablet_scheduler = false, disable_balance = false";
} else if (!schedReady) {
baseBalance.suggestion = "check all fe are ready, and then wait some minutes for "
+ "sheduler to migrate tablets";
baseBalance.status = DiagnoseStatus.WARNING;
} else if (schedRecent) {
baseBalance.suggestion = "tablet is still scheduling, run 'show proc \"/cluster_balance\"'";
baseBalance.status = DiagnoseStatus.WARNING;
}
}
return baseBalance;
}
private DiagnoseItem diagnoseDiskBalance(boolean schedReady, boolean schedRecent, boolean baseBalanceOk) {
DiagnoseItem diskBalance = new DiagnoseItem();
diskBalance.name = "Disk Balance";
diskBalance.status = DiagnoseStatus.OK;
Map<Tag, LoadStatisticForTag> loadStatisticMap = Env.getCurrentEnv().getTabletScheduler().getStatisticMap();
OUTER2:
for (LoadStatisticForTag stat : loadStatisticMap.values()) {
List<RootPathLoadStatistic> lowPaths = Lists.newArrayList();
List<RootPathLoadStatistic> midPaths = Lists.newArrayList();
List<RootPathLoadStatistic> highPaths = Lists.newArrayList();
for (TStorageMedium storageMedium : TStorageMedium.values()) {
for (BackendLoadStatistic beStat : stat.getBackendLoadStatistics()) {
lowPaths.clear();
midPaths.clear();
highPaths.clear();
beStat.getPathStatisticByClass(lowPaths, midPaths, highPaths, storageMedium);
if (!lowPaths.isEmpty() || !highPaths.isEmpty()) {
diskBalance.status = DiagnoseStatus.ERROR;
diskBalance.content = String.format("backend %s is not disk balance, low paths { %s }, "
+ "high paths { %s }", beStat.getBeId(),
lowPaths.stream().map(RootPathLoadStatistic::getPath).collect(Collectors.toList()),
highPaths.stream().map(RootPathLoadStatistic::getPath).collect(Collectors.toList()));
diskBalance.detailCmd = String.format(
"show proc \"/cluster_balance/cluster_load_stat/%s/%s/%s\"",
stat.getTag().toKey(), storageMedium.name().toUpperCase(), beStat.getBeId());
break OUTER2;
}
}
}
}
if (diskBalance.status != DiagnoseStatus.OK) {
if (Config.disable_tablet_scheduler || Config.disable_balance || Config.disable_disk_balance) {
diskBalance.suggestion = "has disable tablet balance, ensure master fe config: "
+ "disable_tablet_scheduler = false, disable_balance = false, disable_disk_balance = false";
} else if (!schedReady) {
diskBalance.suggestion = "check all fe are ready, and then wait some minutes for "
+ "sheduler to migrate tablets";
diskBalance.status = DiagnoseStatus.WARNING;
} else if (schedRecent) {
diskBalance.suggestion = "tablet is still scheduling, run 'show proc \"/cluster_balance\"'";
diskBalance.status = DiagnoseStatus.WARNING;
} else if (!baseBalanceOk) {
diskBalance.suggestion = "disk balance run after all be balance, need them finished";
diskBalance.status = DiagnoseStatus.WARNING;
}
}
return diskBalance;
}
private DiagnoseItem diagnoseColocateRebalance(boolean schedReady, boolean schedRecent) {
DiagnoseItem colocateBalance = new DiagnoseItem();
colocateBalance.status = DiagnoseStatus.OK;
colocateBalance.name = "Colocate Group Stable";
Set<GroupId> unstableGroups = Env.getCurrentEnv().getColocateTableIndex().getUnstableGroupIds();
if (!unstableGroups.isEmpty()) {
colocateBalance.status = DiagnoseStatus.ERROR;
colocateBalance.content = String.format("colocate groups are unstable: %s", unstableGroups);
colocateBalance.detailCmd = "show proc \"/colocation_group\"";
if (Config.disable_tablet_scheduler || Config.disable_colocate_balance) {
colocateBalance.suggestion = "has disable tablet balance, ensure master fe config: "
+ "disable_tablet_scheduler = false, disable_colocate_balance = false";
} else if (!schedReady) {
colocateBalance.suggestion = "check all fe are ready, and then wait some minutes for "
+ "sheduler to migrate tablets";
colocateBalance.status = DiagnoseStatus.WARNING;
} else if (schedRecent) {
colocateBalance.suggestion = "tablet is still scheduling, run 'show proc \"/cluster_balance\"'";
colocateBalance.status = DiagnoseStatus.WARNING;
}
}
return colocateBalance;
}
private DiagnoseItem diagnoseHistorySched(List<TabletSchedCtx> historyTablets, boolean ignoreErrs) {
DiagnoseItem historySched = new DiagnoseItem();
historySched.name = "History Tablet Sched";
historySched.status = DiagnoseStatus.OK;
if (!ignoreErrs) {
long now = System.currentTimeMillis();
List<TabletSchedCtx> failedTablets = historyTablets.stream()
.filter(tablet -> tablet.getLastVisitedTime() >= now - 1800 * 1000L
&& tablet.getSchedFailedCode() != SubCode.WAITING_SLOT
&& tablet.getSchedFailedCode() != SubCode.WAITING_DECOMMISSION
&& tablet.getSchedFailedCode() != SubCode.DIAGNOSE_IGNORE
&& (tablet.getState() == TabletSchedCtx.State.CANCELLED
|| tablet.getState() == TabletSchedCtx.State.UNEXPECTED))
.sorted(Comparator.comparing(TabletSchedCtx::getLastVisitedTime).reversed())
.limit(5).collect(Collectors.toList());
if (!failedTablets.isEmpty()) {
historySched.status = DiagnoseStatus.WARNING;
historySched.content = String.format("tablet sched has failed: %s", failedTablets.stream()
.map(tablet -> String.format("tablet %s error: %s", tablet.getTabletId(), tablet.getErrMsg()))
.collect(Collectors.toList()));
historySched.detailCmd = "show proc \"/cluster_balance/history_tablets\"";
}
}
return historySched;
}
}

View File

@ -0,0 +1,121 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package org.apache.doris.common.proc;
import org.apache.doris.common.AnalysisException;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
/*
* show proc "/diagnose";
*/
public class DiagnoseProcDir implements ProcDirInterface {
public static final ImmutableList<String> TITLE_NAMES = new ImmutableList.Builder<String>()
.add("Item").add("ErrorNum").add("WarningNum").build();
enum DiagnoseStatus {
OK,
WARNING,
ERROR,
}
static class DiagnoseItem {
public static final ImmutableList<String> TITLE_NAMES = new ImmutableList.Builder<String>()
.add("Item").add("Status").add("Content").add("Detail Cmd").add("Suggestion").build();
public String name = "";
public DiagnoseStatus status = DiagnoseStatus.OK;
public String content = "";
public String detailCmd = "";
public String suggestion = "";
public List<String> toRow() {
return Lists.newArrayList(name, status != null ? status.name().toUpperCase() : "", content,
detailCmd, suggestion);
}
}
static class SubProcDir implements ProcDirInterface {
public List<DiagnoseItem> getDiagnoseResult() {
return null;
}
@Override
public boolean register(String name, ProcNodeInterface node) {
return false;
}
@Override
public ProcNodeInterface lookup(String name) throws AnalysisException {
return null;
}
@Override
public ProcResult fetchResult() throws AnalysisException {
List<List<String>> rows = getDiagnoseResult().stream().map(DiagnoseItem::toRow)
.collect(Collectors.toList());
return new BaseProcResult(DiagnoseItem.TITLE_NAMES, rows);
}
}
private Map<String, SubProcDir> subDiagnoses;
DiagnoseProcDir() {
subDiagnoses = Maps.newHashMap();
subDiagnoses.put("cluster_balance", new DiagnoseClusterBalanceProcDir());
// (TODO)
//subDiagnoses.put("transactions", new DiagnoseTransactionsProcDir());
}
@Override
public boolean register(String name, ProcNodeInterface node) {
return false;
}
@Override
public ProcNodeInterface lookup(String name) throws AnalysisException {
return subDiagnoses.get(name);
}
@Override
public ProcResult fetchResult() throws AnalysisException {
List<List<String>> rows = subDiagnoses.entrySet().stream()
.sorted(Comparator.comparing(it -> it.getKey()))
.map(it -> {
List<DiagnoseItem> items = it.getValue().getDiagnoseResult();
long errNum = items.stream().filter(item -> item.status == DiagnoseStatus.ERROR).count();
long warningNum = items.stream().filter(item -> item.status == DiagnoseStatus.WARNING).count();
return Lists.newArrayList(it.getKey(), String.valueOf(errNum), String.valueOf(warningNum));
})
.collect(Collectors.toList());
long totalErrNum = rows.stream().mapToLong(row -> Long.valueOf(row.get(1))).sum();
long totalWarningNum = rows.stream().mapToLong(row -> Long.valueOf(row.get(2))).sum();
rows.add(Lists.newArrayList("Total", String.valueOf(totalErrNum), String.valueOf(totalWarningNum)));
return new BaseProcResult(TITLE_NAMES, rows);
}
}

View File

@ -57,6 +57,7 @@ public final class ProcService {
root.register("stream_loads", new StreamLoadProcNode());
root.register("colocation_group", new ColocationGroupProcDir());
root.register("bdbje", new BDBJEProcDir());
root.register("diagnose", new DiagnoseProcDir());
}
// 通过指定的路径获得对应的PROC Node