[feature](diagnose) support "ADMIN DIAGNOSE TABLET" stmt (#8839)

`ADMIN DIAGNOSE TABLET tablet_id`

This statement makes it quick and easy to diagnose the status of a tablet.
See "ADMIN-DIAGNOSE-TABLET.md" for details.

```
mysql> admin diagnose tablet 10196;
+----------------------------------+------------------------------+------------+
| Item                             | Info                         | Suggestion |
+----------------------------------+------------------------------+------------+
| TabletExist                      | Yes                          |            |
| TabletId                         | 10196                        |            |
| Database                         | default_cluster:db1: 10192   |            |
| Table                            | tbl1: 10194                  |            |
| Partition                        | tbl1: 10193                  |            |
| MaterializedIndex                | tbl1: 10195                  |            |
| Replicas(ReplicaId -> BackendId) | {"10197":10002}              |            |
| ReplicasNum                      | OK                           |            |
| ReplicaBackendStatus             | Backend 10002 is not alive.  |            |
| ReplicaVersionStatus             | OK                           |            |
| ReplicaStatus                    | OK                           |            |
| ReplicaCompactionStatus          | OK                           |            |
+----------------------------------+------------------------------+------------+
```
This commit is contained in:
Mingyu Chen
2022-04-07 11:30:03 +08:00
committed by GitHub
parent ca4055244e
commit ce50c4d826
11 changed files with 388 additions and 3 deletions

View File

@ -241,7 +241,7 @@ terminal String KW_ADD, KW_ADMIN, KW_AFTER, KW_AGGREGATE, KW_ALIAS, KW_ALL, KW_A
KW_COLLATE, KW_COLLATION, KW_COLUMN, KW_COLUMNS, KW_COMMENT, KW_COMMIT, KW_COMMITTED, KW_COMPACT,
KW_CONFIG, KW_CONNECTION, KW_CONNECTION_ID, KW_CONSISTENT, KW_CONVERT, KW_COUNT, KW_CREATE, KW_CREATION, KW_CROSS, KW_CUBE, KW_CURRENT, KW_CURRENT_USER,
KW_DATA, KW_DATABASE, KW_DATABASES, KW_DATE, KW_DATETIME, KW_DAY, KW_DECIMAL, KW_DECOMMISSION, KW_DEFAULT, KW_DESC, KW_DESCRIBE,
KW_DELETE, KW_UPDATE, KW_DISK, KW_DISTINCT, KW_DISTINCTPC, KW_DISTINCTPCSA, KW_DISTRIBUTED, KW_DISTRIBUTION, KW_DYNAMIC, KW_BUCKETS, KW_DIV, KW_DOUBLE, KW_DROP, KW_DROPP, KW_DUPLICATE,
KW_DELETE, KW_UPDATE, KW_DIAGNOSE, KW_DISK, KW_DISTINCT, KW_DISTINCTPC, KW_DISTINCTPCSA, KW_DISTRIBUTED, KW_DISTRIBUTION, KW_DYNAMIC, KW_BUCKETS, KW_DIV, KW_DOUBLE, KW_DROP, KW_DROPP, KW_DUPLICATE,
KW_ELSE, KW_ENABLE, KW_ENCRYPTKEY, KW_ENCRYPTKEYS, KW_END, KW_ENGINE, KW_ENGINES, KW_ENTER, KW_ERRORS, KW_EVENTS, KW_EXCEPT, KW_EXCLUDE,
KW_EXISTS, KW_EXPORT, KW_EXTENDED, KW_EXTERNAL, KW_EXTRACT,
KW_FALSE, KW_FEATURE, KW_FOLLOWER, KW_FOLLOWING, KW_FREE, KW_FROM, KW_FIELDS, KW_FILE, KW_FILTER, KW_FIRST, KW_FLOAT, KW_FOR, KW_FORCE, KW_FORMAT, KW_FRONTEND, KW_FRONTENDS, KW_FULL, KW_FUNCTION, KW_FUNCTIONS,
@ -5321,6 +5321,10 @@ admin_stmt ::=
{:
RESULT = new AdminCleanTrashStmt(null);
:}
| KW_ADMIN KW_DIAGNOSE KW_TABLET INTEGER_LITERAL:tabletId
{:
RESULT = new AdminDiagnoseTabletStmt(tabletId);
:}
;
truncate_stmt ::=
@ -5500,6 +5504,8 @@ keyword ::=
{: RESULT = id; :}
| KW_DECIMAL:id
{: RESULT = id; :}
| KW_DIAGNOSE:id
{: RESULT = id; :}
| KW_DISTINCTPC:id
{: RESULT = id; :}
| KW_DISTINCTPCSA:id

View File

@ -0,0 +1,76 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package org.apache.doris.analysis;
import org.apache.doris.catalog.Catalog;
import org.apache.doris.catalog.Column;
import org.apache.doris.catalog.ScalarType;
import org.apache.doris.common.ErrorCode;
import org.apache.doris.common.ErrorReport;
import org.apache.doris.common.UserException;
import org.apache.doris.mysql.privilege.PrivPredicate;
import org.apache.doris.qe.ConnectContext;
import org.apache.doris.qe.ShowResultSetMetaData;
import com.google.common.collect.ImmutableList;
/**
 * Statement object for {@code ADMIN DIAGNOSE TABLET tablet_id}.
 *
 * <p>Carries the id of the tablet to diagnose and describes the result set,
 * which has one VARCHAR(1024) column per entry of {@link #TITLE_NAMES}.
 */
public class AdminDiagnoseTabletStmt extends ShowStmt {
    /** Column titles of the result set, in display order. */
    public static final ImmutableList<String> TITLE_NAMES = ImmutableList.of("Item", "Info", "Suggestion");

    /** Id of the tablet named in the statement. */
    private final long tabletId;

    /**
     * @param tabletId id of the tablet to diagnose
     */
    public AdminDiagnoseTabletStmt(long tabletId) {
        this.tabletId = tabletId;
    }

    /**
     * Runs the parent analysis, then rejects the statement unless the current
     * connection holds the global ADMIN privilege.
     *
     * @throws UserException propagated from the parent analysis or raised by the
     *         access-denied report
     */
    @Override
    public void analyze(Analyzer analyzer) throws UserException {
        super.analyze(analyzer);

        boolean hasAdminPriv = Catalog.getCurrentCatalog().getAuth()
                .checkGlobalPriv(ConnectContext.get(), PrivPredicate.ADMIN);
        if (hasAdminPriv) {
            return;
        }
        ErrorReport.reportAnalysisException(ErrorCode.ERR_SPECIFIC_ACCESS_DENIED_ERROR, "ADMIN");
    }

    /** @return the id of the tablet to diagnose */
    public long getTabletId() {
        return tabletId;
    }

    /** @return the statement rendered back as SQL text */
    @Override
    public String toSql() {
        return "ADMIN DIAGNOSE TABLET " + tabletId;
    }

    /**
     * Builds the result-set metadata: one VARCHAR(1024) column for each title
     * in {@link #TITLE_NAMES}.
     */
    @Override
    public ShowResultSetMetaData getMetaData() {
        ShowResultSetMetaData.Builder metaBuilder = ShowResultSetMetaData.builder();
        TITLE_NAMES.forEach(name -> metaBuilder.addColumn(new Column(name, ScalarType.createVarchar(1024))));
        return metaBuilder.build();
    }

    /** @return always {@code RedirectStatus.FORWARD_NO_SYNC} */
    @Override
    public RedirectStatus getRedirectStatus() {
        return RedirectStatus.FORWARD_NO_SYNC;
    }
}

View File

@ -17,6 +17,7 @@
package org.apache.doris.qe;
import org.apache.doris.analysis.AdminDiagnoseTabletStmt;
import org.apache.doris.analysis.AdminShowConfigStmt;
import org.apache.doris.analysis.AdminShowReplicaDistributionStmt;
import org.apache.doris.analysis.AdminShowReplicaStatusStmt;
@ -155,6 +156,7 @@ import org.apache.doris.load.LoadJob.JobState;
import org.apache.doris.load.routineload.RoutineLoadJob;
import org.apache.doris.mysql.privilege.PrivPredicate;
import org.apache.doris.system.Backend;
import org.apache.doris.system.Diagnoser;
import org.apache.doris.system.SystemInfoService;
import org.apache.doris.thrift.TUnit;
import org.apache.doris.transaction.GlobalTransactionMgr;
@ -333,6 +335,8 @@ public class ShowExecutor {
handleShowTableCreation();
} else if (stmt instanceof ShowLastInsertStmt) {
handleShowLastInsert();
} else if (stmt instanceof AdminDiagnoseTabletStmt) {
handleAdminDiagnoseTablet();
} else {
handleEmtpy();
}
@ -2128,4 +2132,11 @@ public class ShowExecutor {
resultSet = new ShowResultSet(showMetaData, resultRowSet);
}
// Handles "ADMIN DIAGNOSE TABLET": asks Diagnoser for the diagnosis rows of the
// tablet named in the current statement and stores them, together with the
// statement's column metadata, in resultSet.
private void handleAdminDiagnoseTablet() {
    AdminDiagnoseTabletStmt diagnoseStmt = (AdminDiagnoseTabletStmt) stmt;
    long tabletId = diagnoseStmt.getTabletId();
    List<List<String>> diagnosisRows = Diagnoser.diagnoseTablet(tabletId);
    resultSet = new ShowResultSet(diagnoseStmt.getMetaData(), diagnosisRows);
}
}

View File

@ -0,0 +1,163 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package org.apache.doris.system;
import org.apache.doris.catalog.Catalog;
import org.apache.doris.catalog.Database;
import org.apache.doris.catalog.MaterializedIndex;
import org.apache.doris.catalog.OlapTable;
import org.apache.doris.catalog.Partition;
import org.apache.doris.catalog.Replica;
import org.apache.doris.catalog.Tablet;
import org.apache.doris.catalog.TabletInvertedIndex;
import org.apache.doris.catalog.TabletMeta;
import org.apache.doris.common.Config;
import com.google.common.collect.Lists;
import org.json.simple.JSONObject;
import java.util.List;
/**
 * Utility class to diagnose the Doris system.
 */
public class Diagnoser {
    /**
     * Diagnoses the given tablet and returns the info and issues found about it.
     *
     * <p>Each returned row has three cells: item name, info, and suggestion
     * (the suggestion cell is currently always the empty string). Rows are
     * produced in this order, and the method returns early as soon as one of
     * the metadata lookups fails:
     * <ul>
     *   <li>TabletExist</li>
     *   <li>TabletId</li>
     *   <li>Database</li>
     *   <li>Table</li>
     *   <li>Partition</li>
     *   <li>MaterializedIndex</li>
     *   <li>Replicas(ReplicaId -> BackendId): JSON object {"replica id": backend id}</li>
     *   <li>ReplicasNum</li>
     *   <li>ReplicaBackendStatus</li>
     *   <li>ReplicaVersionStatus</li>
     *   <li>ReplicaStatus</li>
     *   <li>ReplicaCompactionStatus</li>
     * </ul>
     *
     * @param tabletId id of the tablet to diagnose
     * @return the diagnosis rows; never null
     */
    public static List<List<String>> diagnoseTablet(long tabletId) {
        List<List<String>> results = Lists.newArrayList();
        TabletInvertedIndex invertedIndex = Catalog.getCurrentInvertedIndex();
        TabletMeta tabletMeta = invertedIndex.getTabletMeta(tabletId);
        if (tabletMeta == null) {
            results.add(row("TabletExist", "No"));
            return results;
        }
        results.add(row("TabletExist", "Yes"));
        results.add(row("TabletId", String.valueOf(tabletId)));

        // database
        Database db = Catalog.getCurrentCatalog().getDbNullable(tabletMeta.getDbId());
        if (db == null) {
            results.add(row("Database", "Not exist"));
            return results;
        }
        results.add(row("Database", db.getFullName() + ": " + db.getId()));

        // table
        OlapTable tbl = (OlapTable) db.getTableNullable(tabletMeta.getTableId());
        if (tbl == null) {
            results.add(row("Table", "Not exist"));
            return results;
        }
        results.add(row("Table", tbl.getName() + ": " + tbl.getId()));

        // partition
        Partition partition = tbl.getPartition(tabletMeta.getPartitionId());
        if (partition == null) {
            results.add(row("Partition", "Not exist"));
            return results;
        }
        results.add(row("Partition", partition.getName() + ": " + partition.getId()));

        // materialized index
        MaterializedIndex mIndex = partition.getIndex(tabletMeta.getIndexId());
        if (mIndex == null) {
            results.add(row("MaterializedIndex", "Not exist"));
            return results;
        }
        results.add(row("MaterializedIndex", tbl.getIndexNameById(mIndex.getId()) + ": " + mIndex.getId()));

        // replica info
        Tablet tablet = mIndex.getTablet(tabletId);
        if (tablet == null) {
            // The inverted index knows the tablet but the materialized index does not.
            // Report it like the other failed lookups instead of dereferencing null.
            results.add(row("Tablet", "Not exist in materialized index"));
            return results;
        }
        List<Replica> replicas = tablet.getReplicas();
        JSONObject jobj = new JSONObject();
        for (Replica replica : replicas) {
            jobj.put(replica.getId(), replica.getBackendId());
        }
        results.add(row("Replicas(ReplicaId -> BackendId)", jobj.toJSONString()));

        // replica num
        short replicaNum = tbl.getPartitionInfo().getReplicaAllocation(partition.getId()).getTotalReplicaNum();
        if (replicas.size() != replicaNum) {
            results.add(row("ReplicasNum", "Replica num is " + replicas.size() + ", expected: " + replicaNum));
        } else {
            results.add(row("ReplicasNum", "OK"));
        }

        SystemInfoService infoService = Catalog.getCurrentSystemInfo();
        StringBuilder backendErr = new StringBuilder();
        StringBuilder versionErr = new StringBuilder();
        StringBuilder statusErr = new StringBuilder();
        StringBuilder compactionErr = new StringBuilder();
        // Every message ends with ". " so that the messages of several replicas
        // stay readable when they are concatenated into one cell.
        for (Replica replica : replicas) {
            long backendId = replica.getBackendId();

            // backend
            String backendProblem = describeBackendProblem(infoService, backendId);
            if (backendProblem != null) {
                backendErr.append(backendProblem);
            }

            // version
            if (replica.getVersion() != partition.getVisibleVersion()) {
                versionErr.append("Replica on backend ").append(backendId).append("'s version (")
                        .append(replica.getVersion()).append(") does not equal")
                        .append(" to partition visible version (").append(partition.getVisibleVersion())
                        .append("). ");
            }

            // status
            if (!replica.isAlive()) {
                statusErr.append("Replica on backend ").append(backendId).append("'s state is ")
                        .append(replica.getState())
                        .append(", and is bad: ").append(replica.isBad() ? "Yes" : "No")
                        .append(". ");
            }

            // compaction
            if (replica.getVersionCount() > Config.min_version_count_indicate_replica_compaction_too_slow) {
                compactionErr.append("Replica on backend ").append(backendId)
                        .append("'s version count is too high: ")
                        .append(replica.getVersionCount())
                        .append(". ");
            }
        }
        results.add(row("ReplicaBackendStatus", okOr(backendErr)));
        results.add(row("ReplicaVersionStatus", okOr(versionErr)));
        results.add(row("ReplicaStatus", okOr(statusErr)));
        results.add(row("ReplicaCompactionStatus", okOr(compactionErr)));
        return results;
    }

    // Builds one result row; the suggestion cell is always empty.
    private static List<String> row(String item, String info) {
        return Lists.newArrayList(item, info, "");
    }

    // Returns "OK" when no error text was collected, otherwise the collected text.
    private static String okOr(StringBuilder err) {
        return err.length() == 0 ? "OK" : err.toString();
    }

    // Returns the first problem found with the given backend, or null if none is found.
    // Only the first failing check is reported.
    private static String describeBackendProblem(SystemInfoService infoService, long backendId) {
        Backend be = infoService.getBackend(backendId);
        if (be == null) {
            return "Backend " + backendId + " does not exist. ";
        }
        if (!be.isAlive()) {
            return "Backend " + backendId + " is not alive. ";
        }
        if (be.isDecommissioned()) {
            return "Backend " + backendId + " is decommission. ";
        }
        if (!be.isLoadAvailable()) {
            return "Backend " + backendId + " is not load available. ";
        }
        if (!be.isQueryAvailable()) {
            return "Backend " + backendId + " is not query available. ";
        }
        if (be.diskExceedLimit()) {
            return "Backend " + backendId + " has no space left. ";
        }
        return null;
    }
}

View File

@ -164,6 +164,7 @@ import org.apache.doris.qe.SqlModeHelper;
keywordMap.put("delete", new Integer(SqlParserSymbols.KW_DELETE));
keywordMap.put("desc", new Integer(SqlParserSymbols.KW_DESC));
keywordMap.put("describe", new Integer(SqlParserSymbols.KW_DESCRIBE));
keywordMap.put("diagnose", new Integer(SqlParserSymbols.KW_DIAGNOSE));
keywordMap.put("distinct", new Integer(SqlParserSymbols.KW_DISTINCT));
keywordMap.put("distinctpc", new Integer(SqlParserSymbols.KW_DISTINCTPC));
keywordMap.put("distinctpc", new Integer(SqlParserSymbols.KW_DISTINCTPC));

View File

@ -29,6 +29,7 @@ import org.apache.doris.common.FeConstants;
import org.apache.doris.qe.ConnectContext;
import org.apache.doris.resource.Tag;
import org.apache.doris.system.Backend;
import org.apache.doris.system.Diagnoser;
import org.apache.doris.system.SystemInfoService;
import org.apache.doris.thrift.TDisk;
import org.apache.doris.thrift.TStorageMedium;
@ -132,7 +133,9 @@ public class TabletReplicaTooSlowTest {
private static void updateReplicaVersionCount() {
Table<Long, Long, Replica> replicaMetaTable = Catalog.getCurrentInvertedIndex().getReplicaMetaTable();
int versionCount = 1;
long tabletId = -1;
for (Table.Cell<Long, Long, Replica> cell : replicaMetaTable.cellSet()) {
tabletId = cell.getRowKey();
long beId = cell.getColumnKey();
Backend be = Catalog.getCurrentSystemInfo().getBackend(beId);
List<Long> pathHashes = be.getDisks().values().stream().map(DiskInfo::getPathHash).collect(Collectors.toList());
@ -145,6 +148,10 @@ public class TabletReplicaTooSlowTest {
replica.setPathHash(pathHashes.get(0));
}
List<List<String>> result = Diagnoser.diagnoseTablet(tabletId);
Assert.assertEquals(12, result.size());
Assert.assertTrue(result.get(11).get(1).contains("version count is too high"));
}
@Test
@ -158,6 +165,7 @@ public class TabletReplicaTooSlowTest {
" \"replication_num\" = \"3\"\n" +
")";
ExceptionChecker.expectThrowsNoException(() -> createTable(createStr));
int maxLoop = 300;
boolean delete = false;
while (maxLoop-- > 0) {