[debug](rpc) debug rpc time consumption problem (#39852)

## Proposed changes

Issue Number: close #xxx

Add detail RPC time info for each channel, sorted by max rpc time of
channels:
```
                     DATA_STREAM_SINK_OPERATOR  (id=1,dst_id=1):
                          -  Partitioner:  Crc32HashPartitioner(64)
                          -  BlocksProduced:  74
                          -  BrpcSendTime:  2.689us
                          -  BrpcSendTime.Wait:  0ns
                          -  BytesSent:  89.35  KB
                          -  CloseTime:  680.152us
                          -  CompressTime:  0ns
                          -  ExecTime:  160.663ms
                          -  InitTime:  263.608us
                          -  InputRows:  32.512K  (32512)
                          -  LocalBytesSent:  0.00  
                          -  LocalSendTime:  0ns
                          -  LocalSentRows:  0
                          -  MemoryUsage:  
                              -  PeakMemoryUsage:  80.00  KB
                          -  MergeBlockTime:  0ns
                          -  OpenTime:  4.113ms
                          -  OverallThroughput:  0.0  /sec
                          -  PendingFinishDependency:  41.179ms
                          -  RowsProduced:  32.512K  (32512)
                          -  RpcAvgTime:  11.850ms
                          -  RpcCount:  10
                          -  RpcMaxTime:  86.891ms
                          -  RpcMinTime:  15.200ms
                          -  RpcSumTime:  118.503ms
                          -  SerializeBatchTime:  13.517ms
                          -  SplitBlockDistributeByChannelTime:  38.923ms
                          -  SplitBlockHashComputeTime:  2.659ms
                          -  UncompressedRowBatchSize:  135.19  KB
                          -  WaitForDependencyTime:  0ns
                              -  WaitForRpcBufferQueue:  0ns
                        RpcInstanceDetails:
                              -  Instance  85d4f75b72a9ea61:  Count:  4,  MaxTime:  36.238ms,  MinTime:  12.107ms,  AvgTime:  21.722ms,  SumTime:  86.891ms
                              -  Instance  85d4f75b72a9ea91:  Count:  3,  MaxTime:  11.107ms,  MinTime:  2.431ms,  AvgTime:  5.470ms,  SumTime:  16.412ms
                              -  Instance  85d4f75b72a9eac1:  Count:  3,  MaxTime:  7.554ms,  MinTime:  3.160ms,  AvgTime:  5.066ms,  SumTime:  15.200m
```
This commit is contained in:
TengJianPing
2024-08-24 19:59:39 +08:00
committed by GitHub
parent 5a810122a2
commit 2dea859bdb
7 changed files with 130 additions and 15 deletions

View File

@ -741,6 +741,7 @@ public class OlapScanNode extends ScanNode {
int useFixReplica = -1;
boolean needCheckTags = false;
boolean skipMissingVersion = false;
Set<Long> userSetBackendBlacklist = null;
if (ConnectContext.get() != null) {
allowedTags = ConnectContext.get().getResourceTags();
needCheckTags = ConnectContext.get().isResourceTagsSet();
@ -751,6 +752,7 @@ public class OlapScanNode extends ScanNode {
LOG.debug("query id: {}, partition id:{} visibleVersion: {}",
DebugUtil.printId(ConnectContext.get().queryId()), partition.getId(), visibleVersion);
}
userSetBackendBlacklist = ConnectContext.get().getSessionVariable().getQueryBackendBlacklist();
}
for (Tablet tablet : tablets) {
long tabletId = tablet.getId();
@ -866,6 +868,16 @@ public class OlapScanNode extends ScanNode {
+ " does not exist or not alive");
continue;
}
if (userSetBackendBlacklist != null && userSetBackendBlacklist.contains(backend.getId())) {
if (LOG.isDebugEnabled()) {
LOG.debug("backend {} is in the blacklist that user set in session variable {}",
replica.getBackendId(), replica.getId());
}
String err = "replica " + replica.getId() + "'s backend " + replica.getBackendId()
+ " in the blacklist that user set in session variable";
errs.add(err);
continue;
}
if (!backend.isMixNode()) {
continue;
}

View File

@ -87,6 +87,8 @@ public class SessionVariable implements Serializable, Writable {
public static final String MAX_EXECUTION_TIME = "max_execution_time";
public static final String INSERT_TIMEOUT = "insert_timeout";
public static final String ENABLE_PROFILE = "enable_profile";
public static final String ENABLE_VERBOSE_PROFILE = "enable_verbose_profile";
public static final String RPC_VERBOSE_PROFILE_MAX_INSTANCE_COUNT = "rpc_verbose_profile_max_instance_count";
public static final String AUTO_PROFILE_THRESHOLD_MS = "auto_profile_threshold_ms";
public static final String SQL_MODE = "sql_mode";
public static final String WORKLOAD_VARIABLE = "workload_group";
@ -415,6 +417,8 @@ public class SessionVariable implements Serializable, Writable {
// fix replica to query. If num = 1, query the smallest replica, if 2 is the second smallest replica.
public static final String USE_FIX_REPLICA = "use_fix_replica";
public static final String QUERY_BACKEND_BLACKLIST = "query_backend_blacklist";
public static final String DRY_RUN_QUERY = "dry_run_query";
// Split size for ExternalFileScanNode. Default value 0 means use the block size of HDFS/S3.
@ -705,6 +709,12 @@ public class SessionVariable implements Serializable, Writable {
@VariableMgr.VarAttr(name = ENABLE_PROFILE, needForward = true)
public boolean enableProfile = false;
@VariableMgr.VarAttr(name = ENABLE_VERBOSE_PROFILE, needForward = true)
public boolean enableVerboseProfile = true;
@VariableMgr.VarAttr(name = RPC_VERBOSE_PROFILE_MAX_INSTANCE_COUNT, needForward = true)
public int rpcVerboseProfileMaxInstanceCount = 5;
// if true, need report to coordinator when plan fragment execute successfully.
@VariableMgr.VarAttr(name = AUTO_PROFILE_THRESHOLD_MS, needForward = true)
public int autoProfileThresholdMs = -1;
@ -1475,6 +1485,13 @@ public class SessionVariable implements Serializable, Writable {
@VariableMgr.VarAttr(name = USE_FIX_REPLICA)
public int useFixReplica = -1;
// This is a debug feature, when we find a backend is not stable(for example network reasons)
// we could use this variable to exclude it from query plan. It is only used for query. Not for
// load jobs.
// Use could set multiple backendids using , to split like "10111,10112"
@VariableMgr.VarAttr(name = QUERY_BACKEND_BLACKLIST, needForward = true)
public String queryBackendBlacklist = "";
@VariableMgr.VarAttr(name = DUMP_NEREIDS_MEMO)
public boolean dumpNereidsMemo = false;
@ -3386,6 +3403,8 @@ public class SessionVariable implements Serializable, Writable {
tResult.setQueryTimeout(queryTimeoutS);
tResult.setEnableProfile(enableProfile);
tResult.setEnableVerboseProfile(enableVerboseProfile);
tResult.setRpcVerboseProfileMaxInstanceCount(rpcVerboseProfileMaxInstanceCount);
if (enableProfile) {
// If enable profile == true, then also set report success to true
// be need report success to start report thread. But it is very tricky
@ -4050,6 +4069,21 @@ public class SessionVariable implements Serializable, Writable {
this.ignoreStorageDataDistribution = ignoreStorageDataDistribution;
}
// If anything wrong during parsing, just throw exception to forbidden the query
// so there is not many exception handling logic here.
public Set<Long> getQueryBackendBlacklist() {
Set<Long> blacklist = Sets.newHashSet();
if (Strings.isNullOrEmpty(queryBackendBlacklist)) {
return blacklist;
}
String[] backendIds = this.queryBackendBlacklist.trim().split(",");
for (int i = 0; i < backendIds.length; ++i) {
long backendId = Long.parseLong(backendIds[i].trim());
blacklist.add(backendId);
}
return blacklist;
}
public boolean isForceJniScanner() {
return forceJniScanner;
}