[debug](rpc) debug rpc time consumption problem (#39852)
## Proposed changes
Issue Number: close #xxx
Add detail RPC time info for each channel, sorted by max rpc time of
channels:
```
DATA_STREAM_SINK_OPERATOR (id=1,dst_id=1):
- Partitioner: Crc32HashPartitioner(64)
- BlocksProduced: 74
- BrpcSendTime: 2.689us
- BrpcSendTime.Wait: 0ns
- BytesSent: 89.35 KB
- CloseTime: 680.152us
- CompressTime: 0ns
- ExecTime: 160.663ms
- InitTime: 263.608us
- InputRows: 32.512K (32512)
- LocalBytesSent: 0.00
- LocalSendTime: 0ns
- LocalSentRows: 0
- MemoryUsage:
- PeakMemoryUsage: 80.00 KB
- MergeBlockTime: 0ns
- OpenTime: 4.113ms
- OverallThroughput: 0.0 /sec
- PendingFinishDependency: 41.179ms
- RowsProduced: 32.512K (32512)
- RpcAvgTime: 11.850ms
- RpcCount: 10
- RpcMaxTime: 86.891ms
- RpcMinTime: 15.200ms
- RpcSumTime: 118.503ms
- SerializeBatchTime: 13.517ms
- SplitBlockDistributeByChannelTime: 38.923ms
- SplitBlockHashComputeTime: 2.659ms
- UncompressedRowBatchSize: 135.19 KB
- WaitForDependencyTime: 0ns
- WaitForRpcBufferQueue: 0ns
RpcInstanceDetails:
- Instance 85d4f75b72a9ea61: Count: 4, MaxTime: 36.238ms, MinTime: 12.107ms, AvgTime: 21.722ms, SumTime: 86.891ms
- Instance 85d4f75b72a9ea91: Count: 3, MaxTime: 11.107ms, MinTime: 2.431ms, AvgTime: 5.470ms, SumTime: 16.412ms
- Instance 85d4f75b72a9eac1: Count: 3, MaxTime: 7.554ms, MinTime: 3.160ms, AvgTime: 5.066ms, SumTime: 15.200m
```
This commit is contained in:
@ -741,6 +741,7 @@ public class OlapScanNode extends ScanNode {
|
||||
int useFixReplica = -1;
|
||||
boolean needCheckTags = false;
|
||||
boolean skipMissingVersion = false;
|
||||
Set<Long> userSetBackendBlacklist = null;
|
||||
if (ConnectContext.get() != null) {
|
||||
allowedTags = ConnectContext.get().getResourceTags();
|
||||
needCheckTags = ConnectContext.get().isResourceTagsSet();
|
||||
@ -751,6 +752,7 @@ public class OlapScanNode extends ScanNode {
|
||||
LOG.debug("query id: {}, partition id:{} visibleVersion: {}",
|
||||
DebugUtil.printId(ConnectContext.get().queryId()), partition.getId(), visibleVersion);
|
||||
}
|
||||
userSetBackendBlacklist = ConnectContext.get().getSessionVariable().getQueryBackendBlacklist();
|
||||
}
|
||||
for (Tablet tablet : tablets) {
|
||||
long tabletId = tablet.getId();
|
||||
@ -866,6 +868,16 @@ public class OlapScanNode extends ScanNode {
|
||||
+ " does not exist or not alive");
|
||||
continue;
|
||||
}
|
||||
if (userSetBackendBlacklist != null && userSetBackendBlacklist.contains(backend.getId())) {
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug("backend {} is in the blacklist that user set in session variable {}",
|
||||
replica.getBackendId(), replica.getId());
|
||||
}
|
||||
String err = "replica " + replica.getId() + "'s backend " + replica.getBackendId()
|
||||
+ " in the blacklist that user set in session variable";
|
||||
errs.add(err);
|
||||
continue;
|
||||
}
|
||||
if (!backend.isMixNode()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -87,6 +87,8 @@ public class SessionVariable implements Serializable, Writable {
|
||||
public static final String MAX_EXECUTION_TIME = "max_execution_time";
|
||||
public static final String INSERT_TIMEOUT = "insert_timeout";
|
||||
public static final String ENABLE_PROFILE = "enable_profile";
|
||||
public static final String ENABLE_VERBOSE_PROFILE = "enable_verbose_profile";
|
||||
public static final String RPC_VERBOSE_PROFILE_MAX_INSTANCE_COUNT = "rpc_verbose_profile_max_instance_count";
|
||||
public static final String AUTO_PROFILE_THRESHOLD_MS = "auto_profile_threshold_ms";
|
||||
public static final String SQL_MODE = "sql_mode";
|
||||
public static final String WORKLOAD_VARIABLE = "workload_group";
|
||||
@ -415,6 +417,8 @@ public class SessionVariable implements Serializable, Writable {
|
||||
// fix replica to query. If num = 1, query the smallest replica, if 2 is the second smallest replica.
|
||||
public static final String USE_FIX_REPLICA = "use_fix_replica";
|
||||
|
||||
public static final String QUERY_BACKEND_BLACKLIST = "query_backend_blacklist";
|
||||
|
||||
public static final String DRY_RUN_QUERY = "dry_run_query";
|
||||
|
||||
// Split size for ExternalFileScanNode. Default value 0 means use the block size of HDFS/S3.
|
||||
@ -705,6 +709,12 @@ public class SessionVariable implements Serializable, Writable {
|
||||
@VariableMgr.VarAttr(name = ENABLE_PROFILE, needForward = true)
|
||||
public boolean enableProfile = false;
|
||||
|
||||
@VariableMgr.VarAttr(name = ENABLE_VERBOSE_PROFILE, needForward = true)
|
||||
public boolean enableVerboseProfile = true;
|
||||
|
||||
@VariableMgr.VarAttr(name = RPC_VERBOSE_PROFILE_MAX_INSTANCE_COUNT, needForward = true)
|
||||
public int rpcVerboseProfileMaxInstanceCount = 5;
|
||||
|
||||
// if true, need report to coordinator when plan fragment execute successfully.
|
||||
@VariableMgr.VarAttr(name = AUTO_PROFILE_THRESHOLD_MS, needForward = true)
|
||||
public int autoProfileThresholdMs = -1;
|
||||
@ -1475,6 +1485,13 @@ public class SessionVariable implements Serializable, Writable {
|
||||
@VariableMgr.VarAttr(name = USE_FIX_REPLICA)
|
||||
public int useFixReplica = -1;
|
||||
|
||||
// This is a debug feature, when we find a backend is not stable(for example network reasons)
|
||||
// we could use this variable to exclude it from query plan. It is only used for query. Not for
|
||||
// load jobs.
|
||||
// Use could set multiple backendids using , to split like "10111,10112"
|
||||
@VariableMgr.VarAttr(name = QUERY_BACKEND_BLACKLIST, needForward = true)
|
||||
public String queryBackendBlacklist = "";
|
||||
|
||||
@VariableMgr.VarAttr(name = DUMP_NEREIDS_MEMO)
|
||||
public boolean dumpNereidsMemo = false;
|
||||
|
||||
@ -3386,6 +3403,8 @@ public class SessionVariable implements Serializable, Writable {
|
||||
|
||||
tResult.setQueryTimeout(queryTimeoutS);
|
||||
tResult.setEnableProfile(enableProfile);
|
||||
tResult.setEnableVerboseProfile(enableVerboseProfile);
|
||||
tResult.setRpcVerboseProfileMaxInstanceCount(rpcVerboseProfileMaxInstanceCount);
|
||||
if (enableProfile) {
|
||||
// If enable profile == true, then also set report success to true
|
||||
// be need report success to start report thread. But it is very tricky
|
||||
@ -4050,6 +4069,21 @@ public class SessionVariable implements Serializable, Writable {
|
||||
this.ignoreStorageDataDistribution = ignoreStorageDataDistribution;
|
||||
}
|
||||
|
||||
// If anything wrong during parsing, just throw exception to forbidden the query
|
||||
// so there is not many exception handling logic here.
|
||||
public Set<Long> getQueryBackendBlacklist() {
|
||||
Set<Long> blacklist = Sets.newHashSet();
|
||||
if (Strings.isNullOrEmpty(queryBackendBlacklist)) {
|
||||
return blacklist;
|
||||
}
|
||||
String[] backendIds = this.queryBackendBlacklist.trim().split(",");
|
||||
for (int i = 0; i < backendIds.length; ++i) {
|
||||
long backendId = Long.parseLong(backendIds[i].trim());
|
||||
blacklist.add(backendId);
|
||||
}
|
||||
return blacklist;
|
||||
}
|
||||
|
||||
public boolean isForceJniScanner() {
|
||||
return forceJniScanner;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user