// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #include "runtime/runtime_query_statistics_mgr.h" #include "exec/schema_scanner/schema_scanner_helper.h" #include "runtime/client_cache.h" #include "runtime/exec_env.h" #include "util/debug_util.h" #include "util/time.h" #include "vec/core/block.h" namespace doris { void QueryStatisticsCtx::collect_query_statistics(TQueryStatistics* tq_s) { QueryStatistics tmp_qs; for (auto& qs_ptr : _qs_list) { tmp_qs.merge(*qs_ptr); } tmp_qs.to_thrift(tq_s); tq_s->__set_workload_group_id(_wg_id); } void RuntimeQueryStatisticsMgr::register_query_statistics(std::string query_id, std::shared_ptr qs_ptr, TNetworkAddress fe_addr, TQueryType::type query_type) { std::lock_guard write_lock(_qs_ctx_map_lock); if (_query_statistics_ctx_map.find(query_id) == _query_statistics_ctx_map.end()) { _query_statistics_ctx_map[query_id] = std::make_unique(fe_addr, query_type); } _query_statistics_ctx_map.at(query_id)->_qs_list.push_back(qs_ptr); } void RuntimeQueryStatisticsMgr::report_runtime_query_statistics() { int64_t be_id = ExecEnv::GetInstance()->master_info()->backend_id; // 1 get query statistics map std::map> fe_qs_map; std::map> qs_status; // { std::lock_guard write_lock(_qs_ctx_map_lock); int64_t current_time = MonotonicMillis(); int64_t conf_qs_timeout = config::query_statistics_reserve_timeout_ms; for (auto& [query_id, qs_ctx_ptr] : _query_statistics_ctx_map) { if (qs_ctx_ptr->_query_type == TQueryType::EXTERNAL) { continue; } if (fe_qs_map.find(qs_ctx_ptr->_fe_addr) == fe_qs_map.end()) { std::map tmp_map; fe_qs_map[qs_ctx_ptr->_fe_addr] = std::move(tmp_map); } TQueryStatistics ret_t_qs; qs_ctx_ptr->collect_query_statistics(&ret_t_qs); fe_qs_map.at(qs_ctx_ptr->_fe_addr)[query_id] = ret_t_qs; bool is_query_finished = qs_ctx_ptr->_is_query_finished; bool is_timeout_after_finish = false; if (is_query_finished) { is_timeout_after_finish = (current_time - qs_ctx_ptr->_query_finish_time) > conf_qs_timeout; } qs_status[query_id] = std::make_pair(is_query_finished, is_timeout_after_finish); } } // 2 report query statistics to fe std::map rpc_result; for (auto& [addr, qs_map] : fe_qs_map) { rpc_result[addr] = false; // 2.1 get client Status coord_status; FrontendServiceConnection coord(ExecEnv::GetInstance()->frontend_client_cache(), addr, &coord_status); std::string add_str = PrintThriftNetworkAddress(addr); if (!coord_status.ok()) { std::stringstream ss; LOG(WARNING) << "[report_query_statistics]could not get client " << add_str << " when report workload runtime stats, reason:" << coord_status.to_string(); continue; } // 2.2 send report TReportWorkloadRuntimeStatusParams report_runtime_params; report_runtime_params.__set_backend_id(be_id); report_runtime_params.__set_query_statistics_map(qs_map); TReportExecStatusParams params; params.__set_report_workload_runtime_status(report_runtime_params); TReportExecStatusResult res; Status rpc_status; try { coord->reportExecStatus(res, params); rpc_result[addr] = true; } catch (apache::thrift::TApplicationException& e) { LOG(WARNING) << "[report_query_statistics]fe " << add_str << " throw exception when report statistics, reason:" << e.what() << " , you can see fe log for details."; } catch (apache::thrift::transport::TTransportException& e) { LOG(WARNING) << "[report_query_statistics]report workload runtime statistics to " << add_str << " failed, reason: " << e.what(); rpc_status = coord.reopen(); if (!rpc_status.ok()) { LOG(WARNING) << "[report_query_statistics]reopen thrift client failed when report " "workload runtime statistics to" << add_str; } else { try { coord->reportExecStatus(res, params); rpc_result[addr] = true; } catch (apache::thrift::transport::TTransportException& e2) { LOG(WARNING) << "[report_query_statistics]retry report workload runtime stats to " << add_str << " failed, reason: " << e2.what(); } catch (std::exception& e) { LOG_WARNING( "[report_query_statistics]unknow exception when report workload " "runtime statistics to {}, " "reason:{}. ", add_str, e.what()); } } } catch (std::exception& e) { LOG_WARNING( "[report_query_statistics]unknown exception when report workload runtime " "statistics to {}, reason:{}. ", add_str, e.what()); } } // 3 when query is finished and (last rpc is send success), remove finished query statistics if (fe_qs_map.size() == 0) { return; } { std::lock_guard write_lock(_qs_ctx_map_lock); for (auto& [addr, qs_map] : fe_qs_map) { bool is_rpc_success = rpc_result[addr]; for (auto& [query_id, qs] : qs_map) { auto& qs_status_pair = qs_status[query_id]; bool is_query_finished = qs_status_pair.first; bool is_timeout_after_finish = qs_status_pair.second; if ((is_rpc_success && is_query_finished) || is_timeout_after_finish) { _query_statistics_ctx_map.erase(query_id); } } } } } void RuntimeQueryStatisticsMgr::set_query_finished(std::string query_id) { // NOTE: here must be a write lock std::lock_guard write_lock(_qs_ctx_map_lock); // when a query get query_ctx succ, but failed before create node/operator, // it may not register query statistics, so it can not be mark finish if (_query_statistics_ctx_map.find(query_id) != _query_statistics_ctx_map.end()) { auto* qs_ptr = _query_statistics_ctx_map.at(query_id).get(); qs_ptr->_is_query_finished = true; qs_ptr->_query_finish_time = MonotonicMillis(); } } std::shared_ptr RuntimeQueryStatisticsMgr::get_runtime_query_statistics( std::string query_id) { std::shared_lock read_lock(_qs_ctx_map_lock); if (_query_statistics_ctx_map.find(query_id) == _query_statistics_ctx_map.end()) { return nullptr; } std::shared_ptr qs_ptr = std::make_shared(); for (auto const& qs : _query_statistics_ctx_map[query_id]->_qs_list) { qs_ptr->merge(*qs); } return qs_ptr; } void RuntimeQueryStatisticsMgr::get_metric_map( std::string query_id, std::map& metric_map) { QueryStatistics ret_qs; int64_t query_time_ms = 0; { std::shared_lock read_lock(_qs_ctx_map_lock); if (_query_statistics_ctx_map.find(query_id) != _query_statistics_ctx_map.end()) { for (auto const& qs : _query_statistics_ctx_map[query_id]->_qs_list) { ret_qs.merge(*qs); } query_time_ms = MonotonicMillis() - _query_statistics_ctx_map.at(query_id)->_query_start_time; } } metric_map.emplace(WorkloadMetricType::QUERY_TIME, std::to_string(query_time_ms)); metric_map.emplace(WorkloadMetricType::SCAN_ROWS, std::to_string(ret_qs.get_scan_rows())); metric_map.emplace(WorkloadMetricType::SCAN_BYTES, std::to_string(ret_qs.get_scan_bytes())); metric_map.emplace(WorkloadMetricType::QUERY_MEMORY_BYTES, std::to_string(ret_qs.get_current_used_memory_bytes())); } void RuntimeQueryStatisticsMgr::set_workload_group_id(std::string query_id, int64_t wg_id) { // wg id just need eventual consistency, read lock is ok std::shared_lock read_lock(_qs_ctx_map_lock); if (_query_statistics_ctx_map.find(query_id) != _query_statistics_ctx_map.end()) { _query_statistics_ctx_map.at(query_id)->_wg_id = wg_id; } } void RuntimeQueryStatisticsMgr::get_active_be_tasks_block(vectorized::Block* block) { std::shared_lock read_lock(_qs_ctx_map_lock); int64_t be_id = ExecEnv::GetInstance()->master_info()->backend_id; // block's schema come from SchemaBackendActiveTasksScanner::_s_tbls_columns // before 2.1.7, there are 12 columns in "backend_active_tasks" table. // after 2.1.8, 2 new columns added. // check this to make it compatible with version before 2.1.7 bool need_local_and_remote_bytes = (block->columns() > 12); for (auto& [query_id, qs_ctx_ptr] : _query_statistics_ctx_map) { int col_idx = 0; TQueryStatistics tqs; qs_ctx_ptr->collect_query_statistics(&tqs); SchemaScannerHelper::insert_int64_value(col_idx++, be_id, block); SchemaScannerHelper::insert_string_value(col_idx++, qs_ctx_ptr->_fe_addr.hostname, block); SchemaScannerHelper::insert_string_value(col_idx++, query_id, block); int64_t task_time = qs_ctx_ptr->_is_query_finished ? qs_ctx_ptr->_query_finish_time - qs_ctx_ptr->_query_start_time : MonotonicMillis() - qs_ctx_ptr->_query_start_time; SchemaScannerHelper::insert_int64_value(col_idx++, task_time, block); SchemaScannerHelper::insert_int64_value(col_idx++, tqs.cpu_ms, block); SchemaScannerHelper::insert_int64_value(col_idx++, tqs.scan_rows, block); SchemaScannerHelper::insert_int64_value(col_idx++, tqs.scan_bytes, block); if (need_local_and_remote_bytes) { SchemaScannerHelper::insert_int64_value(col_idx++, tqs.scan_bytes_from_local_storage, block); SchemaScannerHelper::insert_int64_value(col_idx++, tqs.scan_bytes_from_remote_storage, block); } SchemaScannerHelper::insert_int64_value(col_idx++, tqs.max_peak_memory_bytes, block); SchemaScannerHelper::insert_int64_value(col_idx++, tqs.current_used_memory_bytes, block); SchemaScannerHelper::insert_int64_value(col_idx++, tqs.shuffle_send_bytes, block); SchemaScannerHelper::insert_int64_value(col_idx++, tqs.shuffle_send_rows, block); std::stringstream ss; ss << qs_ctx_ptr->_query_type; SchemaScannerHelper::insert_string_value(col_idx++, ss.str(), block); } } } // namespace doris