From abbf75d30241a7f5d3153027570877eebfafdb2f Mon Sep 17 00:00:00 2001 From: Mingyu Chen Date: Tue, 2 Aug 2022 11:34:06 +0800 Subject: [PATCH] [doc][refactor](metrics) Reorganize FE and BE metrics and add document (#11307) --- be/src/common/config.h | 4 - be/src/common/daemon.cpp | 16 +- be/src/http/action/stream_load.cpp | 3 - be/src/http/action/stream_load.h | 1 - be/src/io/fs/local_file_reader.cpp | 1 + be/src/olap/file_helper.cpp | 96 +----- be/src/olap/file_helper.h | 6 - be/src/olap/lru_cache.cpp | 38 +-- be/src/olap/lru_cache.h | 12 +- .../rowset/segment_v2/segment_iterator.cpp | 2 - be/src/olap/segment_loader.cpp | 3 +- be/src/olap/storage_engine.cpp | 12 - be/src/olap/storage_engine.h | 15 - be/src/runtime/exec_env.h | 2 - be/src/runtime/exec_env_init.cpp | 8 - .../stream_load/stream_load_executor.cpp | 7 +- be/src/util/doris_metrics.cpp | 41 +-- be/src/util/doris_metrics.h | 15 +- be/src/vec/sink/vtablet_sink.cpp | 1 + be/test/util/doris_metrics_test.cpp | 12 - .../monitor-metrics/be-metrics.md | 84 ----- .../monitor-metrics/fe-metrics.md | 155 --------- .../maint-monitor/monitor-metrics/metrics.md | 30 ++ .../monitor-metrics/be-metrics.md | 84 ----- .../monitor-metrics/fe-metrics.md | 155 --------- .../maint-monitor/monitor-metrics/metrics.md | 314 ++++++++++++++++++ .../org/apache/doris/analysis/InsertStmt.java | 2 - .../java/org/apache/doris/catalog/Env.java | 4 +- .../org/apache/doris/clone/TabletChecker.java | 2 +- .../doris/common/ThreadPoolManager.java | 5 +- .../doris/httpv2/rest/MetricsAction.java | 6 +- .../doris/httpv2/util/LoadSubmitter.java | 2 +- .../org/apache/doris/load/ExportChecker.java | 4 +- .../main/java/org/apache/doris/load/Load.java | 3 - .../org/apache/doris/load/LoadChecker.java | 14 +- .../doris/load/loadv2/BrokerLoadJob.java | 3 - .../org/apache/doris/load/loadv2/LoadJob.java | 5 - .../load/routineload/RoutineLoadTaskInfo.java | 2 - .../doris/load/update/UpdateStmtExecutor.java | 3 - .../apache/doris/master/ReportHandler.java | 
2 +- .../doris/metric/DorisMetricRegistry.java | 33 +- .../doris/metric/JsonMetricVisitor.java | 13 +- .../org/apache/doris/metric/MetricRepo.java | 283 ++++++++-------- .../apache/doris/metric/MetricVisitor.java | 12 +- .../doris/metric/PrometheusMetricVisitor.java | 42 +-- .../doris/metric/SimpleCoreMetricVisitor.java | 38 +-- .../apache/doris/mysql/nio/NMysqlServer.java | 2 +- .../org/apache/doris/qe/ConnectProcessor.java | 5 +- .../org/apache/doris/qe/ConnectScheduler.java | 2 +- .../org/apache/doris/qe/StmtExecutor.java | 3 - .../apache/doris/qe/cache/CacheAnalyzer.java | 4 +- .../apache/doris/qe/cache/PartitionCache.java | 2 - .../doris/service/FrontendServiceImpl.java | 11 +- .../apache/doris/task/MasterTaskExecutor.java | 15 +- .../transaction/DatabaseTransactionMgr.java | 15 + .../transaction/GlobalTransactionMgr.java | 8 + .../apache/doris/load/loadv2/LoadJobTest.java | 2 - 57 files changed, 673 insertions(+), 991 deletions(-) delete mode 100644 docs/en/docs/admin-manual/maint-monitor/monitor-metrics/be-metrics.md delete mode 100644 docs/en/docs/admin-manual/maint-monitor/monitor-metrics/fe-metrics.md create mode 100644 docs/en/docs/admin-manual/maint-monitor/monitor-metrics/metrics.md delete mode 100644 docs/zh-CN/docs/admin-manual/maint-monitor/monitor-metrics/be-metrics.md delete mode 100644 docs/zh-CN/docs/admin-manual/maint-monitor/monitor-metrics/fe-metrics.md create mode 100644 docs/zh-CN/docs/admin-manual/maint-monitor/monitor-metrics/metrics.md diff --git a/be/src/common/config.h b/be/src/common/config.h index 6f7342bb2f..a3f6f6d5bc 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -159,10 +159,6 @@ CONF_mInt64(doris_blocking_priority_queue_wait_timeout_ms, "5"); CONF_Int32(doris_scanner_thread_pool_thread_num, "48"); // number of olap scanner thread pool queue size CONF_Int32(doris_scanner_thread_pool_queue_size, "102400"); -// number of etl thread pool size -CONF_Int32(etl_thread_pool_size, "8"); -// number of etl thread pool 
size -CONF_Int32(etl_thread_pool_queue_size, "256"); // default thrift client connect timeout(in seconds) CONF_mInt32(thrift_connect_timeout_seconds, "3"); // default thrift client retry interval (in milliseconds) diff --git a/be/src/common/daemon.cpp b/be/src/common/daemon.cpp index df18fda3b0..e5525d264c 100644 --- a/be/src/common/daemon.cpp +++ b/be/src/common/daemon.cpp @@ -111,7 +111,6 @@ void Daemon::memory_maintenance_thread() { */ void Daemon::calculate_metrics_thread() { int64_t last_ts = -1L; - int64_t lst_push_bytes = -1; int64_t lst_query_bytes = -1; std::map lst_disks_io_time; @@ -123,7 +122,6 @@ void Daemon::calculate_metrics_thread() { if (last_ts == -1L) { last_ts = GetCurrentTimeMicros() / 1000; - lst_push_bytes = DorisMetrics::instance()->push_request_write_bytes->value(); lst_query_bytes = DorisMetrics::instance()->query_scan_bytes->value(); DorisMetrics::instance()->system_metrics()->get_disks_io_time(&lst_disks_io_time); DorisMetrics::instance()->system_metrics()->get_network_traffic(&lst_net_send_bytes, @@ -133,28 +131,20 @@ void Daemon::calculate_metrics_thread() { long interval = (current_ts - last_ts) / 1000; last_ts = current_ts; - // 1. push bytes per second - int64_t current_push_bytes = - DorisMetrics::instance()->push_request_write_bytes->value(); - int64_t pps = (current_push_bytes - lst_push_bytes) / (interval + 1); - DorisMetrics::instance()->push_request_write_bytes_per_second->set_value(pps < 0 ? 0 - : pps); - lst_push_bytes = current_push_bytes; - - // 2. query bytes per second + // 1. query bytes per second int64_t current_query_bytes = DorisMetrics::instance()->query_scan_bytes->value(); int64_t qps = (current_query_bytes - lst_query_bytes) / (interval + 1); DorisMetrics::instance()->query_scan_bytes_per_second->set_value(qps < 0 ? 0 : qps); lst_query_bytes = current_query_bytes; - // 3. max disk io util + // 2. 
max disk io util DorisMetrics::instance()->max_disk_io_util_percent->set_value( DorisMetrics::instance()->system_metrics()->get_max_io_util(lst_disks_io_time, 15)); // update lst map DorisMetrics::instance()->system_metrics()->get_disks_io_time(&lst_disks_io_time); - // 4. max network traffic + // 3. max network traffic int64_t max_send = 0; int64_t max_receive = 0; DorisMetrics::instance()->system_metrics()->get_max_net_traffic( diff --git a/be/src/http/action/stream_load.cpp b/be/src/http/action/stream_load.cpp index 6f59403070..0586c44cc7 100644 --- a/be/src/http/action/stream_load.cpp +++ b/be/src/http/action/stream_load.cpp @@ -64,7 +64,6 @@ namespace doris { DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(streaming_load_requests_total, MetricUnit::REQUESTS); -DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(streaming_load_bytes, MetricUnit::BYTES); DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(streaming_load_duration_ms, MetricUnit::MILLISECONDS); DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(streaming_load_current_processing, MetricUnit::REQUESTS); @@ -127,7 +126,6 @@ StreamLoadAction::StreamLoadAction(ExecEnv* exec_env) : _exec_env(exec_env) { _stream_load_entity = DorisMetrics::instance()->metric_registry()->register_entity("stream_load"); INT_COUNTER_METRIC_REGISTER(_stream_load_entity, streaming_load_requests_total); - INT_COUNTER_METRIC_REGISTER(_stream_load_entity, streaming_load_bytes); INT_COUNTER_METRIC_REGISTER(_stream_load_entity, streaming_load_duration_ms); INT_GAUGE_METRIC_REGISTER(_stream_load_entity, streaming_load_current_processing); } @@ -175,7 +173,6 @@ void StreamLoadAction::handle(HttpRequest* req) { // update statstics streaming_load_requests_total->increment(1); streaming_load_duration_ms->increment(ctx->load_cost_millis); - streaming_load_bytes->increment(ctx->receive_bytes); streaming_load_current_processing->increment(-1); } diff --git a/be/src/http/action/stream_load.h b/be/src/http/action/stream_load.h index d2b9367160..fa39a407dd 100644 --- 
a/be/src/http/action/stream_load.h +++ b/be/src/http/action/stream_load.h @@ -57,7 +57,6 @@ private: std::shared_ptr _stream_load_entity; IntCounter* streaming_load_requests_total; - IntCounter* streaming_load_bytes; IntCounter* streaming_load_duration_ms; IntGauge* streaming_load_current_processing; }; diff --git a/be/src/io/fs/local_file_reader.cpp b/be/src/io/fs/local_file_reader.cpp index ae5bd44ab9..21c90a510f 100644 --- a/be/src/io/fs/local_file_reader.cpp +++ b/be/src/io/fs/local_file_reader.cpp @@ -38,6 +38,7 @@ LocalFileReader::~LocalFileReader() { Status LocalFileReader::close() { bool expected = false; if (_closed.compare_exchange_strong(expected, true, std::memory_order_acq_rel)) { + DorisMetrics::instance()->local_file_open_reading->increment(-1); auto res = ::close(_fd); if (-1 == res) { return Status::IOError("failed to close {}: {}", _path.native(), std::strerror(errno)); diff --git a/be/src/olap/file_helper.cpp b/be/src/olap/file_helper.cpp index 4be2bee1db..c243299b73 100644 --- a/be/src/olap/file_helper.cpp +++ b/be/src/olap/file_helper.cpp @@ -35,25 +35,7 @@ using std::string; namespace doris { -Cache* FileHandler::_s_fd_cache = nullptr; - -FileHandler::FileHandler() - : _fd(-1), _wr_length(0), _file_name(""), _is_using_cache(false), _cache_handle(nullptr) { - static std::once_flag once_flag; -#ifdef BE_TEST - std::call_once(once_flag, [] { - _s_fd_cache = new_lru_cache("FileHandlerCacheTest", config::file_descriptor_cache_capacity); - }); -#else - // storage engine may not be opened when doris try to read and write - // temp file under the storage root path. So we need to check it. 
- if (StorageEngine::instance() != nullptr && - StorageEngine::instance()->file_cache() != nullptr) { - std::call_once(once_flag, - [] { _s_fd_cache = StorageEngine::instance()->file_cache().get(); }); - } -#endif -} +FileHandler::FileHandler() : _fd(-1), _wr_length(0), _file_name("") {} FileHandler::~FileHandler() { this->close(); @@ -82,49 +64,6 @@ Status FileHandler::open(const string& file_name, int flag) { VLOG_NOTICE << "success to open file. file_name=" << file_name << ", mode=" << flag << " fd=" << _fd; - _is_using_cache = false; - _file_name = file_name; - return Status::OK(); -} - -Status FileHandler::open_with_cache(const string& file_name, int flag) { - if (_s_fd_cache == nullptr) { - return open(file_name, flag); - } - - if (_fd != -1 && _file_name == file_name) { - return Status::OK(); - } - - if (!this->close()) { - return Status::OLAPInternalError(OLAP_ERR_IO_ERROR); - } - - CacheKey key(file_name.c_str(), file_name.size()); - _cache_handle = _s_fd_cache->lookup(key); - if (nullptr != _cache_handle) { - FileDescriptor* file_desc = - reinterpret_cast(_s_fd_cache->value(_cache_handle)); - _fd = file_desc->fd; - VLOG_NOTICE << "success to open file with cache. file_name=" << file_name - << ", mode=" << flag << " fd=" << _fd; - } else { - _fd = ::open(file_name.c_str(), flag); - if (_fd < 0) { - char errmsg[64]; - LOG(WARNING) << "failed to open file. [err=" << strerror_r(errno, errmsg, 64) - << " file_name='" << file_name << "' flag=" << flag << "]"; - if (errno == EEXIST) { - return Status::OLAPInternalError(OLAP_ERR_FILE_ALREADY_EXIST); - } - return Status::OLAPInternalError(OLAP_ERR_IO_ERROR); - } - FileDescriptor* file_desc = new FileDescriptor(_fd); - _cache_handle = _s_fd_cache->insert(key, file_desc, 1, &_delete_cache_file_descriptor); - VLOG_NOTICE << "success to open file with cache. 
" - << "file_name=" << file_name << ", mode=" << flag << ", fd=" << _fd; - } - _is_using_cache = true; _file_name = file_name; return Status::OK(); } @@ -157,9 +96,6 @@ Status FileHandler::open_with_mode(const string& file_name, int flag, int mode) } Status FileHandler::_release() { - _s_fd_cache->release(_cache_handle); - _cache_handle = nullptr; - _is_using_cache = false; return Status::OK(); } @@ -168,24 +104,20 @@ Status FileHandler::close() { return Status::OK(); } - if (_is_using_cache && _s_fd_cache != nullptr) { - _release(); - } else { - // try to sync page cache if have written some bytes - if (_wr_length > 0) { - posix_fadvise(_fd, 0, 0, POSIX_FADV_DONTNEED); - // Clean dirty pages and wait for io queue empty and return - sync_file_range(_fd, 0, 0, SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER); - _wr_length = 0; - } + // try to sync page cache if have written some bytes + if (_wr_length > 0) { + posix_fadvise(_fd, 0, 0, POSIX_FADV_DONTNEED); + // Clean dirty pages and wait for io queue empty and return + sync_file_range(_fd, 0, 0, SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER); + _wr_length = 0; + } - // In some extreme cases (fd is available, but fsync fails) can cause handle leaks - if (::close(_fd) < 0) { - char errmsg[64]; - LOG(WARNING) << "failed to close file. [err= " << strerror_r(errno, errmsg, 64) - << " file_name='" << _file_name << "' fd=" << _fd << "]"; - return Status::OLAPInternalError(OLAP_ERR_IO_ERROR); - } + // In some extreme cases (fd is available, but fsync fails) can cause handle leaks + if (::close(_fd) < 0) { + char errmsg[64]; + LOG(WARNING) << "failed to close file. [err= " << strerror_r(errno, errmsg, 64) + << " file_name='" << _file_name << "' fd=" << _fd << "]"; + return Status::OLAPInternalError(OLAP_ERR_IO_ERROR); } VLOG_NOTICE << "finished to close file. 
" diff --git a/be/src/olap/file_helper.h b/be/src/olap/file_helper.h index 75d47742d9..bd86e8e68b 100644 --- a/be/src/olap/file_helper.h +++ b/be/src/olap/file_helper.h @@ -44,7 +44,6 @@ public: ~FileHandler(); Status open(const std::string& file_name, int flag); - Status open_with_cache(const std::string& file_name, int flag); // The argument mode specifies the permissions to use in case a new file is created. Status open_with_mode(const std::string& file_name, int flag, int mode); Status close(); @@ -91,18 +90,13 @@ public: SAFE_DELETE(file_desc); } - static Cache* get_fd_cache() { return _s_fd_cache; } - private: Status _release(); - static Cache* _s_fd_cache; int _fd; off_t _wr_length; const int64_t _cache_threshold = 1 << 19; std::string _file_name; - bool _is_using_cache; - Cache::Handle* _cache_handle; }; class FileHandlerWithBuf { diff --git a/be/src/olap/lru_cache.cpp b/be/src/olap/lru_cache.cpp index 20c3ca476e..a1b2a174ab 100644 --- a/be/src/olap/lru_cache.cpp +++ b/be/src/olap/lru_cache.cpp @@ -24,12 +24,12 @@ using std::stringstream; namespace doris { -DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(capacity, MetricUnit::BYTES); -DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(usage, MetricUnit::BYTES); -DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(usage_ratio, MetricUnit::NOUNIT); -DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(lookup_count, MetricUnit::OPERATIONS); -DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(hit_count, MetricUnit::OPERATIONS); -DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(hit_ratio, MetricUnit::NOUNIT); +DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(cache_capacity, MetricUnit::BYTES); +DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(cache_usage, MetricUnit::BYTES); +DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(cache_usage_ratio, MetricUnit::NOUNIT); +DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(cache_lookup_count, MetricUnit::OPERATIONS); +DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(cache_hit_count, MetricUnit::OPERATIONS); +DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(cache_hit_ratio, MetricUnit::NOUNIT); uint32_t CacheKey::hash(const char* data, 
size_t n, uint32_t seed) const { // Similar to murmur hash @@ -452,12 +452,12 @@ ShardedLRUCache::ShardedLRUCache(const std::string& name, size_t total_capacity, _entity = DorisMetrics::instance()->metric_registry()->register_entity( std::string("lru_cache:") + name, {{"name", name}}); _entity->register_hook(name, std::bind(&ShardedLRUCache::update_cache_metrics, this)); - INT_GAUGE_METRIC_REGISTER(_entity, capacity); - INT_GAUGE_METRIC_REGISTER(_entity, usage); - INT_DOUBLE_METRIC_REGISTER(_entity, usage_ratio); - INT_ATOMIC_COUNTER_METRIC_REGISTER(_entity, lookup_count); - INT_ATOMIC_COUNTER_METRIC_REGISTER(_entity, hit_count); - INT_DOUBLE_METRIC_REGISTER(_entity, hit_ratio); + INT_GAUGE_METRIC_REGISTER(_entity, cache_capacity); + INT_GAUGE_METRIC_REGISTER(_entity, cache_usage); + INT_DOUBLE_METRIC_REGISTER(_entity, cache_usage_ratio); + INT_ATOMIC_COUNTER_METRIC_REGISTER(_entity, cache_lookup_count); + INT_ATOMIC_COUNTER_METRIC_REGISTER(_entity, cache_hit_count); + INT_DOUBLE_METRIC_REGISTER(_entity, cache_hit_ratio); } ShardedLRUCache::~ShardedLRUCache() { @@ -535,13 +535,13 @@ void ShardedLRUCache::update_cache_metrics() const { total_hit_count += _shards[i]->get_hit_count(); } - capacity->set_value(total_capacity); - usage->set_value(total_usage); - lookup_count->set_value(total_lookup_count); - hit_count->set_value(total_hit_count); - usage_ratio->set_value(total_capacity == 0 ? 0 : ((double)total_usage / total_capacity)); - hit_ratio->set_value(total_lookup_count == 0 ? 0 - : ((double)total_hit_count / total_lookup_count)); + cache_capacity->set_value(total_capacity); + cache_usage->set_value(total_usage); + cache_lookup_count->set_value(total_lookup_count); + cache_hit_count->set_value(total_hit_count); + cache_usage_ratio->set_value(total_capacity == 0 ? 0 : ((double)total_usage / total_capacity)); + cache_hit_ratio->set_value( + total_lookup_count == 0 ? 
0 : ((double)total_hit_count / total_lookup_count)); } Cache* new_lru_cache(const std::string& name, size_t capacity, LRUCacheType type, diff --git a/be/src/olap/lru_cache.h b/be/src/olap/lru_cache.h index 7f925bb69d..7ae79ef3ba 100644 --- a/be/src/olap/lru_cache.h +++ b/be/src/olap/lru_cache.h @@ -388,12 +388,12 @@ private: std::unique_ptr _mem_tracker; std::shared_ptr _entity = nullptr; - IntGauge* capacity = nullptr; - IntGauge* usage = nullptr; - DoubleGauge* usage_ratio = nullptr; - IntAtomicCounter* lookup_count = nullptr; - IntAtomicCounter* hit_count = nullptr; - DoubleGauge* hit_ratio = nullptr; + IntGauge* cache_capacity = nullptr; + IntGauge* cache_usage = nullptr; + DoubleGauge* cache_usage_ratio = nullptr; + IntAtomicCounter* cache_lookup_count = nullptr; + IntAtomicCounter* cache_hit_count = nullptr; + DoubleGauge* cache_hit_ratio = nullptr; }; } // namespace doris diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index 85e73b8a44..9aa4173fd3 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -195,7 +195,6 @@ Status SegmentIterator::_get_row_ranges_by_keys() { size_t pre_size = _row_bitmap.cardinality(); _row_bitmap = RowRanges::ranges_to_roaring(result_ranges); _opts.stats->rows_key_range_filtered += (pre_size - _row_bitmap.cardinality()); - DorisMetrics::instance()->segment_rows_by_short_key->increment(_row_bitmap.cardinality()); return Status::OK(); } @@ -315,7 +314,6 @@ Status SegmentIterator::_get_row_ranges_from_conditions(RowRanges* condition_row &zone_map_row_ranges); } - DorisMetrics::instance()->segment_rows_read_by_zone_map->increment(zone_map_row_ranges.count()); pre_size = condition_row_ranges->count(); RowRanges::ranges_intersection(*condition_row_ranges, zone_map_row_ranges, condition_row_ranges); diff --git a/be/src/olap/segment_loader.cpp b/be/src/olap/segment_loader.cpp index 
f147625af5..a83143e6f9 100644 --- a/be/src/olap/segment_loader.cpp +++ b/be/src/olap/segment_loader.cpp @@ -32,8 +32,7 @@ void SegmentLoader::create_global_instance(size_t capacity) { } SegmentLoader::SegmentLoader(size_t capacity) { - _cache = std::unique_ptr( - new_lru_cache("SegmentLoader:SegmentCache", capacity, LRUCacheType::NUMBER)); + _cache = std::unique_ptr(new_lru_cache("SegmentCache", capacity, LRUCacheType::NUMBER)); } bool SegmentLoader::_lookup(const SegmentLoader::CacheKey& key, SegmentCacheHandle* handle) { diff --git a/be/src/olap/storage_engine.cpp b/be/src/olap/storage_engine.cpp index f239f2a585..c0e001dbae 100644 --- a/be/src/olap/storage_engine.cpp +++ b/be/src/olap/storage_engine.cpp @@ -110,8 +110,6 @@ StorageEngine::StorageEngine(const EngineOptions& options) _available_storage_medium_type_count(0), _effective_cluster_id(-1), _is_all_cluster_id_exist(true), - _index_stream_lru_cache(nullptr), - _file_cache(nullptr), _compaction_mem_tracker( std::make_shared(-1, "StorageEngine::AutoCompaction")), _segment_meta_mem_tracker(std::make_unique("StorageEngine::SegmentMeta")), @@ -183,9 +181,6 @@ void StorageEngine::load_data_dirs(const std::vector& data_dirs) { } Status StorageEngine::_open() { - // NOTE: must init before _init_store_map. 
- _file_cache.reset(new_lru_cache("FileHandlerCache", config::file_descriptor_cache_capacity)); - // init store_map RETURN_NOT_OK_STATUS_WITH_WARN(_init_store_map(), "_init_store_map failed"); @@ -196,9 +191,6 @@ Status StorageEngine::_open() { RETURN_NOT_OK_STATUS_WITH_WARN(_check_file_descriptor_number(), "check fd number failed"); - _index_stream_lru_cache = - new_lru_cache("SegmentIndexCache", config::index_stream_cache_capacity); - auto dirs = get_stores(); load_data_dirs(dirs); @@ -597,9 +589,6 @@ void StorageEngine::stop() { } void StorageEngine::_clear() { - SAFE_DELETE(_index_stream_lru_cache); - _file_cache.reset(); - std::lock_guard l(_store_lock); for (auto& store_pair : _store_map) { delete store_pair.second; @@ -643,7 +632,6 @@ void StorageEngine::clear_transaction_task(const TTransactionId transaction_id, } void StorageEngine::_start_clean_cache() { - _file_cache->prune(); SegmentLoader::instance()->prune(); } diff --git a/be/src/olap/storage_engine.h b/be/src/olap/storage_engine.h index a46218e442..9ad9d4a8d7 100644 --- a/be/src/olap/storage_engine.h +++ b/be/src/olap/storage_engine.h @@ -82,10 +82,6 @@ public: // 是允许的,但re-load全新的path是不允许的,因为此处没有彻底更新ce调度器信息 void load_data_dirs(const std::vector& stores); - Cache* index_stream_lru_cache() { return _index_stream_lru_cache; } - - std::shared_ptr file_cache() { return _file_cache; } - template std::vector get_stores(); @@ -319,17 +315,6 @@ private: int32_t _effective_cluster_id; bool _is_all_cluster_id_exist; - Cache* _index_stream_lru_cache; - - // _file_cache is a lru_cache for file descriptors of files opened by doris, - // which can be shared by others. Why we need to share cache with others? - // Because a unique memory space is easier for management. For example, - // we can deal with segment v1's cache and segment v2's cache at same time. - // Note that, we must create _file_cache before sharing it with other. - // (e.g. 
the storage engine's open function must be called earlier than - // FileBlockManager created.) - std::shared_ptr _file_cache; - static StorageEngine* _s_instance; std::mutex _gc_mutex; diff --git a/be/src/runtime/exec_env.h b/be/src/runtime/exec_env.h index f708f4b410..6a3b71ca4d 100644 --- a/be/src/runtime/exec_env.h +++ b/be/src/runtime/exec_env.h @@ -124,7 +124,6 @@ public: PriorityThreadPool* scan_thread_pool() { return _scan_thread_pool; } PriorityThreadPool* remote_scan_thread_pool() { return _remote_scan_thread_pool; } ThreadPool* limited_scan_thread_pool() { return _limited_scan_thread_pool.get(); } - PriorityThreadPool* etl_thread_pool() { return _etl_thread_pool; } ThreadPool* send_batch_thread_pool() { return _send_batch_thread_pool.get(); } CgroupsMgr* cgroups_mgr() { return _cgroups_mgr; } FragmentMgr* fragment_mgr() { return _fragment_mgr; } @@ -208,7 +207,6 @@ private: std::unique_ptr _limited_scan_thread_pool; std::unique_ptr _send_batch_thread_pool; - PriorityThreadPool* _etl_thread_pool = nullptr; CgroupsMgr* _cgroups_mgr = nullptr; FragmentMgr* _fragment_mgr = nullptr; ResultCache* _result_cache = nullptr; diff --git a/be/src/runtime/exec_env_init.cpp b/be/src/runtime/exec_env_init.cpp index e9074af172..360e2e7a54 100644 --- a/be/src/runtime/exec_env_init.cpp +++ b/be/src/runtime/exec_env_init.cpp @@ -68,7 +68,6 @@ namespace doris { DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(scanner_thread_pool_queue_size, MetricUnit::NOUNIT); -DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(etl_thread_pool_queue_size, MetricUnit::NOUNIT); DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(send_batch_thread_pool_thread_num, MetricUnit::NOUNIT); DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(send_batch_thread_pool_queue_size, MetricUnit::NOUNIT); DEFINE_GAUGE_METRIC_PROTOTYPE_5ARG(query_mem_consumption, MetricUnit::BYTES, "", mem_consumption, @@ -130,8 +129,6 @@ Status ExecEnv::_init(const std::vector& store_paths) { .set_max_queue_size(config::send_batch_thread_pool_queue_size) .build(&_send_batch_thread_pool); 
- _etl_thread_pool = new PriorityThreadPool(config::etl_thread_pool_size, - config::etl_thread_pool_queue_size); _cgroups_mgr = new CgroupsMgr(this, config::doris_cgroups); _fragment_mgr = new FragmentMgr(this); _result_cache = new ResultCache(config::query_cache_max_size_mb, @@ -317,9 +314,6 @@ void ExecEnv::_register_metrics() { REGISTER_HOOK_METRIC(scanner_thread_pool_queue_size, [this]() { return _scan_thread_pool->get_queue_size(); }); - REGISTER_HOOK_METRIC(etl_thread_pool_queue_size, - [this]() { return _etl_thread_pool->get_queue_size(); }); - REGISTER_HOOK_METRIC(send_batch_thread_pool_thread_num, [this]() { return _send_batch_thread_pool->num_threads(); }); @@ -329,7 +323,6 @@ void ExecEnv::_register_metrics() { void ExecEnv::_deregister_metrics() { DEREGISTER_HOOK_METRIC(scanner_thread_pool_queue_size); - DEREGISTER_HOOK_METRIC(etl_thread_pool_queue_size); DEREGISTER_HOOK_METRIC(send_batch_thread_pool_thread_num); DEREGISTER_HOOK_METRIC(send_batch_thread_pool_queue_size); } @@ -352,7 +345,6 @@ void ExecEnv::_destroy() { SAFE_DELETE(_master_info); SAFE_DELETE(_fragment_mgr); SAFE_DELETE(_cgroups_mgr); - SAFE_DELETE(_etl_thread_pool); SAFE_DELETE(_scan_thread_pool); SAFE_DELETE(_remote_scan_thread_pool); SAFE_DELETE(_thread_mgr); diff --git a/be/src/runtime/stream_load/stream_load_executor.cpp b/be/src/runtime/stream_load/stream_load_executor.cpp index 37491fc98e..d87f16fbe5 100644 --- a/be/src/runtime/stream_load/stream_load_executor.cpp +++ b/be/src/runtime/stream_load/stream_load_executor.cpp @@ -37,7 +37,6 @@ Status k_stream_load_plan_status; #endif Status StreamLoadExecutor::execute_plan_fragment(StreamLoadContext* ctx) { - DorisMetrics::instance()->txn_exec_plan_total->increment(1); // submit this params #ifndef BE_TEST ctx->ref(); @@ -132,7 +131,7 @@ Status StreamLoadExecutor::execute_plan_fragment(StreamLoadContext* ctx) { return Status::OK(); } Status StreamLoadExecutor::begin_txn(StreamLoadContext* ctx) { - 
DorisMetrics::instance()->txn_begin_request_total->increment(1); + DorisMetrics::instance()->stream_load_txn_begin_request_total->increment(1); TLoadTxnBeginRequest request; set_request_auth(&request, ctx->auth); @@ -256,7 +255,7 @@ void StreamLoadExecutor::get_commit_request(StreamLoadContext* ctx, } Status StreamLoadExecutor::commit_txn(StreamLoadContext* ctx) { - DorisMetrics::instance()->txn_commit_request_total->increment(1); + DorisMetrics::instance()->stream_load_txn_commit_request_total->increment(1); TLoadTxnCommitRequest request; get_commit_request(ctx, request); @@ -291,7 +290,7 @@ Status StreamLoadExecutor::commit_txn(StreamLoadContext* ctx) { } void StreamLoadExecutor::rollback_txn(StreamLoadContext* ctx) { - DorisMetrics::instance()->txn_rollback_request_total->increment(1); + DorisMetrics::instance()->stream_load_txn_rollback_request_total->increment(1); TNetworkAddress master_addr = _exec_env->master_info()->network_address; TLoadTxnRollbackRequest request; diff --git a/be/src/util/doris_metrics.cpp b/be/src/util/doris_metrics.cpp index a4c7a80cf0..f8a27e4f6a 100644 --- a/be/src/util/doris_metrics.cpp +++ b/be/src/util/doris_metrics.cpp @@ -30,8 +30,6 @@ namespace doris { DEFINE_COUNTER_METRIC_PROTOTYPE_3ARG(fragment_requests_total, MetricUnit::REQUESTS, "Total fragment requests received."); DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(fragment_request_duration_us, MetricUnit::MICROSECONDS); -DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(http_requests_total, MetricUnit::REQUESTS); -DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(http_request_send_bytes, MetricUnit::BYTES); DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(query_scan_bytes, MetricUnit::BYTES); DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(query_scan_rows, MetricUnit::ROWS); DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(query_scan_count, MetricUnit::NOUNIT); @@ -100,28 +98,18 @@ DEFINE_COUNTER_METRIC_PROTOTYPE_5ARG(meta_read_request_duration_us, MetricUnit:: DEFINE_COUNTER_METRIC_PROTOTYPE_5ARG(segment_read_total, MetricUnit::OPERATIONS, 
"(segment_v2) total number of segments read", segment_read, - Labels({{"type", "segment_total_read_times"}})); + Labels({{"type", "segment_read_total"}})); DEFINE_COUNTER_METRIC_PROTOTYPE_5ARG( segment_row_total, MetricUnit::ROWS, "(segment_v2) total number of rows in queried segments (before index pruning)", - segment_read, Labels({{"type", "segment_total_row_num"}})); -DEFINE_COUNTER_METRIC_PROTOTYPE_5ARG( - segment_rows_by_short_key, MetricUnit::ROWS, - "(segment_v2) total number of rows selected by short key index", segment_read, - Labels({{"type", "segment_rows_by_short_key"}})); -DEFINE_COUNTER_METRIC_PROTOTYPE_5ARG(segment_rows_read_by_zone_map, MetricUnit::ROWS, - "(segment_v2) total number of rows selected by zone map index", - segment_read, - Labels({{"type", "segment_rows_read_by_zone_map"}})); + segment_read, Labels({{"type", "segment_row_total"}})); -DEFINE_COUNTER_METRIC_PROTOTYPE_5ARG(txn_begin_request_total, MetricUnit::OPERATIONS, "", - txn_request, Labels({{"type", "begin"}})); -DEFINE_COUNTER_METRIC_PROTOTYPE_5ARG(txn_commit_request_total, MetricUnit::OPERATIONS, "", - txn_request, Labels({{"type", "commit"}})); -DEFINE_COUNTER_METRIC_PROTOTYPE_5ARG(txn_rollback_request_total, MetricUnit::OPERATIONS, "", - txn_request, Labels({{"type", "rollback"}})); -DEFINE_COUNTER_METRIC_PROTOTYPE_5ARG(txn_exec_plan_total, MetricUnit::OPERATIONS, "", txn_request, - Labels({{"type", "exec"}})); +DEFINE_COUNTER_METRIC_PROTOTYPE_5ARG(stream_load_txn_begin_request_total, MetricUnit::OPERATIONS, + "", stream_load_txn_request, Labels({{"type", "begin"}})); +DEFINE_COUNTER_METRIC_PROTOTYPE_5ARG(stream_load_txn_commit_request_total, MetricUnit::OPERATIONS, + "", stream_load_txn_request, Labels({{"type", "commit"}})); +DEFINE_COUNTER_METRIC_PROTOTYPE_5ARG(stream_load_txn_rollback_request_total, MetricUnit::OPERATIONS, + "", stream_load_txn_request, Labels({{"type", "rollback"}})); DEFINE_COUNTER_METRIC_PROTOTYPE_5ARG(stream_receive_bytes_total, MetricUnit::BYTES, "", 
stream_load, Labels({{"type", "receive_bytes"}})); @@ -153,7 +141,6 @@ DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(compaction_waitting_permits, MetricUnit::NOUN DEFINE_HISTOGRAM_METRIC_PROTOTYPE_2ARG(tablet_version_num_distribution, MetricUnit::NOUNIT); -DEFINE_GAUGE_CORE_METRIC_PROTOTYPE_2ARG(push_request_write_bytes_per_second, MetricUnit::BYTES); DEFINE_GAUGE_CORE_METRIC_PROTOTYPE_2ARG(query_scan_bytes_per_second, MetricUnit::BYTES); DEFINE_GAUGE_CORE_METRIC_PROTOTYPE_2ARG(max_disk_io_util_percent, MetricUnit::PERCENT); DEFINE_GAUGE_CORE_METRIC_PROTOTYPE_2ARG(max_network_send_bytes_rate, MetricUnit::BYTES); @@ -198,8 +185,6 @@ DorisMetrics::DorisMetrics() : _metric_registry(_s_registry_name) { INT_COUNTER_METRIC_REGISTER(_server_metric_entity, fragment_requests_total); INT_COUNTER_METRIC_REGISTER(_server_metric_entity, fragment_request_duration_us); - INT_COUNTER_METRIC_REGISTER(_server_metric_entity, http_requests_total); - INT_COUNTER_METRIC_REGISTER(_server_metric_entity, http_request_send_bytes); INT_COUNTER_METRIC_REGISTER(_server_metric_entity, query_scan_bytes); INT_COUNTER_METRIC_REGISTER(_server_metric_entity, query_scan_rows); @@ -254,13 +239,10 @@ DorisMetrics::DorisMetrics() : _metric_registry(_s_registry_name) { INT_COUNTER_METRIC_REGISTER(_server_metric_entity, segment_read_total); INT_COUNTER_METRIC_REGISTER(_server_metric_entity, segment_row_total); - INT_COUNTER_METRIC_REGISTER(_server_metric_entity, segment_rows_by_short_key); - INT_COUNTER_METRIC_REGISTER(_server_metric_entity, segment_rows_read_by_zone_map); - INT_COUNTER_METRIC_REGISTER(_server_metric_entity, txn_begin_request_total); - INT_COUNTER_METRIC_REGISTER(_server_metric_entity, txn_commit_request_total); - INT_COUNTER_METRIC_REGISTER(_server_metric_entity, txn_rollback_request_total); - INT_COUNTER_METRIC_REGISTER(_server_metric_entity, txn_exec_plan_total); + INT_COUNTER_METRIC_REGISTER(_server_metric_entity, stream_load_txn_begin_request_total); + 
INT_COUNTER_METRIC_REGISTER(_server_metric_entity, stream_load_txn_commit_request_total); + INT_COUNTER_METRIC_REGISTER(_server_metric_entity, stream_load_txn_rollback_request_total); INT_COUNTER_METRIC_REGISTER(_server_metric_entity, stream_receive_bytes_total); INT_COUNTER_METRIC_REGISTER(_server_metric_entity, stream_load_rows_total); @@ -281,7 +263,6 @@ DorisMetrics::DorisMetrics() : _metric_registry(_s_registry_name) { HISTOGRAM_METRIC_REGISTER(_server_metric_entity, tablet_version_num_distribution); - INT_GAUGE_METRIC_REGISTER(_server_metric_entity, push_request_write_bytes_per_second); INT_GAUGE_METRIC_REGISTER(_server_metric_entity, query_scan_bytes_per_second); INT_GAUGE_METRIC_REGISTER(_server_metric_entity, max_disk_io_util_percent); INT_GAUGE_METRIC_REGISTER(_server_metric_entity, max_network_send_bytes_rate); diff --git a/be/src/util/doris_metrics.h b/be/src/util/doris_metrics.h index 2811da069b..22407cb853 100644 --- a/be/src/util/doris_metrics.h +++ b/be/src/util/doris_metrics.h @@ -46,8 +46,6 @@ class DorisMetrics { public: IntCounter* fragment_requests_total; IntCounter* fragment_request_duration_us; - IntCounter* http_requests_total; - IntCounter* http_request_send_bytes; IntCounter* query_scan_bytes; IntCounter* query_scan_rows; @@ -109,15 +107,10 @@ public: IntCounter* segment_read_total; // total number of rows in queried segments (before index pruning) IntCounter* segment_row_total; - // total number of rows selected by short key index - IntCounter* segment_rows_by_short_key; - // total number of rows selected by zone map index - IntCounter* segment_rows_read_by_zone_map; - IntCounter* txn_begin_request_total; - IntCounter* txn_commit_request_total; - IntCounter* txn_rollback_request_total; - IntCounter* txn_exec_plan_total; + IntCounter* stream_load_txn_begin_request_total; + IntCounter* stream_load_txn_commit_request_total; + IntCounter* stream_load_txn_rollback_request_total; IntCounter* stream_receive_bytes_total; IntCounter* 
stream_load_rows_total; IntCounter* load_rows; @@ -153,7 +146,6 @@ public: // The following metrics will be calculated // by metric calculator - IntGauge* push_request_write_bytes_per_second; IntGauge* query_scan_bytes_per_second; IntGauge* max_disk_io_util_percent; IntGauge* max_network_send_bytes_rate; @@ -203,7 +195,6 @@ public: UIntGauge* query_cache_partition_total_count; UIntGauge* scanner_thread_pool_queue_size; - UIntGauge* etl_thread_pool_queue_size; UIntGauge* add_batch_task_queue_size; UIntGauge* send_batch_thread_pool_thread_num; UIntGauge* send_batch_thread_pool_queue_size; diff --git a/be/src/vec/sink/vtablet_sink.cpp b/be/src/vec/sink/vtablet_sink.cpp index 825c67b2a6..68abea3856 100644 --- a/be/src/vec/sink/vtablet_sink.cpp +++ b/be/src/vec/sink/vtablet_sink.cpp @@ -384,6 +384,7 @@ size_t VOlapTableSink::get_pending_bytes() const { } return mem_consumption; } + Status VOlapTableSink::send(RuntimeState* state, vectorized::Block* input_block) { INIT_AND_SCOPE_SEND_SPAN(state->get_tracer(), _send_span, "VOlapTableSink::send"); SCOPED_CONSUME_MEM_TRACKER(_mem_tracker.get()); diff --git a/be/test/util/doris_metrics_test.cpp b/be/test/util/doris_metrics_test.cpp index 91df81602e..2dcea035b2 100644 --- a/be/test/util/doris_metrics_test.cpp +++ b/be/test/util/doris_metrics_test.cpp @@ -47,18 +47,6 @@ TEST_F(DorisMetricsTest, Normal) { EXPECT_TRUE(metric != nullptr); EXPECT_STREQ("101", metric->to_string().c_str()); } - { - DorisMetrics::instance()->http_requests_total->increment(102); - auto metric = server_entity->get_metric("http_requests_total"); - EXPECT_TRUE(metric != nullptr); - EXPECT_STREQ("102", metric->to_string().c_str()); - } - { - DorisMetrics::instance()->http_request_send_bytes->increment(104); - auto metric = server_entity->get_metric("http_request_send_bytes"); - EXPECT_TRUE(metric != nullptr); - EXPECT_STREQ("104", metric->to_string().c_str()); - } { DorisMetrics::instance()->query_scan_bytes->increment(104); auto metric = 
server_entity->get_metric("query_scan_bytes"); diff --git a/docs/en/docs/admin-manual/maint-monitor/monitor-metrics/be-metrics.md b/docs/en/docs/admin-manual/maint-monitor/monitor-metrics/be-metrics.md deleted file mode 100644 index d630d11065..0000000000 --- a/docs/en/docs/admin-manual/maint-monitor/monitor-metrics/be-metrics.md +++ /dev/null @@ -1,84 +0,0 @@ ---- -{ - "title": "BE Metrics", - "language": "en" -} ---- - - - - - -# BE Metrics - -This document mainly introduces the monitor metrics of BE. - -## View Metrics - -BE metrics can be viewed by visiting: - -`http://be_host:be_webserver_port/metrics` - -The default format is of [Prometheus](https://prometheus.io/). - -You can get Json format by visiting: - -`http://be_host:be_webserver_port/metrics?type=json` - -## Metrics List - -### `doris_be_snmp{name="tcp_in_errs"}` - -Value of the `Tcp: InErrs` field in `/proc/net/snmp`. Represents the number of error TCP packets currently received. - -The incidence rate can be calculated in combination with the sampling period. - -Usually used to troubleshoot network problems. - -### `doris_be_snmp{name="tcp_retrans_segs"}` - -Value of the `Tcp: RetransSegs` field in `/proc/net/snmp`. Represents the number of error TCP packets currently received. - -The incidence rate can be calculated in combination with the sampling period. - -Usually used to troubleshoot network problems. - -### `doris_be_snmp{name="tcp_in_segs"}` - -Value of the `Tcp: InSegs` field in `/proc/net/snmp`. Represents the number of received TCP packets. - -Use `(NEW_tcp_in_errs - OLD_tcp_in_errs) / (NEW_tcp_in_segs - OLD_tcp_in_segs)` can calculate the error rate of received TCP packets. - -Usually used to troubleshoot network problems. - -### `doris_be_snmp{name="tcp_out_segs"}` - -Value of the `Tcp: OutSegs` field in `/proc/net/snmp`. Represents the number of send TCP packets with RST mark. 
- -Use `(NEW_tcp_retrans_segs - OLD_tcp_retrans_segs) / (NEW_tcp_out_segs - OLD_tcp_out_segs)` can calculate the retrans rate of TCP packets. - -Usually used to troubleshoot network problems. - -### `doris_be_compaction_mem_current_consumption` - -The total MemPool consumption of all running `Compaction` threads. Use this value, we can easily identify whether -Compactions use too much memory, it may cause memory overhead or OOM. - -Usually used to troubleshoot memory problems. \ No newline at end of file diff --git a/docs/en/docs/admin-manual/maint-monitor/monitor-metrics/fe-metrics.md b/docs/en/docs/admin-manual/maint-monitor/monitor-metrics/fe-metrics.md deleted file mode 100644 index 8ad4381c8a..0000000000 --- a/docs/en/docs/admin-manual/maint-monitor/monitor-metrics/fe-metrics.md +++ /dev/null @@ -1,155 +0,0 @@ ---- -{ - "title": "FE Metrics", - "language": "en" -} ---- - - - - - -# FE Metrics - -This document mainly introduces the monitor metrics of FE. - -## View Metrics - -FE metrics can be viewed by visiting: - -`http://fe_host:fe_http_port/metrics` - -The default format is of [Prometheus](https://prometheus.io/). - -You can get Json format by visiting: - -`http://fe_host:fe_http_port/metrics?type=json` - -## Metrics List - -### `doris_fe_snmp{name="tcp_in_errs"}` - -Value of the `Tcp: InErrs` field in `/proc/net/snmp`. Represents the number of error TCP packets currently received. - -The incidence rate can be calculated in combination with the sampling period. - -Usually used to troubleshoot network problems. - -### `doris_fe_snmp{name="tcp_retrans_segs"}` - -Value of the `Tcp: RetransSegs` field in `/proc/net/snmp`. Represents the number of error TCP packets currently received. - -The incidence rate can be calculated in combination with the sampling period. - -Usually used to troubleshoot network problems. - -### `doris_fe_snmp{name="tcp_in_segs"}` - -Value of the `Tcp: InSegs` field in `/proc/net/snmp`. Represents the number of received TCP packets. 
- -Use `(NEW_tcp_in_errs - OLD_tcp_in_errs) / (NEW_tcp_in_segs - OLD_tcp_in_segs)` can calculate the error rate of received TCP packets. - -Usually used to troubleshoot network problems. - -### `doris_fe_snmp{name="tcp_out_segs"}` - -Value of the `Tcp: OutSegs` field in `/proc/net/snmp`. Represents the number of send TCP packets with RST mark. - -Use `(NEW_tcp_retrans_segs - OLD_tcp_retrans_segs) / (NEW_tcp_out_segs - OLD_tcp_out_segs)` can calculate the retrans rate of TCP packets. - -Usually used to troubleshoot network problems. - -### `doris_fe_meminfo{name="memory_total"}` - -Value of the `MemTotal` field in `/proc/meminfo`. Represents the size of all available memory, total physical memory minus reserved space and kernel size. - -Usually used to troubleshoot memory problems. - -### `doris_fe_meminfo{name="memory_free"}` - -Value of the `MemFree` field in `/proc/meminfo`. Represents the size of unused memory in system. - -Usually used to troubleshoot memory problems. - -### `doris_fe_meminfo{name="memory_available"}` - -Value of the `MemAvailable` field in `/proc/meminfo`. Represents the real system usable memory size. Although some memory in the system has been used, but it can be reclaimed. So this part of reclaimable memory plus MemFree is the system usable memory. - -Usually used to troubleshoot memory problems. - -### `doris_fe_meminfo{name="buffers"}` - -Value of the `Buffers` field in `/proc/meminfo`. Represents the memory used to cache the block device (metadata, pages of the file system). - -Usually used to troubleshoot memory problems. - -### `doris_fe_meminfo{name="cached"}` - -Value of the `Cached` field in `/proc/meminfo`. Represents the memory allocated to the file cache. - -Usually used to troubleshoot memory problems. - -### `jvm_thread{type="count"}` - -Value of the `count` type in `jvm_thread`. Represents the current number of live threads including both daemon and non-daemon threads. 
- -Usually used to troubleshoot jvm threads problems for FE. - -### `jvm_thread{type="peak_count"}` - -Value of the `peak_count` type in `jvm_thread`. Represents the current number of live threads including both daemon and non-daemon threads. - -Usually used to troubleshoot jvm threads problems for FE. - -### `jvm_thread{type="new_count"}` - -Value of the `new_count` type in `jvm_thread`. Represents the current number of threads which state is NEW. - -Usually used to troubleshoot jvm threads problems for FE. - -### `jvm_thread{type="runnable_count"}` - -Value of the `runnable_count` type in `jvm_thread`. Represents the current number of threads which state is RUNNABLE. - -Usually used to troubleshoot jvm threads problems for FE. - -### `jvm_thread{type="blocked_count"}` - -Value of the `blocked_count` type in `jvm_thread`. Represents the current number of threads which state is BLOCKED. - -Usually used to troubleshoot jvm threads problems for FE. - -### `jvm_thread{type="waiting_count"}` - -Value of the `waiting_count` type in `jvm_thread`. Represents the current number of threads which state is WAITING. - -Usually used to troubleshoot jvm threads problems for FE. - -### `jvm_thread{type="timed_waiting_count"}` - -Value of the `timed_waiting_count` type in `jvm_thread`. Represents the current number of threads which state is TIMED_WAITING. - -Usually used to troubleshoot jvm threads problems for FE. - -### `jvm_thread{type="terminated_count"}` - -Value of the `terminated_count` type in `jvm_thread`. Represents the current number of threads which state is TERMINATED. - -Usually used to troubleshoot jvm threads problems for FE. 
\ No newline at end of file diff --git a/docs/en/docs/admin-manual/maint-monitor/monitor-metrics/metrics.md b/docs/en/docs/admin-manual/maint-monitor/monitor-metrics/metrics.md new file mode 100644 index 0000000000..e44f28a2d7 --- /dev/null +++ b/docs/en/docs/admin-manual/maint-monitor/monitor-metrics/metrics.md @@ -0,0 +1,30 @@ +--- +{ + "title": "Monitor Metrics", + "language": "en" +} +--- + + + +# Monitor Metrics + +(TODO) +There is no English document, please visit the Chinese document. diff --git a/docs/zh-CN/docs/admin-manual/maint-monitor/monitor-metrics/be-metrics.md b/docs/zh-CN/docs/admin-manual/maint-monitor/monitor-metrics/be-metrics.md deleted file mode 100644 index db555efbe7..0000000000 --- a/docs/zh-CN/docs/admin-manual/maint-monitor/monitor-metrics/be-metrics.md +++ /dev/null @@ -1,84 +0,0 @@ ---- -{ - "title": "BE 监控项", - "language": "zh-CN" -} ---- - - - - - -# BE 监控项 - -该文档主要介绍 BE 的相关监控项。 - -## 查看监控项 - -BE 的监控项可以通过以下方式访问: - -`http://be_host:be_webserver_port/metrics` - -默认显示为 [Prometheus](https://prometheus.io/) 格式。 - -通过以下接口可以获取 Json 格式的监控项: - -`http://be_host:be_webserver_port/metrics?type=json` - -## 监控项列表 - -### `doris_be_snmp{name="tcp_in_errs"}` - -该监控项为 `/proc/net/snmp` 中的 `Tcp: InErrs` 字段值。表示当前接收到的错误的 TCP 包的数量。 - -结合采样周期可以计算发生率。 - -通常用于排查网络问题。 - -### `doris_be_snmp{name="tcp_retrans_segs"}` - -该监控项为 `/proc/net/snmp` 中的 `Tcp: RetransSegs` 字段值。表示当前重传的 TCP 包的数量。 - -结合采样周期可以计算发生率。 - -通常用于排查网络问题。 - -### `doris_be_snmp{name="tcp_in_segs"}` - -该监控项为 `/proc/net/snmp` 中的 `Tcp: InSegs` 字段值。表示当前接收到的所有 TCP 包的数量。 - -通过 `(NEW_tcp_in_errs - OLD_tcp_in_errs) / (NEW_tcp_in_segs - OLD_tcp_in_segs)` 可以计算接收到的 TCP 错误包率。 - -通常用于排查网络问题。 - -### `doris_be_snmp{name="tcp_out_segs"}` - -该监控项为 `/proc/net/snmp` 中的 `Tcp: OutSegs` 字段值。表示当前发送的所有带 RST 标记的 TCP 包的数量。 - -通过 `(NEW_tcp_tcp_retrans_segs - OLD_tcp_retrans_segs) / (NEW_tcp_out_segs - OLD_tcp_out_segs)` 可以计算 TCP 重传率。 - -通常用于排查网络问题。 - -### `doris_be_compaction_mem_current_consumption` - 
-该监控项为Compaction使用的MemPool总和(所有Compaction线程)。通过该值,可以迅速判断Compaction是否占用过多内存,引起高内存占用 -甚至OOM等问题。 - -通常用于排查内存使用问题。 \ No newline at end of file diff --git a/docs/zh-CN/docs/admin-manual/maint-monitor/monitor-metrics/fe-metrics.md b/docs/zh-CN/docs/admin-manual/maint-monitor/monitor-metrics/fe-metrics.md deleted file mode 100644 index 2277296438..0000000000 --- a/docs/zh-CN/docs/admin-manual/maint-monitor/monitor-metrics/fe-metrics.md +++ /dev/null @@ -1,155 +0,0 @@ ---- -{ - "title": "FE 监控项", - "language": "zh-CN" -} ---- - - - - - -# FE 监控项 - -该文档主要介绍 FE 的相关监控项。 - -## 查看监控项 - -FE 的监控项可以通过以下方式访问: - -`http://fe_host:fe_http_port/metrics` - -默认显示为 [Prometheus](https://prometheus.io/) 格式。 - -通过以下接口可以获取 Json 格式的监控项: - -`http://fe_host:fe_http_port/metrics?type=json` - -## 监控项列表 - -### `doris_fe_snmp{name="tcp_in_errs"}` - -该监控项为 `/proc/net/snmp` 中的 `Tcp: InErrs` 字段值。表示当前接收到的错误的 TCP 包的数量。 - -结合采样周期可以计算发生率。 - -通常用于排查网络问题。 - -### `doris_fe_snmp{name="tcp_retrans_segs"}` - -该监控项为 `/proc/net/snmp` 中的 `Tcp: RetransSegs` 字段值。表示当前重传的 TCP 包的数量。 - -结合采样周期可以计算发生率。 - -通常用于排查网络问题。 - -### `doris_fe_snmp{name="tcp_in_segs"}` - -该监控项为 `/proc/net/snmp` 中的 `Tcp: InSegs` 字段值。表示当前接收到的所有 TCP 包的数量。 - -通过 `(NEW_tcp_in_errs - OLD_tcp_in_errs) / (NEW_tcp_in_segs - OLD_tcp_in_segs)` 可以计算接收到的 TCP 错误包率。 - -通常用于排查网络问题。 - -### `doris_fe_snmp{name="tcp_out_segs"}` - -该监控项为 `/proc/net/snmp` 中的 `Tcp: OutSegs` 字段值。表示当前发送的所有带 RST 标记的 TCP 包的数量。 - -通过 `(NEW_tcp_tcp_retrans_segs - OLD_tcp_retrans_segs) / (NEW_tcp_out_segs - OLD_tcp_out_segs)` 可以计算 TCP 重传率。 - -通常用于排查网络问题。 - -### `doris_fe_meminfo{name="memory_total"}` - -该监控项为 `/proc/meminfo` 中的 `MemTotal` 字段值。表示所有可用的内存大小,总的物理内存减去预留空间和内核大小。 - -通常用于排查内存问题。 - -### `doris_fe_meminfo{name="memory_free"}` - -该监控项为 `/proc/meminfo` 中的 `MemFree` 字段值。表示系统尚未使用的内存。。 - -通常用于排查内存问题。 - -### `doris_fe_meminfo{name="memory_available"}` - -该监控项为 `/proc/meminfo` 中的 `MemAvailable` 字段值。真正的系统可用内存,系统中有些内存虽然已被使用但是可以回收的,所以这部分可回收的内存加上MemFree才是系统可用的内存 - -通常用于排查内存问题。 - -### 
`doris_fe_meminfo{name="buffers"}` - -该监控项为 `/proc/meminfo` 中的 `Buffers` 字段值。表示用来给块设备做缓存的内存(文件系统的metadata、pages)。 - -通常用于排查内存问题。 - -### `doris_fe_meminfo{name="cached"}` - -该监控项为 `/proc/meminfo` 中的 `Cached` 字段值。表示分配给文件缓冲区的内存。 - -通常用于排查内存问题。 - -### `jvm_thread{type="count"}` - -该监控项表示FE节点当前JVM总的线程数量,包含daemon线程和非daemon线程。 - -通常用于排查FE节点的JVM线程运行问题。 - -### `jvm_thread{type="peak_count"}` - -该监控项表示FE节点从JVM启动以来的最大峰值线程数量。 - -通常用于排查FE节点的JVM线程运行问题。 - -### `jvm_thread{type="new_count"}` - -该监控项表示FE节点JVM中处于NEW状态的线程数量。 - -通常用于排查FE节点的JVM线程运行问题。 - -### `jvm_thread{type="runnable_count"}` - -该监控项表示FE节点JVM中处于RUNNABLE状态的线程数量。 - -通常用于排查FE节点的JVM线程运行问题。 - -### `jvm_thread{type="blocked_count"}` - -该监控项表示FE节点JVM中处于BLOCKED状态的线程数量。 - -通常用于排查FE节点的JVM线程运行问题。 - -### `jvm_thread{type="waiting_count"}` - -该监控项表示FE节点JVM中处于WAITING状态的线程数量。 - -通常用于排查FE节点的JVM线程运行问题。 - -### `jvm_thread{type="timed_waiting_count"}` - -该监控项表示FE节点JVM中处于TIMED_WAITING状态的线程数量。 - -通常用于排查FE节点的JVM线程运行问题。 - -### `jvm_thread{type="terminated_count"}` - -该监控项表示FE节点JVM中处于TERMINATED状态的线程数量。 - -通常用于排查FE节点的JVM线程运行问题。 \ No newline at end of file diff --git a/docs/zh-CN/docs/admin-manual/maint-monitor/monitor-metrics/metrics.md b/docs/zh-CN/docs/admin-manual/maint-monitor/monitor-metrics/metrics.md new file mode 100644 index 0000000000..d5789c05f5 --- /dev/null +++ b/docs/zh-CN/docs/admin-manual/maint-monitor/monitor-metrics/metrics.md @@ -0,0 +1,314 @@ +--- +{ + "title": "监控指标", + "language": "zh-CN" +} +--- + + + +# 监控指标 + +Doris 的 FE 进程和 BE 进程都提供了完备的监控指标。监控指标可以分为两类: + +1. 进程监控:主要展示 Doris 进程本身的一些监控值。 +2. 
节点监控:主要展示 Doris 进程所在节点机器本身的监控,如 CPU、内存、IO、网络等等。 + +可以通过访问 FE 或 BE 节点的 http 端口获取当前监控。如: + +``` +curl http://fe_host:http_port/metrics +curl http://be_host:webserver_port/metrics +``` + +默认返回 Prometheus 兼容格式的监控指标,如: + +``` +doris_fe_cache_added{type="partition"} 0 +doris_fe_cache_added{type="sql"} 0 +doris_fe_cache_hit{type="partition"} 0 +doris_fe_cache_hit{type="sql"} 0 +doris_fe_connection_total 2 +``` + +如需获取 Json 格式的监控指标,请访问: + +``` +curl http://fe_host:http_port/metrics?type=json +curl http://be_host:webserver_port/metrics?type=json +``` + +## 监控等级和最佳实践 + +**表格中的最后一列标注了监控项的重要等级。P0 表示最重要,数值越大,重要性越低。** + +绝大多数监控指标类型为 Counter。即累计值。你可通过间隔采集(如每15秒)监控值,并计算单位时间的斜率,来获得有效信息。 + +如可以通过计算 `doris_fe_query_err` 的斜率来获取查询错误率(error per second)。 + +> 欢迎完善此表格以提供更全面有效的监控指标。 + +## FE 监控指标 + +### 进程监控 + +|名称| 标签 |单位 | 含义 | 说明 | 等级 | +|---|---|---|---|---|---| +|`doris_fe_cache_added`|{type="partition"}| Num | 新增的 Partition Cache 数量累计值 | | +||{type="sql"}|Num| 新增的 SQL Cache 数量累计值 | | +|`doris_fe_cache_hit`|{type="partition"}| Num | 命中 Partition Cache 的计数 | | +||{type="sql"}| Num | 命中 SQL Cache 的计数 | | +|`doris_fe_connection_total`| | Num| 当前FE的MySQL端口连接数 | 用于监控查询连接数。如果连接数超限,则新的连接将无法接入 | P0 | +|`doris_fe_counter_hit_sql_block_rule`|| Num| 被 SQL BLOCK RULE 拦截的查询数量 | | | +|`doris_fe_edit_log_clean`| {type="failed"} | Num| 清理历史元数据日志失败的次数 | 不应失败,如失败,需人工介入 | P0| +|| {type="success"} | Num| 清理历史元数据日志成功的次数 | | +|`doris_fe_edit_log`| {type="bytes"} |字节 | 元数据日志写入量的累计值 | 通过计算斜率可以获得写入速率,来观察是否元数据写入有延迟 | P0 | +|| {type="read"} |Num| 元数据日志读取次数的计数 | 通过斜率观察元数据读取频率是否正常 |P0 | +|| {type="write"} |Num | 元数据日志写入次数的计数 |通过斜率观察元数据写入频率是否正常 |P0 | +|`doris_fe_editlog_write_latency_ms`| | 毫秒| 元数据日志写入延迟的百分位统计。如 {quantile="0.75"} 表示 75 分位的写入延迟 | | +|`doris_fe_image_clean`|{type="failed"} | Num | 清理历史元数据镜像文件失败的次数 | 不应失败,如失败,需人工介入 | P0| +||{type="success"} | Num | 清理历史元数据镜像文件成功的次数 | | +|`doris_fe_image_push`|{type="failed"} | Num | 将元数据镜像文件推送给其他FE节点的失败的次数 | | +||{type="success"} | Num | 将元数据镜像文件推送给其他FE节点的成功的次数 | | 
+|`doris_fe_image_write`|{type="failed"} | Num | 生成元数据镜像文件失败的次数 | 不应失败,如失败,需人工介入 | P0| +||{type="success"} | Num | 生成元数据镜像文件成功的次数 | | +|`doris_fe_job`| | Num | 当前不同作业类型以及不同作业状态的计数。如 {job="load", type="INSERT", state="LOADING"} 表示类型为 INSERT 的导入作业,处于 LOADING 状态的作业个数| 可以根据需要,观察不同类型的作业在集群中的数量 | P0 | +|`doris_fe_max_journal_id`| | Num | 当前FE节点最大元数据日志ID。如果是Master FE,则是当前写入的最大ID,如果是非Master FE,则代表当前回放的元数据日志最大ID | 用于观察多个FE之间的 id 是否差距过大。过大则表示元数据同步出现问题 | P0 | +|`doris_fe_max_tablet_compaction_score`| | Num| 所有BE节点中最大的 compaction score 值。 | 该值可以观测当前集群最大的 compaction score,以判断是否过高。如过高则可能出现查询或写入延迟 | P0 | +|`doris_fe_qps`| | Num/Sec | 当前FE每秒查询数量(仅统计查询请求)| QPS | P0 | +|`doris_fe_query_err`| | Num | 错误查询的累积值 | | +|`doris_fe_query_err_rate`| | Num/Sec| 每秒错误查询数 | 观察集群是否出现查询错误 | P0 | +|`doris_fe_query_latency_ms`| | 毫秒| 查询请求延迟的百分位统计。如 {quantile="0.75"} 表示 75 分位的查询延迟 | 详细观察各分位查询延迟 | P0 | +|`doris_fe_query_olap_table`| | Num| 查询内部表(OlapTable)的请求个数统计 | | +|`doris_fe_query_total`| | Num | 所有查询请求的累积计数 | | +|`doris_fe_report_queue_size`| | Num | BE的各种定期汇报任务在FE端的队列长度 | 该值反映了汇报任务在 Master FE 节点上的阻塞程度,数值越大,表示FE处理能力不足 | P0| +|`doris_fe_request_total`| | Num | 所有通过 MySQL 端口接收的操作请求(包括查询和其他语句)| | +|`doris_fe_routine_load_error_rows`| | Num | 统计集群内所有 Routine Load 作业的错误行数总和 | | +|`doris_fe_routine_load_receive_bytes`| | 字节 | 统计集群内所有 Routine Load 作业接收的数据量大小 | | +|`doris_fe_routine_load_rows`| | Num | 统计集群内所有 Routine Load 作业接收的数据行数 | | +|`doris_fe_rps`| | Num | 当前FE每秒请求数量(包含查询以及其他各类语句) | 和 QPS 配合来查看集群处理请求的量 | P0 | +|`doris_fe_scheduled_tablet_num`| | Num | Master FE节点正在调度的 tablet 数量。包括正在修复的副本和正在均衡的副本 | 该数值可以反映当前集群,正在迁移的 tablet 数量。如果长时间有值,说明集群不稳定 | P0 | +|`doris_fe_tablet_max_compaction_score`| | Num | 各个BE节点汇报的 compaction core。如 {backend="172.21.0.1:9556"} 表示 "172.21.0.1:9556" 这个BE的汇报值| | +|`doris_fe_tablet_num`| | Num | 各个BE节点当前tablet总数。如 {backend="172.21.0.1:9556"} 表示 "172.21.0.1:9556" 这个BE的当前tablet数量 | 可以查看 tablet 分布是否均匀以及绝对值是否合理 | P0| +|`doris_fe_tablet_status_count`| |Num| 统计 Master FE 节点 
Tablet调度器所调度的 tablet 数量的累计值。| | +| | {type="added"} |Num| 统计 Master FE 节点 Tablet调度器所调度的 tablet 数量的累计值。 "added" 表示被调度过的 tablet 数量 | | +|| {type="in_sched"} |Num| 同上。表示被重复调度的 tablet 数量 |该值如果增长较快,则说明有tablet长时间处于不健康状态,导致被调度器反复调度 | +|| {type="not_ready"} | Num |同上。表示尚未满足调度触发条件的tablet数量。| 该值如果增长较快,说明有大量 tablet 处于不健康状态但又无法被调度 | +|| {type="total"} |Num |同上。表示累积的被检查过(但不一定被调度)的tablet数量。| | +|| {type="unhealthy"}| Num| 同上。表示累积的被检查过的不健康的 tablet 数量。| | +|`doris_fe_thread_pool`| | Num | 统计各类线程池的工作线程数和排队情况。`"active_thread_num"` 表示正在执行的任务数。`"pool_size"` 表示线程池总线程数量。`"task_in_queue"` 表示正在排队的任务数| | +|| {name="agent-task-pool"} | Num | Master FE 用于发送 Agent Task 到 BE的线程池 | | +|| {name="connect-scheduler-check-timer"} | Num | 用于检查MySQL空闲连接是否超时的线程池 | | +|| {name="connect-scheduler-pool"} | Num | 用于接收MySQL连接请求的线程池 | | +|| {name="mysql-nio-pool"} | Num | NIO MySQL Server 用于处理任务的线程池 | | +|| {name="export-exporting-job-pool"} | Num | exporting状态的export作业的调度线程池 | | +|| {name="export-pending-job-pool"} | Num | pending状态的export作业的调度线程池| | +|| {name="heartbeat-mgr-pool"} | Num | Master FE 用于处理各个节点心跳的线程池| | +|| {name="loading-load-task-scheduler"} | Num | Master FE 用于调度Broker Load作业中,loading task的调度线程池| | +|| {name="pending-load-task-scheduler"} | Num | Master FE 用于调度Broker Load作业中,pending task的调度线程池| | +|| {name="schema-change-pool"} | Num | Master FE 用于调度 schema change 作业的线程池 | | +|| {name="thrift-server-pool"} | Num | FE 端ThriftServer的工作线程池。对应 fe.conf 中 `rpc_port`。用于和BE进行交互。 || +|`doris_fe_txn_counter`| | Num | 统计各个状态的导入事务的数量的累计值| 可以观测导入事务的执行情况。 | P0 | +|| {type="begin"}| Num| 提交的事务数量 | | +||{type="failed"} | Num| 失败的事务数量| | +|| {type="reject"} | Num| 被拒绝的事务数量。(如当前运行事务数大于阈值,则新的事务会被拒绝)| | +|| {type="succes"} | Num| 成功的事务数量| | +|`doris_fe_txn_status`| | Num | 统计当前处于各个状态的导入事务的数量。如 {type="committed"} 表示处于 committed 状态的事务的数量 | 可以观测各个状态下导入事务的数量,来判断是否有堆积 | P0 | +|`doris_fe_max_instances_num_per_user`|| Num| 当前连接用户中,发起fragment instance最多的用户的 instance 数目 |该数值可以用于观测当前是否有用户占用过多查询资源| P0 | + +### JVM 监控 + 
+|名称| 标签 |单位 | 含义 | 说明 | 等级 | +|---|---|---|---|---|---| +|`jvm_heap_size_bytes`| | 字节 | JVM 内存监控。标签包含 max, used, committed,分别对应最大值,已使用和已申请的内存 | 观测JVM内存使用情况 | P0 | +|`jvm_non_heap_size_bytes`| | 字节 | JVM 堆外内存统计| | +|`jvm_old_gc`| | | 老年代 GC 监控。| 观测是否出现长时间的 FullGC | P0 | +| |{type="count"} | Num | 老年代 GC 次数累计值| | +| |{type="time"} | 毫秒 | 老年代 GC 耗时累计值| | +|`jvm_old_size_bytes`| | 字节| JVM 老年代内存统计 | | P0 | +|`jvm_thread`| | Num| JVM 线程数统计 | 观测 JVM 线程数是否合理| P0| +|`jvm_young_gc`| | |新生代 GC 监控 | | +| |{type="count"} | Num | 新生代 GC 次数累计值| | +| |{type="time"} | 毫秒 | 新生代 GC 耗时累计值| | +|`jvm_young_size_bytes`| | 字节|JVM 新生代内存统计 | | P0 | + +### 机器监控 + +|名称| 标签 |单位 | 含义 | 说明 | 等级 | +|---|---|---|---|---|---| +|`system_meminfo`| | 字节| FE节点机器的内存监控。采集自 `/proc/meminfo`。包括 `buffers`,`cached`, `memory_available`, `memory_free`, `memory_total` | | +|`system_snmp`| | FE节点机器的网络监控。采集自 `/proc/net/snmp`。 | | | +||`{name="tcp_in_errs"}` | Num| tcp包接收错误的次数| | +||`{name="tcp_in_segs"}` | Num | tcp包发送的个数 | | +|| `{name="tcp_out_segs"}`| Num | tcp包发送的个数 | | +||`{name="tcp_retrans_segs"}` | Num | tcp包重传的个数 | | + +## BE 监控指标 + +### 进程监控 + +|名称| 标签 |单位 | 含义 | 说明 | 等级 | +|---|---|---|---|---|---| +|`doris_be_active_scan_context_count`| | Num | 展示当前由外部直接打开的scanner的个数 | | +|`doris_be_add_batch_task_queue_size`| | Num | 记录导入时,接收batch的线程池的队列大小 |如果大于0,则表示导入任务的接收端出现积压 | P0| +|`agent_task_queue_size`| | Num | 展示各个 Agent Task 处理队列的长度,如 `{type="CREATE_TABLE"}` 表示 CREATE_TABLE 任务队列的长度 | | +|`doris_be_brpc_endpoint_stub_count`| | Num | 已创建的 brpc stub 的数量,这些 stub 用于 BE 之间的交互 | | +|`doris_be_brpc_function_endpoint_stub_count`| | Num | 已创建的 brpc stub 的数量,这些 stub 用于和 Remote RPC 之间交互 | | +|`doris_be_cache_capacity`| | | 记录指定 LRU Cache 的容量 | | +|`doris_be_cache_usage`| | | 记录指定 LRU Cache 的使用量 | 用于观测内存占用情况 | P0| +|`doris_be_cache_usage_ratio`| | | 记录指定 LRU Cache 的使用率 | | +|`doris_be_cache_lookup_count`| | | 记录指定 LRU Cache 被查找的次数 | | +|`doris_be_cache_hit_count`| | | 记录指定 LRU Cache 的命中次数 | | 
+|`doris_be_cache_hit_ratio`| | | 记录指定 LRU Cache 的命中率 | 用于观测cache是否有效 | P0| +|| {name="DataPageCache"} | 字节 | DataPageCache 用于缓存数据的 Data Page | 数据Cache,直接影响查询效率 | P0| +|| {name="IndexPageCache"} | Num| IndexPageCache 用于缓存数据的 Index Page | 索引Cache,直接影响查询效率 | P0| +|| {name="LastestSuccessChannelCache"} | Num| LastestSuccessChannelCache 用于缓存导入接收端的 LoadChannel | | +|| {name="SegmentCache"} | Num | SegmentCache 用于缓存已打开的 Segment,如索引信息 | | +|`doris_be_chunk_pool_local_core_alloc_count`| | Num | ChunkAllocator中,从绑定的 core 的内存队列中分配内存的次数 | | +|`doris_be_chunk_pool_other_core_alloc_count`| | Num | ChunkAllocator中,从其他的 core 的内存队列中分配内存的次数 | | +|`doris_be_chunk_pool_reserved_bytes`| | 字节 | ChunkAllocator 中预留的内存大小 | | +|`doris_be_chunk_pool_system_alloc_cost_ns`| | 纳秒 | SystemAllocator 申请内存的耗时累计值 | 通过斜率可以观测内存分配的耗时| P0| +|`doris_be_chunk_pool_system_alloc_count`| | Num | SystemAllocator 申请内存的次数 | | +|`doris_be_chunk_pool_system_free_cost_ns`| | 纳秒 |SystemAllocator 释放内存的耗时累计值 | 通过斜率可以观测内存释放的耗时| P0| +|`doris_be_chunk_pool_system_free_count`| | Num | SystemAllocator 释放内存的次数 | | +|`doris_be_compaction_bytes_total`| |字节| compaction处理的数据量的累计值 | 记录的是 compaction 任务中,input rowset 的 disk size。通过斜率可以观测 compaction的速率 | P0| +| |{type="base"} |字节 | Base Compaction 的数据量累计 | | +| |{type="cumulative"} |字节 | Cumulative Compaction 的数据量累计 | | +|`doris_be_compaction_deltas_total`| |Num| compaction处理的 rowset 个数的累计值 | 记录的是 compaction 任务中,input rowset 的 个数 | +| |{type="base"} |Num | Base Compaction 处理的 rowset 个数累计 | | +| |{type="cumulative"} |Num | Cumulative Compaction 处理的 rowset 个数累计 | | +|`doris_be_disks_compaction_num`| | Num| 指定数据目录上正在执行的 compaction 任务数。如 `{path="/path1/"}` 表示`/path1` 目录上正在执行的任务数 | 用于观测各个磁盘上的 compaction 任务数是否合理 | P0 | +|`doris_be_disks_compaction_score`| | Num| 指定数据目录上正在执行的 compaction 令牌数。如 `{path="/path1/"}` 表示`/path1` 目录上正在执行的令牌数 | | +|`doris_be_compaction_used_permits`| | Num | Compaction 任务已使用的令牌数量 | 用于反映Compaction的资源消耗量 | +|`doris_be_compaction_waitting_permits`| | Num | 
正在等待Compaction令牌的数量 | | +|`doris_be_data_stream_receiver_count`| | Num | 数据接收端 Receiver 的数量 | FIXME:向量化引擎此指标缺失 | +|`doris_be_disks_avail_capacity`| | 字节 | 指定数据目录所在磁盘的剩余空间。如 `{path="/path1/"}` 表示 `/path1` 目录所在磁盘的剩余空间 | | P0 | +|`doris_be_disks_local_used_capacity`| |字节|指定数据目录所在磁盘的本地已使用空间 | | +|`doris_be_disks_remote_used_capacity`| | 字节| 指定数据目录所在磁盘的对应的远端目录的已使用空间| | +|`doris_be_disks_state`| | 布尔 | 指定数据目录的磁盘状态。1 表示正常。0 表示异常 | | +|`doris_be_disks_total_capacity`| | 字节 | 定数据目录所在磁盘的总容量| 配合 `doris_be_disks_avail_capacity` 计算磁盘使用率 | P0 | +|`doris_be_engine_requests_total`| | Num | BE 上各类任务执行状态的累计值| | +|| {status="failed",type="xxx"} | Num | xxx 类型的任务的失败次数的累计值| | +|| {status="failed",type="xxx"} | Num | xxx 类型的任务的总次数的累计值。| 可以按需监控各类任务的失败次数 | P0 | +|| `{status="skip",type="report_all_tablets"}` | Num | xxx 类型任务被跳过执行的次数的累计值 | | +|`doris_be_fragment_endpoint_count`| | Num| 同 | FIXME: 同 `doris_be_data_stream_receiver_count` 数目。并且向量化引擎缺失 | +|`doris_be_fragment_request_duration_us`| | 微秒| 所有 fragment intance 的执行时间累计 | 通过斜率观测 instance 的执行耗时 | P0 | +|`doris_be_fragment_requests_total`| | Num | 执行过的 fragment instance 的数量累计 | | +|`doris_be_load_channel_count`| | Num | 当前打开的 load channel 个数 | 数值越大,说明当前正在执行的导入任务越多 | P0 | +|`doris_be_local_bytes_read_total`| | 字节 | 由 `LocalFileReader` 读取的字节数 | | P0 | +|`doris_be_local_bytes_written_total`| | 字节 | 由 `LocalFileWriter` 写入的字节数 | | P0 | +|`doris_be_local_file_reader_total`| | Num| 打开的 `LocalFileReader` 的累计计数 | | +|`doris_be_local_file_open_reading`| | Num | 当前打开的 `LocalFileReader` 个数 | | +|`doris_be_local_file_writer_total`| | Num | 打开的 `LocalFileWriter` 的累计计数。| | +|`doris_be_mem_consumption`| | 字节 | 指定模块的当前内存开销。如 {type="compaction"} 表示 compaction 模块的当前总内存开销 | FIXME: 需要重新梳理| +|`doris_be_memory_allocated_bytes`| | 字节 | 采集自 TcMalloc 的 `generic.total_physical_bytes` 属性。表示 TcMalloc 占用的虚拟内存的大小,并不代表实际的物理内存占用。 | 可能会比实际物理内存大 | P0 | +|`doris_be_memory_pool_bytes_total`| | 字节| 所有 MemPool 当前占用的内存大小。统计值,不代表真实内存使用。| | 
+|`doris_be_memtable_flush_duration_us`| | 微秒 | memtable写入磁盘的耗时累计值 | 通过斜率可以观测写入延迟 | P0 | +|`doris_be_memtable_flush_total`| | Num | memtable写入磁盘的个数累计值| 通过斜率可以计算写入文件的频率 | P0 | +|`doris_be_meta_request_duration`| | 微秒| 访问 RocksDB 中的 meta 的耗时累计 | 通过斜率观测 BE 元数据读写延迟 | P0 | +||{type="read"} | 微秒| 读取耗时 | | +||{type="write"} | 微秒| 写入耗时 | | +|`doris_be_meta_request_total`| |Num | 访问 RocksDB 中的 meta 的次数累计 | 通过斜率观测 BE 元数据访问频率 | P0 | +||{type="read"} | Num| 读取次数 | | +||{type="write"} | Num| 写入次数 | | +|`doris_be_plan_fragment_count`| | Num | 当前已接收的 fragment instance 的数量 | 观测是否出现 instance 堆积 | P0 | +|`doris_be_process_fd_num_limit_hard`| |Num| BE 进程的文件句柄数硬限。通过 `/proc/pid/limits` 采集 | | +|`doris_be_process_fd_num_limit_soft`| |Num| BE 进程的文件句柄数软限。通过 `/proc/pid/limits` 采集 | | +|`doris_be_process_fd_num_used`| |Num| BE 进程已使用的文件句柄数。通过 `/proc/pid/limits` 采集 | | +|`doris_be_process_thread_num`| | Num | BE 进程线程数。通过 `/proc/pid/task` 采集 | | P0 | +|`doris_be_query_cache_memory_total_byte`| | 字节 | Query Cache 占用字节数 | | +|`doris_be_query_cache_partition_total_count`| | Num | 当前 Partition Cache 缓存个数 | | +|`doris_be_query_cache_sql_total_count`| | Num | 当前 SQL Cache 缓存个数 | | +|`doris_be_query_scan_bytes`| | 字节 | 读取数据量的累计值。这里只统计读取 Olap 表的数据量 | | +|`doris_be_query_scan_bytes_per_second`| | 字节/秒 | 根据 `doris_be_query_scan_bytes` 计算得出的读取速率 | 观测查询速率 | P0| +|`doris_be_query_scan_rows`| | Num | 读取行数的累计值。这里只统计读取 Olap 表的数据量。并且是 RawRowsRead(部分数据行可能被索引跳过,并没有真正读取,但仍会记录到这个值中) | 通过斜率观测查询速率 | P0 | +|`doris_be_result_block_queue_count`| | Num | 当前查询结果缓存中的 fragment instance 个数 | 该队列仅用于被外部系统直接读取时使用。如 Spark on Doris 通过 external scan 查询数据 | +|`doris_be_result_buffer_block_count`| | Num | 当前查询结果缓存中的 query 个数 | 该数值反映当前 BE 中有多少查询的结果正在等待 FE 消费| P0| +|`doris_be_routine_load_task_count`| | Num | 当前正在执行的 routine load task 个数| | +|`doris_be_rowset_count_generated_and_in_use`| | Num | 自上次启动后,新增的并且正在使用的 rowset id 个数。 | | +|`doris_be_s3_bytes_read_total`| | Num | `S3FileReader` 的打开累计次数 | | +|`doris_be_s3_file_open_reading`| 
| Num | 当前打开的 `S3FileReader` 个数 | | +|`doris_be_s3_bytes_read_total`| | 字节 | `S3FileReader` 读取字节数累计值 | | +|`doris_be_scanner_thread_pool_queue_size`| | Num | 用于 OlapScanner 的线程池的当前排队数量 | 大于零则表示 Scanner 开始堆积 | P0| +|`doris_be_segment_read`| `{type="segment_read_total"}` | Num | 读取的segment的个数累计值 | | +|`doris_be_segment_read`| `{type="segment_row_total"}` | Num | 读取的segment的行数累计值 | 该数值也包含了被索引过滤的行数。相当于读取的segment个数 * 每个segment的总行数 | +|`doris_be_send_batch_thread_pool_queue_size`| | Num | 导入时用于发送数据包的线程池的排队个数 | 大于0则表示有堆积 | P0 | +|`doris_be_send_batch_thread_pool_thread_num`| | Num| 导入时用于发送数据包的线程池的线程数 | | +|`doris_be_small_file_cache_count`| |Num| 当前BE缓存的小文件数量 | | +|`doris_be_streaming_load_current_processing`| | Num | 当前正在运行的 stream load 任务数| 仅包含 curl 命令发送的任务 | +|`doris_be_streaming_load_duration_ms`| | 毫秒 | 所有stream load 任务执行时间的耗时累计值 | | +|`doris_be_streaming_load_requests_total`| | Num | stream load 任务数的累计值 | 通过斜率可观测任务提交频率 | P0 | +|`doris_be_stream_load_pipe_count`| | Num | 当前 stream load 数据管道的个数| 包括 stream load 和 routine load 任务 | +|`doris_be_stream_load`| {type="load_rows"} | Num | stream load 最终导入的行数累计值| 包括 stream load 和 routine load 任务 | P0 | +|`doris_be_stream_load`| {type="receive_bytes"}|字节 | stream load 接收的字节数累计值| 包括 stream load 从 http 接收的数据,以及 routine load 从kafka 读取的数据| P0 | +|`doris_be_tablet_base_max_compaction_score`| | Num | 当前最大的 Base Compaction Score | 该数值实时变化,有可能丢失峰值数据。数值越高,表示 compaction 堆积越严重 | P0 | +|`doris_be_tablet_base_max_compaction_score`| | Num | 同上。当前最大的 Cumulative Compaction Score | | +|`doris_be_tablet_version_num_distribution`| | Num | tablet version 数量的直方。| 用于反映 tablet version 数量的分布 | P0 | +|`doris_be_thrift_connections_total`| | Num | 创建过的 thrift 连接数的累计值。如 `{name="heartbeat"}` 表示心跳服务的连接数累计| 此数值为 BE 作为服务端的 thrift server 的连接 | +|`doris_be_thrift_current_connections`| | Num | 当前 thrift 连接数。如 `{name="heartbeat"}` 表示心跳服务的当前连接数。| 同上 | +|`doris_be_thrift_opened_clients`| |Num| 当前已打开的 thrift 客户端的数量。如 `{name="frontend"}` 表示访问 FE 服务的客户端数量 | | 
+|`doris_be_thrift_used_clients`| | Num | 当前正在使用的 thrift 客户端的数量。如 `{name="frontend"}` 表示正在用于访问 FE 服务的客户端数量 | | +|`doris_be_timeout_canceled_fragment_count`| | Num | 因超时而被取消的 fragment instance 数量累计值| 这个值可能会被重复记录。比如部分 fragment instance 被多次取消| P0 | +|`doris_be_stream_load_txn_request`| {type="begin"}| Num | stream load 开始事务数的累计值 | 包括 stream load 和 routine load 任务 | +|`doris_be_stream_load_txn_request`| {type="commit"}| Num| stream load 执行成功的事务数的累计值| 同上| +|`doris_be_stream_load_txn_request`| {type="rollback"}| Num | stream load 执行失败的事务数的累计值 |同上 | +|`doris_be_unused_rowsets_count`| | Num | 当前已废弃的rowset的个数| 这些rowset正常情况下会被定期删除 | +|`doris_be_upload_fail_count`| | Num | 冷热分离功能,上传到远端存储失败的rowset的次数累计值| | +|`doris_be_upload_rowset_count`| | Num | 冷热分离功能,上传到远端存储成功的rowset的次数累计值| | +|`doris_be_upload_total_byte`| | 字节 | 冷热分离功能,上传到远端存储成功的rowset数据量累计值| | +|`doris_be_load_bytes`| | 字节|通过 tablet sink 发送的数量累计 | 可观测导入数据量 | P0 | +|`doris_be_load_rows`| | Num | 通过 tablet sink 发送的行数累计| 可观测导入数据量 | P0 | + +### 机器监控 + +|名称| 标签 |单位 | 含义 | 说明 | 等级 | +|---|---|---|---|---|---| +|`doris_be_cpu`| | Num | CPU 相关监控指标,从 `/proc/stat` 采集。会分别采集每个逻辑核的各项数值。如 `{device="cpu0",mode="nice"}` 表示 cpu0 的 nice 值 | 可计算得出 CPU 使用率 | P0| +|`doris_be_disk_bytes_read`| |字节| 磁盘读取量累计值。从 `/proc/diskstats` 采集。会分别采集每块磁盘的数值。如 `{device="vdd"}` 表示 vdd 盘的数值 | | | +|`doris_be_disk_bytes_written`| | 字节 | 磁盘写入量累计值。采集方式同上 | | +|`doris_be_disk_io_time_ms`| |毫秒 | 采集方式同上| 可计算得出 IO Util | P0 | +|`doris_be_disk_io_time_weigthed`| | 毫秒|采集方式同上| | +|`doris_be_disk_reads_completed`| |Num |采集方式同上 | | +|`doris_be_disk_read_time_ms`| | 毫秒|采集方式同上| | +|`doris_be_disk_writes_completed`| | Num|采集方式同上| | +|`doris_be_disk_write_time_ms`| | 毫秒|采集方式同上 | | +|`doris_be_fd_num_limit`| | Num | 系统文件句柄限制上限。从 `/proc/sys/fs/file-nr` 采集| | +|`doris_be_fd_num_used`| | Num | 系统已使用文件句柄数。 从 `/proc/sys/fs/file-nr` 采集 | | +|`doris_be_file_created_total`| | Num | 本地文件创建次数累计 | 所有调用 `local_file_writer` 并最终 close 的文件计数 | +|`doris_be_load_average`| | Num | 机器 Load Avg 
指标监控。如 {mode="15_minutes"} 为 15 分钟 Load Avg| 观测整机负载 | P0 | +|`doris_be_max_disk_io_util_percent`| | 百分比 | 计算得出的所有磁盘中,最大的 IO UTIL 的磁盘的数值 | | P0 | +|`doris_be_max_network_receive_bytes_rate`| | 字节/秒 | 计算得出的所有网卡中,最大的接收速率| | P0 | +|`doris_be_max_network_send_bytes_rate`| | 字节/秒 | 计算得出的所有网卡中,最大的发送速率| | P0 | +|`doris_be_memory_pgpgin`| | 字节 | 系统从磁盘写到内存页的数据量 | | +|`doris_be_memory_pgpgout`| | 字节 | 系统内存页写入磁盘的数据量 | | +|`doris_be_memory_pswpin`| | 字节 | 系统从磁盘换入到内存的数量| 通常情况下,swap应该关闭,因此这个数值应该是0| +|`doris_be_memory_pswpout`| | 字节 | 系统从内存换出到磁盘的数量| 通常情况下,swap应该关闭,因此这个数值应该是0| +|`doris_be_network_receive_bytes`| | 字节| 各个网卡的接收字节累计。采集自 `/proc/net/dev` | | +|`doris_be_network_receive_packets`| | Num| 各个网卡的接收包个数累计。采集自 `/proc/net/dev` | | +|`doris_be_network_send_bytes`| | 字节| 各个网卡的发送字节累计。采集自 `/proc/net/dev` | | +|`doris_be_network_send_packets`| | Num| 各个网卡的发送包个数累计。采集自 `/proc/net/dev` | | +|`doris_be_proc`| `{mode="ctxt_switch"}` | Num | CPU 上下文切换的累计值。采集自 `/proc/stat` | 观测是否有异常的上下文切换 | P0 | +|`doris_be_proc`| `{mode="interrupt"}` | Num |CPU 中断次数的累计值。采集自 `/proc/stat` | | +|`doris_be_proc`| `{mode="procs_blocked"}` | Num | 系统当前被阻塞的进程数(如等待IO)。采集自 `/proc/stat` | | +|`doris_be_proc`| `{mode="procs_running"}` | Num | 系统当前正在执行的进程数。采集自 `/proc/stat` | | +|`doris_be_snmp_tcp_in_errs`| | Num| tcp包接收错误的次数。采集自 `/proc/net/snmp`| 可观测网络错误如重传、丢包等。需和其他 snmp 指标配合使用 | P0 | +|`doris_be_snmp_tcp_in_segs`| | Num | tcp包接收的个数。 采集自 `/proc/net/snmp`| | +|`doris_be_snmp_tcp_out_segs`| | Num | tcp包发送的个数。采集自 `/proc/net/snmp`| | +|`doris_be_snmp_tcp_retrans_segs`| | Num | tcp包重传的个数。采集自 `/proc/net/snmp`| | diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/InsertStmt.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/InsertStmt.java index bb6ece1e98..5fad203599 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/InsertStmt.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/InsertStmt.java @@ -38,7 +38,6 @@ import org.apache.doris.common.Pair; import 
org.apache.doris.common.UserException; import org.apache.doris.common.util.DebugUtil; import org.apache.doris.common.util.Util; -import org.apache.doris.metric.MetricRepo; import org.apache.doris.mysql.privilege.PrivPredicate; import org.apache.doris.planner.DataPartition; import org.apache.doris.planner.DataSink; @@ -302,7 +301,6 @@ public class InsertStmt extends DdlStmt { if (!isExplain() && !isTransactionBegin) { if (targetTable instanceof OlapTable) { LoadJobSourceType sourceType = LoadJobSourceType.INSERT_STREAMING; - MetricRepo.COUNTER_LOAD_ADD.increase(1L); transactionId = Env.getCurrentGlobalTransactionMgr().beginTransaction(db.getId(), Lists.newArrayList(targetTable.getId()), label, new TxnCoordinator(TxnSourceType.FE, FrontendOptions.getLocalHostAddress()), diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java index 2e4e1b84e5..48d64df514 100755 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java @@ -581,12 +581,12 @@ public class Env { // The pendingLoadTaskScheduler's queue size should not less than Config.desired_max_waiting_jobs. // So that we can guarantee that all submitted load jobs can be scheduled without being starved. - this.pendingLoadTaskScheduler = new MasterTaskExecutor("pending_load_task_scheduler", + this.pendingLoadTaskScheduler = new MasterTaskExecutor("pending-load-task-scheduler", Config.async_pending_load_task_pool_size, Config.desired_max_waiting_jobs, !isCheckpointCatalog); // The loadingLoadTaskScheduler's queue size is unlimited, so that it can receive all loading tasks // created after pending tasks finish. And don't worry about the high concurrency, because the // concurrency is limited by Config.desired_max_waiting_jobs and Config.async_loading_load_task_pool_size. 
- this.loadingLoadTaskScheduler = new MasterTaskExecutor("loading_load_task_scheduler", + this.loadingLoadTaskScheduler = new MasterTaskExecutor("loading-load-task-scheduler", Config.async_loading_load_task_pool_size, Integer.MAX_VALUE, !isCheckpointCatalog); this.loadJobScheduler = new LoadJobScheduler(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/clone/TabletChecker.java b/fe/fe-core/src/main/java/org/apache/doris/clone/TabletChecker.java index 376e886912..c2b5011bfc 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/clone/TabletChecker.java +++ b/fe/fe-core/src/main/java/org/apache/doris/clone/TabletChecker.java @@ -146,7 +146,7 @@ public class TabletChecker extends MasterDaemon { } }; gauge.addLabel(new MetricLabel("type", status)); - MetricRepo.PALO_METRIC_REGISTER.addPaloMetrics(gauge); + MetricRepo.DORIS_METRIC_REGISTER.addMetrics(gauge); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/ThreadPoolManager.java b/fe/fe-core/src/main/java/org/apache/doris/common/ThreadPoolManager.java index 4e5ae73b29..96e4b114d9 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/ThreadPoolManager.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/ThreadPoolManager.java @@ -93,9 +93,8 @@ public class ThreadPoolManager { } } }; - gauge.addLabel(new MetricLabel("name", poolName)) - .addLabel(new MetricLabel("type", poolMetricType)); - MetricRepo.PALO_METRIC_REGISTER.addPaloMetrics(gauge); + gauge.addLabel(new MetricLabel("name", poolName)).addLabel(new MetricLabel("type", poolMetricType)); + MetricRepo.DORIS_METRIC_REGISTER.addMetrics(gauge); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/httpv2/rest/MetricsAction.java b/fe/fe-core/src/main/java/org/apache/doris/httpv2/rest/MetricsAction.java index 9158f04db4..6173f1aa20 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/httpv2/rest/MetricsAction.java +++ b/fe/fe-core/src/main/java/org/apache/doris/httpv2/rest/MetricsAction.java @@ -43,11 +43,11 @@ public class 
MetricsAction { String type = request.getParameter(TYPE_PARAM); MetricVisitor visitor = null; if (!Strings.isNullOrEmpty(type) && type.equalsIgnoreCase("core")) { - visitor = new SimpleCoreMetricVisitor("doris_fe"); + visitor = new SimpleCoreMetricVisitor(); } else if (!Strings.isNullOrEmpty(type) && type.equalsIgnoreCase("json")) { - visitor = new JsonMetricVisitor("doris_fe"); + visitor = new JsonMetricVisitor(); } else { - visitor = new PrometheusMetricVisitor("doris_fe"); + visitor = new PrometheusMetricVisitor(); } response.setContentType("text/plain"); try { diff --git a/fe/fe-core/src/main/java/org/apache/doris/httpv2/util/LoadSubmitter.java b/fe/fe-core/src/main/java/org/apache/doris/httpv2/util/LoadSubmitter.java index 899df9066b..e971c3bc07 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/httpv2/util/LoadSubmitter.java +++ b/fe/fe-core/src/main/java/org/apache/doris/httpv2/util/LoadSubmitter.java @@ -52,7 +52,7 @@ import java.util.concurrent.ThreadPoolExecutor; public class LoadSubmitter { private static final Logger LOG = LogManager.getLogger(LoadSubmitter.class); - private ThreadPoolExecutor executor = ThreadPoolManager.newDaemonCacheThreadPool(2, "Load submitter", true); + private ThreadPoolExecutor executor = ThreadPoolManager.newDaemonCacheThreadPool(2, "load-submitter", true); private static final String STREAM_LOAD_URL_PATTERN = "http://%s:%d/api/%s/%s/_stream_load"; diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/ExportChecker.java b/fe/fe-core/src/main/java/org/apache/doris/load/ExportChecker.java index 0701e2ab5e..78a90b5c42 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/load/ExportChecker.java +++ b/fe/fe-core/src/main/java/org/apache/doris/load/ExportChecker.java @@ -52,10 +52,10 @@ public final class ExportChecker extends MasterDaemon { checkers.put(JobState.EXPORTING, new ExportChecker(JobState.EXPORTING, intervalMs)); int poolSize = Config.export_running_job_num_limit == 0 ? 
5 : Config.export_running_job_num_limit; - MasterTaskExecutor pendingTaskExecutor = new MasterTaskExecutor("export_pending_job", poolSize, true); + MasterTaskExecutor pendingTaskExecutor = new MasterTaskExecutor("export-pending-job", poolSize, true); executors.put(JobState.PENDING, pendingTaskExecutor); - MasterTaskExecutor exportingTaskExecutor = new MasterTaskExecutor("export_exporting_job", poolSize, true); + MasterTaskExecutor exportingTaskExecutor = new MasterTaskExecutor("export-exporting-job", poolSize, true); executors.put(JobState.EXPORTING, exportingTaskExecutor); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/Load.java b/fe/fe-core/src/main/java/org/apache/doris/load/Load.java index dd3d78f474..b8ccd7b95c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/load/Load.java +++ b/fe/fe-core/src/main/java/org/apache/doris/load/Load.java @@ -80,7 +80,6 @@ import org.apache.doris.common.util.TimeUtils; import org.apache.doris.load.FailMsg.CancelType; import org.apache.doris.load.LoadJob.JobState; import org.apache.doris.load.loadv2.LoadTask; -import org.apache.doris.metric.MetricRepo; import org.apache.doris.mysql.privilege.PrivPredicate; import org.apache.doris.persist.ReplicaPersistInfo; import org.apache.doris.qe.ConnectContext; @@ -263,7 +262,6 @@ public class Load { writeLock(); try { unprotectAddLoadJob(job, false /* not replay */); - MetricRepo.COUNTER_LOAD_ADD.increase(1L); Env.getCurrentEnv().getEditLog().logLoadStart(job); } finally { writeUnlock(); @@ -2593,7 +2591,6 @@ public class Load { job.setProgress(100); job.setLoadFinishTimeMs(System.currentTimeMillis()); } - MetricRepo.COUNTER_LOAD_FINISHED.increase(1L); // job will transfer from LOADING to FINISHED, skip QUORUM_FINISHED idToLoadingLoadJob.remove(jobId); idToQuorumFinishedLoadJob.remove(jobId); diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/LoadChecker.java b/fe/fe-core/src/main/java/org/apache/doris/load/LoadChecker.java index f7f97be943..285a7e7384 100644 
--- a/fe/fe-core/src/main/java/org/apache/doris/load/LoadChecker.java +++ b/fe/fe-core/src/main/java/org/apache/doris/load/LoadChecker.java @@ -92,18 +92,18 @@ public class LoadChecker extends MasterDaemon { Map pendingPriorityMap = Maps.newHashMap(); pendingPriorityMap.put(TPriority.NORMAL, - new MasterTaskExecutor("load_pending_thread_num_normal_priority", + new MasterTaskExecutor("load-pending-thread-num-normal-priority", Config.load_pending_thread_num_normal_priority, true)); - pendingPriorityMap.put(TPriority.HIGH, - new MasterTaskExecutor("load_pending_thread_num_high_priority", - Config.load_pending_thread_num_high_priority, true)); + pendingPriorityMap.put(TPriority.HIGH, new MasterTaskExecutor("load-pending-thread-num-high-priority", + Config.load_pending_thread_num_high_priority, true)); executors.put(JobState.PENDING, pendingPriorityMap); Map etlPriorityMap = Maps.newHashMap(); - etlPriorityMap.put(TPriority.NORMAL, new MasterTaskExecutor("load_etl_thread_num_normal_priority", + etlPriorityMap.put(TPriority.NORMAL, new MasterTaskExecutor("load-etl-thread-num-normal-priority", Config.load_etl_thread_num_normal_priority, true)); - etlPriorityMap.put(TPriority.HIGH, new MasterTaskExecutor("load_etl_thread_num_high_priority", - Config.load_etl_thread_num_high_priority, true)); + etlPriorityMap.put(TPriority.HIGH, + new MasterTaskExecutor("load-etl-thread-num-high-priority", Config.load_etl_thread_num_high_priority, + true)); executors.put(JobState.ETL, etlPriorityMap); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/BrokerLoadJob.java b/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/BrokerLoadJob.java index d0a7971346..f5765ed38d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/BrokerLoadJob.java +++ b/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/BrokerLoadJob.java @@ -41,7 +41,6 @@ import org.apache.doris.load.BrokerFileGroup; import org.apache.doris.load.BrokerFileGroupAggInfo.FileGroupAggKey; import 
org.apache.doris.load.EtlJobType; import org.apache.doris.load.FailMsg; -import org.apache.doris.metric.MetricRepo; import org.apache.doris.qe.ConnectContext; import org.apache.doris.qe.OriginStatement; import org.apache.doris.service.FrontendOptions; @@ -95,7 +94,6 @@ public class BrokerLoadJob extends BulkLoadJob { public void beginTxn() throws LabelAlreadyUsedException, BeginTransactionException, AnalysisException, DuplicatedRequestException, QuotaExceedException, MetaNotFoundException { - MetricRepo.COUNTER_LOAD_ADD.increase(1L); transactionId = Env.getCurrentGlobalTransactionMgr() .beginTransaction(dbId, Lists.newArrayList(fileGroupAggInfo.getAllTableIds()), label, null, new TxnCoordinator(TxnSourceType.FE, FrontendOptions.getLocalHostAddress()), @@ -294,7 +292,6 @@ public class BrokerLoadJob extends BulkLoadJob { .add("txn_id", transactionId) .add("msg", "Load job try to commit txn") .build()); - MetricRepo.COUNTER_LOAD_FINISHED.increase(1L); Env.getCurrentGlobalTransactionMgr().commitTransaction( dbId, tableList, transactionId, commitInfos, new LoadJobFinalOperation(id, loadingStatus, progress, loadStartTimestamp, diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/LoadJob.java b/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/LoadJob.java index 7af0b6ed64..13ef3da960 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/LoadJob.java +++ b/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/LoadJob.java @@ -44,7 +44,6 @@ import org.apache.doris.load.EtlStatus; import org.apache.doris.load.FailMsg; import org.apache.doris.load.FailMsg.CancelType; import org.apache.doris.load.Load; -import org.apache.doris.metric.MetricRepo; import org.apache.doris.mysql.privilege.PaloPrivilege; import org.apache.doris.mysql.privilege.PrivPredicate; import org.apache.doris.persist.gson.GsonUtils; @@ -690,10 +689,6 @@ public abstract class LoadJob extends AbstractTxnStateChangeCallback implements finishTimestamp = System.currentTimeMillis(); 
Env.getCurrentGlobalTransactionMgr().getCallbackFactory().removeCallback(id); state = JobState.FINISHED; - - if (MetricRepo.isInit) { - MetricRepo.COUNTER_LOAD_FINISHED.increase(1L); - } // when load job finished, there is no need to hold the tasks which are the biggest memory consumers. idToTasks.clear(); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/routineload/RoutineLoadTaskInfo.java b/fe/fe-core/src/main/java/org/apache/doris/load/routineload/RoutineLoadTaskInfo.java index 6ff4fdf4df..01ea0ebc2b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/load/routineload/RoutineLoadTaskInfo.java +++ b/fe/fe-core/src/main/java/org/apache/doris/load/routineload/RoutineLoadTaskInfo.java @@ -27,7 +27,6 @@ import org.apache.doris.common.QuotaExceedException; import org.apache.doris.common.UserException; import org.apache.doris.common.util.DebugUtil; import org.apache.doris.common.util.TimeUtils; -import org.apache.doris.metric.MetricRepo; import org.apache.doris.service.FrontendOptions; import org.apache.doris.thrift.TRoutineLoadTask; import org.apache.doris.transaction.BeginTransactionException; @@ -167,7 +166,6 @@ public abstract class RoutineLoadTaskInfo { // begin a txn for task RoutineLoadJob routineLoadJob = routineLoadManager.getJob(jobId); try { - MetricRepo.COUNTER_LOAD_ADD.increase(1L); txnId = Env.getCurrentGlobalTransactionMgr().beginTransaction(routineLoadJob.getDbId(), Lists.newArrayList(routineLoadJob.getTableId()), DebugUtil.printId(id), null, new TxnCoordinator(TxnSourceType.FE, FrontendOptions.getLocalHostAddress()), diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/update/UpdateStmtExecutor.java b/fe/fe-core/src/main/java/org/apache/doris/load/update/UpdateStmtExecutor.java index fce4578545..568b9442e7 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/load/update/UpdateStmtExecutor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/load/update/UpdateStmtExecutor.java @@ -35,7 +35,6 @@ import 
org.apache.doris.common.UserException; import org.apache.doris.common.util.DebugUtil; import org.apache.doris.common.util.TimeUtils; import org.apache.doris.common.util.VectorizedUtil; -import org.apache.doris.metric.MetricRepo; import org.apache.doris.qe.Coordinator; import org.apache.doris.qe.QeProcessorImpl; import org.apache.doris.service.FrontendOptions; @@ -128,7 +127,6 @@ public class UpdateStmtExecutor { private void beginTxn() throws LabelAlreadyUsedException, AnalysisException, BeginTransactionException, DuplicatedRequestException, QuotaExceedException, MetaNotFoundException { LOG.info("begin transaction for update stmt, query id:{}", DebugUtil.printId(queryId)); - MetricRepo.COUNTER_LOAD_ADD.increase(1L); label = "update_" + DebugUtil.printId(queryId); txnId = Env.getCurrentGlobalTransactionMgr() .beginTransaction(dbId, Lists.newArrayList(targetTable.getId()), label, @@ -199,7 +197,6 @@ public class UpdateStmtExecutor { if (isPublished) { // situation2.2: publish successful txnStatus = TransactionStatus.VISIBLE; - MetricRepo.COUNTER_LOAD_FINISHED.increase(1L); } else { // situation2.3: be published later txnStatus = TransactionStatus.COMMITTED; diff --git a/fe/fe-core/src/main/java/org/apache/doris/master/ReportHandler.java b/fe/fe-core/src/main/java/org/apache/doris/master/ReportHandler.java index 3f35b53fb2..91ebddf0f8 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/master/ReportHandler.java +++ b/fe/fe-core/src/main/java/org/apache/doris/master/ReportHandler.java @@ -110,7 +110,7 @@ public class ReportHandler extends Daemon { return (long) reportQueue.size(); } }; - MetricRepo.PALO_METRIC_REGISTER.addPaloMetrics(gauge); + MetricRepo.DORIS_METRIC_REGISTER.addMetrics(gauge); } public TMasterResult handleReport(TReportRequest request) throws TException { diff --git a/fe/fe-core/src/main/java/org/apache/doris/metric/DorisMetricRegistry.java b/fe/fe-core/src/main/java/org/apache/doris/metric/DorisMetricRegistry.java index def0271b8a..24b96ad86c 
100644 --- a/fe/fe-core/src/main/java/org/apache/doris/metric/DorisMetricRegistry.java +++ b/fe/fe-core/src/main/java/org/apache/doris/metric/DorisMetricRegistry.java @@ -29,34 +29,49 @@ import java.util.stream.Collectors; public class DorisMetricRegistry { - private Collection paloMetrics = new PriorityQueue<>(Comparator.comparing(Metric::getName)); + private Collection metrics = new PriorityQueue<>(Comparator.comparing(Metric::getName)); + private Collection systemMetrics = new PriorityQueue<>(Comparator.comparing(Metric::getName)); public DorisMetricRegistry() { } - public synchronized void addPaloMetrics(Metric paloMetric) { + public synchronized void addMetrics(Metric metric) { // No metric needs to be added to the Checkpoint thread. // And if you add a metric in Checkpoint thread, it will cause the metric to be added repeatedly, // and the Checkpoint Catalog may be saved incorrectly, resulting in FE memory leaks. if (!Env.isCheckpointThread()) { - paloMetrics.add(paloMetric); + metrics.add(metric); } } - public synchronized List getPaloMetrics() { - return Lists.newArrayList(paloMetrics); + public synchronized void addSystemMetrics(Metric sysMetric) { + if (!Env.isCheckpointThread()) { + systemMetrics.add(sysMetric); + } + } + + public synchronized List getMetrics() { + return Lists.newArrayList(metrics); + } + + public synchronized List getSystemMetrics() { + return Lists.newArrayList(systemMetrics); } // the metrics by metric name - public synchronized List getPaloMetricsByName(String name) { - return paloMetrics.stream().filter(m -> m.getName().equals(name)).collect(Collectors.toList()); + public synchronized List getMetricsByName(String name) { + List list = metrics.stream().filter(m -> m.getName().equals(name)).collect(Collectors.toList()); + if (list.isEmpty()) { + list = systemMetrics.stream().filter(m -> m.getName().equals(name)).collect(Collectors.toList()); + } + return list; } public synchronized void removeMetrics(String name) { - // Same reason 
as comment in addPaloMetrics() + // Same reason as comment in addMetrics() if (!Env.isCheckpointThread()) { - paloMetrics = paloMetrics.stream().filter(m -> !(m.getName().equals(name))).collect(Collectors.toList()); + metrics = metrics.stream().filter(m -> !(m.getName().equals(name))).collect(Collectors.toList()); } } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/metric/JsonMetricVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/metric/JsonMetricVisitor.java index 43b634360d..b234004239 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/metric/JsonMetricVisitor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/metric/JsonMetricVisitor.java @@ -27,8 +27,8 @@ public class JsonMetricVisitor extends MetricVisitor { private int ordinal = 0; private int metricNumber = 0; - public JsonMetricVisitor(String prefix) { - super(prefix); + public JsonMetricVisitor() { + super(); } @Override @@ -42,16 +42,15 @@ public class JsonMetricVisitor extends MetricVisitor { } @Override - public void visit(StringBuilder sb, @SuppressWarnings("rawtypes") Metric metric) { + public void visit(StringBuilder sb, String prefix, @SuppressWarnings("rawtypes") Metric metric) { if (ordinal++ == 0) { sb.append("[\n"); } sb.append("{\n\t\"tags\":\n\t{\n"); - sb.append("\t\t\"metric\":\"").append(metric.getName()).append("\""); + sb.append("\t\t\"metric\":\"").append(prefix).append(metric.getName()).append("\""); // name - @SuppressWarnings("unchecked") - List labels = metric.getLabels(); + @SuppressWarnings("unchecked") List labels = metric.getLabels(); if (!labels.isEmpty()) { sb.append(",\n"); int i = 0; @@ -76,7 +75,7 @@ public class JsonMetricVisitor extends MetricVisitor { } @Override - public void visitHistogram(StringBuilder sb, String name, Histogram histogram) { + public void visitHistogram(StringBuilder sb, String prefix, String name, Histogram histogram) { return; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java 
b/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java index 30b2dab10c..c12fb11d75 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java +++ b/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java @@ -37,6 +37,7 @@ import org.apache.doris.qe.QeProcessorImpl; import org.apache.doris.service.ExecuteEnv; import org.apache.doris.system.Backend; import org.apache.doris.system.SystemInfoService; +import org.apache.doris.transaction.TransactionStatus; import com.codahale.metrics.Histogram; import com.codahale.metrics.MetricRegistry; @@ -55,8 +56,9 @@ import java.util.function.BinaryOperator; public final class MetricRepo { private static final Logger LOG = LogManager.getLogger(MetricRepo.class); + // METRIC_REGISTER is only used for histogram metrics private static final MetricRegistry METRIC_REGISTER = new MetricRegistry(); - public static final DorisMetricRegistry PALO_METRIC_REGISTER = new DorisMetricRegistry(); + public static final DorisMetricRegistry DORIS_METRIC_REGISTER = new DorisMetricRegistry(); public static volatile boolean isInit = false; public static final SystemMetrics SYSTEM_METRICS = new SystemMetrics(); @@ -65,19 +67,16 @@ public final class MetricRepo { public static final String TABLET_MAX_COMPACTION_SCORE = "tablet_max_compaction_score"; public static LongCounterMetric COUNTER_REQUEST_ALL; - public static LongCounterMetric COUNTER_QUERY_BEGIN; public static LongCounterMetric COUNTER_QUERY_ALL; public static LongCounterMetric COUNTER_QUERY_ERR; public static LongCounterMetric COUNTER_QUERY_TABLE; public static LongCounterMetric COUNTER_QUERY_OLAP_TABLE; - public static LongCounterMetric COUNTER_CACHE_MODE_SQL; + + public static LongCounterMetric COUNTER_CACHE_ADDED_SQL; + public static LongCounterMetric COUNTER_CACHE_ADDED_PARTITION; public static LongCounterMetric COUNTER_CACHE_HIT_SQL; - public static LongCounterMetric COUNTER_CACHE_MODE_PARTITION; public static LongCounterMetric 
COUNTER_CACHE_HIT_PARTITION; - public static LongCounterMetric COUNTER_CACHE_PARTITION_ALL; - public static LongCounterMetric COUNTER_CACHE_PARTITION_HIT; - public static LongCounterMetric COUNTER_LOAD_ADD; - public static LongCounterMetric COUNTER_LOAD_FINISHED; + public static LongCounterMetric COUNTER_EDIT_LOG_WRITE; public static LongCounterMetric COUNTER_EDIT_LOG_READ; public static LongCounterMetric COUNTER_EDIT_LOG_SIZE_BYTES; @@ -94,6 +93,7 @@ public final class MetricRepo { public static LongCounterMetric COUNTER_TXN_BEGIN; public static LongCounterMetric COUNTER_TXN_FAILED; public static LongCounterMetric COUNTER_TXN_SUCCESS; + public static LongCounterMetric COUNTER_ROUTINE_LOAD_ROWS; public static LongCounterMetric COUNTER_ROUTINE_LOAD_RECEIVED_BYTES; public static LongCounterMetric COUNTER_ROUTINE_LOAD_ERROR_ROWS; @@ -108,8 +108,8 @@ public final class MetricRepo { public static GaugeMetricImpl GAUGE_QUERY_ERR_RATE; public static GaugeMetricImpl GAUGE_MAX_TABLET_COMPACTION_SCORE; - private static ScheduledThreadPoolExecutor metricTimer = ThreadPoolManager - .newDaemonScheduledThreadPool(1, "Metric-Timer-Pool", true); + private static ScheduledThreadPoolExecutor metricTimer = ThreadPoolManager.newDaemonScheduledThreadPool(1, + "metric-timer-pool", true); private static MetricCalculator metricCalculator = new MetricCalculator(); // init() should only be called after catalog is contructed. 
@@ -122,9 +122,11 @@ public final class MetricRepo { // load jobs LoadManager loadManger = Env.getCurrentEnv().getLoadManager(); for (EtlJobType jobType : EtlJobType.values()) { + if (jobType == EtlJobType.UNKNOWN) { + continue; + } for (JobState state : JobState.values()) { - GaugeMetric gauge = (GaugeMetric) new GaugeMetric("job", - MetricUnit.NOUNIT, "job statistics") { + GaugeMetric gauge = new GaugeMetric("job", MetricUnit.NOUNIT, "job statistics") { @Override public Long getValue() { if (!Env.getCurrentEnv().isMaster()) { @@ -133,18 +135,16 @@ public final class MetricRepo { return loadManger.getLoadJobNum(state, jobType); } }; - gauge.addLabel(new MetricLabel("job", "load")) - .addLabel(new MetricLabel("type", jobType.name())) + gauge.addLabel(new MetricLabel("job", "load")).addLabel(new MetricLabel("type", jobType.name())) .addLabel(new MetricLabel("state", state.name())); - PALO_METRIC_REGISTER.addPaloMetrics(gauge); + DORIS_METRIC_REGISTER.addMetrics(gauge); } } // routine load jobs RoutineLoadManager routineLoadManager = Env.getCurrentEnv().getRoutineLoadManager(); for (RoutineLoadJob.JobState jobState : RoutineLoadJob.JobState.values()) { - GaugeMetric gauge = (GaugeMetric) new GaugeMetric("job", - MetricUnit.NOUNIT, "routine load job statistics") { + GaugeMetric gauge = new GaugeMetric("job", MetricUnit.NOUNIT, "routine load job statistics") { @Override public Long getValue() { if (!Env.getCurrentEnv().isMaster()) { @@ -156,10 +156,9 @@ public final class MetricRepo { return Long.valueOf(jobs.size()); } }; - gauge.addLabel(new MetricLabel("job", "load")) - .addLabel(new MetricLabel("type", "ROUTINE_LOAD")) + gauge.addLabel(new MetricLabel("job", "load")).addLabel(new MetricLabel("type", "ROUTINE_LOAD")) .addLabel(new MetricLabel("state", jobState.name())); - PALO_METRIC_REGISTER.addPaloMetrics(gauge); + DORIS_METRIC_REGISTER.addMetrics(gauge); } // running alter job @@ -169,44 +168,42 @@ public final class MetricRepo { continue; } - GaugeMetric gauge = 
(GaugeMetric) new GaugeMetric("job", - MetricUnit.NOUNIT, "job statistics") { + GaugeMetric gauge = new GaugeMetric("job", MetricUnit.NOUNIT, "job statistics") { @Override public Long getValue() { if (!Env.getCurrentEnv().isMaster()) { return 0L; } if (jobType == JobType.SCHEMA_CHANGE) { - return alter.getSchemaChangeHandler().getAlterJobV2Num( - org.apache.doris.alter.AlterJobV2.JobState.RUNNING); + return alter.getSchemaChangeHandler() + .getAlterJobV2Num(org.apache.doris.alter.AlterJobV2.JobState.RUNNING); } else { return alter.getMaterializedViewHandler().getAlterJobV2Num( org.apache.doris.alter.AlterJobV2.JobState.RUNNING); } } }; - gauge.addLabel(new MetricLabel("job", "alter")) - .addLabel(new MetricLabel("type", jobType.name())) + gauge.addLabel(new MetricLabel("job", "alter")).addLabel(new MetricLabel("type", jobType.name())) .addLabel(new MetricLabel("state", "running")); - PALO_METRIC_REGISTER.addPaloMetrics(gauge); + DORIS_METRIC_REGISTER.addMetrics(gauge); } // capacity generateBackendsTabletMetrics(); // connections - GaugeMetric connections = (GaugeMetric) new GaugeMetric( - "connection_total", MetricUnit.CONNECTIONS, "total connections") { + GaugeMetric connections = new GaugeMetric("connection_total", MetricUnit.CONNECTIONS, + "total connections") { @Override public Integer getValue() { return ExecuteEnv.getInstance().getScheduler().getConnectionNum(); } }; - PALO_METRIC_REGISTER.addPaloMetrics(connections); + DORIS_METRIC_REGISTER.addMetrics(connections); // journal id - GaugeMetric maxJournalId = (GaugeMetric) new GaugeMetric( - "max_journal_id", MetricUnit.NOUNIT, "max journal id of this frontends") { + GaugeMetric maxJournalId = new GaugeMetric("max_journal_id", MetricUnit.NOUNIT, + "max journal id of this frontends") { @Override public Long getValue() { EditLog editLog = Env.getCurrentEnv().getEditLog(); @@ -216,11 +213,11 @@ public final class MetricRepo { return editLog.getMaxJournalId(); } }; - 
PALO_METRIC_REGISTER.addPaloMetrics(maxJournalId); + DORIS_METRIC_REGISTER.addMetrics(maxJournalId); // scheduled tablet num - GaugeMetric scheduledTabletNum = (GaugeMetric) new GaugeMetric( - "scheduled_tablet_num", MetricUnit.NOUNIT, "number of tablets being scheduled") { + GaugeMetric scheduledTabletNum = new GaugeMetric("scheduled_tablet_num", MetricUnit.NOUNIT, + "number of tablets being scheduled") { @Override public Long getValue() { if (!Env.getCurrentEnv().isMaster()) { @@ -229,7 +226,7 @@ public final class MetricRepo { return (long) Env.getCurrentEnv().getTabletScheduler().getTotalNum(); } }; - PALO_METRIC_REGISTER.addPaloMetrics(scheduledTabletNum); + DORIS_METRIC_REGISTER.addMetrics(scheduledTabletNum); GaugeMetric maxInstanceNum = new GaugeMetric("max_instances_num_per_user", MetricUnit.NOUNIT, "max instances num of all current users") { @@ -237,154 +234,157 @@ public final class MetricRepo { public Long getValue() { try { return ((QeProcessorImpl) QeProcessorImpl.INSTANCE).getInstancesNumPerUser().values().stream() - .reduce(-1, BinaryOperator.maxBy(Integer::compareTo)).longValue(); + .reduce(-1, BinaryOperator.maxBy(Integer::compareTo)).longValue(); } catch (Throwable ex) { LOG.warn("Get max_instances_num_per_user error", ex); return -2L; } } }; - PALO_METRIC_REGISTER.addPaloMetrics(maxInstanceNum); + DORIS_METRIC_REGISTER.addMetrics(maxInstanceNum); + + // txn status + for (TransactionStatus status : TransactionStatus.values()) { + GaugeMetric gauge = new GaugeMetric("txn_status", MetricUnit.NOUNIT, "txn statistics") { + @Override + public Long getValue() { + if (!Env.getCurrentEnv().isMaster()) { + return 0L; + } + return Env.getCurrentGlobalTransactionMgr().getTxnNumByStatus(status); + } + }; + gauge.addLabel(new MetricLabel("type", status.name().toLowerCase())); + DORIS_METRIC_REGISTER.addMetrics(gauge); + } // qps, rps and error rate // these metrics should be set an init value, in case that metric calculator is not running - 
GAUGE_QUERY_PER_SECOND = new GaugeMetricImpl<>("qps", MetricUnit.NOUNIT, - "query per second"); + GAUGE_QUERY_PER_SECOND = new GaugeMetricImpl<>("qps", MetricUnit.NOUNIT, "query per second"); GAUGE_QUERY_PER_SECOND.setValue(0.0); - PALO_METRIC_REGISTER.addPaloMetrics(GAUGE_QUERY_PER_SECOND); - GAUGE_REQUEST_PER_SECOND = new GaugeMetricImpl<>("rps", MetricUnit.NOUNIT, - "request per second"); + DORIS_METRIC_REGISTER.addMetrics(GAUGE_QUERY_PER_SECOND); + GAUGE_REQUEST_PER_SECOND = new GaugeMetricImpl<>("rps", MetricUnit.NOUNIT, "request per second"); GAUGE_REQUEST_PER_SECOND.setValue(0.0); - PALO_METRIC_REGISTER.addPaloMetrics(GAUGE_REQUEST_PER_SECOND); - GAUGE_QUERY_ERR_RATE = new GaugeMetricImpl<>("query_err_rate", MetricUnit.NOUNIT, - "query error rate"); - PALO_METRIC_REGISTER.addPaloMetrics(GAUGE_QUERY_ERR_RATE); + DORIS_METRIC_REGISTER.addMetrics(GAUGE_REQUEST_PER_SECOND); + GAUGE_QUERY_ERR_RATE = new GaugeMetricImpl<>("query_err_rate", MetricUnit.NOUNIT, "query error rate"); + DORIS_METRIC_REGISTER.addMetrics(GAUGE_QUERY_ERR_RATE); GAUGE_QUERY_ERR_RATE.setValue(0.0); - GAUGE_MAX_TABLET_COMPACTION_SCORE = new GaugeMetricImpl<>("max_tablet_compaction_score", - MetricUnit.NOUNIT, "max tablet compaction score of all backends"); - PALO_METRIC_REGISTER.addPaloMetrics(GAUGE_MAX_TABLET_COMPACTION_SCORE); + GAUGE_MAX_TABLET_COMPACTION_SCORE = new GaugeMetricImpl<>("max_tablet_compaction_score", MetricUnit.NOUNIT, + "max tablet compaction score of all backends"); + DORIS_METRIC_REGISTER.addMetrics(GAUGE_MAX_TABLET_COMPACTION_SCORE); GAUGE_MAX_TABLET_COMPACTION_SCORE.setValue(0L); // 2. 
counter - COUNTER_REQUEST_ALL = new LongCounterMetric("request_total", MetricUnit.REQUESTS, - "total request"); - PALO_METRIC_REGISTER.addPaloMetrics(COUNTER_REQUEST_ALL); - COUNTER_QUERY_ALL = new LongCounterMetric("query_total", MetricUnit.REQUESTS, - "total query"); - PALO_METRIC_REGISTER.addPaloMetrics(COUNTER_QUERY_ALL); - COUNTER_QUERY_BEGIN = new LongCounterMetric("query_begin", MetricUnit.REQUESTS, - "query begin"); - PALO_METRIC_REGISTER.addPaloMetrics(COUNTER_QUERY_BEGIN); - COUNTER_QUERY_ERR = new LongCounterMetric("query_err", MetricUnit.REQUESTS, - "total error query"); - PALO_METRIC_REGISTER.addPaloMetrics(COUNTER_QUERY_ERR); - COUNTER_LOAD_ADD = new LongCounterMetric("load_add", MetricUnit.REQUESTS, - "total load submit"); - PALO_METRIC_REGISTER.addPaloMetrics(COUNTER_LOAD_ADD); + COUNTER_REQUEST_ALL = new LongCounterMetric("request_total", MetricUnit.REQUESTS, "total request"); + DORIS_METRIC_REGISTER.addMetrics(COUNTER_REQUEST_ALL); + COUNTER_QUERY_ALL = new LongCounterMetric("query_total", MetricUnit.REQUESTS, "total query"); + DORIS_METRIC_REGISTER.addMetrics(COUNTER_QUERY_ALL); + COUNTER_QUERY_ERR = new LongCounterMetric("query_err", MetricUnit.REQUESTS, "total error query"); + DORIS_METRIC_REGISTER.addMetrics(COUNTER_QUERY_ERR); - COUNTER_QUERY_TABLE = new LongCounterMetric("query_table", MetricUnit.REQUESTS, - "total query from table"); - PALO_METRIC_REGISTER.addPaloMetrics(COUNTER_QUERY_TABLE); + COUNTER_QUERY_TABLE = new LongCounterMetric("query_table", MetricUnit.REQUESTS, "total query from table"); + DORIS_METRIC_REGISTER.addMetrics(COUNTER_QUERY_TABLE); COUNTER_QUERY_OLAP_TABLE = new LongCounterMetric("query_olap_table", MetricUnit.REQUESTS, "total query from olap table"); - PALO_METRIC_REGISTER.addPaloMetrics(COUNTER_QUERY_OLAP_TABLE); - COUNTER_CACHE_MODE_SQL = new LongCounterMetric("cache_mode_sql", MetricUnit.REQUESTS, - "total query of sql mode"); - PALO_METRIC_REGISTER.addPaloMetrics(COUNTER_CACHE_MODE_SQL); - COUNTER_CACHE_HIT_SQL 
= new LongCounterMetric("cache_hit_sql", MetricUnit.REQUESTS, + DORIS_METRIC_REGISTER.addMetrics(COUNTER_QUERY_OLAP_TABLE); + COUNTER_CACHE_ADDED_SQL = new LongCounterMetric("cache_added", MetricUnit.REQUESTS, + "Number of SQL mode cache added"); + COUNTER_CACHE_ADDED_SQL.addLabel(new MetricLabel("type", "sql")); + DORIS_METRIC_REGISTER.addMetrics(COUNTER_CACHE_ADDED_SQL); + COUNTER_CACHE_ADDED_PARTITION = new LongCounterMetric("cache_added", MetricUnit.REQUESTS, + "Number of Partition mode cache added"); + COUNTER_CACHE_ADDED_PARTITION.addLabel(new MetricLabel("type", "partition")); + DORIS_METRIC_REGISTER.addMetrics(COUNTER_CACHE_ADDED_PARTITION); + COUNTER_CACHE_HIT_SQL = new LongCounterMetric("cache_hit", MetricUnit.REQUESTS, "total hits query by sql model"); - PALO_METRIC_REGISTER.addPaloMetrics(COUNTER_CACHE_HIT_SQL); - COUNTER_CACHE_MODE_PARTITION = new LongCounterMetric("query_mode_partition", MetricUnit.REQUESTS, - "total query of partition mode"); - PALO_METRIC_REGISTER.addPaloMetrics(COUNTER_CACHE_MODE_PARTITION); - COUNTER_CACHE_HIT_PARTITION = new LongCounterMetric("cache_hit_partition", MetricUnit.REQUESTS, + COUNTER_CACHE_HIT_SQL.addLabel(new MetricLabel("type", "sql")); + DORIS_METRIC_REGISTER.addMetrics(COUNTER_CACHE_HIT_SQL); + COUNTER_CACHE_HIT_PARTITION = new LongCounterMetric("cache_hit", MetricUnit.REQUESTS, "total hits query by partition model"); - PALO_METRIC_REGISTER.addPaloMetrics(COUNTER_CACHE_HIT_PARTITION); - COUNTER_CACHE_PARTITION_ALL = new LongCounterMetric("partition_all", MetricUnit.REQUESTS, - "scan partition of cache partition model"); - PALO_METRIC_REGISTER.addPaloMetrics(COUNTER_CACHE_PARTITION_ALL); - COUNTER_CACHE_PARTITION_HIT = new LongCounterMetric("partition_hit", MetricUnit.REQUESTS, - "hit partition of cache partition model"); - PALO_METRIC_REGISTER.addPaloMetrics(COUNTER_CACHE_PARTITION_HIT); + COUNTER_CACHE_HIT_PARTITION.addLabel(new MetricLabel("type", "partition")); + 
DORIS_METRIC_REGISTER.addMetrics(COUNTER_CACHE_HIT_PARTITION); - COUNTER_LOAD_FINISHED = new LongCounterMetric("load_finished", MetricUnit.REQUESTS, - "total load finished"); - PALO_METRIC_REGISTER.addPaloMetrics(COUNTER_LOAD_FINISHED); - COUNTER_EDIT_LOG_WRITE = new LongCounterMetric("edit_log_write", MetricUnit.OPERATIONS, + COUNTER_EDIT_LOG_WRITE = new LongCounterMetric("edit_log", MetricUnit.OPERATIONS, "counter of edit log write into bdbje"); - PALO_METRIC_REGISTER.addPaloMetrics(COUNTER_EDIT_LOG_WRITE); - COUNTER_EDIT_LOG_READ = new LongCounterMetric("edit_log_read", MetricUnit.OPERATIONS, + COUNTER_EDIT_LOG_WRITE.addLabel(new MetricLabel("type", "write")); + DORIS_METRIC_REGISTER.addMetrics(COUNTER_EDIT_LOG_WRITE); + COUNTER_EDIT_LOG_READ = new LongCounterMetric("edit_log", MetricUnit.OPERATIONS, "counter of edit log read from bdbje"); - PALO_METRIC_REGISTER.addPaloMetrics(COUNTER_EDIT_LOG_READ); - COUNTER_EDIT_LOG_SIZE_BYTES = new LongCounterMetric("edit_log_size_bytes", MetricUnit.BYTES, - "size of edit log"); - PALO_METRIC_REGISTER.addPaloMetrics(COUNTER_EDIT_LOG_SIZE_BYTES); + COUNTER_EDIT_LOG_READ.addLabel(new MetricLabel("type", "read")); + DORIS_METRIC_REGISTER.addMetrics(COUNTER_EDIT_LOG_READ); + COUNTER_EDIT_LOG_SIZE_BYTES = new LongCounterMetric("edit_log", MetricUnit.BYTES, "size of edit log"); + COUNTER_EDIT_LOG_SIZE_BYTES.addLabel(new MetricLabel("type", "bytes")); + DORIS_METRIC_REGISTER.addMetrics(COUNTER_EDIT_LOG_SIZE_BYTES); // image generate COUNTER_IMAGE_WRITE_SUCCESS = new LongCounterMetric("image_write", MetricUnit.OPERATIONS, "counter of image succeed in write"); COUNTER_IMAGE_WRITE_SUCCESS.addLabel(new MetricLabel("type", "success")); - PALO_METRIC_REGISTER.addPaloMetrics(COUNTER_IMAGE_WRITE_SUCCESS); + DORIS_METRIC_REGISTER.addMetrics(COUNTER_IMAGE_WRITE_SUCCESS); COUNTER_IMAGE_WRITE_FAILED = new LongCounterMetric("image_write", MetricUnit.OPERATIONS, "counter of image failed to write"); COUNTER_IMAGE_WRITE_FAILED.addLabel(new 
MetricLabel("type", "failed")); - PALO_METRIC_REGISTER.addPaloMetrics(COUNTER_IMAGE_WRITE_FAILED); + DORIS_METRIC_REGISTER.addMetrics(COUNTER_IMAGE_WRITE_FAILED); COUNTER_IMAGE_PUSH_SUCCESS = new LongCounterMetric("image_push", MetricUnit.OPERATIONS, "counter of image succeeded in pushing to other frontends"); COUNTER_IMAGE_PUSH_SUCCESS.addLabel(new MetricLabel("type", "success")); - PALO_METRIC_REGISTER.addPaloMetrics(COUNTER_IMAGE_PUSH_SUCCESS); + DORIS_METRIC_REGISTER.addMetrics(COUNTER_IMAGE_PUSH_SUCCESS); COUNTER_IMAGE_PUSH_FAILED = new LongCounterMetric("image_push", MetricUnit.OPERATIONS, "counter of image failed to other frontends"); COUNTER_IMAGE_PUSH_FAILED.addLabel(new MetricLabel("type", "failed")); - PALO_METRIC_REGISTER.addPaloMetrics(COUNTER_IMAGE_PUSH_FAILED); + DORIS_METRIC_REGISTER.addMetrics(COUNTER_IMAGE_PUSH_FAILED); // image clean COUNTER_IMAGE_CLEAN_SUCCESS = new LongCounterMetric("image_clean", MetricUnit.OPERATIONS, "counter of image succeeded in cleaning"); COUNTER_IMAGE_CLEAN_SUCCESS.addLabel(new MetricLabel("type", "success")); - PALO_METRIC_REGISTER.addPaloMetrics(COUNTER_IMAGE_CLEAN_SUCCESS); + DORIS_METRIC_REGISTER.addMetrics(COUNTER_IMAGE_CLEAN_SUCCESS); COUNTER_IMAGE_CLEAN_FAILED = new LongCounterMetric("image_clean", MetricUnit.OPERATIONS, "counter of image failed to clean"); COUNTER_IMAGE_CLEAN_FAILED.addLabel(new MetricLabel("type", "failed")); - PALO_METRIC_REGISTER.addPaloMetrics(COUNTER_IMAGE_CLEAN_FAILED); + DORIS_METRIC_REGISTER.addMetrics(COUNTER_IMAGE_CLEAN_FAILED); // edit log clean COUNTER_EDIT_LOG_CLEAN_SUCCESS = new LongCounterMetric("edit_log_clean", MetricUnit.OPERATIONS, "counter of edit log succeed in cleaning"); COUNTER_EDIT_LOG_CLEAN_SUCCESS.addLabel(new MetricLabel("type", "success")); - PALO_METRIC_REGISTER.addPaloMetrics(COUNTER_EDIT_LOG_CLEAN_SUCCESS); + DORIS_METRIC_REGISTER.addMetrics(COUNTER_EDIT_LOG_CLEAN_SUCCESS); COUNTER_EDIT_LOG_CLEAN_FAILED = new LongCounterMetric("edit_log_clean", 
MetricUnit.OPERATIONS, "counter of edit log failed to clean"); COUNTER_EDIT_LOG_CLEAN_FAILED.addLabel(new MetricLabel("type", "failed")); - PALO_METRIC_REGISTER.addPaloMetrics(COUNTER_EDIT_LOG_CLEAN_FAILED); + DORIS_METRIC_REGISTER.addMetrics(COUNTER_EDIT_LOG_CLEAN_FAILED); - COUNTER_TXN_REJECT = new LongCounterMetric("txn_reject", MetricUnit.REQUESTS, + COUNTER_TXN_REJECT = new LongCounterMetric("txn_counter", MetricUnit.REQUESTS, "counter of rejected transactions"); - PALO_METRIC_REGISTER.addPaloMetrics(COUNTER_TXN_REJECT); - COUNTER_TXN_BEGIN = new LongCounterMetric("txn_begin", MetricUnit.REQUESTS, + COUNTER_TXN_REJECT.addLabel(new MetricLabel("type", "reject")); + DORIS_METRIC_REGISTER.addMetrics(COUNTER_TXN_REJECT); + COUNTER_TXN_BEGIN = new LongCounterMetric("txn_counter", MetricUnit.REQUESTS, "counter of beginning transactions"); - PALO_METRIC_REGISTER.addPaloMetrics(COUNTER_TXN_BEGIN); - COUNTER_TXN_SUCCESS = new LongCounterMetric("txn_success", MetricUnit.REQUESTS, + COUNTER_TXN_BEGIN.addLabel(new MetricLabel("type", "begin")); + DORIS_METRIC_REGISTER.addMetrics(COUNTER_TXN_BEGIN); + COUNTER_TXN_SUCCESS = new LongCounterMetric("txn_counter", MetricUnit.REQUESTS, "counter of success transactions"); - PALO_METRIC_REGISTER.addPaloMetrics(COUNTER_TXN_SUCCESS); - COUNTER_TXN_FAILED = new LongCounterMetric("txn_failed", MetricUnit.REQUESTS, + COUNTER_TXN_SUCCESS.addLabel(new MetricLabel("type", "success")); + DORIS_METRIC_REGISTER.addMetrics(COUNTER_TXN_SUCCESS); + COUNTER_TXN_FAILED = new LongCounterMetric("txn_counter", MetricUnit.REQUESTS, "counter of failed transactions"); - PALO_METRIC_REGISTER.addPaloMetrics(COUNTER_TXN_FAILED); + COUNTER_TXN_FAILED.addLabel(new MetricLabel("type", "failed")); + DORIS_METRIC_REGISTER.addMetrics(COUNTER_TXN_FAILED); - COUNTER_ROUTINE_LOAD_ROWS = new LongCounterMetric("routine_load_rows", - MetricUnit.ROWS, "total rows of routine load"); - PALO_METRIC_REGISTER.addPaloMetrics(COUNTER_ROUTINE_LOAD_ROWS); + 
COUNTER_ROUTINE_LOAD_ROWS = new LongCounterMetric("routine_load_rows", MetricUnit.ROWS, + "total rows of routine load"); + DORIS_METRIC_REGISTER.addMetrics(COUNTER_ROUTINE_LOAD_ROWS); COUNTER_ROUTINE_LOAD_RECEIVED_BYTES = new LongCounterMetric("routine_load_receive_bytes", MetricUnit.BYTES, "total received bytes of routine load"); - PALO_METRIC_REGISTER.addPaloMetrics(COUNTER_ROUTINE_LOAD_RECEIVED_BYTES); + DORIS_METRIC_REGISTER.addMetrics(COUNTER_ROUTINE_LOAD_RECEIVED_BYTES); COUNTER_ROUTINE_LOAD_ERROR_ROWS = new LongCounterMetric("routine_load_error_rows", MetricUnit.ROWS, "total error rows of routine load"); - PALO_METRIC_REGISTER.addPaloMetrics(COUNTER_ROUTINE_LOAD_ERROR_ROWS); + DORIS_METRIC_REGISTER.addMetrics(COUNTER_ROUTINE_LOAD_ERROR_ROWS); COUNTER_HIT_SQL_BLOCK_RULE = new LongCounterMetric("counter_hit_sql_block_rule", MetricUnit.ROWS, "total hit sql block rule query"); - PALO_METRIC_REGISTER.addPaloMetrics(COUNTER_HIT_SQL_BLOCK_RULE); + DORIS_METRIC_REGISTER.addMetrics(COUNTER_HIT_SQL_BLOCK_RULE); // 3. 
histogram HISTO_QUERY_LATENCY = METRIC_REGISTER.histogram( MetricRegistry.name("query", "latency", "ms")); @@ -412,7 +412,7 @@ public final class MetricRepo { } }; tcpRetransSegs.addLabel(new MetricLabel("name", "tcp_retrans_segs")); - PALO_METRIC_REGISTER.addPaloMetrics(tcpRetransSegs); + DORIS_METRIC_REGISTER.addSystemMetrics(tcpRetransSegs); // TCP inErrs GaugeMetric tpcInErrs = (GaugeMetric) new GaugeMetric( @@ -423,7 +423,7 @@ public final class MetricRepo { } }; tpcInErrs.addLabel(new MetricLabel("name", "tcp_in_errs")); - PALO_METRIC_REGISTER.addPaloMetrics(tpcInErrs); + DORIS_METRIC_REGISTER.addSystemMetrics(tpcInErrs); // TCP inSegs GaugeMetric tpcInSegs = (GaugeMetric) new GaugeMetric( @@ -434,7 +434,7 @@ public final class MetricRepo { } }; tpcInSegs.addLabel(new MetricLabel("name", "tcp_in_segs")); - PALO_METRIC_REGISTER.addPaloMetrics(tpcInSegs); + DORIS_METRIC_REGISTER.addSystemMetrics(tpcInSegs); // TCP outSegs GaugeMetric tpcOutSegs = (GaugeMetric) new GaugeMetric( @@ -445,7 +445,7 @@ public final class MetricRepo { } }; tpcOutSegs.addLabel(new MetricLabel("name", "tcp_out_segs")); - PALO_METRIC_REGISTER.addPaloMetrics(tpcOutSegs); + DORIS_METRIC_REGISTER.addSystemMetrics(tpcOutSegs); // Memory Total GaugeMetric memTotal = (GaugeMetric) new GaugeMetric( @@ -456,7 +456,7 @@ public final class MetricRepo { } }; memTotal.addLabel(new MetricLabel("name", "memory_total")); - PALO_METRIC_REGISTER.addPaloMetrics(memTotal); + DORIS_METRIC_REGISTER.addSystemMetrics(memTotal); // Memory Free GaugeMetric memFree = (GaugeMetric) new GaugeMetric( @@ -467,7 +467,7 @@ public final class MetricRepo { } }; memFree.addLabel(new MetricLabel("name", "memory_free")); - PALO_METRIC_REGISTER.addPaloMetrics(memFree); + DORIS_METRIC_REGISTER.addSystemMetrics(memFree); // Memory Total GaugeMetric memAvailable = (GaugeMetric) new GaugeMetric("meminfo", MetricUnit.BYTES, @@ -478,7 +478,7 @@ public final class MetricRepo { } }; memAvailable.addLabel(new MetricLabel("name", 
"memory_available")); - PALO_METRIC_REGISTER.addPaloMetrics(memAvailable); + DORIS_METRIC_REGISTER.addSystemMetrics(memAvailable); // Buffers GaugeMetric buffers = (GaugeMetric) new GaugeMetric("meminfo", MetricUnit.BYTES, @@ -489,7 +489,7 @@ public final class MetricRepo { } }; buffers.addLabel(new MetricLabel("name", "buffers")); - PALO_METRIC_REGISTER.addPaloMetrics(buffers); + DORIS_METRIC_REGISTER.addSystemMetrics(buffers); // Cached GaugeMetric cached = (GaugeMetric) new GaugeMetric( @@ -500,7 +500,7 @@ public final class MetricRepo { } }; cached.addLabel(new MetricLabel("name", "cached")); - PALO_METRIC_REGISTER.addPaloMetrics(cached); + DORIS_METRIC_REGISTER.addSystemMetrics(cached); } // to generate the metrics related to tablets of each backends @@ -508,8 +508,8 @@ public final class MetricRepo { // at runtime. public static void generateBackendsTabletMetrics() { // remove all previous 'tablet' metric - PALO_METRIC_REGISTER.removeMetrics(TABLET_NUM); - PALO_METRIC_REGISTER.removeMetrics(TABLET_MAX_COMPACTION_SCORE); + DORIS_METRIC_REGISTER.removeMetrics(TABLET_NUM); + DORIS_METRIC_REGISTER.removeMetrics(TABLET_MAX_COMPACTION_SCORE); SystemInfoService infoService = Env.getCurrentSystemInfo(); TabletInvertedIndex invertedIndex = Env.getCurrentInvertedIndex(); @@ -521,8 +521,7 @@ public final class MetricRepo { } // tablet number of each backends - GaugeMetric tabletNum = (GaugeMetric) new GaugeMetric(TABLET_NUM, - MetricUnit.NOUNIT, "tablet number") { + GaugeMetric tabletNum = new GaugeMetric(TABLET_NUM, MetricUnit.NOUNIT, "tablet number") { @Override public Long getValue() { if (!Env.getCurrentEnv().isMaster()) { @@ -532,12 +531,11 @@ public final class MetricRepo { } }; tabletNum.addLabel(new MetricLabel("backend", be.getHost() + ":" + be.getHeartbeatPort())); - PALO_METRIC_REGISTER.addPaloMetrics(tabletNum); + DORIS_METRIC_REGISTER.addMetrics(tabletNum); // max compaction score of tablets on each backends - GaugeMetric tabletMaxCompactionScore = 
(GaugeMetric) new GaugeMetric( - TABLET_MAX_COMPACTION_SCORE, MetricUnit.NOUNIT, - "tablet max compaction score") { + GaugeMetric tabletMaxCompactionScore = new GaugeMetric(TABLET_MAX_COMPACTION_SCORE, + MetricUnit.NOUNIT, "tablet max compaction score") { @Override public Long getValue() { if (!Env.getCurrentEnv().isMaster()) { @@ -547,7 +545,7 @@ public final class MetricRepo { } }; tabletMaxCompactionScore.addLabel(new MetricLabel("backend", be.getHost() + ":" + be.getHeartbeatPort())); - PALO_METRIC_REGISTER.addPaloMetrics(tabletMaxCompactionScore); + DORIS_METRIC_REGISTER.addMetrics(tabletMaxCompactionScore); } // end for backends } @@ -566,16 +564,21 @@ public final class MetricRepo { JvmStats jvmStats = jvmService.stats(); visitor.visitJvm(sb, jvmStats); - visitor.setMetricNumber(PALO_METRIC_REGISTER.getPaloMetrics().size()); + visitor.setMetricNumber( + DORIS_METRIC_REGISTER.getMetrics().size() + DORIS_METRIC_REGISTER.getSystemMetrics().size()); // doris metrics - for (Metric metric : PALO_METRIC_REGISTER.getPaloMetrics()) { - visitor.visit(sb, metric); + for (Metric metric : DORIS_METRIC_REGISTER.getMetrics()) { + visitor.visit(sb, MetricVisitor.FE_PREFIX, metric); + } + // system metric + for (Metric metric : DORIS_METRIC_REGISTER.getSystemMetrics()) { + visitor.visit(sb, MetricVisitor.SYS_PREFIX, metric); } // histogram SortedMap histograms = METRIC_REGISTER.getHistograms(); for (Map.Entry entry : histograms.entrySet()) { - visitor.visitHistogram(sb, entry.getKey(), entry.getValue()); + visitor.visitHistogram(sb, MetricVisitor.FE_PREFIX, entry.getKey(), entry.getValue()); } // node info @@ -590,6 +593,6 @@ public final class MetricRepo { } public static synchronized List getMetricsByName(String name) { - return PALO_METRIC_REGISTER.getPaloMetricsByName(name); + return DORIS_METRIC_REGISTER.getMetricsByName(name); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/metric/MetricVisitor.java 
b/fe/fe-core/src/main/java/org/apache/doris/metric/MetricVisitor.java index 681b6dfc4a..575afeeece 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/metric/MetricVisitor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/metric/MetricVisitor.java @@ -26,19 +26,21 @@ import com.codahale.metrics.Histogram; */ public abstract class MetricVisitor { - protected String prefix; + // for FE metrics + public static final String FE_PREFIX = "doris_fe_"; + // for system metrics + public static final String SYS_PREFIX = "system_"; - public MetricVisitor(String prefix) { - this.prefix = prefix; + public MetricVisitor() { } public abstract void setMetricNumber(int metricNumber); public abstract void visitJvm(StringBuilder sb, JvmStats jvmStats); - public abstract void visit(StringBuilder sb, Metric metric); + public abstract void visit(StringBuilder sb, String prefix, Metric metric); - public abstract void visitHistogram(StringBuilder sb, String name, Histogram histogram); + public abstract void visitHistogram(StringBuilder sb, String prefix, String name, Histogram histogram); public abstract void getNodeInfo(StringBuilder sb); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/metric/PrometheusMetricVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/metric/PrometheusMetricVisitor.java index 603a84a4eb..b376af4bc8 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/metric/PrometheusMetricVisitor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/metric/PrometheusMetricVisitor.java @@ -19,7 +19,6 @@ package org.apache.doris.metric; import org.apache.doris.catalog.Env; import org.apache.doris.monitor.jvm.JvmStats; -import org.apache.doris.monitor.jvm.JvmStats.BufferPool; import org.apache.doris.monitor.jvm.JvmStats.GarbageCollector; import org.apache.doris.monitor.jvm.JvmStats.MemoryPool; import org.apache.doris.monitor.jvm.JvmStats.Threads; @@ -46,7 +45,6 @@ public class PrometheusMetricVisitor extends MetricVisitor { private static final String 
JVM_NON_HEAP_SIZE_BYTES = "jvm_non_heap_size_bytes"; private static final String JVM_YOUNG_SIZE_BYTES = "jvm_young_size_bytes"; private static final String JVM_OLD_SIZE_BYTES = "jvm_old_size_bytes"; - private static final String JVM_DIRECT_BUFFER_POOL_SIZE_BYTES = "jvm_direct_buffer_pool_size_bytes"; private static final String JVM_YOUNG_GC = "jvm_young_gc"; private static final String JVM_OLD_GC = "jvm_old_gc"; private static final String JVM_THREAD = "jvm_thread"; @@ -54,17 +52,14 @@ public class PrometheusMetricVisitor extends MetricVisitor { private static final String HELP = "# HELP "; private static final String TYPE = "# TYPE "; - private int ordinal = 0; - private int metricNumber = 0; private Set metricNames = new HashSet(); - public PrometheusMetricVisitor(String prefix) { - super(prefix); + public PrometheusMetricVisitor() { + super(); } @Override public void setMetricNumber(int metricNumber) { - this.metricNumber = metricNumber; } @Override @@ -72,12 +67,12 @@ public class PrometheusMetricVisitor extends MetricVisitor { // heap sb.append(Joiner.on(" ").join(HELP, JVM_HEAP_SIZE_BYTES, "jvm heap stat\n")); sb.append(Joiner.on(" ").join(TYPE, JVM_HEAP_SIZE_BYTES, "gauge\n")); - sb.append(JVM_HEAP_SIZE_BYTES).append("{type=\"max\"} ") - .append(jvmStats.getMem().getHeapMax().getBytes()).append("\n"); + sb.append(JVM_HEAP_SIZE_BYTES).append("{type=\"max\"} ").append(jvmStats.getMem().getHeapMax().getBytes()) + .append("\n"); sb.append(JVM_HEAP_SIZE_BYTES).append("{type=\"committed\"} ") .append(jvmStats.getMem().getHeapCommitted().getBytes()).append("\n"); - sb.append(JVM_HEAP_SIZE_BYTES).append("{type=\"used\"} ") - .append(jvmStats.getMem().getHeapUsed().getBytes()).append("\n"); + sb.append(JVM_HEAP_SIZE_BYTES).append("{type=\"used\"} ").append(jvmStats.getMem().getHeapUsed().getBytes()) + .append("\n"); // non heap sb.append(Joiner.on(" ").join(HELP, JVM_NON_HEAP_SIZE_BYTES, "jvm non heap stat\n")); sb.append(Joiner.on(" ").join(TYPE, 
JVM_NON_HEAP_SIZE_BYTES, "gauge\n")); @@ -111,23 +106,6 @@ public class PrometheusMetricVisitor extends MetricVisitor { } } - // direct buffer pool - Iterator poolIter = jvmStats.getBufferPools().iterator(); - while (poolIter.hasNext()) { - BufferPool pool = poolIter.next(); - if (pool.getName().equalsIgnoreCase("direct")) { - sb.append(Joiner.on(" ").join(HELP, JVM_DIRECT_BUFFER_POOL_SIZE_BYTES, - "jvm direct buffer pool stat\n")); - sb.append(Joiner.on(" ").join(TYPE, JVM_DIRECT_BUFFER_POOL_SIZE_BYTES, "gauge\n")); - sb.append(JVM_DIRECT_BUFFER_POOL_SIZE_BYTES).append("{type=\"count\"} ") - .append(pool.getCount()).append("\n"); - sb.append(JVM_DIRECT_BUFFER_POOL_SIZE_BYTES).append("{type=\"used\"} ") - .append(pool.getUsed().getBytes()).append("\n"); - sb.append(JVM_DIRECT_BUFFER_POOL_SIZE_BYTES).append("{type=\"capacity\"} ") - .append(pool.getTotalCapacity().getBytes()).append("\n"); - } - } - // gc Iterator gcIter = jvmStats.getGc().iterator(); while (gcIter.hasNext()) { @@ -171,9 +149,9 @@ public class PrometheusMetricVisitor extends MetricVisitor { } @Override - public void visit(StringBuilder sb, @SuppressWarnings("rawtypes") Metric metric) { + public void visit(StringBuilder sb, String prefix, @SuppressWarnings("rawtypes") Metric metric) { // title - final String fullName = prefix + "_" + metric.getName(); + final String fullName = prefix + metric.getName(); if (!metricNames.contains(fullName)) { sb.append(HELP).append(fullName).append(" ").append(metric.getDescription()).append("\n"); sb.append(TYPE).append(fullName).append(" ").append(metric.getType().name().toLowerCase()).append("\n"); @@ -198,8 +176,8 @@ public class PrometheusMetricVisitor extends MetricVisitor { } @Override - public void visitHistogram(StringBuilder sb, String name, Histogram histogram) { - final String fullName = prefix + "_" + name.replaceAll("\\.", "_"); + public void visitHistogram(StringBuilder sb, String prefix, String name, Histogram histogram) { + final String fullName = 
prefix + name.replaceAll("\\.", "_"); sb.append(HELP).append(fullName).append(" ").append("\n"); sb.append(TYPE).append(fullName).append(" ").append("summary\n"); diff --git a/fe/fe-core/src/main/java/org/apache/doris/metric/SimpleCoreMetricVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/metric/SimpleCoreMetricVisitor.java index d329a84a81..16ae00f213 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/metric/SimpleCoreMetricVisitor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/metric/SimpleCoreMetricVisitor.java @@ -71,8 +71,8 @@ public class SimpleCoreMetricVisitor extends MetricVisitor { CORE_METRICS.put(MAX_TABLET_COMPACTION_SCORE, TYPE_LONG); } - public SimpleCoreMetricVisitor(String prefix) { - super(prefix); + public SimpleCoreMetricVisitor() { + super(); } @Override @@ -104,46 +104,46 @@ public class SimpleCoreMetricVisitor extends MetricVisitor { } @Override - public void visit(StringBuilder sb, Metric metric) { + public void visit(StringBuilder sb, String prefix, Metric metric) { if (!CORE_METRICS.containsKey(metric.getName())) { return; } if (CORE_METRICS.get(metric.getName()).equals(TYPE_DOUBLE)) { - sb.append(Joiner.on(" ").join(prefix + "_" + metric.getName(), TYPE_DOUBLE, + sb.append(Joiner.on(" ").join(prefix + metric.getName(), TYPE_DOUBLE, String.format("%.2f", Double.valueOf(metric.getValue().toString())))).append("\n"); } else { - sb.append(Joiner.on(" ").join(prefix + "_" + metric.getName(), CORE_METRICS.get(metric.getName()), - metric.getValue().toString())).append("\n"); + sb.append(Joiner.on(" ") + .join(prefix + metric.getName(), CORE_METRICS.get(metric.getName()), metric.getValue().toString())) + .append("\n"); } return; } @Override - public void visitHistogram(StringBuilder sb, String name, Histogram histogram) { + public void visitHistogram(StringBuilder sb, String prefix, String name, Histogram histogram) { if (!CORE_METRICS.containsKey(name)) { return; } Snapshot snapshot = histogram.getSnapshot(); - 
sb.append(Joiner.on(" ").join(prefix + "_" + name + "_75", CORE_METRICS.get(name), + sb.append(Joiner.on(" ").join(prefix + name + "_75", CORE_METRICS.get(name), String.format("%.0f", snapshot.get75thPercentile()))).append("\n"); - sb.append(Joiner.on(" ").join(prefix + "_" + name + "_95", CORE_METRICS.get(name), + sb.append(Joiner.on(" ").join(prefix + name + "_95", CORE_METRICS.get(name), String.format("%.0f", snapshot.get95thPercentile()))).append("\n"); - sb.append(Joiner.on(" ").join(prefix + "_" + name + "_99", CORE_METRICS.get(name), + sb.append(Joiner.on(" ").join(prefix + name + "_99", CORE_METRICS.get(name), String.format("%.0f", snapshot.get99thPercentile()))).append("\n"); return; } @Override public void getNodeInfo(StringBuilder sb) { - long feDeadNum = Env.getCurrentEnv() - .getFrontends(null).stream().filter(f -> !f.isAlive()).count(); - long beDeadNum = Env.getCurrentSystemInfo().getIdToBackend() - .values().stream().filter(b -> !b.isAlive()).count(); - long brokerDeadNum = Env.getCurrentEnv().getBrokerMgr() - .getAllBrokers().stream().filter(b -> !b.isAlive).count(); - sb.append(prefix + "_frontend_dead_num").append(" ").append(String.valueOf(feDeadNum)).append("\n"); - sb.append(prefix + "_backend_dead_num").append(" ").append(String.valueOf(beDeadNum)).append("\n"); - sb.append(prefix + "_broker_dead_num").append(" ").append(String.valueOf(brokerDeadNum)).append("\n"); + long feDeadNum = Env.getCurrentEnv().getFrontends(null).stream().filter(f -> !f.isAlive()).count(); + long beDeadNum = Env.getCurrentSystemInfo().getIdToBackend().values().stream().filter(b -> !b.isAlive()) + .count(); + long brokerDeadNum = Env.getCurrentEnv().getBrokerMgr().getAllBrokers().stream().filter(b -> !b.isAlive) + .count(); + sb.append("doris_fe_frontend_dead_num").append(" ").append(feDeadNum).append("\n"); + sb.append("doris_fe_backend_dead_num").append(" ").append(beDeadNum).append("\n"); + sb.append("doris_fe_broker_dead_num").append(" 
").append(brokerDeadNum).append("\n"); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/mysql/nio/NMysqlServer.java b/fe/fe-core/src/main/java/org/apache/doris/mysql/nio/NMysqlServer.java index fdadcc1f0f..e239d950cd 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/mysql/nio/NMysqlServer.java +++ b/fe/fe-core/src/main/java/org/apache/doris/mysql/nio/NMysqlServer.java @@ -49,7 +49,7 @@ public class NMysqlServer extends MysqlServer { // default task service. private ExecutorService taskService = ThreadPoolManager.newDaemonCacheThreadPool( - Config.max_mysql_service_task_threads_num, "doris-mysql-nio-pool", true); + Config.max_mysql_service_task_threads_num, "mysql-nio-pool", true); public NMysqlServer(int port, ConnectScheduler connectScheduler) { this.port = port; diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/ConnectProcessor.java b/fe/fe-core/src/main/java/org/apache/doris/qe/ConnectProcessor.java index a68e3dbfbc..4ca8492ed0 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/ConnectProcessor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/ConnectProcessor.java @@ -48,6 +48,7 @@ import org.apache.doris.mysql.MysqlServerStatusFlag; import org.apache.doris.nereids.parser.NereidsParser; import org.apache.doris.plugin.AuditEvent.EventType; import org.apache.doris.proto.Data; +import org.apache.doris.qe.QueryState.MysqlStateType; import org.apache.doris.service.FrontendOptions; import org.apache.doris.thrift.TMasterOpRequest; import org.apache.doris.thrift.TMasterOpResult; @@ -130,11 +131,11 @@ public class ConnectProcessor { if (ctx.getState().isQuery()) { MetricRepo.COUNTER_QUERY_ALL.increase(1L); - if (ctx.getState().getStateType() == QueryState.MysqlStateType.ERR + if (ctx.getState().getStateType() == MysqlStateType.ERR && ctx.getState().getErrType() != QueryState.ErrType.ANALYSIS_ERR) { // err query MetricRepo.COUNTER_QUERY_ERR.increase(1L); - } else { + } else if (ctx.getState().getStateType() == MysqlStateType.OK) { // 
ok query MetricRepo.HISTO_QUERY_LATENCY.update(elapseMs); if (elapseMs > Config.qe_slow_log_ms) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/ConnectScheduler.java b/fe/fe-core/src/main/java/org/apache/doris/qe/ConnectScheduler.java index c9eed7286a..32ea4f0911 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/ConnectScheduler.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/ConnectScheduler.java @@ -57,7 +57,7 @@ public class ConnectScheduler { // Let timeout is 10m, and 5000 qps, then there are up to 3000000 tasks in scheduler. // 2. Use a thread to poll maybe lose some accurate, but is enough to us. private final ScheduledExecutorService checkTimer = ThreadPoolManager.newDaemonScheduledThreadPool(1, - "Connect-Scheduler-Check-Timer", true); + "connect-scheduler-check-timer", true); public ConnectScheduler(int maxConnections) { this.maxConnections = maxConnections; diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/StmtExecutor.java b/fe/fe-core/src/main/java/org/apache/doris/qe/StmtExecutor.java index 5bfa5fa45c..b6f1345bbc 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/StmtExecutor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/StmtExecutor.java @@ -81,7 +81,6 @@ import org.apache.doris.common.util.RuntimeProfile; import org.apache.doris.common.util.SqlParserUtils; import org.apache.doris.common.util.TimeUtils; import org.apache.doris.load.EtlJobType; -import org.apache.doris.metric.MetricRepo; import org.apache.doris.mysql.MysqlChannel; import org.apache.doris.mysql.MysqlEofPacket; import org.apache.doris.mysql.MysqlSerializer; @@ -421,7 +420,6 @@ public class StmtExecutor implements ProfileWriter { } } - MetricRepo.COUNTER_QUERY_BEGIN.increase(1L); int retryTime = Config.max_query_retry_time; for (int i = 0; i < retryTime; i++) { try { @@ -1403,7 +1401,6 @@ public class StmtExecutor implements ProfileWriter { TabletCommitInfo.fromThrift(coord.getCommitInfos()), 
context.getSessionVariable().getInsertVisibleTimeoutMs())) { txnStatus = TransactionStatus.VISIBLE; - MetricRepo.COUNTER_LOAD_FINISHED.increase(1L); } else { txnStatus = TransactionStatus.COMMITTED; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/cache/CacheAnalyzer.java b/fe/fe-core/src/main/java/org/apache/doris/qe/cache/CacheAnalyzer.java index 380e515bef..b29fef8f30 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/cache/CacheAnalyzer.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/cache/CacheAnalyzer.java @@ -234,7 +234,7 @@ public class CacheAnalyzer { Config.cache_last_version_interval_second * 1000); cache = new SqlCache(this.queryId, this.selectStmt); ((SqlCache) cache).setCacheInfo(this.latestTable, allViewExpandStmtListStr); - MetricRepo.COUNTER_CACHE_MODE_SQL.increase(1L); + MetricRepo.COUNTER_CACHE_ADDED_SQL.increase(1L); return CacheMode.Sql; } @@ -284,7 +284,7 @@ public class CacheAnalyzer { cache = new PartitionCache(this.queryId, this.selectStmt); ((PartitionCache) cache).setCacheInfo(this.latestTable, this.partitionInfo, this.partColumn, this.partitionPredicate, allViewExpandStmtListStr); - MetricRepo.COUNTER_CACHE_MODE_PARTITION.increase(1L); + MetricRepo.COUNTER_CACHE_ADDED_PARTITION.increase(1L); return CacheMode.Partition; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/cache/PartitionCache.java b/fe/fe-core/src/main/java/org/apache/doris/qe/cache/PartitionCache.java index b86444a088..9113bfec82 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/cache/PartitionCache.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/cache/PartitionCache.java @@ -106,8 +106,6 @@ public class PartitionCache extends Cache { } cacheResult = cacheResult.toBuilder().setAllCount(range.getPartitionSingleList().size()).build(); MetricRepo.COUNTER_CACHE_HIT_PARTITION.increase(1L); - MetricRepo.COUNTER_CACHE_PARTITION_ALL.increase((long) range.getPartitionSingleList().size()); - 
MetricRepo.COUNTER_CACHE_PARTITION_HIT.increase((long) cacheResult.getValuesList().size()); } range.setTooNewByID(latestTable.latestPartitionId); diff --git a/fe/fe-core/src/main/java/org/apache/doris/service/FrontendServiceImpl.java b/fe/fe-core/src/main/java/org/apache/doris/service/FrontendServiceImpl.java index 48627907aa..4625cc2b7d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/service/FrontendServiceImpl.java +++ b/fe/fe-core/src/main/java/org/apache/doris/service/FrontendServiceImpl.java @@ -45,7 +45,6 @@ import org.apache.doris.common.Version; import org.apache.doris.datasource.DataSourceIf; import org.apache.doris.datasource.InternalDataSource; import org.apache.doris.master.MasterImpl; -import org.apache.doris.metric.MetricRepo; import org.apache.doris.mysql.privilege.PrivPredicate; import org.apache.doris.planner.StreamLoadPlanner; import org.apache.doris.policy.Policy; @@ -596,7 +595,6 @@ public class FrontendServiceImpl implements FrontendService.Iface { OlapTable table = (OlapTable) db.getTableOrMetaException(request.tbl, TableType.OLAP); // begin long timeoutSecond = request.isSetTimeout() ? request.getTimeout() : Config.stream_load_default_timeout_second; - MetricRepo.COUNTER_LOAD_ADD.increase(1L); long txnId = Env.getCurrentGlobalTransactionMgr().beginTransaction( db.getId(), Lists.newArrayList(table.getId()), request.getLabel(), request.getRequestId(), new TxnCoordinator(TxnSourceType.BE, clientIp), @@ -803,15 +801,10 @@ public class FrontendServiceImpl implements FrontendService.Iface { long timeoutMs = request.isSetThriftRpcTimeoutMs() ? 
request.getThriftRpcTimeoutMs() / 2 : 5000; Table table = db.getTableOrMetaException(request.getTbl(), TableType.OLAP); - boolean ret = Env.getCurrentGlobalTransactionMgr() - .commitAndPublishTransaction((Database) db, Lists.newArrayList(table), request.getTxnId(), + return Env.getCurrentGlobalTransactionMgr() + .commitAndPublishTransaction(db, Lists.newArrayList(table), request.getTxnId(), TabletCommitInfo.fromThrift(request.getCommitInfos()), timeoutMs, TxnCommitAttachment.fromThrift(request.txnCommitAttachment)); - if (ret) { - // if commit and publish is success, load can be regarded as success - MetricRepo.COUNTER_LOAD_FINISHED.increase(1L); - } - return ret; } @Override diff --git a/fe/fe-core/src/main/java/org/apache/doris/task/MasterTaskExecutor.java b/fe/fe-core/src/main/java/org/apache/doris/task/MasterTaskExecutor.java index 938fb44f58..440c45bb98 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/task/MasterTaskExecutor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/task/MasterTaskExecutor.java @@ -40,19 +40,18 @@ public class MasterTaskExecutor { public ScheduledThreadPoolExecutor scheduledThreadPool; public MasterTaskExecutor(String name, int threadNum, boolean needRegisterMetric) { - executor = ThreadPoolManager.newDaemonFixedThreadPool( - threadNum, threadNum * 2, name + "_pool", needRegisterMetric); + executor = ThreadPoolManager.newDaemonFixedThreadPool(threadNum, threadNum * 2, name + "-pool", + needRegisterMetric); runningTasks = Maps.newHashMap(); - scheduledThreadPool = ThreadPoolManager.newDaemonScheduledThreadPool( - 1, name + "_scheduler_thread_pool", needRegisterMetric); + scheduledThreadPool = ThreadPoolManager.newDaemonScheduledThreadPool(1, name + "-scheduler-thread-pool", + needRegisterMetric); } public MasterTaskExecutor(String name, int threadNum, int queueSize, boolean needRegisterMetric) { - executor = ThreadPoolManager.newDaemonFixedThreadPool( - threadNum, queueSize, name + "_pool", needRegisterMetric); + executor = 
ThreadPoolManager.newDaemonFixedThreadPool(threadNum, queueSize, name + "-pool", needRegisterMetric); runningTasks = Maps.newHashMap(); - scheduledThreadPool = ThreadPoolManager.newDaemonScheduledThreadPool( - 1, name + "_scheduler_thread_pool", needRegisterMetric); + scheduledThreadPool = ThreadPoolManager.newDaemonScheduledThreadPool(1, name + "-scheduler-thread-pool", + needRegisterMetric); } public boolean hasIdleThread() { diff --git a/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java b/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java index 4d5b2fd8e9..c4b4e3954c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java +++ b/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java @@ -1772,4 +1772,19 @@ public class DatabaseTransactionMgr { LOG.info("clean {} labels on db {} with label '{}' in database transaction mgr.", removedTxnIds.size(), dbId, label); } + + public long getTxnNumByStatus(TransactionStatus status) { + readLock(); + try { + if (idToRunningTransactionState.size() > 10000) { + return idToRunningTransactionState.values().parallelStream() + .filter(t -> t.getTransactionStatus() == status).count(); + } else { + return idToRunningTransactionState.values().stream().filter(t -> t.getTransactionStatus() == status) + .count(); + } + } finally { + readUnlock(); + } + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/transaction/GlobalTransactionMgr.java b/fe/fe-core/src/main/java/org/apache/doris/transaction/GlobalTransactionMgr.java index b170d31575..4eae5c817a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/transaction/GlobalTransactionMgr.java +++ b/fe/fe-core/src/main/java/org/apache/doris/transaction/GlobalTransactionMgr.java @@ -481,6 +481,14 @@ public class GlobalTransactionMgr implements Writable { return dbTransactionMgr.getTxnStateInfoList(status); } + public long getTxnNumByStatus(TransactionStatus 
status) { + long counter = 0; + for (DatabaseTransactionMgr dbMgr : dbIdToDatabaseTransactionMgrs.values()) { + counter += dbMgr.getTxnNumByStatus(status); + } + return counter; + } + // get show info of a specified txnId public List> getSingleTranInfo(long dbId, long txnId) throws AnalysisException { DatabaseTransactionMgr dbTransactionMgr = getDatabaseTransactionMgr(dbId); diff --git a/fe/fe-core/src/test/java/org/apache/doris/load/loadv2/LoadJobTest.java b/fe/fe-core/src/test/java/org/apache/doris/load/loadv2/LoadJobTest.java index e842e3a0bc..c14f083736 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/load/loadv2/LoadJobTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/load/loadv2/LoadJobTest.java @@ -166,8 +166,6 @@ public class LoadJobTest { public void testUpdateStateToFinished(@Mocked MetricRepo metricRepo, @Injectable LoadTask loadTask1, @Mocked LongCounterMetric longCounterMetric) { - - MetricRepo.COUNTER_LOAD_FINISHED = longCounterMetric; LoadJob loadJob = new BrokerLoadJob(); loadJob.idToTasks.put(1L, loadTask1);