From c758a25dd818dcad3ec2a3119431f7b4e7c562d9 Mon Sep 17 00:00:00 2001 From: Mingyu Chen Date: Thu, 28 Mar 2024 23:36:16 +0800 Subject: [PATCH] [opt](fqdn) Add DNS Cache for FE and BE (#32869) Previously, when FQDN was enabled, Doris called the DNS resolver to get the IP from the hostname each time 1) FE gets a BE's grpc client, or 2) BE gets another BE's brpc client. So under high concurrency, the DNS resolver could be overloaded and fail to resolve hostnames. This PR mainly changes: 1. Add DNSCache for both FE and BE. The DNSCache runs on every FE and BE node. It has a cache whose key is the hostname and whose value is the IP. A caller can get the IP by hostname from this cache; if the hostname is not in the cache, DNSCache will try to resolve it and update the cache. In addition, DNSCache has a daemon thread that refreshes the cache every 1 min, in case the IP changes at any time. There are other implementations of this DNS cache: 1. https://github.com/kaka11chen/doris/commit/36fed139974ed52dfa61b656f3e4d64f56a4185a This is for the BE side, but it does not handle the IP change case. 3. https://github.com/apache/doris/pull/28479 This is for the FE side, but it can only work with the Master FE. Other FE nodes will not be aware of the IP change. Also, there are a bunch of BackendServiceProxy instances, and this PR only handles the cache in one of them. 
--- be/src/common/config.cpp | 2 + be/src/common/config.h | 4 + be/src/runtime/client_cache.cpp | 2 +- be/src/runtime/exec_env.h | 4 + be/src/runtime/exec_env_init.cpp | 3 + be/src/util/brpc_client_cache.h | 4 +- be/src/util/dns_cache.cpp | 84 ++++++++++++++++ be/src/util/dns_cache.h | 57 +++++++++++ .../java/org/apache/doris/catalog/Env.java | 10 ++ .../org/apache/doris/common/DNSCache.java | 95 +++++++++++++++++++ .../apache/doris/common/util/NetUtils.java | 16 +++- .../apache/doris/rpc/BackendServiceProxy.java | 4 +- 12 files changed, 278 insertions(+), 7 deletions(-) create mode 100644 be/src/util/dns_cache.cpp create mode 100644 be/src/util/dns_cache.h create mode 100644 fe/fe-core/src/main/java/org/apache/doris/common/DNSCache.java diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index d577f5994d..1d26085db3 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -1188,6 +1188,8 @@ DEFINE_mInt32(table_sink_partition_write_max_partition_nums_per_writer, "128"); /** Hive sink configurations **/ DEFINE_mInt64(hive_sink_max_file_size, "1073741824"); // 1GB +DEFINE_mInt32(thrift_client_open_num_tries, "1"); + // clang-format off #ifdef BE_TEST // test s3 diff --git a/be/src/common/config.h b/be/src/common/config.h index 951fd62f87..5650db764f 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -1263,6 +1263,10 @@ DECLARE_mInt32(table_sink_partition_write_max_partition_nums_per_writer); /** Hive sink configurations **/ DECLARE_mInt64(hive_sink_max_file_size); // 1GB +// Number of open tries, default 1 means only try to open once. +// Retry the Open num_retries time waiting 100 milliseconds between retries. 
+DECLARE_mInt32(thrift_client_open_num_tries); + #ifdef BE_TEST // test s3 DECLARE_String(test_s3_resource); diff --git a/be/src/runtime/client_cache.cpp b/be/src/runtime/client_cache.cpp index 3da31caf5c..ea7b43b610 100644 --- a/be/src/runtime/client_cache.cpp +++ b/be/src/runtime/client_cache.cpp @@ -114,7 +114,7 @@ Status ClientCacheHelper::_create_client(const TNetworkAddress& hostport, client_impl->set_conn_timeout(config::thrift_connect_timeout_seconds * 1000); - Status status = client_impl->open(); + Status status = client_impl->open_with_retry(config::thrift_client_open_num_tries, 100); if (!status.ok()) { *client_key = nullptr; diff --git a/be/src/runtime/exec_env.h b/be/src/runtime/exec_env.h index ea665f3f5a..f8b2ecfa6a 100644 --- a/be/src/runtime/exec_env.h +++ b/be/src/runtime/exec_env.h @@ -105,6 +105,7 @@ class RowCache; class DummyLRUCache; class CacheManager; class WalManager; +class DNSCache; inline bool k_doris_exit = false; @@ -214,6 +215,8 @@ public: FileMetaCache* file_meta_cache() { return _file_meta_cache; } MemTableMemoryLimiter* memtable_memory_limiter() { return _memtable_memory_limiter.get(); } WalManager* wal_mgr() { return _wal_manager.get(); } + DNSCache* dns_cache() { return _dns_cache; } + #ifdef BE_TEST void set_ready() { this->_s_ready = true; } void set_not_ready() { this->_s_ready = false; } @@ -363,6 +366,7 @@ private: std::unique_ptr _load_stream_map_pool; std::unique_ptr _delta_writer_v2_pool; std::shared_ptr _wal_manager; + DNSCache* _dns_cache = nullptr; std::mutex _frontends_lock; // ip:brpc_port -> frontend_indo diff --git a/be/src/runtime/exec_env_init.cpp b/be/src/runtime/exec_env_init.cpp index 1da172716d..89040a342f 100644 --- a/be/src/runtime/exec_env_init.cpp +++ b/be/src/runtime/exec_env_init.cpp @@ -87,6 +87,7 @@ #include "util/brpc_client_cache.h" #include "util/cpu_info.h" #include "util/disk_info.h" +#include "util/dns_cache.h" #include "util/doris_metrics.h" #include "util/mem_info.h" #include "util/metrics.h" 
@@ -232,6 +233,7 @@ Status ExecEnv::_init(const std::vector& store_paths, _load_stream_map_pool = std::make_unique(); _delta_writer_v2_pool = std::make_unique(); _wal_manager = WalManager::create_shared(this, config::group_commit_wal_path); + _dns_cache = new DNSCache(); _spill_stream_mgr = new vectorized::SpillStreamManager(spill_store_paths); _backend_client_cache->init_metrics("backend"); @@ -554,6 +556,7 @@ void ExecEnv::destroy() { _delta_writer_v2_pool.reset(); _load_stream_map_pool.reset(); SAFE_STOP(_storage_engine); + SAFE_DELETE(_dns_cache); SAFE_STOP(_spill_stream_mgr); SAFE_SHUTDOWN(_buffered_reader_prefetch_thread_pool); SAFE_SHUTDOWN(_s3_file_upload_thread_pool); diff --git a/be/src/util/brpc_client_cache.h b/be/src/util/brpc_client_cache.h index ff53785085..7b313d6ae0 100644 --- a/be/src/util/brpc_client_cache.h +++ b/be/src/util/brpc_client_cache.h @@ -40,6 +40,8 @@ #include "common/compiler_util.h" // IWYU pragma: keep #include "common/config.h" +#include "runtime/exec_env.h" +#include "util/dns_cache.h" #include "util/network_util.h" namespace doris { @@ -79,7 +81,7 @@ public: std::string realhost; realhost = host; if (!is_valid_ip(host)) { - Status status = hostname_to_ip(host, realhost); + Status status = ExecEnv::GetInstance()->dns_cache()->get(host, &realhost); if (!status.ok()) { LOG(WARNING) << "failed to get ip from host:" << status.to_string(); return nullptr; diff --git a/be/src/util/dns_cache.cpp b/be/src/util/dns_cache.cpp new file mode 100644 index 0000000000..f2bd4ce91e --- /dev/null +++ b/be/src/util/dns_cache.cpp @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "util/dns_cache.h" + +#include "service/backend_options.h" +#include "util/network_util.h" + +namespace doris { + +DNSCache::DNSCache() { + refresh_thread = std::thread(&DNSCache::_refresh_cache, this); + refresh_thread.detach(); +} + +DNSCache::~DNSCache() { + stop_refresh = true; + if (refresh_thread.joinable()) { + refresh_thread.join(); + } +} + +Status DNSCache::get(const std::string& hostname, std::string* ip) { + { + std::shared_lock lock(mutex); + auto it = cache.find(hostname); + if (it != cache.end()) { + *ip = it->second; + return Status::OK(); + } + } + // Update if not found + RETURN_IF_ERROR(_update(hostname)); + { + std::shared_lock lock(mutex); + *ip = cache[hostname]; + return Status::OK(); + } +} + +Status DNSCache::_update(const std::string& hostname) { + std::string real_ip = ""; + RETURN_IF_ERROR(hostname_to_ip(hostname, real_ip, BackendOptions::is_bind_ipv6())); + std::unique_lock lock(mutex); + auto it = cache.find(hostname); + if (it == cache.end() || it->second != real_ip) { + cache[hostname] = real_ip; + LOG(INFO) << "update hostname " << hostname << "'s ip to " << real_ip; + } + return Status::OK(); +} + +void DNSCache::_refresh_cache() { + while (!stop_refresh) { + // refresh every 1 min + std::this_thread::sleep_for(std::chrono::minutes(1)); + std::unordered_set keys; + { + std::shared_lock lock(mutex); + std::transform(cache.begin(), cache.end(), std::inserter(keys, keys.end()), + [](const auto& pair) { return pair.first; }); + } + Status st; + for (auto& key : keys) { + st = _update(key); + } + } +} + +} 
// end of namespace doris diff --git a/be/src/util/dns_cache.h b/be/src/util/dns_cache.h new file mode 100644 index 0000000000..5dc413c53e --- /dev/null +++ b/be/src/util/dns_cache.h @@ -0,0 +1,57 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "common/status.h" + +namespace doris { + +// Same as +// fe/fe-core/src/main/java/org/apache/doris/common/DNSCache.java +class DNSCache { +public: + DNSCache(); + ~DNSCache(); + + // get ip by hostname + Status get(const std::string& hostname, std::string* ip); + +private: + // update the ip of hostname in cache + Status _update(const std::string& hostname); + + // a function for refresh daemon thread + // update cache at fix internal + void _refresh_cache(); + +private: + // hostname -> ip + std::unordered_map cache; + mutable std::shared_mutex mutex; + std::thread refresh_thread; + bool stop_refresh = false; +}; + +} // end of namespace doris diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java index 0f0eab8b67..dd6ec52bac 100755 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java @@ -98,6 +98,7 @@ import org.apache.doris.common.ClientPool; import org.apache.doris.common.Config; import org.apache.doris.common.ConfigBase; import org.apache.doris.common.ConfigException; +import org.apache.doris.common.DNSCache; import org.apache.doris.common.DdlException; import org.apache.doris.common.ErrorCode; import org.apache.doris.common.ErrorReport; @@ -526,6 +527,8 @@ public class Env { private InsertOverwriteManager insertOverwriteManager; + private DNSCache dnsCache; + public List getFrontendInfos() { List res = new ArrayList<>(); @@ -760,6 +763,7 @@ public class Env { "TopicPublisher", Config.publish_topic_info_interval_ms, systemInfo); this.mtmvService = new MTMVService(); this.insertOverwriteManager = new InsertOverwriteManager(); + this.dnsCache = new DNSCache(); } public static void destroyCheckpoint() { @@ -915,6 +919,10 @@ public class Env { return getCurrentEnv().getHiveTransactionMgr(); } + public DNSCache getDnsCache() 
{ + return dnsCache; + } + // Use tryLock to avoid potential dead lock private boolean tryLock(boolean mustLock) { while (true) { @@ -1685,6 +1693,8 @@ public class Env { if (Config.enable_hms_events_incremental_sync) { metastoreEventsProcessor.start(); } + + dnsCache.start(); } private void transferToNonMaster(FrontendNodeType newType) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/DNSCache.java b/fe/fe-core/src/main/java/org/apache/doris/common/DNSCache.java new file mode 100644 index 0000000000..1fe96eba20 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/common/DNSCache.java @@ -0,0 +1,95 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.common; + +import org.apache.doris.common.util.NetUtils; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import java.net.UnknownHostException; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ScheduledExecutorService; + +/** + * DNSCache is a class that caches DNS lookups and periodically refreshes them. + * It uses a ConcurrentHashMap to store the hostname to IP address mappings and a ScheduledExecutorService + * to periodically refresh these mappings. 
+ */ +public class DNSCache { + private static final Logger LOG = LogManager.getLogger(DNSCache.class); + + private final ConcurrentHashMap cache = new ConcurrentHashMap<>(); + private final ScheduledExecutorService executor = ThreadPoolManager.newDaemonScheduledThreadPool(1, + "dns_cache_pool", true); + + /** + * Check if the enable_fqdn_mode configuration is set. + * If it is, it schedules a task to refresh the DNS cache every 60 seconds, + * starting after an initial delay of 120 seconds. + */ + public void start() { + if (Config.enable_fqdn_mode) { + executor.scheduleAtFixedRate(this::refresh, 120, 60, java.util.concurrent.TimeUnit.SECONDS); + } + } + + /** + * The get method retrieves the IP address for a given hostname from the cache. + * If the hostname is not in the cache, it resolves the hostname to an IP address and stores it in the cache. + * + * @param hostname The hostname for which to get the IP address. + * @return The IP address for the given hostname. + */ + public String get(String hostname) { + return cache.computeIfAbsent(hostname, this::resolveHostname); + } + + /** + * The resolveHostname method resolves a hostname to an IP address. + * If the hostname cannot be resolved, it returns an empty string. + * + * @param hostname The hostname to resolve. + * @return The IP address for the given hostname, or an empty string if the hostname cannot be resolved. + */ + private String resolveHostname(String hostname) { + try { + return NetUtils.getIpByHost(hostname, 0); + } catch (UnknownHostException e) { + return ""; + } + } + + /** + * The refresh method periodically refreshes the DNS cache. + * It iterates over each hostname in the cache, resolves the hostname to an IP address, + * and compares it with the current IP address in the cache. + * If they are different, it updates the cache with the new IP address and logs the change. 
+ */ + private void refresh() { + for (String hostname : cache.keySet()) { + String resolvedHostname = resolveHostname(hostname); + String currentHostname = cache.get(hostname); + if (!resolvedHostname.equals(currentHostname)) { + cache.put(hostname, resolvedHostname); + LOG.info("IP for hostname {} has changed from {} to {}", hostname, currentHostname, + resolvedHostname); + } + } + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/util/NetUtils.java b/fe/fe-core/src/main/java/org/apache/doris/common/util/NetUtils.java index 0c1ac130cd..9b787f52bf 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/util/NetUtils.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/util/NetUtils.java @@ -95,9 +95,19 @@ public class NetUtils { return hostName; } - public static String getIpByHost(String host) throws UnknownHostException { - InetAddress inetAddress = InetAddress.getByName(host); - return inetAddress.getHostAddress(); + public static String getIpByHost(String host, int retryTimes) throws UnknownHostException { + InetAddress inetAddress; + while (true) { + try { + inetAddress = InetAddress.getByName(host); + return inetAddress.getHostAddress(); + } catch (UnknownHostException e) { + LOG.warn("Get IP by host failed, hostname: {}, remaining retryTimes: {}", host, retryTimes, e); + if (retryTimes-- <= 0) { + throw e; + } + } + } } // This is the implementation is inspired by Apache camel project: diff --git a/fe/fe-core/src/main/java/org/apache/doris/rpc/BackendServiceProxy.java b/fe/fe-core/src/main/java/org/apache/doris/rpc/BackendServiceProxy.java index d78e055a1a..af21194263 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/rpc/BackendServiceProxy.java +++ b/fe/fe-core/src/main/java/org/apache/doris/rpc/BackendServiceProxy.java @@ -17,9 +17,9 @@ package org.apache.doris.rpc; +import org.apache.doris.catalog.Env; import org.apache.doris.common.Config; import org.apache.doris.common.ThreadPoolManager; -import 
org.apache.doris.common.util.NetUtils; import org.apache.doris.metric.MetricRepo; import org.apache.doris.planner.PlanFragmentId; import org.apache.doris.proto.InternalService; @@ -112,7 +112,7 @@ public class BackendServiceProxy { } private BackendServiceClient getProxy(TNetworkAddress address) throws UnknownHostException { - String realIp = NetUtils.getIpByHost(address.getHostname()); + String realIp = Env.getCurrentEnv().getDnsCache().get(address.hostname); BackendServiceClientExtIp serviceClientExtIp = serviceMap.get(address); if (serviceClientExtIp != null && serviceClientExtIp.realIp.equals(realIp) && serviceClientExtIp.client.isNormalState()) {