From 413d2332ce5a8bba423fd14ff6d56724ee6da385 Mon Sep 17 00:00:00 2001 From: Mingyu Chen Date: Sat, 22 Oct 2022 17:53:07 +0800 Subject: [PATCH] [improvement](heartbeat) Add some relaxation strategies to reduce the failure probability of regression testing (#13568) Regression tests may occasionally fail because of heartbeat failures. So I add 2 new FE configs to relax this limit 1. `disable_backend_black_list` Set to true to not put a Backend into the black list even if we failed to send a task to it. Default is false. 2. `max_backend_heartbeat_failure_tolerance_count` A Backend is set as dead only when its number of consecutive heartbeat failures reaches this value. Default is 1. --- docs/en/docs/admin-manual/config/fe-config.md | 22 ++++++++++++++++ .../docs/admin-manual/config/fe-config.md | 22 ++++++++++++++++ .../java/org/apache/doris/common/Config.java | 17 ++++++++++++ .../doris/common/proc/BackendsProcDir.java | 3 +++ .../java/org/apache/doris/qe/Coordinator.java | 2 +- .../org/apache/doris/qe/SimpleScheduler.java | 5 +++- .../java/org/apache/doris/system/Backend.java | 26 ++++++++++++++++--- .../doris/utframe/DemoMultiBackendsTest.java | 5 ++-- 8 files changed, 95 insertions(+), 7 deletions(-) diff --git a/docs/en/docs/admin-manual/config/fe-config.md b/docs/en/docs/admin-manual/config/fe-config.md index 674437e670..a4de6ececb 100644 --- a/docs/en/docs/admin-manual/config/fe-config.md +++ b/docs/en/docs/admin-manual/config/fe-config.md @@ -2242,3 +2242,25 @@ Default: 100 Is it possible to dynamically configure: true Is it a configuration item unique to the Master FE node: false + +### `disable_backend_black_list` + +Used to disable the BE blacklist function. After this function is disabled, if the query request to the BE fails, the BE will not be added to the blacklist. +This parameter is suitable for regression testing environments to reduce occasional bugs that cause a large number of regression tests to fail. 
+ +Default: false + +Is it possible to configure dynamically: true + +Is it a configuration item unique to the Master FE node: false + +### `max_backend_heartbeat_failure_tolerance_count` + +The maximum tolerable number of BE node heartbeat failures. If the number of consecutive heartbeat failures reaches this value, the BE state will be set to dead. +This parameter is suitable for regression test environments to reduce occasional heartbeat failures that cause a large number of regression test failures. + +Default: 1 + +Is it possible to configure dynamically: true + +Is it a configuration item unique to the Master FE node: true diff --git a/docs/zh-CN/docs/admin-manual/config/fe-config.md b/docs/zh-CN/docs/admin-manual/config/fe-config.md index bc00e0d649..6608b58845 100644 --- a/docs/zh-CN/docs/admin-manual/config/fe-config.md +++ b/docs/zh-CN/docs/admin-manual/config/fe-config.md @@ -2297,3 +2297,25 @@ load 标签清理器将每隔 `label_clean_interval_second` 运行一次以清 是否可以动态配置:true 是否为 Master FE 节点独有的配置项:false + +### `disable_backend_black_list` + +用于禁止BE黑名单功能。禁止该功能后,如果向BE发送查询请求失败,也不会将这个BE添加到黑名单。 +该参数适用于回归测试环境,以减少偶发的错误导致大量回归测试失败。 + +默认值:false + +是否可以动态配置:true + +是否为 Master FE 节点独有的配置项:false + +### `max_backend_heartbeat_failure_tolerance_count` + +最大可容忍的BE节点心跳失败次数。如果连续心跳失败次数达到这个值,则会将BE状态置为 dead。 +该参数适用于回归测试环境,以减少偶发的心跳失败导致大量回归测试失败。 + +默认值:1 + +是否可以动态配置:true + +是否为 Master FE 节点独有的配置项:true diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/Config.java b/fe/fe-core/src/main/java/org/apache/doris/common/Config.java index 871fd0470f..de1d8015ec 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/Config.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/Config.java @@ -1794,4 +1794,21 @@ public class Config extends ConfigBase { @ConfField(mutable = true, masterOnly = false) public static int max_query_profile_num = 100; + /** + * Set to true to disable backend black list, so that even if we failed to send task to a backend, + * that backend won't 
be added to black list. + * This should only be set when running tests, such as regression tests. + * Highly recommended NOT to disable the black list in production environments. + */ + @ConfField(mutable = true, masterOnly = false) + public static boolean disable_backend_black_list = false; + + /** + * Maximum backend heartbeat failure tolerance count. + * Default is 1, which means if 1 heartbeat fails, the backend will be marked as dead. + * A larger value can improve the tolerance of the cluster to occasional heartbeat failures. + * For example, when running regression tests, this value can be increased. + */ + @ConfField(mutable = true, masterOnly = true) + public static long max_backend_heartbeat_failure_tolerance_count = 1; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/proc/BackendsProcDir.java b/fe/fe-core/src/main/java/org/apache/doris/common/proc/BackendsProcDir.java index aa3c9821b0..096cf57c0b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/proc/BackendsProcDir.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/proc/BackendsProcDir.java @@ -52,6 +52,7 @@ public class BackendsProcDir implements ProcDirInterface { .add("SystemDecommissioned").add("ClusterDecommissioned").add("TabletNum") .add("DataUsedCapacity").add("AvailCapacity").add("TotalCapacity").add("UsedPct") .add("MaxDiskUsedPct").add("RemoteUsedCapacity").add("Tag").add("ErrMsg").add("Version").add("Status") + .add("HeartbeatFailureCounter") .build(); public static final int HOSTNAME_INDEX = 3; @@ -178,6 +179,8 @@ public class BackendsProcDir implements ProcDirInterface { backendInfo.add(backend.getVersion()); // status backendInfo.add(new Gson().toJson(backend.getBackendStatus())); + // heartbeat failure counter + backendInfo.add(backend.getHeartbeatFailureCounter()); comparableBackendInfos.add(backendInfo); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java b/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java index cd3296c354..13809d5633 
100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java @@ -2150,7 +2150,7 @@ public class Coordinator { } public boolean isBackendStateHealthy() { - if (backend.getLastMissingHeartbeatTime() > lastMissingHeartbeatTime) { + if (backend.getLastMissingHeartbeatTime() > lastMissingHeartbeatTime && !backend.isAlive()) { LOG.warn("backend {} is down while joining the coordinator. job id: {}", backend.getId(), jobId); return false; diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SimpleScheduler.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SimpleScheduler.java index 1b1cc6bfa4..69f4408848 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SimpleScheduler.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SimpleScheduler.java @@ -18,6 +18,7 @@ package org.apache.doris.qe; import org.apache.doris.catalog.Env; +import org.apache.doris.common.Config; import org.apache.doris.common.FeConstants; import org.apache.doris.common.Pair; import org.apache.doris.common.Reference; @@ -169,7 +170,9 @@ public class SimpleScheduler { } public static void addToBlacklist(Long backendID, String reason) { - if (backendID == null) { + if (backendID == null || Config.disable_backend_black_list) { + LOG.warn("ignore backend black list for backend: {}, disabled: {}", backendID, + Config.disable_backend_black_list); return; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/system/Backend.java b/fe/fe-core/src/main/java/org/apache/doris/system/Backend.java index fc5758374c..f705f2a2b1 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/system/Backend.java +++ b/fe/fe-core/src/main/java/org/apache/doris/system/Backend.java @@ -21,6 +21,7 @@ import org.apache.doris.alter.DecommissionType; import org.apache.doris.catalog.DiskInfo; import org.apache.doris.catalog.DiskInfo.DiskState; import org.apache.doris.catalog.Env; +import org.apache.doris.common.Config; import 
org.apache.doris.common.FeConstants; import org.apache.doris.common.io.Text; import org.apache.doris.common.io.Writable; @@ -128,6 +129,14 @@ public class Backend implements Writable { @SerializedName("tagMap") private Map tagMap = Maps.newHashMap(); + // Counter of heartbeat failures. + // Each time a heartbeat fails, increase this counter by one. + // And if it reaches Config.max_backend_heartbeat_failure_tolerance_count, this backend + // will be marked as dead. + // And once it is back to alive, reset this counter. + // No need to persist, because only master FE handles heartbeat. + private int heartbeatFailureCounter = 0; + public Backend() { this.host = ""; this.version = ""; @@ -333,6 +342,10 @@ public class Backend implements Writable { return backendStatus; } + public int getHeartbeatFailureCounter() { + return heartbeatFailureCounter; + } + /** * backend belong to some cluster * @@ -690,12 +703,19 @@ public class Backend implements Writable { } heartbeatErrMsg = ""; + this.heartbeatFailureCounter = 0; } else { - if (isAlive.compareAndSet(true, false)) { - isChanged = true; - LOG.warn("{} is dead,", this.toString()); + // Only set backend to dead if the heartbeat failure counter reaches the threshold. + if (++this.heartbeatFailureCounter >= Config.max_backend_heartbeat_failure_tolerance_count) { + if (isAlive.compareAndSet(true, false)) { + isChanged = true; + LOG.warn("{} is dead,", this.toString()); + } } + // still set error msg and missing time even if we may not mark this backend as dead, + // for easier debugging. + // But notice that if isChanged = false, these messages will not be synced to other FEs. heartbeatErrMsg = hbResponse.getMsg() == null ? 
"Unknown error" : hbResponse.getMsg(); lastMissingHeartbeatTime = System.currentTimeMillis(); } diff --git a/fe/fe-core/src/test/java/org/apache/doris/utframe/DemoMultiBackendsTest.java b/fe/fe-core/src/test/java/org/apache/doris/utframe/DemoMultiBackendsTest.java index 7f619f8267..860abe35e2 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/utframe/DemoMultiBackendsTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/utframe/DemoMultiBackendsTest.java @@ -199,8 +199,9 @@ public class DemoMultiBackendsTest { ProcResult result = dir.fetchResult(); Assert.assertEquals(BackendsProcDir.TITLE_NAMES.size(), result.getColumnNames().size()); Assert.assertEquals("{\"location\" : \"default\"}", result.getRows().get(0).get(20)); - Assert.assertEquals("{\"lastSuccessReportTabletsTime\":\"N/A\",\"lastStreamLoadTime\":-1,\"isQueryDisabled\":false,\"isLoadDisabled\":false}", - result.getRows().get(0).get(BackendsProcDir.TITLE_NAMES.size() - 1)); + Assert.assertEquals( + "{\"lastSuccessReportTabletsTime\":\"N/A\",\"lastStreamLoadTime\":-1,\"isQueryDisabled\":false,\"isLoadDisabled\":false}", + result.getRows().get(0).get(BackendsProcDir.TITLE_NAMES.size() - 2)); } private static void updateReplicaPathHash() {