[improvement](heartbeat) Add some relaxation strategies to reduce the failure probability of regression testing (#13568)

The regression test may occasionally fail because of heartbeat failures.
So I add 2 new FE configs to relax this limit (a condensed sketch of both follows the list):

1. `disable_backend_black_list`
    Set to true to not put a Backend on the black list even if we fail to send a task to it. Default is false.
2. `max_backend_heartbeat_failure_tolerance_count`
   A Backend is set as dead only if its consecutive heartbeat failure count reaches this config. Default is 1.
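
A condensed, hedged sketch of how these two configs gate the existing FE behavior (the real changes are in the diffs below; apart from the two config names, the class, field, and method names here are simplified stand-ins):

```java
import java.util.HashSet;
import java.util.Set;

// Illustration only: a stripped-down stand-in for the FE logic touched by this commit.
public class RelaxationSketch {
    // Mirror the two new FE configs (see the Config.java diff below).
    static boolean disableBackendBlackList = false;  // disable_backend_black_list
    static long maxHeartbeatFailureTolerance = 1;    // max_backend_heartbeat_failure_tolerance_count

    private final Set<Long> blackList = new HashSet<>();
    private int heartbeatFailureCounter = 0;
    private boolean alive = true;

    // Mirrors SimpleScheduler.addToBlacklist(): with the switch on, a failed task
    // send no longer blacklists the backend.
    void addToBlacklist(Long backendId) {
        if (backendId == null || disableBackendBlackList) {
            return;
        }
        blackList.add(backendId);
    }

    // Mirrors the Backend heartbeat handling: mark the backend dead only after the
    // consecutive failure count reaches the tolerance, and reset it on success.
    void onHeartbeat(boolean ok) {
        if (ok) {
            heartbeatFailureCounter = 0;
            alive = true;
        } else if (++heartbeatFailureCounter >= maxHeartbeatFailureTolerance) {
            alive = false;
        }
    }

    boolean isAlive() {
        return alive;
    }
}
```

With the defaults (`false` / `1`) the behavior is unchanged; a regression environment can enable the switch and raise the tolerance so that a single lost heartbeat or task-send error does not cascade into a wave of test failures.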
Mingyu Chen
2022-10-22 17:53:07 +08:00
committed by GitHub
parent 20ade4ae96
commit 413d2332ce
8 changed files with 95 additions and 7 deletions

View File

@ -2242,3 +2242,25 @@ Default: 100
Is it possible to dynamically configure: true
Is it a configuration item unique to the Master FE node: false
### `disable_backend_black_list`
Used to disable the BE blacklist function. After this function is disabled, a BE will not be added to the blacklist even if a query request to it fails.
This parameter is suitable for regression test environments, to reduce the chance that an occasional error causes a large number of regression test failures.
Default: false
Is it possible to configure dynamically: true
Is it a configuration item unique to the Master FE node: false
### `max_backend_heartbeat_failure_tolerance_count`
The maximum tolerable number of BE node heartbeat failures. If the number of consecutive heartbeat failures reaches this value, the BE state will be set to dead.
This parameter is suitable for regression test environments, to reduce the chance that occasional heartbeat failures cause a large number of regression test failures.
Default: 1
Is it possible to configure dynamically: true
Is it a configuration item unique to the Master FE node: true

View File

@ -2297,3 +2297,25 @@ The load label cleaner will run every `label_clean_interval_second` to clean
Is it possible to configure dynamically: true
Is it a configuration item unique to the Master FE node: false
### `disable_backend_black_list`
Used to disable the BE blacklist function. After this function is disabled, a BE will not be added to the blacklist even if a query request to it fails.
This parameter is suitable for regression test environments, to reduce the chance that an occasional error causes a large number of regression test failures.
Default: false
Is it possible to configure dynamically: true
Is it a configuration item unique to the Master FE node: false
### `max_backend_heartbeat_failure_tolerance_count`
The maximum tolerable number of BE node heartbeat failures. If the number of consecutive heartbeat failures reaches this value, the BE state will be set to dead.
This parameter is suitable for regression test environments, to reduce the chance that occasional heartbeat failures cause a large number of regression test failures.
Default: 1
Is it possible to configure dynamically: true
Is it a configuration item unique to the Master FE node: true

View File

@ -1794,4 +1794,21 @@ public class Config extends ConfigBase {
@ConfField(mutable = true, masterOnly = false)
public static int max_query_profile_num = 100;
/**
* Set to true to disable the backend black list, so that even if we fail to send a task to a backend,
* that backend won't be added to the black list.
* This should only be set when running tests, such as regression tests.
* It is highly recommended NOT to disable it in a production environment.
*/
@ConfField(mutable = true, masterOnly = false)
public static boolean disable_backend_black_list = false;
/**
* Maximum backend heartbeat failure tolerance count.
* Default is 1, which means if 1 heartbeat fails, the backend will be marked as dead.
* A larger value can improve the tolerance of the cluster to occasional heartbeat failures.
* For example, when running regression tests, this value can be increased.
*/
@ConfField(mutable = true, masterOnly = true)
public static long max_backend_heartbeat_failure_tolerance_count = 1;
}

View File

@ -52,6 +52,7 @@ public class BackendsProcDir implements ProcDirInterface {
.add("SystemDecommissioned").add("ClusterDecommissioned").add("TabletNum")
.add("DataUsedCapacity").add("AvailCapacity").add("TotalCapacity").add("UsedPct")
.add("MaxDiskUsedPct").add("RemoteUsedCapacity").add("Tag").add("ErrMsg").add("Version").add("Status")
.add("HeartbeatFailureCounter")
.build();
public static final int HOSTNAME_INDEX = 3;
@ -178,6 +179,8 @@ public class BackendsProcDir implements ProcDirInterface {
backendInfo.add(backend.getVersion());
// status
backendInfo.add(new Gson().toJson(backend.getBackendStatus()));
// heartbeat failure counter
backendInfo.add(backend.getHeartbeatFailureCounter());
comparableBackendInfos.add(backendInfo);
}
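
Because `HeartbeatFailureCounter` is appended after `Status`, the `Status` column moves from last to second-to-last position, which is what the `DemoMultiBackendsTest` change at the bottom of this commit accounts for. A small self-contained illustration (the trimmed column list here is hypothetical; only the last two names match the real `TITLE_NAMES`):

```java
import com.google.common.collect.ImmutableList;

public class ColumnIndexSketch {
    // Hypothetical trimmed-down version of BackendsProcDir.TITLE_NAMES.
    static final ImmutableList<String> TITLE_NAMES = ImmutableList.of(
            "Tag", "ErrMsg", "Version", "Status", "HeartbeatFailureCounter");

    public static void main(String[] args) {
        // "Status" is now at size() - 2, "HeartbeatFailureCounter" at size() - 1,
        // hence the index change from size() - 1 to size() - 2 in the test below.
        System.out.println(TITLE_NAMES.get(TITLE_NAMES.size() - 2)); // Status
        System.out.println(TITLE_NAMES.get(TITLE_NAMES.size() - 1)); // HeartbeatFailureCounter
    }
}
```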

View File

@ -2150,7 +2150,7 @@ public class Coordinator {
}
public boolean isBackendStateHealthy() {
if (backend.getLastMissingHeartbeatTime() > lastMissingHeartbeatTime) {
if (backend.getLastMissingHeartbeatTime() > lastMissingHeartbeatTime && !backend.isAlive()) {
LOG.warn("backend {} is down while joining the coordinator. job id: {}",
backend.getId(), jobId);
return false;
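
The extra `!backend.isAlive()` check is needed because, once `max_backend_heartbeat_failure_tolerance_count` is greater than 1, a newer `lastMissingHeartbeatTime` no longer implies that the backend has been marked dead. A minimal sketch of the relaxed check (parameter names are hypothetical; only the boolean logic mirrors the diff):

```java
public class BackendHealthSketch {
    // Returns false only when the backend has missed a heartbeat since the query
    // started AND has actually been marked dead by the heartbeat handling.
    static boolean isBackendStateHealthy(long backendLastMissingHeartbeatTime,
                                         long coordinatorLastMissingHeartbeatTime,
                                         boolean backendAlive) {
        boolean missedSinceQueryStart =
                backendLastMissingHeartbeatTime > coordinatorLastMissingHeartbeatTime;
        return !(missedSinceQueryStart && !backendAlive);
    }
}
```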

View File

@ -18,6 +18,7 @@
package org.apache.doris.qe;
import org.apache.doris.catalog.Env;
import org.apache.doris.common.Config;
import org.apache.doris.common.FeConstants;
import org.apache.doris.common.Pair;
import org.apache.doris.common.Reference;
@ -169,7 +170,9 @@ public class SimpleScheduler {
}
public static void addToBlacklist(Long backendID, String reason) {
if (backendID == null) {
if (backendID == null || Config.disable_backend_black_list) {
LOG.warn("ignore backend black list for backend: {}, disabled: {}", backendID,
Config.disable_backend_black_list);
return;
}

View File

@ -21,6 +21,7 @@ import org.apache.doris.alter.DecommissionType;
import org.apache.doris.catalog.DiskInfo;
import org.apache.doris.catalog.DiskInfo.DiskState;
import org.apache.doris.catalog.Env;
import org.apache.doris.common.Config;
import org.apache.doris.common.FeConstants;
import org.apache.doris.common.io.Text;
import org.apache.doris.common.io.Writable;
@ -128,6 +129,14 @@ public class Backend implements Writable {
@SerializedName("tagMap")
private Map<String, String> tagMap = Maps.newHashMap();
// Counter of heartbeat failures.
// Once a heartbeat fails, increase this counter by one.
// If it reaches Config.max_backend_heartbeat_failure_tolerance_count, this backend
// will be marked as dead.
// Once it comes back alive, reset this counter.
// No need to persist, because only the master FE handles heartbeats.
private int heartbeatFailureCounter = 0;
public Backend() {
this.host = "";
this.version = "";
@ -333,6 +342,10 @@ public class Backend implements Writable {
return backendStatus;
}
public int getHeartbeatFailureCounter() {
return heartbeatFailureCounter;
}
/**
* backend belong to some cluster
*
@ -690,12 +703,19 @@ public class Backend implements Writable {
}
heartbeatErrMsg = "";
this.heartbeatFailureCounter = 0;
} else {
if (isAlive.compareAndSet(true, false)) {
isChanged = true;
LOG.warn("{} is dead,", this.toString());
// Only set the backend to dead if the heartbeat failure counter reaches the threshold.
if (++this.heartbeatFailureCounter >= Config.max_backend_heartbeat_failure_tolerance_count) {
if (isAlive.compareAndSet(true, false)) {
isChanged = true;
LOG.warn("{} is dead,", this.toString());
}
}
// Still set the error msg and missing time even if we may not mark this backend as dead,
// to make debugging easier.
// But notice that if isChanged = false, these msgs will not be synced to other FEs.
heartbeatErrMsg = hbResponse.getMsg() == null ? "Unknown error" : hbResponse.getMsg();
lastMissingHeartbeatTime = System.currentTimeMillis();
}

View File

@ -199,8 +199,9 @@ public class DemoMultiBackendsTest {
ProcResult result = dir.fetchResult();
Assert.assertEquals(BackendsProcDir.TITLE_NAMES.size(), result.getColumnNames().size());
Assert.assertEquals("{\"location\" : \"default\"}", result.getRows().get(0).get(20));
Assert.assertEquals("{\"lastSuccessReportTabletsTime\":\"N/A\",\"lastStreamLoadTime\":-1,\"isQueryDisabled\":false,\"isLoadDisabled\":false}",
result.getRows().get(0).get(BackendsProcDir.TITLE_NAMES.size() - 1));
Assert.assertEquals(
"{\"lastSuccessReportTabletsTime\":\"N/A\",\"lastStreamLoadTime\":-1,\"isQueryDisabled\":false,\"isLoadDisabled\":false}",
result.getRows().get(0).get(BackendsProcDir.TITLE_NAMES.size() - 2));
}
private static void updateReplicaPathHash() {