[improvement](heartbeat) Add some relaxation strategies to reduce the failure probability of regression testing (#13568)

The regression test may occasionally fail because of heartbeat failures.
So I add 2 new FE configs to relax this limit (a condensed sketch of both follows the list):

1. `disable_backend_black_list`
    Set to true to not put a Backend on the black list even if we fail to send a task to it. Default is false.
2. `max_backend_heartbeat_failure_tolerance_count`
   A Backend is set as dead only if its consecutive heartbeat failure count reaches this config. Default is 1.
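
A condensed, hedged sketch of how these two configs gate the existing FE behavior (the real changes are in the diffs below; apart from the two config names, the class, field, and method names here are simplified stand-ins):

```java
import java.util.HashSet;
import java.util.Set;

// Illustration only: a stripped-down stand-in for the FE logic touched by this commit.
public class RelaxationSketch {
    // Mirror the two new FE configs (see the Config.java diff below).
    static boolean disableBackendBlackList = false;  // disable_backend_black_list
    static long maxHeartbeatFailureTolerance = 1;    // max_backend_heartbeat_failure_tolerance_count

    private final Set<Long> blackList = new HashSet<>();
    private int heartbeatFailureCounter = 0;
    private boolean alive = true;

    // Mirrors SimpleScheduler.addToBlacklist(): with the switch on, a failed task
    // send no longer blacklists the backend.
    void addToBlacklist(Long backendId) {
        if (backendId == null || disableBackendBlackList) {
            return;
        }
        blackList.add(backendId);
    }

    // Mirrors the Backend heartbeat handling: mark the backend dead only after the
    // consecutive failure count reaches the tolerance, and reset it on success.
    void onHeartbeat(boolean ok) {
        if (ok) {
            heartbeatFailureCounter = 0;
            alive = true;
        } else if (++heartbeatFailureCounter >= maxHeartbeatFailureTolerance) {
            alive = false;
        }
    }

    boolean isAlive() {
        return alive;
    }
}
```

With the defaults (`false` / `1`) the behavior is unchanged; a regression environment can enable the switch and raise the tolerance so that a single lost heartbeat or task-send error does not cascade into a wave of test failures.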
Mingyu Chen
2022-10-22 17:53:07 +08:00
committed by GitHub
parent 20ade4ae96
commit 413d2332ce
8 changed files with 95 additions and 7 deletions

View File

@ -2242,3 +2242,25 @@ Default: 100
Is it possible to dynamically configure: true
Is it a configuration item unique to the Master FE node: false
### `disable_backend_black_list`
Used to disable the BE blacklist function. After this function is disabled, a BE will not be added to the blacklist even if a query request to it fails.
This parameter is suitable for regression test environments, to reduce the chance that an occasional error causes a large number of regression test failures.
Default: false
Is it possible to configure dynamically: true
Is it a configuration item unique to the Master FE node: false
### `max_backend_heartbeat_failure_tolerance_count`
The maximum tolerable number of BE node heartbeat failures. If the number of consecutive heartbeat failures reaches this value, the BE state will be set to dead.
This parameter is suitable for regression test environments, to reduce the chance that occasional heartbeat failures cause a large number of regression test failures.
Default: 1
Is it possible to configure dynamically: true
Is it a configuration item unique to the Master FE node: true

View File

@ -2297,3 +2297,25 @@ The load label cleaner will run every `label_clean_interval_second` to clean
Is it possible to configure dynamically: true
Is it a configuration item unique to the Master FE node: false
### `disable_backend_black_list`
Used to disable the BE blacklist function. After this function is disabled, a BE will not be added to the blacklist even if a query request to it fails.
This parameter is suitable for regression test environments, to reduce the chance that an occasional error causes a large number of regression test failures.
Default: false
Is it possible to configure dynamically: true
Is it a configuration item unique to the Master FE node: false
### `max_backend_heartbeat_failure_tolerance_count`
The maximum tolerable number of BE node heartbeat failures. If the number of consecutive heartbeat failures reaches this value, the BE state will be set to dead.
This parameter is suitable for regression test environments, to reduce the chance that occasional heartbeat failures cause a large number of regression test failures.
Default: 1
Is it possible to configure dynamically: true
Is it a configuration item unique to the Master FE node: true

View File

@ -1794,4 +1794,21 @@ public class Config extends ConfigBase {
@ConfField(mutable = true, masterOnly = false)
public static int max_query_profile_num = 100;
/**
* Set to true to disable the backend black list, so that even if we fail to send a task to a backend,
* that backend won't be added to the black list.
* This should only be set when running tests, such as regression tests.
* It is highly recommended NOT to disable it in a production environment.
*/
@ConfField(mutable = true, masterOnly = false)
public static boolean disable_backend_black_list = false;
/**
* Maximum backend heartbeat failure tolerance count.
* Default is 1, which means if 1 heartbeat fails, the backend will be marked as dead.
* A larger value can improve the tolerance of the cluster to occasional heartbeat failures.
* For example, when running regression tests, this value can be increased.
*/
@ConfField(mutable = true, masterOnly = true)
public static long max_backend_heartbeat_failure_tolerance_count = 1;
}

View File

@ -52,6 +52,7 @@ public class BackendsProcDir implements ProcDirInterface {
.add("SystemDecommissioned").add("ClusterDecommissioned").add("TabletNum")
.add("DataUsedCapacity").add("AvailCapacity").add("TotalCapacity").add("UsedPct")
.add("MaxDiskUsedPct").add("RemoteUsedCapacity").add("Tag").add("ErrMsg").add("Version").add("Status")
.add("HeartbeatFailureCounter")
.build();
public static final int HOSTNAME_INDEX = 3;
@ -178,6 +179,8 @@ public class BackendsProcDir implements ProcDirInterface {
backendInfo.add(backend.getVersion());
// status
backendInfo.add(new Gson().toJson(backend.getBackendStatus()));
// heartbeat failure counter
backendInfo.add(backend.getHeartbeatFailureCounter());
comparableBackendInfos.add(backendInfo);
}
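
Because `HeartbeatFailureCounter` is appended after `Status`, the `Status` column moves from last to second-to-last position, which is what the `DemoMultiBackendsTest` change at the bottom of this commit accounts for. A small self-contained illustration (the trimmed column list here is hypothetical; only the last two names match the real `TITLE_NAMES`):

```java
import com.google.common.collect.ImmutableList;

public class ColumnIndexSketch {
    // Hypothetical trimmed-down version of BackendsProcDir.TITLE_NAMES.
    static final ImmutableList<String> TITLE_NAMES = ImmutableList.of(
            "Tag", "ErrMsg", "Version", "Status", "HeartbeatFailureCounter");

    public static void main(String[] args) {
        // "Status" is now at size() - 2, "HeartbeatFailureCounter" at size() - 1,
        // hence the index change from size() - 1 to size() - 2 in the test below.
        System.out.println(TITLE_NAMES.get(TITLE_NAMES.size() - 2)); // Status
        System.out.println(TITLE_NAMES.get(TITLE_NAMES.size() - 1)); // HeartbeatFailureCounter
    }
}
```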

View File

@ -2150,7 +2150,7 @@ public class Coordinator {
}
public boolean isBackendStateHealthy() {
if (backend.getLastMissingHeartbeatTime() > lastMissingHeartbeatTime) {
if (backend.getLastMissingHeartbeatTime() > lastMissingHeartbeatTime && !backend.isAlive()) {
LOG.warn("backend {} is down while joining the coordinator. job id: {}",
backend.getId(), jobId);
return false;
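
The extra `!backend.isAlive()` check is needed because, once `max_backend_heartbeat_failure_tolerance_count` is greater than 1, a newer `lastMissingHeartbeatTime` no longer implies that the backend has been marked dead. A minimal sketch of the relaxed check (parameter names are hypothetical; only the boolean logic mirrors the diff):

```java
public class BackendHealthSketch {
    // Returns false only when the backend has missed a heartbeat since the query
    // started AND has actually been marked dead by the heartbeat handling.
    static boolean isBackendStateHealthy(long backendLastMissingHeartbeatTime,
                                         long coordinatorLastMissingHeartbeatTime,
                                         boolean backendAlive) {
        boolean missedSinceQueryStart =
                backendLastMissingHeartbeatTime > coordinatorLastMissingHeartbeatTime;
        return !(missedSinceQueryStart && !backendAlive);
    }
}
```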

View File

@ -18,6 +18,7 @@
package org.apache.doris.qe;
import org.apache.doris.catalog.Env;
import org.apache.doris.common.Config;
import org.apache.doris.common.FeConstants;
import org.apache.doris.common.Pair;
import org.apache.doris.common.Reference;
@ -169,7 +170,9 @@ public class SimpleScheduler {
}
public static void addToBlacklist(Long backendID, String reason) {
if (backendID == null) {
if (backendID == null || Config.disable_backend_black_list) {
LOG.warn("ignore backend black list for backend: {}, disabled: {}", backendID,
Config.disable_backend_black_list);
return;
}

View File

@ -21,6 +21,7 @@ import org.apache.doris.alter.DecommissionType;
import org.apache.doris.catalog.DiskInfo;
import org.apache.doris.catalog.DiskInfo.DiskState;
import org.apache.doris.catalog.Env;
import org.apache.doris.common.Config;
import org.apache.doris.common.FeConstants;
import org.apache.doris.common.io.Text;
import org.apache.doris.common.io.Writable;
@ -128,6 +129,14 @@ public class Backend implements Writable {
@SerializedName("tagMap")
private Map<String, String> tagMap = Maps.newHashMap();
// Counter of heartbeat failures.
// Once a heartbeat fails, increase this counter by one.
// If it reaches Config.max_backend_heartbeat_failure_tolerance_count, this backend
// will be marked as dead.
// Once it comes back alive, reset this counter.
// No need to persist, because only the master FE handles heartbeats.
private int heartbeatFailureCounter = 0;
public Backend() {
this.host = "";
this.version = "";
@ -333,6 +342,10 @@ public class Backend implements Writable {
return backendStatus;
}
public int getHeartbeatFailureCounter() {
return heartbeatFailureCounter;
}
/**
* backend belong to some cluster
*
@ -690,12 +703,19 @@ public class Backend implements Writable {
}
heartbeatErrMsg = "";
this.heartbeatFailureCounter = 0;
} else {
if (isAlive.compareAndSet(true, false)) {
isChanged = true;
LOG.warn("{} is dead,", this.toString());
// Only set the backend to dead if the heartbeat failure counter reaches the threshold.
if (++this.heartbeatFailureCounter >= Config.max_backend_heartbeat_failure_tolerance_count) {
if (isAlive.compareAndSet(true, false)) {
isChanged = true;
LOG.warn("{} is dead,", this.toString());
}
}
// Still set the error msg and missing time even if we may not mark this backend as dead,
// to make debugging easier.
// But notice that if isChanged = false, these msgs will not be synced to other FEs.
heartbeatErrMsg = hbResponse.getMsg() == null ? "Unknown error" : hbResponse.getMsg();
lastMissingHeartbeatTime = System.currentTimeMillis();
}

View File

@ -199,8 +199,9 @@ public class DemoMultiBackendsTest {
ProcResult result = dir.fetchResult();
Assert.assertEquals(BackendsProcDir.TITLE_NAMES.size(), result.getColumnNames().size());
Assert.assertEquals("{\"location\" : \"default\"}", result.getRows().get(0).get(20));
Assert.assertEquals("{\"lastSuccessReportTabletsTime\":\"N/A\",\"lastStreamLoadTime\":-1,\"isQueryDisabled\":false,\"isLoadDisabled\":false}",
result.getRows().get(0).get(BackendsProcDir.TITLE_NAMES.size() - 1));
Assert.assertEquals(
"{\"lastSuccessReportTabletsTime\":\"N/A\",\"lastStreamLoadTime\":-1,\"isQueryDisabled\":false,\"isLoadDisabled\":false}",
result.getRows().get(0).get(BackendsProcDir.TITLE_NAMES.size() - 2));
}
private static void updateReplicaPathHash() {