[improvement](heartbeat) Add some relaxation strategies to reduce the failure probability of regression testing (#13568)

The regression test may occasionally fail because of heartbeat failures.
So this commit adds 2 new FE configs to relax this limit:

1. `disable_backend_black_list`
    Set to true to avoid putting a Backend on the black list even if we fail to send a task to it. Default is false.
2. `max_backend_heartbeat_failure_tolerance_count`
   A Backend is marked as dead only when its number of consecutive heartbeat failures reaches this value. Default is 1.
This commit is contained in:
Mingyu Chen
2022-10-22 17:53:07 +08:00
committed by GitHub
parent 20ade4ae96
commit 413d2332ce
8 changed files with 95 additions and 7 deletions

View File

@ -1794,4 +1794,21 @@ public class Config extends ConfigBase {
@ConfField(mutable = true, masterOnly = false)
public static int max_query_profile_num = 100;
/**
* Set to true to disable backend black list, so that even if we failed to send task to a backend,
* that backend won't be added to black list.
* This should only be set when running tests, such as regression test.
* It is highly recommended NOT to disable the black list in production environments.
*/
@ConfField(mutable = true, masterOnly = false)
public static boolean disable_backend_black_list = false;
/**
* Maximum backend heartbeat failure tolerance count.
* Default is 1, which means if 1 heartbeat fails, the backend will be marked as dead.
* A larger value can improve the tolerance of the cluster to occasional heartbeat failures.
* For example, when running regression tests, this value can be increased.
*/
@ConfField(mutable = true, masterOnly = true)
public static long max_backend_heartbeat_failure_tolerance_count = 1;
}

View File

@ -52,6 +52,7 @@ public class BackendsProcDir implements ProcDirInterface {
.add("SystemDecommissioned").add("ClusterDecommissioned").add("TabletNum")
.add("DataUsedCapacity").add("AvailCapacity").add("TotalCapacity").add("UsedPct")
.add("MaxDiskUsedPct").add("RemoteUsedCapacity").add("Tag").add("ErrMsg").add("Version").add("Status")
.add("HeartbeatFailureCounter")
.build();
public static final int HOSTNAME_INDEX = 3;
@ -178,6 +179,8 @@ public class BackendsProcDir implements ProcDirInterface {
backendInfo.add(backend.getVersion());
// status
backendInfo.add(new Gson().toJson(backend.getBackendStatus()));
// heartbeat failure counter
backendInfo.add(backend.getHeartbeatFailureCounter());
comparableBackendInfos.add(backendInfo);
}

View File

@ -2150,7 +2150,7 @@ public class Coordinator {
}
public boolean isBackendStateHealthy() {
if (backend.getLastMissingHeartbeatTime() > lastMissingHeartbeatTime) {
if (backend.getLastMissingHeartbeatTime() > lastMissingHeartbeatTime && !backend.isAlive()) {
LOG.warn("backend {} is down while joining the coordinator. job id: {}",
backend.getId(), jobId);
return false;

View File

@ -18,6 +18,7 @@
package org.apache.doris.qe;
import org.apache.doris.catalog.Env;
import org.apache.doris.common.Config;
import org.apache.doris.common.FeConstants;
import org.apache.doris.common.Pair;
import org.apache.doris.common.Reference;
@ -169,7 +170,9 @@ public class SimpleScheduler {
}
public static void addToBlacklist(Long backendID, String reason) {
if (backendID == null) {
if (backendID == null || Config.disable_backend_black_list) {
LOG.warn("ignore backend black list for backend: {}, disabled: {}", backendID,
Config.disable_backend_black_list);
return;
}

View File

@ -21,6 +21,7 @@ import org.apache.doris.alter.DecommissionType;
import org.apache.doris.catalog.DiskInfo;
import org.apache.doris.catalog.DiskInfo.DiskState;
import org.apache.doris.catalog.Env;
import org.apache.doris.common.Config;
import org.apache.doris.common.FeConstants;
import org.apache.doris.common.io.Text;
import org.apache.doris.common.io.Writable;
@ -128,6 +129,14 @@ public class Backend implements Writable {
@SerializedName("tagMap")
private Map<String, String> tagMap = Maps.newHashMap();
// Counter of heartbeat failure.
// Each time a heartbeat fails, this counter is increased by one.
// If it reaches Config.max_backend_heartbeat_failure_tolerance_count, this backend
// will be marked as dead.
// Once a heartbeat succeeds again, this counter is reset.
// No need to persist it, because only the master FE handles heartbeats.
private int heartbeatFailureCounter = 0;
public Backend() {
this.host = "";
this.version = "";
@ -333,6 +342,10 @@ public class Backend implements Writable {
return backendStatus;
}
/**
 * Returns the current heartbeat failure count of this backend.
 * The counter is incremented on each failed heartbeat and reset to 0
 * when a heartbeat succeeds.
 *
 * @return the current value of the heartbeat failure counter
 */
public int getHeartbeatFailureCounter() {
return heartbeatFailureCounter;
}
/**
* backend belong to some cluster
*
@ -690,12 +703,19 @@ public class Backend implements Writable {
}
heartbeatErrMsg = "";
this.heartbeatFailureCounter = 0;
} else {
if (isAlive.compareAndSet(true, false)) {
isChanged = true;
LOG.warn("{} is dead,", this.toString());
// Only set backend to dead if the heartbeat failure counter exceed threshold.
if (++this.heartbeatFailureCounter >= Config.max_backend_heartbeat_failure_tolerance_count) {
if (isAlive.compareAndSet(true, false)) {
isChanged = true;
LOG.warn("{} is dead,", this.toString());
}
}
// still set error msg and missing time even if we may not mark this backend as dead,
// for debug easily.
// But notice that if isChanged = false, these msg will not sync to other FE.
heartbeatErrMsg = hbResponse.getMsg() == null ? "Unknown error" : hbResponse.getMsg();
lastMissingHeartbeatTime = System.currentTimeMillis();
}

View File

@ -199,8 +199,9 @@ public class DemoMultiBackendsTest {
ProcResult result = dir.fetchResult();
Assert.assertEquals(BackendsProcDir.TITLE_NAMES.size(), result.getColumnNames().size());
Assert.assertEquals("{\"location\" : \"default\"}", result.getRows().get(0).get(20));
Assert.assertEquals("{\"lastSuccessReportTabletsTime\":\"N/A\",\"lastStreamLoadTime\":-1,\"isQueryDisabled\":false,\"isLoadDisabled\":false}",
result.getRows().get(0).get(BackendsProcDir.TITLE_NAMES.size() - 1));
Assert.assertEquals(
"{\"lastSuccessReportTabletsTime\":\"N/A\",\"lastStreamLoadTime\":-1,\"isQueryDisabled\":false,\"isLoadDisabled\":false}",
result.getRows().get(0).get(BackendsProcDir.TITLE_NAMES.size() - 2));
}
private static void updateReplicaPathHash() {