[improvement](heartbeat) Add some relaxation strategies to reduce the failure probability of regression testing (#13568)
The regression test may occasionally fail because of heartbeat failures.
So I added 2 new FE configs to relax this limit:
1. `disable_backend_black_list`
Set to true to avoid putting a Backend on the black list even if we fail to send a task to it. Default is false.
2. `max_backend_heartbeat_failure_tolerance_count`
A Backend is marked as dead only if its number of consecutive heartbeat failures exceeds this config. Default is 1.
This commit is contained in:
@ -1794,4 +1794,21 @@ public class Config extends ConfigBase {
|
||||
@ConfField(mutable = true, masterOnly = false)
|
||||
public static int max_query_profile_num = 100;
|
||||
|
||||
/**
|
||||
* Set to true to disable backend black list, so that even if we failed to send task to a backend,
|
||||
* that backend won't be added to black list.
|
||||
* This should only be set when running tests, such as regression test.
|
||||
* Highly recommended NOT disable it in product environment.
|
||||
*/
|
||||
@ConfField(mutable = true, masterOnly = false)
|
||||
public static boolean disable_backend_black_list = false;
|
||||
|
||||
/**
|
||||
* Maximum backend heartbeat failure tolerance count.
|
||||
 * Default is 1, which means if 1 heartbeat failed, the backend will be marked as dead.
|
||||
* A larger value can improve the tolerance of the cluster to occasional heartbeat failures.
|
||||
* For example, when running regression tests, this value can be increased.
|
||||
*/
|
||||
@ConfField(mutable = true, masterOnly = true)
|
||||
public static long max_backend_heartbeat_failure_tolerance_count = 1;
|
||||
}
|
||||
|
||||
@ -52,6 +52,7 @@ public class BackendsProcDir implements ProcDirInterface {
|
||||
.add("SystemDecommissioned").add("ClusterDecommissioned").add("TabletNum")
|
||||
.add("DataUsedCapacity").add("AvailCapacity").add("TotalCapacity").add("UsedPct")
|
||||
.add("MaxDiskUsedPct").add("RemoteUsedCapacity").add("Tag").add("ErrMsg").add("Version").add("Status")
|
||||
.add("HeartbeatFailureCounter")
|
||||
.build();
|
||||
|
||||
public static final int HOSTNAME_INDEX = 3;
|
||||
@ -178,6 +179,8 @@ public class BackendsProcDir implements ProcDirInterface {
|
||||
backendInfo.add(backend.getVersion());
|
||||
// status
|
||||
backendInfo.add(new Gson().toJson(backend.getBackendStatus()));
|
||||
// heartbeat failure counter
|
||||
backendInfo.add(backend.getHeartbeatFailureCounter());
|
||||
|
||||
comparableBackendInfos.add(backendInfo);
|
||||
}
|
||||
|
||||
@ -2150,7 +2150,7 @@ public class Coordinator {
|
||||
}
|
||||
|
||||
public boolean isBackendStateHealthy() {
|
||||
if (backend.getLastMissingHeartbeatTime() > lastMissingHeartbeatTime) {
|
||||
if (backend.getLastMissingHeartbeatTime() > lastMissingHeartbeatTime && !backend.isAlive()) {
|
||||
LOG.warn("backend {} is down while joining the coordinator. job id: {}",
|
||||
backend.getId(), jobId);
|
||||
return false;
|
||||
|
||||
@ -18,6 +18,7 @@
|
||||
package org.apache.doris.qe;
|
||||
|
||||
import org.apache.doris.catalog.Env;
|
||||
import org.apache.doris.common.Config;
|
||||
import org.apache.doris.common.FeConstants;
|
||||
import org.apache.doris.common.Pair;
|
||||
import org.apache.doris.common.Reference;
|
||||
@ -169,7 +170,9 @@ public class SimpleScheduler {
|
||||
}
|
||||
|
||||
public static void addToBlacklist(Long backendID, String reason) {
|
||||
if (backendID == null) {
|
||||
if (backendID == null || Config.disable_backend_black_list) {
|
||||
LOG.warn("ignore backend black list for backend: {}, disabled: {}", backendID,
|
||||
Config.disable_backend_black_list);
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
@ -21,6 +21,7 @@ import org.apache.doris.alter.DecommissionType;
|
||||
import org.apache.doris.catalog.DiskInfo;
|
||||
import org.apache.doris.catalog.DiskInfo.DiskState;
|
||||
import org.apache.doris.catalog.Env;
|
||||
import org.apache.doris.common.Config;
|
||||
import org.apache.doris.common.FeConstants;
|
||||
import org.apache.doris.common.io.Text;
|
||||
import org.apache.doris.common.io.Writable;
|
||||
@ -128,6 +129,14 @@ public class Backend implements Writable {
|
||||
@SerializedName("tagMap")
|
||||
private Map<String, String> tagMap = Maps.newHashMap();
|
||||
|
||||
// Counter of heartbeat failure.
|
||||
// Once a heartbeat failed, increase this counter by one.
|
||||
// And if it reaches Config.max_backend_heartbeat_failure_tolerance_count, this backend
|
||||
// will be marked as dead.
|
||||
// And once it back to alive, reset this counter.
|
||||
// No need to persist, because only master FE handle heartbeat.
|
||||
private int heartbeatFailureCounter = 0;
|
||||
|
||||
public Backend() {
|
||||
this.host = "";
|
||||
this.version = "";
|
||||
@ -333,6 +342,10 @@ public class Backend implements Writable {
|
||||
return backendStatus;
|
||||
}
|
||||
|
||||
public int getHeartbeatFailureCounter() {
|
||||
return heartbeatFailureCounter;
|
||||
}
|
||||
|
||||
/**
|
||||
* backend belong to some cluster
|
||||
*
|
||||
@ -690,12 +703,19 @@ public class Backend implements Writable {
|
||||
}
|
||||
|
||||
heartbeatErrMsg = "";
|
||||
this.heartbeatFailureCounter = 0;
|
||||
} else {
|
||||
if (isAlive.compareAndSet(true, false)) {
|
||||
isChanged = true;
|
||||
LOG.warn("{} is dead,", this.toString());
|
||||
// Only set backend to dead if the heartbeat failure counter exceed threshold.
|
||||
if (++this.heartbeatFailureCounter >= Config.max_backend_heartbeat_failure_tolerance_count) {
|
||||
if (isAlive.compareAndSet(true, false)) {
|
||||
isChanged = true;
|
||||
LOG.warn("{} is dead,", this.toString());
|
||||
}
|
||||
}
|
||||
|
||||
// still set error msg and missing time even if we may not mark this backend as dead,
|
||||
// for debug easily.
|
||||
// But notice that if isChanged = false, these msg will not sync to other FE.
|
||||
heartbeatErrMsg = hbResponse.getMsg() == null ? "Unknown error" : hbResponse.getMsg();
|
||||
lastMissingHeartbeatTime = System.currentTimeMillis();
|
||||
}
|
||||
|
||||
@ -199,8 +199,9 @@ public class DemoMultiBackendsTest {
|
||||
ProcResult result = dir.fetchResult();
|
||||
Assert.assertEquals(BackendsProcDir.TITLE_NAMES.size(), result.getColumnNames().size());
|
||||
Assert.assertEquals("{\"location\" : \"default\"}", result.getRows().get(0).get(20));
|
||||
Assert.assertEquals("{\"lastSuccessReportTabletsTime\":\"N/A\",\"lastStreamLoadTime\":-1,\"isQueryDisabled\":false,\"isLoadDisabled\":false}",
|
||||
result.getRows().get(0).get(BackendsProcDir.TITLE_NAMES.size() - 1));
|
||||
Assert.assertEquals(
|
||||
"{\"lastSuccessReportTabletsTime\":\"N/A\",\"lastStreamLoadTime\":-1,\"isQueryDisabled\":false,\"isLoadDisabled\":false}",
|
||||
result.getRows().get(0).get(BackendsProcDir.TITLE_NAMES.size() - 2));
|
||||
}
|
||||
|
||||
private static void updateReplicaPathHash() {
|
||||
|
||||
Reference in New Issue
Block a user