From c2b483529c74826aff079f801b1e7a28bd6789fa Mon Sep 17 00:00:00 2001 From: Mingyu Chen Date: Tue, 4 Jul 2023 17:12:53 +0800 Subject: [PATCH] [fix](heartbeat) need to set backend status base on edit log (#21410) A non-master FE must set a Backend's status based on the content of the edit log. There is a bug: if the FE config `max_backend_heartbeat_failure_tolerance_count` is set larger than one, a non-master FE will not mark the Backend as dead until it has received enough heartbeat edit logs, which is wrong. This causes the Backend to be dead on the Master FE but still alive on non-master FEs. --- .../src/main/java/org/apache/doris/system/Backend.java | 5 +++-- .../src/main/java/org/apache/doris/system/HeartbeatMgr.java | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/system/Backend.java b/fe/fe-core/src/main/java/org/apache/doris/system/Backend.java index fc719c21f4..c6a06d1b13 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/system/Backend.java +++ b/fe/fe-core/src/main/java/org/apache/doris/system/Backend.java @@ -586,7 +586,7 @@ public class Backend implements Writable { * handle Backend's heartbeat response. * return true if any port changed, or alive state is changed. */ - public boolean handleHbResponse(BackendHbResponse hbResponse) { + public boolean handleHbResponse(BackendHbResponse hbResponse, boolean isReplay) { boolean isChanged = false; if (hbResponse.getStatus() == HbStatus.OK) { if (!this.version.equals(hbResponse.getVersion())) { @@ -632,7 +632,8 @@ public class Backend implements Writable { this.heartbeatFailureCounter = 0; } else { // Only set backend to dead if the heartbeat failure counter exceed threshold. - if (++this.heartbeatFailureCounter >= Config.max_backend_heartbeat_failure_tolerance_count) { + // And if it is a replay process, must set backend to dead. 
+ if (isReplay || ++this.heartbeatFailureCounter >= Config.max_backend_heartbeat_failure_tolerance_count) { if (isAlive.compareAndSet(true, false)) { isChanged = true; LOG.warn("{} is dead,", this.toString()); diff --git a/fe/fe-core/src/main/java/org/apache/doris/system/HeartbeatMgr.java b/fe/fe-core/src/main/java/org/apache/doris/system/HeartbeatMgr.java index ffd0269070..dc4c28217b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/system/HeartbeatMgr.java +++ b/fe/fe-core/src/main/java/org/apache/doris/system/HeartbeatMgr.java @@ -166,7 +166,7 @@ public class HeartbeatMgr extends MasterDaemon { BackendHbResponse hbResponse = (BackendHbResponse) response; Backend be = nodeMgr.getBackend(hbResponse.getBeId()); if (be != null) { - boolean isChanged = be.handleHbResponse(hbResponse); + boolean isChanged = be.handleHbResponse(hbResponse, isReplay); if (hbResponse.getStatus() != HbStatus.OK) { // invalid all connections cached in ClientPool ClientPool.backendPool.clearPool(new TNetworkAddress(be.getHost(), be.getBePort()));