From 4ee661202e7199e9f254983b953098ae0e58d4e4 Mon Sep 17 00:00:00 2001 From: HHoflittlefish777 <77738092+HHoflittlefish777@users.noreply.github.com> Date: Thu, 21 Dec 2023 14:01:05 +0800 Subject: [PATCH] [improve](transaction) extend abort transaction time (#28662) --- docs/en/docs/admin-manual/config/fe-config.md | 10 ++++++++++ docs/zh-CN/docs/admin-manual/config/fe-config.md | 10 ++++++++++ .../src/main/java/org/apache/doris/common/Config.java | 7 +++++++ .../java/org/apache/doris/system/HeartbeatMgr.java | 3 ++- 4 files changed, 29 insertions(+), 1 deletion(-) diff --git a/docs/en/docs/admin-manual/config/fe-config.md b/docs/en/docs/admin-manual/config/fe-config.md index 2fd5b58d9f..b408a8156f 100644 --- a/docs/en/docs/admin-manual/config/fe-config.md +++ b/docs/en/docs/admin-manual/config/fe-config.md @@ -587,6 +587,16 @@ Is it possible to configure dynamically: true Whether it is a configuration item unique to the Master FE node: true +#### `abort_txn_after_lost_heartbeat_time_second` + +Time, in seconds, to wait after a BE loses its heartbeat before the transactions of that BE are aborted. The default value is 300, which means the transactions of a BE are aborted once its heartbeat has been lost for 300 seconds. 
+ +Default: 300(s) + +Is it possible to configure dynamically: true + +Whether it is a configuration item unique to the Master FE node: true + #### `enable_access_file_without_broker` Default:false diff --git a/docs/zh-CN/docs/admin-manual/config/fe-config.md b/docs/zh-CN/docs/admin-manual/config/fe-config.md index 47bcc9fba9..cdc8295557 100644 --- a/docs/zh-CN/docs/admin-manual/config/fe-config.md +++ b/docs/zh-CN/docs/admin-manual/config/fe-config.md @@ -587,6 +587,16 @@ FE向BE的BackendService发送rpc请求时的超时时间,单位:毫秒。 是否为 Master FE 节点独有的配置项:true +#### `abort_txn_after_lost_heartbeat_time_second` + +丢失be心跳后丢弃be事务的时间。默认时间为三百秒,当三百秒fe没有接收到be心跳时,会丢弃该be的所有事务。 + +默认值:300(秒) + +是否可以动态配置:true + +是否为 Master FE 节点独有的配置项:true + #### `enable_access_file_without_broker` 默认值:false diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java index 8c90f6a516..962bf219e4 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java +++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java @@ -1782,6 +1782,13 @@ public class Config extends ConfigBase { @ConfField(mutable = true, masterOnly = true) public static long max_backend_heartbeat_failure_tolerance_count = 1; + /** + * Time, in seconds, to wait after a BE loses its heartbeat before its transactions are aborted. + * Default is 300: a BE's transactions are aborted once its heartbeat has been lost for 300 seconds. + */ + @ConfField(mutable = true, masterOnly = true) + public static int abort_txn_after_lost_heartbeat_time_second = 300; + /** * Heartbeat interval in seconds. * Default is 10, which means every 10 seconds, the master will send a heartbeat to all backends. 
diff --git a/fe/fe-core/src/main/java/org/apache/doris/system/HeartbeatMgr.java b/fe/fe-core/src/main/java/org/apache/doris/system/HeartbeatMgr.java index 4282ad9aa3..5d17846476 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/system/HeartbeatMgr.java +++ b/fe/fe-core/src/main/java/org/apache/doris/system/HeartbeatMgr.java @@ -174,7 +174,8 @@ public class HeartbeatMgr extends MasterDaemon { if (hbResponse.getStatus() != HbStatus.OK) { // invalid all connections cached in ClientPool ClientPool.backendPool.clearPool(new TNetworkAddress(be.getHost(), be.getBePort())); - if (!isReplay && System.currentTimeMillis() - be.getLastUpdateMs() > 60 * 1000L) { + if (!isReplay && System.currentTimeMillis() - be.getLastUpdateMs() + >= Config.abort_txn_after_lost_heartbeat_time_second * 1000L) { Env.getCurrentGlobalTransactionMgr() .abortTxnWhenCoordinateBeDown(be.getHost(), 100); }