[enhance](blacklist) seperate blacklist conf from heartbeat (#28638)

There is a circuit breaker lasting for 2 minutes in grpc, then if a be is down and up again, send fragments to the be fails lasting for 2 minutes.
This commit is contained in:
Yongqiang YANG
2023-12-21 00:17:45 +08:00
committed by GitHub
parent 18ad8562f2
commit a443a39e2c
7 changed files with 22 additions and 10 deletions

View File

@ -1782,6 +1782,20 @@ public class Config extends ConfigBase {
@ConfField(mutable = true, masterOnly = true)
public static long max_backend_heartbeat_failure_tolerance_count = 1;
/**
* Heartbeat interval in seconds.
* Default is 10, which means every 10 seconds, the master will send a heartbeat to all backends.
*/
@ConfField(mutable = false, masterOnly = false)
public static int heartbeat_interval_second = 10;
/**
* After a backend is marked as unavailable, it will be added to blacklist.
* Default is 120.
*/
@ConfField(mutable = true, masterOnly = false)
public static int blacklist_duration_second = 120;
@ConfField(mutable = true, masterOnly = false, description = {
"禁止创建odbc, mysql, broker类型的外表", "Disallow the creation of odbc, mysql, broker type external tables"})
public static boolean enable_odbc_mysql_broker_table = false;

View File

@ -2424,7 +2424,7 @@ public class Env {
}
public void createFeDiskUpdater() {
feDiskUpdater = new Daemon("feDiskUpdater", FeConstants.heartbeat_interval_second * 1000L) {
feDiskUpdater = new Daemon("feDiskUpdater", Config.heartbeat_interval_second * 1000L) {
@Override
protected void runOneCycle() {
ExecuteEnv.getInstance().refreshAndGetDiskInfo(true);

View File

@ -27,7 +27,7 @@ import org.apache.commons.pool2.impl.GenericKeyedObjectPoolConfig;
public class ClientPool {
static GenericKeyedObjectPoolConfig heartbeatConfig = new GenericKeyedObjectPoolConfig();
static int heartbeatTimeoutMs = FeConstants.heartbeat_interval_second * 1000;
static int heartbeatTimeoutMs = Config.heartbeat_interval_second * 1000;
static GenericKeyedObjectPoolConfig backendConfig = new GenericKeyedObjectPoolConfig();

View File

@ -34,7 +34,6 @@ public class FeConstants {
public static int shortkey_max_column_count = 3;
public static int shortkey_maxsize_bytes = 36;
public static int heartbeat_interval_second = 5;
public static int checkpoint_interval_second = 60; // 1 minutes
// dpp version

View File

@ -19,7 +19,6 @@ package org.apache.doris.qe;
import org.apache.doris.catalog.Env;
import org.apache.doris.common.Config;
import org.apache.doris.common.FeConstants;
import org.apache.doris.common.Pair;
import org.apache.doris.common.Reference;
import org.apache.doris.common.UserException;
@ -176,7 +175,7 @@ public class SimpleScheduler {
return;
}
blacklistBackends.put(backendID, Pair.of(FeConstants.heartbeat_interval_second + 1, reason));
blacklistBackends.put(backendID, Pair.of(Config.blacklist_duration_second + 1, reason));
LOG.warn("add backend {} to black list. reason: {}", backendID, reason);
}

View File

@ -78,7 +78,7 @@ public class HeartbeatMgr extends MasterDaemon {
private static volatile AtomicReference<TMasterInfo> masterInfo = new AtomicReference<>();
public HeartbeatMgr(SystemInfoService nodeMgr, boolean needRegisterMetric) {
super("heartbeat mgr", FeConstants.heartbeat_interval_second * 1000);
super("heartbeat mgr", Config.heartbeat_interval_second * 1000);
this.nodeMgr = nodeMgr;
this.executor = ThreadPoolManager.newDaemonFixedThreadPool(Config.heartbeat_mgr_threads_num,
Config.heartbeat_mgr_blocking_queue_size, "heartbeat-mgr-pool", needRegisterMetric);

View File

@ -17,7 +17,7 @@
package org.apache.doris.qe;
import org.apache.doris.common.FeConstants;
import org.apache.doris.common.Config;
import org.apache.doris.common.Reference;
import org.apache.doris.common.UserException;
import org.apache.doris.system.Backend;
@ -47,7 +47,7 @@ public class SimpleSchedulerTest {
@BeforeClass
public static void setUp() {
SimpleScheduler.init();
FeConstants.heartbeat_interval_second = 2;
Config.heartbeat_interval_second = 2;
be1 = new Backend(1000L, "192.168.100.0", 9050);
be2 = new Backend(1001L, "192.168.100.1", 9050);
be3 = new Backend(1002L, "192.168.100.2", 9050);
@ -139,7 +139,7 @@ public class SimpleSchedulerTest {
t3.join();
Assert.assertFalse(SimpleScheduler.isAvailable(be1));
Thread.sleep((FeConstants.heartbeat_interval_second + 5) * 1000);
Thread.sleep((Config.heartbeat_interval_second + 5) * 1000);
Assert.assertTrue(SimpleScheduler.isAvailable(be1));
}
@ -194,7 +194,7 @@ public class SimpleSchedulerTest {
System.out.println(e.getMessage());
}
Thread.sleep((FeConstants.heartbeat_interval_second + 5) * 1000);
Thread.sleep((Config.heartbeat_interval_second + 5) * 1000);
Assert.assertNotNull(SimpleScheduler.getHost(locations.get(0).backend_id, locations, backends, ref));
}
}