[CP] [RS] Fix stop server problem when arb server is down

This commit is contained in:
joseph12138
2023-06-06 10:12:23 +00:00
committed by ob-robot
parent 03fd746b46
commit 4d2381a1d6
2 changed files with 144 additions and 80 deletions

View File

@ -760,84 +760,14 @@ int ObAdminZoneExecutor::execute(ObExecContext &ctx, ObAdminZoneStmt &stmt)
common_proxy->get_server()); common_proxy->get_server());
} else if (ObAdminZoneArg::STOP == stmt.get_op() } else if (ObAdminZoneArg::STOP == stmt.get_op()
|| ObAdminZoneArg::FORCE_STOP == stmt.get_op()) { || ObAdminZoneArg::FORCE_STOP == stmt.get_op()) {
// check whether all leaders are switched out obrpc::ObServerList server_list;
ObMySQLProxy *sql_proxy = ctx.get_sql_proxy(); if (OB_FAIL(construct_servers_in_zone_(*(ctx.get_sql_proxy()), arg, server_list))) {
const int64_t idx = 0; LOG_WARN("fail to construct servers in zone", KR(ret), K(arg));
const int64_t retry_interval_us = 1000l * 1000l; // 1s } else if (0 == server_list.count()) {
bool stop = false; // no need to wait leader election and arb-degration
while (OB_SUCC(ret) && !stop) { } else if (OB_FAIL(wait_leader_switch_out_(*(ctx.get_sql_proxy()), arg))) {
ObSqlString sql; // check whether all leaders are switched out
SMART_VAR(ObMySQLProxy::MySQLResult, res) { LOG_WARN("fail to wait leader switch out", KR(ret), K(arg));
sqlclient::ObMySQLResult *result = NULL;
const int64_t rpc_timeout = THIS_WORKER.get_timeout_remain();
obrpc::Bool can_stop(true /* default value */);
int64_t leader_cnt = 0;
if (0 > THIS_WORKER.get_timeout_remain()) {
ret = OB_WAIT_LEADER_SWITCH_TIMEOUT;
LOG_WARN("wait switching out all leaders timeout", K(ret));
} else if (OB_FAIL(THIS_WORKER.check_status())) {
LOG_WARN("ctx check status failed", K(ret));
}
if (OB_FAIL(ret)) {
} else if (!can_stop) {
} else if (OB_FAIL(sql.assign_fmt(
"SELECT CAST(COUNT(*) AS SIGNED) FROM %s WHERE role = 'LEADER' AND zone = '%s'",
share::OB_CDB_OB_LS_LOCATIONS_TNAME, arg.zone_.ptr()))) {
LOG_WARN("assign_fmt failed", K(ret));
} else if (OB_FAIL(sql_proxy->read(res, sql.ptr()))) {
if (OB_RS_SHUTDOWN == ret || OB_RS_NOT_MASTER == ret) {
// switching rs, sleep and retry
ret = OB_SUCCESS;
} else {
LOG_WARN("execute sql failed", K(ret), K(sql));
}
} else if (OB_ISNULL(result = res.get_result())) {
ret = OB_ERR_UNEXPECTED;
LOG_WARN("get result failed", K(ret));
} else if (OB_FAIL(result->next())) {
if (OB_ITER_END == ret) {
ret = OB_ERR_UNEXPECTED;
LOG_WARN("result is empty", K(ret));
} else {
LOG_WARN("get next result failed", K(ret));
}
} else if (OB_FAIL(result->get_int(idx, leader_cnt))) {
if (OB_ERR_NULL_VALUE == ret) {
ret = OB_SUCCESS;
ObSqlString this_sql;
SMART_VAR(ObMySQLProxy::MySQLResult, this_res) {
sqlclient::ObMySQLResult *this_result = NULL;
int64_t server_cnt = -1;
if (OB_FAIL(this_sql.assign_fmt("select count(*) from %s where zone = '%s'",
share::OB_ALL_SERVER_TNAME, arg.zone_.ptr()))) {
LOG_WARN("fail to assign fmt", K(ret));
} else if (OB_FAIL(sql_proxy->read(this_res, this_sql.ptr()))) {
LOG_WARN("fail to execute sql", K(ret), K(this_sql));
} else if (OB_ISNULL(this_result = this_res.get_result())) {
ret = OB_ERR_UNEXPECTED;
LOG_WARN("fail to get result", K(ret));
} else if (OB_FAIL(this_result->next())) {
LOG_WARN("get result error", K(ret));
} else if (OB_FAIL(this_result->get_int(0L, server_cnt))) {
LOG_WARN("fail to get result", K(ret));
} else if (0 == server_cnt) {
// no server in this zone;
stop = true;
} else {
// __all_virtual_server_stat is not ready, sleep and retry
}
}
} else {
LOG_WARN("get sum failed", K(ret));
}
} else if (0 == leader_cnt) {
stop = true;
} else {
LOG_INFO("waiting switching leaders out", K(ret), "left count", leader_cnt);
ob_usleep(retry_interval_us);
}
}
} }
} else {} // force stop, no need to wait leader switch } else {} // force stop, no need to wait leader switch
} else if (ObAdminZoneArg::MODIFY == stmt.get_op()) { } else if (ObAdminZoneArg::MODIFY == stmt.get_op()) {
@ -852,6 +782,112 @@ int ObAdminZoneExecutor::execute(ObExecContext &ctx, ObAdminZoneStmt &stmt)
return ret; return ret;
} }
// Block until the leader-count query for the zone in `arg` reports zero.
//
// The query text is built once through construct_wait_leader_switch_sql_()
// and then executed repeatedly with `sql_proxy`. Between attempts that do
// not finish the wait (leaders still present, RS switching, NULL count) the
// function sleeps `retry_interval_us` so that it never busy-spins on the
// SQL proxy.
//
// @param [in] sql_proxy  client used to run the polling query
// @param [in] arg        admin-zone argument; arg.zone_ names the zone to check
// @return OB_SUCCESS once the query returns a leader count of 0;
//         OB_INVALID_ARGUMENT if `arg` is not valid;
//         OB_WAIT_LEADER_SWITCH_TIMEOUT when THIS_WORKER has no timeout left;
//         OB_ERR_UNEXPECTED on a missing or empty result set;
//         otherwise the error returned by check_status(), read() or get_int().
int ObAdminZoneExecutor::wait_leader_switch_out_(
    ObISQLClient &sql_proxy,
    const obrpc::ObAdminZoneArg &arg)
{
  int ret = OB_SUCCESS;
  const int64_t idx = 0;
  const int64_t retry_interval_us = 1000l * 1000l; // 1s
  bool stop = false;
  ObSqlString sql("AdminZoneExe");
  if (OB_UNLIKELY(!arg.is_valid())) {
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("invalid argument", KR(ret), K(arg));
  } else if (OB_FAIL(construct_wait_leader_switch_sql_(arg, sql))) {
    LOG_WARN("fail to construct wait leader switch sql", KR(ret), K(arg));
  }
  while (OB_SUCC(ret) && !stop) {
    SMART_VAR(ObMySQLProxy::MySQLResult, res) {
      sqlclient::ObMySQLResult *result = NULL;
      int64_t leader_cnt = 0;
      // The timeout and status checks run on every iteration, so every retry
      // path below is bounded by the worker's remaining time.
      if (0 > THIS_WORKER.get_timeout_remain()) {
        ret = OB_WAIT_LEADER_SWITCH_TIMEOUT;
        LOG_WARN("wait switching out leaders from all servers timeout", KR(ret));
      } else if (OB_FAIL(THIS_WORKER.check_status())) {
        LOG_WARN("ctx check status failed", KR(ret));
      } else if (OB_FAIL(sql_proxy.read(res, sql.ptr()))) {
        if (OB_RS_SHUTDOWN == ret || OB_RS_NOT_MASTER == ret) {
          // switching rs, sleep and retry
          ret = OB_SUCCESS;
          ob_usleep(retry_interval_us);
        } else {
          LOG_WARN("execute sql failed", KR(ret), K(sql));
        }
      } else if (OB_ISNULL(result = res.get_result())) {
        ret = OB_ERR_UNEXPECTED;
        LOG_WARN("get result failed", KR(ret));
      } else if (OB_FAIL(result->next())) {
        if (OB_ITER_END == ret) {
          // COUNT(*) must always produce one row; an empty set is an error.
          ret = OB_ERR_UNEXPECTED;
          LOG_WARN("result is empty", KR(ret));
        } else {
          LOG_WARN("get next result failed", KR(ret));
        }
      } else if (OB_FAIL(result->get_int(idx, leader_cnt))) {
        if (OB_ERR_NULL_VALUE == ret) {
          // __all_virtual_server_stat is not ready, sleep and retry
          ret = OB_SUCCESS;
          ob_usleep(retry_interval_us);
        } else {
          LOG_WARN("get sum failed", KR(ret));
        }
      } else if (0 == leader_cnt) {
        stop = true;
      } else {
        LOG_INFO("waiting switching leaders out", KR(ret), "left count", leader_cnt);
        ob_usleep(retry_interval_us);
      }
    }
  }
  return ret;
}
// Fill `sql` with the query that counts rows whose role is 'LEADER' in the
// zone named by arg.zone_.
//
// @param [in]  arg  admin-zone argument supplying the zone name
// @param [out] sql  receives the formatted statement
// @return OB_SUCCESS on success; OB_INVALID_ARGUMENT if `arg` is not valid;
//         otherwise the error returned by ObSqlString::assign_fmt().
int ObAdminZoneExecutor::construct_wait_leader_switch_sql_(
    const obrpc::ObAdminZoneArg &arg,
    ObSqlString &sql)
{
  int ret = OB_SUCCESS;

  // Reject a bad argument up front; `sql` is left untouched in that case.
  if (OB_UNLIKELY(!arg.is_valid())) {
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("invalid argument", KR(ret), K(arg));
    return ret;
  }

  if (OB_FAIL(sql.assign_fmt(
          "SELECT CAST(COUNT(*) AS SIGNED) FROM %s "
          "WHERE role = 'LEADER' AND zone = '%s'",
          share::OB_CDB_OB_LS_LOCATIONS_TNAME,
          arg.zone_.ptr()))) {
    LOG_WARN("assign_fmt failed", KR(ret), K(arg));
  }
  return ret;
}
// Collect the address of every server whose zone equals arg.zone_.
//
// `svr_list` is always reset first. Server statuses are then loaded through
// an ObServerTableOperator bound to `sql_proxy`, and each entry in the
// matching zone is appended to `svr_list`.
//
// @param [in]  sql_proxy  client handed to ObServerTableOperator::init()
// @param [in]  arg        admin-zone argument supplying the zone to match
// @param [out] svr_list   servers found in that zone (may end up empty)
// @return OB_SUCCESS on success; OB_INVALID_ARGUMENT if `arg` is not valid;
//         otherwise the error from init(), get() or push_back().
int ObAdminZoneExecutor::construct_servers_in_zone_(
    ObISQLClient &sql_proxy,
    const obrpc::ObAdminZoneArg &arg,
    obrpc::ObServerList &svr_list)
{
  int ret = OB_SUCCESS;
  svr_list.reset();
  share::ObServerTableOperator st_operator;
  ObArray<share::ObServerStatus> server_statuses;

  // Step 1: validate the argument.
  if (OB_UNLIKELY(!arg.is_valid())) {
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("invalid argument", KR(ret), K(arg));
  }

  // Step 2: bind the table operator to the SQL client.
  if (OB_SUCC(ret) && OB_FAIL(st_operator.init(&sql_proxy))) {
    LOG_WARN("fail to init ObServerTableOperator", KR(ret));
  }

  // Step 3: load every server status.
  if (OB_SUCC(ret) && OB_FAIL(st_operator.get(server_statuses))) {
    LOG_WARN("build server statused from __all_server failed", KR(ret));
  }

  // Step 4: keep only the servers located in the requested zone.
  if (OB_SUCC(ret)) {
    const int64_t status_cnt = server_statuses.count();
    for (int64_t i = 0; OB_SUCC(ret) && i < status_cnt; ++i) {
      const share::ObServerStatus &status = server_statuses.at(i);
      if (arg.zone_ == status.zone_) {
        if (OB_FAIL(svr_list.push_back(status.server_))) {
          LOG_WARN("fail to add server to server_list", KR(ret), K(arg), K(server_statuses));
        }
      }
    }
  }
  return ret;
}
int ObSwitchReplicaRoleExecutor::execute(ObExecContext &ctx, ObSwitchReplicaRoleStmt &stmt) int ObSwitchReplicaRoleExecutor::execute(ObExecContext &ctx, ObSwitchReplicaRoleStmt &stmt)
{ {
int ret = OB_SUCCESS; int ret = OB_SUCCESS;

View File

@ -38,8 +38,6 @@ class ObBootstrapStmt;
DISALLOW_COPY_AND_ASSIGN(name##Executor); \ DISALLOW_COPY_AND_ASSIGN(name##Executor); \
} }
DEF_SIMPLE_EXECUTOR(ObAdminZone);
DEF_SIMPLE_EXECUTOR(ObFreeze); DEF_SIMPLE_EXECUTOR(ObFreeze);
DEF_SIMPLE_EXECUTOR(ObFlushCache); DEF_SIMPLE_EXECUTOR(ObFlushCache);
@ -194,6 +192,36 @@ private:
DISALLOW_COPY_AND_ASSIGN(ObAdminServerExecutor); DISALLOW_COPY_AND_ASSIGN(ObAdminServerExecutor);
}; };
// Executor for ObAdminZoneStmt. Declared as a full class (rather than through
// DEF_SIMPLE_EXECUTOR) because it carries private helpers in addition to
// execute().
class ObAdminZoneExecutor
{
public:
ObAdminZoneExecutor() {}
virtual ~ObAdminZoneExecutor() {}
// Run the admin-zone statement.
// @param [in] ctx, execution context
// @param [in] stmt, the admin-zone statement to execute
// @return OB_SUCCESS on success, an error code otherwise
int execute(ObExecContext &ctx, ObAdminZoneStmt &stmt);
private:
// Wait until the leader-count query for the zone reports zero leaders.
// Fails with OB_WAIT_LEADER_SWITCH_TIMEOUT when the worker timeout runs out.
// @param [in] sql_proxy, the proxy used to run the polling query
// @param [in] arg, which zone to stop
// @return OB_SUCCESS on success, an error code otherwise
int wait_leader_switch_out_(
ObISQLClient &sql_proxy,
const obrpc::ObAdminZoneArg &arg);
// Construct the SQL that counts the leaders remaining in the zone; its
// result is what wait_leader_switch_out_ polls.
// @param [in] arg, which zone to stop
// @param [out] sql, the SQL that was built
// @return OB_SUCCESS on success, an error code otherwise
int construct_wait_leader_switch_sql_(
const obrpc::ObAdminZoneArg &arg,
ObSqlString &sql);
// Construct the list of servers that belong to this zone.
// @param [in] sql_proxy, the proxy used to read the server statuses
// @param [in] arg, which zone to stop
// @param [out] svr_list, the servers found in the zone (reset before filling)
// @return OB_SUCCESS on success, an error code otherwise
int construct_servers_in_zone_(
ObISQLClient &sql_proxy,
const obrpc::ObAdminZoneArg &arg,
obrpc::ObServerList &svr_list);
DISALLOW_COPY_AND_ASSIGN(ObAdminZoneExecutor);
};
#undef DEF_SIMPLE_EXECUTOR #undef DEF_SIMPLE_EXECUTOR
} // end namespace sql } // end namespace sql