fix switch to primary when permanent offline servers exist

This commit is contained in:
linqiucen
2023-09-11 09:10:26 +00:00
committed by ob-robot
parent d7f9c1f067
commit 681df0ad17
4 changed files with 124 additions and 31 deletions

View File

@ -25,6 +25,8 @@
#include "share/ls/ob_ls_table_iterator.h"//ObAllLSTableIterator
#include "share/ls/ob_ls_info.h"//ObLSInfo
#include "share/ob_all_server_tracer.h"
#include "share/ls/ob_ls_table_operator.h"
#include "lib/utility/ob_tracepoint.h" // ERRSIM_POINT_DEF
#include "observer/ob_server_struct.h"
@ -183,6 +185,48 @@ void ObEmptyServerChecker::stop()
}
}
// Check whether the tenant has ls replicas on at least one of the given servers.
//
// @param[in]  tenant_id  tenant to check; its ls infos are loaded through
//                        gen_meta_tenant_id(tenant_id)
// @param[in]  servers    candidate servers
// @param[out] exist      set to false on entry; set to true once any of the
//                        given servers is found to be not empty
// @return OB_SUCCESS, or the error code of the first failing step
int ObEmptyServerChecker::check_if_tenant_ls_replicas_exist_in_servers(
    const uint64_t tenant_id,
    const common::ObArray<common::ObAddr> &servers,
    bool &exist)
{
  int ret = OB_SUCCESS;
  exist = false;
  common::ObArray<ObLSInfo> ls_infos_in_tenant;
  // Working copy of `servers`: starts out as "every server is empty"
  // (i.e. the tenant has no ls replica on any of them). A server that turns
  // out to be not empty is removed from it by check_server_emtpy_by_ls_.
  ObArray<ObAddr> empty_servers;
  if (OB_ISNULL(GCTX.lst_operator_)) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("GCTX.lst_operator_ is null", KR(ret), KP(GCTX.lst_operator_));
  } else if (OB_UNLIKELY(!is_valid_tenant_id(tenant_id))) {
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("invalid tenant", KR(ret), K(tenant_id));
  } else if (OB_FAIL(GCTX.lst_operator_->load_all_ls_in_tenant(
      gen_meta_tenant_id(tenant_id), ls_infos_in_tenant))) {
    LOG_WARN("fail to execute load_all_ls_in_tenant", KR(ret), K(tenant_id));
  } else if (OB_FAIL(empty_servers.assign(servers))) {
    LOG_WARN("fail to assign servers to another array", KR(ret), K(servers));
  } else {
    const int64_t ls_cnt = ls_infos_in_tenant.count();
    const int64_t server_cnt = servers.count();
    // Once empty_servers has shrunk below server_cnt, a not-empty server has
    // been found and the remaining ls infos do not need to be examined.
    for (int64_t ls_idx = 0;
         OB_SUCC(ret) && ls_idx < ls_cnt && server_cnt == empty_servers.count();
         ++ls_idx) {
      const ObLSInfo &ls_info = ls_infos_in_tenant.at(ls_idx);
      if (OB_FAIL(check_server_emtpy_by_ls_(ls_info, empty_servers))) {
        LOG_WARN("fail to check server empty", KR(ret), K(ls_info));
      } else if (empty_servers.count() < server_cnt) {
        exist = true;
        LOG_INFO("the tenant has ls replicas on one of the given servers", KR(ret),
            K(tenant_id), K(ls_info), K(empty_servers), K(servers));
      }
    }
  }
  return ret;
}
//check server not in meta table
int ObEmptyServerChecker::check_server_empty_()
{
@ -210,8 +254,8 @@ int ObEmptyServerChecker::check_server_empty_()
LOG_WARN("iterate ls table failed", K(ret));
}
break;
} else if (OB_FAIL(check_server_emtpy_by_ls_(ls_info))) {
LOG_WARN("failed to check server empty", KR(ret), K(ls_info));
} else if (OB_FAIL(check_server_emtpy_by_ls_(ls_info, empty_servers_))) {
LOG_WARN("failed to check server empty", KR(ret), K(ls_info), K(empty_servers_));
}
}
}
@ -222,17 +266,13 @@ int ObEmptyServerChecker::check_server_empty_()
return ret;
}
int ObEmptyServerChecker::check_server_emtpy_by_ls_(const share::ObLSInfo &ls_info)
ERRSIM_POINT_DEF(CHECK_SERVER_EMPTY_WHEN_LS_HAS_NO_LEADER);
int ObEmptyServerChecker::check_server_emtpy_by_ls_(
const share::ObLSInfo &ls_info,
common::ObArray<common::ObAddr> &empty_servers)
{
int ret = OB_SUCCESS;
if (!inited_) {
ret = OB_NOT_INIT;
LOG_WARN("not init", K(ret));
} else if (stop_) {
ret = OB_CANCELED;
LOG_WARN("cancle empty server check", KR(ret));
} else if (OB_UNLIKELY(!ls_info.is_valid())) {
if (OB_UNLIKELY(!ls_info.is_valid())) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("ls info is invalid", KR(ret), K(ls_info));
} else {
@ -252,11 +292,11 @@ int ObEmptyServerChecker::check_server_emtpy_by_ls_(const share::ObLSInfo &ls_in
// check whether has member on empty servers
FOREACH_CNT_X(m, replica->get_member_list(), OB_SUCC(ret)) {
const ObAddr &addr = m->get_server();
if (has_exist_in_array(empty_servers_, addr, &idx)) {
if (has_exist_in_array(empty_servers, addr, &idx)) {
//has member in server
LOG_INFO("ls replica has member on sever", K(ls_info), K(addr));
if (OB_FAIL(empty_servers_.remove(idx))) {
LOG_WARN("failed to remove addr from empty servers", KR(ret), K(idx));
LOG_INFO("ls replica has member on sever", K(ls_info), K(addr), K(empty_servers));
if (OB_FAIL(empty_servers.remove(idx))) {
LOG_WARN("failed to remove addr from empty servers", KR(ret), K(idx), K(empty_servers));
}
}
} // end FORECAH member_list
@ -264,16 +304,19 @@ int ObEmptyServerChecker::check_server_emtpy_by_ls_(const share::ObLSInfo &ls_in
// filter server of replicas
for (int64_t i = 0; i < replica_array.count() && OB_SUCC(ret); ++i) {
const ObAddr &addr = replica_array.at(i).get_server();
if (has_exist_in_array(empty_servers_, addr, &idx)) {
if (has_exist_in_array(empty_servers, addr, &idx)) {
// this server hosts a replica of the ls
LOG_INFO("this sever has ls replica", K(ls_info), K(addr));
if (OB_FAIL(empty_servers_.remove(idx))) {
if (OB_FAIL(empty_servers.remove(idx))) {
LOG_WARN("failed to remove addr from empty servers", KR(ret), K(idx));
}
}
}//end for
}
if (OB_SUCC(ret) && CHECK_SERVER_EMPTY_WHEN_LS_HAS_NO_LEADER) {
ret = OB_LEADER_NOT_EXIST;
LOG_WARN("errsim CHECK_SERVER_EMPTY_WHEN_LS_HAS_NO_LEADER opened", KR(ret));
}
return ret;
}

View File

@ -65,10 +65,24 @@ public:
virtual void wakeup();
virtual void stop();
/**
* @description:
*    check whether the given tenant has ls replicas on any of the given servers
* @param[in] tenant_id the tenant which needs to be checked
* @param[in] servers the servers on which the tenant might have ls replicas
* @param[out] exist true if at least one of the servers has ls replicas of the tenant
* @return return code
*/
static int check_if_tenant_ls_replicas_exist_in_servers(
const uint64_t tenant_id,
const common::ObArray<common::ObAddr> &servers,
bool &exist);
private:
int try_delete_server_();
int check_server_empty_();
int check_server_emtpy_by_ls_(const share::ObLSInfo &ls_info);
static int check_server_emtpy_by_ls_(
const share::ObLSInfo &ls_info,
common::ObArray<common::ObAddr> &empty_servers);
//TODO no need check, check in unit_mgr now
int check_server_empty_in_unit(const common::ObAddr &addr, bool &is_empty);

View File

@ -21,6 +21,7 @@
#include "rootserver/ob_cluster_event.h"// CLUSTER_EVENT_ADD_CONTROL
#include "rootserver/ob_rs_event_history_table_operator.h" // ROOTSERVICE_EVENT_ADD
#include "rootserver/ob_ls_service_helper.h" // ObLSServiceHelper
#include "rootserver/ob_empty_server_checker.h" // ObEmptyServerChecker
#include "share/ob_rpc_struct.h"//ObLSAccessModeInfo
#include "observer/ob_server_struct.h"//GCTX
#include "share/location_cache/ob_location_service.h"//get ls leader
@ -1480,7 +1481,8 @@ int ObTenantRoleTransitionService::check_tenant_server_online_()
{
int ret = OB_SUCCESS;
ObSqlString sql;
ObArray<ObAddr> inactive_servers;
ObArray<ObAddr> temporary_offline_servers;
ObArray<ObAddr> permanent_offline_servers;
if (OB_FAIL(check_inner_stat())) {
LOG_WARN("inner stat error", KR(ret));
} else if (OB_FAIL(sql.append_fmt("select distinct svr_ip, svr_port from %s "
@ -1496,27 +1498,49 @@ int ObTenantRoleTransitionService::check_tenant_server_online_()
} else if (NULL == (result = res.get_result())) {
ret = OB_ERR_UNEXPECTED;
LOG_WARN("sql result is null", KR(ret), K(tenant_id_));
} else if (OB_FAIL(construct_inactive_servers_(*result, inactive_servers))) {
LOG_WARN("fail to construct inactive_servers", KR(ret), K(tenant_id_));
} else if (OB_FAIL(construct_offline_servers_(*result, temporary_offline_servers, permanent_offline_servers))) {
LOG_WARN("fail to construct offline servers", KR(ret), K(tenant_id_));
}
}
}
int64_t cnt = inactive_servers.count();
if (OB_SUCC(ret) && inactive_servers.count() != 0) {
if (OB_FAIL(ret)) {
} else if (0 != temporary_offline_servers.count()) {
ret = OB_OP_NOT_ALLOW;
LOG_INFO("the tenant has inactive servers", KR(ret), K(inactive_servers));
LOG_USER_ERROR(OB_OP_NOT_ALLOW, "the tenant has units on inactive servers, switch to primary");
LOG_WARN("the tenant has units on temporary offline servers", KR(ret), K(tenant_id_), K(temporary_offline_servers));
LOG_USER_ERROR(OB_OP_NOT_ALLOW, "the tenant has units on temporary offline servers, switch to primary");
} else if (0 != permanent_offline_servers.count()) {
bool exists = false;
if (OB_FAIL(ObEmptyServerChecker::check_if_tenant_ls_replicas_exist_in_servers(
tenant_id_,
permanent_offline_servers,
exists))) {
LOG_WARN("fail to check if the tenant's ls_replicas exist in permanent_offline_servers",
KR(ret), K(tenant_id_), K(permanent_offline_servers));
if (OB_LEADER_NOT_EXIST == ret) {
ret = OB_OP_NOT_ALLOW;
LOG_USER_ERROR(OB_OP_NOT_ALLOW, "the tenant has ls replicas without leader, switch to primary");
}
} else if (exists) {
ret = OB_OP_NOT_ALLOW;
LOG_WARN("the tenant has ls replicas on at least one of the permanent offline servers",
KR(ret), K(tenant_id_), K(exists), K(permanent_offline_servers));
LOG_USER_ERROR(OB_OP_NOT_ALLOW,
"the tenant has ls replicas on at least one of the permanent offline servers, switch to primary");
}
}
return ret;
}
int ObTenantRoleTransitionService::construct_inactive_servers_(
int ObTenantRoleTransitionService::construct_offline_servers_(
common::sqlclient::ObMySQLResult &res,
ObArray<ObAddr> &inactive_servers)
ObArray<ObAddr> &temporary_offline_servers,
ObArray<ObAddr> &permanent_offline_servers)
{
int ret = OB_SUCCESS;
ObAddr server;
inactive_servers.reset();
temporary_offline_servers.reset();
permanent_offline_servers.reset();
bool is_offline = false;
while (OB_SUCC(ret)) {
server.reset();
char svr_ip[OB_IP_STR_BUFF] = "";
@ -1537,10 +1561,19 @@ int ObTenantRoleTransitionService::construct_inactive_servers_(
} else if (OB_UNLIKELY(false == server.set_ip_addr(svr_ip, static_cast<int32_t>(svr_port)))) {
ret = OB_ERR_UNEXPECTED;
LOG_WARN("invalid server address", KR(ret), K(svr_ip), K(svr_port));
} else if (OB_FAIL(inactive_servers.push_back(server))) {
} else if (OB_FAIL(SVR_TRACER.check_server_permanent_offline(server, is_offline))) {
LOG_WARN("fail to check whether the server is permanent offline", KR(ret), K(tenant_id_), K(server));
} else if (is_offline) {
if (OB_FAIL(permanent_offline_servers.push_back(server))) {
LOG_WARN("fail to push back server", KR(ret), K(server));
}
} else if (OB_FAIL(temporary_offline_servers.push_back(server))) {
LOG_WARN("fail to push back server", KR(ret), K(server));
}
}
if (0 != temporary_offline_servers.count() || 0 != permanent_offline_servers.count()) {
LOG_INFO("the tenant has offline_servers", KR(ret), K(temporary_offline_servers), K(permanent_offline_servers));
}
return ret;
}

View File

@ -231,7 +231,10 @@ private:
const share::SCN &ref_scn,
const share::SCN &sys_ls_sync_scn);
int check_tenant_server_online_();
int construct_inactive_servers_(common::sqlclient::ObMySQLResult &res, ObArray<ObAddr> &inactive_servers);
int construct_offline_servers_(
common::sqlclient::ObMySQLResult &res,
ObArray<ObAddr> &temporary_offline_servers,
ObArray<ObAddr> &permanent_offline_servers);
private:
const static int64_t SEC_UNIT = 1000L * 1000L;