fix switch to primary when permanent offline servers exist
This commit is contained in:
@ -25,6 +25,8 @@
|
||||
#include "share/ls/ob_ls_table_iterator.h"//ObAllLSTableIterator
|
||||
#include "share/ls/ob_ls_info.h"//ObLSInfo
|
||||
#include "share/ob_all_server_tracer.h"
|
||||
#include "share/ls/ob_ls_table_operator.h"
|
||||
#include "lib/utility/ob_tracepoint.h" // ERRSIM_POINT_DEF
|
||||
|
||||
#include "observer/ob_server_struct.h"
|
||||
|
||||
@ -183,6 +185,48 @@ void ObEmptyServerChecker::stop()
|
||||
}
|
||||
}
|
||||
|
||||
int ObEmptyServerChecker::check_if_tenant_ls_replicas_exist_in_servers(
|
||||
const uint64_t tenant_id,
|
||||
const common::ObArray<common::ObAddr> &servers,
|
||||
bool &exist)
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
common::ObArray<ObLSInfo> tenant_ls_infos;
|
||||
ObArray<ObAddr> empty_servers;
|
||||
exist = false;
|
||||
// if a tenant has ls replicas on a server, the server is not empty.
|
||||
empty_servers.reset();
|
||||
if (OB_ISNULL(GCTX.lst_operator_)) {
|
||||
ret = OB_ERR_UNEXPECTED;
|
||||
LOG_WARN("GCTX.lst_operator_ is null", KR(ret), KP(GCTX.lst_operator_));
|
||||
} else if (OB_UNLIKELY(!is_valid_tenant_id(tenant_id))) {
|
||||
ret = OB_INVALID_ARGUMENT;
|
||||
LOG_WARN("invalid tenant", KR(ret), K(tenant_id));
|
||||
} else if (OB_FAIL(GCTX.lst_operator_->load_all_ls_in_tenant(gen_meta_tenant_id(tenant_id), tenant_ls_infos))) {
|
||||
LOG_WARN("fail to execute load_all_ls_in_tenant", KR(ret), K(tenant_id));
|
||||
} else if (OB_FAIL(empty_servers.assign(servers))) {
|
||||
// assumpt that all servers are empty
|
||||
// (i.e. assumpt that the tenant does not have any ls replicas on these servers)
|
||||
// not empty servers will be removed from empty_servers array in func check_server_emtpy_by_ls_
|
||||
LOG_WARN("fail to assign servers to another array", KR(ret), K(servers));
|
||||
} else {
|
||||
for (int64_t i = 0; i < tenant_ls_infos.count() && OB_SUCC(ret) && empty_servers.count() == servers.count(); i++) {
|
||||
// if empty_servers.count() < servers.count()
|
||||
// it means that there is a not empty server
|
||||
// the check can be returned
|
||||
const ObLSInfo &ls_info = tenant_ls_infos.at(i);
|
||||
if (OB_FAIL(check_server_emtpy_by_ls_(ls_info, empty_servers))) {
|
||||
LOG_WARN("fail to check server empty", KR(ret), K(ls_info));
|
||||
} else if (empty_servers.count() < servers.count()) {
|
||||
exist = true;
|
||||
LOG_INFO("the tenant has ls replicas on one of the given servers", KR(ret),
|
||||
K(tenant_id), K(ls_info), K(empty_servers), K(servers));
|
||||
}
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
//check server not in meta table
|
||||
int ObEmptyServerChecker::check_server_empty_()
|
||||
{
|
||||
@ -210,8 +254,8 @@ int ObEmptyServerChecker::check_server_empty_()
|
||||
LOG_WARN("iterate ls table failed", K(ret));
|
||||
}
|
||||
break;
|
||||
} else if (OB_FAIL(check_server_emtpy_by_ls_(ls_info))) {
|
||||
LOG_WARN("failed to check server empty", KR(ret), K(ls_info));
|
||||
} else if (OB_FAIL(check_server_emtpy_by_ls_(ls_info, empty_servers_))) {
|
||||
LOG_WARN("failed to check server empty", KR(ret), K(ls_info), K(empty_servers_));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -222,17 +266,13 @@ int ObEmptyServerChecker::check_server_empty_()
|
||||
return ret;
|
||||
|
||||
}
|
||||
|
||||
int ObEmptyServerChecker::check_server_emtpy_by_ls_(const share::ObLSInfo &ls_info)
|
||||
ERRSIM_POINT_DEF(CHECK_SERVER_EMPTY_WHEN_LS_HAS_NO_LEADER);
|
||||
int ObEmptyServerChecker::check_server_emtpy_by_ls_(
|
||||
const share::ObLSInfo &ls_info,
|
||||
common::ObArray<common::ObAddr> &empty_servers)
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
if (!inited_) {
|
||||
ret = OB_NOT_INIT;
|
||||
LOG_WARN("not init", K(ret));
|
||||
} else if (stop_) {
|
||||
ret = OB_CANCELED;
|
||||
LOG_WARN("cancle empty server check", KR(ret));
|
||||
} else if (OB_UNLIKELY(!ls_info.is_valid())) {
|
||||
if (OB_UNLIKELY(!ls_info.is_valid())) {
|
||||
ret = OB_INVALID_ARGUMENT;
|
||||
LOG_WARN("ls info is invalid", KR(ret), K(ls_info));
|
||||
} else {
|
||||
@ -252,11 +292,11 @@ int ObEmptyServerChecker::check_server_emtpy_by_ls_(const share::ObLSInfo &ls_in
|
||||
// check whether has member on empty servers
|
||||
FOREACH_CNT_X(m, replica->get_member_list(), OB_SUCC(ret)) {
|
||||
const ObAddr &addr = m->get_server();
|
||||
if (has_exist_in_array(empty_servers_, addr, &idx)) {
|
||||
if (has_exist_in_array(empty_servers, addr, &idx)) {
|
||||
//has member in server
|
||||
LOG_INFO("ls replica has member on sever", K(ls_info), K(addr));
|
||||
if (OB_FAIL(empty_servers_.remove(idx))) {
|
||||
LOG_WARN("failed to remove addr from empty servers", KR(ret), K(idx));
|
||||
LOG_INFO("ls replica has member on sever", K(ls_info), K(addr), K(empty_servers));
|
||||
if (OB_FAIL(empty_servers.remove(idx))) {
|
||||
LOG_WARN("failed to remove addr from empty servers", KR(ret), K(idx), K(empty_servers));
|
||||
}
|
||||
}
|
||||
} // end FOREACH member_list
|
||||
@ -264,16 +304,19 @@ int ObEmptyServerChecker::check_server_emtpy_by_ls_(const share::ObLSInfo &ls_in
|
||||
// filter server of replicas
|
||||
for (int64_t i = 0; i < replica_array.count() && OB_SUCC(ret); ++i) {
|
||||
const ObAddr &addr = replica_array.at(i).get_server();
|
||||
if (has_exist_in_array(empty_servers_, addr, &idx)) {
|
||||
if (has_exist_in_array(empty_servers, addr, &idx)) {
|
||||
//has member in server
|
||||
LOG_INFO("this sever has ls replica", K(ls_info), K(addr));
|
||||
if (OB_FAIL(empty_servers_.remove(idx))) {
|
||||
if (OB_FAIL(empty_servers.remove(idx))) {
|
||||
LOG_WARN("failed to remove addr from empty servers", KR(ret), K(idx));
|
||||
}
|
||||
}
|
||||
}//end for
|
||||
}
|
||||
|
||||
if (OB_SUCC(ret) && CHECK_SERVER_EMPTY_WHEN_LS_HAS_NO_LEADER) {
|
||||
ret = OB_LEADER_NOT_EXIST;
|
||||
LOG_WARN("errsim CHECK_SERVER_EMPTY_WHEN_LS_HAS_NO_LEADER opened", KR(ret));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
@ -65,10 +65,24 @@ public:
|
||||
|
||||
virtual void wakeup();
|
||||
virtual void stop();
|
||||
/**
|
||||
* @description:
|
||||
* check if the given tenant has ls replicas on servers
|
||||
* @param[in] tenant_id the tenant which need to be checked
|
||||
* @param[in] servers on which the tenant might have ls replicas
|
||||
* @param[out] exist true if at least one of the servers has tenant ls replicas
|
||||
* @return return code
|
||||
*/
|
||||
static int check_if_tenant_ls_replicas_exist_in_servers(
|
||||
const uint64_t tenant_id,
|
||||
const common::ObArray<common::ObAddr> &servers,
|
||||
bool &exist);
|
||||
private:
|
||||
int try_delete_server_();
|
||||
int check_server_empty_();
|
||||
int check_server_emtpy_by_ls_(const share::ObLSInfo &ls_info);
|
||||
static int check_server_emtpy_by_ls_(
|
||||
const share::ObLSInfo &ls_info,
|
||||
common::ObArray<common::ObAddr> &empty_servers);
|
||||
//TODO no need check, check in unit_mgr now
|
||||
int check_server_empty_in_unit(const common::ObAddr &addr, bool &is_empty);
|
||||
|
||||
|
||||
@ -21,6 +21,7 @@
|
||||
#include "rootserver/ob_cluster_event.h"// CLUSTER_EVENT_ADD_CONTROL
|
||||
#include "rootserver/ob_rs_event_history_table_operator.h" // ROOTSERVICE_EVENT_ADD
|
||||
#include "rootserver/ob_ls_service_helper.h" // ObLSServiceHelper
|
||||
#include "rootserver/ob_empty_server_checker.h" // ObEmptyServerChecker
|
||||
#include "share/ob_rpc_struct.h"//ObLSAccessModeInfo
|
||||
#include "observer/ob_server_struct.h"//GCTX
|
||||
#include "share/location_cache/ob_location_service.h"//get ls leader
|
||||
@ -1480,7 +1481,8 @@ int ObTenantRoleTransitionService::check_tenant_server_online_()
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
ObSqlString sql;
|
||||
ObArray<ObAddr> inactive_servers;
|
||||
ObArray<ObAddr> temporary_offline_servers;
|
||||
ObArray<ObAddr> permanent_offline_servers;
|
||||
if (OB_FAIL(check_inner_stat())) {
|
||||
LOG_WARN("inner stat error", KR(ret));
|
||||
} else if (OB_FAIL(sql.append_fmt("select distinct svr_ip, svr_port from %s "
|
||||
@ -1496,27 +1498,49 @@ int ObTenantRoleTransitionService::check_tenant_server_online_()
|
||||
} else if (NULL == (result = res.get_result())) {
|
||||
ret = OB_ERR_UNEXPECTED;
|
||||
LOG_WARN("sql result is null", KR(ret), K(tenant_id_));
|
||||
} else if (OB_FAIL(construct_inactive_servers_(*result, inactive_servers))) {
|
||||
LOG_WARN("fail to construct inactive_servers", KR(ret), K(tenant_id_));
|
||||
} else if (OB_FAIL(construct_offline_servers_(*result, temporary_offline_servers, permanent_offline_servers))) {
|
||||
LOG_WARN("fail to construct offline servers", KR(ret), K(tenant_id_));
|
||||
}
|
||||
}
|
||||
}
|
||||
int64_t cnt = inactive_servers.count();
|
||||
if (OB_SUCC(ret) && inactive_servers.count() != 0) {
|
||||
if (OB_FAIL(ret)) {
|
||||
} else if (0 != temporary_offline_servers.count()) {
|
||||
ret = OB_OP_NOT_ALLOW;
|
||||
LOG_INFO("the tenant has inactive servers", KR(ret), K(inactive_servers));
|
||||
LOG_USER_ERROR(OB_OP_NOT_ALLOW, "the tenant has units on inactive servers, switch to primary");
|
||||
LOG_WARN("the tenant has units on temporary offline servers", KR(ret), K(tenant_id_), K(temporary_offline_servers));
|
||||
LOG_USER_ERROR(OB_OP_NOT_ALLOW, "the tenant has units on temporary offline servers, switch to primary");
|
||||
} else if (0 != permanent_offline_servers.count()) {
|
||||
bool exists = false;
|
||||
if (OB_FAIL(ObEmptyServerChecker::check_if_tenant_ls_replicas_exist_in_servers(
|
||||
tenant_id_,
|
||||
permanent_offline_servers,
|
||||
exists))) {
|
||||
LOG_WARN("fail to check if the tenant's ls_replicas exist in permanent_offline_servers",
|
||||
KR(ret), K(tenant_id_), K(permanent_offline_servers));
|
||||
if (OB_LEADER_NOT_EXIST == ret) {
|
||||
ret = OB_OP_NOT_ALLOW;
|
||||
LOG_USER_ERROR(OB_OP_NOT_ALLOW, "the tenant has ls replicas without leader, switch to primary");
|
||||
}
|
||||
} else if (exists) {
|
||||
ret = OB_OP_NOT_ALLOW;
|
||||
LOG_WARN("the tenant has ls replicas on at least one of the permanent offline servers",
|
||||
KR(ret), K(tenant_id_), K(exists), K(permanent_offline_servers));
|
||||
LOG_USER_ERROR(OB_OP_NOT_ALLOW,
|
||||
"the tenant has ls replicas on at least one of the permanent offline servers, switch to primary");
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
int ObTenantRoleTransitionService::construct_inactive_servers_(
|
||||
int ObTenantRoleTransitionService::construct_offline_servers_(
|
||||
common::sqlclient::ObMySQLResult &res,
|
||||
ObArray<ObAddr> &inactive_servers)
|
||||
ObArray<ObAddr> &temporary_offline_servers,
|
||||
ObArray<ObAddr> &permanent_offline_servers)
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
ObAddr server;
|
||||
inactive_servers.reset();
|
||||
temporary_offline_servers.reset();
|
||||
permanent_offline_servers.reset();
|
||||
bool is_offline = false;
|
||||
while (OB_SUCC(ret)) {
|
||||
server.reset();
|
||||
char svr_ip[OB_IP_STR_BUFF] = "";
|
||||
@ -1537,10 +1561,19 @@ int ObTenantRoleTransitionService::construct_inactive_servers_(
|
||||
} else if (OB_UNLIKELY(false == server.set_ip_addr(svr_ip, static_cast<int32_t>(svr_port)))) {
|
||||
ret = OB_ERR_UNEXPECTED;
|
||||
LOG_WARN("invalid server address", KR(ret), K(svr_ip), K(svr_port));
|
||||
} else if (OB_FAIL(inactive_servers.push_back(server))) {
|
||||
} else if (OB_FAIL(SVR_TRACER.check_server_permanent_offline(server, is_offline))) {
|
||||
LOG_WARN("fail to check whether the server is permanent offline", KR(ret), K(tenant_id_), K(server));
|
||||
} else if (is_offline) {
|
||||
if (OB_FAIL(permanent_offline_servers.push_back(server))) {
|
||||
LOG_WARN("fail to push back server", KR(ret), K(server));
|
||||
}
|
||||
} else if (OB_FAIL(temporary_offline_servers.push_back(server))) {
|
||||
LOG_WARN("fail to push back server", KR(ret), K(server));
|
||||
}
|
||||
}
|
||||
if (0 != temporary_offline_servers.count() || 0 != permanent_offline_servers.count()) {
|
||||
LOG_INFO("the tenant has offline_servers", KR(ret), K(temporary_offline_servers), K(permanent_offline_servers));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
@ -231,7 +231,10 @@ private:
|
||||
const share::SCN &ref_scn,
|
||||
const share::SCN &sys_ls_sync_scn);
|
||||
int check_tenant_server_online_();
|
||||
int construct_inactive_servers_(common::sqlclient::ObMySQLResult &res, ObArray<ObAddr> &inactive_servers);
|
||||
int construct_offline_servers_(
|
||||
common::sqlclient::ObMySQLResult &res,
|
||||
ObArray<ObAddr> &temporary_offline_servers,
|
||||
ObArray<ObAddr> &permanent_offline_servers);
|
||||
|
||||
private:
|
||||
const static int64_t SEC_UNIT = 1000L * 1000L;
|
||||
|
||||
Reference in New Issue
Block a user