check all members are active for transfer

This commit is contained in:
ZhenNan0
2023-09-09 12:46:41 +00:00
committed by ob-robot
parent 32b49bc8df
commit 56bc4e6bc6
8 changed files with 178 additions and 65 deletions

View File

@ -69,7 +69,7 @@ int ObAllBalanceGroupBuilder::init(const int64_t tenant_id,
} else if (OB_UNLIKELY(OB_INVALID_TENANT_ID == tenant_id) || OB_ISNULL(mod)) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("invalid argument", KR(ret), K(tenant_id), K(mod));
} else if (OB_FAIL(tablet_to_ls_.init(MAP_BUCKET_NUM, lib::ObLabel("TabletToLS")))) {
} else if (OB_FAIL(tablet_to_ls_.init(MAP_BUCKET_NUM, lib::ObLabel("TabletToLS"), tenant_id))) {
LOG_WARN("create map for tablet to LS fail", KR(ret), K(tenant_id));
} else if (OB_FAIL(tablet_data_size_.create(MAP_BUCKET_NUM, lib::ObLabel("TabletSizeMap")))) {
LOG_WARN("create map for tablet data size fail", KR(ret), K(tenant_id));

View File

@ -177,10 +177,10 @@ int ObTenantTransferService::process_task_(const ObTransferTask::TaskStatus &tas
int ObTenantTransferService::process_init_task_(const ObTransferTaskID task_id)
{
int ret = OB_SUCCESS;
int tmp_ret = OB_SUCCESS;
ObTransferTask task;
ObMySQLTransaction trans;
bool member_list_is_same = false;
bool update_comment_to_wait_for_member_list = false;
ObTransferTaskComment result_comment = EMPTY_COMMENT;
ObArray<ObTabletID> tablet_ids;
ObTableLockOwnerID lock_owner_id;
ObTransferPartList not_exist_part_list;
@ -219,15 +219,12 @@ int ObTenantTransferService::process_init_task_(const ObTransferTaskID task_id)
*sql_proxy_,
task.get_src_ls(),
task.get_dest_ls(),
member_list_is_same))) { // can't use trans
LOG_WARN("fail to check ls member_list", KR(ret), K(task), K(member_list_is_same));
} else if (!member_list_is_same) {
result_comment))) { // can't use trans
LOG_WARN("fail to check ls member_list", KR(ret), K(task));
} else if (EMPTY_COMMENT != result_comment) {
ret = OB_NEED_RETRY;
TTS_INFO("member_lists of src_ls and dest_ls are not same, need retry",
KR(ret), K_(tenant_id), K(member_list_is_same), K(task));
if (task.get_comment() != ObTransferTaskComment::WAIT_FOR_MEMBER_LIST) {
update_comment_to_wait_for_member_list = true;
}
TTS_INFO("member_lists of src_ls and dest_ls are not same or there has inacitve server in member_list, need retry",
KR(ret), K_(tenant_id), K(task), "result_comment", transfer_task_comment_to_str(result_comment));
} else if (OB_FAIL(lock_table_and_part_(
trans,
task.get_src_ls(),
@ -289,22 +286,10 @@ int ObTenantTransferService::process_init_task_(const ObTransferTaskID task_id)
}
}
if ((OB_NEED_RETRY == ret && update_comment_to_wait_for_member_list)
|| OB_TRANS_TIMEOUT == ret || OB_TIMEOUT == ret) {
ObTimeoutCtx ctx_comment;
int tmp_ret = OB_SUCCESS;
ObTransferTaskComment comment = (OB_NEED_RETRY == ret)
? ObTransferTaskComment::WAIT_FOR_MEMBER_LIST
: ObTransferTaskComment::TRANSACTION_TIMEOUT;
if (OB_TMP_FAIL(ctx_comment.set_timeout(2000000/*2s*/))) { // overwrite timeout
LOG_WARN("set default timeout ctx failed", KR(tmp_ret), K(ctx_comment), K_(tenant_id), K(task_id));
} else if (OB_TMP_FAIL(ObTransferTaskOperator::update_comment(
*sql_proxy_,
tenant_id_,
task_id,
comment))) {
LOG_WARN("update comment failed", KR(tmp_ret), K_(tenant_id), K(task_id), K(comment));
}
// update comments for expected error codes
if (OB_TMP_FAIL(update_comment_for_expected_errors_(ret, task_id, result_comment))) {
LOG_WARN("update comment for expected errors failed", KR(tmp_ret), KR(ret),
K_(tenant_id), K(task_id), "result_comment", transfer_task_comment_to_str(result_comment));
}
if (OB_SUCC(ret)) {
@ -322,24 +307,87 @@ int ObTenantTransferService::process_init_task_(const ObTransferTaskID task_id)
ERRSIM_POINT_DEF(EN_TENANT_TRANSFER_CHECK_LS_MEMBER_LIST_NOT_SAME);
// Check whether a transfer between src_ls and dest_ls can proceed and report the
// verdict through result_comment:
// 1. check that the leader member_lists of src_ls and dest_ls contain the same servers;
//    if not, result_comment is WAIT_FOR_MEMBER_LIST
// 2. if the member_lists are the same, check that all servers in the member_list are
//    active; if not, result_comment is INACTIVE_SERVER_IN_MEMBER_LIST
// 3. if both checks pass, result_comment is EMPTY_COMMENT
//
// @param [in]  sql_proxy       sql client used to read the member_lists
// @param [in]  src_ls          source log stream of the transfer
// @param [in]  dest_ls         destination log stream of the transfer
// @param [out] result_comment  verdict described above (reset to EMPTY_COMMENT on entry)
// @return OB_SUCCESS whenever the check itself could be carried out, even if the verdict
//         is negative; OB_NOT_INIT or the error of the failing helper otherwise
int ObTenantTransferService::check_ls_member_list_(
common::ObISQLClient &sql_proxy,
const ObLSID &src_ls,
const ObLSID &dest_ls,
bool &is_same)
ObTransferTaskComment &result_comment)
{
int ret = OB_SUCCESS;
result_comment = EMPTY_COMMENT;
bool all_members_are_active = false;
ObLSReplica::MemberList src_ls_member_list;
ObLSReplica::MemberList dest_ls_member_list;
if (IS_NOT_INIT) {
ret = OB_NOT_INIT;
LOG_WARN("not init", KR(ret));
} else if (OB_FAIL(get_member_lists_by_inner_sql_(
sql_proxy,
src_ls,
dest_ls,
src_ls_member_list,
dest_ls_member_list))) {
LOG_WARN("get member list by inner sql failed", KR(ret), K(src_ls), K(dest_ls));
} else if (!ObLSReplica::servers_in_member_list_are_same(
src_ls_member_list,
dest_ls_member_list)) {
// result 1: member_lists are not same
result_comment = WAIT_FOR_MEMBER_LIST;
LOG_WARN("member_list of src_ls and dest_ls are not same", KR(ret), K_(tenant_id), K(src_ls),
K(dest_ls), K(src_ls_member_list), K(dest_ls_member_list), K(result_comment));
// servers of both lists are the same at this point, so only src_ls_member_list is checked
} else if (OB_FAIL(ObLSReplica::check_all_servers_in_member_list_are_active(
src_ls_member_list,
all_members_are_active))) {
LOG_WARN("check all servers in member list are active failed",
KR(ret), K(src_ls_member_list), K(all_members_are_active));
} else if (!all_members_are_active) {
// result 2: member_lists are same, but server in member_list is inactive
result_comment = INACTIVE_SERVER_IN_MEMBER_LIST;
LOG_WARN("member_list has inactive server", KR(ret), K(src_ls),
K(src_ls_member_list), K(all_members_are_active), K(result_comment));
} else {
// result 3: member_lists are same && all members are active
result_comment = EMPTY_COMMENT;
TTS_INFO("member_lists of src_ls and dest_ls are same and all members are acitve",
KR(ret), K_(tenant_id), K(src_ls), K(dest_ls), K(all_members_are_active),
K(src_ls_member_list), K(dest_ls_member_list), K(result_comment));
}
// just for debug: when the check itself succeeded, the errsim point can override the
// verdict. OB_IN_STOP_STATE forces INACTIVE_SERVER_IN_MEMBER_LIST and
// OB_STATE_NOT_MATCH forces WAIT_FOR_MEMBER_LIST; any other value leaves it untouched.
if (OB_FAIL(ret)) {
} else if (OB_IN_STOP_STATE == EN_TENANT_TRANSFER_CHECK_LS_MEMBER_LIST_NOT_SAME) {
result_comment = INACTIVE_SERVER_IN_MEMBER_LIST;
TTS_INFO("errsim tenant transfer check ls member list with inactive server", K(result_comment));
} else if (OB_STATE_NOT_MATCH == EN_TENANT_TRANSFER_CHECK_LS_MEMBER_LIST_NOT_SAME) {
result_comment = WAIT_FOR_MEMBER_LIST;
TTS_INFO("errsim tenant transfer check ls member list not same", K(result_comment));
}
return ret;
}
// get ls leader member list of src_ls and dest_ls
int ObTenantTransferService::get_member_lists_by_inner_sql_(
common::ObISQLClient &sql_proxy,
const ObLSID &src_ls,
const ObLSID &dest_ls,
ObLSReplica::MemberList &src_ls_member_list,
ObLSReplica::MemberList &dest_ls_member_list)
{
int ret = OB_SUCCESS;
src_ls_member_list.reset();
dest_ls_member_list.reset();
if (IS_NOT_INIT) {
ret = OB_NOT_INIT;
LOG_WARN("not init", KR(ret));
} else if (!src_ls.is_valid() || !dest_ls.is_valid()) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("invalid args", KR(ret), K(src_ls), K(dest_ls));
} else {
is_same = false;
SMART_VAR(ObISQLClient::ReadResult, result) {
ObSqlString sql;
ObString src_ls_member_list_str;
ObString dest_ls_member_list_str;
ObLSReplica::MemberList src_ls_member_list;
ObLSReplica::MemberList dest_ls_member_list;
common::sqlclient::ObMySQLResult *res = NULL;
if (OB_FAIL(sql.assign_fmt(
"SELECT PAXOS_MEMBER_LIST FROM %s WHERE TENANT_ID = %lu AND ROLE = 'LEADER'"
@ -372,12 +420,8 @@ int ObTenantTransferService::check_ls_member_list_(
to_cstring(dest_ls_member_list_str),
dest_ls_member_list))) {
LOG_WARN("text2member_list failed", KR(ret), K_(tenant_id), K(dest_ls), K(dest_ls_member_list_str));
} else if (ObLSReplica::servers_in_member_list_are_same(src_ls_member_list, dest_ls_member_list)) {
is_same = true;
} else {
is_same = false;
}
// double check sql result
if (OB_FAIL(ret)) {
if (OB_UNLIKELY(OB_ITER_END == ret)) { // read less than two rows
ret = OB_LEADER_NOT_EXIST;
@ -396,22 +440,9 @@ int ObTenantTransferService::check_ls_member_list_(
K(sql), K(src_ls_member_list_str), K(dest_ls_member_list_str));
} else {
ret = OB_SUCCESS;
if (is_same) {
LOG_INFO("member_list of src_ls and dest_ls are same", KR(ret), K_(tenant_id), K(src_ls),
K(dest_ls), K(is_same), K(src_ls_member_list), K(dest_ls_member_list));
} else {
LOG_WARN("member_list of src_ls and dest_ls are not same", KR(ret), K_(tenant_id), K(src_ls),
K(dest_ls), K(is_same), K(src_ls_member_list), K(dest_ls_member_list));
}
}
} // end SMART_VAR
}
if (OB_SUCC(ret)) {
if (EN_TENANT_TRANSFER_CHECK_LS_MEMBER_LIST_NOT_SAME) {
is_same = false;
TTS_INFO("errsim tenant transfer check ls member list not same", K(is_same));
}
}
return ret;
}
@ -1566,6 +1597,50 @@ int ObTenantTransferService::set_transaction_timeout_(common::ObTimeoutCtx &ctx)
return ret;
}
// err --> comment
// OB_TRANS_TIMEOUT TRANSACTION_TIMEOUT
// OB_TIMEOUT TRANSACTION_TIMEOUT
// OB_NEED_RETRY WAIT_FOR_MEMBER_LIST/INACTIVE_SERVER_IN_MEMBER_LIST
int ObTenantTransferService::update_comment_for_expected_errors_(
const int err,
const ObTransferTaskID &task_id,
const ObTransferTaskComment &result_comment)
{
int ret = OB_SUCCESS;
ObTransferTaskComment actual_comment = EMPTY_COMMENT;
ObTimeoutCtx ctx;
if (IS_NOT_INIT || OB_ISNULL(sql_proxy_)) {
ret = OB_NOT_INIT;
LOG_WARN("not init", KR(ret));
} else if (OB_SUCCESS == err) {
// skip
} else if (OB_UNLIKELY(!task_id.is_valid())) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("invalid task_id", KR(ret), K(task_id));
} else if (OB_TRANS_TIMEOUT == err || OB_TIMEOUT == err) {
actual_comment = TRANSACTION_TIMEOUT;
} else if (OB_NEED_RETRY == err) {
if (WAIT_FOR_MEMBER_LIST != result_comment && INACTIVE_SERVER_IN_MEMBER_LIST != result_comment) {
ret = OB_ERR_UNEXPECTED;
LOG_WARN("unexpected comment with err", KR(ret), K(err), K(result_comment));
} else {
actual_comment = result_comment;
}
}
if (OB_FAIL(ret) || EMPTY_COMMENT == actual_comment) {
// do nothing
} else if (OB_FAIL(ctx.set_timeout(GCONF.internal_sql_execute_timeout))) { // overwrite timeout
LOG_WARN("set default timeout ctx failed", KR(ret), K(ctx), K_(tenant_id), K(task_id));
} else if (OB_FAIL(ObTransferTaskOperator::update_comment(
*sql_proxy_,
tenant_id_,
task_id,
actual_comment))) {
LOG_WARN("update comment failed", KR(ret), K_(tenant_id), K(task_id), K(actual_comment));
}
return ret;
}
#undef TTS_INFO
} // end namespace rootserver
} // end namespace oceanbase

View File

@ -122,7 +122,13 @@ private:
common::ObISQLClient &sql_proxy,
const share::ObLSID &src_ls,
const share::ObLSID &dest_ls,
bool &is_same);
ObTransferTaskComment &result_comment);
int get_member_lists_by_inner_sql_(
common::ObISQLClient &sql_proxy,
const ObLSID &src_ls,
const ObLSID &dest_ls,
share::ObLSReplica::MemberList &src_ls_member_list,
share::ObLSReplica::MemberList &dest_ls_member_list);
int lock_table_and_part_(
ObMySQLTransaction &trans,
const share::ObLSID &src_ls,
@ -216,6 +222,10 @@ private:
const ObTabletID &tablet_id,
share::ObDisplayTabletList &table_lock_tablet_list);
int set_transaction_timeout_(common::ObTimeoutCtx &ctx);
int update_comment_for_expected_errors_(
const int err,
const ObTransferTaskID &task_id,
const ObTransferTaskComment &result_comment);
private:
static const int64_t IDLE_TIME_US = 10 * 1000 * 1000L; // 10s
static const int64_t BUSY_IDLE_TIME_US = 100 * 1000L; // 100ms

View File

@ -16,6 +16,7 @@
#include "share/config/ob_server_config.h" // for KR(), common::ob_error_name(x)
#include "share/ls/ob_ls_replica_filter.h" // ObLSReplicaFilter
#include "share/ob_share_util.h" // ObShareUtils
#include "share/ob_all_server_tracer.h" // SVR_TRACER
#include "lib/string/ob_sql_string.h" // ObSqlString
#include "lib/utility/utility.h" // split_on()
@ -324,6 +325,7 @@ bool ObLSReplica::learner_list_is_equal(const common::GlobalLearnerList &a, cons
return is_equal;
}
// both server and timestamp of member need to be equal
bool ObLSReplica::member_list_is_equal(const MemberList &a, const MemberList &b)
{
bool is_equal = true;
@ -377,6 +379,24 @@ bool ObLSReplica::servers_in_member_list_are_same(const MemberList &a, const Mem
return is_same;
}
int ObLSReplica::check_all_servers_in_member_list_are_active(
const MemberList &member_list,
bool &all_active)
{
int ret = OB_SUCCESS;
all_active = true;
ARRAY_FOREACH_X(member_list, idx, cnt, OB_SUCC(ret) && all_active) {
const ObAddr &server = member_list.at(idx).get_server();
if (OB_FAIL(SVR_TRACER.check_server_alive(server, all_active))) {
all_active = false;
LOG_WARN("check server alive failed", KR(ret), K(server), K(all_active), K(member_list));
} else if (!all_active) {
LOG_WARN("server in member_list is inactive", KR(ret), K(server), K(member_list));
}
}
return ret;
}
int64_t ObLSReplica::to_string(char *buf, const int64_t buf_len) const
{
int64_t pos = 0;

View File

@ -83,6 +83,24 @@ class ObLSReplica
public:
static const int64_t DEFAULT_REPLICA_COUNT = 7;
typedef common::ObSEArray<SimpleMember, DEFAULT_REPLICA_COUNT, ObNullAllocator> MemberList;
/*---------------------- MemberList related functions begin -----------------------*/
// format-related functions
static int member_list2text(const MemberList &member_list, ObSqlString &text);
static int text2learner_list(const char *text, GlobalLearnerList &learner_list);
static int text2member_list(const char *text, MemberList &member_list);
// transform ObMemberList into MemberList
static int transform_ob_member_list(
const common::ObMemberList &ob_member_list,
MemberList &member_list);
static bool member_list_is_equal(const MemberList &a, const MemberList &b);
static bool server_is_in_member_list(
const MemberList &member_list,
const common::ObAddr &server);
static bool servers_in_member_list_are_same(const MemberList &a, const MemberList &b);
static int check_all_servers_in_member_list_are_active(
const MemberList &member_list,
bool &all_acitve);
/*---------------------- MemberList related functions end -------------------------*/
// initial-related functions
ObLSReplica();
@ -118,19 +136,6 @@ public:
|| common::REPLICA_TYPE_FULL == replica_type_
|| common::REPLICA_TYPE_LOGONLY == replica_type_; }
inline bool is_in_restore() const { return !restore_status_.is_restore_none(); }
// format-related functions
static int member_list2text(const MemberList &member_list, ObSqlString &text);
static int text2learner_list(const char *text, GlobalLearnerList &learner_list);
static int text2member_list(const char *text, MemberList &member_list);
// transform ObMemberList into MemberList
static int transform_ob_member_list(
const common::ObMemberList &ob_member_list,
MemberList &member_list);
static bool member_list_is_equal(const MemberList &a, const MemberList &b);
static bool server_is_in_member_list(
const MemberList &member_list,
const common::ObAddr &server);
static bool servers_in_member_list_are_same(const MemberList &a, const MemberList &b);
int64_t to_string(char *buf, const int64_t buf_len) const;
// operator-related functions
int assign(const ObLSReplica &other);

View File

@ -40,9 +40,10 @@ public:
~ObTenantTabletToLSMap() {}
int init(const int64_t bucket_num = 4096,
const lib::ObLabel label = lib::ObLabel("TenantTabletToLSMap"))
const lib::ObLabel label = lib::ObLabel("TenantTabletToLSMap"),
const uint64_t tenant_id = OB_SERVER_TENANT_ID)
{
return map_.create(bucket_num, label);
return map_.create(bucket_num, label, ObModIds::OB_HASH_NODE, tenant_id);
}
void destroy() { map_.destroy(); }

View File

@ -313,6 +313,7 @@ static const char* TRANSFER_TASK_COMMENT_ARRAY[] =
"Task completed as no valid partition",
"Task canceled",
"Unable to process task due to transaction timeout",
"Unable to process task due to inactive server in member list",
"Unknow"/*MAX_COMMENT*/
};

View File

@ -213,6 +213,7 @@ enum ObTransferTaskComment
TASK_COMPLETED_AS_NO_VALID_PARTITION = 2,
TASK_CANCELED = 3,
TRANSACTION_TIMEOUT = 4,
INACTIVE_SERVER_IN_MEMBER_LIST = 5,
MAX_COMMENT
};