check all members are active for transfer

This commit is contained in:
ZhenNan0
2023-09-09 12:46:41 +00:00
committed by ob-robot
parent 32b49bc8df
commit 56bc4e6bc6
8 changed files with 178 additions and 65 deletions

View File

@ -69,7 +69,7 @@ int ObAllBalanceGroupBuilder::init(const int64_t tenant_id,
} else if (OB_UNLIKELY(OB_INVALID_TENANT_ID == tenant_id) || OB_ISNULL(mod)) { } else if (OB_UNLIKELY(OB_INVALID_TENANT_ID == tenant_id) || OB_ISNULL(mod)) {
ret = OB_INVALID_ARGUMENT; ret = OB_INVALID_ARGUMENT;
LOG_WARN("invalid argument", KR(ret), K(tenant_id), K(mod)); LOG_WARN("invalid argument", KR(ret), K(tenant_id), K(mod));
} else if (OB_FAIL(tablet_to_ls_.init(MAP_BUCKET_NUM, lib::ObLabel("TabletToLS")))) { } else if (OB_FAIL(tablet_to_ls_.init(MAP_BUCKET_NUM, lib::ObLabel("TabletToLS"), tenant_id))) {
LOG_WARN("create map for tablet to LS fail", KR(ret), K(tenant_id)); LOG_WARN("create map for tablet to LS fail", KR(ret), K(tenant_id));
} else if (OB_FAIL(tablet_data_size_.create(MAP_BUCKET_NUM, lib::ObLabel("TabletSizeMap")))) { } else if (OB_FAIL(tablet_data_size_.create(MAP_BUCKET_NUM, lib::ObLabel("TabletSizeMap")))) {
LOG_WARN("create map for tablet data size fail", KR(ret), K(tenant_id)); LOG_WARN("create map for tablet data size fail", KR(ret), K(tenant_id));

View File

@ -177,10 +177,10 @@ int ObTenantTransferService::process_task_(const ObTransferTask::TaskStatus &tas
int ObTenantTransferService::process_init_task_(const ObTransferTaskID task_id) int ObTenantTransferService::process_init_task_(const ObTransferTaskID task_id)
{ {
int ret = OB_SUCCESS; int ret = OB_SUCCESS;
int tmp_ret = OB_SUCCESS;
ObTransferTask task; ObTransferTask task;
ObMySQLTransaction trans; ObMySQLTransaction trans;
bool member_list_is_same = false; ObTransferTaskComment result_comment = EMPTY_COMMENT;
bool update_comment_to_wait_for_member_list = false;
ObArray<ObTabletID> tablet_ids; ObArray<ObTabletID> tablet_ids;
ObTableLockOwnerID lock_owner_id; ObTableLockOwnerID lock_owner_id;
ObTransferPartList not_exist_part_list; ObTransferPartList not_exist_part_list;
@ -219,15 +219,12 @@ int ObTenantTransferService::process_init_task_(const ObTransferTaskID task_id)
*sql_proxy_, *sql_proxy_,
task.get_src_ls(), task.get_src_ls(),
task.get_dest_ls(), task.get_dest_ls(),
member_list_is_same))) { // can't use trans result_comment))) { // can't use trans
LOG_WARN("fail to check ls member_list", KR(ret), K(task), K(member_list_is_same)); LOG_WARN("fail to check ls member_list", KR(ret), K(task));
} else if (!member_list_is_same) { } else if (EMPTY_COMMENT != result_comment) {
ret = OB_NEED_RETRY; ret = OB_NEED_RETRY;
TTS_INFO("member_lists of src_ls and dest_ls are not same, need retry", TTS_INFO("member_lists of src_ls and dest_ls are not same or there has inacitve server in member_list, need retry",
KR(ret), K_(tenant_id), K(member_list_is_same), K(task)); KR(ret), K_(tenant_id), K(task), "result_comment", transfer_task_comment_to_str(result_comment));
if (task.get_comment() != ObTransferTaskComment::WAIT_FOR_MEMBER_LIST) {
update_comment_to_wait_for_member_list = true;
}
} else if (OB_FAIL(lock_table_and_part_( } else if (OB_FAIL(lock_table_and_part_(
trans, trans,
task.get_src_ls(), task.get_src_ls(),
@ -289,22 +286,10 @@ int ObTenantTransferService::process_init_task_(const ObTransferTaskID task_id)
} }
} }
if ((OB_NEED_RETRY == ret && update_comment_to_wait_for_member_list) // update comments for expected error codes
|| OB_TRANS_TIMEOUT == ret || OB_TIMEOUT == ret) { if (OB_TMP_FAIL(update_comment_for_expected_errors_(ret, task_id, result_comment))) {
ObTimeoutCtx ctx_comment; LOG_WARN("update comment for expected errors failed", KR(tmp_ret), KR(ret),
int tmp_ret = OB_SUCCESS; K_(tenant_id), K(task_id), "result_comment", transfer_task_comment_to_str(result_comment));
ObTransferTaskComment comment = (OB_NEED_RETRY == ret)
? ObTransferTaskComment::WAIT_FOR_MEMBER_LIST
: ObTransferTaskComment::TRANSACTION_TIMEOUT;
if (OB_TMP_FAIL(ctx_comment.set_timeout(2000000/*2s*/))) { // overwrite timeout
LOG_WARN("set default timeout ctx failed", KR(tmp_ret), K(ctx_comment), K_(tenant_id), K(task_id));
} else if (OB_TMP_FAIL(ObTransferTaskOperator::update_comment(
*sql_proxy_,
tenant_id_,
task_id,
comment))) {
LOG_WARN("update comment failed", KR(tmp_ret), K_(tenant_id), K(task_id), K(comment));
}
} }
if (OB_SUCC(ret)) { if (OB_SUCC(ret)) {
@ -322,24 +307,87 @@ int ObTenantTransferService::process_init_task_(const ObTransferTaskID task_id)
ERRSIM_POINT_DEF(EN_TENANT_TRANSFER_CHECK_LS_MEMBER_LIST_NOT_SAME); ERRSIM_POINT_DEF(EN_TENANT_TRANSFER_CHECK_LS_MEMBER_LIST_NOT_SAME);
// 1. check that the leader member_lists of src_ls and dest_ls are the same
// 2. if the member_lists are the same, check that all servers in the member_list are active
int ObTenantTransferService::check_ls_member_list_( int ObTenantTransferService::check_ls_member_list_(
common::ObISQLClient &sql_proxy, common::ObISQLClient &sql_proxy,
const ObLSID &src_ls, const ObLSID &src_ls,
const ObLSID &dest_ls, const ObLSID &dest_ls,
bool &is_same) ObTransferTaskComment &result_comment)
{ {
int ret = OB_SUCCESS; int ret = OB_SUCCESS;
result_comment = EMPTY_COMMENT;
bool all_members_are_active = false;
ObLSReplica::MemberList src_ls_member_list;
ObLSReplica::MemberList dest_ls_member_list;
if (IS_NOT_INIT) { if (IS_NOT_INIT) {
ret = OB_NOT_INIT; ret = OB_NOT_INIT;
LOG_WARN("not init", KR(ret)); LOG_WARN("not init", KR(ret));
} else if (OB_FAIL(get_member_lists_by_inner_sql_(
sql_proxy,
src_ls,
dest_ls,
src_ls_member_list,
dest_ls_member_list))) {
LOG_WARN("get member list by inner sql failed", KR(ret), K(src_ls), K(dest_ls));
} else if (!ObLSReplica::servers_in_member_list_are_same(
src_ls_member_list,
dest_ls_member_list)) {
// result 1: member_lists are not same
result_comment = WAIT_FOR_MEMBER_LIST;
LOG_WARN("member_list of src_ls and dest_ls are not same", KR(ret), K_(tenant_id), K(src_ls),
K(dest_ls), K(src_ls_member_list), K(dest_ls_member_list), K(result_comment));
} else if (OB_FAIL(ObLSReplica::check_all_servers_in_member_list_are_active(
src_ls_member_list,
all_members_are_active))) {
LOG_WARN("check all servers in member list are active failed",
KR(ret), K(src_ls_member_list), K(all_members_are_active));
} else if (!all_members_are_active) {
// result 2: member_lists are same, but server in member_list is inactive
result_comment = INACTIVE_SERVER_IN_MEMBER_LIST;
LOG_WARN("member_list has inactive server", KR(ret), K(src_ls),
K(src_ls_member_list), K(all_members_are_active), K(result_comment));
} else {
// result 3: member_lists are same && all members are active
result_comment = EMPTY_COMMENT;
TTS_INFO("member_lists of src_ls and dest_ls are same and all members are acitve",
KR(ret), K_(tenant_id), K(src_ls), K(dest_ls), K(all_members_are_active),
K(src_ls_member_list), K(dest_ls_member_list), K(result_comment));
}
// just for debug
if (OB_FAIL(ret)) {
} else if (OB_IN_STOP_STATE == EN_TENANT_TRANSFER_CHECK_LS_MEMBER_LIST_NOT_SAME) {
result_comment = INACTIVE_SERVER_IN_MEMBER_LIST;
TTS_INFO("errsim tenant transfer check ls member list with inactive server", K(result_comment));
} else if (OB_STATE_NOT_MATCH == EN_TENANT_TRANSFER_CHECK_LS_MEMBER_LIST_NOT_SAME) {
result_comment = WAIT_FOR_MEMBER_LIST;
TTS_INFO("errsim tenant transfer check ls member list not same", K(result_comment));
}
return ret;
}
// get ls leader member list of src_ls and dest_ls
int ObTenantTransferService::get_member_lists_by_inner_sql_(
common::ObISQLClient &sql_proxy,
const ObLSID &src_ls,
const ObLSID &dest_ls,
ObLSReplica::MemberList &src_ls_member_list,
ObLSReplica::MemberList &dest_ls_member_list)
{
int ret = OB_SUCCESS;
src_ls_member_list.reset();
dest_ls_member_list.reset();
if (IS_NOT_INIT) {
ret = OB_NOT_INIT;
LOG_WARN("not init", KR(ret));
} else if (!src_ls.is_valid() || !dest_ls.is_valid()) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("invalid args", KR(ret), K(src_ls), K(dest_ls));
} else { } else {
is_same = false;
SMART_VAR(ObISQLClient::ReadResult, result) { SMART_VAR(ObISQLClient::ReadResult, result) {
ObSqlString sql; ObSqlString sql;
ObString src_ls_member_list_str; ObString src_ls_member_list_str;
ObString dest_ls_member_list_str; ObString dest_ls_member_list_str;
ObLSReplica::MemberList src_ls_member_list;
ObLSReplica::MemberList dest_ls_member_list;
common::sqlclient::ObMySQLResult *res = NULL; common::sqlclient::ObMySQLResult *res = NULL;
if (OB_FAIL(sql.assign_fmt( if (OB_FAIL(sql.assign_fmt(
"SELECT PAXOS_MEMBER_LIST FROM %s WHERE TENANT_ID = %lu AND ROLE = 'LEADER'" "SELECT PAXOS_MEMBER_LIST FROM %s WHERE TENANT_ID = %lu AND ROLE = 'LEADER'"
@ -372,12 +420,8 @@ int ObTenantTransferService::check_ls_member_list_(
to_cstring(dest_ls_member_list_str), to_cstring(dest_ls_member_list_str),
dest_ls_member_list))) { dest_ls_member_list))) {
LOG_WARN("text2member_list failed", KR(ret), K_(tenant_id), K(dest_ls), K(dest_ls_member_list_str)); LOG_WARN("text2member_list failed", KR(ret), K_(tenant_id), K(dest_ls), K(dest_ls_member_list_str));
} else if (ObLSReplica::servers_in_member_list_are_same(src_ls_member_list, dest_ls_member_list)) {
is_same = true;
} else {
is_same = false;
} }
// double check sql result
if (OB_FAIL(ret)) { if (OB_FAIL(ret)) {
if (OB_UNLIKELY(OB_ITER_END == ret)) { // read less than two rows if (OB_UNLIKELY(OB_ITER_END == ret)) { // read less than two rows
ret = OB_LEADER_NOT_EXIST; ret = OB_LEADER_NOT_EXIST;
@ -396,22 +440,9 @@ int ObTenantTransferService::check_ls_member_list_(
K(sql), K(src_ls_member_list_str), K(dest_ls_member_list_str)); K(sql), K(src_ls_member_list_str), K(dest_ls_member_list_str));
} else { } else {
ret = OB_SUCCESS; ret = OB_SUCCESS;
if (is_same) {
LOG_INFO("member_list of src_ls and dest_ls are same", KR(ret), K_(tenant_id), K(src_ls),
K(dest_ls), K(is_same), K(src_ls_member_list), K(dest_ls_member_list));
} else {
LOG_WARN("member_list of src_ls and dest_ls are not same", KR(ret), K_(tenant_id), K(src_ls),
K(dest_ls), K(is_same), K(src_ls_member_list), K(dest_ls_member_list));
}
} }
} // end SMART_VAR } // end SMART_VAR
} }
if (OB_SUCC(ret)) {
if (EN_TENANT_TRANSFER_CHECK_LS_MEMBER_LIST_NOT_SAME) {
is_same = false;
TTS_INFO("errsim tenant transfer check ls member list not same", K(is_same));
}
}
return ret; return ret;
} }
@ -1566,6 +1597,50 @@ int ObTenantTransferService::set_transaction_timeout_(common::ObTimeoutCtx &ctx)
return ret; return ret;
} }
// err --> comment
// OB_TRANS_TIMEOUT TRANSACTION_TIMEOUT
// OB_TIMEOUT TRANSACTION_TIMEOUT
// OB_NEED_RETRY WAIT_FOR_MEMBER_LIST/INACTIVE_SERVER_IN_MEMBER_LIST
int ObTenantTransferService::update_comment_for_expected_errors_(
const int err,
const ObTransferTaskID &task_id,
const ObTransferTaskComment &result_comment)
{
int ret = OB_SUCCESS;
ObTransferTaskComment actual_comment = EMPTY_COMMENT;
ObTimeoutCtx ctx;
if (IS_NOT_INIT || OB_ISNULL(sql_proxy_)) {
ret = OB_NOT_INIT;
LOG_WARN("not init", KR(ret));
} else if (OB_SUCCESS == err) {
// skip
} else if (OB_UNLIKELY(!task_id.is_valid())) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("invalid task_id", KR(ret), K(task_id));
} else if (OB_TRANS_TIMEOUT == err || OB_TIMEOUT == err) {
actual_comment = TRANSACTION_TIMEOUT;
} else if (OB_NEED_RETRY == err) {
if (WAIT_FOR_MEMBER_LIST != result_comment && INACTIVE_SERVER_IN_MEMBER_LIST != result_comment) {
ret = OB_ERR_UNEXPECTED;
LOG_WARN("unexpected comment with err", KR(ret), K(err), K(result_comment));
} else {
actual_comment = result_comment;
}
}
if (OB_FAIL(ret) || EMPTY_COMMENT == actual_comment) {
// do nothing
} else if (OB_FAIL(ctx.set_timeout(GCONF.internal_sql_execute_timeout))) { // overwrite timeout
LOG_WARN("set default timeout ctx failed", KR(ret), K(ctx), K_(tenant_id), K(task_id));
} else if (OB_FAIL(ObTransferTaskOperator::update_comment(
*sql_proxy_,
tenant_id_,
task_id,
actual_comment))) {
LOG_WARN("update comment failed", KR(ret), K_(tenant_id), K(task_id), K(actual_comment));
}
return ret;
}
#undef TTS_INFO #undef TTS_INFO
} // end namespace rootserver } // end namespace rootserver
} // end namespace oceanbase } // end namespace oceanbase

View File

@ -122,7 +122,13 @@ private:
common::ObISQLClient &sql_proxy, common::ObISQLClient &sql_proxy,
const share::ObLSID &src_ls, const share::ObLSID &src_ls,
const share::ObLSID &dest_ls, const share::ObLSID &dest_ls,
bool &is_same); ObTransferTaskComment &result_comment);
// Fetch the member lists of the leader replicas of src_ls and dest_ls via inner SQL
// (PAXOS_MEMBER_LIST of the rows with ROLE = 'LEADER').
// Both output lists are reset before the query is issued.
// @param [in]  sql_proxy            client used to run the inner SQL
// @param [in]  src_ls               source log stream, must be valid
// @param [in]  dest_ls              destination log stream, must be valid
// @param [out] src_ls_member_list   member list of the src_ls leader
// @param [out] dest_ls_member_list  member list of the dest_ls leader
// @return OB_NOT_INIT / OB_INVALID_ARGUMENT on bad state or arguments,
//         OB_LEADER_NOT_EXIST when fewer than two leader rows are read
int get_member_lists_by_inner_sql_(
    common::ObISQLClient &sql_proxy,
    const share::ObLSID &src_ls,
    const share::ObLSID &dest_ls,
    share::ObLSReplica::MemberList &src_ls_member_list,
    share::ObLSReplica::MemberList &dest_ls_member_list);
int lock_table_and_part_( int lock_table_and_part_(
ObMySQLTransaction &trans, ObMySQLTransaction &trans,
const share::ObLSID &src_ls, const share::ObLSID &src_ls,
@ -216,6 +222,10 @@ private:
const ObTabletID &tablet_id, const ObTabletID &tablet_id,
share::ObDisplayTabletList &table_lock_tablet_list); share::ObDisplayTabletList &table_lock_tablet_list);
int set_transaction_timeout_(common::ObTimeoutCtx &ctx); int set_transaction_timeout_(common::ObTimeoutCtx &ctx);
// Write a task comment for an expected error code:
//   OB_TRANS_TIMEOUT / OB_TIMEOUT -> TRANSACTION_TIMEOUT
//   OB_NEED_RETRY                 -> result_comment (WAIT_FOR_MEMBER_LIST or
//                                    INACTIVE_SERVER_IN_MEMBER_LIST)
// Nothing is written when err is OB_SUCCESS or any other error code.
// @param [in] err             error code produced while processing the task
// @param [in] task_id         task whose comment is updated
// @param [in] result_comment  comment reported by the member list check
int update_comment_for_expected_errors_(
const int err,
const ObTransferTaskID &task_id,
const ObTransferTaskComment &result_comment);
private: private:
static const int64_t IDLE_TIME_US = 10 * 1000 * 1000L; // 10s static const int64_t IDLE_TIME_US = 10 * 1000 * 1000L; // 10s
static const int64_t BUSY_IDLE_TIME_US = 100 * 1000L; // 100ms static const int64_t BUSY_IDLE_TIME_US = 100 * 1000L; // 100ms

View File

@ -16,6 +16,7 @@
#include "share/config/ob_server_config.h" // for KR(), common::ob_error_name(x) #include "share/config/ob_server_config.h" // for KR(), common::ob_error_name(x)
#include "share/ls/ob_ls_replica_filter.h" // ObLSReplicaFilter #include "share/ls/ob_ls_replica_filter.h" // ObLSReplicaFilter
#include "share/ob_share_util.h" // ObShareUtils #include "share/ob_share_util.h" // ObShareUtils
#include "share/ob_all_server_tracer.h" // SVR_TRACER
#include "lib/string/ob_sql_string.h" // ObSqlString #include "lib/string/ob_sql_string.h" // ObSqlString
#include "lib/utility/utility.h" // split_on() #include "lib/utility/utility.h" // split_on()
@ -324,6 +325,7 @@ bool ObLSReplica::learner_list_is_equal(const common::GlobalLearnerList &a, cons
return is_equal; return is_equal;
} }
// both server and timestamp of member need to be equal
bool ObLSReplica::member_list_is_equal(const MemberList &a, const MemberList &b) bool ObLSReplica::member_list_is_equal(const MemberList &a, const MemberList &b)
{ {
bool is_equal = true; bool is_equal = true;
@ -377,6 +379,24 @@ bool ObLSReplica::servers_in_member_list_are_same(const MemberList &a, const Mem
return is_same; return is_same;
} }
int ObLSReplica::check_all_servers_in_member_list_are_active(
const MemberList &member_list,
bool &all_active)
{
int ret = OB_SUCCESS;
all_active = true;
ARRAY_FOREACH_X(member_list, idx, cnt, OB_SUCC(ret) && all_active) {
const ObAddr &server = member_list.at(idx).get_server();
if (OB_FAIL(SVR_TRACER.check_server_alive(server, all_active))) {
all_active = false;
LOG_WARN("check server alive failed", KR(ret), K(server), K(all_active), K(member_list));
} else if (!all_active) {
LOG_WARN("server in member_list is inactive", KR(ret), K(server), K(member_list));
}
}
return ret;
}
int64_t ObLSReplica::to_string(char *buf, const int64_t buf_len) const int64_t ObLSReplica::to_string(char *buf, const int64_t buf_len) const
{ {
int64_t pos = 0; int64_t pos = 0;

View File

@ -83,6 +83,24 @@ class ObLSReplica
public: public:
static const int64_t DEFAULT_REPLICA_COUNT = 7; static const int64_t DEFAULT_REPLICA_COUNT = 7;
typedef common::ObSEArray<SimpleMember, DEFAULT_REPLICA_COUNT, ObNullAllocator> MemberList; typedef common::ObSEArray<SimpleMember, DEFAULT_REPLICA_COUNT, ObNullAllocator> MemberList;
/*---------------------- MemberList related functions begin -----------------------*/
// format-related functions
static int member_list2text(const MemberList &member_list, ObSqlString &text);
static int text2learner_list(const char *text, GlobalLearnerList &learner_list);
static int text2member_list(const char *text, MemberList &member_list);
// transform ObMemberList into MemberList
static int transform_ob_member_list(
const common::ObMemberList &ob_member_list,
MemberList &member_list);
static bool member_list_is_equal(const MemberList &a, const MemberList &b);
static bool server_is_in_member_list(
const MemberList &member_list,
const common::ObAddr &server);
static bool servers_in_member_list_are_same(const MemberList &a, const MemberList &b);
// Check whether all servers in member_list are alive.
// @param [in]  member_list  members whose servers are checked
// @param [out] all_active   true if every checked server is alive; false if any server
//                           is inactive or its liveness check fails
// @return OB_SUCCESS, or the error code of the failed liveness check
static int check_all_servers_in_member_list_are_active(
    const MemberList &member_list,
    bool &all_active);
/*---------------------- MemberList related functions end -------------------------*/
// initial-related functions // initial-related functions
ObLSReplica(); ObLSReplica();
@ -118,19 +136,6 @@ public:
|| common::REPLICA_TYPE_FULL == replica_type_ || common::REPLICA_TYPE_FULL == replica_type_
|| common::REPLICA_TYPE_LOGONLY == replica_type_; } || common::REPLICA_TYPE_LOGONLY == replica_type_; }
inline bool is_in_restore() const { return !restore_status_.is_restore_none(); } inline bool is_in_restore() const { return !restore_status_.is_restore_none(); }
// format-related functions
static int member_list2text(const MemberList &member_list, ObSqlString &text);
static int text2learner_list(const char *text, GlobalLearnerList &learner_list);
static int text2member_list(const char *text, MemberList &member_list);
// transform ObMemberList into MemberList
static int transform_ob_member_list(
const common::ObMemberList &ob_member_list,
MemberList &member_list);
static bool member_list_is_equal(const MemberList &a, const MemberList &b);
static bool server_is_in_member_list(
const MemberList &member_list,
const common::ObAddr &server);
static bool servers_in_member_list_are_same(const MemberList &a, const MemberList &b);
int64_t to_string(char *buf, const int64_t buf_len) const; int64_t to_string(char *buf, const int64_t buf_len) const;
// operator-related functions // operator-related functions
int assign(const ObLSReplica &other); int assign(const ObLSReplica &other);

View File

@ -40,9 +40,10 @@ public:
~ObTenantTabletToLSMap() {} ~ObTenantTabletToLSMap() {}
int init(const int64_t bucket_num = 4096, int init(const int64_t bucket_num = 4096,
const lib::ObLabel label = lib::ObLabel("TenantTabletToLSMap")) const lib::ObLabel label = lib::ObLabel("TenantTabletToLSMap"),
const uint64_t tenant_id = OB_SERVER_TENANT_ID)
{ {
return map_.create(bucket_num, label); return map_.create(bucket_num, label, ObModIds::OB_HASH_NODE, tenant_id);
} }
void destroy() { map_.destroy(); } void destroy() { map_.destroy(); }

View File

@ -313,6 +313,7 @@ static const char* TRANSFER_TASK_COMMENT_ARRAY[] =
"Task completed as no valid partition", "Task completed as no valid partition",
"Task canceled", "Task canceled",
"Unable to process task due to transaction timeout", "Unable to process task due to transaction timeout",
"Unable to process task due to inactive server in member list",
"Unknow"/*MAX_COMMENT*/ "Unknow"/*MAX_COMMENT*/
}; };

View File

@ -213,6 +213,7 @@ enum ObTransferTaskComment
TASK_COMPLETED_AS_NO_VALID_PARTITION = 2, TASK_COMPLETED_AS_NO_VALID_PARTITION = 2,
TASK_CANCELED = 3, TASK_CANCELED = 3,
TRANSACTION_TIMEOUT = 4, TRANSACTION_TIMEOUT = 4,
INACTIVE_SERVER_IN_MEMBER_LIST = 5,
MAX_COMMENT MAX_COMMENT
}; };