diff --git a/src/rootserver/balance/ob_all_balance_group_builder.cpp b/src/rootserver/balance/ob_all_balance_group_builder.cpp index 73851135d9..c2c1356c1a 100644 --- a/src/rootserver/balance/ob_all_balance_group_builder.cpp +++ b/src/rootserver/balance/ob_all_balance_group_builder.cpp @@ -69,7 +69,7 @@ int ObAllBalanceGroupBuilder::init(const int64_t tenant_id, } else if (OB_UNLIKELY(OB_INVALID_TENANT_ID == tenant_id) || OB_ISNULL(mod)) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument", KR(ret), K(tenant_id), K(mod)); - } else if (OB_FAIL(tablet_to_ls_.init(MAP_BUCKET_NUM, lib::ObLabel("TabletToLS")))) { + } else if (OB_FAIL(tablet_to_ls_.init(MAP_BUCKET_NUM, lib::ObLabel("TabletToLS"), tenant_id))) { LOG_WARN("create map for tablet to LS fail", KR(ret), K(tenant_id)); } else if (OB_FAIL(tablet_data_size_.create(MAP_BUCKET_NUM, lib::ObLabel("TabletSizeMap")))) { LOG_WARN("create map for tablet data size fail", KR(ret), K(tenant_id)); diff --git a/src/rootserver/ob_tenant_transfer_service.cpp b/src/rootserver/ob_tenant_transfer_service.cpp index f073386f9f..6b7534b03e 100644 --- a/src/rootserver/ob_tenant_transfer_service.cpp +++ b/src/rootserver/ob_tenant_transfer_service.cpp @@ -177,10 +177,10 @@ int ObTenantTransferService::process_task_(const ObTransferTask::TaskStatus &tas int ObTenantTransferService::process_init_task_(const ObTransferTaskID task_id) { int ret = OB_SUCCESS; + int tmp_ret = OB_SUCCESS; ObTransferTask task; ObMySQLTransaction trans; - bool member_list_is_same = false; - bool update_comment_to_wait_for_member_list = false; + ObTransferTaskComment result_comment = EMPTY_COMMENT; ObArray tablet_ids; ObTableLockOwnerID lock_owner_id; ObTransferPartList not_exist_part_list; @@ -219,15 +219,12 @@ int ObTenantTransferService::process_init_task_(const ObTransferTaskID task_id) *sql_proxy_, task.get_src_ls(), task.get_dest_ls(), - member_list_is_same))) { // can't use trans - LOG_WARN("fail to check ls member_list", KR(ret), K(task), 
K(member_list_is_same)); - } else if (!member_list_is_same) { + result_comment))) { // can't use trans + LOG_WARN("fail to check ls member_list", KR(ret), K(task)); + } else if (EMPTY_COMMENT != result_comment) { ret = OB_NEED_RETRY; - TTS_INFO("member_lists of src_ls and dest_ls are not same, need retry", - KR(ret), K_(tenant_id), K(member_list_is_same), K(task)); - if (task.get_comment() != ObTransferTaskComment::WAIT_FOR_MEMBER_LIST) { - update_comment_to_wait_for_member_list = true; - } + TTS_INFO("member_lists of src_ls and dest_ls are not same or there has inacitve server in member_list, need retry", + KR(ret), K_(tenant_id), K(task), "result_comment", transfer_task_comment_to_str(result_comment)); } else if (OB_FAIL(lock_table_and_part_( trans, task.get_src_ls(), @@ -289,22 +286,10 @@ int ObTenantTransferService::process_init_task_(const ObTransferTaskID task_id) } } - if ((OB_NEED_RETRY == ret && update_comment_to_wait_for_member_list) - || OB_TRANS_TIMEOUT == ret || OB_TIMEOUT == ret) { - ObTimeoutCtx ctx_comment; - int tmp_ret = OB_SUCCESS; - ObTransferTaskComment comment = (OB_NEED_RETRY == ret) - ? 
ObTransferTaskComment::WAIT_FOR_MEMBER_LIST - : ObTransferTaskComment::TRANSACTION_TIMEOUT; - if (OB_TMP_FAIL(ctx_comment.set_timeout(2000000/*2s*/))) { // overwrite timeout - LOG_WARN("set default timeout ctx failed", KR(tmp_ret), K(ctx_comment), K_(tenant_id), K(task_id)); - } else if (OB_TMP_FAIL(ObTransferTaskOperator::update_comment( - *sql_proxy_, - tenant_id_, - task_id, - comment))) { - LOG_WARN("update comment failed", KR(tmp_ret), K_(tenant_id), K(task_id), K(comment)); - } + // update comments for expected error codes + if (OB_TMP_FAIL(update_comment_for_expected_errors_(ret, task_id, result_comment))) { + LOG_WARN("update comment for expected errors failed", KR(tmp_ret), KR(ret), + K_(tenant_id), K(task_id), "result_comment", transfer_task_comment_to_str(result_comment)); } if (OB_SUCC(ret)) { @@ -322,24 +307,87 @@ int ObTenantTransferService::process_init_task_(const ObTransferTaskID task_id) ERRSIM_POINT_DEF(EN_TENANT_TRANSFER_CHECK_LS_MEMBER_LIST_NOT_SAME); +// 1.check leader member_lists of src_ls and dest_ls are same +// 2.if member_lists are same, check that all servers in member_list are active int ObTenantTransferService::check_ls_member_list_( common::ObISQLClient &sql_proxy, const ObLSID &src_ls, const ObLSID &dest_ls, - bool &is_same) + ObTransferTaskComment &result_comment) { int ret = OB_SUCCESS; + result_comment = EMPTY_COMMENT; + bool all_members_are_active = false; + ObLSReplica::MemberList src_ls_member_list; + ObLSReplica::MemberList dest_ls_member_list; if (IS_NOT_INIT) { ret = OB_NOT_INIT; LOG_WARN("not init", KR(ret)); + } else if (OB_FAIL(get_member_lists_by_inner_sql_( + sql_proxy, + src_ls, + dest_ls, + src_ls_member_list, + dest_ls_member_list))) { + LOG_WARN("get member list by inner sql failed", KR(ret), K(src_ls), K(dest_ls)); + } else if (!ObLSReplica::servers_in_member_list_are_same( + src_ls_member_list, + dest_ls_member_list)) { + // result 1: member_lists are not same + result_comment = WAIT_FOR_MEMBER_LIST; + 
LOG_WARN("member_list of src_ls and dest_ls are not same", KR(ret), K_(tenant_id), K(src_ls), + K(dest_ls), K(src_ls_member_list), K(dest_ls_member_list), K(result_comment)); + } else if (OB_FAIL(ObLSReplica::check_all_servers_in_member_list_are_active( + src_ls_member_list, + all_members_are_active))) { + LOG_WARN("check all servers in member list are active failed", + KR(ret), K(src_ls_member_list), K(all_members_are_active)); + } else if (!all_members_are_active) { + // result 2: member_lists are same, but server in member_list is inactive + result_comment = INACTIVE_SERVER_IN_MEMBER_LIST; + LOG_WARN("member_list has inactive server", KR(ret), K(src_ls), + K(src_ls_member_list), K(all_members_are_active), K(result_comment)); + } else { + // result 3: member_lists are same && all members are active + result_comment = EMPTY_COMMENT; + TTS_INFO("member_lists of src_ls and dest_ls are same and all members are acitve", + KR(ret), K_(tenant_id), K(src_ls), K(dest_ls), K(all_members_are_active), + K(src_ls_member_list), K(dest_ls_member_list), K(result_comment)); + } + // just for debug + if (OB_FAIL(ret)) { + } else if (OB_IN_STOP_STATE == EN_TENANT_TRANSFER_CHECK_LS_MEMBER_LIST_NOT_SAME) { + result_comment = INACTIVE_SERVER_IN_MEMBER_LIST; + TTS_INFO("errsim tenant transfer check ls member list with inactive server", K(result_comment)); + } else if (OB_STATE_NOT_MATCH == EN_TENANT_TRANSFER_CHECK_LS_MEMBER_LIST_NOT_SAME) { + result_comment = WAIT_FOR_MEMBER_LIST; + TTS_INFO("errsim tenant transfer check ls member list not same", K(result_comment)); + } + return ret; +} + +// get ls leader member list of src_ls and dest_ls +int ObTenantTransferService::get_member_lists_by_inner_sql_( + common::ObISQLClient &sql_proxy, + const ObLSID &src_ls, + const ObLSID &dest_ls, + ObLSReplica::MemberList &src_ls_member_list, + ObLSReplica::MemberList &dest_ls_member_list) +{ + int ret = OB_SUCCESS; + src_ls_member_list.reset(); + dest_ls_member_list.reset(); + if (IS_NOT_INIT) { + 
ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret)); + } else if (!src_ls.is_valid() || !dest_ls.is_valid()) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid args", KR(ret), K(src_ls), K(dest_ls)); } else { - is_same = false; SMART_VAR(ObISQLClient::ReadResult, result) { ObSqlString sql; ObString src_ls_member_list_str; ObString dest_ls_member_list_str; - ObLSReplica::MemberList src_ls_member_list; - ObLSReplica::MemberList dest_ls_member_list; common::sqlclient::ObMySQLResult *res = NULL; if (OB_FAIL(sql.assign_fmt( "SELECT PAXOS_MEMBER_LIST FROM %s WHERE TENANT_ID = %lu AND ROLE = 'LEADER'" @@ -372,12 +420,8 @@ int ObTenantTransferService::check_ls_member_list_( to_cstring(dest_ls_member_list_str), dest_ls_member_list))) { LOG_WARN("text2member_list failed", KR(ret), K_(tenant_id), K(dest_ls), K(dest_ls_member_list_str)); - } else if (ObLSReplica::servers_in_member_list_are_same(src_ls_member_list, dest_ls_member_list)) { - is_same = true; - } else { - is_same = false; } - + // double check sql result if (OB_FAIL(ret)) { if (OB_UNLIKELY(OB_ITER_END == ret)) { // read less than two rows ret = OB_LEADER_NOT_EXIST; @@ -396,22 +440,9 @@ int ObTenantTransferService::check_ls_member_list_( K(sql), K(src_ls_member_list_str), K(dest_ls_member_list_str)); } else { ret = OB_SUCCESS; - if (is_same) { - LOG_INFO("member_list of src_ls and dest_ls are same", KR(ret), K_(tenant_id), K(src_ls), - K(dest_ls), K(is_same), K(src_ls_member_list), K(dest_ls_member_list)); - } else { - LOG_WARN("member_list of src_ls and dest_ls are not same", KR(ret), K_(tenant_id), K(src_ls), - K(dest_ls), K(is_same), K(src_ls_member_list), K(dest_ls_member_list)); - } } } // end SMART_VAR } - if (OB_SUCC(ret)) { - if (EN_TENANT_TRANSFER_CHECK_LS_MEMBER_LIST_NOT_SAME) { - is_same = false; - TTS_INFO("errsim tenant transfer check ls member list not same", K(is_same)); - } - } return ret; } @@ -1566,6 +1597,50 @@ int ObTenantTransferService::set_transaction_timeout_(common::ObTimeoutCtx &ctx) return 
ret; } +// err --> comment +// OB_TRANS_TIMEOUT TRANSACTION_TIMEOUT +// OB_TIMEOUT TRANSACTION_TIMEOUT +// OB_NEED_RETRY WAIT_FOR_MEMBER_LIST/INACTIVE_SERVER_IN_MEMBER_LIST +int ObTenantTransferService::update_comment_for_expected_errors_( + const int err, + const ObTransferTaskID &task_id, + const ObTransferTaskComment &result_comment) +{ + int ret = OB_SUCCESS; + ObTransferTaskComment actual_comment = EMPTY_COMMENT; + ObTimeoutCtx ctx; + if (IS_NOT_INIT || OB_ISNULL(sql_proxy_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret)); + } else if (OB_SUCCESS == err) { + // skip + } else if (OB_UNLIKELY(!task_id.is_valid())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid task_id", KR(ret), K(task_id)); + } else if (OB_TRANS_TIMEOUT == err || OB_TIMEOUT == err) { + actual_comment = TRANSACTION_TIMEOUT; + } else if (OB_NEED_RETRY == err) { + if (WAIT_FOR_MEMBER_LIST != result_comment && INACTIVE_SERVER_IN_MEMBER_LIST != result_comment) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("unexpected comment with err", KR(ret), K(err), K(result_comment)); + } else { + actual_comment = result_comment; + } + } + if (OB_FAIL(ret) || EMPTY_COMMENT == actual_comment) { + // do nothing + } else if (OB_FAIL(ctx.set_timeout(GCONF.internal_sql_execute_timeout))) { // overwrite timeout + LOG_WARN("set default timeout ctx failed", KR(ret), K(ctx), K_(tenant_id), K(task_id)); + } else if (OB_FAIL(ObTransferTaskOperator::update_comment( + *sql_proxy_, + tenant_id_, + task_id, + actual_comment))) { + LOG_WARN("update comment failed", KR(ret), K_(tenant_id), K(task_id), K(actual_comment)); + } + return ret; +} + #undef TTS_INFO } // end namespace rootserver } // end namespace oceanbase diff --git a/src/rootserver/ob_tenant_transfer_service.h b/src/rootserver/ob_tenant_transfer_service.h index 062deec6ac..9aa2380f06 100644 --- a/src/rootserver/ob_tenant_transfer_service.h +++ b/src/rootserver/ob_tenant_transfer_service.h @@ -122,7 +122,13 @@ private: common::ObISQLClient &sql_proxy, const 
share::ObLSID &src_ls, const share::ObLSID &dest_ls, - bool &is_same); + ObTransferTaskComment &result_comment); + int get_member_lists_by_inner_sql_( + common::ObISQLClient &sql_proxy, + const ObLSID &src_ls, + const ObLSID &dest_ls, + share::ObLSReplica::MemberList &src_ls_member_list, + share::ObLSReplica::MemberList &dest_ls_member_list); int lock_table_and_part_( ObMySQLTransaction &trans, const share::ObLSID &src_ls, @@ -216,6 +222,10 @@ private: const ObTabletID &tablet_id, share::ObDisplayTabletList &table_lock_tablet_list); int set_transaction_timeout_(common::ObTimeoutCtx &ctx); + int update_comment_for_expected_errors_( + const int err, + const ObTransferTaskID &task_id, + const ObTransferTaskComment &result_comment); private: static const int64_t IDLE_TIME_US = 10 * 1000 * 1000L; // 10s static const int64_t BUSY_IDLE_TIME_US = 100 * 1000L; // 100ms diff --git a/src/share/ls/ob_ls_info.cpp b/src/share/ls/ob_ls_info.cpp index 0aa6d4c3a5..20255d692b 100644 --- a/src/share/ls/ob_ls_info.cpp +++ b/src/share/ls/ob_ls_info.cpp @@ -16,6 +16,7 @@ #include "share/config/ob_server_config.h" // for KR(), common::ob_error_name(x) #include "share/ls/ob_ls_replica_filter.h" // ObLSReplicaFilter #include "share/ob_share_util.h" // ObShareUtils +#include "share/ob_all_server_tracer.h" // SVR_TRACER #include "lib/string/ob_sql_string.h" // ObSqlString #include "lib/utility/utility.h" // split_on() @@ -324,6 +325,7 @@ bool ObLSReplica::learner_list_is_equal(const common::GlobalLearnerList &a, cons return is_equal; } +// both server and timestamp of member need to be equal bool ObLSReplica::member_list_is_equal(const MemberList &a, const MemberList &b) { bool is_equal = true; @@ -377,6 +379,24 @@ bool ObLSReplica::servers_in_member_list_are_same(const MemberList &a, const Mem return is_same; } +int ObLSReplica::check_all_servers_in_member_list_are_active( + const MemberList &member_list, + bool &all_active) +{ + int ret = OB_SUCCESS; + all_active = true; + 
ARRAY_FOREACH_X(member_list, idx, cnt, OB_SUCC(ret) && all_active) { + const ObAddr &server = member_list.at(idx).get_server(); + if (OB_FAIL(SVR_TRACER.check_server_alive(server, all_active))) { + all_active = false; + LOG_WARN("check server alive failed", KR(ret), K(server), K(all_active), K(member_list)); + } else if (!all_active) { + LOG_WARN("server in member_list is inactive", KR(ret), K(server), K(member_list)); + } + } + return ret; +} + int64_t ObLSReplica::to_string(char *buf, const int64_t buf_len) const { int64_t pos = 0; diff --git a/src/share/ls/ob_ls_info.h b/src/share/ls/ob_ls_info.h index 999f1aa346..11344e7c85 100644 --- a/src/share/ls/ob_ls_info.h +++ b/src/share/ls/ob_ls_info.h @@ -83,6 +83,24 @@ class ObLSReplica public: static const int64_t DEFAULT_REPLICA_COUNT = 7; typedef common::ObSEArray MemberList; + /*---------------------- MemberList related functions begin -----------------------*/ + // format-related functions + static int member_list2text(const MemberList &member_list, ObSqlString &text); + static int text2learner_list(const char *text, GlobalLearnerList &learner_list); + static int text2member_list(const char *text, MemberList &member_list); + // transform ObMemberList into MemberList + static int transform_ob_member_list( + const common::ObMemberList &ob_member_list, + MemberList &member_list); + static bool member_list_is_equal(const MemberList &a, const MemberList &b); + static bool server_is_in_member_list( + const MemberList &member_list, + const common::ObAddr &server); + static bool servers_in_member_list_are_same(const MemberList &a, const MemberList &b); + static int check_all_servers_in_member_list_are_active( + const MemberList &member_list, + bool &all_acitve); + /*---------------------- MemberList related functions end -------------------------*/ // initial-related functions ObLSReplica(); @@ -118,19 +136,6 @@ public: || common::REPLICA_TYPE_FULL == replica_type_ || common::REPLICA_TYPE_LOGONLY == replica_type_; } 
inline bool is_in_restore() const { return !restore_status_.is_restore_none(); } - // format-related functions - static int member_list2text(const MemberList &member_list, ObSqlString &text); - static int text2learner_list(const char *text, GlobalLearnerList &learner_list); - static int text2member_list(const char *text, MemberList &member_list); - // transform ObMemberList into MemberList - static int transform_ob_member_list( - const common::ObMemberList &ob_member_list, - MemberList &member_list); - static bool member_list_is_equal(const MemberList &a, const MemberList &b); - static bool server_is_in_member_list( - const MemberList &member_list, - const common::ObAddr &server); - static bool servers_in_member_list_are_same(const MemberList &a, const MemberList &b); int64_t to_string(char *buf, const int64_t buf_len) const; // operator-related functions int assign(const ObLSReplica &other); diff --git a/src/share/tablet/ob_tenant_tablet_to_ls_map.h b/src/share/tablet/ob_tenant_tablet_to_ls_map.h index 77b27235a4..a6e6985345 100644 --- a/src/share/tablet/ob_tenant_tablet_to_ls_map.h +++ b/src/share/tablet/ob_tenant_tablet_to_ls_map.h @@ -40,9 +40,10 @@ public: ~ObTenantTabletToLSMap() {} int init(const int64_t bucket_num = 4096, - const lib::ObLabel label = lib::ObLabel("TenantTabletToLSMap")) + const lib::ObLabel label = lib::ObLabel("TenantTabletToLSMap"), + const uint64_t tenant_id = OB_SERVER_TENANT_ID) { - return map_.create(bucket_num, label); + return map_.create(bucket_num, label, ObModIds::OB_HASH_NODE, tenant_id); } void destroy() { map_.destroy(); } diff --git a/src/share/transfer/ob_transfer_info.cpp b/src/share/transfer/ob_transfer_info.cpp index f4292c61a9..c28aa64ba8 100644 --- a/src/share/transfer/ob_transfer_info.cpp +++ b/src/share/transfer/ob_transfer_info.cpp @@ -313,6 +313,7 @@ static const char* TRANSFER_TASK_COMMENT_ARRAY[] = "Task completed as no valid partition", "Task canceled", "Unable to process task due to transaction timeout", + 
"Unable to process task due to inactive server in member list", "Unknow"/*MAX_COMMENT*/ }; diff --git a/src/share/transfer/ob_transfer_info.h b/src/share/transfer/ob_transfer_info.h index 240b60ea61..454b03551f 100644 --- a/src/share/transfer/ob_transfer_info.h +++ b/src/share/transfer/ob_transfer_info.h @@ -213,6 +213,7 @@ enum ObTransferTaskComment TASK_COMPLETED_AS_NO_VALID_PARTITION = 2, TASK_CANCELED = 3, TRANSACTION_TIMEOUT = 4, + INACTIVE_SERVER_IN_MEMBER_LIST = 5, MAX_COMMENT };