Fix a bug where a transfer on a standby database gets stuck in the DOING state

This commit is contained in:
godyangfight
2023-07-19 07:18:23 +00:00
committed by ob-robot
parent 8bdd31baa2
commit 8acf567a80
3 changed files with 66 additions and 9 deletions

View File

@ -1009,6 +1009,8 @@ int ObStartCompleteMigrationTask::wait_log_replay_sync_()
//TODO(muwei.ym) MAKE THIS TIME PARAM as hide configuration iterms //TODO(muwei.ym) MAKE THIS TIME PARAM as hide configuration iterms
bool need_wait = false; bool need_wait = false;
bool is_done = false; bool is_done = false;
const bool is_primay_tenant = MTL_IS_PRIMARY_TENANT();
share::SCN readable_scn;
if (!is_inited_) { if (!is_inited_) {
ret = OB_NOT_INIT; ret = OB_NOT_INIT;
@ -1029,6 +1031,8 @@ int ObStartCompleteMigrationTask::wait_log_replay_sync_()
LOG_WARN("failed to check need wait log replay", K(ret), KPC(ctx_)); LOG_WARN("failed to check need wait log replay", K(ret), KPC(ctx_));
} else if (!need_wait) { } else if (!need_wait) {
FLOG_INFO("no need wait replay log sync", KPC(ctx_)); FLOG_INFO("no need wait replay log sync", KPC(ctx_));
} else if (!is_primay_tenant && OB_FAIL(ObStorageHAUtils::get_readable_scn_with_retry(readable_scn))) {
LOG_WARN("failed to get readable scn", K(ret), KPC(ctx_));
} else { } else {
#ifdef ERRSIM #ifdef ERRSIM
SERVER_EVENT_SYNC_ADD("storage_ha", "wait_log_replay_sync", SERVER_EVENT_SYNC_ADD("storage_ha", "wait_log_replay_sync",
@ -1060,7 +1064,14 @@ int ObStartCompleteMigrationTask::wait_log_replay_sync_()
LOG_INFO("wait replay log ts ns success, stop wait", "arg", ctx_->arg_, K(cost_ts)); LOG_INFO("wait replay log ts ns success, stop wait", "arg", ctx_->arg_, K(cost_ts));
} else if (OB_FAIL(ls->get_max_decided_scn(current_replay_scn))) { } else if (OB_FAIL(ls->get_max_decided_scn(current_replay_scn))) {
LOG_WARN("failed to get current replay log ts", K(ret), KPC(ctx_)); LOG_WARN("failed to get current replay log ts", K(ret), KPC(ctx_));
} else { } else if (!is_primay_tenant && current_replay_scn >= readable_scn) {
wait_log_replay_success = true;
const int64_t cost_ts = ObTimeUtility::current_time() - wait_replay_start_ts;
LOG_INFO("wait replay log ts ns success, stop wait", "arg", ctx_->arg_, K(cost_ts),
K(is_primay_tenant), K(current_replay_scn), K(readable_scn));
}
if (OB_SUCC(ret) && !wait_log_replay_success) {
current_ts = ObTimeUtility::current_time(); current_ts = ObTimeUtility::current_time();
bool is_timeout = false; bool is_timeout = false;
if (REACH_TENANT_TIME_INTERVAL(60 * 1000 * 1000)) { if (REACH_TENANT_TIME_INTERVAL(60 * 1000 * 1000)) {

View File

@ -246,25 +246,69 @@ int ObStorageHAUtils::check_transfer_ls_can_rebuild(
{ {
int ret = OB_SUCCESS; int ret = OB_SUCCESS;
SCN readable_scn = SCN::base_scn(); SCN readable_scn = SCN::base_scn();
rootserver::ObTenantInfoLoader *info = MTL(rootserver::ObTenantInfoLoader*);
need_rebuild = false; need_rebuild = false;
if (!replay_scn.is_valid()) { if (!replay_scn.is_valid()) {
ret = OB_INVALID_ARGUMENT; ret = OB_INVALID_ARGUMENT;
LOG_WARN("argument invalid", K(ret), K(replay_scn)); LOG_WARN("argument invalid", K(ret), K(replay_scn));
} else if (OB_ISNULL(info)) {
ret = OB_ERR_UNEXPECTED;
LOG_WARN("tenant info is null", K(ret), K(replay_scn));
} else if (MTL_IS_PRIMARY_TENANT()) { } else if (MTL_IS_PRIMARY_TENANT()) {
need_rebuild = true; need_rebuild = true;
} else if (OB_FAIL(get_readable_scn_(readable_scn))) {
LOG_WARN("failed to get readable scn", K(ret), K(replay_scn));
} else if (readable_scn >= replay_scn) {
need_rebuild = true;
} else {
need_rebuild = false;
}
return ret;
}
// Obtains the readable SCN through get_readable_scn_(), retrying while that
// helper reports OB_EAGAIN.
//
// @param [out] readable_scn  set_base() is called on it first; it is then
//                            filled in by get_readable_scn_().
// @return OB_SUCCESS        the helper succeeded;
//         OB_ERR_UNEXPECTED the tenant info loader pointer is null;
//         OB_TIMEOUT        the helper was still returning OB_EAGAIN once the
//                           retry budget had elapsed;
//         any other code    forwarded unchanged from get_readable_scn_().
int ObStorageHAUtils::get_readable_scn_with_retry(share::SCN &readable_scn)
{
  int ret = OB_SUCCESS;
  readable_scn.set_base();
  rootserver::ObTenantInfoLoader *info = MTL(rootserver::ObTenantInfoLoader*);
  const int64_t RETRY_SLEEP_INTERVAL = 100 * 1000; // 100ms
  const int64_t RETRY_TIMEOUT = 9 * 1000 * 1000; // 9s
  if (OB_ISNULL(info)) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("tenant info is null", K(ret), KP(info));
  } else {
    const int64_t start_ts = ObTimeUtility::current_time();
    bool finished = false;
    while (!finished) {
      if (OB_FAIL(get_readable_scn_(readable_scn))) {
        LOG_WARN("failed to get readable scn", K(ret));
        if (OB_EAGAIN != ret) {
          // Not retryable: leave the loop with the helper's error code.
          finished = true;
        } else if (ObTimeUtil::current_time() - start_ts >= RETRY_TIMEOUT) {
          // The deadline is only evaluated after a failed attempt.
          ret = OB_TIMEOUT;
          LOG_WARN("get valid readable scn timeout", K(ret), K(readable_scn));
          finished = true;
        } else {
          // Clear the transient error, pause, and try again.
          ret = OB_SUCCESS;
          ob_usleep(RETRY_SLEEP_INTERVAL);
        }
      } else {
        finished = true;
      }
    }
  }
  return ret;
}
// Reads the readable SCN from the current tenant's ObTenantInfoLoader.
//
// @param [out] readable_scn  set_base() is called on it first; it is then
//                            overwritten by info->get_readable_scn().
// @return OB_SUCCESS        the loader returned an SCN for which is_valid() holds;
//         OB_ERR_UNEXPECTED the tenant info loader pointer is null;
//         OB_EAGAIN         the loader call succeeded but the SCN is not valid,
//                           so the caller may try again later;
//         any other code    forwarded unchanged from info->get_readable_scn().
int ObStorageHAUtils::get_readable_scn_(share::SCN &readable_scn)
{
  int ret = OB_SUCCESS;
  readable_scn.set_base();
  rootserver::ObTenantInfoLoader *info = MTL(rootserver::ObTenantInfoLoader*);
  if (OB_ISNULL(info)) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("tenant info is null", K(ret), KP(info));
  } else if (OB_FAIL(info->get_readable_scn(readable_scn))) {
    LOG_WARN("failed to get readable scn", K(ret), K(readable_scn));
  } else if (!readable_scn.is_valid()) {
    // A not-yet-valid SCN is reported as retryable rather than as a hard failure.
    ret = OB_EAGAIN;
    LOG_WARN("readable_scn not valid", K(ret), K(readable_scn));
  }
  return ret;
}

View File

@ -42,6 +42,7 @@ public:
static int check_transfer_ls_can_rebuild( static int check_transfer_ls_can_rebuild(
const share::SCN replay_scn, const share::SCN replay_scn,
bool &need_rebuild); bool &need_rebuild);
static int get_readable_scn_with_retry(share::SCN &readable_scn);
private: private:
static int check_merge_error_(const uint64_t tenant_id, common::ObISQLClient &sql_client); static int check_merge_error_(const uint64_t tenant_id, common::ObISQLClient &sql_client);
@ -50,6 +51,7 @@ private:
share::SCN &compaction_scn); share::SCN &compaction_scn);
static int check_tablet_replica_checksum_(const uint64_t tenant_id, const common::ObTabletID &tablet_id, static int check_tablet_replica_checksum_(const uint64_t tenant_id, const common::ObTabletID &tablet_id,
const share::ObLSID &ls_id, const share::SCN &compaction_scn, common::ObISQLClient &sql_client); const share::ObLSID &ls_id, const share::SCN &compaction_scn, common::ObISQLClient &sql_client);
static int get_readable_scn_(share::SCN &readable_scn);
}; };
struct ObTransferUtils struct ObTransferUtils