From 8acf567a803210a96e5512d133dc1302d6d86c63 Mon Sep 17 00:00:00 2001 From: godyangfight Date: Wed, 19 Jul 2023 07:18:23 +0000 Subject: [PATCH] Fix the bug that the standby database transfer is stuck in the doing state --- .../ob_ls_complete_migration.cpp | 13 +++- .../high_availability/ob_storage_ha_utils.cpp | 60 ++++++++++++++++--- .../high_availability/ob_storage_ha_utils.h | 2 + 3 files changed, 66 insertions(+), 9 deletions(-) diff --git a/src/storage/high_availability/ob_ls_complete_migration.cpp b/src/storage/high_availability/ob_ls_complete_migration.cpp index 396ecbf58d..d853b67fe3 100644 --- a/src/storage/high_availability/ob_ls_complete_migration.cpp +++ b/src/storage/high_availability/ob_ls_complete_migration.cpp @@ -1009,6 +1009,8 @@ int ObStartCompleteMigrationTask::wait_log_replay_sync_() //TODO(muwei.ym) MAKE THIS TIME PARAM as hide configuration iterms bool need_wait = false; bool is_done = false; + const bool is_primay_tenant = MTL_IS_PRIMARY_TENANT(); + share::SCN readable_scn; if (!is_inited_) { ret = OB_NOT_INIT; @@ -1029,6 +1031,8 @@ int ObStartCompleteMigrationTask::wait_log_replay_sync_() LOG_WARN("failed to check need wait log replay", K(ret), KPC(ctx_)); } else if (!need_wait) { FLOG_INFO("no need wait replay log sync", KPC(ctx_)); + } else if (!is_primay_tenant && OB_FAIL(ObStorageHAUtils::get_readable_scn_with_retry(readable_scn))) { + LOG_WARN("failed to get readable scn", K(ret), KPC(ctx_)); } else { #ifdef ERRSIM SERVER_EVENT_SYNC_ADD("storage_ha", "wait_log_replay_sync", @@ -1060,7 +1064,14 @@ int ObStartCompleteMigrationTask::wait_log_replay_sync_() LOG_INFO("wait replay log ts ns success, stop wait", "arg", ctx_->arg_, K(cost_ts)); } else if (OB_FAIL(ls->get_max_decided_scn(current_replay_scn))) { LOG_WARN("failed to get current replay log ts", K(ret), KPC(ctx_)); - } else { + } else if (!is_primay_tenant && current_replay_scn >= readable_scn) { + wait_log_replay_success = true; + const int64_t cost_ts = ObTimeUtility::current_time() - wait_replay_start_ts; + LOG_INFO("wait replay log ts ns success, stop wait", "arg", ctx_->arg_, K(cost_ts), + K(is_primay_tenant), K(current_replay_scn), K(readable_scn)); + } + + if (OB_SUCC(ret) && !wait_log_replay_success) { current_ts = ObTimeUtility::current_time(); bool is_timeout = false; if (REACH_TENANT_TIME_INTERVAL(60 * 1000 * 1000)) { diff --git a/src/storage/high_availability/ob_storage_ha_utils.cpp b/src/storage/high_availability/ob_storage_ha_utils.cpp index 814a535cae..3b828e1bdb 100644 --- a/src/storage/high_availability/ob_storage_ha_utils.cpp +++ b/src/storage/high_availability/ob_storage_ha_utils.cpp @@ -246,25 +246,69 @@ int ObStorageHAUtils::check_transfer_ls_can_rebuild( { int ret = OB_SUCCESS; SCN readable_scn = SCN::base_scn(); - rootserver::ObTenantInfoLoader *info = MTL(rootserver::ObTenantInfoLoader*); need_rebuild = false; if (!replay_scn.is_valid()) { ret = OB_INVALID_ARGUMENT; LOG_WARN("argument invalid", K(ret), K(replay_scn)); - } else if (OB_ISNULL(info)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("tenant info is null", K(ret), K(replay_scn)); } else if (MTL_IS_PRIMARY_TENANT()) { need_rebuild = true; + } else if (OB_FAIL(get_readable_scn_(readable_scn))) { + LOG_WARN("failed to get readable scn", K(ret), K(replay_scn)); + } else if (readable_scn >= replay_scn) { + need_rebuild = true; + } else { + need_rebuild = false; + } + return ret; +} + +int ObStorageHAUtils::get_readable_scn_with_retry(share::SCN &readable_scn) +{ + int ret = OB_SUCCESS; + readable_scn.set_base(); + rootserver::ObTenantInfoLoader *info = MTL(rootserver::ObTenantInfoLoader*); + const int64_t GET_READABLE_SCN_INTERVAL = 100 * 1000; // 100ms + const int64_t GET_REABLE_SCN_TIMEOUT = 9 * 1000 * 1000; // 9s + + if (OB_ISNULL(info)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("tenant info is null", K(ret), KP(info)); + } else { + const int64_t start_ts = ObTimeUtility::current_time(); + while (OB_SUCC(ret)) { + if (OB_FAIL(get_readable_scn_(readable_scn))) { + LOG_WARN("failed to get readable scn", K(ret)); + if (OB_EAGAIN == ret) { + //overwrite ret + if (ObTimeUtil::current_time() - start_ts >= GET_REABLE_SCN_TIMEOUT) { + ret = OB_TIMEOUT; + LOG_WARN("get valid readable scn timeout", K(ret), K(readable_scn)); + } else { + ret = OB_SUCCESS; + ob_usleep(GET_READABLE_SCN_INTERVAL); + } + } + } else { + break; + } + } + } + return ret; +} + +int ObStorageHAUtils::get_readable_scn_(share::SCN &readable_scn) +{ + int ret = OB_SUCCESS; + readable_scn.set_base(); + rootserver::ObTenantInfoLoader *info = MTL(rootserver::ObTenantInfoLoader*); + if (OB_ISNULL(info)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("tenant info is null", K(ret), KP(info)); } else if (OB_FAIL(info->get_readable_scn(readable_scn))) { LOG_WARN("failed to get readable scn", K(ret), K(readable_scn)); } else if (!readable_scn.is_valid()) { ret = OB_EAGAIN; LOG_WARN("readable_scn not valid", K(ret), K(readable_scn)); - } else if (readable_scn >= replay_scn) { - need_rebuild = true; - } else { - need_rebuild = false; } return ret; } diff --git a/src/storage/high_availability/ob_storage_ha_utils.h b/src/storage/high_availability/ob_storage_ha_utils.h index 0def339084..afd6313653 100644 --- a/src/storage/high_availability/ob_storage_ha_utils.h +++ b/src/storage/high_availability/ob_storage_ha_utils.h @@ -42,6 +42,7 @@ public: static int check_transfer_ls_can_rebuild( const share::SCN replay_scn, bool &need_rebuild); + static int get_readable_scn_with_retry(share::SCN &readable_scn); private: static int check_merge_error_(const uint64_t tenant_id, common::ObISQLClient &sql_client); @@ -50,6 +51,7 @@ private: share::SCN &compaction_scn); static int check_tablet_replica_checksum_(const uint64_t tenant_id, const common::ObTabletID &tablet_id, const share::ObLSID &ls_id, const share::SCN &compaction_scn, common::ObISQLClient &sql_client); + static int get_readable_scn_(share::SCN &readable_scn); }; struct ObTransferUtils