From bddf3eadd959cfaffa4a4d22fbdd04dc6af2d045 Mon Sep 17 00:00:00 2001 From: godyangfight Date: Thu, 2 Jan 2025 15:45:52 +0000 Subject: [PATCH] [CP] Fix restore with ls migration relay start transfer in redo log bug. --- src/share/restore/ob_ls_restore_status.h | 5 + .../ob_ls_complete_migration.cpp | 2 +- .../ob_tablet_start_transfer_mds_helper.cpp | 104 +++++++++++++++++- .../ob_tablet_start_transfer_mds_helper.h | 10 ++ 4 files changed, 119 insertions(+), 2 deletions(-) diff --git a/src/share/restore/ob_ls_restore_status.h b/src/share/restore/ob_ls_restore_status.h index bb92dda83a..0c8241b66e 100644 --- a/src/share/restore/ob_ls_restore_status.h +++ b/src/share/restore/ob_ls_restore_status.h @@ -183,6 +183,11 @@ public: { return NONE == status_ || QUICK_RESTORE_FINISH == status_; } + bool is_in_restore_and_before_quick_restore_finish() const + { + return status_ >= RESTORE_START && status_ < QUICK_RESTORE_FINISH; + } + Status get_status() const { return status_; } int set_status(int32_t status); diff --git a/src/storage/high_availability/ob_ls_complete_migration.cpp b/src/storage/high_availability/ob_ls_complete_migration.cpp index 484853f74a..9061fc427f 100644 --- a/src/storage/high_availability/ob_ls_complete_migration.cpp +++ b/src/storage/high_availability/ob_ls_complete_migration.cpp @@ -1779,7 +1779,7 @@ int ObWaitDataReadyTask::check_need_wait_( LOG_WARN("check need wait log sync get invalid argument", K(ret), KP(ls)); } else if (OB_FAIL(ls->get_restore_status(ls_restore_status))) { LOG_WARN("failed to get restore status", K(ret), KPC(ctx_)); - } else if (ls_restore_status.is_in_restoring_or_failed()) { + } else if (ls_restore_status.is_in_restore_and_before_quick_restore_finish() || ls_restore_status.is_failed()) { need_wait = false; } else if (ObMigrationOpType::REBUILD_LS_OP == ctx_->arg_.type_) { need_wait = false; diff --git a/src/storage/tablet/ob_tablet_start_transfer_mds_helper.cpp b/src/storage/tablet/ob_tablet_start_transfer_mds_helper.cpp 
index 24143e0f05..c39e28e19e 100644 --- a/src/storage/tablet/ob_tablet_start_transfer_mds_helper.cpp +++ b/src/storage/tablet/ob_tablet_start_transfer_mds_helper.cpp @@ -1628,6 +1628,9 @@ int ObTabletStartTransferInHelper::on_replay( ObTransferUtils::set_transfer_module(); const int64_t start_ts = ObTimeUtility::current_time(); share::ObStorageHACostItemName diagnose_result_msg = share::ObStorageHACostItemName::MAX_NAME; + ObMigrationStatus migration_status = ObMigrationStatus::OB_MIGRATION_STATUS_MAX; + share::ObLSRestoreStatus ls_restore_status; + if (OB_ISNULL(buf) || len < 0) { ret = OB_INVALID_ARGUMENT; LOG_WARN("on replay start transfer in get invalid argument", K(ret), KP(buf), K(len)); @@ -1639,6 +1642,10 @@ int ObTabletStartTransferInHelper::on_replay( } else if (!tx_start_transfer_in_info.is_valid()) { ret = OB_ERR_UNEXPECTED; LOG_WARN("tx start transfer in info is unexpected", K(ret), K(tx_start_transfer_in_info)); + } else if (CLICK_FAIL(get_migration_and_restore_status_(tx_start_transfer_in_info, migration_status, ls_restore_status))) { + LOG_WARN("failed to get migration and restore status", K(ret), K(tx_start_transfer_in_info)); + } else if (CLICK_FAIL(check_can_replay_redo_log_(tx_start_transfer_in_info, scn, migration_status, ls_restore_status))) { + LOG_WARN("failed to check can replay redo log", K(ret), K(tx_start_transfer_in_info)); } else if (CLICK_FAIL(check_can_skip_replay_(scn, tx_start_transfer_in_info, skip_replay))) { LOG_WARN("failed to check can skip replay", K(ret), K(tx_start_transfer_in_info)); } else if (skip_replay) { @@ -1663,7 +1670,11 @@ int ObTabletStartTransferInHelper::on_replay( #ifdef ERRSIM SERVER_EVENT_SYNC_ADD("TRANSFER", "AFTER_ON_REDO_START_TRANSFER_IN"); #endif - DEBUG_SYNC(AFTER_ON_REDO_START_TRANSFER_IN); + + if (ObMigrationStatus::OB_MIGRATION_STATUS_NONE == migration_status) { + DEBUG_SYNC(AFTER_ON_REDO_START_TRANSFER_IN); + } + ObTransferUtils::clear_transfer_module(); #ifdef ERRSIM if (OB_SUCC(ret)) { @@ -2249,5 
+2260,96 @@ int ObTabletStartTransferInHelper::do_tx_end_before_abort_( return ret; } +int ObTabletStartTransferInHelper::get_migration_and_restore_status_( + const ObTXStartTransferInInfo &tx_start_transfer_in_info, + ObMigrationStatus &migration_status, + share::ObLSRestoreStatus &ls_restore_status) +{ + int ret = OB_SUCCESS; + ObLSHandle ls_handle; + ObLSService *ls_service = nullptr; + ObLS *ls = nullptr; + migration_status = ObMigrationStatus::OB_MIGRATION_STATUS_MAX; + if (!tx_start_transfer_in_info.is_valid()) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("get migration and restore status get invalid argument", K(ret), K(tx_start_transfer_in_info)); + } else if (OB_ISNULL(ls_service = MTL(ObLSService *))) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("ls service should not be null", K(ret), KP(ls_service)); + } else if (OB_FAIL(ls_service->get_ls(tx_start_transfer_in_info.dest_ls_id_, ls_handle, ObLSGetMod::STORAGE_MOD))) { + LOG_WARN("fail to get ls", KR(ret), K(tx_start_transfer_in_info)); + } else if (OB_UNLIKELY(nullptr == (ls = ls_handle.get_ls()))) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("ls should not be NULL", KR(ret), K(tx_start_transfer_in_info), KP(ls)); + } else if (OB_FAIL(ls->get_migration_and_restore_status(migration_status, ls_restore_status))) { + LOG_WARN("failed to get migration and restore status", K(ret), KPC(ls)); + } + return ret; +} + +int ObTabletStartTransferInHelper::check_can_replay_redo_log_( + const ObTXStartTransferInInfo &tx_start_transfer_in_info, + const share::SCN &scn, + const ObMigrationStatus &migration_status, + const share::ObLSRestoreStatus &ls_restore_status) +{ + int ret = OB_SUCCESS; + const uint64_t tenant_id = MTL_ID(); + SCN gts_scn; + + if (!tx_start_transfer_in_info.is_valid() || !scn.is_valid() + || !ObMigrationStatusHelper::is_valid(migration_status) || !ls_restore_status.is_valid()) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("check can replay redo log invalid argument", K(ret), K(tx_start_transfer_in_info), + K(scn), 
K(migration_status), K(ls_restore_status)); + } else if (ls_restore_status.is_in_restore_and_before_quick_restore_finish() + && ObMigrationStatus::OB_MIGRATION_STATUS_NONE != migration_status) { + const SCN new_scn = SCN::scn_dec(scn); + if (OB_FAIL(ObTransferUtils::get_gts(tenant_id, gts_scn))) { + LOG_WARN("failed to get gts", K(ret), K(tenant_id), K(scn)); + } else if (gts_scn < new_scn) { + LOG_INFO("ls is in restore status with migration, and tenant readable scn is smaller than transfer in redo scn", + K(tx_start_transfer_in_info), K(gts_scn), K(new_scn), K(scn)); + ObLSHandle ls_handle; + ObLSService *ls_service = nullptr; + ObLS *ls = nullptr; + ObMigrationStatus src_ls_migration_status = ObMigrationStatus::OB_MIGRATION_STATUS_MAX; + // src ls must exist, be out of migration and have max decided scn >= transfer start scn, otherwise report OB_EAGAIN + if (OB_ISNULL(ls_service = MTL(ObLSService *))) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("ls service should not be null", K(ret), KP(ls_service)); + } else if (OB_FAIL(ls_service->get_ls(tx_start_transfer_in_info.src_ls_id_, ls_handle, ObLSGetMod::STORAGE_MOD))) { + if (OB_LS_NOT_EXIST == ret) { + ret = OB_EAGAIN; + LOG_WARN("src ls do not exist, cannot replay start transfer in redo log", K(ret), K(tx_start_transfer_in_info)); + } else { + LOG_WARN("fail to get ls", KR(ret), K(tx_start_transfer_in_info)); + } + } else if (OB_UNLIKELY(nullptr == (ls = ls_handle.get_ls()))) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("ls should not be NULL", KR(ret), K(tx_start_transfer_in_info), KP(ls)); + } else if (OB_FAIL(ls->get_migration_status(src_ls_migration_status))) { + LOG_WARN("failed to get ls migration status", K(ret), KPC(ls)); + } else if (ObMigrationStatus::OB_MIGRATION_STATUS_NONE == src_ls_migration_status) { + SCN max_decided_scn; + if (OB_FAIL(ls->get_max_decided_scn(max_decided_scn))) { + LOG_WARN("failed to get max decided scn", K(ret), KPC(ls)); + } else if (max_decided_scn >= tx_start_transfer_in_info.start_scn_) { + // src ls max decided scn has reached the transfer start scn, allow replay redo log + } else { + ret = OB_EAGAIN; + LOG_WARN("src ls exist but 
replay scn is smaller than transfer scn, cannot replay redo log", + K(ret), K(tx_start_transfer_in_info), K(max_decided_scn)); + } + } else { + ret = OB_EAGAIN; + LOG_WARN("src ls exist but in migration and tenant readable scn is smaller than transfer in redo scn, cannot replay redo log", + K(ret), K(tx_start_transfer_in_info), K(src_ls_migration_status)); + } + } + } + return ret; +} + } } diff --git a/src/storage/tablet/ob_tablet_start_transfer_mds_helper.h b/src/storage/tablet/ob_tablet_start_transfer_mds_helper.h index d3102be7b4..0f1777dffa 100644 --- a/src/storage/tablet/ob_tablet_start_transfer_mds_helper.h +++ b/src/storage/tablet/ob_tablet_start_transfer_mds_helper.h @@ -17,6 +17,7 @@ #include "lib/container/ob_iarray.h" #include "lib/utility/ob_macro_utils.h" #include "common/ob_tablet_id.h" +#include "storage/high_availability/ob_storage_ha_struct.h" namespace oceanbase { @@ -257,6 +258,15 @@ private: static int do_tx_end_before_abort_( const ObTXStartTransferInInfo &tx_start_transfer_in_info, const char *&can_not_do_reason); + static int get_migration_and_restore_status_( + const ObTXStartTransferInInfo &tx_start_transfer_in_info, + ObMigrationStatus &migration_status, + share::ObLSRestoreStatus &ls_restore_status); + static int check_can_replay_redo_log_( + const ObTXStartTransferInInfo &tx_start_transfer_in_info, + const share::SCN &scn, + const ObMigrationStatus &migration_status, + const share::ObLSRestoreStatus &ls_restore_status); private: DISALLOW_COPY_AND_ASSIGN(ObTabletStartTransferInHelper);