From daaf3a2e0881bb7cbe6431ba7df7905e9ee69c7a Mon Sep 17 00:00:00 2001 From: godyangfight Date: Tue, 17 Oct 2023 09:09:55 +0000 Subject: [PATCH] Fix restore transfer src ls cannot gc bug --- src/share/parameter/ob_parameter_seed.ipp | 5 + .../ob_storage_ha_struct.cpp | 119 ++++++++++++++++-- .../high_availability/ob_storage_ha_struct.h | 13 +- .../ob_transfer_backfill_tx.cpp | 24 ++-- .../ob_tenant_checkpoint_slog_handler.cpp | 5 + .../ob_tablet_start_transfer_mds_helper.cpp | 11 ++ 6 files changed, 154 insertions(+), 23 deletions(-) diff --git a/src/share/parameter/ob_parameter_seed.ipp b/src/share/parameter/ob_parameter_seed.ipp index b1251d0076..6420d8b109 100755 --- a/src/share/parameter/ob_parameter_seed.ipp +++ b/src/share/parameter/ob_parameter_seed.ipp @@ -1645,6 +1645,11 @@ ERRSIM_DEF_DBL(errsim_module_error_percentage, OB_TENANT_PARAMETER, "0", "[0,100 "Range: [0, 100] in percentage", ObParameterAttr(Section::TENANT, Source::DEFAULT, EditLevel::DYNAMIC_EFFECTIVE)) + +ERRSIM_DEF_BOOL(block_transfer_out_replay, OB_TENANT_PARAMETER, "False", + "errsim to block transfer out clog replay", + ObParameterAttr(Section::TENANT, Source::DEFAULT, EditLevel::DYNAMIC_EFFECTIVE)); + // ttl DEF_STR_WITH_CHECKER(kv_ttl_duty_duration, OB_TENANT_PARAMETER, "", common::ObTTLDutyDurationChecker, "ttl background task working time duration" diff --git a/src/storage/high_availability/ob_storage_ha_struct.cpp b/src/storage/high_availability/ob_storage_ha_struct.cpp index 25d7f163a9..82d8f44296 100644 --- a/src/storage/high_availability/ob_storage_ha_struct.cpp +++ b/src/storage/high_availability/ob_storage_ha_struct.cpp @@ -19,6 +19,8 @@ #include "storage/tablet/ob_tablet_common.h" #include "storage/tablet/ob_tablet_iterator.h" #include "storage/ls/ob_ls_tablet_service.h" +#include "logservice/ob_log_service.h" +#include "share/transfer/ob_transfer_task_operator.h" namespace oceanbase { @@ -299,25 +301,44 @@ bool ObMigrationStatusHelper::check_can_restore(const ObMigrationStatus 
&cur_sta // dest_ls replay clog process: create transfer in tablet(on_redo) ----> check the migration_status of src_ls in dest_ls replay clog(on_prepare) // if the replay of the next start transfer in log depends on this log stream, the replay of the on_prepare log will be stuck, and the newly created transfer in tablet will be unreadable // If dest_tablet exists, has_transfer_table=true, the log stream does not allow GC, because the data of the log stream also needs to be relied on -int ObMigrationStatusHelper::check_transfer_dest_tablet_for_ls_gc(ObLS *ls, const ObTabletID &tablet_id, bool &allow_gc) +int ObMigrationStatusHelper::check_transfer_dest_tablet_for_ls_gc( + ObLS *ls, + const ObTabletID &tablet_id, + const share::SCN &transfer_scn, + const bool need_wait_dest_ls_replay, + bool &allow_gc) { int ret = OB_SUCCESS; ObTabletHandle tablet_handle; ObTablet *tablet = nullptr; - if (OB_ISNULL(ls) || !tablet_id.is_valid()) { + SCN dest_transfer_scn; + if (OB_ISNULL(ls) || !tablet_id.is_valid() || !transfer_scn.is_valid()) { ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid argument", K(ret), KP(ls), K(tablet_id)); + LOG_WARN("invalid argument", K(ret), KP(ls), K(tablet_id), K(transfer_scn)); } else if (OB_FAIL(ls->ha_get_tablet(tablet_id, tablet_handle))) { if (OB_TABLET_NOT_EXIST == ret) { - LOG_WARN("dest tablet not exist", K(ret), "ls_id", ls->get_ls_id(), K(tablet_id)); - allow_gc = true; ret = OB_SUCCESS; + if (need_wait_dest_ls_replay) { + allow_gc = false; + } else { + allow_gc = true; + } + LOG_WARN("dest tablet not exist", K(ret), "ls_id", ls->get_ls_id(), K(tablet_id), K(allow_gc)); } else { LOG_WARN("failed to get tablet", K(ret), "ls_id", ls->get_ls_id(), K(tablet_id)); } } else if (OB_ISNULL(tablet = tablet_handle.get_obj())) { ret = OB_ERR_UNEXPECTED; LOG_WARN("tablet should not be NULL", K(ret), "ls_id", ls->get_ls_id(), K(tablet_id)); + } else if (FALSE_IT(dest_transfer_scn = tablet->get_tablet_meta().transfer_info_.transfer_start_scn_)) { + } else 
if (transfer_scn < dest_transfer_scn) { + allow_gc = true; + LOG_INFO("src tablet transfer is smaller than dest tablet, allow gc", "ls_id", ls->get_ls_id(), K(tablet_id), + "src_transfer_scn", transfer_scn, "dest_transfer_scn", dest_transfer_scn, KPC(tablet)); + } else if (transfer_scn > dest_transfer_scn) { + allow_gc = false; + LOG_INFO("src tablet transfer is bigger than dest tablet, do not allow gc", "ls_id", ls->get_ls_id(), + K(tablet_id), "src_transfer_scn", transfer_scn, "dest_transfer_scn", dest_transfer_scn, KPC(tablet)); } else if (tablet->get_tablet_meta().has_transfer_table()) { allow_gc = false; LOG_INFO("dest tablet has transfer table", "ls_id", ls->get_ls_id(), K(tablet_id)); @@ -329,8 +350,10 @@ int ObMigrationStatusHelper::check_transfer_dest_tablet_for_ls_gc(ObLS *ls, cons } int ObMigrationStatusHelper::check_transfer_dest_ls_status_for_ls_gc( - const ObLSID &transfer_ls_id, + const share::ObLSID &transfer_ls_id, const ObTabletID &tablet_id, + const share::SCN &transfer_scn, + const bool need_wait_dest_ls_replay, bool &allow_gc) { int ret = OB_SUCCESS; @@ -339,9 +362,9 @@ int ObMigrationStatusHelper::check_transfer_dest_ls_status_for_ls_gc( ObLSHandle ls_handle; allow_gc = false; ObMigrationStatus dest_ls_status = ObMigrationStatus::OB_MIGRATION_STATUS_MAX; - if (!transfer_ls_id.is_valid()) { + if (!transfer_ls_id.is_valid() || !tablet_id.is_valid() || !transfer_scn.is_valid()) { ret = OB_INVALID_ARGUMENT; - LOG_WARN("ls id is invalid", K(ret), K(transfer_ls_id)); + LOG_WARN("ls id is invalid", K(ret), K(transfer_ls_id), K(tablet_id), K(transfer_scn)); } else if (OB_ISNULL(ls_service = MTL(ObLSService*))) { ret = OB_ERR_UNEXPECTED; LOG_WARN("failed to get ObLSService from MTL", K(ret), KP(ls_service)); @@ -364,7 +387,7 @@ int ObMigrationStatusHelper::check_transfer_dest_ls_status_for_ls_gc( && ObMigrationStatus::OB_MIGRATION_STATUS_REBUILD_WAIT != dest_ls_status) { allow_gc = true; LOG_INFO("transfer dest ls check transfer status passed", K(ret), 
K(transfer_ls_id), K(dest_ls_status)); - } else if (OB_FAIL(check_transfer_dest_tablet_for_ls_gc(dest_ls, tablet_id, allow_gc))) { + } else if (OB_FAIL(check_transfer_dest_tablet_for_ls_gc(dest_ls, tablet_id, transfer_scn, need_wait_dest_ls_replay, allow_gc))) { LOG_WARN("failed to check transfer dest tablet", K(ret), KPC(dest_ls), K(tablet_id)); } @@ -374,6 +397,8 @@ int ObMigrationStatusHelper::check_transfer_dest_ls_status_for_ls_gc( // The status of the log stream is OB_MIGRATION_STATUS_GC, which will block the replay of the start transfer in log corresponding to transfer dest_ls // Log stream that is not in the member_list will not be added to the member_list. // If the log stream status modification fails, there is no need to online log_handler. +// After setting the flag of ls gc and stopping log synchronization, it will only affect the destination of the transfer minority, +// and the destination can be restored through rebuilding. int ObMigrationStatusHelper::set_ls_migrate_gc_status_( ObLS &ls, const ObMigrationStatus &migration_status) @@ -402,6 +427,9 @@ int ObMigrationStatusHelper::check_ls_transfer_tablet_( ObLSHandle ls_handle; ObLSTabletIterator tablet_iter(ObMDSGetTabletMode::READ_WITHOUT_CHECK); ObInnerLSStatus create_status; + bool need_check_allow_gc = true; + bool need_wait_dest_ls_replay = false; + ObLSRestoreStatus restore_status; if (!ls_id.is_valid()) { ret = OB_INVALID_ARGUMENT; LOG_WARN("ls id is invalid", K(ret), K(ls_id)); @@ -418,6 +446,15 @@ int ObMigrationStatusHelper::check_ls_transfer_tablet_( allow_gc = true; } else if (OB_FAIL(set_ls_migrate_gc_status_(*ls, migration_status))) { LOG_WARN("failed to set ls gc status", KR(ret)); + } else if (OB_FAIL(ls->get_restore_status(restore_status))) { + LOG_WARN("failed to get restore status", K(ret), KPC(ls)); + } else if (restore_status.is_in_restore()) { + allow_gc = true; + LOG_INFO("ls ls in restore status, allow gc", K(ret), K(restore_status), K(ls_id)); + } else if 
(OB_FAIL(check_ls_with_transfer_task_(*ls, need_check_allow_gc, need_wait_dest_ls_replay))) { + LOG_WARN("failed to check ls with transfer task", K(ret), KPC(ls)); + } else if (!need_check_allow_gc) { + allow_gc = false; } else if (OB_FAIL(ls->get_tablet_svr()->build_tablet_iter(tablet_iter))) { LOG_WARN("failed to build ls tablet iter", KR(ret)); } else { @@ -453,7 +490,8 @@ int ObMigrationStatusHelper::check_ls_transfer_tablet_( && ObTabletStatus::TRANSFER_OUT_DELETED != user_data.tablet_status_) { // do nothing } else if (OB_FAIL(check_transfer_dest_ls_status_for_ls_gc( - user_data.transfer_ls_id_, tablet->get_tablet_meta().tablet_id_, allow_gc))) { + user_data.transfer_ls_id_, tablet->get_tablet_meta().tablet_id_, + tablet->get_tablet_meta().transfer_info_.transfer_start_scn_, need_wait_dest_ls_replay, allow_gc))) { LOG_WARN("failed to check ls transfer tablet", K(ret), K(ls), K(user_data)); } else if (!allow_gc) { LOG_INFO("The ls is not allowed to be GC because it is also dependent on other ls", K(user_data), @@ -483,6 +521,67 @@ int ObMigrationStatusHelper::check_ls_allow_gc( return ret; } +int ObMigrationStatusHelper::check_ls_with_transfer_task_( + ObLS &ls, + bool &need_check_allow_gc, + bool &need_wait_dest_ls_replay) +{ + int ret = OB_SUCCESS; + need_check_allow_gc = false; + need_wait_dest_ls_replay = false; + common::ObMySQLProxy *sql_proxy = GCTX.sql_proxy_; + ObTransferTask task; + const uint64_t tenant_id = ls.get_tenant_id(); + const ObLSID &src_ls_id = ls.get_ls_id(); + share::ObTransferTaskInfo task_info; + SCN max_decided_scn(SCN::base_scn()); + ObLSService *ls_service = NULL; + ObLSHandle dest_ls_handle; + + if (OB_ISNULL(sql_proxy)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("mysql proxy should not be NULL", K(ret), KP(sql_proxy)); + } else if (OB_FAIL(ObTransferTaskOperator::get_by_src_ls( + *sql_proxy, tenant_id, src_ls_id, task, share::OBCG_STORAGE_HA_LEVEL2))) { + LOG_WARN("failed to get transfer task", K(ret), K(tenant_id), K(src_ls_id)); + 
if (OB_ENTRY_NOT_EXIST == ret || OB_TABLE_NOT_EXIST == ret) { + need_check_allow_gc = true; + need_wait_dest_ls_replay = false; + ret = OB_SUCCESS; + } + } else if (OB_FAIL(task_info.convert_from(tenant_id, task))) { + LOG_WARN("failed to convert from transfer task", K(ret), K(task)); + } else if (!task_info.status_.is_doing_status()) { + need_check_allow_gc = true; + need_wait_dest_ls_replay = false; + } else if (OB_ISNULL(ls_service = MTL(ObLSService*))) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("failed to get ObLSService from MTL", K(ret), KP(ls_service)); + } else if (OB_FAIL(ls_service->get_ls(task_info.dest_ls_id_, dest_ls_handle, ObLSGetMod::HA_MOD))) { + if (OB_LS_NOT_EXIST == ret) { + LOG_INFO("transfer dest ls not exist", K(ret), K(task_info)); + need_check_allow_gc = true; + need_wait_dest_ls_replay = false; + ret = OB_SUCCESS; + } else { + LOG_WARN("failed to get ls", K(ret), K(task_info)); + } + } else { + if (OB_FAIL(ls.get_max_decided_scn(max_decided_scn))) { + LOG_WARN("failed to get max decided scn", K(ret), K(ls)); + } else if (max_decided_scn < task_info.start_scn_) { + need_check_allow_gc = false; + need_wait_dest_ls_replay = false; + LOG_INFO("transfer src ls is not replay to transfer scn, do not allow gc", K(max_decided_scn), K(task_info)); + } else { + need_check_allow_gc = true; + need_wait_dest_ls_replay = true; + LOG_INFO("transfer src ls is in doing status, need wait dest ls replay", K(max_decided_scn), K(task_info)); + } + } + return ret; +} + bool ObMigrationStatusHelper::check_migration_status_is_fail_(const ObMigrationStatus &cur_status) { bool is_fail = false; diff --git a/src/storage/high_availability/ob_storage_ha_struct.h b/src/storage/high_availability/ob_storage_ha_struct.h index 47a682a413..5c35b640ea 100644 --- a/src/storage/high_availability/ob_storage_ha_struct.h +++ b/src/storage/high_availability/ob_storage_ha_struct.h @@ -105,12 +105,23 @@ private: static int check_transfer_dest_ls_status_for_ls_gc( const share::ObLSID 
&transfer_ls_id, const ObTabletID &tablet_id, + const share::SCN &transfer_scn, + const bool need_wait_dest_ls_replay, + bool &allow_gc); + static int check_transfer_dest_tablet_for_ls_gc( + ObLS *ls, + const ObTabletID &tablet_id, + const share::SCN &transfer_scn, + const bool need_wait_dest_ls_replay, bool &allow_gc); - static int check_transfer_dest_tablet_for_ls_gc(ObLS *ls, const ObTabletID &tablet_id, bool &allow_gc); static bool check_migration_status_is_fail_(const ObMigrationStatus &cur_status); static int set_ls_migrate_gc_status_( ObLS &ls, const ObMigrationStatus &migration_status); + static int check_ls_with_transfer_task_( + ObLS &ls, + bool &need_check_allow_gc, + bool &need_wait_dest_ls_replay); }; enum ObMigrationOpPriority diff --git a/src/storage/high_availability/ob_transfer_backfill_tx.cpp b/src/storage/high_availability/ob_transfer_backfill_tx.cpp index c535b0a221..002f683ac8 100644 --- a/src/storage/high_availability/ob_transfer_backfill_tx.cpp +++ b/src/storage/high_availability/ob_transfer_backfill_tx.cpp @@ -324,6 +324,18 @@ int ObTransferWorkerMgr::process() int ret = OB_SUCCESS; bool is_exist = false; ObTransferBackfillTXParam param; + +#ifdef ERRSIM + if (OB_SUCC(ret)) { + ret = OB_E(EventTable::EN_CHECK_TRANSFER_TASK_EXSIT) OB_SUCCESS; + if (OB_FAIL(ret)) { + STORAGE_LOG(ERROR, "fake EN_CHECK_TRANSFER_TASK_EXSIT", K(ret)); + is_exist = true; + ret = OB_SUCCESS; + } + } +#endif + if (IS_NOT_INIT) { ret = OB_NOT_INIT; LOG_WARN("transfer work not init", K(ret)); @@ -362,18 +374,6 @@ int ObTransferWorkerMgr::check_task_exist_( LOG_WARN("failed to get ObTenantDagScheduler from MTL", K(ret)); } else if (OB_FAIL(scheduler->check_dag_net_exist(task_id, is_exist))) { LOG_WARN("failed to check dag net exist", K(ret), K(task_id)); - } else { -#ifdef ERRSIM - if (OB_SUCC(ret)) { - ret = OB_E(EventTable::EN_CHECK_TRANSFER_TASK_EXSIT) OB_SUCCESS; - if (OB_FAIL(ret)) { - STORAGE_LOG(ERROR, "fake EN_CHECK_TRANSFER_TASK_EXSIT", K(ret)); - is_exist = 
true; - ret = OB_SUCCESS; - } - } -#endif - } return ret; } diff --git a/src/storage/slog_ckpt/ob_tenant_checkpoint_slog_handler.cpp b/src/storage/slog_ckpt/ob_tenant_checkpoint_slog_handler.cpp index 369b03efa0..84a874888a 100755 --- a/src/storage/slog_ckpt/ob_tenant_checkpoint_slog_handler.cpp +++ b/src/storage/slog_ckpt/ob_tenant_checkpoint_slog_handler.cpp @@ -675,6 +675,7 @@ int ObTenantCheckpointSlogHandler::record_ls_transfer_info( bool is_need = false; ObMigrationStatus current_migration_status = ObMigrationStatus::OB_MIGRATION_STATUS_MAX; ObMigrationStatus new_migration_status = ObMigrationStatus::OB_MIGRATION_STATUS_MAX; + ObLSRestoreStatus ls_restore_status(ObLSRestoreStatus::LS_RESTORE_STATUS_MAX); if (!ls_handle.is_valid() || !tablet_transfer_info.is_valid()) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument", K(ret), K(ls_handle), K(tablet_transfer_info)); @@ -688,6 +689,10 @@ int ObTenantCheckpointSlogHandler::record_ls_transfer_info( K(current_migration_status), K(new_migration_status)); } else if (ObMigrationStatus::OB_MIGRATION_STATUS_NONE != new_migration_status) { LOG_INFO("The log stream does not need to record transfer_info", "ls_id", ls->get_ls_id(), K(current_migration_status), K(new_migration_status)); + } else if (OB_FAIL(ls->get_restore_status(ls_restore_status))) { + LOG_WARN("failed to get ls restore status", K(ret), KPC(ls)); + } else if (ls_restore_status.is_in_restore_and_before_quick_restore()) { + LOG_INFO("the log stream in restore and before quick restore, no need to record transfer info", "ls_id", ls->get_ls_id(), K(ls_restore_status)); } else if (!tablet_transfer_info.has_transfer_table()) { ret = OB_ERR_UNEXPECTED; LOG_WARN("tablet should have transfer table", K(ret), "ls_id", ls->get_ls_id(), K(tablet_id), K(tablet_transfer_info)); diff --git a/src/storage/tablet/ob_tablet_start_transfer_mds_helper.cpp b/src/storage/tablet/ob_tablet_start_transfer_mds_helper.cpp index d094a587ae..3f0aadb318 100644 --- 
a/src/storage/tablet/ob_tablet_start_transfer_mds_helper.cpp +++ b/src/storage/tablet/ob_tablet_start_transfer_mds_helper.cpp @@ -422,6 +422,17 @@ int ObTabletStartTransferOutHelper::on_replay( } else if (!tx_start_transfer_out_info.is_valid()) { ret = OB_ERR_UNEXPECTED; LOG_WARN("tx start transfer out info is unexpected", K(ret), K(tx_start_transfer_out_info)); + } else { +#ifdef ERRSIM + omt::ObTenantConfigGuard tenant_config(TENANT_CONF(MTL_ID())); + if (tenant_config.is_valid()) { + const bool block_transfer_out_replay = tenant_config->block_transfer_out_replay; + if (block_transfer_out_replay) { + ret = OB_EAGAIN; + LOG_WARN("errsim block transfer out replay", K(ret)); + } + } +#endif } #ifdef ERRSIM SERVER_EVENT_SYNC_ADD("TRANSFER", "BEFORE_ON_REDO_START_TRANSFER_OUT",