From 9c018210bd9e53538a0b6976af5a901536932027 Mon Sep 17 00:00:00 2001 From: obdev Date: Wed, 7 Feb 2024 23:42:11 +0000 Subject: [PATCH] BUGFIX: fix restore ls state after restart --- .../simple_server/test_replay_from_middle.cpp | 15 ++-- src/share/restore/ob_ls_restore_status.cpp | 7 ++ src/share/restore/ob_ls_restore_status.h | 6 +- .../ob_storage_ha_struct.cpp | 5 ++ .../high_availability/ob_storage_ha_struct.h | 1 + src/storage/ls/ob_ls.cpp | 55 +++--------- src/storage/ls/ob_ls.h | 6 +- src/storage/ls/ob_ls_meta.cpp | 27 ++++-- src/storage/ls/ob_ls_meta.h | 1 + .../ob_tablet_start_transfer_mds_helper.cpp | 6 +- src/storage/tx_storage/ob_ls_service.cpp | 90 ++++++++++--------- 11 files changed, 112 insertions(+), 107 deletions(-) diff --git a/mittest/simple_server/test_replay_from_middle.cpp b/mittest/simple_server/test_replay_from_middle.cpp index 678416cdc..f233ee4b7 100644 --- a/mittest/simple_server/test_replay_from_middle.cpp +++ b/mittest/simple_server/test_replay_from_middle.cpp @@ -713,7 +713,7 @@ int ObLSService::online_ls() int tmp_ret = OB_SUCCESS; common::ObSharedGuard ls_iter; ObLS *ls = nullptr; - bool can_replay = true; + int64_t create_type = ObLSCreateType::NORMAL; if (OB_FAIL(get_ls_iter(ls_iter, ObLSGetMod::TXSTORAGE_MOD))) { LOG_WARN("failed to get ls iter", K(ret)); } else { @@ -725,12 +725,13 @@ int ObLSService::online_ls() } else if (nullptr == ls) { ret = OB_ERR_UNEXPECTED; LOG_ERROR("ls is null", K(ret)); - } else if (OB_FAIL(ls->check_can_replay_clog(can_replay))) { - LOG_WARN("failed to check ls can replay clog", K(ret), KPC(ls)); - } else if (!can_replay) { - // ls can not enable replay - } else if (OB_FAIL(ls->enable_replay())) { - LOG_ERROR("fail to enable replay", K(ret)); + } else { + ObLSLockGuard lock_ls(ls); + if (OB_FAIL(ls->get_create_type(create_type))) { + LOG_WARN("get ls create type failed", K(ret)); + } else if (OB_FAIL(post_create_ls_(create_type, ls))) { + LOG_WARN("post create ls failed", K(ret)); + } } } if 
(OB_ITER_END == ret) { diff --git a/src/share/restore/ob_ls_restore_status.cpp b/src/share/restore/ob_ls_restore_status.cpp index eb3a33c15..6141f32d2 100644 --- a/src/share/restore/ob_ls_restore_status.cpp +++ b/src/share/restore/ob_ls_restore_status.cpp @@ -88,6 +88,13 @@ bool ObLSRestoreStatus::is_valid_(int32_t status) const } #undef LS_RESTORE_STATUS_CASE_TO_TYPE +bool ObLSRestoreStatus::need_online() const +{ + return ((status_ >= WAIT_RESTORE_SYS_TABLETS + && status_ <= WAIT_RESTORE_MAJOR_DATA) + || status_ == NONE + || status_ == CLONE_CLOG_REPLAY); +} int ObLSRestoreStatus::set_status(int32_t status) { diff --git a/src/share/restore/ob_ls_restore_status.h b/src/share/restore/ob_ls_restore_status.h index 839f55ba2..fb7ae14e1 100644 --- a/src/share/restore/ob_ls_restore_status.h +++ b/src/share/restore/ob_ls_restore_status.h @@ -110,7 +110,9 @@ public: } - // offline ls and enable sync and online ls restore handler in [RESTORE_START, RESTORE_SYS_TABLETS] or RESTORE_FAILED - bool is_required_to_switch_ls_state_for_restore() const + // true when status is NONE, CLONE_CLOG_REPLAY, or within [WAIT_RESTORE_SYS_TABLETS, WAIT_RESTORE_MAJOR_DATA] + bool need_online() const; + // enable sync and online ls restore handler in [RESTORE_START, RESTORE_SYS_TABLETS] or RESTORE_FAILED + bool is_restore_first_step() const { return ((status_ >= Status::RESTORE_START && status_ <= Status::RESTORE_SYS_TABLETS) || status_ == Status::RESTORE_FAILED); @@ -122,7 +124,7 @@ public: { return status_ >= Status::CLONE_START && status_ <= Status::CLONE_COPY_ALL_TABLET_META; } - bool is_required_to_switch_ls_state_for_clone() const + bool is_clone_first_step() const { return ((status_ >= Status::CLONE_START && status_ <= Status::CLONE_COPY_LS_META) || Status::CLONE_FAILED == status_); diff --git a/src/storage/high_availability/ob_storage_ha_struct.cpp b/src/storage/high_availability/ob_storage_ha_struct.cpp index 47d66bc56..d348182d4 100644 --- a/src/storage/high_availability/ob_storage_ha_struct.cpp +++ b/src/storage/high_availability/ob_storage_ha_struct.cpp @@ -594,6 +594,11 @@ bool
ObMigrationStatusHelper::check_migration_status_is_fail_(const ObMigrationS return is_fail; } +bool ObMigrationStatusHelper::need_online(const ObMigrationStatus &cur_status) +{ + return (OB_MIGRATION_STATUS_NONE == cur_status); +} + bool ObMigrationStatusHelper::check_allow_gc_abandoned_ls(const ObMigrationStatus &cur_status) { bool allow_gc = false; diff --git a/src/storage/high_availability/ob_storage_ha_struct.h b/src/storage/high_availability/ob_storage_ha_struct.h index 7f335fd0c..54df630d0 100644 --- a/src/storage/high_availability/ob_storage_ha_struct.h +++ b/src/storage/high_availability/ob_storage_ha_struct.h @@ -81,6 +81,7 @@ public: const ObMigrationStatus &cur_status, bool &allow_gc); + static bool need_online(const ObMigrationStatus &cur_status); // Check the migration status. The LS in the XXX_FAIL state is considered to be an abandoned LS, which can be judged to be directly GC when restarting static bool check_allow_gc_abandoned_ls(const ObMigrationStatus &cur_status); static bool check_can_migrate_out(const ObMigrationStatus &cur_status); static int check_can_change_status( diff --git a/src/storage/ls/ob_ls.cpp b/src/storage/ls/ob_ls.cpp index bfb7da665..521b3a476 100644 --- a/src/storage/ls/ob_ls.cpp +++ b/src/storage/ls/ob_ls.cpp @@ -576,7 +576,7 @@ bool ObLS::is_need_gc() const return bool_ret; } -bool ObLS::is_required_to_switch_state_for_restore_() const +bool ObLS::is_clone_first_step() const { int ret = OB_SUCCESS; bool bool_ret = false;
@@ -597,7 +597,7 @@ bool ObLS::is_required_to_switch_state_for_clone_() const if (OB_FAIL(ls_meta_.get_restore_status(restore_status))) { LOG_WARN("fail to get restore status", K(ret), K(ls_meta_.ls_id_)); } else { - bool_ret = restore_status.is_required_to_switch_ls_state_for_clone(); + bool_ret = restore_status.is_restore_first_step(); } return bool_ret; } @@ -2096,47 +2096,16 @@ int ObLS::enable_replay() return ret; } -int ObLS::check_can_online(bool &can_online) +int ObLS::check_ls_need_online(bool &need_online) { int ret = OB_SUCCESS; - can_online = true; - if (is_need_gc()) { - // this ls will be gc later, should not enable replay - can_online = false; - } else if (startup_transfer_info_.is_valid()) { - // There is a tablet has_transfer_table=true in the log stream - can_online = false; - LOG_INFO("ls need to wait for dependency to be removed", "ls_id", get_ls_id(), - K_(startup_transfer_info)); - } - return ret; -} - -int ObLS::check_can_replay_clog(bool &can_replay) -{ - int ret = OB_SUCCESS; - share::ObLSRestoreStatus restore_status; - ObMigrationStatus migration_status = ObMigrationStatus::OB_MIGRATION_STATUS_MAX; - can_replay = true; - if (is_need_gc()) { - // this ls will be gc later, should not enable replay - can_replay = false; - } else if (OB_FAIL(get_migration_status(migration_status))) { - LOG_WARN("failed to get ls migration status", K(ret)); - } else if (ObMigrationStatus::OB_MIGRATION_STATUS_REBUILD == migration_status) { - // ls will online in rebuild process, ls online will enable clog replay - can_replay = false; - LOG_INFO("ls is in rebuild process, cannot replay clog", "ls_id", get_ls_id(), K(migration_status)); - } else if (OB_FAIL(get_restore_status(restore_status))) { - LOG_WARN("fail to get ls restore status", K(ret)); - } else if (!restore_status.can_replay_log()) { - // while downtime, if ls's restore status is in [restore_start, wait_restore_tablet_meta], clog can't replay - can_replay = false; - LOG_INFO("restore status not as 
expected, can not replay clog", "ls_id", get_ls_id(), K(restore_status)); - } else if (startup_transfer_info_.is_valid()) { - // There is a tablet has_transfer_table=true in the log stream, clog can't replay - can_replay = false; - LOG_INFO("ls not enable clog replay, need to wait for dependency to be removed", "ls_id", get_ls_id(), K_(startup_transfer_info)); + need_online = true; + if (startup_transfer_info_.is_valid()) { + // There is a tablet has_transfer_table=true in the log stream, ls can't online + need_online = false; + LOG_INFO("ls not online, need to wait dependency to be removed", "ls_id", get_ls_id(), K_(startup_transfer_info)); + } else if (OB_FAIL(ls_meta_.check_ls_need_online(need_online))) { + LOG_WARN("fail to check ls need online", K(ret)); } return ret; } diff --git a/src/storage/ls/ob_ls.h b/src/storage/ls/ob_ls.h index c2a072cbc..f2fc796f3 100644 --- a/src/storage/ls/ob_ls.h +++ b/src/storage/ls/ob_ls.h @@ -310,6 +310,8 @@ public: bool is_create_committed() const; bool is_need_gc() const; bool is_in_gc(); + bool is_restore_first_step() const; + bool is_clone_first_step() const; // for rebuild // remove inner tablet, the memtable and minor sstable of data tablet, disable replay // int prepare_rebuild(); @@ -383,7 +385,7 @@ public: int flush_if_need(const bool need_flush); int try_sync_reserved_snapshot(const int64_t new_reserved_snapshot, const bool update_flag); int check_can_replay_clog(bool &can_replay); - int check_can_online(bool &can_online); + int check_ls_need_online(bool &need_online); TO_STRING_KV(K_(running_state), K_(ls_meta), K_(switch_epoch), K_(log_handler), K_(restore_handler), K_(is_inited), K_(tablet_gc_handler), K_(startup_transfer_info)); private: @@ -405,8 +407,6 @@ private: ObTabletHandle &handle); int offline_advance_epoch_(); int online_advance_epoch_(); - bool is_required_to_switch_state_for_restore_() const; - bool is_required_to_switch_state_for_clone_() const; public: // ObLSMeta interface: int update_ls_meta(const 
bool update_restore_status, diff --git a/src/storage/ls/ob_ls_meta.cpp b/src/storage/ls/ob_ls_meta.cpp index 5871529d6..7192c81cc 100644 --- a/src/storage/ls/ob_ls_meta.cpp +++ b/src/storage/ls/ob_ls_meta.cpp @@ -805,21 +805,34 @@ int ObLSMeta::get_create_type(int64_t &create_type) const if (!is_valid()) { ret = OB_ERR_UNEXPECTED; LOG_WARN("log stream meta is not valid, cannot get restore status", K(ret), K(*this)); - // before 4.3 - } else if (restore_status_.is_required_to_switch_ls_state_for_restore()) { - create_type = ObLSCreateType::RESTORE; - } else if (ObMigrationStatus::OB_MIGRATION_STATUS_REBUILD == migration_status_) { + } else if (ObMigrationStatus::OB_MIGRATION_STATUS_NONE != migration_status_) { create_type = ObLSCreateType::MIGRATE; - } else if (restore_status_.is_required_to_switch_ls_state_for_clone()) { + } else if (restore_status_.is_in_clone()) { create_type = ObLSCreateType::CLONE; - // before 4.3 end - // after 4.3 + } else if (restore_status_.is_in_restore()) { + create_type = ObLSCreateType::RESTORE; } else if (ls_persistent_state_.is_ha_state()) { create_type = ObLSCreateType::MIGRATE; } return ret; } +int ObLSMeta::check_ls_need_online(bool &need_online) const +{ + int ret = OB_SUCCESS; + need_online = true; + if (!is_valid()) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("log stream meta is not valid", K(ret), K(*this)); + } else if (!ObMigrationStatusHelper::need_online(migration_status_)) { + need_online = false; + } else if (ObMigrationStatus::OB_MIGRATION_STATUS_NONE == migration_status_ && + !restore_status_.need_online()) { + need_online = false; + } + return ret; +} + ObLSMeta::ObReentrantWLockGuard::ObReentrantWLockGuard(ObLatch &lock, const bool try_lock, const int64_t warn_threshold) diff --git a/src/storage/ls/ob_ls_meta.h b/src/storage/ls/ob_ls_meta.h index 97cfeb270..c9c92a674 100644 --- a/src/storage/ls/ob_ls_meta.h +++ b/src/storage/ls/ob_ls_meta.h @@ -108,6 +108,7 @@ public: int set_rebuild_info(const ObLSRebuildInfo 
&rebuild_info); int get_rebuild_info(ObLSRebuildInfo &rebuild_info) const; int get_create_type(int64_t &create_type) const; + int check_ls_need_online(bool &need_online) const; int init( const uint64_t tenant_id, diff --git a/src/storage/tablet/ob_tablet_start_transfer_mds_helper.cpp b/src/storage/tablet/ob_tablet_start_transfer_mds_helper.cpp index 638284552..f1b478a93 100644 --- a/src/storage/tablet/ob_tablet_start_transfer_mds_helper.cpp +++ b/src/storage/tablet/ob_tablet_start_transfer_mds_helper.cpp @@ -516,7 +516,7 @@ int ObTabletStartTransferOutCommonHelper::try_enable_dest_ls_clog_replay( ObLSHandle dest_ls_handle; ObLS *dest_ls = NULL; SCN max_decided_scn; - bool can_online = true; + bool need_online = true; ObLSTransferInfo transfer_info; static const int64_t SLEEP_TS = 100_ms; if (!scn.is_valid() || !dest_ls_id.is_valid()) { @@ -543,9 +543,9 @@ int ObTabletStartTransferOutCommonHelper::try_enable_dest_ls_clog_replay( } else { transfer_info = dest_ls->get_ls_startup_transfer_info(); dest_ls->get_ls_startup_transfer_info().reset(); - if (OB_FAIL(dest_ls->check_can_online(can_online))) { + if (OB_FAIL(dest_ls->check_ls_need_online(need_online))) { LOG_WARN("failed to check can online", KR(ret), K(dest_ls)); - } else if (!can_online) { + } else if (!need_online) { // do nothing } else if (CLICK_FAIL(dest_ls->online())) { LOG_ERROR("fail to online ls", K(ret), K(scn), K(dest_ls_id), "ls_startup_transfer_info", dest_ls->get_ls_startup_transfer_info()); diff --git a/src/storage/tx_storage/ob_ls_service.cpp b/src/storage/tx_storage/ob_ls_service.cpp index d2433eb11..3e17741ee 100644 --- a/src/storage/tx_storage/ob_ls_service.cpp +++ b/src/storage/tx_storage/ob_ls_service.cpp @@ -480,47 +480,58 @@ int ObLSService::post_create_ls_(const int64_t create_type, { int ret = OB_SUCCESS; int tmp_ret = OB_SUCCESS; - switch(create_type) { - case ObLSCreateType::NORMAL: { - if (OB_FAIL(ls->online_without_lock())) { - LOG_ERROR("ls start failed", K(ret)); - } else if 
(OB_FAIL(ls->set_start_work_state())) { - LOG_ERROR("ls set start work state failed", KR(ret), KPC(ls)); - } else { + bool need_online = false; + if (OB_FAIL(ls->check_ls_need_online(need_online))) { + LOG_WARN("check ls need online failed", K(ret)); + } else if (need_online && + OB_FAIL(ls->online_without_lock())) { + LOG_ERROR("ls start failed", K(ret)); + } else { + switch(create_type) { + case ObLSCreateType::NORMAL: { + if (OB_FAIL(ls->set_start_work_state())) { + LOG_ERROR("ls set start work state failed", KR(ret), KPC(ls)); + } + break; } - break; - } - case ObLSCreateType::RESTORE: { - if (OB_FAIL(ls->get_log_handler()->enable_sync())) { - LOG_WARN("failed to enable sync", K(ret)); - } else if (OB_FAIL(ls->get_ls_restore_handler()->online())) { - LOG_WARN("failed to online restore handler", K(ret)); - } else if (OB_FAIL(ls->set_start_ha_state())) { - LOG_ERROR("ls set start ha state failed", KR(ret), KPC(ls)); - } else { + case ObLSCreateType::RESTORE: { + if (!need_online && ls->is_restore_first_step()) { + if (OB_FAIL(ls->get_log_handler()->enable_sync())) { + LOG_WARN("failed to enable sync", K(ret)); + } else if (OB_FAIL(ls->get_ls_restore_handler()->online())) { + LOG_WARN("failed to online restore handler", K(ret)); + } + } + if (OB_FAIL(ret)) { + } else if (OB_FAIL(ls->set_start_ha_state())) { + LOG_ERROR("ls set start ha state failed", KR(ret), KPC(ls)); + } + break; } - break; - } - case ObLSCreateType::MIGRATE: { - if (OB_FAIL(ls->set_start_ha_state())) { - LOG_ERROR("ls set start ha state failed", KR(ret), KPC(ls)); + case ObLSCreateType::MIGRATE: { + if (OB_FAIL(ls->set_start_ha_state())) { + LOG_ERROR("ls set start ha state failed", KR(ret), KPC(ls)); + } + break; } - break; - } - case ObLSCreateType::CLONE: { - if (OB_FAIL(ls->get_log_handler()->enable_sync())) { - LOG_WARN("failed to enable sync", K(ret)); - } else if (OB_FAIL(ls->set_start_ha_state())) { - LOG_ERROR("ls set start ha state failed", KR(ret), KPC(ls)); - } else { + case 
ObLSCreateType::CLONE: { + if (!need_online && ls->is_clone_first_step()) { + if (OB_FAIL(ls->get_log_handler()->enable_sync())) { + LOG_WARN("failed to enable sync", K(ret)); + } + } + if (OB_FAIL(ret)) { + } else if (OB_FAIL(ls->set_start_ha_state())) { + LOG_ERROR("ls set start ha state failed", KR(ret), KPC(ls)); + } + break; } - break; + default: { + ret = OB_ERR_UNEXPECTED; + LOG_ERROR("should not be here.", KR(ret)); + } // default + } // switch } - default: { - ret = OB_ERR_UNEXPECTED; - LOG_ERROR("should not be here.", KR(ret)); - } // default - } // switch if (OB_SUCCESS != (tmp_ret = ls->report_replica_info())) { LOG_WARN("fail to report ls", KR(tmp_ret), KPC(ls)); @@ -740,7 +751,6 @@ int ObLSService::online_ls() int tmp_ret = OB_SUCCESS; common::ObSharedGuard ls_iter; ObLS *ls = nullptr; - bool can_online = true; int64_t create_type = ObLSCreateType::NORMAL; if (OB_FAIL(get_ls_iter(ls_iter, ObLSGetMod::TXSTORAGE_MOD))) { LOG_WARN("failed to get ls iter", K(ret)); @@ -755,11 +765,7 @@ int ObLSService::online_ls() LOG_ERROR("ls is null", K(ret)); } else { ObLSLockGuard lock_ls(ls); - if (OB_FAIL(ls->check_can_online(can_online))) { - LOG_WARN("check ls can online failed", K(ret)); - } else if (!can_online) { - // ls can not online, do nothing - } else if (OB_FAIL(ls->get_create_type(create_type))) { + if (OB_FAIL(ls->get_create_type(create_type))) { LOG_WARN("get ls create type failed", K(ret)); } else if (OB_FAIL(post_create_ls_(create_type, ls))) { LOG_WARN("post create ls failed", K(ret));