BUGFIX: fix restore ls state after restart

This commit is contained in:
obdev
2023-12-18 19:43:05 +00:00
committed by ob-robot
parent e09eb1c0be
commit 14cdc89246
11 changed files with 112 additions and 107 deletions

View File

@ -713,7 +713,7 @@ int ObLSService::online_ls()
int tmp_ret = OB_SUCCESS; int tmp_ret = OB_SUCCESS;
common::ObSharedGuard<ObLSIterator> ls_iter; common::ObSharedGuard<ObLSIterator> ls_iter;
ObLS *ls = nullptr; ObLS *ls = nullptr;
bool can_replay = true; int64_t create_type = ObLSCreateType::NORMAL;
if (OB_FAIL(get_ls_iter(ls_iter, ObLSGetMod::TXSTORAGE_MOD))) { if (OB_FAIL(get_ls_iter(ls_iter, ObLSGetMod::TXSTORAGE_MOD))) {
LOG_WARN("failed to get ls iter", K(ret)); LOG_WARN("failed to get ls iter", K(ret));
} else { } else {
@ -725,12 +725,13 @@ int ObLSService::online_ls()
} else if (nullptr == ls) { } else if (nullptr == ls) {
ret = OB_ERR_UNEXPECTED; ret = OB_ERR_UNEXPECTED;
LOG_ERROR("ls is null", K(ret)); LOG_ERROR("ls is null", K(ret));
} else if (OB_FAIL(ls->check_can_replay_clog(can_replay))) { } else {
LOG_WARN("failed to check ls can replay clog", K(ret), KPC(ls)); ObLSLockGuard lock_ls(ls);
} else if (!can_replay) { if (OB_FAIL(ls->get_create_type(create_type))) {
// ls can not enable replay LOG_WARN("get ls create type failed", K(ret));
} else if (OB_FAIL(ls->enable_replay())) { } else if (OB_FAIL(post_create_ls_(create_type, ls))) {
LOG_ERROR("fail to enable replay", K(ret)); LOG_WARN("post create ls failed", K(ret));
}
} }
} }
if (OB_ITER_END == ret) { if (OB_ITER_END == ret) {

View File

@ -88,6 +88,13 @@ bool ObLSRestoreStatus::is_valid_(int32_t status) const
} }
#undef LS_RESTORE_STATUS_CASE_TO_TYPE #undef LS_RESTORE_STATUS_CASE_TO_TYPE
// Tells whether the current restore status is one of the statuses for which
// this predicate reports "need online":
//   - NONE
//   - CLONE_CLOG_REPLAY
//   - any status in the inclusive range
//     [WAIT_RESTORE_SYS_TABLETS, WAIT_RESTORE_MAJOR_DATA]
// Every other status yields false.
bool ObLSRestoreStatus::need_online() const
{
  bool bool_ret = false;
  if (NONE == status_ || CLONE_CLOG_REPLAY == status_) {
    bool_ret = true;
  } else if (status_ >= WAIT_RESTORE_SYS_TABLETS
             && status_ <= WAIT_RESTORE_MAJOR_DATA) {
    bool_ret = true;
  }
  return bool_ret;
}
int ObLSRestoreStatus::set_status(int32_t status) int ObLSRestoreStatus::set_status(int32_t status)
{ {

View File

@ -110,7 +110,9 @@ public:
} }
// offline ls and enable sync and online ls restore handler in [RESTORE_START, RESTORE_SYS_TABLETS] or RESTORE_FAILED // offline ls and enable sync and online ls restore handler in [RESTORE_START, RESTORE_SYS_TABLETS] or RESTORE_FAILED
bool is_required_to_switch_ls_state_for_restore() const bool need_online() const;
// enable sync and online ls restore handler in [RESTORE_START, RESTORE_SYS_TABLETS] or RESTORE_FAILED
bool is_restore_first_step() const
{ {
return ((status_ >= Status::RESTORE_START && status_ <= Status::RESTORE_SYS_TABLETS) || return ((status_ >= Status::RESTORE_START && status_ <= Status::RESTORE_SYS_TABLETS) ||
status_ == Status::RESTORE_FAILED); status_ == Status::RESTORE_FAILED);
@ -122,7 +124,7 @@ public:
{ {
return status_ >= Status::CLONE_START && status_ <= Status::CLONE_COPY_ALL_TABLET_META; return status_ >= Status::CLONE_START && status_ <= Status::CLONE_COPY_ALL_TABLET_META;
} }
bool is_required_to_switch_ls_state_for_clone() const bool is_clone_first_step() const
{ {
return ((status_ >= Status::CLONE_START && status_ <= Status::CLONE_COPY_LS_META) || return ((status_ >= Status::CLONE_START && status_ <= Status::CLONE_COPY_LS_META) ||
Status::CLONE_FAILED == status_); Status::CLONE_FAILED == status_);

View File

@ -594,6 +594,11 @@ bool ObMigrationStatusHelper::check_migration_status_is_fail_(const ObMigrationS
return is_fail; return is_fail;
} }
// Reports whether the given migration status is one for which this helper
// answers "need online". Only OB_MIGRATION_STATUS_NONE yields true; any other
// migration status yields false.
bool ObMigrationStatusHelper::need_online(const ObMigrationStatus &cur_status)
{
  bool bool_ret = false;
  if (OB_MIGRATION_STATUS_NONE == cur_status) {
    bool_ret = true;
  }
  return bool_ret;
}
bool ObMigrationStatusHelper::check_allow_gc_abandoned_ls(const ObMigrationStatus &cur_status) bool ObMigrationStatusHelper::check_allow_gc_abandoned_ls(const ObMigrationStatus &cur_status)
{ {
bool allow_gc = false; bool allow_gc = false;

View File

@ -81,6 +81,7 @@ public:
const ObMigrationStatus &cur_status, const ObMigrationStatus &cur_status,
bool &allow_gc); bool &allow_gc);
// Check the migration status. The LS in the XXX_FAIL state is considered to be an abandoned LS, which can be judged to be directly GC when restarting // Check the migration status. The LS in the XXX_FAIL state is considered to be an abandoned LS, which can be judged to be directly GC when restarting
static bool need_online(const ObMigrationStatus &cur_status);
static bool check_allow_gc_abandoned_ls(const ObMigrationStatus &cur_status); static bool check_allow_gc_abandoned_ls(const ObMigrationStatus &cur_status);
static bool check_can_migrate_out(const ObMigrationStatus &cur_status); static bool check_can_migrate_out(const ObMigrationStatus &cur_status);
static int check_can_change_status( static int check_can_change_status(

View File

@ -576,7 +576,7 @@ bool ObLS::is_need_gc() const
return bool_ret; return bool_ret;
} }
bool ObLS::is_required_to_switch_state_for_restore_() const bool ObLS::is_clone_first_step() const
{ {
int ret = OB_SUCCESS; int ret = OB_SUCCESS;
bool bool_ret = false; bool bool_ret = false;
@ -584,12 +584,12 @@ bool ObLS::is_required_to_switch_state_for_restore_() const
if (OB_FAIL(ls_meta_.get_restore_status(restore_status))) { if (OB_FAIL(ls_meta_.get_restore_status(restore_status))) {
LOG_WARN("fail to get restore status", K(ret), K(ls_meta_.ls_id_)); LOG_WARN("fail to get restore status", K(ret), K(ls_meta_.ls_id_));
} else { } else {
bool_ret = restore_status.is_required_to_switch_ls_state_for_restore(); bool_ret = restore_status.is_clone_first_step();
} }
return bool_ret; return bool_ret;
} }
bool ObLS::is_required_to_switch_state_for_clone_() const bool ObLS::is_restore_first_step() const
{ {
int ret = OB_SUCCESS; int ret = OB_SUCCESS;
bool bool_ret = false; bool bool_ret = false;
@ -597,7 +597,7 @@ bool ObLS::is_required_to_switch_state_for_clone_() const
if (OB_FAIL(ls_meta_.get_restore_status(restore_status))) { if (OB_FAIL(ls_meta_.get_restore_status(restore_status))) {
LOG_WARN("fail to get restore status", K(ret), K(ls_meta_.ls_id_)); LOG_WARN("fail to get restore status", K(ret), K(ls_meta_.ls_id_));
} else { } else {
bool_ret = restore_status.is_required_to_switch_ls_state_for_clone(); bool_ret = restore_status.is_restore_first_step();
} }
return bool_ret; return bool_ret;
} }
@ -2096,47 +2096,16 @@ int ObLS::enable_replay()
return ret; return ret;
} }
int ObLS::check_can_online(bool &can_online) int ObLS::check_ls_need_online(bool &need_online)
{ {
int ret = OB_SUCCESS; int ret = OB_SUCCESS;
can_online = true; need_online = true;
if (is_need_gc()) { if (startup_transfer_info_.is_valid()) {
// this ls will be gc later, should not enable replay // There is a tablet has_transfer_table=true in the log stream, ls can't online
can_online = false; need_online = false;
} else if (startup_transfer_info_.is_valid()) { LOG_INFO("ls not online, need to wait dependency to be removed", "ls_id", get_ls_id(), K_(startup_transfer_info));
// There is a tablet has_transfer_table=true in the log stream } else if (OB_FAIL(ls_meta_.check_ls_need_online(need_online))) {
can_online = false; LOG_WARN("fail to check ls need online", K(ret));
LOG_INFO("ls need to wait for dependency to be removed", "ls_id", get_ls_id(),
K_(startup_transfer_info));
}
return ret;
}
int ObLS::check_can_replay_clog(bool &can_replay)
{
int ret = OB_SUCCESS;
share::ObLSRestoreStatus restore_status;
ObMigrationStatus migration_status = ObMigrationStatus::OB_MIGRATION_STATUS_MAX;
can_replay = true;
if (is_need_gc()) {
// this ls will be gc later, should not enable replay
can_replay = false;
} else if (OB_FAIL(get_migration_status(migration_status))) {
LOG_WARN("failed to get ls migration status", K(ret));
} else if (ObMigrationStatus::OB_MIGRATION_STATUS_REBUILD == migration_status) {
// ls will online in rebuild process, ls online will enable clog replay
can_replay = false;
LOG_INFO("ls is in rebuild process, cannot replay clog", "ls_id", get_ls_id(), K(migration_status));
} else if (OB_FAIL(get_restore_status(restore_status))) {
LOG_WARN("fail to get ls restore status", K(ret));
} else if (!restore_status.can_replay_log()) {
// while downtime, if ls's restore status is in [restore_start, wait_restore_tablet_meta], clog can't replay
can_replay = false;
LOG_INFO("restore status not as expected, can not replay clog", "ls_id", get_ls_id(), K(restore_status));
} else if (startup_transfer_info_.is_valid()) {
// There is a tablet has_transfer_table=true in the log stream, clog can't replay
can_replay = false;
LOG_INFO("ls not enable clog replay, need to wait for dependency to be removed", "ls_id", get_ls_id(), K_(startup_transfer_info));
} }
return ret; return ret;
} }

View File

@ -310,6 +310,8 @@ public:
bool is_create_committed() const; bool is_create_committed() const;
bool is_need_gc() const; bool is_need_gc() const;
bool is_in_gc(); bool is_in_gc();
bool is_restore_first_step() const;
bool is_clone_first_step() const;
// for rebuild // for rebuild
// remove inner tablet, the memtable and minor sstable of data tablet, disable replay // remove inner tablet, the memtable and minor sstable of data tablet, disable replay
// int prepare_rebuild(); // int prepare_rebuild();
@ -383,7 +385,7 @@ public:
int flush_if_need(const bool need_flush); int flush_if_need(const bool need_flush);
int try_sync_reserved_snapshot(const int64_t new_reserved_snapshot, const bool update_flag); int try_sync_reserved_snapshot(const int64_t new_reserved_snapshot, const bool update_flag);
int check_can_replay_clog(bool &can_replay); int check_can_replay_clog(bool &can_replay);
int check_can_online(bool &can_online); int check_ls_need_online(bool &need_online);
TO_STRING_KV(K_(running_state), K_(ls_meta), K_(switch_epoch), K_(log_handler), K_(restore_handler), K_(is_inited), K_(tablet_gc_handler), K_(startup_transfer_info)); TO_STRING_KV(K_(running_state), K_(ls_meta), K_(switch_epoch), K_(log_handler), K_(restore_handler), K_(is_inited), K_(tablet_gc_handler), K_(startup_transfer_info));
private: private:
@ -405,8 +407,6 @@ private:
ObTabletHandle &handle); ObTabletHandle &handle);
int offline_advance_epoch_(); int offline_advance_epoch_();
int online_advance_epoch_(); int online_advance_epoch_();
bool is_required_to_switch_state_for_restore_() const;
bool is_required_to_switch_state_for_clone_() const;
public: public:
// ObLSMeta interface: // ObLSMeta interface:
int update_ls_meta(const bool update_restore_status, int update_ls_meta(const bool update_restore_status,

View File

@ -805,21 +805,34 @@ int ObLSMeta::get_create_type(int64_t &create_type) const
if (!is_valid()) { if (!is_valid()) {
ret = OB_ERR_UNEXPECTED; ret = OB_ERR_UNEXPECTED;
LOG_WARN("log stream meta is not valid, cannot get restore status", K(ret), K(*this)); LOG_WARN("log stream meta is not valid, cannot get restore status", K(ret), K(*this));
// before 4.3 } else if (ObMigrationStatus::OB_MIGRATION_STATUS_NONE != migration_status_) {
} else if (restore_status_.is_required_to_switch_ls_state_for_restore()) {
create_type = ObLSCreateType::RESTORE;
} else if (ObMigrationStatus::OB_MIGRATION_STATUS_REBUILD == migration_status_) {
create_type = ObLSCreateType::MIGRATE; create_type = ObLSCreateType::MIGRATE;
} else if (restore_status_.is_required_to_switch_ls_state_for_clone()) { } else if (restore_status_.is_in_clone()) {
create_type = ObLSCreateType::CLONE; create_type = ObLSCreateType::CLONE;
// before 4.3 end } else if (restore_status_.is_in_restore()) {
// after 4.3 create_type = ObLSCreateType::RESTORE;
} else if (ls_persistent_state_.is_ha_state()) { } else if (ls_persistent_state_.is_ha_state()) {
create_type = ObLSCreateType::MIGRATE; create_type = ObLSCreateType::MIGRATE;
} }
return ret; return ret;
} }
// Decides from the persisted meta whether the log stream needs to be onlined.
//
// @param need_online [out] set to true on entry; cleared to false when either
//        ObMigrationStatusHelper::need_online() rejects the migration status,
//        or the migration status is NONE but the restore status reports that
//        it does not need online.
// @return OB_SUCCESS normally; OB_ERR_UNEXPECTED when the meta is not valid
//         (need_online is left as true in that case).
int ObLSMeta::check_ls_need_online(bool &need_online) const
{
  int ret = OB_SUCCESS;
  need_online = true;
  if (!is_valid()) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("log stream meta is not valid", K(ret), K(*this));
  } else {
    // The migration status is consulted first; the restore status is only
    // examined when the migration status did not already rule out onlining.
    if (!ObMigrationStatusHelper::need_online(migration_status_)) {
      need_online = false;
    } else {
      const bool migration_is_none =
          (ObMigrationStatus::OB_MIGRATION_STATUS_NONE == migration_status_);
      if (migration_is_none && !restore_status_.need_online()) {
        need_online = false;
      }
    }
  }
  return ret;
}
ObLSMeta::ObReentrantWLockGuard::ObReentrantWLockGuard(ObLatch &lock, ObLSMeta::ObReentrantWLockGuard::ObReentrantWLockGuard(ObLatch &lock,
const bool try_lock, const bool try_lock,
const int64_t warn_threshold) const int64_t warn_threshold)

View File

@ -108,6 +108,7 @@ public:
int set_rebuild_info(const ObLSRebuildInfo &rebuild_info); int set_rebuild_info(const ObLSRebuildInfo &rebuild_info);
int get_rebuild_info(ObLSRebuildInfo &rebuild_info) const; int get_rebuild_info(ObLSRebuildInfo &rebuild_info) const;
int get_create_type(int64_t &create_type) const; int get_create_type(int64_t &create_type) const;
int check_ls_need_online(bool &need_online) const;
int init( int init(
const uint64_t tenant_id, const uint64_t tenant_id,

View File

@ -516,7 +516,7 @@ int ObTabletStartTransferOutCommonHelper::try_enable_dest_ls_clog_replay(
ObLSHandle dest_ls_handle; ObLSHandle dest_ls_handle;
ObLS *dest_ls = NULL; ObLS *dest_ls = NULL;
SCN max_decided_scn; SCN max_decided_scn;
bool can_online = true; bool need_online = true;
ObLSTransferInfo transfer_info; ObLSTransferInfo transfer_info;
static const int64_t SLEEP_TS = 100_ms; static const int64_t SLEEP_TS = 100_ms;
if (!scn.is_valid() || !dest_ls_id.is_valid()) { if (!scn.is_valid() || !dest_ls_id.is_valid()) {
@ -543,9 +543,9 @@ int ObTabletStartTransferOutCommonHelper::try_enable_dest_ls_clog_replay(
} else { } else {
transfer_info = dest_ls->get_ls_startup_transfer_info(); transfer_info = dest_ls->get_ls_startup_transfer_info();
dest_ls->get_ls_startup_transfer_info().reset(); dest_ls->get_ls_startup_transfer_info().reset();
if (OB_FAIL(dest_ls->check_can_online(can_online))) { if (OB_FAIL(dest_ls->check_ls_need_online(need_online))) {
LOG_WARN("failed to check can online", KR(ret), K(dest_ls)); LOG_WARN("failed to check can online", KR(ret), K(dest_ls));
} else if (!can_online) { } else if (!need_online) {
// do nothing // do nothing
} else if (CLICK_FAIL(dest_ls->online())) { } else if (CLICK_FAIL(dest_ls->online())) {
LOG_ERROR("fail to online ls", K(ret), K(scn), K(dest_ls_id), "ls_startup_transfer_info", dest_ls->get_ls_startup_transfer_info()); LOG_ERROR("fail to online ls", K(ret), K(scn), K(dest_ls_id), "ls_startup_transfer_info", dest_ls->get_ls_startup_transfer_info());

View File

@ -480,47 +480,58 @@ int ObLSService::post_create_ls_(const int64_t create_type,
{ {
int ret = OB_SUCCESS; int ret = OB_SUCCESS;
int tmp_ret = OB_SUCCESS; int tmp_ret = OB_SUCCESS;
switch(create_type) { bool need_online = false;
case ObLSCreateType::NORMAL: { if (OB_FAIL(ls->check_ls_need_online(need_online))) {
if (OB_FAIL(ls->online_without_lock())) { LOG_WARN("check ls need online failed", K(ret));
LOG_ERROR("ls start failed", K(ret)); } else if (need_online &&
} else if (OB_FAIL(ls->set_start_work_state())) { OB_FAIL(ls->online_without_lock())) {
LOG_ERROR("ls set start work state failed", KR(ret), KPC(ls)); LOG_ERROR("ls start failed", K(ret));
} else { } else {
switch(create_type) {
case ObLSCreateType::NORMAL: {
if (OB_FAIL(ls->set_start_work_state())) {
LOG_ERROR("ls set start work state failed", KR(ret), KPC(ls));
}
break;
} }
break; case ObLSCreateType::RESTORE: {
} if (!need_online && ls->is_restore_first_step()) {
case ObLSCreateType::RESTORE: { if (OB_FAIL(ls->get_log_handler()->enable_sync())) {
if (OB_FAIL(ls->get_log_handler()->enable_sync())) { LOG_WARN("failed to enable sync", K(ret));
LOG_WARN("failed to enable sync", K(ret)); } else if (OB_FAIL(ls->get_ls_restore_handler()->online())) {
} else if (OB_FAIL(ls->get_ls_restore_handler()->online())) { LOG_WARN("failed to online restore handler", K(ret));
LOG_WARN("failed to online restore handler", K(ret)); }
} else if (OB_FAIL(ls->set_start_ha_state())) { }
LOG_ERROR("ls set start ha state failed", KR(ret), KPC(ls)); if (OB_FAIL(ret)) {
} else { } else if (OB_FAIL(ls->set_start_ha_state())) {
LOG_ERROR("ls set start ha state failed", KR(ret), KPC(ls));
}
break;
} }
break; case ObLSCreateType::MIGRATE: {
} if (OB_FAIL(ls->set_start_ha_state())) {
case ObLSCreateType::MIGRATE: { LOG_ERROR("ls set start ha state failed", KR(ret), KPC(ls));
if (OB_FAIL(ls->set_start_ha_state())) { }
LOG_ERROR("ls set start ha state failed", KR(ret), KPC(ls)); break;
} }
break; case ObLSCreateType::CLONE: {
} if (!need_online && ls->is_clone_first_step()) {
case ObLSCreateType::CLONE: { if (OB_FAIL(ls->get_log_handler()->enable_sync())) {
if (OB_FAIL(ls->get_log_handler()->enable_sync())) { LOG_WARN("failed to enable sync", K(ret));
LOG_WARN("failed to enable sync", K(ret)); }
} else if (OB_FAIL(ls->set_start_ha_state())) { }
LOG_ERROR("ls set start ha state failed", KR(ret), KPC(ls)); if (OB_FAIL(ret)) {
} else { } else if (OB_FAIL(ls->set_start_ha_state())) {
LOG_ERROR("ls set start ha state failed", KR(ret), KPC(ls));
}
break;
} }
break; default: {
ret = OB_ERR_UNEXPECTED;
LOG_ERROR("should not be here.", KR(ret));
} // default
} // switch
} }
default: {
ret = OB_ERR_UNEXPECTED;
LOG_ERROR("should not be here.", KR(ret));
} // default
} // switch
if (OB_SUCCESS != (tmp_ret = ls->report_replica_info())) { if (OB_SUCCESS != (tmp_ret = ls->report_replica_info())) {
LOG_WARN("fail to report ls", KR(tmp_ret), KPC(ls)); LOG_WARN("fail to report ls", KR(tmp_ret), KPC(ls));
@ -740,7 +751,6 @@ int ObLSService::online_ls()
int tmp_ret = OB_SUCCESS; int tmp_ret = OB_SUCCESS;
common::ObSharedGuard<ObLSIterator> ls_iter; common::ObSharedGuard<ObLSIterator> ls_iter;
ObLS *ls = nullptr; ObLS *ls = nullptr;
bool can_online = true;
int64_t create_type = ObLSCreateType::NORMAL; int64_t create_type = ObLSCreateType::NORMAL;
if (OB_FAIL(get_ls_iter(ls_iter, ObLSGetMod::TXSTORAGE_MOD))) { if (OB_FAIL(get_ls_iter(ls_iter, ObLSGetMod::TXSTORAGE_MOD))) {
LOG_WARN("failed to get ls iter", K(ret)); LOG_WARN("failed to get ls iter", K(ret));
@ -755,11 +765,7 @@ int ObLSService::online_ls()
LOG_ERROR("ls is null", K(ret)); LOG_ERROR("ls is null", K(ret));
} else { } else {
ObLSLockGuard lock_ls(ls); ObLSLockGuard lock_ls(ls);
if (OB_FAIL(ls->check_can_online(can_online))) { if (OB_FAIL(ls->get_create_type(create_type))) {
LOG_WARN("check ls can online failed", K(ret));
} else if (!can_online) {
// ls can not online, do nothing
} else if (OB_FAIL(ls->get_create_type(create_type))) {
LOG_WARN("get ls create type failed", K(ret)); LOG_WARN("get ls create type failed", K(ret));
} else if (OB_FAIL(post_create_ls_(create_type, ls))) { } else if (OB_FAIL(post_create_ls_(create_type, ls))) {
LOG_WARN("post create ls failed", K(ret)); LOG_WARN("post create ls failed", K(ret));