BUGFIX: fix restore ls state after restart

This commit is contained in:
obdev 2024-02-07 23:42:11 +00:00 committed by ob-robot
parent 4cc88e0b6e
commit 9c018210bd
11 changed files with 112 additions and 107 deletions

View File

@ -713,7 +713,7 @@ int ObLSService::online_ls()
int tmp_ret = OB_SUCCESS;
common::ObSharedGuard<ObLSIterator> ls_iter;
ObLS *ls = nullptr;
bool can_replay = true;
int64_t create_type = ObLSCreateType::NORMAL;
if (OB_FAIL(get_ls_iter(ls_iter, ObLSGetMod::TXSTORAGE_MOD))) {
LOG_WARN("failed to get ls iter", K(ret));
} else {
@ -725,12 +725,13 @@ int ObLSService::online_ls()
} else if (nullptr == ls) {
ret = OB_ERR_UNEXPECTED;
LOG_ERROR("ls is null", K(ret));
} else if (OB_FAIL(ls->check_can_replay_clog(can_replay))) {
LOG_WARN("failed to check ls can replay clog", K(ret), KPC(ls));
} else if (!can_replay) {
// ls can not enable replay
} else if (OB_FAIL(ls->enable_replay())) {
LOG_ERROR("fail to enable replay", K(ret));
} else {
ObLSLockGuard lock_ls(ls);
if (OB_FAIL(ls->get_create_type(create_type))) {
LOG_WARN("get ls create type failed", K(ret));
} else if (OB_FAIL(post_create_ls_(create_type, ls))) {
LOG_WARN("post create ls failed", K(ret));
}
}
}
if (OB_ITER_END == ret) {

View File

@ -88,6 +88,13 @@ bool ObLSRestoreStatus::is_valid_(int32_t status) const
}
#undef LS_RESTORE_STATUS_CASE_TO_TYPE
// Reports whether this restore status is one for which the LS needs to be online
// (consumed by ObLSMeta::check_ls_need_online).
// Returns true when status_ is NONE, CLONE_CLOG_REPLAY, or lies in the closed range
// [WAIT_RESTORE_SYS_TABLETS, WAIT_RESTORE_MAJOR_DATA]; returns false for every other status.
bool ObLSRestoreStatus::need_online() const
{
  // The range test relies on the numeric ordering of the status values.
  const bool in_wait_restore_range =
      (WAIT_RESTORE_SYS_TABLETS <= status_) && (status_ <= WAIT_RESTORE_MAJOR_DATA);
  const bool is_none = (NONE == status_);
  const bool is_clone_clog_replay = (CLONE_CLOG_REPLAY == status_);
  return in_wait_restore_range || is_none || is_clone_clog_replay;
}
int ObLSRestoreStatus::set_status(int32_t status)
{

View File

@ -110,7 +110,9 @@ public:
}
// offline ls and enable sync and online ls restore handler in [RESTORE_START, RESTORE_SYS_TABLETS] or RESTORE_FAILED
bool is_required_to_switch_ls_state_for_restore() const
bool need_online() const;
// enable sync and online ls restore handler in [RESTORE_START, RESTORE_SYS_TABLETS] or RESTORE_FAILED
bool is_restore_first_step() const
{
return ((status_ >= Status::RESTORE_START && status_ <= Status::RESTORE_SYS_TABLETS) ||
status_ == Status::RESTORE_FAILED);
@ -122,7 +124,7 @@ public:
{
return status_ >= Status::CLONE_START && status_ <= Status::CLONE_COPY_ALL_TABLET_META;
}
bool is_required_to_switch_ls_state_for_clone() const
bool is_clone_first_step() const
{
return ((status_ >= Status::CLONE_START && status_ <= Status::CLONE_COPY_LS_META) ||
Status::CLONE_FAILED == status_);

View File

@ -594,6 +594,11 @@ bool ObMigrationStatusHelper::check_migration_status_is_fail_(const ObMigrationS
return is_fail;
}
// Reports whether an LS with the given migration status needs to be online.
// @param cur_status  migration status to inspect
// @return true only when cur_status is OB_MIGRATION_STATUS_NONE; false for any other status.
bool ObMigrationStatusHelper::need_online(const ObMigrationStatus &cur_status)
{
  const bool status_is_none = (cur_status == OB_MIGRATION_STATUS_NONE);
  return status_is_none;
}
bool ObMigrationStatusHelper::check_allow_gc_abandoned_ls(const ObMigrationStatus &cur_status)
{
bool allow_gc = false;

View File

@ -81,6 +81,7 @@ public:
const ObMigrationStatus &cur_status,
bool &allow_gc);
// Returns true only when the migration status is OB_MIGRATION_STATUS_NONE.
static bool need_online(const ObMigrationStatus &cur_status);
// Check the migration status. An LS in an XXX_FAIL state is considered an abandoned LS,
// which can be GC'd directly when restarting.
static bool check_allow_gc_abandoned_ls(const ObMigrationStatus &cur_status);
static bool check_can_migrate_out(const ObMigrationStatus &cur_status);
static int check_can_change_status(

View File

@ -576,7 +576,7 @@ bool ObLS::is_need_gc() const
return bool_ret;
}
bool ObLS::is_required_to_switch_state_for_restore_() const
bool ObLS::is_clone_first_step() const
{
int ret = OB_SUCCESS;
bool bool_ret = false;
@ -584,12 +584,12 @@ bool ObLS::is_required_to_switch_state_for_restore_() const
if (OB_FAIL(ls_meta_.get_restore_status(restore_status))) {
LOG_WARN("fail to get restore status", K(ret), K(ls_meta_.ls_id_));
} else {
bool_ret = restore_status.is_required_to_switch_ls_state_for_restore();
bool_ret = restore_status.is_clone_first_step();
}
return bool_ret;
}
bool ObLS::is_required_to_switch_state_for_clone_() const
bool ObLS::is_restore_first_step() const
{
int ret = OB_SUCCESS;
bool bool_ret = false;
@ -597,7 +597,7 @@ bool ObLS::is_required_to_switch_state_for_clone_() const
if (OB_FAIL(ls_meta_.get_restore_status(restore_status))) {
LOG_WARN("fail to get restore status", K(ret), K(ls_meta_.ls_id_));
} else {
bool_ret = restore_status.is_required_to_switch_ls_state_for_clone();
bool_ret = restore_status.is_restore_first_step();
}
return bool_ret;
}
@ -2096,47 +2096,16 @@ int ObLS::enable_replay()
return ret;
}
int ObLS::check_can_online(bool &can_online)
int ObLS::check_ls_need_online(bool &need_online)
{
int ret = OB_SUCCESS;
can_online = true;
if (is_need_gc()) {
// this ls will be gc later, should not enable replay
can_online = false;
} else if (startup_transfer_info_.is_valid()) {
// There is a tablet has_transfer_table=true in the log stream
can_online = false;
LOG_INFO("ls need to wait for dependency to be removed", "ls_id", get_ls_id(),
K_(startup_transfer_info));
}
return ret;
}
int ObLS::check_can_replay_clog(bool &can_replay)
{
int ret = OB_SUCCESS;
share::ObLSRestoreStatus restore_status;
ObMigrationStatus migration_status = ObMigrationStatus::OB_MIGRATION_STATUS_MAX;
can_replay = true;
if (is_need_gc()) {
// this ls will be gc later, should not enable replay
can_replay = false;
} else if (OB_FAIL(get_migration_status(migration_status))) {
LOG_WARN("failed to get ls migration status", K(ret));
} else if (ObMigrationStatus::OB_MIGRATION_STATUS_REBUILD == migration_status) {
// ls will online in rebuild process, ls online will enable clog replay
can_replay = false;
LOG_INFO("ls is in rebuild process, cannot replay clog", "ls_id", get_ls_id(), K(migration_status));
} else if (OB_FAIL(get_restore_status(restore_status))) {
LOG_WARN("fail to get ls restore status", K(ret));
} else if (!restore_status.can_replay_log()) {
// while downtime, if ls's restore status is in [restore_start, wait_restore_tablet_meta], clog can't replay
can_replay = false;
LOG_INFO("restore status not as expected, can not replay clog", "ls_id", get_ls_id(), K(restore_status));
} else if (startup_transfer_info_.is_valid()) {
// There is a tablet has_transfer_table=true in the log stream, clog can't replay
can_replay = false;
LOG_INFO("ls not enable clog replay, need to wait for dependency to be removed", "ls_id", get_ls_id(), K_(startup_transfer_info));
need_online = true;
if (startup_transfer_info_.is_valid()) {
// There is a tablet has_transfer_table=true in the log stream, ls can't online
need_online = false;
LOG_INFO("ls not online, need to wait dependency to be removed", "ls_id", get_ls_id(), K_(startup_transfer_info));
} else if (OB_FAIL(ls_meta_.check_ls_need_online(need_online))) {
LOG_WARN("fail to check ls need online", K(ret));
}
return ret;
}

View File

@ -310,6 +310,8 @@ public:
bool is_create_committed() const;
bool is_need_gc() const;
bool is_in_gc();
// True when the LS restore status is in [RESTORE_START, RESTORE_SYS_TABLETS] or is RESTORE_FAILED.
// Also returns false (after logging a warning) when the restore status cannot be read from the LS meta.
bool is_restore_first_step() const;
// True when the LS restore status is in [CLONE_START, CLONE_COPY_LS_META] or is CLONE_FAILED.
// Also returns false (after logging a warning) when the restore status cannot be read from the LS meta.
bool is_clone_first_step() const;
// for rebuild
// remove inner tablet, the memtable and minor sstable of data tablet, disable replay
// int prepare_rebuild();
@ -383,7 +385,7 @@ public:
int flush_if_need(const bool need_flush);
int try_sync_reserved_snapshot(const int64_t new_reserved_snapshot, const bool update_flag);
int check_can_replay_clog(bool &can_replay);
int check_can_online(bool &can_online);
int check_ls_need_online(bool &need_online);
TO_STRING_KV(K_(running_state), K_(ls_meta), K_(switch_epoch), K_(log_handler), K_(restore_handler), K_(is_inited), K_(tablet_gc_handler), K_(startup_transfer_info));
private:
@ -405,8 +407,6 @@ private:
ObTabletHandle &handle);
int offline_advance_epoch_();
int online_advance_epoch_();
bool is_required_to_switch_state_for_restore_() const;
bool is_required_to_switch_state_for_clone_() const;
public:
// ObLSMeta interface:
int update_ls_meta(const bool update_restore_status,

View File

@ -805,21 +805,34 @@ int ObLSMeta::get_create_type(int64_t &create_type) const
if (!is_valid()) {
ret = OB_ERR_UNEXPECTED;
LOG_WARN("log stream meta is not valid, cannot get restore status", K(ret), K(*this));
// before 4.3
} else if (restore_status_.is_required_to_switch_ls_state_for_restore()) {
create_type = ObLSCreateType::RESTORE;
} else if (ObMigrationStatus::OB_MIGRATION_STATUS_REBUILD == migration_status_) {
} else if (ObMigrationStatus::OB_MIGRATION_STATUS_NONE != migration_status_) {
create_type = ObLSCreateType::MIGRATE;
} else if (restore_status_.is_required_to_switch_ls_state_for_clone()) {
} else if (restore_status_.is_in_clone()) {
create_type = ObLSCreateType::CLONE;
// before 4.3 end
// after 4.3
} else if (restore_status_.is_in_restore()) {
create_type = ObLSCreateType::RESTORE;
} else if (ls_persistent_state_.is_ha_state()) {
create_type = ObLSCreateType::MIGRATE;
}
return ret;
}
// Decides from the LS meta whether this LS needs to be online.
// @param [out] need_online  set to false when the migration status does not need online
//                           (ObMigrationStatusHelper::need_online), or when the migration status is
//                           OB_MIGRATION_STATUS_NONE but the restore status does not need online;
//                           true otherwise.
// @return OB_SUCCESS on success; OB_ERR_UNEXPECTED when the meta is not valid, in which case
//         need_online keeps its initial value of true and must not be relied on.
int ObLSMeta::check_ls_need_online(bool &need_online) const
{
  int ret = OB_SUCCESS;
  need_online = true;
  if (!is_valid()) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("log stream meta is not valid", K(ret), K(*this));
  } else if (!ObMigrationStatusHelper::need_online(migration_status_)
             || (ObMigrationStatus::OB_MIGRATION_STATUS_NONE == migration_status_
                 && !restore_status_.need_online())) {
    // Short-circuit order matters: the restore status is only consulted after the
    // migration status has been accepted.
    need_online = false;
  }
  return ret;
}
ObLSMeta::ObReentrantWLockGuard::ObReentrantWLockGuard(ObLatch &lock,
const bool try_lock,
const int64_t warn_threshold)

View File

@ -108,6 +108,7 @@ public:
int set_rebuild_info(const ObLSRebuildInfo &rebuild_info);
int get_rebuild_info(ObLSRebuildInfo &rebuild_info) const;
int get_create_type(int64_t &create_type) const;
// Sets need_online to false when the migration status is not OB_MIGRATION_STATUS_NONE or the
// restore status does not need online; sets it to true otherwise.
// Returns OB_ERR_UNEXPECTED when the meta is not valid.
int check_ls_need_online(bool &need_online) const;
int init(
const uint64_t tenant_id,

View File

@ -516,7 +516,7 @@ int ObTabletStartTransferOutCommonHelper::try_enable_dest_ls_clog_replay(
ObLSHandle dest_ls_handle;
ObLS *dest_ls = NULL;
SCN max_decided_scn;
bool can_online = true;
bool need_online = true;
ObLSTransferInfo transfer_info;
static const int64_t SLEEP_TS = 100_ms;
if (!scn.is_valid() || !dest_ls_id.is_valid()) {
@ -543,9 +543,9 @@ int ObTabletStartTransferOutCommonHelper::try_enable_dest_ls_clog_replay(
} else {
transfer_info = dest_ls->get_ls_startup_transfer_info();
dest_ls->get_ls_startup_transfer_info().reset();
if (OB_FAIL(dest_ls->check_can_online(can_online))) {
if (OB_FAIL(dest_ls->check_ls_need_online(need_online))) {
LOG_WARN("failed to check can online", KR(ret), K(dest_ls));
} else if (!can_online) {
} else if (!need_online) {
// do nothing
} else if (CLICK_FAIL(dest_ls->online())) {
LOG_ERROR("fail to online ls", K(ret), K(scn), K(dest_ls_id), "ls_startup_transfer_info", dest_ls->get_ls_startup_transfer_info());

View File

@ -480,47 +480,58 @@ int ObLSService::post_create_ls_(const int64_t create_type,
{
int ret = OB_SUCCESS;
int tmp_ret = OB_SUCCESS;
switch(create_type) {
case ObLSCreateType::NORMAL: {
if (OB_FAIL(ls->online_without_lock())) {
LOG_ERROR("ls start failed", K(ret));
} else if (OB_FAIL(ls->set_start_work_state())) {
LOG_ERROR("ls set start work state failed", KR(ret), KPC(ls));
} else {
bool need_online = false;
if (OB_FAIL(ls->check_ls_need_online(need_online))) {
LOG_WARN("check ls need online failed", K(ret));
} else if (need_online &&
OB_FAIL(ls->online_without_lock())) {
LOG_ERROR("ls start failed", K(ret));
} else {
switch(create_type) {
case ObLSCreateType::NORMAL: {
if (OB_FAIL(ls->set_start_work_state())) {
LOG_ERROR("ls set start work state failed", KR(ret), KPC(ls));
}
break;
}
break;
}
case ObLSCreateType::RESTORE: {
if (OB_FAIL(ls->get_log_handler()->enable_sync())) {
LOG_WARN("failed to enable sync", K(ret));
} else if (OB_FAIL(ls->get_ls_restore_handler()->online())) {
LOG_WARN("failed to online restore handler", K(ret));
} else if (OB_FAIL(ls->set_start_ha_state())) {
LOG_ERROR("ls set start ha state failed", KR(ret), KPC(ls));
} else {
case ObLSCreateType::RESTORE: {
if (!need_online && ls->is_restore_first_step()) {
if (OB_FAIL(ls->get_log_handler()->enable_sync())) {
LOG_WARN("failed to enable sync", K(ret));
} else if (OB_FAIL(ls->get_ls_restore_handler()->online())) {
LOG_WARN("failed to online restore handler", K(ret));
}
}
if (OB_FAIL(ret)) {
} else if (OB_FAIL(ls->set_start_ha_state())) {
LOG_ERROR("ls set start ha state failed", KR(ret), KPC(ls));
}
break;
}
break;
}
case ObLSCreateType::MIGRATE: {
if (OB_FAIL(ls->set_start_ha_state())) {
LOG_ERROR("ls set start ha state failed", KR(ret), KPC(ls));
case ObLSCreateType::MIGRATE: {
if (OB_FAIL(ls->set_start_ha_state())) {
LOG_ERROR("ls set start ha state failed", KR(ret), KPC(ls));
}
break;
}
break;
}
case ObLSCreateType::CLONE: {
if (OB_FAIL(ls->get_log_handler()->enable_sync())) {
LOG_WARN("failed to enable sync", K(ret));
} else if (OB_FAIL(ls->set_start_ha_state())) {
LOG_ERROR("ls set start ha state failed", KR(ret), KPC(ls));
} else {
case ObLSCreateType::CLONE: {
if (!need_online && ls->is_clone_first_step()) {
if (OB_FAIL(ls->get_log_handler()->enable_sync())) {
LOG_WARN("failed to enable sync", K(ret));
}
}
if (OB_FAIL(ret)) {
} else if (OB_FAIL(ls->set_start_ha_state())) {
LOG_ERROR("ls set start ha state failed", KR(ret), KPC(ls));
}
break;
}
break;
default: {
ret = OB_ERR_UNEXPECTED;
LOG_ERROR("should not be here.", KR(ret));
} // default
} // switch
}
default: {
ret = OB_ERR_UNEXPECTED;
LOG_ERROR("should not be here.", KR(ret));
} // default
} // switch
if (OB_SUCCESS != (tmp_ret = ls->report_replica_info())) {
LOG_WARN("fail to report ls", KR(tmp_ret), KPC(ls));
@ -740,7 +751,6 @@ int ObLSService::online_ls()
int tmp_ret = OB_SUCCESS;
common::ObSharedGuard<ObLSIterator> ls_iter;
ObLS *ls = nullptr;
bool can_online = true;
int64_t create_type = ObLSCreateType::NORMAL;
if (OB_FAIL(get_ls_iter(ls_iter, ObLSGetMod::TXSTORAGE_MOD))) {
LOG_WARN("failed to get ls iter", K(ret));
@ -755,11 +765,7 @@ int ObLSService::online_ls()
LOG_ERROR("ls is null", K(ret));
} else {
ObLSLockGuard lock_ls(ls);
if (OB_FAIL(ls->check_can_online(can_online))) {
LOG_WARN("check ls can online failed", K(ret));
} else if (!can_online) {
// ls can not online, do nothing
} else if (OB_FAIL(ls->get_create_type(create_type))) {
if (OB_FAIL(ls->get_create_type(create_type))) {
LOG_WARN("get ls create type failed", K(ret));
} else if (OB_FAIL(post_create_ls_(create_type, ls))) {
LOG_WARN("post create ls failed", K(ret));