fix deadlock and wrong get GTS interface

This commit is contained in:
wxhwang
2023-06-22 09:18:09 +00:00
committed by ob-robot
parent 7ee480eceb
commit cab47b299e
3 changed files with 49 additions and 35 deletions


@@ -765,14 +765,8 @@ int ObArchiveHandler::get_max_checkpoint_scn_(const uint64_t tenant_id, SCN &max_checkpoint_scn)
   // That would lead to logs of the create-log-stream type being archived before they are replayed. In this case,
   // we should limit the tenant archive progress to no more than the GTS.
   int ret = OB_SUCCESS;
-  ObAllTenantInfo tenant_info;
-  const bool for_update = false;
-  if (OB_FAIL(ObAllTenantInfoProxy::load_tenant_info(tenant_id, sql_proxy_, for_update, tenant_info))) {
-    LOG_WARN("failed to get tenant info", K(ret), K(tenant_id));
-  } else if (OB_FALSE_IT(max_checkpoint_scn = tenant_info.get_standby_scn())) {
-  } else if (SCN::base_scn() >= max_checkpoint_scn) {
-    ret = OB_ERR_UNEXPECTED;
-    LOG_WARN("max_checkpoint_scn not valid", K(ret), K(tenant_info));
+  if (OB_FAIL(ObBackupUtils::get_backup_scn(tenant_id_, max_checkpoint_scn))) {
+    LOG_WARN("failed to get max checkpoint scn.", K(ret), K_(tenant_id));
   }
   return ret;
 }


@@ -139,14 +139,8 @@ bool ObArchiveRoundHandler::can_suspend_archive(const ObTenantArchiveRoundAttr &
 int ObArchiveRoundHandler::decide_start_scn_(SCN &start_scn)
 {
   int ret = OB_SUCCESS;
-  ObAllTenantInfo tenant_info;
-  const bool for_update = false;
-  if (OB_FAIL(ObAllTenantInfoProxy::load_tenant_info(tenant_id_, sql_proxy_, for_update, tenant_info))) {
-    LOG_WARN("failed to get tenant info", K(ret), K_(tenant_id));
-  } else if (OB_FALSE_IT(start_scn = tenant_info.get_standby_scn())) {
-  } else if (SCN::base_scn() >= start_scn) {
-    ret = OB_EAGAIN;
-    LOG_WARN("start_scn not valid, need wait", K(ret), K(tenant_info));
+  if (OB_FAIL(ObBackupUtils::get_backup_scn(tenant_id_, start_scn))) {
+    LOG_WARN("failed to decide archive start scn.", K(ret), K_(tenant_id));
   }
   return ret;
 }
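
The two archive hunks above are the same fix: the SCN bounding archive progress is no longer derived from tenant_info.get_standby_scn(), which the commit title calls the wrong interface for obtaining GTS, but from the shared ObBackupUtils::get_backup_scn helper (its internals are not shown in this diff). A minimal sketch of the resulting call pattern, assuming the OceanBase SCN type and error macros from the surrounding code; the wrapper name decide_archive_scn is hypothetical:

// Sketch only, not part of the commit: mirrors the post-fix call sites.
int decide_archive_scn(const uint64_t tenant_id, SCN &scn)
{
  int ret = OB_SUCCESS;
  if (OB_FAIL(ObBackupUtils::get_backup_scn(tenant_id, scn))) {
    // Both call sites now simply propagate ret; the old per-site checks that
    // mapped SCN::base_scn() to OB_ERR_UNEXPECTED or OB_EAGAIN are gone.
    LOG_WARN("failed to get backup scn", K(ret), K(tenant_id));
  }
  return ret;
}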


@@ -227,27 +227,53 @@ int ObLSRestoreHandler::handle_execute_over(
   } else if (result != OB_SUCCESS) {
     share::ObLSRestoreStatus status;
     common::ObRole role;
-    lib::ObMutexGuard guard(mtx_);
-    if (nullptr != state_handler_) {
-      status = state_handler_->get_restore_status();
-      role = state_handler_->get_role();
-    }
+    // This function is called when a dag net finishes. There is a lock-ordering
+    // problem with offline.
+    // Lock sequence for offline is:
+    // 1. lock ObLSRestoreHandler::mtx_
+    // 2. lock ObTenantDagScheduler::scheduler_sync_, which is taken in notify when cancelling the dag net.
+    //
+    // Lock sequence for finishing a dag net is:
+    // 1. lock ObTenantDagScheduler::scheduler_sync_
+    // 2. lock ObLSRestoreHandler::mtx_, as follows.
+    //
+    // To avoid the deadlock, use trylock instead of lock.
+    int retry_cnt = 0;
+    // If locking fails, retry up to 3 times.
+    const int64_t MAX_TRY_LOCK_CNT = 3;
+    do {
+      if (OB_FAIL(mtx_.trylock())) {
+        LOG_WARN("lock restore handler failed, retry later", K(ret), KPC_(ls));
+        sleep(1);
+      } else {
+        if (nullptr != state_handler_) {
+          status = state_handler_->get_restore_status();
+          role = state_handler_->get_role();
+        }
-#ifdef ERRSIM
-    SERVER_EVENT_ADD("storage_ha", "handle_execute_over_errsim", "result", result);
-#endif
+#ifdef ERRSIM
+        SERVER_EVENT_ADD("storage_ha", "handle_execute_over_errsim", "result", result);
+#endif
-    if (status.is_restore_sys_tablets()) {
-      state_handler_->set_retry_flag();
-      result_mgr_.set_result(result, task_id, ObLSRestoreResultMgr::RestoreFailedType::DATA_RESTORE_FAILED_TYPE);
-      LOG_WARN("restore sys tablets dag failed, need retry", K(ret));
-    } else if (OB_TABLET_NOT_EXIST == result) {
-      LOG_INFO("tablet has been deleted, no need to record err info", K(restore_failed_tablets));
-    } else if (common::ObRole::FOLLOWER == role && result_mgr_.can_retrieable_err(result)) {
-      LOG_INFO("follower met retrieable err, no need to record", K(result), K(task_id));
-    } else {
-      result_mgr_.set_result(result, task_id, ObLSRestoreResultMgr::RestoreFailedType::DATA_RESTORE_FAILED_TYPE);
-      LOG_WARN("failed restore dag net task", K(result), K(task_id), K(ls_id), K(restore_succeed_tablets), K(restore_failed_tablets), KPC_(ls));
-    }
+        if (status.is_restore_sys_tablets()) {
+          state_handler_->set_retry_flag();
+          result_mgr_.set_result(result, task_id, ObLSRestoreResultMgr::RestoreFailedType::DATA_RESTORE_FAILED_TYPE);
+          LOG_WARN("restore sys tablets dag failed, need retry", K(ret));
+        } else if (OB_TABLET_NOT_EXIST == result) {
+          LOG_INFO("tablet has been deleted, no need to record err info", K(restore_failed_tablets));
+        } else if (common::ObRole::FOLLOWER == role && result_mgr_.can_retrieable_err(result)) {
+          LOG_INFO("follower met retrieable err, no need to record", K(result), K(task_id));
+        } else {
+          result_mgr_.set_result(result, task_id, ObLSRestoreResultMgr::RestoreFailedType::DATA_RESTORE_FAILED_TYPE);
+          LOG_WARN("failed restore dag net task", K(result), K(task_id), K(ls_id), K(restore_succeed_tablets), K(restore_failed_tablets), KPC_(ls));
+        }
+        mtx_.unlock();
+      }
+    } while (OB_EAGAIN == ret && MAX_TRY_LOCK_CNT > ++retry_cnt);
+    if (MAX_TRY_LOCK_CNT <= retry_cnt) {
+      ret = OB_TRY_LOCK_OBJ_CONFLICT;
+      LOG_WARN("lock restore handler failed", K(ret), K(result), K(task_id), K(ls_id), K(restore_succeed_tablets), K(restore_failed_tablets));
+    }
   }
   return ret;
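
The comment block in this hunk describes a classic ABBA lock-order inversion: the offline path takes mtx_ then scheduler_sync_, while the dag-net-finish path takes scheduler_sync_ then mtx_. As a standalone illustration of the bounded-trylock escape (std::mutex stand-ins, not the OceanBase primitives; the function names and constants are hypothetical):

#include <chrono>
#include <cstdio>
#include <mutex>
#include <thread>

std::mutex handler_mtx;     // stands in for ObLSRestoreHandler::mtx_
std::mutex scheduler_sync;  // stands in for ObTenantDagScheduler::scheduler_sync_

// Offline path: mtx_ first, then scheduler_sync_.
void offline_path() {
  std::lock_guard<std::mutex> g1(handler_mtx);
  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // widen the race window
  std::lock_guard<std::mutex> g2(scheduler_sync);
  std::puts("offline path done");
}

// Dag-net-finish path: scheduler_sync_ is already held; blocking on mtx_
// here is the ABBA deadlock. Bounded try_lock turns it into a soft failure.
bool finish_dag_net_path() {
  std::lock_guard<std::mutex> g1(scheduler_sync);
  const int kMaxTryLockCnt = 3;  // mirrors MAX_TRY_LOCK_CNT in the commit
  for (int i = 0; i < kMaxTryLockCnt; ++i) {
    if (handler_mtx.try_lock()) {
      std::puts("finish path done");
      handler_mtx.unlock();
      return true;
    }
    std::this_thread::sleep_for(std::chrono::seconds(1));  // mirrors sleep(1)
  }
  return false;  // the commit surfaces this as OB_TRY_LOCK_OBJ_CONFLICT
}

int main() {
  std::thread t1(offline_path);
  std::thread t2([] {
    if (!finish_dag_net_path()) std::puts("gave up instead of deadlocking");
  });
  t1.join();
  t2.join();
  return 0;
}

Giving up after a bounded number of try_lock attempts trades an indefinite block for a retryable error, which is exactly the tradeoff the hunk makes by returning OB_TRY_LOCK_OBJ_CONFLICT once the retry budget is exhausted.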