Fix deadlock and wrong get-GTS interface

This commit is contained in:
wxhwang
2023-06-22 09:18:09 +00:00
committed by ob-robot
parent 7ee480eceb
commit cab47b299e
3 changed files with 49 additions and 35 deletions

View File

@ -765,14 +765,8 @@ int ObArchiveHandler::get_max_checkpoint_scn_(const uint64_t tenant_id, SCN &max
// That will lead to some logs of the create-log-stream type being archived before being replayed. In this case, // That will lead to some logs of the create-log-stream type being archived before being replayed. In this case,
// we should limit tenant archive progress not more than the GTS. // we should limit tenant archive progress not more than the GTS.
int ret = OB_SUCCESS; int ret = OB_SUCCESS;
ObAllTenantInfo tenant_info; if (OB_FAIL(ObBackupUtils::get_backup_scn(tenant_id_, max_checkpoint_scn))) {
const bool for_update = false; LOG_WARN("failed to get max checkpoint scn.", K(ret), K_(tenant_id));
if (OB_FAIL(ObAllTenantInfoProxy::load_tenant_info(tenant_id, sql_proxy_, for_update, tenant_info))) {
LOG_WARN("failed to get tenant info", K(ret), K(tenant_id));
} else if (OB_FALSE_IT(max_checkpoint_scn = tenant_info.get_standby_scn())) {
} else if (SCN::base_scn() >= max_checkpoint_scn) {
ret = OB_ERR_UNEXPECTED;
LOG_WARN("max_checkpoint_scn not valid", K(ret), K(tenant_info));
} }
return ret; return ret;
} }

View File

@ -139,14 +139,8 @@ bool ObArchiveRoundHandler::can_suspend_archive(const ObTenantArchiveRoundAttr &
int ObArchiveRoundHandler::decide_start_scn_(SCN &start_scn) int ObArchiveRoundHandler::decide_start_scn_(SCN &start_scn)
{ {
int ret = OB_SUCCESS; int ret = OB_SUCCESS;
ObAllTenantInfo tenant_info; if (OB_FAIL(ObBackupUtils::get_backup_scn(tenant_id_, start_scn))) {
const bool for_update = false; LOG_WARN("failed to decide archive start scn.", K(ret), K_(tenant_id));
if (OB_FAIL(ObAllTenantInfoProxy::load_tenant_info(tenant_id_, sql_proxy_, for_update, tenant_info))) {
LOG_WARN("failed to get tenant info", K(ret), K_(tenant_id));
} else if (OB_FALSE_IT(start_scn = tenant_info.get_standby_scn())){
} else if (SCN::base_scn() >= start_scn) {
ret = OB_EAGAIN;
LOG_WARN("start_scn not valid, need wait", K(ret), K(tenant_info));
} }
return ret; return ret;
} }

View File

@ -227,15 +227,32 @@ int ObLSRestoreHandler::handle_execute_over(
} else if (result != OB_SUCCESS) { } else if (result != OB_SUCCESS) {
share::ObLSRestoreStatus status; share::ObLSRestoreStatus status;
common::ObRole role; common::ObRole role;
lib::ObMutexGuard guard(mtx_); // This function is called when a dag net finishes. There is a lock-ordering problem with offline.
// Lock sequence for offline is:
// 1. lock ObLSRestoreHandler::mtx_
// 2. lock ObTenantDagScheduler::scheduler_sync_, which is acquired in notify when the dag net is cancelled.
//
// Lock sequence for finishing a dag net is:
// 1. lock ObTenantDagScheduler::scheduler_sync_
// 2. lock ObLSRestoreHandler::mtx_, as follows
//
// To avoid the deadlock, use trylock instead of lock.
int retry_cnt = 0;
// If locking fails, retry; at most 3 attempts are made in total.
const int64_t MAX_TRY_LOCK_CNT = 3;
do {
if (OB_FAIL(mtx_.trylock())) {
LOG_WARN("lock restore handler failed, retry later", K(ret), KPC_(ls));
sleep(1);
} else {
if (nullptr != state_handler_) { if (nullptr != state_handler_) {
status = state_handler_->get_restore_status(); status = state_handler_->get_restore_status();
role = state_handler_->get_role(); role = state_handler_->get_role();
} }
#ifdef ERRSIM #ifdef ERRSIM
SERVER_EVENT_ADD("storage_ha", "handle_execute_over_errsim", "result", result); SERVER_EVENT_ADD("storage_ha", "handle_execute_over_errsim", "result", result);
#endif #endif
if (status.is_restore_sys_tablets()) { if (status.is_restore_sys_tablets()) {
state_handler_->set_retry_flag(); state_handler_->set_retry_flag();
@ -249,6 +266,15 @@ int ObLSRestoreHandler::handle_execute_over(
result_mgr_.set_result(result, task_id, ObLSRestoreResultMgr::RestoreFailedType::DATA_RESTORE_FAILED_TYPE); result_mgr_.set_result(result, task_id, ObLSRestoreResultMgr::RestoreFailedType::DATA_RESTORE_FAILED_TYPE);
LOG_WARN("failed restore dag net task", K(result), K(task_id), K(ls_id), K(restore_succeed_tablets), K(restore_failed_tablets), KPC_(ls)); LOG_WARN("failed restore dag net task", K(result), K(task_id), K(ls_id), K(restore_succeed_tablets), K(restore_failed_tablets), KPC_(ls));
} }
mtx_.unlock();
}
} while (OB_EAGAIN == ret && MAX_TRY_LOCK_CNT > ++retry_cnt);
if (MAX_TRY_LOCK_CNT <= retry_cnt) {
ret = OB_TRY_LOCK_OBJ_CONFLICT;
LOG_WARN("lock restore handler failed", K(ret), K(result), K(task_id), K(ls_id), K(restore_succeed_tablets), K(restore_failed_tablets), K(ls_id));
}
} }
return ret; return ret;
} }