Fix deadlock and use the correct interface to get GTS
This commit is contained in:
@ -765,14 +765,8 @@ int ObArchiveHandler::get_max_checkpoint_scn_(const uint64_t tenant_id, SCN &max
|
|||||||
// That can lead to some create-log-stream logs being archived before they are replayed. In this case,
|
// That can lead to some create-log-stream logs being archived before they are replayed. In this case,
|
||||||
// we should limit tenant archive progress not more than the GTS.
|
// we should limit tenant archive progress not more than the GTS.
|
||||||
int ret = OB_SUCCESS;
|
int ret = OB_SUCCESS;
|
||||||
ObAllTenantInfo tenant_info;
|
if (OB_FAIL(ObBackupUtils::get_backup_scn(tenant_id_, max_checkpoint_scn))) {
|
||||||
const bool for_update = false;
|
LOG_WARN("failed to get max checkpoint scn.", K(ret), K_(tenant_id));
|
||||||
if (OB_FAIL(ObAllTenantInfoProxy::load_tenant_info(tenant_id, sql_proxy_, for_update, tenant_info))) {
|
|
||||||
LOG_WARN("failed to get tenant info", K(ret), K(tenant_id));
|
|
||||||
} else if (OB_FALSE_IT(max_checkpoint_scn = tenant_info.get_standby_scn())) {
|
|
||||||
} else if (SCN::base_scn() >= max_checkpoint_scn) {
|
|
||||||
ret = OB_ERR_UNEXPECTED;
|
|
||||||
LOG_WARN("max_checkpoint_scn not valid", K(ret), K(tenant_info));
|
|
||||||
}
|
}
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
@ -139,14 +139,8 @@ bool ObArchiveRoundHandler::can_suspend_archive(const ObTenantArchiveRoundAttr &
|
|||||||
int ObArchiveRoundHandler::decide_start_scn_(SCN &start_scn)
|
int ObArchiveRoundHandler::decide_start_scn_(SCN &start_scn)
|
||||||
{
|
{
|
||||||
int ret = OB_SUCCESS;
|
int ret = OB_SUCCESS;
|
||||||
ObAllTenantInfo tenant_info;
|
if (OB_FAIL(ObBackupUtils::get_backup_scn(tenant_id_, start_scn))) {
|
||||||
const bool for_update = false;
|
LOG_WARN("failed to decide archive start scn.", K(ret), K_(tenant_id));
|
||||||
if (OB_FAIL(ObAllTenantInfoProxy::load_tenant_info(tenant_id_, sql_proxy_, for_update, tenant_info))) {
|
|
||||||
LOG_WARN("failed to get tenant info", K(ret), K_(tenant_id));
|
|
||||||
} else if (OB_FALSE_IT(start_scn = tenant_info.get_standby_scn())){
|
|
||||||
} else if (SCN::base_scn() >= start_scn) {
|
|
||||||
ret = OB_EAGAIN;
|
|
||||||
LOG_WARN("start_scn not valid, need wait", K(ret), K(tenant_info));
|
|
||||||
}
|
}
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
@ -227,27 +227,53 @@ int ObLSRestoreHandler::handle_execute_over(
|
|||||||
} else if (result != OB_SUCCESS) {
|
} else if (result != OB_SUCCESS) {
|
||||||
share::ObLSRestoreStatus status;
|
share::ObLSRestoreStatus status;
|
||||||
common::ObRole role;
|
common::ObRole role;
|
||||||
lib::ObMutexGuard guard(mtx_);
|
// This function will be called when dag net finish. There is a lock-ordering conflict with offline.
|
||||||
if (nullptr != state_handler_) {
|
// Lock sequence for offline is:
|
||||||
status = state_handler_->get_restore_status();
|
// 1. lock ObLSRestoreHandler::mtx_
|
||||||
role = state_handler_->get_role();
|
// 2. lock ObTenantDagScheduler::scheduler_sync_, which is called in notify when cancel dag net.
|
||||||
}
|
//
|
||||||
|
// Lock sequence for finish dag net is:
|
||||||
|
// 1. lock ObTenantDagScheduler::scheduler_sync_
|
||||||
|
// 2. lock ObLSRestoreHandler::mtx_, as following
|
||||||
|
//
|
||||||
|
// To avoid the deadlock, use trylock instead of lock.
|
||||||
|
int retry_cnt = 0;
|
||||||
|
// if lock failed, retry 3 times.
|
||||||
|
const int64_t MAX_TRY_LOCK_CNT = 3;
|
||||||
|
do {
|
||||||
|
if (OB_FAIL(mtx_.trylock())) {
|
||||||
|
LOG_WARN("lock restore handler failed, retry later", K(ret), KPC_(ls));
|
||||||
|
sleep(1);
|
||||||
|
} else {
|
||||||
|
if (nullptr != state_handler_) {
|
||||||
|
status = state_handler_->get_restore_status();
|
||||||
|
role = state_handler_->get_role();
|
||||||
|
}
|
||||||
|
|
||||||
#ifdef ERRSIM
|
#ifdef ERRSIM
|
||||||
SERVER_EVENT_ADD("storage_ha", "handle_execute_over_errsim", "result", result);
|
SERVER_EVENT_ADD("storage_ha", "handle_execute_over_errsim", "result", result);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (status.is_restore_sys_tablets()) {
|
if (status.is_restore_sys_tablets()) {
|
||||||
state_handler_->set_retry_flag();
|
state_handler_->set_retry_flag();
|
||||||
result_mgr_.set_result(result, task_id, ObLSRestoreResultMgr::RestoreFailedType::DATA_RESTORE_FAILED_TYPE);
|
result_mgr_.set_result(result, task_id, ObLSRestoreResultMgr::RestoreFailedType::DATA_RESTORE_FAILED_TYPE);
|
||||||
LOG_WARN("restore sys tablets dag failed, need retry", K(ret));
|
LOG_WARN("restore sys tablets dag failed, need retry", K(ret));
|
||||||
} else if (OB_TABLET_NOT_EXIST == result) {
|
} else if (OB_TABLET_NOT_EXIST == result) {
|
||||||
LOG_INFO("tablet has been deleted, no need to record err info", K(restore_failed_tablets));
|
LOG_INFO("tablet has been deleted, no need to record err info", K(restore_failed_tablets));
|
||||||
} else if (common::ObRole::FOLLOWER == role && result_mgr_.can_retrieable_err(result)) {
|
} else if (common::ObRole::FOLLOWER == role && result_mgr_.can_retrieable_err(result)) {
|
||||||
LOG_INFO("follower met retrieable err, no need to record", K(result), K(task_id));
|
LOG_INFO("follower met retrieable err, no need to record", K(result), K(task_id));
|
||||||
} else {
|
} else {
|
||||||
result_mgr_.set_result(result, task_id, ObLSRestoreResultMgr::RestoreFailedType::DATA_RESTORE_FAILED_TYPE);
|
result_mgr_.set_result(result, task_id, ObLSRestoreResultMgr::RestoreFailedType::DATA_RESTORE_FAILED_TYPE);
|
||||||
LOG_WARN("failed restore dag net task", K(result), K(task_id), K(ls_id), K(restore_succeed_tablets), K(restore_failed_tablets), KPC_(ls));
|
LOG_WARN("failed restore dag net task", K(result), K(task_id), K(ls_id), K(restore_succeed_tablets), K(restore_failed_tablets), KPC_(ls));
|
||||||
|
}
|
||||||
|
|
||||||
|
mtx_.unlock();
|
||||||
|
}
|
||||||
|
} while (OB_EAGAIN == ret && MAX_TRY_LOCK_CNT > ++retry_cnt);
|
||||||
|
|
||||||
|
if (MAX_TRY_LOCK_CNT <= retry_cnt) {
|
||||||
|
ret = OB_TRY_LOCK_OBJ_CONFLICT;
|
||||||
|
LOG_WARN("lock restore handler failed", K(ret), K(result), K(task_id), K(ls_id), K(restore_succeed_tablets), K(restore_failed_tablets), K(ls_id));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return ret;
|
return ret;
|
||||||
|
Reference in New Issue
Block a user