Add a check for skipping the start-transfer-in commit before replaying the commit, and fix a deadlock

This commit is contained in:
wxhwang
2023-06-27 21:42:15 +00:00
committed by ob-robot
parent cd1b04ff90
commit d0b5dd3226
4 changed files with 26 additions and 49 deletions

View File

@ -3458,9 +3458,7 @@ int ObTenantDagScheduler::cancel_dag_net(const ObDagId &dag_id)
} }
} }
if (OB_SUCC(ret)) { // Donot call notify(), may cause dead lock.
notify();
}
} }
return ret; return ret;
} }

View File

@ -1067,7 +1067,7 @@ int ObTenantDagScheduler::create_and_add_dag_net(const ObIDagInitParam *param)
} }
} else { } else {
COMMON_LOG(INFO, "success to create and add dag_net", K(ret), KP(dag_net)); COMMON_LOG(INFO, "success to create and add dag_net", K(ret), KP(dag_net));
notify(); // Donot call notify(), may cause dead lock.
} }
} }
if (OB_FAIL(ret)) { if (OB_FAIL(ret)) {

View File

@ -227,24 +227,7 @@ int ObLSRestoreHandler::handle_execute_over(
} else if (result != OB_SUCCESS) { } else if (result != OB_SUCCESS) {
share::ObLSRestoreStatus status; share::ObLSRestoreStatus status;
common::ObRole role; common::ObRole role;
// This function will be called when dag net finish. There is lock problem with offline. lib::ObMutexGuard guard(mtx_);
// Lock sequence for offline is:
// 1. lock ObLSRestoreHandler::mtx_
// 2. lock ObTenantDagScheduler::scheduler_sync_, which is called in notify when cancel dag net.
//
// Lock sequence for finish dag net is:
// 1. lock ObTenantDagScheduler::scheduler_sync_
// 2. lock ObLSRestoreHandler::mtx_, as following
//
// To solve the dead lock problem, using trylock instead of lock.
int retry_cnt = 0;
// if lock failed, retry 3 times.
const int64_t MAX_TRY_LOCK_CNT = 3;
do {
if (OB_FAIL(mtx_.trylock())) {
LOG_WARN("lock restore handler failed, retry later", K(ret), KPC_(ls));
sleep(1);
} else {
if (nullptr != state_handler_) { if (nullptr != state_handler_) {
status = state_handler_->get_restore_status(); status = state_handler_->get_restore_status();
role = state_handler_->get_role(); role = state_handler_->get_role();
@ -266,15 +249,6 @@ int ObLSRestoreHandler::handle_execute_over(
result_mgr_.set_result(result, task_id, ObLSRestoreResultMgr::RestoreFailedType::DATA_RESTORE_FAILED_TYPE); result_mgr_.set_result(result, task_id, ObLSRestoreResultMgr::RestoreFailedType::DATA_RESTORE_FAILED_TYPE);
LOG_WARN("failed restore dag net task", K(result), K(task_id), K(ls_id), K(restore_succeed_tablets), K(restore_failed_tablets), KPC_(ls)); LOG_WARN("failed restore dag net task", K(result), K(task_id), K(ls_id), K(restore_succeed_tablets), K(restore_failed_tablets), KPC_(ls));
} }
mtx_.unlock();
}
} while (OB_EAGAIN == ret && MAX_TRY_LOCK_CNT > ++retry_cnt);
if (MAX_TRY_LOCK_CNT <= retry_cnt) {
ret = OB_TRY_LOCK_OBJ_CONFLICT;
LOG_WARN("lock restore handler failed", K(ret), K(result), K(task_id), K(ls_id), K(restore_succeed_tablets), K(restore_failed_tablets), K(ls_id));
}
} }
return ret; return ret;
} }

View File

@ -1532,6 +1532,11 @@ bool ObTabletStartTransferInHelper::check_can_replay_commit(
} else if (!tx_start_transfer_in_info.is_valid()) { } else if (!tx_start_transfer_in_info.is_valid()) {
ret = OB_ERR_UNEXPECTED; ret = OB_ERR_UNEXPECTED;
LOG_WARN("tx start transfer in info is unexpected", K(ret), K(tx_start_transfer_in_info)); LOG_WARN("tx start transfer in info is unexpected", K(ret), K(tx_start_transfer_in_info));
} else if (OB_FAIL(check_can_skip_replay_(scn, tx_start_transfer_in_info, skip_replay))) {
LOG_WARN("failed to check can skip replay commit", K(ret), K(scn), K(tx_start_transfer_in_info));
} else if (skip_replay) {
b_ret = true;
LOG_INFO("skip replay start transfer in commit", K(scn), K(tx_start_transfer_in_info));
} else { } else {
if (OB_FAIL(check_can_skip_check_transfer_src_tablet_(scn, tx_start_transfer_in_info, can_skip_check_src))) { if (OB_FAIL(check_can_skip_check_transfer_src_tablet_(scn, tx_start_transfer_in_info, can_skip_check_src))) {
LOG_WARN("failed to check can skip check transfer src tablet", K(ret), K(tx_start_transfer_in_info)); LOG_WARN("failed to check can skip check transfer src tablet", K(ret), K(tx_start_transfer_in_info));