Add a check for skipping start-transfer-in commit replay before replaying the commit, and fix a deadlock
This commit is contained in:
@ -3458,9 +3458,7 @@ int ObTenantDagScheduler::cancel_dag_net(const ObDagId &dag_id)
|
||||
}
|
||||
}
|
||||
|
||||
if (OB_SUCC(ret)) {
|
||||
notify();
|
||||
}
|
||||
// Do not call notify() here; it may cause a deadlock.
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -1067,7 +1067,7 @@ int ObTenantDagScheduler::create_and_add_dag_net(const ObIDagInitParam *param)
|
||||
}
|
||||
} else {
|
||||
COMMON_LOG(INFO, "success to create and add dag_net", K(ret), KP(dag_net));
|
||||
notify();
|
||||
// Do not call notify() here; it may cause a deadlock.
|
||||
}
|
||||
}
|
||||
if (OB_FAIL(ret)) {
|
||||
|
||||
@ -227,24 +227,7 @@ int ObLSRestoreHandler::handle_execute_over(
|
||||
} else if (result != OB_SUCCESS) {
|
||||
share::ObLSRestoreStatus status;
|
||||
common::ObRole role;
|
||||
// This function is called when a dag net finishes. There is a lock-ordering problem with offline.
|
||||
// Lock sequence for offline is:
|
||||
// 1. lock ObLSRestoreHandler::mtx_
|
||||
// 2. lock ObTenantDagScheduler::scheduler_sync_, which is called in notify when cancel dag net.
|
||||
//
|
||||
// Lock sequence for finish dag net is:
|
||||
// 1. lock ObTenantDagScheduler::scheduler_sync_
|
||||
// 2. lock ObLSRestoreHandler::mtx_, as following
|
||||
//
|
||||
// To avoid the deadlock, use trylock instead of lock.
|
||||
int retry_cnt = 0;
|
||||
// if lock failed, retry 3 times.
|
||||
const int64_t MAX_TRY_LOCK_CNT = 3;
|
||||
do {
|
||||
if (OB_FAIL(mtx_.trylock())) {
|
||||
LOG_WARN("lock restore handler failed, retry later", K(ret), KPC_(ls));
|
||||
sleep(1);
|
||||
} else {
|
||||
lib::ObMutexGuard guard(mtx_);
|
||||
if (nullptr != state_handler_) {
|
||||
status = state_handler_->get_restore_status();
|
||||
role = state_handler_->get_role();
|
||||
@ -266,15 +249,6 @@ int ObLSRestoreHandler::handle_execute_over(
|
||||
result_mgr_.set_result(result, task_id, ObLSRestoreResultMgr::RestoreFailedType::DATA_RESTORE_FAILED_TYPE);
|
||||
LOG_WARN("failed restore dag net task", K(result), K(task_id), K(ls_id), K(restore_succeed_tablets), K(restore_failed_tablets), KPC_(ls));
|
||||
}
|
||||
|
||||
mtx_.unlock();
|
||||
}
|
||||
} while (OB_EAGAIN == ret && MAX_TRY_LOCK_CNT > ++retry_cnt);
|
||||
|
||||
if (MAX_TRY_LOCK_CNT <= retry_cnt) {
|
||||
ret = OB_TRY_LOCK_OBJ_CONFLICT;
|
||||
LOG_WARN("lock restore handler failed", K(ret), K(result), K(task_id), K(ls_id), K(restore_succeed_tablets), K(restore_failed_tablets), K(ls_id));
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -1532,6 +1532,11 @@ bool ObTabletStartTransferInHelper::check_can_replay_commit(
|
||||
} else if (!tx_start_transfer_in_info.is_valid()) {
|
||||
ret = OB_ERR_UNEXPECTED;
|
||||
LOG_WARN("tx start transfer in info is unexpected", K(ret), K(tx_start_transfer_in_info));
|
||||
} else if (OB_FAIL(check_can_skip_replay_(scn, tx_start_transfer_in_info, skip_replay))) {
|
||||
LOG_WARN("failed to check can skip replay commit", K(ret), K(scn), K(tx_start_transfer_in_info));
|
||||
} else if (skip_replay) {
|
||||
b_ret = true;
|
||||
LOG_INFO("skip replay start transfer in commit", K(scn), K(tx_start_transfer_in_info));
|
||||
} else {
|
||||
if (OB_FAIL(check_can_skip_check_transfer_src_tablet_(scn, tx_start_transfer_in_info, can_skip_check_src))) {
|
||||
LOG_WARN("failed to check can skip check transfer src tablet", K(ret), K(tx_start_transfer_in_info));
|
||||
|
||||
Reference in New Issue
Block a user