Wait until the old DDL session exits before deciding whether a retry is needed

This commit is contained in:
Hongqin-Li
2023-03-03 08:13:40 +00:00
committed by ob-robot
parent c19b2b1dcd
commit 1b9aaaba25

View File

@ -1685,21 +1685,15 @@ int ObCheckTabletDataComplementOp::check_and_wait_old_complement_task(
{
int ret = OB_SUCCESS;
need_exec_new_inner_sql = true; // default need execute new inner sql
bool is_old_task_session_exist = false;
bool is_all_sstable_build_finished = false;
bool need_wait = false;
bool is_old_task_session_exist = true;
bool is_dst_checksums_all_report = false;
if (OB_UNLIKELY(OB_INVALID_ID == tenant_id || OB_INVALID_ID == table_id)) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("fail to check and wait complement task", K(ret), K(tenant_id), K(table_id));
} else {
LOG_INFO("start to check and wait complement task", K(tenant_id), K(table_id), K(inner_sql_exec_addr), K(trace_id));
do {
if (OB_FAIL(check_all_tablet_sstable_status(tenant_id, table_id, scn, execution_id, ddl_task_id, is_all_sstable_build_finished))) {
LOG_WARN("fail to check task tablet sstable status", K(ret), K(tenant_id), K(table_id), K(scn), K(execution_id), K(ddl_task_id));
} else if (is_all_sstable_build_finished) {
LOG_INFO("all tablet sstable has build finished");
} else {
while (OB_SUCC(ret) && is_old_task_session_exist) {
if (OB_FAIL(check_task_inner_sql_session_status(inner_sql_exec_addr, trace_id, tenant_id, execution_id, scn, is_old_task_session_exist))) {
LOG_WARN("fail check task inner sql session status", K(ret), K(trace_id), K(inner_sql_exec_addr));
} else if (!is_old_task_session_exist) {
@ -1708,20 +1702,31 @@ int ObCheckTabletDataComplementOp::check_and_wait_old_complement_task(
usleep(10 * 1000); // sleep 10ms
}
}
if (OB_EAGAIN == ret) { // retry
ret = OB_SUCCESS;
}
need_wait = !is_all_sstable_build_finished && is_old_task_session_exist;
} while (OB_SUCC(ret) && need_wait); // TODO: time out
///// end
/* Only in table all sstables not finished case, we will do retry */
if (is_all_sstable_build_finished) {
// After old session exits, the rule of retry is specified as follows
//
// A. for dst table merge checksums of this execution,
// - if complete, goto B (need_exec_new_inner_sql = false)
// - else if all tablets have been merged, this means some checksum report failed, retry
// - else old session must fail/crash, retry
//
// B. do checksum validation against src table scan checksums of this execution,
// - if src checksums are complete, this is exactly a validation
// - else old session must fail/crash "unexpectedly" (because complete dst checksum in A
// guarantees at least one previous execution has successfully finished table scan),
// the validation may return an error due to lack of src checksum records
ObArray<ObTabletID> dest_tablet_ids;
if (OB_FAIL(ret)) {
} else if (OB_FAIL(check_tablet_checksum_update_status(tenant_id, table_id, ddl_task_id, execution_id, dest_tablet_ids, is_dst_checksums_all_report))) {
LOG_WARN("fail to check tablet checksum update status.", K(ret), K(tenant_id), K(dest_tablet_ids), K(execution_id));
} else if (is_dst_checksums_all_report) {
need_exec_new_inner_sql = false;
LOG_INFO("no need to execute inner sql to do complement.", K(need_exec_new_inner_sql));
LOG_INFO("no need execute because all tablet sstable has build finished", K(need_exec_new_inner_sql));
}
}
LOG_INFO("end to check and wait complement task", K(ret),
K(table_id), K(is_old_task_session_exist), K(is_all_sstable_build_finished), K(need_exec_new_inner_sql));
K(table_id), K(is_old_task_session_exist), K(is_dst_checksums_all_report), K(need_exec_new_inner_sql));
return ret;
}