[bugfix](backup)(cooldown) cancel backup properly when be backup failed (#38724) (#38993)

Co-authored-by: zhangyuan <ayuanzhang@tencent.com>
This commit is contained in:
walter
2024-08-07 15:58:11 +08:00
committed by GitHub
parent 843afccdf9
commit 7e95d7cbec
5 changed files with 286 additions and 0 deletions

View File

@ -159,6 +159,61 @@ public class BackupJob extends AbstractJob {
return BackupContent.ALL;
}
/**
 * Re-issues the MAKE_SNAPSHOT task for the tablet named by {@code task}, targeting the
 * replica returned by {@code chooseReplica}.
 *
 * <p>The bookkeeping kept for the previous attempt (queued agent task, unfinished-task entry,
 * progress and error message) is dropped before the replacement task is queued and submitted.
 * The table read lock is held for the whole lookup-and-submit sequence.
 *
 * @param task the snapshot task to replace
 * @return {@code true} if a replacement task was queued and submitted; {@code false} if the
 *         table, partition, index or tablet cannot be found, the table id does not match,
 *         or {@code chooseReplica} returns no replica
 */
private synchronized boolean tryNewTabletSnapshotTask(SnapshotTask task) {
    Table catalogTable = env.getInternalCatalog().getTableByTableId(task.getTableId());
    if (catalogTable == null) {
        return false;
    }

    final long tabletId = task.getTabletId();
    OlapTable olapTable = (OlapTable) catalogTable;
    olapTable.readLock();
    try {
        if (olapTable.getId() != task.getTableId()) {
            return false;
        }

        // Walk partition -> index -> tablet -> replica; a null at any level stops the walk
        // and propagates down to the single check below.
        Partition partition = olapTable.getPartition(task.getPartitionId());
        MaterializedIndex index = (partition == null) ? null : partition.getIndex(task.getIndexId());
        Tablet tablet = (index == null) ? null : index.getTablet(tabletId);
        Replica replica = (tablet == null) ? null : chooseReplica(tablet, task.getVersion());
        if (replica == null) {
            return false;
        }

        // Forget everything recorded for the previous attempt on this tablet.
        AgentTaskQueue.removeTaskOfType(TTaskType.MAKE_SNAPSHOT, tabletId);
        unfinishedTaskIds.remove(tabletId);
        taskProgress.remove(tabletId);
        taskErrMsg.remove(tabletId);

        // Build the replacement task against the newly chosen replica's backend.
        final long backendId = replica.getBackendId();
        SnapshotTask retryTask = new SnapshotTask(null, backendId, tabletId,
                task.getJobId(), task.getDbId(), olapTable.getId(), task.getPartitionId(),
                task.getIndexId(), tabletId,
                task.getVersion(),
                task.getSchemaHash(), timeoutMs, false /* not restore task */);
        AgentBatchTask batch = new AgentBatchTask();
        batch.addTask(retryTask);
        unfinishedTaskIds.put(tablet.getId(), backendId);

        // Register the task in the queue, then hand the batch to the executor.
        AgentTaskQueue.addTask(retryTask);
        AgentTaskExecutor.submit(batch);
        return true;
    } finally {
        olapTable.readUnlock();
    }
}
public synchronized boolean finishTabletSnapshotTask(SnapshotTask task, TFinishTaskRequest request) {
Preconditions.checkState(task.getJobId() == jobId);
@ -171,6 +226,20 @@ public class BackupJob extends AbstractJob {
"make snapshot failed, version already merged");
cancelInternal();
}
if (request.getTaskStatus().getStatusCode() == TStatusCode.TABLET_MISSING
&& !tryNewTabletSnapshotTask(task)) {
status = new Status(ErrCode.NOT_FOUND,
"make snapshot failed, failed to ge tablet, table will be droped or truncated");
cancelInternal();
}
if (request.getTaskStatus().getStatusCode() == TStatusCode.NOT_IMPLEMENTED_ERROR) {
status = new Status(ErrCode.COMMON_ERROR,
"make snapshot failed, currently not support backup tablet with cooldowned remote data");
cancelInternal();
}
return false;
}