From e1ff744a99647cb441fc7b9bd75ed7c50cc14581 Mon Sep 17 00:00:00 2001 From: WingC <1018957763@qq.com> Date: Wed, 18 Dec 2019 19:17:34 +0800 Subject: [PATCH] [Alter Job] Cancel the alter job after a task failed for 3 times (#2447) To avoid waiting timeout when it is a invalid alter job. --- fe/src/main/java/org/apache/doris/alter/RollupJobV2.java | 6 ++++++ .../java/org/apache/doris/alter/SchemaChangeJobV2.java | 6 ++++++ fe/src/main/java/org/apache/doris/master/MasterImpl.java | 3 +++ fe/src/main/java/org/apache/doris/task/AgentTask.java | 9 +++++++++ 4 files changed, 24 insertions(+) diff --git a/fe/src/main/java/org/apache/doris/alter/RollupJobV2.java b/fe/src/main/java/org/apache/doris/alter/RollupJobV2.java index 9b8753ea67..d43c177a48 100644 --- a/fe/src/main/java/org/apache/doris/alter/RollupJobV2.java +++ b/fe/src/main/java/org/apache/doris/alter/RollupJobV2.java @@ -373,6 +373,12 @@ public class RollupJobV2 extends AlterJobV2 { if (!rollupBatchTask.isFinished()) { LOG.info("rollup tasks not finished. job: {}", jobId); + List tasks = rollupBatchTask.getUnfinishedTasks(2000); + for (AgentTask task : tasks) { + if (task.getFailedTimes() >= 3) { + throw new AlterCancelException("rollup task failed after try three times: " + task.getErrorMsg()); + } + } return; } diff --git a/fe/src/main/java/org/apache/doris/alter/SchemaChangeJobV2.java b/fe/src/main/java/org/apache/doris/alter/SchemaChangeJobV2.java index 929b3245f0..29557caa98 100644 --- a/fe/src/main/java/org/apache/doris/alter/SchemaChangeJobV2.java +++ b/fe/src/main/java/org/apache/doris/alter/SchemaChangeJobV2.java @@ -406,6 +406,12 @@ public class SchemaChangeJobV2 extends AlterJobV2 { if (!schemaChangeBatchTask.isFinished()) { LOG.info("schema change tasks not finished. job: {}", jobId); + List tasks = schemaChangeBatchTask.getUnfinishedTasks(2000); + for (AgentTask task : tasks) { + if (task.getFailedTimes() >= 3) { + throw new AlterCancelException("schema change task failed after try three times: " + task.getErrorMsg()); + } + } return; } diff --git a/fe/src/main/java/org/apache/doris/master/MasterImpl.java b/fe/src/main/java/org/apache/doris/master/MasterImpl.java index 9e0b271bec..6535b622db 100644 --- a/fe/src/main/java/org/apache/doris/master/MasterImpl.java +++ b/fe/src/main/java/org/apache/doris/master/MasterImpl.java @@ -130,6 +130,9 @@ public class MasterImpl { } else { if (taskStatus.getStatus_code() != TStatusCode.OK) { task.failed(); + String errMsg = "task type: " + taskType + ", status_code: " + taskStatus.getStatus_code().toString() + + ", backendId: " + backend + ", signature: " + signature; + task.setErrorMsg(errMsg); // We start to let FE perceive the task's error msg if (taskType != TTaskType.MAKE_SNAPSHOT && taskType != TTaskType.UPLOAD && taskType != TTaskType.DOWNLOAD && taskType != TTaskType.MOVE diff --git a/fe/src/main/java/org/apache/doris/task/AgentTask.java b/fe/src/main/java/org/apache/doris/task/AgentTask.java index 60486e7379..9ce7557afa 100644 --- a/fe/src/main/java/org/apache/doris/task/AgentTask.java +++ b/fe/src/main/java/org/apache/doris/task/AgentTask.java @@ -34,6 +34,7 @@ public abstract class AgentTask { protected TResourceInfo resourceInfo; protected int failedTimes; + protected String errorMsg; // some of process may use this member to check if the task is finished. // some of are not. // so whether the task is finished depends on caller's logic, not the value of this member. @@ -105,6 +106,14 @@ public abstract class AgentTask { return this.failedTimes; } + public void setErrorMsg(String errorMsg) { + this.errorMsg = errorMsg; + } + + public String getErrorMsg() { + return errorMsg; + } + public void setFinished(boolean isFinished) { this.isFinished = isFinished; }