[Bug][RoutineLoad] Fix bug where routine load encounters a "label already used" exception (#2959)

This CL modifies two things:

1. When submitting a routine load task fails, the task is no longer put back into the task queue.
2. The RPC timeout used when executing a routine load task in BE is now set to the `query_timeout` of the task plan.

ISSUE: #2964
This commit is contained in:
Mingyu Chen
2020-02-22 22:01:14 +08:00
committed by GitHub
parent fc2d92d68a
commit 8eb413fa69
5 changed files with 33 additions and 19 deletions

View File

@ -79,7 +79,7 @@ Status NodeChannel::init(RuntimeState* state) {
_add_batch_request.set_index_id(_index_id);
_add_batch_request.set_sender_id(_parent->_sender_id);
_rpc_timeout_ms = config::tablet_writer_rpc_timeout_sec * 1000;
_rpc_timeout_ms = state->query_options().query_timeout * 1000;
return Status::OK();
}

View File

@ -83,6 +83,7 @@ public class ShowRoutineLoadStmt extends ShowStmt {
.add("Progress")
.add("ReasonOfStateChanged")
.add("ErrorLogUrls")
.add("OtherMsg")
.build();
private final LabelName labelName;

View File

@ -173,6 +173,8 @@ public abstract class RoutineLoadJob extends AbstractTxnStateChangeCallback impl
protected int currentTaskConcurrentNum;
protected RoutineLoadProgress progress;
// an additional message that needs to be shown to the user
protected String otherMsg = "";
protected String pauseReason = "";
protected String cancelReason = "";
@ -316,6 +318,10 @@ public abstract class RoutineLoadJob extends AbstractTxnStateChangeCallback impl
return dbId;
}
// Stores the extra message for this job; the value is passed through
// Strings.nullToEmpty, so a null argument is stored as "".
public void setOtherMsg(String otherMsg) {
this.otherMsg = Strings.nullToEmpty(otherMsg);
}
public String getDbFullName() throws MetaNotFoundException {
Database database = Catalog.getCurrentCatalog().getDb(dbId);
if (database == null) {
@ -1095,6 +1101,7 @@ public abstract class RoutineLoadJob extends AbstractTxnStateChangeCallback impl
row.add("");
}
row.add(Joiner.on(", ").join(errorLogUrls));
row.add(otherMsg);
return row;
} finally {
readUnlock();

View File

@ -177,10 +177,20 @@ public class RoutineLoadTaskScheduler extends MasterDaemon {
throw e;
}
if (!submitTask(routineLoadTaskInfo.getBeId(), tRoutineLoadTask)) {
// submit failed. push it back to the queue to wait next scheduling
routineLoadTaskInfo.setBeId(-1);
needScheduleTasksQueue.put(routineLoadTaskInfo);
try {
submitTask(routineLoadTaskInfo.getBeId(), tRoutineLoadTask);
} catch (LoadException e) {
// submit task failed (such as TOO_MANY_TASKS error), but txn has already begun.
// Here we will still set the ExecuteStartTime of this task, which means
// we "assume" that this task has been successfully submitted.
// And this task will then be aborted because of a timeout.
// In this way, we can prevent the entire job from being paused due to submit errors,
// and we can also relieve the pressure on BE by waiting for the timeout period.
LOG.warn("failed to submit routine load task {} to BE: {}",
DebugUtil.printId(routineLoadTaskInfo.getId()),
routineLoadTaskInfo.getBeId());
routineLoadManager.getJob(routineLoadTaskInfo.getJobId()).setOtherMsg(e.getMessage());
// fall through to set ExecuteStartTime
}
// set the executeStartTimeMs of task
@ -208,32 +218,28 @@ public class RoutineLoadTaskScheduler extends MasterDaemon {
LOG.debug("total tasks num in routine load task queue: {}", needScheduleTasksQueue.size());
}
private boolean submitTask(long beId, TRoutineLoadTask tTask) {
private void submitTask(long beId, TRoutineLoadTask tTask) throws LoadException {
Backend backend = Catalog.getCurrentSystemInfo().getBackend(beId);
if (backend == null) {
LOG.warn("failed to send tasks to backend {} because not exist", beId);
return false;
throw new LoadException("failed to send tasks to backend " + beId + " because not exist");
}
TNetworkAddress address = new TNetworkAddress(backend.getHost(), backend.getBePort());
boolean ok = false;
BackendService.Client client = null;
try {
client = ClientPool.backendPool.borrowObject(address);
TStatus tStatus = client.submit_routine_load_task(Lists.newArrayList(tTask));
ok = true;
if (tStatus.getStatus_code() == TStatusCode.OK) {
LOG.debug("send routine load task {} to BE: {}", DebugUtil.printId(tTask.id), beId);
return true;
} else {
LOG.info("failed to submit task {}, BE: {}, error code: {}",
DebugUtil.printId(tTask.getId()), beId, tStatus.getStatus_code());
return false;
if (tStatus.getStatus_code() != TStatusCode.OK) {
throw new LoadException("failed to submit task. error code: " + tStatus.getStatus_code()
+ ", msg: " + (tStatus.getError_msgsSize() > 0 ? tStatus.getError_msgs().get(0) : "NaN"));
}
LOG.debug("send routine load task {} to BE: {}", DebugUtil.printId(tTask.id), beId);
} catch (Exception e) {
LOG.warn("task send error. backend[{}]", beId, e);
return false;
throw new LoadException("failed to send task: " + e.getMessage(), e);
} finally {
if (ok) {
ClientPool.backendPool.returnObject(address, client);
@ -241,7 +247,6 @@ public class RoutineLoadTaskScheduler extends MasterDaemon {
ClientPool.backendPool.invalidateObject(address, client);
}
}
}
// try to allocate a task to BE which has idle slot.

View File

@ -186,6 +186,7 @@ public class StreamLoadTask {
partitions = routineLoadJob.getPartitions() == null ? null : Joiner.on(",").join(routineLoadJob.getPartitions());
strictMode = routineLoadJob.isStrictMode();
timezone = routineLoadJob.getTimezone();
timeout = (int) routineLoadJob.getMaxBatchIntervalS() * 2;
}
// used for stream load