From d4a67d93f3e5be963238702dc98753cdf633a988 Mon Sep 17 00:00:00 2001
From: HHoflittlefish777 <77738092+HHoflittlefish777@users.noreply.github.com>
Date: Wed, 10 Apr 2024 14:53:51 +0800
Subject: [PATCH] [improve](routine-load) timely pause job if Kafka cluster exception when consume (#33372)

---
 .../doris/load/routineload/KafkaRoutineLoadJob.java       | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/routineload/KafkaRoutineLoadJob.java b/fe/fe-core/src/main/java/org/apache/doris/load/routineload/KafkaRoutineLoadJob.java
index bdcfb9e4a2..8540bb4396 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/load/routineload/KafkaRoutineLoadJob.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/load/routineload/KafkaRoutineLoadJob.java
@@ -746,7 +746,13 @@ public class KafkaRoutineLoadJob extends RoutineLoadJob {
                 cachedPartitionWithLatestOffsets.put(pair.first, pair.second);
             }
         } catch (Exception e) {
-            LOG.warn("failed to get latest partition offset. {}", e.getMessage(), e);
+            // Pause the job when the latest partition offsets cannot be fetched.
+            // To keep routine load stable, the scheduler will automatically
+            // pull the job up again in this scenario, so that transient network
+            // or Kafka exceptions do not leave the routine load job stopped.
+            updateState(JobState.PAUSED, new ErrorReason(InternalErrorCode.PARTITIONS_ERR,
+                    "failed to get latest partition offset. " + e.getMessage()),
+                    false /* not replay */);
             return false;
         }
 