[CP] Optimize the DAS retry process by reducing the sleep interval.
This commit is contained in:
@ -37,7 +37,7 @@ int ObDASCtx::init(const ObPhysicalPlan &plan, ObExecContext &ctx)
|
||||
const ObIArray<ObTableLocation> &normal_locations = plan.get_table_locations();
|
||||
const ObIArray<ObTableLocation> &das_locations = plan.get_das_table_locations();
|
||||
location_router_.set_last_errno(ctx.get_my_session()->get_retry_info().get_last_query_retry_err());
|
||||
location_router_.set_retry_cnt(ctx.get_my_session()->get_retry_info().get_retry_cnt());
|
||||
location_router_.set_total_retry_cnt(ctx.get_my_session()->get_retry_info().get_retry_cnt());
|
||||
for (int64_t i = 0; OB_SUCC(ret) && i < das_locations.count(); ++i) {
|
||||
const ObTableLocation &das_location = das_locations.at(i);
|
||||
ObDASTableLoc *table_loc = nullptr;
|
||||
|
@ -743,7 +743,8 @@ int ObDASTabletMapper::get_partition_id_map(ObObjectID partition_id,
|
||||
ObDASLocationRouter::ObDASLocationRouter(ObIAllocator &allocator)
|
||||
: last_errno_(OB_SUCCESS),
|
||||
cur_errno_(OB_SUCCESS),
|
||||
retry_cnt_(0),
|
||||
total_retry_cnt_(0),
|
||||
cur_retry_cnt_(0),
|
||||
all_tablet_list_(allocator),
|
||||
succ_tablet_list_(allocator),
|
||||
virtual_server_list_(allocator),
|
||||
@ -1237,7 +1238,7 @@ int ObDASLocationRouter::block_renew_tablet_location(const ObTabletID &tablet_id
|
||||
// Loads this router's retry state from `retry_info`: the last query-retry
// error code and the retry count reported by get_retry_cnt().
// NOTE(review): `retry_info` is dereferenced without a null check — confirm
// that callers never pass nullptr.
// NOTE(review): retry_cnt_ and total_retry_cnt_ are both assigned from the same
// get_retry_cnt() value below; in this diff view one of the two assignments
// may be the pre-change line — confirm against the actual file.
void ObDASLocationRouter::set_retry_info(const ObQueryRetryInfo* retry_info)
|
||||
{
|
||||
last_errno_ = retry_info->get_last_query_retry_err();
|
||||
retry_cnt_ = retry_info->get_retry_cnt();
|
||||
total_retry_cnt_ = retry_info->get_retry_cnt();
|
||||
}
|
||||
|
||||
int ObDASLocationRouter::get_external_table_ls_location(ObLSLocation &location)
|
||||
|
@ -310,10 +310,17 @@ public:
|
||||
// Appends `tablet_id` to all_tablet_list_ and returns push_back's result code.
int save_touched_tablet_id(const common::ObTabletID &tablet_id) { return all_tablet_list_.push_back(tablet_id); }
|
||||
// Stores `err_no` in last_errno_.
void set_last_errno(int err_no) { last_errno_ = err_no; }
|
||||
// Returns the value stored in last_errno_.
int get_last_errno() const { return last_errno_; }
|
||||
// Overwrites retry_cnt_ with `retry_cnt`.
void set_retry_cnt(int64_t retry_cnt) { retry_cnt_ = retry_cnt; }
|
||||
// Increments retry_cnt_ by one.
void inc_retry_cnt() { ++retry_cnt_; }
|
||||
// Overwrites total_retry_cnt_ with `total_retry_cnt`.
void set_total_retry_cnt(int64_t total_retry_cnt) { total_retry_cnt_ = total_retry_cnt; }
|
||||
// Adds cur_retry_cnt_ to total_retry_cnt_, then resets cur_retry_cnt_ to zero
// so the next run of retries starts counting from scratch.
void accumulate_retry_count()
|
||||
{
|
||||
total_retry_cnt_ += cur_retry_cnt_;
|
||||
cur_retry_cnt_ = 0;
|
||||
}
|
||||
// Returns total_retry_cnt_.
int64_t get_total_retry_cnt() const { return total_retry_cnt_; }
|
||||
// Returns cur_retry_cnt_.
int64_t get_cur_retry_cnt() const { return cur_retry_cnt_; }
|
||||
// Resets cur_retry_cnt_ to zero.
void reset_cur_retry_cnt() { cur_retry_cnt_ = 0; }
|
||||
// Increments cur_retry_cnt_ by one.
void inc_cur_retry_cnt() { ++cur_retry_cnt_; }
|
||||
// Declared here, defined out of line; takes the retry info to read state from.
void set_retry_info(const ObQueryRetryInfo* retry_info);
|
||||
// Returns retry_cnt_.
int64_t get_retry_cnt() const { return retry_cnt_; }
|
||||
int get_external_table_ls_location(share::ObLSLocation &location);
|
||||
void save_cur_exec_status(int err_no)
|
||||
{
|
||||
@ -338,7 +345,8 @@ private:
|
||||
private:
|
||||
int last_errno_;
|
||||
int cur_errno_;
|
||||
int64_t retry_cnt_;
|
||||
int64_t total_retry_cnt_;
|
||||
int64_t cur_retry_cnt_; // the counter of continuous retry
|
||||
// NOTE: Only all_tablet_list_ needs to be serialized and sent to other servers to perform DAS remote execution;
|
||||
// the other members are collected by the execution server itself, so they do not need to be serialized.
|
||||
ObList<common::ObTabletID, common::ObIAllocator> all_tablet_list_;
|
||||
|
@ -41,7 +41,7 @@ void ObDASRetryCtrl::tablet_location_retry_proc(ObDASRef &das_ref,
|
||||
loc_router.force_refresh_location_cache(true, task_op.get_errcode());
|
||||
need_retry = true;
|
||||
const ObDASTableLocMeta *loc_meta = tablet_loc->loc_meta_;
|
||||
LOG_INFO("refresh tablet location cache and retry DAS task",
|
||||
LOG_INFO("[DAS RETRY] refresh tablet location cache and retry DAS task",
|
||||
"errcode", task_op.get_errcode(), KPC(loc_meta), KPC(tablet_loc));
|
||||
}
|
||||
}
|
||||
|
@ -395,7 +395,7 @@ int ObDASUtils::wait_das_retry(int64_t retry_cnt)
|
||||
? THIS_WORKER.get_timeout_remain()
|
||||
: 10000L * timeout_factor;
|
||||
if (sleep_us > 0) {
|
||||
LOG_INFO("will sleep", K(sleep_us), K(THIS_WORKER.get_timeout_remain()));
|
||||
LOG_INFO("[DAS RETRY] will sleep", K(sleep_us), K(THIS_WORKER.get_timeout_remain()));
|
||||
THIS_WORKER.sched_wait();
|
||||
ob_usleep(static_cast<uint32_t>(sleep_us));
|
||||
THIS_WORKER.sched_run();
|
||||
|
@ -219,7 +219,7 @@ int ObDataAccessService::refresh_task_location_info(ObDASRef &das_ref, ObIDASTas
|
||||
int ret = OB_SUCCESS;
|
||||
ObExecContext &exec_ctx = das_ref.get_exec_ctx();
|
||||
ObDASTabletLoc *tablet_loc = const_cast<ObDASTabletLoc*>(task_op.get_tablet_loc());
|
||||
int64_t retry_cnt = DAS_CTX(exec_ctx).get_location_router().get_retry_cnt();
|
||||
int64_t retry_cnt = DAS_CTX(exec_ctx).get_location_router().get_cur_retry_cnt();
|
||||
if (OB_FAIL(ObDASUtils::wait_das_retry(retry_cnt))) {
|
||||
LOG_WARN("wait das retry failed", K(ret));
|
||||
} else if (OB_FAIL(DAS_CTX(exec_ctx).get_location_router().get_tablet_loc(*tablet_loc->loc_meta_,
|
||||
@ -247,6 +247,7 @@ int ObDataAccessService::retry_das_task(ObDASRef &das_ref, ObIDASTaskOp &task_op
|
||||
ObDasAggregatedTasks das_task_wrapper(tmp_alloc);
|
||||
bool retry_continue = false;
|
||||
ObDASLocationRouter &location_router = DAS_CTX(das_ref.get_exec_ctx()).get_location_router();
|
||||
location_router.reset_cur_retry_cnt();
|
||||
do {
|
||||
ObDASRetryCtrl::retry_func retry_func = nullptr;
|
||||
|
||||
@ -256,14 +257,17 @@ int ObDataAccessService::retry_das_task(ObDASRef &das_ref, ObIDASTaskOp &task_op
|
||||
LOG_WARN("get das retry func failed", K(tmp_ret), K(task_op.errcode_));
|
||||
} else if (retry_func != nullptr) {
|
||||
bool need_retry = false;
|
||||
const ObDASTabletLoc *tablet_loc = task_op.get_tablet_loc();
|
||||
const ObDASTableLocMeta *loc_meta = tablet_loc != nullptr ? tablet_loc->loc_meta_ : nullptr;
|
||||
retry_func(das_ref, task_op, need_retry);
|
||||
LOG_INFO("[DAS RETRY] check if need tablet level retry",
|
||||
KR(task_op.errcode_), K(need_retry), K(task_op.task_flag_),
|
||||
"retry_cnt", location_router.get_retry_cnt(),
|
||||
KPC(task_op.get_tablet_loc()));
|
||||
"continuous_retry_cnt", location_router.get_cur_retry_cnt(),
|
||||
"total_retry_cnt", location_router.get_total_retry_cnt(),
|
||||
KPC(loc_meta), KPC(tablet_loc));
|
||||
if (need_retry &&
|
||||
task_op.get_inner_rescan() &&
|
||||
location_router.get_retry_cnt() > 100) { //hard code retry 100 times.
|
||||
location_router.get_total_retry_cnt() > 100) { //hard code retry 100 times.
|
||||
// disable das retry for rescan.
|
||||
need_retry = false;
|
||||
retry_continue = false;
|
||||
@ -272,7 +276,7 @@ int ObDataAccessService::retry_das_task(ObDASRef &das_ref, ObIDASTaskOp &task_op
|
||||
if (need_retry) {
|
||||
task_op.in_part_retry_ = true;
|
||||
location_router.set_last_errno(task_op.get_errcode());
|
||||
location_router.inc_retry_cnt();
|
||||
location_router.inc_cur_retry_cnt();
|
||||
oceanbase::lib::Thread::WaitGuard guard(oceanbase::lib::Thread::WAIT_FOR_LOCAL_RETRY);
|
||||
if (OB_TMP_FAIL(clear_task_exec_env(das_ref, task_op))) {
|
||||
LOG_WARN("clear task execution environment failed", K(tmp_ret));
|
||||
@ -290,10 +294,28 @@ int ObDataAccessService::retry_das_task(ObDASRef &das_ref, ObIDASTaskOp &task_op
|
||||
} else if (OB_FAIL(execute_dist_das_task(das_ref, das_task_wrapper, false))) {
|
||||
LOG_WARN("execute dist DAS task failed", K(ret));
|
||||
}
|
||||
LOG_INFO("[DAS RETRY] Retry completing the DAS Task", KPC(task_op.get_tablet_loc()), KR(ret));
|
||||
if (OB_SUCCESS == ret) {
|
||||
LOG_INFO("[DAS RETRY] DAS Task succeeds after multiple retries",
|
||||
"continuous_retry_cnt", location_router.get_cur_retry_cnt(),
|
||||
"total_retry_cnt", location_router.get_total_retry_cnt(),
|
||||
KPC(task_op.get_tablet_loc()));
|
||||
} else {
|
||||
int64_t cur_retry_cnt = location_router.get_cur_retry_cnt();
|
||||
int64_t total_retry_cnt = location_router.get_total_retry_cnt();
|
||||
if (cur_retry_cnt >= 100 && cur_retry_cnt % 50L == 0) {
|
||||
LOG_INFO("[DAS RETRY] The DAS task has been retried multiple times without success, "
|
||||
"and the execution may be blocked by a specific exception", KR(ret),
|
||||
"continuous_retry_cnt", cur_retry_cnt,
|
||||
"total_retry_cnt", location_router.get_total_retry_cnt(),
|
||||
KPC(task_op.get_tablet_loc()));
|
||||
}
|
||||
}
|
||||
}
|
||||
task_op.errcode_ = ret;
|
||||
retry_continue = (OB_SUCCESS != ret);
|
||||
if (!retry_continue) {
|
||||
location_router.accumulate_retry_count();
|
||||
}
|
||||
if (retry_continue && IS_INTERRUPTED()) {
|
||||
retry_continue = false;
|
||||
LOG_INFO("[DAS RETRY] Retry is interrupted by worker interrupt signal", KR(ret));
|
||||
|
Reference in New Issue
Block a user