From 27b208972fce2bc20035511342af15ff14b599af Mon Sep 17 00:00:00 2001 From: obdev Date: Fri, 21 Apr 2023 07:15:06 +0000 Subject: [PATCH] Add restore_to_end interface --- .../restoreservice/ob_log_restore_handler.cpp | 36 +++++++++++++++---- .../restoreservice/ob_log_restore_handler.h | 3 +- .../ob_remote_fetch_log_worker.cpp | 18 +++++----- .../ob_remote_fetch_log_worker.h | 2 +- 4 files changed, 42 insertions(+), 17 deletions(-) diff --git a/src/logservice/restoreservice/ob_log_restore_handler.cpp b/src/logservice/restoreservice/ob_log_restore_handler.cpp index 7f89463ac..2f9b583bc 100644 --- a/src/logservice/restoreservice/ob_log_restore_handler.cpp +++ b/src/logservice/restoreservice/ob_log_restore_handler.cpp @@ -178,7 +178,7 @@ int ObLogRestoreHandler::get_max_restore_scn(SCN &scn) const { RLockGuard guard(lock_); int ret = OB_SUCCESS; - if (OB_ISNULL(parent_) || ! parent_->to_end()) { + if (OB_ISNULL(parent_) || ! restore_to_end_unlock_()) { ret = OB_EAGAIN; } else { parent_->get_end_scn(scn); @@ -325,7 +325,7 @@ int ObLogRestoreHandler::raw_write(const int64_t proposal_id, CLOG_LOG(WARN, "invalid argument", K(ret), K(proposal_id), K(lsn), K(buf), K(buf_size)); } else if (proposal_id != proposal_id_) { CLOG_LOG(INFO, "stale task, just skip", K(proposal_id), K(proposal_id_), K(lsn), K(id_)); - } else if (NULL == parent_ || parent_->to_end()) { + } else if (NULL == parent_ || restore_to_end_unlock_()) { ret = OB_RESTORE_LOG_TO_END; CLOG_LOG(INFO, "submit log to end, just skip", K(ret), K(lsn), KPC(this)); } else { @@ -417,7 +417,7 @@ int ObLogRestoreHandler::try_retire_task(ObFetchLogTask &task, bool &done) CLOG_LOG(INFO, "restore max_lsn bigger than task end_lsn, just skip it", K(task), KPC(this)); done = true; context_.issue_task_num_--; - } else if (parent_->to_end()) { + } else if (restore_to_end_unlock_()) { // when restore is set to_end, issue_version_ is advanced, and all issued tasks before are stale tasks as stale issue_version_ CLOG_LOG(ERROR, "error unexpected, log restored to_end, just skip it", K(task), KPC(this), K(parent_)); done = true; @@ -440,7 +440,7 @@ int ObLogRestoreHandler::need_schedule(bool &need_schedule, } else if (OB_SUCCESS != context_.error_context_.ret_code_) { // error exist, no need schedule } else { - need_schedule = is_strong_leader(role_) && ! parent_->to_end(); + need_schedule = is_strong_leader(role_) && ! restore_to_end_unlock_(); proposal_id = proposal_id_; context = context_; } @@ -458,17 +458,23 @@ bool ObLogRestoreHandler::need_update_source() const return is_strong_leader(role_); } -void ObLogRestoreHandler::mark_error(share::ObTaskId &trace_id, const int ret_code) +void ObLogRestoreHandler::mark_error(share::ObTaskId &trace_id, const int ret_code, const palf::LSN &lsn) { int ret = OB_SUCCESS; + palf::LSN end_lsn; WLockGuard guard(lock_); if (IS_NOT_INIT) { ret = OB_NOT_INIT; } else if (! is_strong_leader(role_)) { CLOG_LOG(INFO, "not leader, no need record error", K(id_), K(ret_code)); + } else if (OB_FAIL(palf_handle_.get_end_lsn(end_lsn))) { + CLOG_LOG(WARN, "get end_lsn failed", K(id_)); + } else if (end_lsn < lsn) { + CLOG_LOG(WARN, "end_lsn smaller than error lsn, just skip", K(id_), K(end_lsn), K(lsn), KPC(parent_), KPC(this)); } else if (OB_SUCCESS == context_.error_context_.ret_code_) { context_.error_context_.ret_code_ = ret_code; context_.error_context_.trace_id_.set(trace_id); + CLOG_LOG(ERROR, "fatal error occur in restore", KPC(parent_), KPC(this)); } } @@ -674,7 +680,7 @@ int ObLogRestoreHandler::get_next_sorted_task(ObFetchLogTask *&task) ret = OB_NOT_INIT; } else if (! is_strong_leader(role_)) { ret = OB_NOT_MASTER; - } else if (NULL != parent_ && parent_->to_end()) { + } else if (NULL != parent_ && restore_to_end_unlock_()) { // if restore to end, free all cached tasks ret = context_.reset_sorted_tasks(); } else if (context_.submit_array_.empty()) { @@ -734,5 +740,23 @@ int ObLogRestoreHandler::diagnose(RestoreDiagnoseInfo &diagnose_info) return ret; } +bool ObLogRestoreHandler::restore_to_end_unlock_() const +{ + int ret = OB_SUCCESS; + bool bret = false; + share::SCN scn; + share::SCN recovery_end_scn; + if (NULL == parent_) { + bret = false; + } else if (parent_->to_end()) { + bret = true; + } else if (OB_FAIL(palf_handle_.get_end_scn(scn))) { + CLOG_LOG(WARN, "get end scn failed", K(id_)); + } else { + parent_->get_upper_limit_scn(recovery_end_scn); + bret = scn >= recovery_end_scn; + } + return bret; +} } // namespace logservice } // namespace oceanbase diff --git a/src/logservice/restoreservice/ob_log_restore_handler.h b/src/logservice/restoreservice/ob_log_restore_handler.h index 95fdfbb22..9374bc9aa 100644 --- a/src/logservice/restoreservice/ob_log_restore_handler.h +++ b/src/logservice/restoreservice/ob_log_restore_handler.h @@ -151,7 +151,7 @@ public: // return true only if in restore state and all replicas have restore and replay finish int check_restore_done(const share::SCN &recovery_end_scn, bool &done); // @brief set error if error occurs in log restore - void mark_error(share::ObTaskId &trace_id, const int ret_code); + void mark_error(share::ObTaskId &trace_id, const int ret_code, const palf::LSN &lsn); // @brief get restore error for report int get_restore_error(share::ObTaskId &trace_id, int &ret_code, bool &error_exist); // @brief Before the standby tenant switchover to primary, check if all primary logs are restored in the standby @@ -182,6 +182,7 @@ private: int check_replay_done_(const share::SCN &scn, bool &done); int check_replica_replay_done_(const share::SCN &scn, common::ObMemberList &member_list, bool &done); int check_member_list_change_(common::ObMemberList &member_list, bool &member_list_change); + bool restore_to_end_unlock_() const; private: ObRemoteLogParent *parent_; diff --git a/src/logservice/restoreservice/ob_remote_fetch_log_worker.cpp b/src/logservice/restoreservice/ob_remote_fetch_log_worker.cpp index ee5f427ef..de71ce716 100644 --- a/src/logservice/restoreservice/ob_remote_fetch_log_worker.cpp +++ b/src/logservice/restoreservice/ob_remote_fetch_log_worker.cpp @@ -268,6 +268,7 @@ int ObRemoteFetchWorker::handle_single_task_() } else { ObFetchLogTask *task = static_cast(data); ObLSID id = task->id_; + palf::LSN cur_lsn = task->cur_lsn_; // after task handle, DON'T print it any more if (OB_FAIL(handle_fetch_log_task_(task))) { LOG_WARN("handle fetch log task failed", K(ret), KP(task), K(id)); @@ -275,7 +276,7 @@ int ObRemoteFetchWorker::handle_single_task_() // only fatal error report fail, retry with others if (is_fatal_error_(ret)) { - report_error_(id, ret); + report_error_(id, ret, cur_lsn); } } return ret; @@ -300,14 +301,13 @@ int ObRemoteFetchWorker::handle_fetch_log_task_(ObFetchLogTask *task) LOG_WARN("push submit array failed", K(ret)); } - // if fatal error encounter, just free task. Restore can only be recovered with new log_restore_source - if (is_fatal_error_(ret)) { - LOG_ERROR("free fetch log task with restore fatal error", K(ret), KPC(task)); - inner_free_task_(*task); - } else if (OB_SUCC(ret) && ! empty) { + if (OB_SUCC(ret) && ! empty) { // pre_read succ and push submit array succ, do nothing, } else { - if (! empty && OB_FAIL(ret)) { + if (is_fatal_error_(ret)) { + // fatal error may be false positive, for example restore in parallel, the range in pre-read maybe surpass the current log archive round, which not needed. + LOG_WARN("fatal error occur", K(ret), KPC(task)); + } else if (! empty && OB_FAIL(ret)) { LOG_WARN("task data not empty and push submit array failed, try retire task", K(ret), KPC(task)); } else if (OB_SUCC(ret)) { // pre_read data is empty, do notning @@ -568,11 +568,11 @@ bool ObRemoteFetchWorker::is_fatal_error_(const int ret_code) const || OB_ARCHIVE_LOG_RECYCLED == ret_code; } -void ObRemoteFetchWorker::report_error_(const ObLSID &id, const int ret_code) +void ObRemoteFetchWorker::report_error_(const ObLSID &id, const int ret_code, const palf::LSN &lsn) { int ret = OB_SUCCESS; GET_RESTORE_HANDLER_CTX(id) { - restore_handler->mark_error(*ObCurTraceId::get_trace_id(), ret_code); + restore_handler->mark_error(*ObCurTraceId::get_trace_id(), ret_code, lsn); } } diff --git a/src/logservice/restoreservice/ob_remote_fetch_log_worker.h b/src/logservice/restoreservice/ob_remote_fetch_log_worker.h index c7f705f3e..2950b7518 100644 --- a/src/logservice/restoreservice/ob_remote_fetch_log_worker.h +++ b/src/logservice/restoreservice/ob_remote_fetch_log_worker.h @@ -94,7 +94,7 @@ private: bool is_retry_ret_code_(const int ret_code) const; bool is_fatal_error_(const int ret_code) const; - void report_error_(const ObLSID &id, const int ret_code); + void report_error_(const ObLSID &id, const int ret_code, const palf::LSN &lsn); private: bool inited_; uint64_t tenant_id_;