diff --git a/src/observer/ob_service.cpp b/src/observer/ob_service.cpp
index 2925eb13ba..cbb7082031 100644
--- a/src/observer/ob_service.cpp
+++ b/src/observer/ob_service.cpp
@@ -797,35 +797,62 @@ int ObService::check_not_backup_tablet_create_scn(const obrpc::ObBackupCheckTabl
       } else if (OB_ISNULL(ls = ls_handle.get_ls())) {
         ret = OB_ERR_UNEXPECTED;
         LOG_WARN("log stream should not be NULL", KR(ret), K(arg.tenant_id_), K(arg.ls_id_), KPC(ls));
-      } else if (OB_ISNULL(ls_tablet_svr = ls->get_tablet_svr())) {
-        ret = OB_ERR_UNEXPECTED;
-        LOG_WARN("ls tablet service should not be NULL", KR(ret), K(arg.tenant_id_), K(arg.ls_id_), KPC(ls));
       } else {
-        const int64_t timeout_us = ObTabletCommon::DIRECT_GET_COMMITTED_TABLET_TIMEOUT_US;
-        ObTabletHandle tablet_handle;
-        for (int64_t i = 0; OB_SUCC(ret) && i < tablet_ids.count(); ++i) {
-          const ObTabletID &tablet_id = tablet_ids.at(i);
-          tablet_handle.reset();
-          if (OB_FAIL(ls_tablet_svr->get_tablet(tablet_id, tablet_handle, timeout_us))) {
-            if (OB_TABLET_NOT_EXIST == ret) {
-              LOG_INFO("tablet has been deleted, no need to check", K(tablet_id));
-              ret = OB_SUCCESS;
-            } else {
-              LOG_WARN("failed to get tablet", KR(ret), K(tablet_id), K(timeout_us));
-            }
-          } else if (OB_UNLIKELY(!tablet_handle.is_valid())) {
+        // Snapshot the rebuild seq first; it is compared again after the tablet scan below.
+        const int64_t rebuild_seq = ls->get_rebuild_seq();
+        ObMigrationStatus migration_status;
+        share::ObLSRestoreStatus restore_status;
+        // is_offline() is a plain bool predicate, so test it directly instead of through OB_FAIL.
+        if (ls->is_offline()) {
+          ret = OB_EAGAIN;
+          LOG_WARN("ls is offline, retry later", K(ret), KPC(ls));
+        } else if (OB_FAIL(ls->get_migration_status(migration_status))) {
+          LOG_WARN("failed to get migration status", K(ret), KPC(ls));
+        } else if (storage::ObMigrationStatus::OB_MIGRATION_STATUS_NONE != migration_status) {
+          ret = OB_EAGAIN;
+          LOG_WARN("ls is in migration, retry later", K(ret), KPC(ls));
+        } else if (OB_FAIL(ls->get_restore_status(restore_status))) {
+          LOG_WARN("failed to get restore status", K(ret), KPC(ls));
+        } else if (share::ObLSRestoreStatus::RESTORE_NONE != restore_status) {
+          ret = OB_ERR_UNEXPECTED;
+          LOG_WARN("restore ls is unexpected", K(ret), KPC(ls));
+        } else {
+          if (OB_ISNULL(ls_tablet_svr = ls->get_tablet_svr())) {
             ret = OB_ERR_UNEXPECTED;
-            LOG_WARN("unexpected error : tablet handle is invalid", KR(ret), K(tablet_handle));
+            LOG_WARN("ls tablet service should not be NULL", KR(ret), K(arg.tenant_id_), K(arg.ls_id_), KPC(ls));
           } else {
-            const ObTabletMeta &tablet_meta = tablet_handle.get_obj()->get_tablet_meta();
-            if (OB_UNLIKELY(tablet_meta.create_scn_ <= arg.backup_scn_)) {
-              ret = OB_ERR_UNEXPECTED;
-              LOG_WARN("unexpected error : tablet has't been backup", KR(ret),
-                  K(arg.tenant_id_), K(arg.ls_id_), K(tablet_id),
-                  K(tablet_meta), "backup_scn", arg.backup_scn_);
+            const int64_t timeout_us = ObTabletCommon::DIRECT_GET_COMMITTED_TABLET_TIMEOUT_US;
+            ObTabletHandle tablet_handle;
+            for (int64_t i = 0; OB_SUCC(ret) && i < tablet_ids.count(); ++i) {
+              const ObTabletID &tablet_id = tablet_ids.at(i);
+              tablet_handle.reset();
+              if (OB_FAIL(ls_tablet_svr->get_tablet(tablet_id, tablet_handle, timeout_us))) {
+                if (OB_TABLET_NOT_EXIST == ret) {
+                  LOG_INFO("tablet has been deleted, no need to check", K(tablet_id));
+                  ret = OB_SUCCESS;
+                } else {
+                  LOG_WARN("failed to get tablet", KR(ret), K(tablet_id), K(timeout_us));
+                }
+              } else if (OB_UNLIKELY(!tablet_handle.is_valid())) {
+                ret = OB_ERR_UNEXPECTED;
+                LOG_WARN("unexpected error : tablet handle is invalid", KR(ret), K(tablet_handle));
+              } else {
+                const ObTabletMeta &tablet_meta = tablet_handle.get_obj()->get_tablet_meta();
+                if (OB_UNLIKELY(tablet_meta.create_scn_ <= arg.backup_scn_)) {
+                  ret = OB_ERR_UNEXPECTED;
+                  LOG_WARN("unexpected error : tablet has't been backup", KR(ret),
+                      K(arg.tenant_id_), K(arg.ls_id_), K(tablet_id),
+                      K(tablet_meta), "backup_scn", arg.backup_scn_);
+                }
+              }
             }
           }
         }
+        if (OB_FAIL(ret)) {
+        } else if (rebuild_seq != ls->get_rebuild_seq()) {
+          ret = OB_EAGAIN;
+          LOG_WARN("ls has rebuild, retry later", K(ret), KPC(ls));
+        }
       }
     }
   }
diff --git
a/src/share/restore/ob_ls_restore_status.h b/src/share/restore/ob_ls_restore_status.h index a566c44b79..6db337f853 100644 --- a/src/share/restore/ob_ls_restore_status.h +++ b/src/share/restore/ob_ls_restore_status.h @@ -82,10 +82,10 @@ public: bool is_wait_status() const { return is_wait_restore_sys_tablets() || is_wait_restore_tablets_meta() || is_wait_quick_restore() || is_wait_restore_major_data(); } - // Don't load inner tablet while downtime and restart if restore status is in [RESTORE_START, RESTORE_SYS_TABLETS] or RESTORE_FAILED - bool is_need_load_inner_tablet() const + // enable sync and online ls restore handler in [RESTORE_START, RESTORE_SYS_TABLETS] or RESTORE_FAILED + bool is_enable_for_restore() const { - return !((status_ >= Status::RESTORE_START && status_ <= Status::RESTORE_SYS_TABLETS) || + return ((status_ >= Status::RESTORE_START && status_ <= Status::RESTORE_SYS_TABLETS) || status_ == Status::RESTORE_FAILED); } // if restore status is not in [RESTORE_START, WAIT_RESTORE_TABLETS_META], log_replay_service can replay log. 
diff --git a/src/storage/high_availability/ob_ls_migration.cpp b/src/storage/high_availability/ob_ls_migration.cpp index 4fca52f63d..d8fb332c29 100644 --- a/src/storage/high_availability/ob_ls_migration.cpp +++ b/src/storage/high_availability/ob_ls_migration.cpp @@ -1033,7 +1033,6 @@ int ObStartMigrationTask::deal_with_local_ls_() int64_t proposal_id = 0; ObLSMeta local_ls_meta; logservice::ObLogService *log_service = nullptr; - if (!is_inited_) { ret = OB_NOT_INIT; LOG_WARN("start migration task do not init", K(ret)); @@ -1393,20 +1392,14 @@ int ObStartMigrationTask::deal_local_restore_ls_(bool &need_generate_dag) } else if (ls_restore_status.is_restore_failed()) { ret = OB_ERR_UNEXPECTED; LOG_WARN("ls restore status is not expected", K(ret), KPC(ctx_), KPC(ls), K(ls_restore_status)); - } else if (ls_restore_status.is_restore_start() || ls_restore_status.is_restore_sys_tablets()) { - if (OB_FAIL(ls->get_log_handler()->enable_sync())) { - LOG_WARN("failed to enable log sync", K(ret), KPC(ctx_), KPC(ls)); - } else if (OB_FAIL(ls->get_tablet_svr()->online())) { - LOG_WARN("failed to online tablet svr", K(ret), KPC(ctx_), KPC(ls)); - } else if (OB_FAIL(ls->get_tx_svr()->online())) { - LOG_WARN("failed to online tx svr", K(ret), KPC(ctx_), KPC(ls)); - } else if (OB_FAIL(ls->get_ddl_log_handler()->online())) { - LOG_WARN("failed to online ddl log handler", K(ret), KPC(ctx_), KPC(ls)); - } else if (OB_FAIL(ls->get_ls_wrs_handler()->online())) { - LOG_WARN("failed to online ls wrs handler", K(ret), KPC(ctx_), KPC(ls)); - } else if (OB_FALSE_IT(ls->get_checkpoint_executor()->online())) { + } else if (ls_restore_status.is_restore_start()) { + ret = OB_SRC_DO_NOT_ALLOWED_MIGRATE; + LOG_WARN("src ls is in restore start, wait later", K(ret), KPC(ls)); + } else if (ls_restore_status.is_restore_sys_tablets()) { + need_generate_dag = false; + if (OB_FAIL(ls->enable_for_restore())) { + LOG_WARN("failed to enable for restore", K(ret)); } else { - need_generate_dag = false; 
LOG_INFO("ls restore status is in restore start or in restore sys tablets, no need generate dag", K(ls_restore_status), "ls_id", ctx_->arg_.ls_id_); } diff --git a/src/storage/ls/ob_ls.cpp b/src/storage/ls/ob_ls.cpp index 95c5d0134b..da02850ee5 100644 --- a/src/storage/ls/ob_ls.cpp +++ b/src/storage/ls/ob_ls.cpp @@ -373,7 +373,7 @@ bool ObLS::is_need_gc() const return bool_ret; } -bool ObLS::is_need_load_inner_tablet() const +bool ObLS::is_enable_for_restore() const { int ret = OB_SUCCESS; bool bool_ret = false; @@ -381,7 +381,7 @@ bool ObLS::is_need_load_inner_tablet() const if (OB_FAIL(ls_meta_.get_restore_status(restore_status))) { LOG_WARN("fail to get restore status", K(ret), K(ls_meta_.ls_id_)); } else { - bool_ret = restore_status.is_need_load_inner_tablet(); + bool_ret = restore_status.is_enable_for_restore(); } return bool_ret; } @@ -684,6 +684,8 @@ int ObLS::offline_() } else if (FALSE_IT(is_offlined_ = true)) { } else if (FALSE_IT(checkpoint_executor_.offline())) { LOG_WARN("checkpoint executor offline failed", K(ret), K(ls_meta_)); + } else if (OB_FAIL(ls_restore_handler_.offline())) { + LOG_WARN("failed to offline ls restore handler", K(ret)); } else if (OB_FAIL(offline_compaction_())) { LOG_WARN("compaction offline failed", K(ret), K(ls_meta_)); } else if (OB_FAIL(ls_wrs_handler_.offline())) { @@ -735,6 +737,30 @@ int ObLS::offline() return ret; } +int ObLS::offline_without_lock() +{ + int ret = OB_SUCCESS; + int64_t start_ts = ObTimeUtility::current_time(); + int64_t retry_times = 0; + + do { + retry_times++; + { + if (OB_FAIL(offline_())) { + LOG_WARN("ls offline failed", K(ret), K(ls_meta_)); + } + } + if (OB_EAGAIN == ret) { + ob_usleep(100 * 1000); // 100 ms + if (retry_times % 100 == 0) { // every 10 s + LOG_WARN("ls offline use too much time.", K(ls_meta_), K(start_ts)); + } + } + } while (OB_EAGAIN == ret); + FLOG_INFO("ls offline end", KR(ret), "ls_id", get_ls_id()); + return ret; +} + int ObLS::online_tx_() { int ret = OB_SUCCESS; @@ -777,6 
+803,8 @@ int ObLS::online() LOG_WARN("weak read handler online failed", K(ret), K(ls_meta_)); } else if (OB_FAIL(online_compaction_())) { LOG_WARN("compaction online failed", K(ret), K(ls_meta_)); + } else if (OB_FAIL(ls_restore_handler_.online())) { + LOG_WARN("ls restore handler online failed", K(ret)); } else if (FALSE_IT(checkpoint_executor_.online())) { } else if (FALSE_IT(tablet_gc_handler_.online())) { } else { @@ -788,6 +816,23 @@ int ObLS::online() return ret; } +int ObLS::enable_for_restore() +{ + int ret = OB_SUCCESS; + int64_t read_lock = 0; + int64_t write_lock = LSLOCKALL; + ObLSLockGuard lock_myself(lock_, read_lock, write_lock); + if (IS_NOT_INIT) { + ret = OB_NOT_INIT; + LOG_WARN("ls is not inited", K(ret)); + } else if (OB_FAIL(log_handler_.enable_sync())) { + LOG_WARN("failed to enable sync", K(ret)); + } else if (OB_FAIL(ls_restore_handler_.online())) { + LOG_WARN("failed to online restore", K(ret)); + } + return ret; +} + int ObLS::get_ls_meta_package(ObLSMetaPackage &meta_package) { int ret = OB_SUCCESS; @@ -1051,10 +1096,18 @@ int ObLS::finish_slog_replay() LOG_INFO("this ls should be gc later", KPC(this)); // ls will be gc later and tablets in the ls are not complete, // so skip the following steps, otherwise load_ls_inner_tablet maybe encounter error. 
- } else if (is_need_load_inner_tablet() && OB_FAIL(load_ls_inner_tablet())) { - LOG_WARN("ls load inner tablet failed", K(ret), KPC(this)); } else if (OB_FAIL(start())) { LOG_WARN("ls can not start to work", K(ret)); + } else if (is_enable_for_restore()) { + if (OB_FAIL(offline_())) { + LOG_WARN("failed to offline", K(ret), KPC(this)); + } else if (OB_FAIL(log_handler_.enable_sync())) { + LOG_WARN("failed to enable sync", K(ret), KPC(this)); + } else if (OB_FAIL(ls_restore_handler_.online())) { + LOG_WARN("failed to online ls restore handler", K(ret), KPC(this)); + } + } else if (OB_FAIL(load_ls_inner_tablet())) { + LOG_WARN("ls load inner tablet failed", K(ret), KPC(this)); } else { // do nothing } diff --git a/src/storage/ls/ob_ls.h b/src/storage/ls/ob_ls.h index 3fb1f424c5..be0b7ff4d5 100644 --- a/src/storage/ls/ob_ls.h +++ b/src/storage/ls/ob_ls.h @@ -154,6 +154,8 @@ public: void destroy(); int offline(); int online(); + int offline_without_lock(); + int enable_for_restore(); bool is_offline() const { return is_offlined_; } // mock function, TODO(@yanyuan) ObLSTxService *get_tx_svr() { return &ls_tx_svr_; } @@ -200,7 +202,7 @@ public: void set_create_state(const ObInnerLSStatus new_status); ObInnerLSStatus get_create_state() const; bool is_need_gc() const; - bool is_need_load_inner_tablet() const; + bool is_enable_for_restore() const; // for rebuild // remove inner tablet, the memtable and minor sstable of data tablet, disable replay // int prepare_rebuild(); diff --git a/src/storage/restore/ob_ls_restore_handler.cpp b/src/storage/restore/ob_ls_restore_handler.cpp index b16ad2d3e8..23efa8f8e1 100644 --- a/src/storage/restore/ob_ls_restore_handler.cpp +++ b/src/storage/restore/ob_ls_restore_handler.cpp @@ -42,12 +42,13 @@ using namespace logservice; ObLSRestoreHandler::ObLSRestoreHandler() : is_inited_(false), is_stop_(false), + is_online_(true), + rebuild_seq_(0), result_mgr_(), ls_(nullptr), ls_restore_arg_(), state_handler_(nullptr), - allocator_(), - 
rebuild_seq_(0) + allocator_() { } @@ -88,6 +89,74 @@ void ObLSRestoreHandler::destroy() ls_ = nullptr; } +int ObLSRestoreHandler::offline() +{ + int ret = OB_SUCCESS; + if (IS_NOT_INIT) { + ret = OB_NOT_INIT; + LOG_WARN("not init", K(ret)); + } else { + lib::ObMutexGuard guard(mtx_); + if (OB_FAIL(cancel_task_())) { + LOG_WARN("failed to cancel task", K(ret), KPC(ls_)); + } else { + is_online_ = false; + } + } + return ret; +} + +int ObLSRestoreHandler::online() +{ + int ret = OB_SUCCESS; + share::ObLSRestoreStatus new_status; + ObILSRestoreState *new_state_handler = nullptr; + + if (IS_NOT_INIT) { + ret = OB_NOT_INIT; + LOG_WARN("not init", K(ret)); + } else if (is_online_) { + // do nothing + LOG_INFO("ls is online", KPC(ls_)); + } else if (OB_FAIL(ls_->get_restore_status(new_status))) { + LOG_WARN("fail to get_restore_status", K(ret), KPC(ls_)); + } else if (new_status.is_restore_none()) { + is_online_ = true; + } else { + lib::ObMutexGuard guard(mtx_); + // online after rebuild or migrate. the restore status may changed. + // so, we refresh the restore state handler according to the new ls restore status. + if (OB_FAIL(fill_restore_arg_())) { + LOG_WARN("fail to fill restore arg", K(ret)); + } else if (OB_FAIL(get_restore_state_handler_(new_status, new_state_handler))) { + LOG_WARN("fail to get restore state handler", K(ret), K(new_status)); + } else { + if (nullptr != state_handler_) { + // when online, the old task should be cancel. 
+ if (OB_FAIL(state_handler_->get_tablet_mgr().cancel_task())) { + LOG_WARN("failed to cancel task", K(ret)); + } else { + state_handler_->~ObILSRestoreState(); + allocator_.free(state_handler_); + state_handler_ = nullptr; + } + } + if (OB_SUCC(ret)) { + state_handler_ = new_state_handler; + is_online_ = true; + new_state_handler = nullptr; + } + } + + if (OB_FAIL(ret) && nullptr != new_state_handler) { + new_state_handler->~ObILSRestoreState(); + allocator_.free(new_state_handler); + new_state_handler = nullptr; + } + } + return ret; +} + int ObLSRestoreHandler::record_clog_failed_info( const share::ObTaskId &trace_id, const share::ObLSID &ls_id, const int &result) { @@ -128,7 +197,10 @@ int ObLSRestoreHandler::handle_execute_over( status = state_handler_->get_restore_status(); } } - if (status.is_restore_sys_tablets()) { + if (OB_CANCELED == result) { + //do nothing + LOG_WARN("task has been canceled", KPC(ls_), K(task_id)); + } else if (status.is_restore_sys_tablets()) { state_handler_->set_retry_flag(); result_mgr_.set_result(result, task_id, ObLSRestoreResultMgr::RestoreFailedType::DATA_RESTORE_FAILED_TYPE); LOG_WARN("restore sys tablets dag failed, need retry", K(ret)); @@ -157,9 +229,11 @@ int ObLSRestoreHandler::handle_pull_tablet( if (IS_NOT_INIT) { ret = OB_NOT_INIT; LOG_WARN("not init", K(ret)); + } else if (is_stop_ || !is_online_) { + LOG_WARN("ls stopped or disabled", KPC(ls_)); } else if (OB_ISNULL(state_handler_)) { // server may downtime and restart, but it has't inited state handler, so state_handler_ may be null. 
- LOG_WARN("need restart, wait later"); + LOG_WARN("need restart, wait later", KPC(ls_)); } else if (OB_FAIL(state_handler_->handle_pull_tablet(tablet_ids, leader_restore_status))) { LOG_WARN("fail to handl pull tablet", K(ret), K(leader_restore_status)); } @@ -188,7 +262,10 @@ int ObLSRestoreHandler::process() // it tasks a period of time for the ls leader is ready after the shutdown and restart of observer usually, // and an ls leader not exist error will be returned before leader is ready. // so in order to improve availability, we need control the retry frequency and the default retry time interval is 10s. - if (OB_FAIL(state_handler_->do_restore())) { + lib::ObMutexGuard guard(mtx_); + if (is_stop_ || !is_online_) { + LOG_INFO("ls stopped or disabled", KPC(ls_)); + } else if (OB_FAIL(state_handler_->do_restore())) { ObTaskId trace_id(*ObCurTraceId::get_trace_id()); result_mgr_.set_result(ret, trace_id, ObLSRestoreResultMgr::RestoreFailedType::DATA_RESTORE_FAILED_TYPE); LOG_WARN("fail to do restore", K(ret), KPC(state_handler_)); @@ -489,30 +566,36 @@ void ObLSRestoreHandler::wakeup() int ObLSRestoreHandler::safe_to_destroy(bool &is_safe) { int ret = OB_SUCCESS; - const int64_t start_ts = ObTimeUtil::current_time(); - const int64_t OB_WAIT_LS_RESTORE_STOP_MS = 200 * 1000; // 200ms is_safe = false; if (IS_NOT_INIT) { ret = OB_NOT_INIT; LOG_WARN("ls restore handler do not init", K(ret)); } else { lib::ObMutexGuard guard(mtx_); - if (OB_ISNULL(state_handler_)) { - is_safe = true; + if (OB_FAIL(cancel_task_())) { + LOG_WARN("failed to cancel tasks", K(ret), KPC(ls_)); } else { - ObLSRestoreTaskMgr &restore_tablet_mgr = state_handler_->get_tablet_mgr(); - bool is_done = false; - if (OB_FAIL(restore_tablet_mgr.check_all_task_done(is_done))) { - LOG_WARN("fail to check all task done", K(ret)); - } else if (is_done) { - is_safe = true; - } + is_safe = true; + is_stop_ = true; } } LOG_INFO("wait ls restore stop", K(ret), K(is_safe), KPC(ls_)); return ret; } +int 
ObLSRestoreHandler::cancel_task_() +{ + int ret = OB_SUCCESS; + if (OB_ISNULL(state_handler_)) { + } else { + ObLSRestoreTaskMgr &restore_tablet_mgr = state_handler_->get_tablet_mgr(); + if (OB_FAIL(restore_tablet_mgr.cancel_task())) { + LOG_WARN("fail to check all task done", K(ret)); + } + } + return ret; +} + int ObLSRestoreHandler::update_rebuild_seq() { int ret = OB_SUCCESS; @@ -520,16 +603,14 @@ int ObLSRestoreHandler::update_rebuild_seq() ret = OB_NOT_INIT; LOG_WARN("ls restore handler do not init", K(ret)); } else { - lib::ObMutexGuard guard(mtx_); - rebuild_seq_ = ls_->get_rebuild_seq(); + ATOMIC_STORE(&rebuild_seq_, ls_->get_rebuild_seq()); } return ret; } int64_t ObLSRestoreHandler::get_rebuild_seq() { - lib::ObMutexGuard guard(mtx_); - return rebuild_seq_; + return ATOMIC_LOAD(&rebuild_seq_); } //================================ObILSRestoreState======================================= @@ -1170,23 +1251,15 @@ int ObILSRestoreState::check_restore_concurrency_limit_() return ret; } -int ObILSRestoreState::enable_replay_() +int ObILSRestoreState::online_() { int ret = OB_SUCCESS; - if (OB_FAIL(ls_->enable_replay())) { - LOG_WARN("enable ls replay failed", K(ret), KPC(ls_)); + if (OB_FAIL(ls_->online())) { + LOG_WARN("online ls failed", K(ret), KPC(ls_)); } return ret; } -void ObILSRestoreState::disable_replay_() -{ - int tmp_ret = OB_SUCCESS; - if (OB_SUCCESS != (tmp_ret = ls_->disable_replay())) { - LOG_WARN("fail to disable replay", K(tmp_ret), KPC(ls_)); - } -} - int ObILSRestoreState::schedule_tablet_group_restore_( const ObTabletGroupRestoreArg &arg, const share::ObTaskId &task_id) @@ -1390,15 +1463,12 @@ int ObLSRestoreStartState::do_with_no_ls_meta_() // this ls doesn't have ls meta and tablet in backup, it only needs to replay clog. // so just advance to qucik restore and start replay clog. 
ObLSRestoreStatus next_status(ObLSRestoreStatus::Status::QUICK_RESTORE); - if (OB_FAIL(enable_replay_())) { + if (OB_FAIL(online_())) { LOG_WARN("fail to enable log", K(ret)); } else if (OB_FAIL(advance_status_(*ls_, next_status))) { LOG_WARN("fail to advance status", K(ret), K(*ls_), K(next_status)); } - if (OB_FAIL(ret)) { - disable_replay_(); - } return ret; } @@ -1415,7 +1485,7 @@ int ObLSRestoreStartState::do_with_uncreated_ls_() LOG_WARN("fail to check ls created", K(ret), KPC(ls_)); } else if (is_created) { // creating ls finished after sys ls restored. cur ls need to do restore. - } else if (OB_FAIL(enable_replay_())) { + } else if (OB_FAIL(online_())) { LOG_WARN("fail to enable log", K(ret)); } else if (OB_FAIL(advance_status_(*ls_, next_status))) { LOG_WARN("fail to advance status", K(ret), KPC(ls_), K(next_status)); @@ -1424,9 +1494,6 @@ int ObLSRestoreStartState::do_with_uncreated_ls_() LOG_INFO("no need to restore when sys ls has been restored and the ls doesn't created.", KPC(ls_)); } - if (OB_FAIL(ret)) { - disable_replay_(); - } return ret; } @@ -1630,7 +1697,7 @@ int ObLSRestoreSysTabletState::leader_restore_sys_tablet_() } else if (!tablet_mgr_.is_restore_completed()) {// TODO: check restore finish, should read from extern. 
fix later } else if (is_need_retry_()) { // next term to retry - } else if (OB_FAIL(ls_->load_ls_inner_tablet())) { + } else if (OB_FAIL(online_())) { LOG_WARN("fail to load ls inner tablet", K(ret)); } else if (OB_FAIL(ls_->get_ls_restore_handler()->update_rebuild_seq())) { LOG_WARN("failed to update rebuild seq", K(ret), KPC(ls_)); @@ -1639,6 +1706,7 @@ int ObLSRestoreSysTabletState::leader_restore_sys_tablet_() } else { LOG_INFO("leader succ to restore sys tablet", KPC(ls_)); } + return ret; } @@ -1661,7 +1729,7 @@ int ObLSRestoreSysTabletState::follower_restore_sys_tablet_() } else if (!tablet_mgr_.is_restore_completed()) { } else if (is_need_retry_()) { // next term to retry - } else if (OB_FAIL(ls_->load_ls_inner_tablet())) { + } else if (OB_FAIL(online_())) { LOG_WARN("fail to load ls inner tablet", K(ret)); } else if (OB_FAIL(ls_->get_ls_restore_handler()->update_rebuild_seq())) { LOG_WARN("failed to update rebuild seq", K(ret), KPC(ls_)); @@ -1670,6 +1738,7 @@ int ObLSRestoreSysTabletState::follower_restore_sys_tablet_() } else { LOG_INFO("follower succ to restore sys tablet", KPC(ls_)); } + return ret; } @@ -1791,17 +1860,12 @@ int ObLSRestoreCreateUserTabletState::leader_create_user_tablet_() } else if (tablet_need_restore.empty()) { ObLSRestoreStatus next_status(ObLSRestoreStatus::Status::WAIT_RESTORE_TABLETS_META); if (!tablet_mgr_.is_restore_completed()) { - } else if (OB_FAIL(enable_replay_())) { - LOG_WARN("fail to enable log", K(ret), KPC(ls_)); } else if (OB_FAIL(advance_status_(*ls_, next_status))) { LOG_WARN("fail to advance status", K(ret), KPC(ls_), K(next_status)); } else { LOG_INFO("success create leader user tablets", KPC(ls_)); tablet_mgr_.reuse_set(); } - if (OB_FAIL(ret)) { - disable_replay_(); - } } else if (OB_FAIL(do_create_user_tablet_(tablet_need_restore))) { LOG_WARN("fail to do quick restore", K(ret), K(tablet_need_restore), KPC(ls_)); } @@ -1836,17 +1900,12 @@ int ObLSRestoreCreateUserTabletState::follower_create_user_tablet_() 
if (OB_FAIL(reload_miss_tablet_(all_finish))) { LOG_WARN("fail to check follower restore tablet all finish", K(ret), KPC(ls_)); } else if (all_finish) { - if (OB_FAIL(enable_replay_())) { - LOG_WARN("fail to enable log", K(ret), KPC(ls_)); - } else if (OB_FAIL(advance_status_(*ls_, next_status))) { + if (OB_FAIL(advance_status_(*ls_, next_status))) { LOG_WARN("fail to advance status", K(ret), KPC(ls_), K(next_status)); } else { LOG_INFO("success create follower user tablets", KPC(ls_)); tablet_mgr_.reuse_set(); } - if (OB_FAIL(ret)) { - disable_replay_(); - } } } } else if (OB_FAIL(do_create_user_tablet_(tablet_need_restore))) { diff --git a/src/storage/restore/ob_ls_restore_handler.h b/src/storage/restore/ob_ls_restore_handler.h index 7c63f026c0..8389955c6f 100644 --- a/src/storage/restore/ob_ls_restore_handler.h +++ b/src/storage/restore/ob_ls_restore_handler.h @@ -87,10 +87,13 @@ public: void wakeup(); void stop() { ATOMIC_STORE(&is_stop_, true); } // when remove ls, set this int safe_to_destroy(bool &is_safe); + int offline(); + int online(); bool is_stop() { return is_stop_; } int update_rebuild_seq(); int64_t get_rebuild_seq(); private: + int cancel_task_(); int check_before_do_restore_(bool &can_do_restore); int update_state_handle_(); int check_meta_tenant_normal_(bool &is_normal); @@ -101,14 +104,15 @@ private: int fill_restore_arg_(); private: bool is_inited_; - bool is_stop_; + bool is_stop_; // used by ls destory + bool is_online_; // used by ls online/offline + int64_t rebuild_seq_; // update by rebuild lib::ObMutex mtx_; ObLSRestoreResultMgr result_mgr_; storage::ObLS *ls_; ObTenantRestoreCtx ls_restore_arg_; ObILSRestoreState *state_handler_; common::ObFIFOAllocator allocator_; - int64_t rebuild_seq_; DISALLOW_COPY_AND_ASSIGN(ObLSRestoreHandler); }; @@ -171,8 +175,8 @@ protected: int report_ls_restore_progress_(storage::ObLS &ls, const share::ObLSRestoreStatus &status, const share::ObTaskId &trace_id, const int result = OB_SUCCESS, const char 
*comment = "");
-  int enable_replay_();
-  void disable_replay_();
+  int online_();
+  void offline_();
   int update_restore_status_(
       storage::ObLS &ls,
       const share::ObLSRestoreStatus &next_status);
diff --git a/src/storage/restore/ob_ls_restore_task_mgr.cpp b/src/storage/restore/ob_ls_restore_task_mgr.cpp
index ef5f3f5274..af5fbb1c52 100644
--- a/src/storage/restore/ob_ls_restore_task_mgr.cpp
+++ b/src/storage/restore/ob_ls_restore_task_mgr.cpp
@@ -209,10 +209,9 @@ int ObLSRestoreTaskMgr::schedule_tablet(const ObTaskId &task_id, const ObSArray<
   return ret;
 }
 
-int ObLSRestoreTaskMgr::check_all_task_done(bool &is_all_done)
+int ObLSRestoreTaskMgr::cancel_task()
 {
   int ret = OB_SUCCESS;
-  is_all_done = true;
   bool is_exist = false;
   if (IS_NOT_INIT) {
     ret = OB_NOT_INIT;
@@ -221,12 +220,40 @@ int ObLSRestoreTaskMgr::check_all_task_done(bool &is_all_done)
     lib::ObMutexGuard guard(mtx_);
     TaskMap::iterator iter = tablet_map_.begin();
     for (; OB_SUCC(ret) && iter != tablet_map_.end(); ++iter) {
+      is_exist = false;
       if (OB_FAIL(check_task_exist_(iter->first, is_exist))) {
         LOG_WARN("fail to check task exist", K(ret), "taks_id", iter->first);
       } else if (is_exist) {
-        is_all_done = false;
+        ObTenantDagScheduler *scheduler = nullptr;
+        if (OB_ISNULL(scheduler = MTL(ObTenantDagScheduler*))) {
+          ret = OB_ERR_UNEXPECTED;
+          LOG_WARN("failed to get ObTenantDagScheduler from MTL", K(ret), KP(scheduler));
+        } else if (OB_FAIL(scheduler->cancel_dag_net(iter->first))) {
+          LOG_WARN("failed to cancel dag net", K(ret), K(iter->first));
+        }
       }
     }
+
+    int64_t start_ts = ObTimeUtil::current_time();
+    // The cancel loop above left iter at end(); restart from begin() so the
+    // wait below really visits every task before the maps are cleared.
+    // NOTE(review): this polls without sleeping; consider a short sleep between checks.
+    for (iter = tablet_map_.begin(); OB_SUCC(ret) && iter != tablet_map_.end(); ++iter) {
+      is_exist = true;
+      do {
+        if (OB_FAIL(check_task_exist_(iter->first, is_exist))) {
+          LOG_WARN("fail to check task exist", K(ret), "taks_id", iter->first);
+        } else if (is_exist && REACH_TIME_INTERVAL(60 * 1000 * 1000)) {
+          LOG_WARN("cancel dag next task cost too much time", K(ret), "task_id", iter->first,
+              "cost_time",
ObTimeUtil::current_time() - start_ts);
+        }
+      } while (is_exist && OB_SUCC(ret));
+    }
+
+    if (OB_SUCC(ret)) {
+      reuse_set();
+      tablet_map_.reuse();
+    }
   }
   return ret;
 }
diff --git a/src/storage/restore/ob_ls_restore_task_mgr.h b/src/storage/restore/ob_ls_restore_task_mgr.h
index 889ea880e1..0c4b8ccbba 100644
--- a/src/storage/restore/ob_ls_restore_task_mgr.h
+++ b/src/storage/restore/ob_ls_restore_task_mgr.h
@@ -41,7 +41,7 @@ public:
   int schedule_tablet(const share::ObTaskId &task_id, const ObSArray &tablet_need_restore, bool &reach_dag_limit);
   int pop_need_restore_tablets(ObIArray &need_restore_tablets);
   int pop_restored_tablets(storage::ObLS &ls, ObIArray &tablet_send_to_follower);
-  int check_all_task_done(bool &is_all_done);
+  int cancel_task();
 
   void reuse_set() { schedule_tablet_set_.reuse(); wait_tablet_set_.reuse(); }
   void reuse_wait_set() { wait_tablet_set_.reuse(); }
diff --git a/src/storage/tx_storage/ob_ls_service.cpp b/src/storage/tx_storage/ob_ls_service.cpp
index f3d848b6a3..4a25664be6 100644
--- a/src/storage/tx_storage/ob_ls_service.cpp
+++ b/src/storage/tx_storage/ob_ls_service.cpp
@@ -460,14 +460,25 @@ int ObLSService::create_ls(const obrpc::ObCreateLSArg &arg)
     } else {
       state = ObLSCreateState::CREATE_STATE_FINISH;
       ls->finish_create(is_commit);
-      if (OB_SUCCESS != (tmp_ret = ls->start())) {
-        LOG_ERROR("ls start failed", K(tmp_ret), K(arg));
-      } else {
-        FLOG_INFO("add ls to ls service succ", K(ls->get_ls_id()), K(arg));
-        if (OB_SUCCESS != (tmp_ret = ls->report_replica_info())) {
-          LOG_WARN("fail to report ls", KR(tmp_ret), K(arg));
+      if (OB_FAIL(ls->start())) {
+        LOG_ERROR("ls start failed", K(ret), K(arg));
+      } else if (is_ls_to_restore_(arg)) {
+        if (OB_FAIL(ls->offline_without_lock())) {
+          LOG_WARN("failed to offline", K(ret), K(arg));
+        } else if (OB_FAIL(ls->get_log_handler()->enable_sync())) {
+          LOG_WARN("failed to enable sync", K(ret), K(arg));
+        } else if (OB_FAIL(ls->get_ls_restore_handler()->online())) {
+          LOG_WARN("failed to online restore handler", K(ret), K(arg));
         }
       }
+
+      // Only announce success and report the replica when start/restore setup succeeded.
+      if (OB_SUCC(ret)) {
+        FLOG_INFO("add ls to ls service succ", K(ls->get_ls_id()), K(arg));
+        if (OB_SUCCESS != (tmp_ret = ls->report_replica_info())) {
+          LOG_WARN("fail to report ls", KR(tmp_ret), K(arg));
+        }
+      }
     }
     if (OB_FAIL(ret)) {
       do {