check_not_backup_tablet_create_scn mutex with rebuild

This commit is contained in:
hamstersox
2022-12-01 12:05:41 +00:00
committed by ob-robot
parent 2754b76deb
commit dc1b53494f
10 changed files with 279 additions and 111 deletions

View File

@ -797,7 +797,25 @@ int ObService::check_not_backup_tablet_create_scn(const obrpc::ObBackupCheckTabl
} else if (OB_ISNULL(ls = ls_handle.get_ls())) { } else if (OB_ISNULL(ls = ls_handle.get_ls())) {
ret = OB_ERR_UNEXPECTED; ret = OB_ERR_UNEXPECTED;
LOG_WARN("log stream should not be NULL", KR(ret), K(arg.tenant_id_), K(arg.ls_id_), KPC(ls)); LOG_WARN("log stream should not be NULL", KR(ret), K(arg.tenant_id_), K(arg.ls_id_), KPC(ls));
} else if (OB_ISNULL(ls_tablet_svr = ls->get_tablet_svr())) { } else {
const int64_t rebuild_seq = ls->get_rebuild_seq();
ObMigrationStatus migration_status;
share::ObLSRestoreStatus restore_status;
if (OB_FAIL(ls->is_offline())) {
ret = OB_EAGAIN;
LOG_WARN("ls is offline, retry later", K(ret), KPC(ls));
} else if (OB_FAIL(ls->get_migration_status(migration_status))) {
LOG_WARN("failed to get migration status", K(ret), KPC(ls));
} else if (storage::ObMigrationStatus::OB_MIGRATION_STATUS_NONE != migration_status) {
ret = OB_EAGAIN;
LOG_WARN("ls is in migration, retry later", K(ret), KPC(ls));
} else if (OB_FAIL(ls->get_restore_status(restore_status))) {
LOG_WARN("failed to get restore status", K(ret), KPC(ls));
} else if (share::ObLSRestoreStatus::RESTORE_NONE != restore_status) {
ret = OB_ERR_UNEXPECTED;
LOG_WARN("restore ls is unexpected", K(ret), KPC(ls));
} else {
if (OB_ISNULL(ls_tablet_svr = ls->get_tablet_svr())) {
ret = OB_ERR_UNEXPECTED; ret = OB_ERR_UNEXPECTED;
LOG_WARN("ls tablet service should not be NULL", KR(ret), K(arg.tenant_id_), K(arg.ls_id_), KPC(ls)); LOG_WARN("ls tablet service should not be NULL", KR(ret), K(arg.tenant_id_), K(arg.ls_id_), KPC(ls));
} else { } else {
@ -828,6 +846,13 @@ int ObService::check_not_backup_tablet_create_scn(const obrpc::ObBackupCheckTabl
} }
} }
} }
if (OB_FAIL(ret)) {
} else if (rebuild_seq != ls->get_rebuild_seq()) {
ret = OB_EAGAIN;
LOG_WARN("ls has rebuild, retry later", K(ret), KPC(ls));
}
}
}
} }
return ret; return ret;
} }

View File

@ -82,10 +82,10 @@ public:
bool is_wait_status() const { return is_wait_restore_sys_tablets() || is_wait_restore_tablets_meta() bool is_wait_status() const { return is_wait_restore_sys_tablets() || is_wait_restore_tablets_meta()
|| is_wait_quick_restore() || is_wait_restore_major_data(); } || is_wait_quick_restore() || is_wait_restore_major_data(); }
// Don't load inner tablet while downtime and restart if restore status is in [RESTORE_START, RESTORE_SYS_TABLETS] or RESTORE_FAILED // enable sync and online ls restore handler in [RESTORE_START, RESTORE_SYS_TABLETS] or RESTORE_FAILED
bool is_need_load_inner_tablet() const bool is_enable_for_restore() const
{ {
return !((status_ >= Status::RESTORE_START && status_ <= Status::RESTORE_SYS_TABLETS) || return ((status_ >= Status::RESTORE_START && status_ <= Status::RESTORE_SYS_TABLETS) ||
status_ == Status::RESTORE_FAILED); status_ == Status::RESTORE_FAILED);
} }
// if restore status is not in [RESTORE_START, WAIT_RESTORE_TABLETS_META], log_replay_service can replay log. // if restore status is not in [RESTORE_START, WAIT_RESTORE_TABLETS_META], log_replay_service can replay log.

View File

@ -1033,7 +1033,6 @@ int ObStartMigrationTask::deal_with_local_ls_()
int64_t proposal_id = 0; int64_t proposal_id = 0;
ObLSMeta local_ls_meta; ObLSMeta local_ls_meta;
logservice::ObLogService *log_service = nullptr; logservice::ObLogService *log_service = nullptr;
if (!is_inited_) { if (!is_inited_) {
ret = OB_NOT_INIT; ret = OB_NOT_INIT;
LOG_WARN("start migration task do not init", K(ret)); LOG_WARN("start migration task do not init", K(ret));
@ -1393,20 +1392,14 @@ int ObStartMigrationTask::deal_local_restore_ls_(bool &need_generate_dag)
} else if (ls_restore_status.is_restore_failed()) { } else if (ls_restore_status.is_restore_failed()) {
ret = OB_ERR_UNEXPECTED; ret = OB_ERR_UNEXPECTED;
LOG_WARN("ls restore status is not expected", K(ret), KPC(ctx_), KPC(ls), K(ls_restore_status)); LOG_WARN("ls restore status is not expected", K(ret), KPC(ctx_), KPC(ls), K(ls_restore_status));
} else if (ls_restore_status.is_restore_start() || ls_restore_status.is_restore_sys_tablets()) { } else if (ls_restore_status.is_restore_start()) {
if (OB_FAIL(ls->get_log_handler()->enable_sync())) { ret = OB_SRC_DO_NOT_ALLOWED_MIGRATE;
LOG_WARN("failed to enable log sync", K(ret), KPC(ctx_), KPC(ls)); LOG_WARN("src ls is in restore start, wait later", K(ret), KPC(ls));
} else if (OB_FAIL(ls->get_tablet_svr()->online())) { } else if (ls_restore_status.is_restore_sys_tablets()) {
LOG_WARN("failed to online tablet svr", K(ret), KPC(ctx_), KPC(ls));
} else if (OB_FAIL(ls->get_tx_svr()->online())) {
LOG_WARN("failed to online tx svr", K(ret), KPC(ctx_), KPC(ls));
} else if (OB_FAIL(ls->get_ddl_log_handler()->online())) {
LOG_WARN("failed to online ddl log handler", K(ret), KPC(ctx_), KPC(ls));
} else if (OB_FAIL(ls->get_ls_wrs_handler()->online())) {
LOG_WARN("failed to online ls wrs handler", K(ret), KPC(ctx_), KPC(ls));
} else if (OB_FALSE_IT(ls->get_checkpoint_executor()->online())) {
} else {
need_generate_dag = false; need_generate_dag = false;
if (OB_FAIL(ls->enable_for_restore())) {
LOG_WARN("failed to enable for restore", K(ret));
} else {
LOG_INFO("ls restore status is in restore start or in restore sys tablets, no need generate dag", LOG_INFO("ls restore status is in restore start or in restore sys tablets, no need generate dag",
K(ls_restore_status), "ls_id", ctx_->arg_.ls_id_); K(ls_restore_status), "ls_id", ctx_->arg_.ls_id_);
} }

View File

@ -373,7 +373,7 @@ bool ObLS::is_need_gc() const
return bool_ret; return bool_ret;
} }
bool ObLS::is_need_load_inner_tablet() const bool ObLS::is_enable_for_restore() const
{ {
int ret = OB_SUCCESS; int ret = OB_SUCCESS;
bool bool_ret = false; bool bool_ret = false;
@ -381,7 +381,7 @@ bool ObLS::is_need_load_inner_tablet() const
if (OB_FAIL(ls_meta_.get_restore_status(restore_status))) { if (OB_FAIL(ls_meta_.get_restore_status(restore_status))) {
LOG_WARN("fail to get restore status", K(ret), K(ls_meta_.ls_id_)); LOG_WARN("fail to get restore status", K(ret), K(ls_meta_.ls_id_));
} else { } else {
bool_ret = restore_status.is_need_load_inner_tablet(); bool_ret = restore_status.is_enable_for_restore();
} }
return bool_ret; return bool_ret;
} }
@ -684,6 +684,8 @@ int ObLS::offline_()
} else if (FALSE_IT(is_offlined_ = true)) { } else if (FALSE_IT(is_offlined_ = true)) {
} else if (FALSE_IT(checkpoint_executor_.offline())) { } else if (FALSE_IT(checkpoint_executor_.offline())) {
LOG_WARN("checkpoint executor offline failed", K(ret), K(ls_meta_)); LOG_WARN("checkpoint executor offline failed", K(ret), K(ls_meta_));
} else if (OB_FAIL(ls_restore_handler_.offline())) {
LOG_WARN("failed to offline ls restore handler", K(ret));
} else if (OB_FAIL(offline_compaction_())) { } else if (OB_FAIL(offline_compaction_())) {
LOG_WARN("compaction offline failed", K(ret), K(ls_meta_)); LOG_WARN("compaction offline failed", K(ret), K(ls_meta_));
} else if (OB_FAIL(ls_wrs_handler_.offline())) { } else if (OB_FAIL(ls_wrs_handler_.offline())) {
@ -735,6 +737,30 @@ int ObLS::offline()
return ret; return ret;
} }
// Takes this LS offline by calling offline_() until it stops returning OB_EAGAIN.
// No lock guard is constructed in this function itself (presumably the caller is
// responsible for any required synchronization -- TODO confirm against callers).
// Between attempts it sleeps 100 ms and emits a warning every 100 attempts.
// Returns the final result of offline_(): OB_SUCCESS or the first error that is not OB_EAGAIN.
int ObLS::offline_without_lock()
{
  int ret = OB_SUCCESS;
  int64_t start_ts = ObTimeUtility::current_time();
  int64_t retry_times = 0;
  for (bool need_retry = true; need_retry;) {
    ++retry_times;
    if (OB_FAIL(offline_())) {
      LOG_WARN("ls offline failed", K(ret), K(ls_meta_));
    }
    // Only OB_EAGAIN is retried; success or any other error ends the loop.
    need_retry = (OB_EAGAIN == ret);
    if (need_retry) {
      ob_usleep(100 * 1000); // 100 ms
      if (0 == retry_times % 100) { // every 10 s
        LOG_WARN("ls offline use too much time.", K(ls_meta_), K(start_ts));
      }
    }
  }
  FLOG_INFO("ls offline end", KR(ret), "ls_id", get_ls_id());
  return ret;
}
int ObLS::online_tx_() int ObLS::online_tx_()
{ {
int ret = OB_SUCCESS; int ret = OB_SUCCESS;
@ -777,6 +803,8 @@ int ObLS::online()
LOG_WARN("weak read handler online failed", K(ret), K(ls_meta_)); LOG_WARN("weak read handler online failed", K(ret), K(ls_meta_));
} else if (OB_FAIL(online_compaction_())) { } else if (OB_FAIL(online_compaction_())) {
LOG_WARN("compaction online failed", K(ret), K(ls_meta_)); LOG_WARN("compaction online failed", K(ret), K(ls_meta_));
} else if (OB_FAIL(ls_restore_handler_.online())) {
LOG_WARN("ls restore handler online failed", K(ret));
} else if (FALSE_IT(checkpoint_executor_.online())) { } else if (FALSE_IT(checkpoint_executor_.online())) {
} else if (FALSE_IT(tablet_gc_handler_.online())) { } else if (FALSE_IT(tablet_gc_handler_.online())) {
} else { } else {
@ -788,6 +816,23 @@ int ObLS::online()
return ret; return ret;
} }
int ObLS::enable_for_restore()
{
int ret = OB_SUCCESS;
int64_t read_lock = 0;
int64_t write_lock = LSLOCKALL;
ObLSLockGuard lock_myself(lock_, read_lock, write_lock);
if (IS_NOT_INIT) {
ret = OB_NOT_INIT;
LOG_WARN("ls is not inited", K(ret));
} else if (OB_FAIL(log_handler_.enable_sync())) {
LOG_WARN("failed to enable sync", K(ret));
} else if (OB_FAIL(ls_restore_handler_.online())) {
LOG_WARN("failed to online restore", K(ret));
}
return ret;
}
int ObLS::get_ls_meta_package(ObLSMetaPackage &meta_package) int ObLS::get_ls_meta_package(ObLSMetaPackage &meta_package)
{ {
int ret = OB_SUCCESS; int ret = OB_SUCCESS;
@ -1051,10 +1096,18 @@ int ObLS::finish_slog_replay()
LOG_INFO("this ls should be gc later", KPC(this)); LOG_INFO("this ls should be gc later", KPC(this));
// ls will be gc later and tablets in the ls are not complete, // ls will be gc later and tablets in the ls are not complete,
// so skip the following steps, otherwise load_ls_inner_tablet maybe encounter error. // so skip the following steps, otherwise load_ls_inner_tablet maybe encounter error.
} else if (is_need_load_inner_tablet() && OB_FAIL(load_ls_inner_tablet())) {
LOG_WARN("ls load inner tablet failed", K(ret), KPC(this));
} else if (OB_FAIL(start())) { } else if (OB_FAIL(start())) {
LOG_WARN("ls can not start to work", K(ret)); LOG_WARN("ls can not start to work", K(ret));
} else if (is_enable_for_restore()) {
if (OB_FAIL(offline_())) {
LOG_WARN("failed to offline", K(ret), KPC(this));
} else if (OB_FAIL(log_handler_.enable_sync())) {
LOG_WARN("failed to enable sync", K(ret), KPC(this));
} else if (OB_FAIL(ls_restore_handler_.online())) {
LOG_WARN("failed to online ls restore handler", K(ret), KPC(this));
}
} else if (OB_FAIL(load_ls_inner_tablet())) {
LOG_WARN("ls load inner tablet failed", K(ret), KPC(this));
} else { } else {
// do nothing // do nothing
} }

View File

@ -154,6 +154,8 @@ public:
void destroy(); void destroy();
int offline(); int offline();
int online(); int online();
int offline_without_lock();
int enable_for_restore();
bool is_offline() const { return is_offlined_; } // mock function, TODO(@yanyuan) bool is_offline() const { return is_offlined_; } // mock function, TODO(@yanyuan)
ObLSTxService *get_tx_svr() { return &ls_tx_svr_; } ObLSTxService *get_tx_svr() { return &ls_tx_svr_; }
@ -200,7 +202,7 @@ public:
void set_create_state(const ObInnerLSStatus new_status); void set_create_state(const ObInnerLSStatus new_status);
ObInnerLSStatus get_create_state() const; ObInnerLSStatus get_create_state() const;
bool is_need_gc() const; bool is_need_gc() const;
bool is_need_load_inner_tablet() const; bool is_enable_for_restore() const;
// for rebuild // for rebuild
// remove inner tablet, the memtable and minor sstable of data tablet, disable replay // remove inner tablet, the memtable and minor sstable of data tablet, disable replay
// int prepare_rebuild(); // int prepare_rebuild();

View File

@ -42,12 +42,13 @@ using namespace logservice;
ObLSRestoreHandler::ObLSRestoreHandler() ObLSRestoreHandler::ObLSRestoreHandler()
: is_inited_(false), : is_inited_(false),
is_stop_(false), is_stop_(false),
is_online_(true),
rebuild_seq_(0),
result_mgr_(), result_mgr_(),
ls_(nullptr), ls_(nullptr),
ls_restore_arg_(), ls_restore_arg_(),
state_handler_(nullptr), state_handler_(nullptr),
allocator_(), allocator_()
rebuild_seq_(0)
{ {
} }
@ -88,6 +89,74 @@ void ObLSRestoreHandler::destroy()
ls_ = nullptr; ls_ = nullptr;
} }
int ObLSRestoreHandler::offline()
{
int ret = OB_SUCCESS;
if (IS_NOT_INIT) {
ret = OB_NOT_INIT;
LOG_WARN("not init", K(ret));
} else {
lib::ObMutexGuard guard(mtx_);
if (OB_FAIL(cancel_task_())) {
LOG_WARN("failed to cancel task", K(ret), KPC(ls_));
} else {
is_online_ = false;
}
}
return ret;
}
// Brings the restore handler back online.
//  - Returns OB_NOT_INIT if the handler is not initialized.
//  - If is_online_ is already set, only logs and returns OB_SUCCESS.
//  - If the LS restore status is "restore none", only sets is_online_.
//  - Otherwise, under mtx_: refills the restore arg, creates a state handler for the
//    current restore status, cancels the tasks of the previous state handler (if any),
//    destroys and frees it, installs the new handler and sets is_online_.
// On failure, a newly created state handler that was not installed is destroyed and freed.
// NOTE(review): is_online_ is read and written below before mtx_ is acquired, while
// offline() writes it under mtx_ -- confirm that callers serialize online()/offline().
int ObLSRestoreHandler::online()
{
int ret = OB_SUCCESS;
share::ObLSRestoreStatus new_status;
ObILSRestoreState *new_state_handler = nullptr;
if (IS_NOT_INIT) {
ret = OB_NOT_INIT;
LOG_WARN("not init", K(ret));
} else if (is_online_) {
// already online: nothing to do (checked without taking mtx_)
LOG_INFO("ls is online", KPC(ls_));
} else if (OB_FAIL(ls_->get_restore_status(new_status))) {
LOG_WARN("fail to get_restore_status", K(ret), KPC(ls_));
} else if (new_status.is_restore_none()) {
// restore status is "none": no state handler is rebuilt, just mark the handler online
is_online_ = true;
} else {
lib::ObMutexGuard guard(mtx_);
// Online after rebuild or migrate: the restore status may have changed,
// so refresh the restore state handler according to the new LS restore status.
if (OB_FAIL(fill_restore_arg_())) {
LOG_WARN("fail to fill restore arg", K(ret));
} else if (OB_FAIL(get_restore_state_handler_(new_status, new_state_handler))) {
LOG_WARN("fail to get restore state handler", K(ret), K(new_status));
} else {
if (nullptr != state_handler_) {
// When going online, the tasks of the old state handler must be cancelled
// before the old handler is destroyed and its memory returned to allocator_.
if (OB_FAIL(state_handler_->get_tablet_mgr().cancel_task())) {
LOG_WARN("failed to cancel task", K(ret));
} else {
state_handler_->~ObILSRestoreState();
allocator_.free(state_handler_);
state_handler_ = nullptr;
}
}
if (OB_SUCC(ret)) {
// Install the new handler; clearing new_state_handler keeps the cleanup below from freeing it.
state_handler_ = new_state_handler;
is_online_ = true;
new_state_handler = nullptr;
}
}
// Failure path: release a new state handler that was created but never installed.
if (OB_FAIL(ret) && nullptr != new_state_handler) {
new_state_handler->~ObILSRestoreState();
allocator_.free(new_state_handler);
new_state_handler = nullptr;
}
}
return ret;
}
int ObLSRestoreHandler::record_clog_failed_info( int ObLSRestoreHandler::record_clog_failed_info(
const share::ObTaskId &trace_id, const share::ObLSID &ls_id, const int &result) const share::ObTaskId &trace_id, const share::ObLSID &ls_id, const int &result)
{ {
@ -128,7 +197,10 @@ int ObLSRestoreHandler::handle_execute_over(
status = state_handler_->get_restore_status(); status = state_handler_->get_restore_status();
} }
} }
if (status.is_restore_sys_tablets()) { if (OB_CANCELED == result) {
//do nothing
LOG_WARN("task has been canceled", KPC(ls_), K(task_id));
} else if (status.is_restore_sys_tablets()) {
state_handler_->set_retry_flag(); state_handler_->set_retry_flag();
result_mgr_.set_result(result, task_id, ObLSRestoreResultMgr::RestoreFailedType::DATA_RESTORE_FAILED_TYPE); result_mgr_.set_result(result, task_id, ObLSRestoreResultMgr::RestoreFailedType::DATA_RESTORE_FAILED_TYPE);
LOG_WARN("restore sys tablets dag failed, need retry", K(ret)); LOG_WARN("restore sys tablets dag failed, need retry", K(ret));
@ -157,9 +229,11 @@ int ObLSRestoreHandler::handle_pull_tablet(
if (IS_NOT_INIT) { if (IS_NOT_INIT) {
ret = OB_NOT_INIT; ret = OB_NOT_INIT;
LOG_WARN("not init", K(ret)); LOG_WARN("not init", K(ret));
} else if (is_stop_ || !is_online_) {
LOG_WARN("ls stopped or disabled", KPC(ls_));
} else if (OB_ISNULL(state_handler_)) { } else if (OB_ISNULL(state_handler_)) {
// server may downtime and restart, but it has't inited state handler, so state_handler_ may be null. // server may downtime and restart, but it has't inited state handler, so state_handler_ may be null.
LOG_WARN("need restart, wait later"); LOG_WARN("need restart, wait later", KPC(ls_));
} else if (OB_FAIL(state_handler_->handle_pull_tablet(tablet_ids, leader_restore_status))) { } else if (OB_FAIL(state_handler_->handle_pull_tablet(tablet_ids, leader_restore_status))) {
LOG_WARN("fail to handl pull tablet", K(ret), K(leader_restore_status)); LOG_WARN("fail to handl pull tablet", K(ret), K(leader_restore_status));
} }
@ -188,7 +262,10 @@ int ObLSRestoreHandler::process()
// it tasks a period of time for the ls leader is ready after the shutdown and restart of observer usually, // it tasks a period of time for the ls leader is ready after the shutdown and restart of observer usually,
// and an ls leader not exist error will be returned before leader is ready. // and an ls leader not exist error will be returned before leader is ready.
// so in order to improve availability, we need control the retry frequency and the default retry time interval is 10s. // so in order to improve availability, we need control the retry frequency and the default retry time interval is 10s.
if (OB_FAIL(state_handler_->do_restore())) { lib::ObMutexGuard guard(mtx_);
if (is_stop_ || !is_online_) {
LOG_INFO("ls stopped or disabled", KPC(ls_));
} else if (OB_FAIL(state_handler_->do_restore())) {
ObTaskId trace_id(*ObCurTraceId::get_trace_id()); ObTaskId trace_id(*ObCurTraceId::get_trace_id());
result_mgr_.set_result(ret, trace_id, ObLSRestoreResultMgr::RestoreFailedType::DATA_RESTORE_FAILED_TYPE); result_mgr_.set_result(ret, trace_id, ObLSRestoreResultMgr::RestoreFailedType::DATA_RESTORE_FAILED_TYPE);
LOG_WARN("fail to do restore", K(ret), KPC(state_handler_)); LOG_WARN("fail to do restore", K(ret), KPC(state_handler_));
@ -489,30 +566,36 @@ void ObLSRestoreHandler::wakeup()
int ObLSRestoreHandler::safe_to_destroy(bool &is_safe) int ObLSRestoreHandler::safe_to_destroy(bool &is_safe)
{ {
int ret = OB_SUCCESS; int ret = OB_SUCCESS;
const int64_t start_ts = ObTimeUtil::current_time();
const int64_t OB_WAIT_LS_RESTORE_STOP_MS = 200 * 1000; // 200ms
is_safe = false; is_safe = false;
if (IS_NOT_INIT) { if (IS_NOT_INIT) {
ret = OB_NOT_INIT; ret = OB_NOT_INIT;
LOG_WARN("ls restore handler do not init", K(ret)); LOG_WARN("ls restore handler do not init", K(ret));
} else { } else {
lib::ObMutexGuard guard(mtx_); lib::ObMutexGuard guard(mtx_);
if (OB_ISNULL(state_handler_)) { if (OB_FAIL(cancel_task_())) {
is_safe = true; LOG_WARN("failed to cancel tasks", K(ret), KPC(ls_));
} else { } else {
ObLSRestoreTaskMgr &restore_tablet_mgr = state_handler_->get_tablet_mgr();
bool is_done = false;
if (OB_FAIL(restore_tablet_mgr.check_all_task_done(is_done))) {
LOG_WARN("fail to check all task done", K(ret));
} else if (is_done) {
is_safe = true; is_safe = true;
} is_stop_ = true;
} }
} }
LOG_INFO("wait ls restore stop", K(ret), K(is_safe), KPC(ls_)); LOG_INFO("wait ls restore stop", K(ret), K(is_safe), KPC(ls_));
return ret; return ret;
} }
// Cancels the restore tasks tracked by the current state handler's task manager.
// Returns OB_SUCCESS without doing anything when no state handler exists yet;
// otherwise returns the result of ObLSRestoreTaskMgr::cancel_task().
// No lock is taken here: the callers visible in this file (offline() and
// safe_to_destroy()) hold mtx_ around the call.
int ObLSRestoreHandler::cancel_task_()
{
  int ret = OB_SUCCESS;
  if (nullptr != state_handler_) {
    ObLSRestoreTaskMgr &restore_tablet_mgr = state_handler_->get_tablet_mgr();
    if (OB_FAIL(restore_tablet_mgr.cancel_task())) {
      // The failing operation is a cancel, not a "check all task done" query.
      LOG_WARN("fail to cancel task", K(ret));
    }
  }
  return ret;
}
int ObLSRestoreHandler::update_rebuild_seq() int ObLSRestoreHandler::update_rebuild_seq()
{ {
int ret = OB_SUCCESS; int ret = OB_SUCCESS;
@ -520,16 +603,14 @@ int ObLSRestoreHandler::update_rebuild_seq()
ret = OB_NOT_INIT; ret = OB_NOT_INIT;
LOG_WARN("ls restore handler do not init", K(ret)); LOG_WARN("ls restore handler do not init", K(ret));
} else { } else {
lib::ObMutexGuard guard(mtx_); ATOMIC_STORE(&rebuild_seq_, ls_->get_rebuild_seq());
rebuild_seq_ = ls_->get_rebuild_seq();
} }
return ret; return ret;
} }
int64_t ObLSRestoreHandler::get_rebuild_seq() int64_t ObLSRestoreHandler::get_rebuild_seq()
{ {
lib::ObMutexGuard guard(mtx_); return ATOMIC_LOAD(&rebuild_seq_);
return rebuild_seq_;
} }
//================================ObILSRestoreState======================================= //================================ObILSRestoreState=======================================
@ -1170,23 +1251,15 @@ int ObILSRestoreState::check_restore_concurrency_limit_()
return ret; return ret;
} }
int ObILSRestoreState::enable_replay_() int ObILSRestoreState::online_()
{ {
int ret = OB_SUCCESS; int ret = OB_SUCCESS;
if (OB_FAIL(ls_->enable_replay())) { if (OB_FAIL(ls_->online())) {
LOG_WARN("enable ls replay failed", K(ret), KPC(ls_)); LOG_WARN("online ls failed", K(ret), KPC(ls_));
} }
return ret; return ret;
} }
void ObILSRestoreState::disable_replay_()
{
int tmp_ret = OB_SUCCESS;
if (OB_SUCCESS != (tmp_ret = ls_->disable_replay())) {
LOG_WARN("fail to disable replay", K(tmp_ret), KPC(ls_));
}
}
int ObILSRestoreState::schedule_tablet_group_restore_( int ObILSRestoreState::schedule_tablet_group_restore_(
const ObTabletGroupRestoreArg &arg, const ObTabletGroupRestoreArg &arg,
const share::ObTaskId &task_id) const share::ObTaskId &task_id)
@ -1390,15 +1463,12 @@ int ObLSRestoreStartState::do_with_no_ls_meta_()
// this ls doesn't have ls meta and tablet in backup, it only needs to replay clog. // this ls doesn't have ls meta and tablet in backup, it only needs to replay clog.
// so just advance to qucik restore and start replay clog. // so just advance to qucik restore and start replay clog.
ObLSRestoreStatus next_status(ObLSRestoreStatus::Status::QUICK_RESTORE); ObLSRestoreStatus next_status(ObLSRestoreStatus::Status::QUICK_RESTORE);
if (OB_FAIL(enable_replay_())) { if (OB_FAIL(online_())) {
LOG_WARN("fail to enable log", K(ret)); LOG_WARN("fail to enable log", K(ret));
} else if (OB_FAIL(advance_status_(*ls_, next_status))) { } else if (OB_FAIL(advance_status_(*ls_, next_status))) {
LOG_WARN("fail to advance status", K(ret), K(*ls_), K(next_status)); LOG_WARN("fail to advance status", K(ret), K(*ls_), K(next_status));
} }
if (OB_FAIL(ret)) {
disable_replay_();
}
return ret; return ret;
} }
@ -1415,7 +1485,7 @@ int ObLSRestoreStartState::do_with_uncreated_ls_()
LOG_WARN("fail to check ls created", K(ret), KPC(ls_)); LOG_WARN("fail to check ls created", K(ret), KPC(ls_));
} else if (is_created) { } else if (is_created) {
// creating ls finished after sys ls restored. cur ls need to do restore. // creating ls finished after sys ls restored. cur ls need to do restore.
} else if (OB_FAIL(enable_replay_())) { } else if (OB_FAIL(online_())) {
LOG_WARN("fail to enable log", K(ret)); LOG_WARN("fail to enable log", K(ret));
} else if (OB_FAIL(advance_status_(*ls_, next_status))) { } else if (OB_FAIL(advance_status_(*ls_, next_status))) {
LOG_WARN("fail to advance status", K(ret), KPC(ls_), K(next_status)); LOG_WARN("fail to advance status", K(ret), KPC(ls_), K(next_status));
@ -1424,9 +1494,6 @@ int ObLSRestoreStartState::do_with_uncreated_ls_()
LOG_INFO("no need to restore when sys ls has been restored and the ls doesn't created.", KPC(ls_)); LOG_INFO("no need to restore when sys ls has been restored and the ls doesn't created.", KPC(ls_));
} }
if (OB_FAIL(ret)) {
disable_replay_();
}
return ret; return ret;
} }
@ -1630,7 +1697,7 @@ int ObLSRestoreSysTabletState::leader_restore_sys_tablet_()
} else if (!tablet_mgr_.is_restore_completed()) {// TODO: check restore finish, should read from extern. fix later } else if (!tablet_mgr_.is_restore_completed()) {// TODO: check restore finish, should read from extern. fix later
} else if (is_need_retry_()) { } else if (is_need_retry_()) {
// next term to retry // next term to retry
} else if (OB_FAIL(ls_->load_ls_inner_tablet())) { } else if (OB_FAIL(online_())) {
LOG_WARN("fail to load ls inner tablet", K(ret)); LOG_WARN("fail to load ls inner tablet", K(ret));
} else if (OB_FAIL(ls_->get_ls_restore_handler()->update_rebuild_seq())) { } else if (OB_FAIL(ls_->get_ls_restore_handler()->update_rebuild_seq())) {
LOG_WARN("failed to update rebuild seq", K(ret), KPC(ls_)); LOG_WARN("failed to update rebuild seq", K(ret), KPC(ls_));
@ -1639,6 +1706,7 @@ int ObLSRestoreSysTabletState::leader_restore_sys_tablet_()
} else { } else {
LOG_INFO("leader succ to restore sys tablet", KPC(ls_)); LOG_INFO("leader succ to restore sys tablet", KPC(ls_));
} }
return ret; return ret;
} }
@ -1661,7 +1729,7 @@ int ObLSRestoreSysTabletState::follower_restore_sys_tablet_()
} else if (!tablet_mgr_.is_restore_completed()) { } else if (!tablet_mgr_.is_restore_completed()) {
} else if (is_need_retry_()) { } else if (is_need_retry_()) {
// next term to retry // next term to retry
} else if (OB_FAIL(ls_->load_ls_inner_tablet())) { } else if (OB_FAIL(online_())) {
LOG_WARN("fail to load ls inner tablet", K(ret)); LOG_WARN("fail to load ls inner tablet", K(ret));
} else if (OB_FAIL(ls_->get_ls_restore_handler()->update_rebuild_seq())) { } else if (OB_FAIL(ls_->get_ls_restore_handler()->update_rebuild_seq())) {
LOG_WARN("failed to update rebuild seq", K(ret), KPC(ls_)); LOG_WARN("failed to update rebuild seq", K(ret), KPC(ls_));
@ -1670,6 +1738,7 @@ int ObLSRestoreSysTabletState::follower_restore_sys_tablet_()
} else { } else {
LOG_INFO("follower succ to restore sys tablet", KPC(ls_)); LOG_INFO("follower succ to restore sys tablet", KPC(ls_));
} }
return ret; return ret;
} }
@ -1791,17 +1860,12 @@ int ObLSRestoreCreateUserTabletState::leader_create_user_tablet_()
} else if (tablet_need_restore.empty()) { } else if (tablet_need_restore.empty()) {
ObLSRestoreStatus next_status(ObLSRestoreStatus::Status::WAIT_RESTORE_TABLETS_META); ObLSRestoreStatus next_status(ObLSRestoreStatus::Status::WAIT_RESTORE_TABLETS_META);
if (!tablet_mgr_.is_restore_completed()) { if (!tablet_mgr_.is_restore_completed()) {
} else if (OB_FAIL(enable_replay_())) {
LOG_WARN("fail to enable log", K(ret), KPC(ls_));
} else if (OB_FAIL(advance_status_(*ls_, next_status))) { } else if (OB_FAIL(advance_status_(*ls_, next_status))) {
LOG_WARN("fail to advance status", K(ret), KPC(ls_), K(next_status)); LOG_WARN("fail to advance status", K(ret), KPC(ls_), K(next_status));
} else { } else {
LOG_INFO("success create leader user tablets", KPC(ls_)); LOG_INFO("success create leader user tablets", KPC(ls_));
tablet_mgr_.reuse_set(); tablet_mgr_.reuse_set();
} }
if (OB_FAIL(ret)) {
disable_replay_();
}
} else if (OB_FAIL(do_create_user_tablet_(tablet_need_restore))) { } else if (OB_FAIL(do_create_user_tablet_(tablet_need_restore))) {
LOG_WARN("fail to do quick restore", K(ret), K(tablet_need_restore), KPC(ls_)); LOG_WARN("fail to do quick restore", K(ret), K(tablet_need_restore), KPC(ls_));
} }
@ -1836,17 +1900,12 @@ int ObLSRestoreCreateUserTabletState::follower_create_user_tablet_()
if (OB_FAIL(reload_miss_tablet_(all_finish))) { if (OB_FAIL(reload_miss_tablet_(all_finish))) {
LOG_WARN("fail to check follower restore tablet all finish", K(ret), KPC(ls_)); LOG_WARN("fail to check follower restore tablet all finish", K(ret), KPC(ls_));
} else if (all_finish) { } else if (all_finish) {
if (OB_FAIL(enable_replay_())) { if (OB_FAIL(advance_status_(*ls_, next_status))) {
LOG_WARN("fail to enable log", K(ret), KPC(ls_));
} else if (OB_FAIL(advance_status_(*ls_, next_status))) {
LOG_WARN("fail to advance status", K(ret), KPC(ls_), K(next_status)); LOG_WARN("fail to advance status", K(ret), KPC(ls_), K(next_status));
} else { } else {
LOG_INFO("success create follower user tablets", KPC(ls_)); LOG_INFO("success create follower user tablets", KPC(ls_));
tablet_mgr_.reuse_set(); tablet_mgr_.reuse_set();
} }
if (OB_FAIL(ret)) {
disable_replay_();
}
} }
} }
} else if (OB_FAIL(do_create_user_tablet_(tablet_need_restore))) { } else if (OB_FAIL(do_create_user_tablet_(tablet_need_restore))) {

View File

@ -87,10 +87,13 @@ public:
void wakeup(); void wakeup();
void stop() { ATOMIC_STORE(&is_stop_, true); } // when remove ls, set this void stop() { ATOMIC_STORE(&is_stop_, true); } // when remove ls, set this
int safe_to_destroy(bool &is_safe); int safe_to_destroy(bool &is_safe);
int offline();
int online();
bool is_stop() { return is_stop_; } bool is_stop() { return is_stop_; }
int update_rebuild_seq(); int update_rebuild_seq();
int64_t get_rebuild_seq(); int64_t get_rebuild_seq();
private: private:
int cancel_task_();
int check_before_do_restore_(bool &can_do_restore); int check_before_do_restore_(bool &can_do_restore);
int update_state_handle_(); int update_state_handle_();
int check_meta_tenant_normal_(bool &is_normal); int check_meta_tenant_normal_(bool &is_normal);
@ -101,14 +104,15 @@ private:
int fill_restore_arg_(); int fill_restore_arg_();
private: private:
bool is_inited_; bool is_inited_;
bool is_stop_; bool is_stop_; // used by ls destory
bool is_online_; // used by ls online/offline
int64_t rebuild_seq_; // update by rebuild
lib::ObMutex mtx_; lib::ObMutex mtx_;
ObLSRestoreResultMgr result_mgr_; ObLSRestoreResultMgr result_mgr_;
storage::ObLS *ls_; storage::ObLS *ls_;
ObTenantRestoreCtx ls_restore_arg_; ObTenantRestoreCtx ls_restore_arg_;
ObILSRestoreState *state_handler_; ObILSRestoreState *state_handler_;
common::ObFIFOAllocator allocator_; common::ObFIFOAllocator allocator_;
int64_t rebuild_seq_;
DISALLOW_COPY_AND_ASSIGN(ObLSRestoreHandler); DISALLOW_COPY_AND_ASSIGN(ObLSRestoreHandler);
}; };
@ -171,8 +175,8 @@ protected:
int report_ls_restore_progress_(storage::ObLS &ls, const share::ObLSRestoreStatus &status, int report_ls_restore_progress_(storage::ObLS &ls, const share::ObLSRestoreStatus &status,
const share::ObTaskId &trace_id, const int result = OB_SUCCESS, const char *comment = ""); const share::ObTaskId &trace_id, const int result = OB_SUCCESS, const char *comment = "");
int enable_replay_(); int online_();
void disable_replay_(); void offline_();
int update_restore_status_( int update_restore_status_(
storage::ObLS &ls, storage::ObLS &ls,
const share::ObLSRestoreStatus &next_status); const share::ObLSRestoreStatus &next_status);

View File

@ -209,10 +209,9 @@ int ObLSRestoreTaskMgr::schedule_tablet(const ObTaskId &task_id, const ObSArray<
return ret; return ret;
} }
int ObLSRestoreTaskMgr::check_all_task_done(bool &is_all_done) int ObLSRestoreTaskMgr::cancel_task()
{ {
int ret = OB_SUCCESS; int ret = OB_SUCCESS;
is_all_done = true;
bool is_exist = false; bool is_exist = false;
if (IS_NOT_INIT) { if (IS_NOT_INIT) {
ret = OB_NOT_INIT; ret = OB_NOT_INIT;
@ -221,13 +220,38 @@ int ObLSRestoreTaskMgr::check_all_task_done(bool &is_all_done)
lib::ObMutexGuard guard(mtx_); lib::ObMutexGuard guard(mtx_);
TaskMap::iterator iter = tablet_map_.begin(); TaskMap::iterator iter = tablet_map_.begin();
for (; OB_SUCC(ret) && iter != tablet_map_.end(); ++iter) { for (; OB_SUCC(ret) && iter != tablet_map_.end(); ++iter) {
is_exist = false;
if (OB_FAIL(check_task_exist_(iter->first, is_exist))) { if (OB_FAIL(check_task_exist_(iter->first, is_exist))) {
LOG_WARN("fail to check task exist", K(ret), "taks_id", iter->first); LOG_WARN("fail to check task exist", K(ret), "taks_id", iter->first);
} else if (is_exist) { } else if (is_exist) {
is_all_done = false; ObTenantDagScheduler *scheduler = nullptr;
if (OB_ISNULL(scheduler = MTL(ObTenantDagScheduler*))) {
ret = OB_ERR_UNEXPECTED;
LOG_WARN("failed to get ObTenantDagScheduler from MTL", K(ret), KP(scheduler));
} else if (OB_FAIL(scheduler->cancel_dag_net(iter->first))) {
LOG_WARN("failed to check dag net exist", K(ret), K(iter->first));
} }
} }
} }
int64_t start_ts = ObTimeUtil::current_time();
for (; OB_SUCC(ret) && iter != tablet_map_.end(); ++iter) {
is_exist = true;
do {
if (OB_FAIL(check_task_exist_(iter->first, is_exist))) {
LOG_WARN("fail to check task exist", K(ret), "taks_id", iter->first);
} else if (is_exist && REACH_TIME_INTERVAL(60 * 1000 * 1000)) {
LOG_WARN("cancel dag next task cost too much time", K(ret), "task_id", iter->first,
"cost_time", ObTimeUtil::current_time() - start_ts);
}
} while (is_exist && OB_SUCC(ret));
}
if (OB_SUCC(ret)) {
reuse_set();
tablet_map_.reuse();
}
}
return ret; return ret;
} }

View File

@ -41,7 +41,7 @@ public:
int schedule_tablet(const share::ObTaskId &task_id, const ObSArray<common::ObTabletID> &tablet_need_restore, bool &reach_dag_limit); int schedule_tablet(const share::ObTaskId &task_id, const ObSArray<common::ObTabletID> &tablet_need_restore, bool &reach_dag_limit);
int pop_need_restore_tablets(ObIArray<common::ObTabletID> &need_restore_tablets); int pop_need_restore_tablets(ObIArray<common::ObTabletID> &need_restore_tablets);
int pop_restored_tablets(storage::ObLS &ls, ObIArray<common::ObTabletID> &tablet_send_to_follower); int pop_restored_tablets(storage::ObLS &ls, ObIArray<common::ObTabletID> &tablet_send_to_follower);
int check_all_task_done(bool &is_all_done); int cancel_task();
void reuse_set() { schedule_tablet_set_.reuse(); wait_tablet_set_.reuse(); } void reuse_set() { schedule_tablet_set_.reuse(); wait_tablet_set_.reuse(); }
void reuse_wait_set() { wait_tablet_set_.reuse(); } void reuse_wait_set() { wait_tablet_set_.reuse(); }

View File

@ -460,15 +460,23 @@ int ObLSService::create_ls(const obrpc::ObCreateLSArg &arg)
} else { } else {
state = ObLSCreateState::CREATE_STATE_FINISH; state = ObLSCreateState::CREATE_STATE_FINISH;
ls->finish_create(is_commit); ls->finish_create(is_commit);
if (OB_SUCCESS != (tmp_ret = ls->start())) { if (OB_FAIL(ls->start())) {
LOG_ERROR("ls start failed", K(tmp_ret), K(arg)); LOG_ERROR("ls start failed", K(ret), K(arg));
} else { } else if (is_ls_to_restore_(arg)) {
if (OB_FAIL(ls->offline_without_lock())) {
LOG_WARN("failed to offline", K(ret), K(arg));
} else if (OB_FAIL(ls->get_log_handler()->enable_sync())) {
LOG_WARN("failed to enable sync", K(ret), K(arg));
} else if (OB_FAIL(ls->get_ls_restore_handler()->online())) {
LOG_WARN("failed to online restore handler", K(ret), K(arg));
}
}
FLOG_INFO("add ls to ls service succ", K(ls->get_ls_id()), K(arg)); FLOG_INFO("add ls to ls service succ", K(ls->get_ls_id()), K(arg));
if (OB_SUCCESS != (tmp_ret = ls->report_replica_info())) { if (OB_SUCCESS != (tmp_ret = ls->report_replica_info())) {
LOG_WARN("fail to report ls", KR(tmp_ret), K(arg)); LOG_WARN("fail to report ls", KR(tmp_ret), K(arg));
} }
} }
}
if (OB_FAIL(ret)) { if (OB_FAIL(ret)) {
do { do {
need_retry = false; need_retry = false;