From af5122597772bd409d1e86d57f4d157268be6318 Mon Sep 17 00:00:00 2001 From: ZenoWang Date: Fri, 26 Apr 2024 12:20:43 +0000 Subject: [PATCH] [FIX] create memtable for dml may dead loop --- src/storage/ddl/ob_ddl_replay_executor.cpp | 12 ++++----- src/storage/ls/ob_ls.cpp | 29 ++++++++++++++-------- src/storage/ls/ob_ls.h | 2 +- src/storage/ob_storage_table_guard.cpp | 12 +++++++-- 4 files changed, 35 insertions(+), 20 deletions(-) diff --git a/src/storage/ddl/ob_ddl_replay_executor.cpp b/src/storage/ddl/ob_ddl_replay_executor.cpp index 53c219454..193c7bc2e 100644 --- a/src/storage/ddl/ob_ddl_replay_executor.cpp +++ b/src/storage/ddl/ob_ddl_replay_executor.cpp @@ -659,9 +659,9 @@ int ObDDLIncStartReplayExecutor::do_replay_(ObTabletHandle &tablet_handle) if (lob_meta_tablet_id.is_valid()) { tmp_ret_lob = ls_->sync_tablet_freeze_for_direct_load(lob_meta_tablet_id); } - ret = tmp_ret | tmp_ret_lob; - if (OB_FAIL(ret)) { - LOG_ERROR("sync freeze failed", KR(ret), KR(tmp_ret), KR(tmp_ret_lob), K(tablet_id), K(lob_meta_tablet_id)); + if (OB_SUCCESS != (tmp_ret | tmp_ret_lob)) { + ret = OB_EAGAIN; + LOG_WARN("sync freeze failed", KR(ret), KR(tmp_ret), KR(tmp_ret_lob), K(tablet_id), K(lob_meta_tablet_id)); } } } @@ -727,9 +727,9 @@ int ObDDLIncCommitReplayExecutor::do_replay_(ObTabletHandle &tablet_handle) if (lob_meta_tablet_id.is_valid()) { tmp_ret_lob = ls_->sync_tablet_freeze_for_direct_load(lob_meta_tablet_id); } - ret = tmp_ret | tmp_ret_lob; - if (OB_FAIL(ret)) { - LOG_ERROR("sync freeze failed", KR(ret), KR(tmp_ret), KR(tmp_ret_lob), K(tablet_id), K(lob_meta_tablet_id)); + if (OB_SUCCESS != (tmp_ret | tmp_ret_lob)) { + ret = OB_EAGAIN; + LOG_WARN("sync freeze failed", KR(ret), KR(tmp_ret), KR(tmp_ret_lob), K(tablet_id), K(lob_meta_tablet_id)); } } } diff --git a/src/storage/ls/ob_ls.cpp b/src/storage/ls/ob_ls.cpp index ced319e4f..e260ecb80 100644 --- a/src/storage/ls/ob_ls.cpp +++ b/src/storage/ls/ob_ls.cpp @@ -1913,7 +1913,7 @@ int ObLS::tablet_freeze_with_rewrite_meta(const ObTabletID &tablet_id, } /** - * @brief Used for async freeze task + * @brief Used for both async and sync freeze * * @param tablet_id tablet to be freezed * @param epoch to check if logstream has offlined @@ -1930,9 +1930,13 @@ int ObLS::tablet_freeze_task_for_direct_load(const ObTabletID &tablet_id, const } else if (IS_NOT_INIT) { ret = OB_NOT_INIT; STORAGE_LOG(WARN, "ls is not inited", K(ret)); - } else if (OB_UNLIKELY(is_offline()) || ATOMIC_LOAD(&switch_epoch_) != epoch) { + } else if (OB_UNLIKELY(is_offline())) { ret = OB_LS_OFFLINE; LOG_WARN("ls has offlined", K(ret), K_(ls_meta)); + } else if (ATOMIC_LOAD(&switch_epoch_) != epoch) { + // happened in async freeze situation. This ls has offlined and onlined again + ret = OB_SUCCESS; + FLOG_INFO("quit freeze because logstream epoch has changed", K(ret), K(tablet_id), K(epoch), K(ls_meta_)); } else if (OB_FAIL(ls_freezer_.tablet_freeze_task_for_direct_load(tablet_id, result))) { LOG_WARN("tablet force freeze failed", K(ret), K(tablet_id)); } else { @@ -1940,17 +1944,23 @@ int ObLS::tablet_freeze_task_for_direct_load(const ObTabletID &tablet_id, const } if (OB_FAIL(ret)) { - if (OB_NOT_INIT == ret || OB_NOT_RUNNING == ret || OB_LS_OFFLINE == ret) { - STORAGE_LOG(INFO, "reset ret code to stop retry", KR(ret)); - ret = OB_SUCCESS; - } else { - // reset ret to EAGAIN to retry freeze - ret = OB_EAGAIN; + int origin_ret = ret; + // reset ret to EAGAIN to retry freeze + ret = OB_EAGAIN; + if (REACH_TIME_INTERVAL(1LL * 1000LL * 1000LL /* 1 second */)) { + STORAGE_LOG(INFO, "reset ret code to stop retry", KR(ret), KR(origin_ret)); } } return ret; } +/** + * @brief sync freeze only retry for a while. + * + * @param tablet_id + * @param max_retry_time + * @return int + */ int ObLS::sync_tablet_freeze_for_direct_load(const ObTabletID &tablet_id, const int64_t max_retry_time) { int ret = OB_SUCCESS; @@ -1962,9 +1972,6 @@ int ObLS::sync_tablet_freeze_for_direct_load(const ObTabletID &tablet_id, const ret = OB_SUCCESS; ObFuture result; if (OB_FAIL(tablet_freeze_task_for_direct_load(tablet_id, epoch, &result))) { - if (REACH_TIME_INTERVAL(1LL * 1000LL * 1000LL /* 1 second */)) { - LOG_INFO("fail to start tablet freeze. need retry", K(ret), K(tablet_id)); - } usleep(RETRY_INTERVAL); } else if (OB_FAIL(ls_freezer_.wait_freeze_finished(result))) { STORAGE_LOG(WARN, "freeze task failed", KR(ret)); diff --git a/src/storage/ls/ob_ls.h b/src/storage/ls/ob_ls.h index c41b16d45..6d3b96838 100644 --- a/src/storage/ls/ob_ls.h +++ b/src/storage/ls/ob_ls.h @@ -874,7 +874,7 @@ public: const uint64_t epoch, ObFuture *result = nullptr); int sync_tablet_freeze_for_direct_load(const ObTabletID &tablet_id, - const int64_t max_retry_time = 3600LL * 1000LL * 1000LL /*1 hour*/); + const int64_t max_retry_time = 5LL * 1000LL * 1000LL /*5 seconds*/); void async_tablet_freeze_for_direct_load(const ObTabletID &tablet_id); DELEGATE_WITH_RET(ls_freezer_, wait_freeze_finished, int); diff --git a/src/storage/ob_storage_table_guard.cpp b/src/storage/ob_storage_table_guard.cpp index 32e94daee..af6eacf12 100644 --- a/src/storage/ob_storage_table_guard.cpp +++ b/src/storage/ob_storage_table_guard.cpp @@ -180,6 +180,10 @@ int ObStorageTableGuard::refresh_and_protect_memtable() } } while ((OB_SUCC(ret) || OB_ENTRY_NOT_EXIST == ret || OB_EAGAIN == ret) && need_retry); + if (OB_LS_OFFLINE == ret) { + ret = OB_EAGAIN; + STORAGE_LOG(INFO, "reset ret code to OB_EAGAIN to avoid error log", KR(ret), K(ls_id), K(tablet_id)); + } return ret; } @@ -192,13 +196,17 @@ int ObStorageTableGuard::create_data_memtable_(const share::ObLSID &ls_id, ObLSHandle ls_handle; ObTabletHandle tmp_handle; SCN clog_checkpoint_scn; + ObLS *ls = nullptr; if (OB_FAIL(MTL(ObLSService *)->get_ls(ls_id, ls_handle, ObLSGetMod::STORAGE_MOD))) { LOG_WARN("failed to get log stream", K(ret), K(ls_id), K(tablet_id)); } else if (OB_UNLIKELY(!ls_handle.is_valid())) { ret = OB_ERR_UNEXPECTED; LOG_WARN("unexpected error, invalid ls handle", K(ret), K(ls_handle), K(ls_id), K(tablet_id)); - } else if (OB_FAIL(ls_handle.get_ls()->get_tablet_svr()->get_tablet( - tablet_id, tmp_handle, 0, ObMDSGetTabletMode::READ_WITHOUT_CHECK))) { + } else if (FALSE_IT(ls = ls_handle.get_ls())) { + } else if (ls->is_offline()) { + ret = OB_LS_OFFLINE; + FLOG_INFO("create data memtable failed because of ls offline", KR(ret), K(ls_id), K(tablet_id)); + } else if (OB_FAIL(ls->get_tablet_svr()->get_tablet(tablet_id, tmp_handle, 0, ObMDSGetTabletMode::READ_WITHOUT_CHECK))) { LOG_WARN("fail to get tablet", K(ret), K(ls_id), K(tablet_id)); } else if (FALSE_IT(clog_checkpoint_scn = tmp_handle.get_obj()->get_tablet_meta().clog_checkpoint_scn_)) { } else if (replay_scn_ > clog_checkpoint_scn) {