From 412211b212bb3a3929854e02ea5eca483f6d5758 Mon Sep 17 00:00:00 2001 From: simonjoylet Date: Sat, 10 Feb 2024 10:15:32 +0000 Subject: [PATCH] [CP] retry when ddl clog write failed --- src/storage/ddl/ob_ddl_redo_log_writer.cpp | 89 +++++++++++++++++++ src/storage/ddl/ob_ddl_redo_log_writer.h | 12 +++ .../ddl/ob_direct_insert_sstable_ctx_new.cpp | 2 +- 3 files changed, 102 insertions(+), 1 deletion(-) diff --git a/src/storage/ddl/ob_ddl_redo_log_writer.cpp b/src/storage/ddl/ob_ddl_redo_log_writer.cpp index 27ff654d3..63fc54380 100644 --- a/src/storage/ddl/ob_ddl_redo_log_writer.cpp +++ b/src/storage/ddl/ob_ddl_redo_log_writer.cpp @@ -944,6 +944,11 @@ if (OB_ISNULL(buffer = static_cast<char *>(ob_malloc(buffer_size, ObMemAttr(MTL_ return ret; } +bool ObDDLRedoLogWriter::need_retry(int ret_code) +{ + return OB_NOT_MASTER == ret_code; +} + ObDDLRedoLogHandle::ObDDLRedoLogHandle() : cb_(nullptr), scn_(SCN::min_scn()) { @@ -1223,6 +1228,37 @@ int ObDDLRedoLogWriter::wait_macro_block_log_finish( return ret; } +int ObDDLRedoLogWriter::write_commit_log_with_retry( + const bool allow_remote_write, + const ObITable::TableKey &table_key, + const share::SCN &start_scn, + ObTabletDirectLoadMgrHandle &direct_load_mgr_handle, + ObTabletHandle &tablet_handle, + SCN &commit_scn, + bool &is_remote_write, + uint32_t &lock_tid) +{ + int ret = OB_SUCCESS; + int64_t start_ts = ObTimeUtility::fast_current_time(); + const int64_t timeout_us = ObDDLRedoLogWriter::DEFAULT_RETRY_TIMEOUT_US; + int64_t retry_count = 0; + do { + if (OB_FAIL(THIS_WORKER.check_status())) { + LOG_WARN("check status failed", K(ret)); + } else if (OB_FAIL(write_commit_log(allow_remote_write, table_key, start_scn, direct_load_mgr_handle, tablet_handle, commit_scn, is_remote_write, lock_tid))) { + LOG_WARN("write ddl commit log failed", K(ret)); + } + if (ObDDLRedoLogWriter::need_retry(ret)) { + usleep(1000L * 1000L); // 1s + ++retry_count; + LOG_INFO("retry write ddl commit log", K(ret), K(table_key), K(retry_count)); + } 
else { + break; + } + } while (ObTimeUtility::fast_current_time() - start_ts < timeout_us); + return ret; +} + int ObDDLRedoLogWriter::write_commit_log( const bool allow_remote_write, const ObITable::TableKey &table_key, @@ -1514,6 +1550,14 @@ int ObDDLRedoLogWriterCallback::write(const ObMacroBlockHandle &macro_handle, } if (OB_FAIL(ddl_writer_->write_macro_block_log(redo_info_, macro_block_id_, true/*allow remote write*/, task_id_))) { LOG_WARN("fail to write ddl redo log", K(ret)); + if (ObDDLRedoLogWriter::need_retry(ret)) { + int tmp_ret = OB_SUCCESS; + if (OB_TMP_FAIL(retry(ObDDLRedoLogWriter::DEFAULT_RETRY_TIMEOUT_US))) { + LOG_WARN("retry wirte ddl macro redo log failed", K(ret), K(tmp_ret), K(task_id_), K(table_key_)); + } else { + ret = OB_SUCCESS; // overwrite the return code + } + } } } return ret; @@ -1527,6 +1571,51 @@ int ObDDLRedoLogWriterCallback::wait() LOG_WARN("ObDDLRedoLogWriterCallback is not inited", K(ret)); } else if (OB_FAIL(ddl_writer_->wait_macro_block_log_finish(redo_info_, macro_block_id_))) { LOG_WARN("fail to wait redo log finish", K(ret)); + if (ObDDLRedoLogWriter::need_retry(ret)) { + int tmp_ret = OB_SUCCESS; + if (OB_TMP_FAIL(retry(ObDDLRedoLogWriter::DEFAULT_RETRY_TIMEOUT_US))) { + LOG_WARN("retry wirte ddl macro redo log failed", K(ret), K(tmp_ret), K(task_id_), K(table_key_)); + } else { + ret = OB_SUCCESS; // overwrite the return code + } + } + } + return ret; +} + +int ObDDLRedoLogWriterCallback::retry(const int64_t timeout_us) +{ + int ret = OB_SUCCESS; + int64_t retry_count = 0; + if (IS_NOT_INIT) { + ret = OB_NOT_INIT; + LOG_WARN("ObDDLRedoLogWriterCallback is not inited", K(ret)); + } else if (timeout_us <= 0) { + ret = OB_TIMEOUT; + LOG_WARN("timeout less than 0", K(ret), K(timeout_us)); + } else if (OB_UNLIKELY(!macro_block_id_.is_valid() || !redo_info_.is_valid())) { + ret = OB_ERR_SYS; + LOG_WARN("macro block id or redo info not valid", K(ret), K(macro_block_id_), K(redo_info_)); + } else { + int64_t start_ts = 
ObTimeUtility::fast_current_time(); + while (ObTimeUtility::fast_current_time() - start_ts < timeout_us) { // ignore ret + if (OB_FAIL(THIS_WORKER.check_status())) { + LOG_WARN("check status failed", K(ret)); + } else if (OB_FAIL(ddl_writer_->write_macro_block_log(redo_info_, macro_block_id_, true/*allow remote write*/, task_id_))) { + LOG_WARN("fail to write ddl redo log", K(ret)); + } else if (OB_FAIL(ddl_writer_->wait_macro_block_log_finish(redo_info_, macro_block_id_))) { + LOG_WARN("wait ddl redo log finish failed", K(ret)); + } else { + FLOG_INFO("retry write ddl macro redo success", K(ret), K(table_key_), K(macro_block_id_)); + } + if (ObDDLRedoLogWriter::need_retry(ret)) { + usleep(1000L * 1000L); // 1s + ++retry_count; + LOG_INFO("retry write ddl macro redo log", K(ret), K(table_key_), K(retry_count)); + } else { + break; + } + } } return ret; } diff --git a/src/storage/ddl/ob_ddl_redo_log_writer.h b/src/storage/ddl/ob_ddl_redo_log_writer.h index 815b92b99..8864bd597 100644 --- a/src/storage/ddl/ob_ddl_redo_log_writer.h +++ b/src/storage/ddl/ob_ddl_redo_log_writer.h @@ -274,6 +274,17 @@ public: share::SCN &commit_scn, bool &is_remote_write, uint32_t &lock_tid); + int write_commit_log_with_retry( + const bool allow_remote_write, + const ObITable::TableKey &table_key, + const share::SCN &start_scn, + ObTabletDirectLoadMgrHandle &direct_load_mgr_handle, + ObTabletHandle &tablet_handle, + share::SCN &commit_scn, + bool &is_remote_write, + uint32_t &lock_tid); + static const int64_t DEFAULT_RETRY_TIMEOUT_US = 60L * 1000L * 1000L; // 1min + static bool need_retry(int ret_code); private: int switch_to_remote_write(); int local_write_ddl_start_log( @@ -348,6 +359,7 @@ public: int wait(); private: bool is_column_group_info_valid() const; + int retry(const int64_t timeout_us); private: bool is_inited_; blocksstable::ObDDLMacroBlockRedoInfo redo_info_; diff --git a/src/storage/ddl/ob_direct_insert_sstable_ctx_new.cpp 
b/src/storage/ddl/ob_direct_insert_sstable_ctx_new.cpp index 940482310..94d57621b 100644 --- a/src/storage/ddl/ob_direct_insert_sstable_ctx_new.cpp +++ b/src/storage/ddl/ob_direct_insert_sstable_ctx_new.cpp @@ -2198,7 +2198,7 @@ int ObTabletFullDirectLoadMgr::close(const int64_t execution_id, const SCN &star ObTabletDirectLoadMgrHandle direct_load_mgr_handle; if (OB_FAIL(direct_load_mgr_handle.set_obj(this))) { LOG_WARN("set direct load mgr handle failed", K(ret)); - } else if (OB_FAIL(redo_writer.write_commit_log(true, table_key_, + } else if (OB_FAIL(redo_writer.write_commit_log_with_retry(true, table_key_, start_scn, direct_load_mgr_handle, tablet_handle, commit_scn, is_remote_write, lock_tid))) { LOG_WARN("fail write ddl commit log", K(ret), K(table_key_), K(sqc_build_ctx_)); }