/** * Copyright (c) 2021 OceanBase * OceanBase CE is licensed under Mulan PubL v2. * You can use this software according to the terms and conditions of the Mulan PubL v2. * You may obtain a copy of Mulan PubL v2 at: * http://license.coscl.org.cn/MulanPubL-2.0 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. * See the Mulan PubL v2 for more details. */ #include "ob_clog_writer.h" #include "storage/ob_partition_service.h" #include "lib/thread/ob_thread_name.h" #include "lib/atomic/ob_atomic.h" #include "lib/allocator/ob_malloc.h" #include "lib/stat/ob_diagnose_info.h" #include "lib/statistic_event/ob_stat_event.h" #include "lib/file/file_directory_utils.h" #include "share/config/ob_server_config.h" #include "share/ob_bg_thread_monitor.h" #include "clog/ob_log_common.h" #include "clog/ob_log_entry.h" #include "clog/ob_log_define.h" #include "clog/ob_log_file_trailer.h" #include "clog/ob_clog_file_writer.h" #include "observer/ob_server_struct.h" #include "election/ob_election.h" using namespace oceanbase::common; using namespace oceanbase::share; using namespace oceanbase::election; namespace oceanbase { namespace clog { ObCLogWriter::ObCLogWriter() : is_started_(false), is_disk_error_(false), file_mutex_(), file_writer_(NULL), type_(INVALID_WRITE_POOL), log_cache_(0), info_getter_(NULL), tail_(NULL) {} ObCLogWriter::~ObCLogWriter() { destroy(); } int ObCLogWriter::init(ObCLogWriterCfg& clog_cfg) { int ret = OB_SUCCESS; if (OB_UNLIKELY(is_inited())) { ret = OB_INIT_TWICE; CLOG_LOG(WARN, "The ObCLogWriter has been inited, ", K(ret)); } else if (OB_UNLIKELY(!clog_cfg.is_valid()) || OB_UNLIKELY(1 != clog_cfg.base_cfg_.group_commit_min_item_cnt_) || OB_UNLIKELY(1 != clog_cfg.base_cfg_.group_commit_max_item_cnt_)) { ret = OB_INVALID_ARGUMENT; CLOG_LOG(WARN, "Invalid argument, ", K(clog_cfg), K(ret)); } else if (OB_FAIL(ObBaseLogWriter::init(clog_cfg.base_cfg_))) { CLOG_LOG(WARN, "Fail to init base log writer, ", K(ret)); } else { file_writer_ = clog_cfg.log_file_writer_; log_cache_ = clog_cfg.log_cache_; info_getter_ = clog_cfg.info_getter_; tail_ = clog_cfg.tail_ptr_; type_ = clog_cfg.type_; } if (!is_inited()) { destroy(); } return ret; } int ObCLogWriter::start(file_id_t& file_id, offset_t& offset) { int ret = OB_SUCCESS; uint32_t load_file_id = file_id; uint32_t load_offset = offset; ObAtomicFilePos file_pos; if (OB_UNLIKELY(!is_inited())) { ret = OB_NOT_INIT; CLOG_LOG(ERROR, "The ObCLogWriter has not been inited, ", K(ret)); } else if (OB_UNLIKELY(is_started_)) { ret = OB_INIT_TWICE; CLOG_LOG(WARN, "The ObCLogWriter has been started, ", K(ret)); } else if (!is_valid_file_id(file_id)) { ret = OB_INVALID_ARGUMENT; CLOG_LOG(WARN, "invalid argument", K(file_id), K(offset)); } else if (OB_FAIL(file_writer_->load_file(load_file_id, load_offset, false))) { CLOG_LOG(WARN, "sync shared memory fail", K(ret), K(file_id), K(offset)); } else { is_started_ = true; file_id = load_file_id; offset = load_offset; CLOG_LOG(INFO, "log writer start success", K(file_id), K(offset)); } return ret; } void ObCLogWriter::destroy() { ObBaseLogWriter::destroy(); type_ = INVALID_WRITE_POOL; file_writer_ = NULL; log_cache_ = NULL; info_getter_ = NULL; tail_ = NULL; is_disk_error_ = false; is_started_ = false; } int ObCLogWriter::switch_file() { int ret = OB_SUCCESS; if (OB_UNLIKELY(!is_started_)) { ret = OB_NOT_INIT; CLOG_LOG(WARN, "The ObCLogWriter has not been inited, ", K(ret)); } else { lib::ObMutexGuard guard(file_mutex_); if (OB_FAIL(inner_switch_file())) { CLOG_LOG(WARN, "Fail to switch file, ", K(ret)); } } return ret; } file_id_t ObCLogWriter::get_file_id() const { return (NULL == file_writer_) ? 0 : file_writer_->get_cur_file_id(); } bool ObCLogWriter::is_disk_error() const { bool b_ret = ATOMIC_LOAD(&is_disk_error_); if (!b_ret && nullptr != file_writer_) { b_ret = file_writer_->is_write_hang(); } return b_ret; } int ObCLogWriter::set_is_disk_error() { int ret = OB_SUCCESS; if (OB_UNLIKELY(!is_inited())) { ret = OB_NOT_INIT; } else { ATOMIC_STORE(&is_disk_error_, true); CLOG_LOG(WARN, "clog disk may be hang or something error has happen!"); } return ret; } int ObCLogWriter::reset_is_disk_error() { int ret = OB_SUCCESS; if (OB_UNLIKELY(!is_inited())) { ret = OB_NOT_INIT; } else { ATOMIC_STORE(&is_disk_error_, false); CLOG_LOG(TRACE, "reset clog disk status to normal"); } return ret; } void ObCLogWriter::set_clog_writer_thread_name() { static __thread bool has_set_thread_name = false; if (!has_set_thread_name) { lib::set_thread_name("CLGWR"); has_set_thread_name = true; } } void ObCLogWriter::process_log_items(common::ObIBaseLogItem** items, const int64_t item_cnt, int64_t& finish_cnt) { int ret = OB_SUCCESS; ObICLogItem* item = NULL; int64_t cur_time = 0; int64_t io_time = 0; int64_t flush_time = 0; finish_cnt = 0; ObLogBlockMetaV2 block_meta; const int64_t block_meta_len = block_meta.get_serialize_size(); int64_t meta_pos = 0; const bool is_disk_error = ATOMIC_LOAD(&is_disk_error_); set_clog_writer_thread_name(); if (OB_UNLIKELY(!is_started_)) { ret = OB_NOT_INIT; CLOG_LOG(WARN, "The ObCLogWriter has not been started, ", K(ret)); } else if (OB_UNLIKELY(NULL == items) || OB_UNLIKELY(item_cnt <= 0) || OB_UNLIKELY(item_cnt > 1) || NULL == (item = reinterpret_cast(items[0])) || OB_UNLIKELY(!item->is_valid()) || OB_UNLIKELY(item->get_data_len() > OB_MAX_LOG_BUFFER_SIZE)) { ret = OB_INVALID_ARGUMENT; CLOG_LOG(WARN, "Invalid argument, ", K(ret), KP(items), K(item_cnt)); } else if (OB_UNLIKELY(is_disk_error)) { ret = OB_STATE_NOT_MATCH; CLOG_LOG(ERROR, "The ObCLogWriter met disk error and been frozen, ", K(ret), K(is_disk_error)); after_flush(item, block_meta_len, ret, file_writer_->get_cur_file_len(), finish_cnt); } else { const bool is_idempotent = false; const uint64_t write_len = block_meta_len + item->get_data_len(); const int64_t warning_value = GCONF.data_storage_warning_tolerance_time; ObCLogDiskErrorCB* cb = NULL; lib::ObMutexGuard guard(file_mutex_); BG_NEW_CALLBACK(cb, ObCLogDiskErrorCB, this); BG_MONITOR_GUARD(warning_value, is_idempotent, cb); // The timestamp value in block header must be generated by the time order, so // call inner_switch_file first here. if (need_switch_file(write_len) && OB_FAIL(inner_switch_file())) { CLOG_LOG(ERROR, "Fail to switch file, ", K(ret), K(file_writer_->get_cur_file_id())); } else if (OB_FAIL(block_meta.build_serialized_block(item->get_buf() - block_meta_len, block_meta_len, item->get_buf(), item->get_data_len(), OB_DATA_BLOCK, meta_pos))) { CLOG_LOG(ERROR, "build serialized block meta fail", K(ret)); } // invoke callback when fail if (OB_FAIL(ret)) { after_flush(item, block_meta_len, ret, file_writer_->get_cur_file_len(), finish_cnt); } if (OB_SUCC(ret)) { if (OB_FAIL(file_writer_->append_log_entry(item->get_buf() - block_meta_len, (uint32_t)write_len))) { CLOG_LOG(ERROR, "fail to add log item to buf, ", K(ret)); } } int64_t flush_start_offset = -1; if (OB_SUCC(ret)) { cur_time = ObTimeUtility::current_time(); do { // TODO: flush log will not return OB_TIMEOUT, other IO error will be treated as bug if (OB_FAIL(file_writer_->flush(info_getter_, log_cache_, tail_, flush_start_offset))) { set_is_disk_error(); // flush log to disk until die when IO hang, other IO error will be treated as bug if (OB_TIMEOUT == ret && REACH_TIME_INTERVAL(60 * 1000 * 1000)) { ret = OB_IO_ERROR; CLOG_LOG(ERROR, "flush IO hang, retrying", K(ret)); } else { CLOG_LOG(ERROR, "Fail to flush clog to disk, ", K(ret)); } } else { reset_is_disk_error(); } } while (!has_stoped() && OB_TIMEOUT == ret); } if (OB_SUCC(ret)) { io_time = ObTimeUtility::current_time() - cur_time; // log flush succeed, invoke callback when disk sync after_flush(item, block_meta_len, ret, flush_start_offset, finish_cnt); flush_time = ObTimeUtility::current_time() - cur_time - io_time; if (flush_time + io_time > 100 * 1000) { CLOG_LOG(WARN, "slow flush", K(flush_time), K(io_time), "file_id", file_writer_->get_cur_file_id(), K(flush_start_offset), "file_offset", file_writer_->get_cur_file_len()); } if (CLOG_WRITE_POOL == type_) { EVENT_INC(CLOG_WRITE_COUNT); EVENT_ADD(CLOG_WRITE_TIME, io_time); } else if (ILOG_WRITE_POOL == type_) { EVENT_INC(ILOG_WRITE_COUNT); EVENT_ADD(ILOG_WRITE_TIME, io_time); } else { CLOG_LOG(ERROR, "unknown write pool type", K(type_)); } } } if (OB_FAIL(ret)) { CLOG_LOG(ERROR, "log writer write data error", K(ret)); if (!has_stoped()) { on_fatal_error(ret); } } if (REACH_TIME_INTERVAL(1000 * 1000)) { CLOG_LOG(INFO, "ObCLogWriter get_queued_item_cnt", "queued_cnt", get_queued_item_cnt()); } } bool ObCLogWriter::need_switch_file(const uint64_t write_len) const { // Left space is not enough for data or info block uint64_t max_switch_file_limit = 0; if (CLOG_WRITE_POOL == type_) { max_switch_file_limit = CLOG_MAX_SWITCH_FILE_LIMIT; } else { max_switch_file_limit = ILOG_MAX_SWITCH_FILE_LIMIT; } return !file_writer_->enough_file_space(write_len) || (info_getter_->get_entry_cnt() > max_switch_file_limit); } void ObCLogWriter::after_flush(ObICLogItem* item, const int64_t block_meta_len, const int err_code, const uint32_t file_offset, int64_t& finish_cnt) { int ret = OB_SUCCESS; uint32_t file_id = file_writer_->get_cur_file_id(); if (OB_SUCCESS == err_code) { const common::ObAddr addr = GCTX.self_addr_; if (OB_FAIL(log_cache_->append_data(addr, (char*)(item->get_buf() - block_meta_len), file_id, file_offset, item->get_data_len() + block_meta_len))) { CLOG_LOG(ERROR, "Fail to append data to cache, ", K(ret)); } tail_->advance(file_id, file_offset + (uint32_t)item->get_data_len() + (uint32_t)block_meta_len); } if (OB_FAIL(item->after_flushed(file_id, file_offset + (uint32_t)block_meta_len, err_code, type_))) { CLOG_LOG(ERROR, "Fail to process item after flush, ", K(ret)); } ++finish_cnt; } int ObCLogWriter::inner_switch_file() { int ret = OB_SUCCESS; const int64_t start = ObTimeUtility::current_time(); if (OB_FAIL(file_writer_->end_current_file(info_getter_, log_cache_, tail_))) { CLOG_LOG(ERROR, "switch file failed", K(ret)); } // keep try open new file when too many open files, other IO error will be treated as bug if (OB_SUCC(ret)) { do { if (OB_FAIL(file_writer_->create_next_file())) { is_disk_error_ = true; if (OB_FILE_NOT_OPENED == ret) { if (REACH_TIME_INTERVAL(60 * 1000 * 1000)) { CLOG_LOG(ERROR, "Too many open files, retrying", K(ret)); } sleep(30); } else { CLOG_LOG(ERROR, "Fail to switch file, ", K(ret), K(file_writer_->get_cur_file_id())); } } else { is_disk_error_ = false; } } while (!has_stoped() && OB_FILE_NOT_OPENED == ret); } if (OB_SUCC(ret)) { if (CLOG_WRITE_POOL == type_) { EVENT_ADD(CLOG_FILE_TOTAL_SIZE, CLOG_FILE_SIZE); } else if (ILOG_WRITE_POOL == type_) { EVENT_ADD(ILOG_FILE_TOTAL_SIZE, CLOG_FILE_SIZE); } const int64_t end = ObTimeUtility::current_time(); CLOG_LOG(INFO, "switch file success", K(*tail_), "new_file", file_writer_->get_cur_file_id(), "used", end - start); } else { CLOG_LOG(ERROR, "switch file error", K(ret), "new_file", file_writer_->get_cur_file_id()); } return ret; } ObCLogDiskErrorCB::ObCLogDiskErrorCB(ObCLogWriter* host) : host_(host) {} ObCLogDiskErrorCB::~ObCLogDiskErrorCB() {} int ObCLogDiskErrorCB::callback() { int ret = OB_SUCCESS; storage::ObPartitionService& partition_service = storage::ObPartitionService::get_instance(); if (OB_ISNULL(host_)) { ret = OB_ERR_UNEXPECTED; } else if (OB_FAIL(host_->set_is_disk_error())) { CLOG_LOG(ERROR, "ObCLogDiskErrorCB set_is_disk_error failed", K(ret)); } else if (OB_FAIL(partition_service.try_revoke_all_leader(ObElection::RevokeType::CLOG_DISK_HANG))) { CLOG_LOG(ERROR, "ObCLogDiskErrorCB try_revoke_all_leader failed", K(ret)); } CLOG_LOG(INFO, "call leader revoke because of clog disk hang", K(ret)); return ret; } void ObCLogDiskErrorCB::destroy() { // If disk has real error, may cause is_disk_error be false, // however, observer will be killed. if (OB_NOT_NULL(host_)) { (void)host_->reset_is_disk_error(); } } int locate_clog_tail( const int64_t timeout, ObILogFileStore* file_store, ObLogDirectReader* reader, file_id_t& file_id, offset_t& offset) { ObLogFileTailLocatorImpl impl; return impl.locate_tail(timeout, file_store, reader, file_id, offset); } int locate_ilog_tail( const int64_t timeout, ObILogFileStore* file_store, ObLogDirectReader* reader, file_id_t& file_id, offset_t& offset) { ObLogFileTailLocatorImpl impl; return impl.locate_tail(timeout, file_store, reader, file_id, offset); } } // namespace clog } // namespace oceanbase