Files
oceanbase/src/clog/ob_clog_writer.cpp
2021-07-15 10:51:14 +08:00

423 lines
14 KiB
C++

/**
* Copyright (c) 2021 OceanBase
* OceanBase CE is licensed under Mulan PubL v2.
* You can use this software according to the terms and conditions of the Mulan PubL v2.
* You may obtain a copy of Mulan PubL v2 at:
* http://license.coscl.org.cn/MulanPubL-2.0
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
* See the Mulan PubL v2 for more details.
*/
#include "ob_clog_writer.h"
#include "storage/ob_partition_service.h"
#include "lib/thread/ob_thread_name.h"
#include "lib/atomic/ob_atomic.h"
#include "lib/allocator/ob_malloc.h"
#include "lib/stat/ob_diagnose_info.h"
#include "lib/statistic_event/ob_stat_event.h"
#include "lib/file/file_directory_utils.h"
#include "share/config/ob_server_config.h"
#include "share/ob_bg_thread_monitor.h"
#include "clog/ob_log_common.h"
#include "clog/ob_log_entry.h"
#include "clog/ob_log_define.h"
#include "clog/ob_log_file_trailer.h"
#include "clog/ob_clog_file_writer.h"
#include "observer/ob_server_struct.h"
#include "election/ob_election.h"
using namespace oceanbase::common;
using namespace oceanbase::share;
using namespace oceanbase::election;
namespace oceanbase {
namespace clog {
ObCLogWriter::ObCLogWriter()
: is_started_(false),
is_disk_error_(false),
file_mutex_(),
file_writer_(NULL),
type_(INVALID_WRITE_POOL),
log_cache_(0),
info_getter_(NULL),
tail_(NULL)
{}
ObCLogWriter::~ObCLogWriter()
{
destroy();
}
int ObCLogWriter::init(ObCLogWriterCfg& clog_cfg)
{
int ret = OB_SUCCESS;
if (OB_UNLIKELY(is_inited())) {
ret = OB_INIT_TWICE;
CLOG_LOG(WARN, "The ObCLogWriter has been inited, ", K(ret));
} else if (OB_UNLIKELY(!clog_cfg.is_valid()) || OB_UNLIKELY(1 != clog_cfg.base_cfg_.group_commit_min_item_cnt_) ||
OB_UNLIKELY(1 != clog_cfg.base_cfg_.group_commit_max_item_cnt_)) {
ret = OB_INVALID_ARGUMENT;
CLOG_LOG(WARN, "Invalid argument, ", K(clog_cfg), K(ret));
} else if (OB_FAIL(ObBaseLogWriter::init(clog_cfg.base_cfg_))) {
CLOG_LOG(WARN, "Fail to init base log writer, ", K(ret));
} else {
file_writer_ = clog_cfg.log_file_writer_;
log_cache_ = clog_cfg.log_cache_;
info_getter_ = clog_cfg.info_getter_;
tail_ = clog_cfg.tail_ptr_;
type_ = clog_cfg.type_;
}
if (!is_inited()) {
destroy();
}
return ret;
}
int ObCLogWriter::start(file_id_t& file_id, offset_t& offset)
{
int ret = OB_SUCCESS;
uint32_t load_file_id = file_id;
uint32_t load_offset = offset;
ObAtomicFilePos file_pos;
if (OB_UNLIKELY(!is_inited())) {
ret = OB_NOT_INIT;
CLOG_LOG(ERROR, "The ObCLogWriter has not been inited, ", K(ret));
} else if (OB_UNLIKELY(is_started_)) {
ret = OB_INIT_TWICE;
CLOG_LOG(WARN, "The ObCLogWriter has been started, ", K(ret));
} else if (!is_valid_file_id(file_id)) {
ret = OB_INVALID_ARGUMENT;
CLOG_LOG(WARN, "invalid argument", K(file_id), K(offset));
} else if (OB_FAIL(file_writer_->load_file(load_file_id, load_offset, false))) {
CLOG_LOG(WARN, "sync shared memory fail", K(ret), K(file_id), K(offset));
} else {
is_started_ = true;
file_id = load_file_id;
offset = load_offset;
CLOG_LOG(INFO, "log writer start success", K(file_id), K(offset));
}
return ret;
}
void ObCLogWriter::destroy()
{
ObBaseLogWriter::destroy();
type_ = INVALID_WRITE_POOL;
file_writer_ = NULL;
log_cache_ = NULL;
info_getter_ = NULL;
tail_ = NULL;
is_disk_error_ = false;
is_started_ = false;
}
int ObCLogWriter::switch_file()
{
int ret = OB_SUCCESS;
if (OB_UNLIKELY(!is_started_)) {
ret = OB_NOT_INIT;
CLOG_LOG(WARN, "The ObCLogWriter has not been inited, ", K(ret));
} else {
lib::ObMutexGuard guard(file_mutex_);
if (OB_FAIL(inner_switch_file())) {
CLOG_LOG(WARN, "Fail to switch file, ", K(ret));
}
}
return ret;
}
file_id_t ObCLogWriter::get_file_id() const
{
return (NULL == file_writer_) ? 0 : file_writer_->get_cur_file_id();
}
bool ObCLogWriter::is_disk_error() const
{
bool b_ret = ATOMIC_LOAD(&is_disk_error_);
if (!b_ret && nullptr != file_writer_) {
b_ret = file_writer_->is_write_hang();
}
return b_ret;
}
int ObCLogWriter::set_is_disk_error()
{
int ret = OB_SUCCESS;
if (OB_UNLIKELY(!is_inited())) {
ret = OB_NOT_INIT;
} else {
ATOMIC_STORE(&is_disk_error_, true);
CLOG_LOG(ERROR, "clog disk is error!!!");
}
return ret;
}
int ObCLogWriter::reset_is_disk_error()
{
int ret = OB_SUCCESS;
if (OB_UNLIKELY(!is_inited())) {
ret = OB_NOT_INIT;
} else {
ATOMIC_STORE(&is_disk_error_, false);
CLOG_LOG(INFO, "reset clog disk status to normal");
}
return ret;
}
void ObCLogWriter::set_clog_writer_thread_name()
{
static __thread bool has_set_thread_name = false;
if (!has_set_thread_name) {
lib::set_thread_name("CLGWR");
has_set_thread_name = true;
}
}
void ObCLogWriter::process_log_items(common::ObIBaseLogItem** items, const int64_t item_cnt, int64_t& finish_cnt)
{
int ret = OB_SUCCESS;
ObICLogItem *item = NULL;
int64_t cur_time = 0;
int64_t io_time = 0;
int64_t flush_time = 0;
finish_cnt = 0;
ObLogBlockMetaV2 block_meta;
const int64_t block_meta_len = block_meta.get_serialize_size();
int64_t meta_pos = 0;
const bool is_disk_error = ATOMIC_LOAD(&is_disk_error_);
set_clog_writer_thread_name();
if (OB_UNLIKELY(!is_started_)) {
ret = OB_NOT_INIT;
CLOG_LOG(WARN, "The ObCLogWriter has not been started, ", K(ret));
} else if (OB_UNLIKELY(NULL == items) || OB_UNLIKELY(item_cnt <= 0) || OB_UNLIKELY(item_cnt > 1) ||
NULL == (item = reinterpret_cast<ObICLogItem*>(items[0])) || OB_UNLIKELY(!item->is_valid()) ||
OB_UNLIKELY(item->get_data_len() > OB_MAX_LOG_BUFFER_SIZE)) {
ret = OB_INVALID_ARGUMENT;
CLOG_LOG(WARN, "Invalid argument, ", K(ret), KP(items), K(item_cnt));
} else if (OB_UNLIKELY(is_disk_error)) {
ret = OB_STATE_NOT_MATCH;
CLOG_LOG(ERROR, "The ObCLogWriter met disk error and been frozen, ", K(ret), K(is_disk_error));
after_flush(item, block_meta_len, ret, file_writer_->get_cur_file_len(), finish_cnt);
} else {
const bool is_idempotent = false;
const uint64_t write_len = block_meta_len + item->get_data_len();
ObCLogDiskErrorCB *cb = NULL;
lib::ObMutexGuard guard(file_mutex_);
BG_NEW_CALLBACK(cb, ObCLogDiskErrorCB, this);
BG_MONITOR_GUARD(DEFAULT_CLOG_FLUSH_TIMEOUT_US, is_idempotent, cb);
// The timestamp value in block header must be generated by the time order, so
// call inner_switch_file first here.
if (need_switch_file(write_len) && OB_FAIL(inner_switch_file())) {
CLOG_LOG(ERROR, "Fail to switch file, ", K(ret), K(file_writer_->get_cur_file_id()));
} else if (OB_FAIL(block_meta.build_serialized_block(item->get_buf() - block_meta_len,
block_meta_len,
item->get_buf(),
item->get_data_len(),
OB_DATA_BLOCK,
meta_pos))) {
CLOG_LOG(ERROR, "build serialized block meta fail", K(ret));
}
// invoke callback when fail
if (OB_FAIL(ret)) {
after_flush(item, block_meta_len, ret, file_writer_->get_cur_file_len(), finish_cnt);
}
if (OB_SUCC(ret)) {
if (OB_FAIL(file_writer_->append_log_entry(item->get_buf() - block_meta_len, (uint32_t)write_len))) {
CLOG_LOG(ERROR, "fail to add log item to buf, ", K(ret));
}
}
int64_t flush_start_offset = -1;
if (OB_SUCC(ret)) {
cur_time = ObTimeUtility::current_time();
do {
// TODO: flush log will not return OB_TIMEOUT, other IO error will be treated as bug
if (OB_FAIL(file_writer_->flush(info_getter_, log_cache_, tail_, flush_start_offset))) {
set_is_disk_error();
// flush log to disk until die when IO hang, other IO error will be treated as bug
if (OB_TIMEOUT == ret && REACH_TIME_INTERVAL(60 * 1000 * 1000)) {
ret = OB_IO_ERROR;
CLOG_LOG(ERROR, "flush IO hang, retrying", K(ret));
} else {
CLOG_LOG(ERROR, "Fail to flush clog to disk, ", K(ret));
}
} else {
reset_is_disk_error();
}
} while (!has_stoped() && OB_TIMEOUT == ret);
}
if (OB_SUCC(ret)) {
io_time = ObTimeUtility::current_time() - cur_time;
//log flush succeed, invoke callback when disk sync
after_flush(item, block_meta_len, ret, flush_start_offset, finish_cnt);
flush_time = ObTimeUtility::current_time() - cur_time - io_time;
if (flush_time + io_time > 100 * 1000) {
CLOG_LOG(WARN,
"slow flush",
K(flush_time),
K(io_time),
"file_id",
file_writer_->get_cur_file_id(),
K(flush_start_offset),
"file_offset",
file_writer_->get_cur_file_len());
}
if (CLOG_WRITE_POOL == type_) {
EVENT_INC(CLOG_WRITE_COUNT);
EVENT_ADD(CLOG_WRITE_TIME, io_time);
} else if (ILOG_WRITE_POOL == type_) {
EVENT_INC(ILOG_WRITE_COUNT);
EVENT_ADD(ILOG_WRITE_TIME, io_time);
} else {
CLOG_LOG(ERROR, "unknown write pool type", K(type_));
}
}
}
if (OB_FAIL(ret)) {
CLOG_LOG(ERROR, "log writer write data error", K(ret));
if (!has_stoped()) {
on_fatal_error(ret);
}
}
if (REACH_TIME_INTERVAL(1000 * 1000)) {
CLOG_LOG(INFO, "ObCLogWriter get_queued_item_cnt", "queued_cnt", get_queued_item_cnt());
}
}
bool ObCLogWriter::need_switch_file(const uint64_t write_len) const
{
// Left space is not enough for data or info block
uint64_t max_switch_file_limit = 0;
if (CLOG_WRITE_POOL == type_) {
max_switch_file_limit = CLOG_MAX_SWITCH_FILE_LIMIT;
} else {
max_switch_file_limit = ILOG_MAX_SWITCH_FILE_LIMIT;
}
return !file_writer_->enough_file_space(write_len) || (info_getter_->get_entry_cnt() > max_switch_file_limit);
}
void ObCLogWriter::after_flush(ObICLogItem* item, const int64_t block_meta_len, const int err_code,
const uint32_t file_offset, int64_t& finish_cnt)
{
int ret = OB_SUCCESS;
uint32_t file_id = file_writer_->get_cur_file_id();
if (OB_SUCCESS == err_code) {
const common::ObAddr addr = GCTX.self_addr_;
if (OB_FAIL(log_cache_->append_data(addr,
(char*)(item->get_buf() - block_meta_len),
file_id,
file_offset,
item->get_data_len() + block_meta_len))) {
CLOG_LOG(ERROR, "Fail to append data to cache, ", K(ret));
}
tail_->advance(file_id, file_offset + (uint32_t)item->get_data_len() + (uint32_t)block_meta_len);
}
if (OB_FAIL(item->after_flushed(file_id, file_offset + (uint32_t)block_meta_len, err_code, type_))) {
CLOG_LOG(ERROR, "Fail to process item after flush, ", K(ret));
}
++finish_cnt;
}
int ObCLogWriter::inner_switch_file()
{
int ret = OB_SUCCESS;
const int64_t start = ObTimeUtility::current_time();
if (OB_FAIL(file_writer_->end_current_file(info_getter_, log_cache_, tail_))) {
CLOG_LOG(ERROR, "switch file failed", K(ret));
}
// keep try open new file when too many open files, other IO error will be treated as bug
if (OB_SUCC(ret)) {
do {
if (OB_FAIL(file_writer_->create_next_file())) {
is_disk_error_ = true;
if (OB_FILE_NOT_OPENED == ret) {
if (REACH_TIME_INTERVAL(60 * 1000 * 1000)) {
CLOG_LOG(ERROR, "Too many open files, retrying", K(ret));
}
sleep(30);
} else {
CLOG_LOG(ERROR, "Fail to switch file, ", K(ret), K(file_writer_->get_cur_file_id()));
}
} else {
is_disk_error_ = false;
}
} while (!has_stoped() && OB_FILE_NOT_OPENED == ret);
}
if (OB_SUCC(ret)) {
if (CLOG_WRITE_POOL == type_) {
EVENT_ADD(CLOG_FILE_TOTAL_SIZE, CLOG_FILE_SIZE);
} else if (ILOG_WRITE_POOL == type_) {
EVENT_ADD(ILOG_FILE_TOTAL_SIZE, CLOG_FILE_SIZE);
}
const int64_t end = ObTimeUtility::current_time();
CLOG_LOG(INFO, "switch file success", K(*tail_), "new_file", file_writer_->get_cur_file_id(), "used", end - start);
} else {
CLOG_LOG(ERROR, "switch file error", K(ret), "new_file", file_writer_->get_cur_file_id());
}
return ret;
}
ObCLogDiskErrorCB::ObCLogDiskErrorCB(ObCLogWriter* host) : host_(host)
{}
ObCLogDiskErrorCB::~ObCLogDiskErrorCB()
{}
int ObCLogDiskErrorCB::callback()
{
int ret = OB_SUCCESS;
storage::ObPartitionService& partition_service = storage::ObPartitionService::get_instance();
if (OB_ISNULL(host_)) {
ret = OB_ERR_UNEXPECTED;
} else if (OB_FAIL(host_->set_is_disk_error())) {
CLOG_LOG(ERROR, "ObCLogDiskErrorCB set_is_disk_error failed", K(ret));
} else if (OB_FAIL(partition_service.try_revoke_all_leader(ObElection::RevokeType::CLOG_DISK_HANG))) {
CLOG_LOG(ERROR, "ObCLogDiskErrorCB try_revoke_all_leader failed", K(ret));
}
CLOG_LOG(INFO, "call leader revoke because of clog disk hang", K(ret));
return ret;
}
void ObCLogDiskErrorCB::destroy()
{
// If disk has real error, may cause is_disk_error be false,
// however, observer will be killed.
if (OB_NOT_NULL(host_)) {
(void)host_->reset_is_disk_error();
}
}
int locate_clog_tail(
const int64_t timeout, ObILogFileStore* file_store, ObLogDirectReader* reader, file_id_t& file_id, offset_t& offset)
{
ObLogFileTailLocatorImpl<ObLogEntry, ObIRawLogIterator> impl;
return impl.locate_tail(timeout, file_store, reader, file_id, offset);
}
int locate_ilog_tail(
const int64_t timeout, ObILogFileStore* file_store, ObLogDirectReader* reader, file_id_t& file_id, offset_t& offset)
{
ObLogFileTailLocatorImpl<ObIndexEntry, ObIRawIndexIterator> impl;
return impl.locate_tail(timeout, file_store, reader, file_id, offset);
}
} // namespace clog
} // namespace oceanbase