Fixed log disk usage exceeding 100% because block recycling hangs.

This commit is contained in:
HaHaJeff
2023-12-15 04:43:16 +00:00
committed by ant-ob-hengtang
parent a17991664f
commit 7ed5eaeb8f
4 changed files with 76 additions and 27 deletions

View File

@ -77,8 +77,8 @@ TEST_F(TestObSimpleLogDiskMgr, out_of_disk_space)
share::SCN create_scn = share::SCN::base_scn();
EXPECT_EQ(OB_SUCCESS, get_palf_env(server_idx, palf_env));
EXPECT_EQ(OB_SUCCESS, create_paxos_group(id, create_scn, leader_idx, leader));
update_disk_options(leader_idx, MIN_DISK_SIZE_PER_PALF_INSTANCE/PALF_PHY_BLOCK_SIZE);
EXPECT_EQ(OB_SUCCESS, submit_log(leader, 6*31+1, id, MAX_LOG_BODY_SIZE));
update_disk_options(leader_idx, MIN_DISK_SIZE_PER_PALF_INSTANCE/PALF_PHY_BLOCK_SIZE + 2);
EXPECT_EQ(OB_SUCCESS, submit_log(leader, 8*31+1, id, MAX_LOG_BODY_SIZE));
LogStorage *log_storage = &leader.palf_handle_impl_->log_engine_.log_storage_;
while (LSN(6*PALF_BLOCK_SIZE) > log_storage->log_tail_) {
usleep(500);
@ -92,6 +92,9 @@ TEST_F(TestObSimpleLogDiskMgr, out_of_disk_space)
PALF_LOG(INFO, "out of disk max_lsn", K(max_lsn));
usleep(palf::BlockGCTimerTask::BLOCK_GC_TIMER_INTERVAL_MS + 5*10000);
EXPECT_EQ(OB_LOG_OUTOF_DISK_SPACE, submit_log(leader, 1, id, MAX_LOG_BODY_SIZE));
// shrinking 后继续停写
update_disk_options(leader_idx, MIN_DISK_SIZE_PER_PALF_INSTANCE/PALF_PHY_BLOCK_SIZE);
EXPECT_EQ(OB_LOG_OUTOF_DISK_SPACE, submit_log(leader, 1, id, MAX_LOG_BODY_SIZE));
usleep(ObLooper::INTERVAL_US*2);
}

View File

@ -142,6 +142,8 @@ void LogLoopThread::log_loop_()
PALF_LOG_RET(WARN, tmp_ret, "for_each try_freeze_log_func failed", K(tmp_ret));
}
palf_env_impl_->period_calc_disk_usage();
const int64_t round_cost_time = ObTimeUtility::current_time() - start_ts;
int32_t sleep_ts = run_interval_ - static_cast<const int32_t>(round_cost_time);
if (sleep_ts < 0) {

View File

@ -186,7 +186,8 @@ PalfEnvImpl::PalfEnvImpl() : palf_meta_lock_(common::ObLatchIds::PALF_ENV_LOCK),
log_updater_(),
monitor_(NULL),
disk_options_wrapper_(),
disk_not_enough_print_interval_(OB_INVALID_TIMESTAMP),
disk_not_enough_print_interval_in_gc_thread_(OB_INVALID_TIMESTAMP),
disk_not_enough_print_interval_in_loop_thread_(OB_INVALID_TIMESTAMP),
self_(),
palf_handle_impl_map_(64), // 指定min_size=64
last_palf_epoch_(0),
@ -350,7 +351,8 @@ void PalfEnvImpl::destroy()
election_timer_.destroy();
log_alloc_mgr_ = NULL;
monitor_ = NULL;
disk_not_enough_print_interval_ = OB_INVALID_TIMESTAMP;
disk_not_enough_print_interval_in_gc_thread_ = OB_INVALID_TIMESTAMP;
disk_not_enough_print_interval_in_loop_thread_ = OB_INVALID_TIMESTAMP;
self_.reset();
log_dir_[0] = '\0';
tmp_log_dir_[0] = '\0';
@ -715,11 +717,6 @@ int PalfEnvImpl::try_recycle_blocks()
const bool need_recycle =
usable_disk_size_to_recycle_blocks >= total_used_size_byte ? false : true;
const bool is_shrinking = disk_options_wrapper_.is_shrinking();
// Assume that, recycle speed is higher than write speed, therefor, the abnormal case
// is that, after each 'recycle_blocks_', the 'total_used_size_byte' is one PALF_BLOCK_SIZE
// more than 'usable_disk_size'.
const bool curr_diskspace_enough =
usable_disk_limit_size_to_stop_writing >= total_used_size_byte ? true : false;
constexpr int64_t MB = 1024 * 1024LL;
const int64_t print_error_log_disk_size =
disk_opts_for_stopping_writing.log_disk_usage_limit_size_
@ -732,7 +729,8 @@ int PalfEnvImpl::try_recycle_blocks()
// 2. the snapshot of status is SHRINKING_STATUS.
bool has_recycled = false;
int64_t oldest_palf_id = INVALID_PALF_ID;
if (OB_SUCC(ret) && PalfDiskOptionsWrapper::Status::SHRINKING_STATUS == status) {
const bool in_shrinking = (PalfDiskOptionsWrapper::Status::SHRINKING_STATUS == status);
if (OB_SUCC(ret) && in_shrinking) {
if (total_used_size_byte <= usable_disk_size_to_recycle_blocks) {
disk_options_wrapper_.change_to_normal(sequence);
PALF_LOG(INFO, "change_to_normal success", K(disk_options_wrapper_),
@ -749,31 +747,32 @@ int PalfEnvImpl::try_recycle_blocks()
}
}
// step3. reset diskspace_enough_.
if (diskspace_enough_ != curr_diskspace_enough) {
ATOMIC_STORE(&diskspace_enough_, curr_diskspace_enough);
}
// step3. try print error log
// NB: print error log when:
// 1. write-stop.
// 2. the used log disk space exceeded the log disk recycle threshold(stop-write PalfDiskOptions) and there is no recycable block.
if ((false == diskspace_enough_) || (true == need_print_error_log && false == has_recycled)) {
// 1. write-stop.(i.e. set 'diskspace_enough_' to true when the disk usage execeed than the 'log_disk_throttling_percentage_' in disk_opts_for_stopping_writing);
// 2. the used log disk space exceeded the log disk recycle threshold and there is no recycable block(in shrinking log disk status, disk_opts_for_stopping_writing is not
// same with disk_opts_for_recycling_blocks).
if (!check_disk_space_enough() || (true == need_print_error_log && false == has_recycled)) {
constexpr int64_t INTERVAL = 1*1000*1000;
if (palf_reach_time_interval(INTERVAL, disk_not_enough_print_interval_)) {
if (palf_reach_time_interval(INTERVAL, disk_not_enough_print_interval_in_gc_thread_)) {
int tmp_ret = OB_LOG_OUTOF_DISK_SPACE;
const int64_t log_disk_usage_limit_size = disk_opts_for_stopping_writing.log_disk_usage_limit_size_;
const int64_t log_disk_warn_percent = disk_opts_for_stopping_writing.log_disk_utilization_threshold_;
const int64_t log_disk_limit_percent = disk_opts_for_stopping_writing.log_disk_utilization_limit_threshold_;
LOG_DBA_ERROR(OB_LOG_OUTOF_DISK_SPACE, "msg", "log disk space is almost full", "ret", tmp_ret,
"total_size(MB)", disk_opts_for_recycling_blocks.log_disk_usage_limit_size_/MB,
"total_size(MB)", log_disk_usage_limit_size/MB,
"used_size(MB)", total_used_size_byte/MB,
"used_percent(%)", (total_used_size_byte* 100) / (disk_opts_for_stopping_writing.log_disk_usage_limit_size_ + 1),
"warn_size(MB)", (total_size_to_recycle_blocks*disk_opts_for_recycling_blocks.log_disk_utilization_threshold_)/100/MB,
"warn_percent(%)", disk_opts_for_recycling_blocks.log_disk_utilization_threshold_,
"limit_size(MB)", (total_size_to_recycle_blocks*disk_opts_for_recycling_blocks.log_disk_utilization_limit_threshold_)/100/MB,
"limit_percent(%)", disk_opts_for_recycling_blocks.log_disk_utilization_limit_threshold_,
"used_percent(%)", (total_used_size_byte*100) / (log_disk_usage_limit_size+1),
"warn_size(MB)", (log_disk_usage_limit_size*log_disk_warn_percent)/100/MB,
"warn_percent(%)", log_disk_warn_percent,
"limit_size(MB)", (log_disk_usage_limit_size*log_disk_limit_percent)/100/MB,
"limit_percent(%)", log_disk_limit_percent,
"total_unrecyclable_size_byte(MB)", total_unrecyclable_size_byte/MB,
"maximum_used_size(MB)", maximum_used_size/MB,
"maximum_log_stream", palf_id,
"oldest_log_stream", oldest_palf_id,
"oldest_scn", oldest_scn);
"oldest_scn", oldest_scn,
"in_shrinking", in_shrinking);
}
} else {
if (REACH_TIME_INTERVAL(2 * 1000 * 1000L)) {
@ -1302,6 +1301,48 @@ int PalfEnvImpl::get_throttling_options(PalfThrottleOptions &options)
return ret;
}
void PalfEnvImpl::period_calc_disk_usage()
{
int ret = OB_SUCCESS;
constexpr int64_t MB = 1024 * 1024;
PalfDiskOptions disk_options = disk_options_wrapper_.get_disk_opts_for_stopping_writing();
int64_t used_size_byte = 0;
int64_t total_usable_size_byte = 0;
if (OB_FAIL(get_disk_usage_(used_size_byte))) {
PALF_LOG(WARN, "get_disk_usage_ failed", K(ret));
} else {
const int64_t log_disk_usage_limit_size = disk_options.log_disk_usage_limit_size_;
const int64_t log_disk_limit_percent = disk_options.log_disk_utilization_limit_threshold_;
const int64_t log_disk_warn_percent = disk_options.log_disk_utilization_threshold_;
const int64_t usable_disk_limit_size_to_stop_writing =
log_disk_usage_limit_size * log_disk_limit_percent / 100LL;
const bool curr_diskspace_enough =
usable_disk_limit_size_to_stop_writing >= used_size_byte ? true : false;
const int64_t warn_siz =
log_disk_usage_limit_size * log_disk_warn_percent / 100LL;
if (diskspace_enough_ != curr_diskspace_enough) {
ATOMIC_STORE(&diskspace_enough_, curr_diskspace_enough);
}
// NB: print error log when:
// 1. write-stop.
if (!curr_diskspace_enough) {
constexpr int64_t INTERVAL = 1*1000*1000;
if (palf_reach_time_interval(INTERVAL, disk_not_enough_print_interval_in_loop_thread_)) {
int tmp_ret = OB_LOG_OUTOF_DISK_SPACE;
LOG_DBA_ERROR(OB_LOG_OUTOF_DISK_SPACE, "msg", "log disk space is almost full", "ret", tmp_ret,
"total_size(MB)", log_disk_usage_limit_size/MB,
"used_size(MB)", used_size_byte/MB,
"used_percent(%)", (used_size_byte*100) / (log_disk_usage_limit_size + 1),
"warn_size(MB)", warn_siz/MB,
"warn_percent(%)", log_disk_warn_percent,
"limit_size(MB)", usable_disk_limit_size_to_stop_writing/MB,
"limit_percent(%)", log_disk_limit_percent);
}
}
}
}
int PalfEnvImpl::init_log_io_worker_config_(const int log_writer_parallelism,
const int64_t tenant_id,
LogIOWorkerConfig &config)

View File

@ -202,6 +202,7 @@ public:
// should be removed in version 4.2.0.0
virtual int update_replayable_point(const SCN &replayable_scn) = 0;
virtual int get_throttling_options(PalfThrottleOptions &option) = 0;
virtual void period_calc_disk_usage() = 0;
VIRTUAL_TO_STRING_KV("IPalfEnvImpl", "Dummy");
};
@ -272,6 +273,7 @@ public:
int64_t get_tenant_id() override final;
int update_replayable_point(const SCN &replayable_scn) override final;
int get_throttling_options(PalfThrottleOptions &option);
void period_calc_disk_usage() override final;
INHERIT_TO_STRING_KV("IPalfEnvImpl", IPalfEnvImpl, K_(self), K_(log_dir), K_(disk_options_wrapper),
KPC(log_alloc_mgr_));
// =================== disk space management ==================
@ -371,7 +373,8 @@ private:
PalfMonitorCb *monitor_;
PalfDiskOptionsWrapper disk_options_wrapper_;
int64_t disk_not_enough_print_interval_;
int64_t disk_not_enough_print_interval_in_gc_thread_;
int64_t disk_not_enough_print_interval_in_loop_thread_;
char log_dir_[common::MAX_PATH_SIZE];
char tmp_log_dir_[common::MAX_PATH_SIZE];