add parameter log_storage_warning_tolerance_time

This commit is contained in:
obdev
2023-02-09 14:21:07 +00:00
committed by ob-robot
parent 1b735c5f41
commit 3f8bc6dd73
4 changed files with 37 additions and 33 deletions

View File

@ -159,8 +159,8 @@ void ObFailureDetector::detect_failure()
LC_TIME_GUARD(1_s);
// clog disk hang check
detect_palf_hang_failure_();
// slog writter hang check
detect_slog_writter_hang_failure_();
// slog writer hang check
detect_slog_writer_hang_failure_();
// sstable hang check
detect_sstable_io_failure_();
// clog disk full check
@ -332,7 +332,7 @@ void ObFailureDetector::detect_palf_hang_failure_()
} else if (OB_FAIL(log_service->get_io_start_time(clog_disk_last_working_time))) {
COORDINATOR_LOG(WARN, "get_io_start_time failed", K(ret));
} else if (FALSE_IT(is_clog_disk_hang = (OB_INVALID_TIMESTAMP != clog_disk_last_working_time
&& now - clog_disk_last_working_time > IO_HANG_TIME_THRESHOLD_US))) {
&& now - clog_disk_last_working_time > GCONF.log_storage_warning_tolerance_time))) {
} else if (false == ATOMIC_LOAD(&has_add_clog_hang_event_)) {
if (!is_clog_disk_hang) {
// log disk does not hang, skip.
@ -340,8 +340,8 @@ void ObFailureDetector::detect_palf_hang_failure_()
COORDINATOR_LOG(ERROR, "add_failure_event failed", K(ret), K(clog_disk_hang_event));
} else {
ATOMIC_SET(&has_add_clog_hang_event_, true);
COORDINATOR_LOG(WARN, "clog disk may be hang, add failure event", K(ret), K(clog_disk_hang_event),
K(clog_disk_last_working_time), "hang time", now - clog_disk_last_working_time);
LOG_DBA_ERROR(OB_DISK_CORRUPTED, "msg", "clog disk may be hung, add failure event", K(clog_disk_hang_event),
K(clog_disk_last_working_time), "hung time", now - clog_disk_last_working_time);
}
} else {
if (is_clog_disk_hang) {
@ -355,38 +355,38 @@ void ObFailureDetector::detect_palf_hang_failure_()
}
}
void ObFailureDetector::detect_slog_writter_hang_failure_()
void ObFailureDetector::detect_slog_writer_hang_failure_()
{
LC_TIME_GUARD(1_s);
int ret = OB_SUCCESS;
bool is_slog_writter_hang = false;
int64_t slog_writter_last_working_time = OB_INVALID_TIMESTAMP;
bool is_slog_writer_hang = false;
int64_t slog_writer_last_working_time = OB_INVALID_TIMESTAMP;
const int64_t now = ObTimeUtility::current_time();
ObStorageLogger *storage_logger = MTL(ObStorageLogger*);
FailureEvent slog_writter_hang_event(FailureType::PROCESS_HANG, FailureModule::STORAGE, FailureLevel::FATAL);
if (OB_FAIL(slog_writter_hang_event.set_info("slog writter hang event"))) {
COORDINATOR_LOG(ERROR, "slog_writter_hang_event set_info failed", K(ret));
} else if (FALSE_IT(slog_writter_last_working_time = storage_logger->get_pwrite_ts())) {
} else if (FALSE_IT(is_slog_writter_hang = (0 != slog_writter_last_working_time
&& now - slog_writter_last_working_time > GCONF.data_storage_warning_tolerance_time))) {
FailureEvent slog_writer_hang_event(FailureType::PROCESS_HANG, FailureModule::STORAGE, FailureLevel::FATAL);
if (OB_FAIL(slog_writer_hang_event.set_info("slog writer hang event"))) {
COORDINATOR_LOG(ERROR, "slog_writer_hang_event set_info failed", K(ret));
} else if (FALSE_IT(slog_writer_last_working_time = storage_logger->get_pwrite_ts())) {
} else if (FALSE_IT(is_slog_writer_hang = (0 != slog_writer_last_working_time
&& now - slog_writer_last_working_time > GCONF.data_storage_warning_tolerance_time))) {
} else if (false == ATOMIC_LOAD(&has_add_slog_hang_event_)) {
if (!is_slog_writter_hang) {
// slog writter is normal, skip.
} else if (OB_FAIL(add_failure_event(slog_writter_hang_event))) {
COORDINATOR_LOG(ERROR, "add_failure_event failed", K(ret), K(slog_writter_hang_event));
if (!is_slog_writer_hang) {
// slog writer is normal, skip.
} else if (OB_FAIL(add_failure_event(slog_writer_hang_event))) {
COORDINATOR_LOG(ERROR, "add_failure_event failed", K(ret), K(slog_writer_hang_event));
} else {
ATOMIC_SET(&has_add_slog_hang_event_, true);
COORDINATOR_LOG(WARN, "slog writter may be hang, add failure event", K(ret), K(slog_writter_hang_event),
K(slog_writter_last_working_time), "hang time", now - slog_writter_last_working_time);
LOG_DBA_ERROR(OB_DISK_CORRUPTED, "msg", "slog writer may be hung, add failure event", K(slog_writer_hang_event),
K(slog_writer_last_working_time), "hung time", now - slog_writer_last_working_time);
}
} else {
if (is_slog_writter_hang) {
// slog writter still hangs, cannot remove failure_event.
} else if (OB_FAIL(remove_failure_event(slog_writter_hang_event))) {
COORDINATOR_LOG(ERROR, "remove_failure_event failed", K(ret), K(slog_writter_hang_event));
if (is_slog_writer_hang) {
// slog writer still hangs, cannot remove failure_event.
} else if (OB_FAIL(remove_failure_event(slog_writer_hang_event))) {
COORDINATOR_LOG(ERROR, "remove_failure_event failed", K(ret), K(slog_writer_hang_event));
} else {
ATOMIC_SET(&has_add_slog_hang_event_, false);
COORDINATOR_LOG(INFO, "slog writter has recoverd, remove failure event", K(ret), K(slog_writter_hang_event));
COORDINATOR_LOG(INFO, "slog writer has recoverd, remove failure event", K(ret), K(slog_writer_hang_event));
}
}
}
@ -412,8 +412,8 @@ void ObFailureDetector::detect_sstable_io_failure_()
COORDINATOR_LOG(ERROR, "add_failure_event failed", K(ret), K(sstable_io_hang_event));
} else {
ATOMIC_SET(&has_add_sstable_hang_event_, true);
COORDINATOR_LOG(WARN, "data disk may be hang, add failure event", K(ret), K(sstable_io_hang_event),
K(data_disk_error_start_ts));
LOG_DBA_ERROR(OB_DISK_CORRUPTED, "msg", "data disk may be hung, add failure event", K(sstable_io_hang_event),
K(data_disk_error_start_ts));
}
} else {
if (ObDeviceHealthStatus::DEVICE_HEALTH_NORMAL != data_disk_status) {
@ -446,8 +446,8 @@ void ObFailureDetector::detect_palf_disk_full_()
COORDINATOR_LOG(ERROR, "add_failure_event failed", K(ret), K(clog_disk_full_event));
} else {
ATOMIC_SET(&has_add_clog_full_event_, true);
COORDINATOR_LOG(WARN, "clog disk is full, add failure event", K(ret), K(clog_disk_full_event),
K(now));
LOG_DBA_ERROR(OB_LOG_OUTOF_DISK_SPACE, "msg", "clog disk is almost full, add failure event",
K(clog_disk_full_event), K(now));
}
} else {
if (!is_disk_enough) {

View File

@ -97,7 +97,7 @@ private:
bool check_is_running_() const { return is_running_; }
int insert_event_to_table_(const FailureEvent &event, const ObFunction<bool()> &recover_operation, ObString info);
void detect_palf_hang_failure_();
void detect_slog_writter_hang_failure_();
void detect_slog_writer_hang_failure_();
void detect_sstable_io_failure_();
void detect_palf_disk_full_();
private:
@ -108,9 +108,6 @@ private:
ObFunction<bool()> recover_detect_operation_;
TO_STRING_KV(K_(event));
};
// default time threshold for clog disk io hang detection is 5s
// TODO: this value should be a configuration
static const int64_t IO_HANG_TIME_THRESHOLD_US = 5 * 1000 * 1000;
bool is_running_;
common::ObArray<FailureEventWithRecoverOp> events_with_ops_;
common::ObArray<common::ObAddr> tenant_server_list_;

View File

@ -552,6 +552,12 @@ DEF_INT(log_disk_utilization_threshold, OB_TENANT_PARAMETER,"80",
"Range: [10, 100)",
ObParameterAttr(Section::LOGSERVICE, Source::DEFAULT, EditLevel::DYNAMIC_EFFECTIVE));
DEF_TIME(log_storage_warning_tolerance_time, OB_CLUSTER_PARAMETER, "5s",
"[1s,300s]",
"time to tolerate log disk io delay, after that, the disk status will be set warning. "
"Range: [1s,300s]",
ObParameterAttr(Section::LOGSERVICE, Source::DEFAULT, EditLevel::DYNAMIC_EFFECTIVE));
// ========================= LogService Config End =====================
DEF_INT(resource_hard_limit, OB_CLUSTER_PARAMETER, "100", "[100, 10000]",
"system utilization should not be large than resource_hard_limit",

View File

@ -119,6 +119,7 @@ log_disk_size
log_disk_utilization_limit_threshold
log_disk_utilization_threshold
log_restore_concurrency
log_storage_warning_tolerance_time
log_transport_compress_all
log_transport_compress_func
ls_meta_table_check_interval