add parameter log_storage_warning_tolerance_time
This commit is contained in:
@ -159,8 +159,8 @@ void ObFailureDetector::detect_failure()
|
||||
LC_TIME_GUARD(1_s);
|
||||
// clog disk hang check
|
||||
detect_palf_hang_failure_();
|
||||
// slog writter hang check
|
||||
detect_slog_writter_hang_failure_();
|
||||
// slog writer hang check
|
||||
detect_slog_writer_hang_failure_();
|
||||
// sstable hang check
|
||||
detect_sstable_io_failure_();
|
||||
// clog disk full check
|
||||
@ -332,7 +332,7 @@ void ObFailureDetector::detect_palf_hang_failure_()
|
||||
} else if (OB_FAIL(log_service->get_io_start_time(clog_disk_last_working_time))) {
|
||||
COORDINATOR_LOG(WARN, "get_io_start_time failed", K(ret));
|
||||
} else if (FALSE_IT(is_clog_disk_hang = (OB_INVALID_TIMESTAMP != clog_disk_last_working_time
|
||||
&& now - clog_disk_last_working_time > IO_HANG_TIME_THRESHOLD_US))) {
|
||||
&& now - clog_disk_last_working_time > GCONF.log_storage_warning_tolerance_time))) {
|
||||
} else if (false == ATOMIC_LOAD(&has_add_clog_hang_event_)) {
|
||||
if (!is_clog_disk_hang) {
|
||||
// log disk does not hang, skip.
|
||||
@ -340,8 +340,8 @@ void ObFailureDetector::detect_palf_hang_failure_()
|
||||
COORDINATOR_LOG(ERROR, "add_failure_event failed", K(ret), K(clog_disk_hang_event));
|
||||
} else {
|
||||
ATOMIC_SET(&has_add_clog_hang_event_, true);
|
||||
COORDINATOR_LOG(WARN, "clog disk may be hang, add failure event", K(ret), K(clog_disk_hang_event),
|
||||
K(clog_disk_last_working_time), "hang time", now - clog_disk_last_working_time);
|
||||
LOG_DBA_ERROR(OB_DISK_CORRUPTED, "msg", "clog disk may be hung, add failure event", K(clog_disk_hang_event),
|
||||
K(clog_disk_last_working_time), "hung time", now - clog_disk_last_working_time);
|
||||
}
|
||||
} else {
|
||||
if (is_clog_disk_hang) {
|
||||
@ -355,38 +355,38 @@ void ObFailureDetector::detect_palf_hang_failure_()
|
||||
}
|
||||
}
|
||||
|
||||
void ObFailureDetector::detect_slog_writter_hang_failure_()
|
||||
void ObFailureDetector::detect_slog_writer_hang_failure_()
|
||||
{
|
||||
LC_TIME_GUARD(1_s);
|
||||
int ret = OB_SUCCESS;
|
||||
bool is_slog_writter_hang = false;
|
||||
int64_t slog_writter_last_working_time = OB_INVALID_TIMESTAMP;
|
||||
bool is_slog_writer_hang = false;
|
||||
int64_t slog_writer_last_working_time = OB_INVALID_TIMESTAMP;
|
||||
const int64_t now = ObTimeUtility::current_time();
|
||||
ObStorageLogger *storage_logger = MTL(ObStorageLogger*);
|
||||
FailureEvent slog_writter_hang_event(FailureType::PROCESS_HANG, FailureModule::STORAGE, FailureLevel::FATAL);
|
||||
if (OB_FAIL(slog_writter_hang_event.set_info("slog writter hang event"))) {
|
||||
COORDINATOR_LOG(ERROR, "slog_writter_hang_event set_info failed", K(ret));
|
||||
} else if (FALSE_IT(slog_writter_last_working_time = storage_logger->get_pwrite_ts())) {
|
||||
} else if (FALSE_IT(is_slog_writter_hang = (0 != slog_writter_last_working_time
|
||||
&& now - slog_writter_last_working_time > GCONF.data_storage_warning_tolerance_time))) {
|
||||
FailureEvent slog_writer_hang_event(FailureType::PROCESS_HANG, FailureModule::STORAGE, FailureLevel::FATAL);
|
||||
if (OB_FAIL(slog_writer_hang_event.set_info("slog writer hang event"))) {
|
||||
COORDINATOR_LOG(ERROR, "slog_writer_hang_event set_info failed", K(ret));
|
||||
} else if (FALSE_IT(slog_writer_last_working_time = storage_logger->get_pwrite_ts())) {
|
||||
} else if (FALSE_IT(is_slog_writer_hang = (0 != slog_writer_last_working_time
|
||||
&& now - slog_writer_last_working_time > GCONF.data_storage_warning_tolerance_time))) {
|
||||
} else if (false == ATOMIC_LOAD(&has_add_slog_hang_event_)) {
|
||||
if (!is_slog_writter_hang) {
|
||||
// slog writter is normal, skip.
|
||||
} else if (OB_FAIL(add_failure_event(slog_writter_hang_event))) {
|
||||
COORDINATOR_LOG(ERROR, "add_failure_event failed", K(ret), K(slog_writter_hang_event));
|
||||
if (!is_slog_writer_hang) {
|
||||
// slog writer is normal, skip.
|
||||
} else if (OB_FAIL(add_failure_event(slog_writer_hang_event))) {
|
||||
COORDINATOR_LOG(ERROR, "add_failure_event failed", K(ret), K(slog_writer_hang_event));
|
||||
} else {
|
||||
ATOMIC_SET(&has_add_slog_hang_event_, true);
|
||||
COORDINATOR_LOG(WARN, "slog writter may be hang, add failure event", K(ret), K(slog_writter_hang_event),
|
||||
K(slog_writter_last_working_time), "hang time", now - slog_writter_last_working_time);
|
||||
LOG_DBA_ERROR(OB_DISK_CORRUPTED, "msg", "slog writer may be hung, add failure event", K(slog_writer_hang_event),
|
||||
K(slog_writer_last_working_time), "hung time", now - slog_writer_last_working_time);
|
||||
}
|
||||
} else {
|
||||
if (is_slog_writter_hang) {
|
||||
// slog writter still hangs, cannot remove failure_event.
|
||||
} else if (OB_FAIL(remove_failure_event(slog_writter_hang_event))) {
|
||||
COORDINATOR_LOG(ERROR, "remove_failure_event failed", K(ret), K(slog_writter_hang_event));
|
||||
if (is_slog_writer_hang) {
|
||||
// slog writer still hangs, cannot remove failure_event.
|
||||
} else if (OB_FAIL(remove_failure_event(slog_writer_hang_event))) {
|
||||
COORDINATOR_LOG(ERROR, "remove_failure_event failed", K(ret), K(slog_writer_hang_event));
|
||||
} else {
|
||||
ATOMIC_SET(&has_add_slog_hang_event_, false);
|
||||
COORDINATOR_LOG(INFO, "slog writter has recoverd, remove failure event", K(ret), K(slog_writter_hang_event));
|
||||
COORDINATOR_LOG(INFO, "slog writer has recoverd, remove failure event", K(ret), K(slog_writer_hang_event));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -412,8 +412,8 @@ void ObFailureDetector::detect_sstable_io_failure_()
|
||||
COORDINATOR_LOG(ERROR, "add_failure_event failed", K(ret), K(sstable_io_hang_event));
|
||||
} else {
|
||||
ATOMIC_SET(&has_add_sstable_hang_event_, true);
|
||||
COORDINATOR_LOG(WARN, "data disk may be hang, add failure event", K(ret), K(sstable_io_hang_event),
|
||||
K(data_disk_error_start_ts));
|
||||
LOG_DBA_ERROR(OB_DISK_CORRUPTED, "msg", "data disk may be hung, add failure event", K(sstable_io_hang_event),
|
||||
K(data_disk_error_start_ts));
|
||||
}
|
||||
} else {
|
||||
if (ObDeviceHealthStatus::DEVICE_HEALTH_NORMAL != data_disk_status) {
|
||||
@ -446,8 +446,8 @@ void ObFailureDetector::detect_palf_disk_full_()
|
||||
COORDINATOR_LOG(ERROR, "add_failure_event failed", K(ret), K(clog_disk_full_event));
|
||||
} else {
|
||||
ATOMIC_SET(&has_add_clog_full_event_, true);
|
||||
COORDINATOR_LOG(WARN, "clog disk is full, add failure event", K(ret), K(clog_disk_full_event),
|
||||
K(now));
|
||||
LOG_DBA_ERROR(OB_LOG_OUTOF_DISK_SPACE, "msg", "clog disk is almost full, add failure event",
|
||||
K(clog_disk_full_event), K(now));
|
||||
}
|
||||
} else {
|
||||
if (!is_disk_enough) {
|
||||
|
||||
@ -97,7 +97,7 @@ private:
|
||||
bool check_is_running_() const { return is_running_; }
|
||||
int insert_event_to_table_(const FailureEvent &event, const ObFunction<bool()> &recover_operation, ObString info);
|
||||
void detect_palf_hang_failure_();
|
||||
void detect_slog_writter_hang_failure_();
|
||||
void detect_slog_writer_hang_failure_();
|
||||
void detect_sstable_io_failure_();
|
||||
void detect_palf_disk_full_();
|
||||
private:
|
||||
@ -108,9 +108,6 @@ private:
|
||||
ObFunction<bool()> recover_detect_operation_;
|
||||
TO_STRING_KV(K_(event));
|
||||
};
|
||||
// default time threshold for clog disk io hang detection is 5s
|
||||
// TODO: this value should be a configuration
|
||||
static const int64_t IO_HANG_TIME_THRESHOLD_US = 5 * 1000 * 1000;
|
||||
bool is_running_;
|
||||
common::ObArray<FailureEventWithRecoverOp> events_with_ops_;
|
||||
common::ObArray<common::ObAddr> tenant_server_list_;
|
||||
|
||||
@ -552,6 +552,12 @@ DEF_INT(log_disk_utilization_threshold, OB_TENANT_PARAMETER,"80",
|
||||
"Range: [10, 100)",
|
||||
ObParameterAttr(Section::LOGSERVICE, Source::DEFAULT, EditLevel::DYNAMIC_EFFECTIVE));
|
||||
|
||||
DEF_TIME(log_storage_warning_tolerance_time, OB_CLUSTER_PARAMETER, "5s",
|
||||
"[1s,300s]",
|
||||
"time to tolerate log disk io delay, after that, the disk status will be set warning. "
|
||||
"Range: [1s,300s]",
|
||||
ObParameterAttr(Section::LOGSERVICE, Source::DEFAULT, EditLevel::DYNAMIC_EFFECTIVE));
|
||||
|
||||
// ========================= LogService Config End =====================
|
||||
DEF_INT(resource_hard_limit, OB_CLUSTER_PARAMETER, "100", "[100, 10000]",
|
||||
"system utilization should not be large than resource_hard_limit",
|
||||
|
||||
Reference in New Issue
Block a user