From 3f8bc6dd73c3910f14ebd256e22f0a95dc9e55bf Mon Sep 17 00:00:00 2001 From: obdev Date: Thu, 9 Feb 2023 14:21:07 +0000 Subject: [PATCH] add parameter log_storage_warning_tolerance_time --- .../ob_failure_detector.cpp | 58 +++++++++---------- .../leader_coordinator/ob_failure_detector.h | 5 +- src/share/parameter/ob_parameter_seed.ipp | 6 ++ .../all_virtual_sys_parameter_stat.result | 1 + 4 files changed, 37 insertions(+), 33 deletions(-) diff --git a/src/logservice/leader_coordinator/ob_failure_detector.cpp b/src/logservice/leader_coordinator/ob_failure_detector.cpp index 39c336f778..75dd489ae5 100644 --- a/src/logservice/leader_coordinator/ob_failure_detector.cpp +++ b/src/logservice/leader_coordinator/ob_failure_detector.cpp @@ -159,8 +159,8 @@ void ObFailureDetector::detect_failure() LC_TIME_GUARD(1_s); // clog disk hang check detect_palf_hang_failure_(); - // slog writter hang check - detect_slog_writter_hang_failure_(); + // slog writer hang check + detect_slog_writer_hang_failure_(); // sstable hang check detect_sstable_io_failure_(); // clog disk full check @@ -332,7 +332,7 @@ void ObFailureDetector::detect_palf_hang_failure_() } else if (OB_FAIL(log_service->get_io_start_time(clog_disk_last_working_time))) { COORDINATOR_LOG(WARN, "get_io_start_time failed", K(ret)); } else if (FALSE_IT(is_clog_disk_hang = (OB_INVALID_TIMESTAMP != clog_disk_last_working_time - && now - clog_disk_last_working_time > IO_HANG_TIME_THRESHOLD_US))) { + && now - clog_disk_last_working_time > GCONF.log_storage_warning_tolerance_time))) { } else if (false == ATOMIC_LOAD(&has_add_clog_hang_event_)) { if (!is_clog_disk_hang) { // log disk does not hang, skip. @@ -340,8 +340,8 @@ void ObFailureDetector::detect_palf_hang_failure_() COORDINATOR_LOG(ERROR, "add_failure_event failed", K(ret), K(clog_disk_hang_event)); } else { ATOMIC_SET(&has_add_clog_hang_event_, true); - COORDINATOR_LOG(WARN, "clog disk may be hang, add failure event", K(ret), K(clog_disk_hang_event), - K(clog_disk_last_working_time), "hang time", now - clog_disk_last_working_time); + LOG_DBA_ERROR(OB_DISK_CORRUPTED, "msg", "clog disk may be hung, add failure event", K(clog_disk_hang_event), + K(clog_disk_last_working_time), "hung time", now - clog_disk_last_working_time); } } else { if (is_clog_disk_hang) { @@ -355,38 +355,38 @@ void ObFailureDetector::detect_palf_hang_failure_() } } -void ObFailureDetector::detect_slog_writter_hang_failure_() +void ObFailureDetector::detect_slog_writer_hang_failure_() { LC_TIME_GUARD(1_s); int ret = OB_SUCCESS; - bool is_slog_writter_hang = false; - int64_t slog_writter_last_working_time = OB_INVALID_TIMESTAMP; + bool is_slog_writer_hang = false; + int64_t slog_writer_last_working_time = OB_INVALID_TIMESTAMP; const int64_t now = ObTimeUtility::current_time(); ObStorageLogger *storage_logger = MTL(ObStorageLogger*); - FailureEvent slog_writter_hang_event(FailureType::PROCESS_HANG, FailureModule::STORAGE, FailureLevel::FATAL); - if (OB_FAIL(slog_writter_hang_event.set_info("slog writter hang event"))) { - COORDINATOR_LOG(ERROR, "slog_writter_hang_event set_info failed", K(ret)); - } else if (FALSE_IT(slog_writter_last_working_time = storage_logger->get_pwrite_ts())) { - } else if (FALSE_IT(is_slog_writter_hang = (0 != slog_writter_last_working_time - && now - slog_writter_last_working_time > GCONF.data_storage_warning_tolerance_time))) { + FailureEvent slog_writer_hang_event(FailureType::PROCESS_HANG, FailureModule::STORAGE, FailureLevel::FATAL); + if (OB_FAIL(slog_writer_hang_event.set_info("slog writer hang event"))) { + COORDINATOR_LOG(ERROR, "slog_writer_hang_event set_info failed", K(ret)); + } else if (FALSE_IT(slog_writer_last_working_time = storage_logger->get_pwrite_ts())) { + } else if (FALSE_IT(is_slog_writer_hang = (0 != slog_writer_last_working_time + && now - slog_writer_last_working_time > GCONF.data_storage_warning_tolerance_time))) { } else if (false == ATOMIC_LOAD(&has_add_slog_hang_event_)) { - if (!is_slog_writter_hang) { - // slog writter is normal, skip. - } else if (OB_FAIL(add_failure_event(slog_writter_hang_event))) { - COORDINATOR_LOG(ERROR, "add_failure_event failed", K(ret), K(slog_writter_hang_event)); + if (!is_slog_writer_hang) { + // slog writer is normal, skip. + } else if (OB_FAIL(add_failure_event(slog_writer_hang_event))) { + COORDINATOR_LOG(ERROR, "add_failure_event failed", K(ret), K(slog_writer_hang_event)); } else { ATOMIC_SET(&has_add_slog_hang_event_, true); - COORDINATOR_LOG(WARN, "slog writter may be hang, add failure event", K(ret), K(slog_writter_hang_event), - K(slog_writter_last_working_time), "hang time", now - slog_writter_last_working_time); + LOG_DBA_ERROR(OB_DISK_CORRUPTED, "msg", "slog writer may be hung, add failure event", K(slog_writer_hang_event), + K(slog_writer_last_working_time), "hung time", now - slog_writer_last_working_time); } } else { - if (is_slog_writter_hang) { - // slog writter still hangs, cannot remove failure_event. - } else if (OB_FAIL(remove_failure_event(slog_writter_hang_event))) { - COORDINATOR_LOG(ERROR, "remove_failure_event failed", K(ret), K(slog_writter_hang_event)); + if (is_slog_writer_hang) { + // slog writer still hangs, cannot remove failure_event. + } else if (OB_FAIL(remove_failure_event(slog_writer_hang_event))) { + COORDINATOR_LOG(ERROR, "remove_failure_event failed", K(ret), K(slog_writer_hang_event)); } else { ATOMIC_SET(&has_add_slog_hang_event_, false); - COORDINATOR_LOG(INFO, "slog writter has recoverd, remove failure event", K(ret), K(slog_writter_hang_event)); + COORDINATOR_LOG(INFO, "slog writer has recoverd, remove failure event", K(ret), K(slog_writer_hang_event)); } } } @@ -412,8 +412,8 @@ void ObFailureDetector::detect_sstable_io_failure_() COORDINATOR_LOG(ERROR, "add_failure_event failed", K(ret), K(sstable_io_hang_event)); } else { ATOMIC_SET(&has_add_sstable_hang_event_, true); - COORDINATOR_LOG(WARN, "data disk may be hang, add failure event", K(ret), K(sstable_io_hang_event), - K(data_disk_error_start_ts)); + LOG_DBA_ERROR(OB_DISK_CORRUPTED, "msg", "data disk may be hung, add failure event", K(sstable_io_hang_event), + K(data_disk_error_start_ts)); } } else { if (ObDeviceHealthStatus::DEVICE_HEALTH_NORMAL != data_disk_status) { @@ -446,8 +446,8 @@ void ObFailureDetector::detect_palf_disk_full_() COORDINATOR_LOG(ERROR, "add_failure_event failed", K(ret), K(clog_disk_full_event)); } else { ATOMIC_SET(&has_add_clog_full_event_, true); - COORDINATOR_LOG(WARN, "clog disk is full, add failure event", K(ret), K(clog_disk_full_event), - K(now)); + LOG_DBA_ERROR(OB_LOG_OUTOF_DISK_SPACE, "msg", "clog disk is almost full, add failure event", + K(clog_disk_full_event), K(now)); } } else { if (!is_disk_enough) { diff --git a/src/logservice/leader_coordinator/ob_failure_detector.h b/src/logservice/leader_coordinator/ob_failure_detector.h index ce21b80f69..d04795e517 100644 --- a/src/logservice/leader_coordinator/ob_failure_detector.h +++ b/src/logservice/leader_coordinator/ob_failure_detector.h @@ -97,7 +97,7 @@ private: bool check_is_running_() const { return is_running_; } int insert_event_to_table_(const FailureEvent &event, const ObFunction &recover_operation, ObString info); void detect_palf_hang_failure_(); - void detect_slog_writter_hang_failure_(); + void detect_slog_writer_hang_failure_(); void detect_sstable_io_failure_(); void detect_palf_disk_full_(); private: @@ -108,9 +108,6 @@ private: ObFunction recover_detect_operation_; TO_STRING_KV(K_(event)); }; - // default time threshold for clog disk io hang detection is 5s - // TODO: this value should be a configuration - static const int64_t IO_HANG_TIME_THRESHOLD_US = 5 * 1000 * 1000; bool is_running_; common::ObArray events_with_ops_; common::ObArray tenant_server_list_; diff --git a/src/share/parameter/ob_parameter_seed.ipp b/src/share/parameter/ob_parameter_seed.ipp index acd2430408..26e6a4f01d 100644 --- a/src/share/parameter/ob_parameter_seed.ipp +++ b/src/share/parameter/ob_parameter_seed.ipp @@ -552,6 +552,12 @@ DEF_INT(log_disk_utilization_threshold, OB_TENANT_PARAMETER,"80", "Range: [10, 100)", ObParameterAttr(Section::LOGSERVICE, Source::DEFAULT, EditLevel::DYNAMIC_EFFECTIVE)); +DEF_TIME(log_storage_warning_tolerance_time, OB_CLUSTER_PARAMETER, "5s", + "[1s,300s]", + "time to tolerate log disk io delay, after that, the disk status will be set warning. " + "Range: [1s,300s]", + ObParameterAttr(Section::LOGSERVICE, Source::DEFAULT, EditLevel::DYNAMIC_EFFECTIVE)); + // ========================= LogService Config End ===================== DEF_INT(resource_hard_limit, OB_CLUSTER_PARAMETER, "100", "[100, 10000]", "system utilization should not be large than resource_hard_limit", diff --git a/tools/deploy/mysql_test/test_suite/inner_table/r/mysql/all_virtual_sys_parameter_stat.result b/tools/deploy/mysql_test/test_suite/inner_table/r/mysql/all_virtual_sys_parameter_stat.result index 5c56150b19..7bd72c3911 100644 --- a/tools/deploy/mysql_test/test_suite/inner_table/r/mysql/all_virtual_sys_parameter_stat.result +++ b/tools/deploy/mysql_test/test_suite/inner_table/r/mysql/all_virtual_sys_parameter_stat.result @@ -119,6 +119,7 @@ log_disk_size log_disk_utilization_limit_threshold log_disk_utilization_threshold log_restore_concurrency +log_storage_warning_tolerance_time log_transport_compress_all log_transport_compress_func ls_meta_table_check_interval