diff --git a/src/share/io/ob_io_define.cpp b/src/share/io/ob_io_define.cpp index 314373b9ba..3b07985ba6 100644 --- a/src/share/io/ob_io_define.cpp +++ b/src/share/io/ob_io_define.cpp @@ -875,7 +875,7 @@ int ObIOHandle::wait(const int64_t timeout_ms) LOG_WARN("IO error, ", K(ret), K(*req_)); } } else if (OB_TIMEOUT == ret) { - LOG_WARN("IO wait timeout, ", K(timeout_ms), K(ret), K(*req_)); + LOG_WARN("IO wait timeout", K(timeout_ms), K(ret), K(*req_)); } estimate(); diff --git a/src/share/io/ob_io_define.h b/src/share/io/ob_io_define.h index fb341b5a58..8b88e1141d 100644 --- a/src/share/io/ob_io_define.h +++ b/src/share/io/ob_io_define.h @@ -32,6 +32,7 @@ namespace common static constexpr int64_t DEFAULT_IO_WAIT_TIME_MS = 5000L; // 5s static constexpr int64_t MAX_IO_WAIT_TIME_MS = 300L * 1000L; // 5min static constexpr int64_t GROUP_START_NUM = 8L; +static constexpr int64_t MAX_DETECT_READ_TIMES = 10L; enum class ObIOMode : uint8_t { READ = 0, diff --git a/src/share/io/ob_io_manager.cpp b/src/share/io/ob_io_manager.cpp index 962d824f01..ef1b684102 100644 --- a/src/share/io/ob_io_manager.cpp +++ b/src/share/io/ob_io_manager.cpp @@ -287,7 +287,7 @@ int ObIOManager::pwrite(ObIOInfo &info, int64_t &write_size) return ret; } -int ObIOManager::detect_read(const ObIOInfo &info, ObIOHandle &handle, const uint64_t timeout_ms) +int ObIOManager::detect_read(const ObIOInfo &info, ObIOHandle &handle, const uint64_t timeout_ms, int &sys_io_errno) { int ret = OB_SUCCESS; ObRefHolder tenant_holder; @@ -306,6 +306,10 @@ int ObIOManager::detect_read(const ObIOInfo &info, ObIOHandle &handle, const uin LOG_WARN("tenant io manager do aio failed", K(ret), K(info), KPC(tenant_holder.get_ptr())); } else if (OB_FAIL(handle.wait(timeout_ms))) { LOG_WARN("io handle wait failed", K(ret), K(info), K(timeout_ms)); + int tmp_ret = OB_SUCCESS; + if (OB_SUCCESS != (tmp_ret = handle.get_fs_errno(sys_io_errno))) { + LOG_WARN("fail to get io errno, ", K(sys_io_errno), K(tmp_ret)); + } } return ret; } diff --git a/src/share/io/ob_io_manager.h b/src/share/io/ob_io_manager.h index 29c453bb28..0e11bd1b50 100644 --- a/src/share/io/ob_io_manager.h +++ b/src/share/io/ob_io_manager.h @@ -48,7 +48,7 @@ public: int pwrite(ObIOInfo &info, int64_t &write_size); - int detect_read(const ObIOInfo &info, ObIOHandle &handle, const uint64_t timeout_ms); + int detect_read(const ObIOInfo &info, ObIOHandle &handle, const uint64_t timeout_ms, int &sys_io_errno); // config related, thread safe int set_io_config(const ObIOConfig &conf); diff --git a/src/share/io/ob_io_struct.cpp b/src/share/io/ob_io_struct.cpp index 999b87ff93..e0c142031b 100644 --- a/src/share/io/ob_io_struct.cpp +++ b/src/share/io/ob_io_struct.cpp @@ -2818,7 +2818,6 @@ void ObIOFaultDetector::handle(void *task) const int64_t LONG_AIO_TIMEOUT_MS = 30000; // 30s RetryTask *retry_task = reinterpret_cast(task); retry_task->io_info_.flag_.set_unlimited(); - retry_task->io_info_.flag_.set_detect(); int64_t timeout_ms = retry_task->timeout_ms_; // remain 1s to avoid race condition for retry_black_list_interval const int64_t retry_black_list_interval_ms = io_config_.read_failure_black_list_interval_ / 1000L - 1000L; @@ -2827,6 +2826,7 @@ void ObIOFaultDetector::handle(void *task) const int64_t MAX_IO_RETRY_TIMEOUT_MS = min(180L * 1000L/* 180s*/, retry_black_list_interval_ms); const int64_t diagnose_begin_ts = ObTimeUtility::fast_current_time(); bool is_retry_succ = false; + int64_t retry_times = 0; while (OB_SUCC(ret) && !OB_IO_MANAGER.is_stopped() && !is_retry_succ && !is_device_error_) { ObIOHandle handle; const int64_t current_retry_ts = ObTimeUtility::fast_current_time(); @@ -2836,15 +2836,19 @@ void ObIOFaultDetector::handle(void *task) (warn_ts - current_retry_ts) / 1000 : (error_ts - current_retry_ts) / 1000; // timeout of retry io increase exponentially timeout_ms = min(left_timeout_ms, min(MAX_IO_RETRY_TIMEOUT_MS, max(timeout_ms * 2, MIN_IO_RETRY_TIMEOUT_MS))); + int sys_io_errno = 0; if (timeout_ms > 0) { // do retry io - if (OB_FAIL(OB_IO_MANAGER.detect_read(retry_task->io_info_, handle, timeout_ms))) { + if (OB_FAIL(OB_IO_MANAGER.detect_read(retry_task->io_info_, handle, timeout_ms, sys_io_errno))) { if (OB_TIMEOUT == ret) { LOG_WARN("ObIOManager::read failed", K(ret), K(retry_task->io_info_), K(timeout_ms)); ret = OB_SUCCESS; } else if (OB_EAGAIN == ret) { //maybe channel is busy, wait and retry ob_usleep(100 * 1000); // 100ms ret = OB_SUCCESS; + } else if (sys_io_errno != 0) { + ++ retry_times; + ret = OB_SUCCESS; } else { LOG_WARN("ObIOManager::retry read request failed", K(ret), K(retry_task->io_info_)); } @@ -2853,11 +2857,19 @@ void ObIOFaultDetector::handle(void *task) } } if (OB_SUCC(ret) && !is_retry_succ) { - const int64_t current_ts = ObTimeUtility::fast_current_time(); - if (current_ts >= error_ts) { - set_device_error(); - } else if (current_ts >= warn_ts) { + if (sys_io_errno != 0 && retry_times >= MAX_DETECT_READ_TIMES) { + retry_task->io_info_.flag_.set_detect(); set_device_warning(); + LOG_WARN("ObIOManager::detect IO retry count reach limit, device warning", K(ret), K(sys_io_errno)); + } else { + const int64_t current_ts = ObTimeUtility::fast_current_time(); + if (current_ts >= error_ts) { + set_device_error(); + LOG_WARN("ObIOManager::detect IO retry timeout, device error", K(ret), K(current_ts), K(error_ts)); + } else if (current_ts >= warn_ts) { + set_device_warning(); + LOG_WARN("ObIOManager::detect IO retry timeout, device warning", K(ret), K(sys_io_errno)); + } } } } @@ -2911,7 +2923,7 @@ void ObIOFaultDetector::record_failure(const ObIORequest &req) ret = OB_NOT_INIT; LOG_WARN("io fault detector not init", K(ret), KP(is_inited_)); } else if (req.get_flag().is_detect()) { - //ignore, do not retry + //reach max retry time, ignore } else if (req.is_finished_ && OB_IO_ERROR != req.ret_code_.io_ret_) { // ignore, do nothing here } else if (req.get_flag().is_read()) { @@ -2920,7 +2932,7 @@ void ObIOFaultDetector::record_failure(const ObIORequest &req) } } else if (req.get_flag().is_write()) { ret = OB_NOT_SUPPORTED; - LOG_WARN("write IORequest failed", K(ret), K(req)); + LOG_WARN("not supported io write detect", K(ret), K(req)); } else { ret = OB_NOT_SUPPORTED; LOG_WARN("not supported io mode", K(ret), K(req)); @@ -2956,7 +2968,7 @@ void ObIOFaultDetector::set_device_warning() { last_device_warning_ts_ = ObTimeUtility::fast_current_time(); is_device_warning_ = true; - LOG_WARN_RET(OB_IO_ERROR, "disk maybe too slow"); + LOG_WARN_RET(OB_IO_ERROR, "disk maybe corrupted"); } // set disk error and record error_ts diff --git a/src/share/io/ob_io_struct.h b/src/share/io/ob_io_struct.h index 5bd2ae625c..f8a8678452 100644 --- a/src/share/io/ob_io_struct.h +++ b/src/share/io/ob_io_struct.h @@ -497,9 +497,6 @@ private: bool is_device_error_; int64_t begin_device_error_ts_; int64_t last_device_error_ts_; - // write/append failure detect - int64_t write_failure_count_; - int64_t write_failure_ts_[WRITE_FAILURE_DETECT_EVENT_COUNT]; }; class ObIOTracer final