add parameters for detecting disk warning and error

2021-07-15 17:57:14 +08:00
parent 20c0cae3dd
commit 7eb645fba3
8 changed files with 95 additions and 68 deletions
--- a/deps/oblib/src/lib/io/ob_io_common.cpp
+++ b/deps/oblib/src/lib/io/ob_io_common.cpp
@ -150,8 +150,8 @@ void ObIOConfig::set_default_value()
  cpu_high_water_level_ = DEFAULT_CPU_HIGH_WATER_LEVEL;
  write_failure_detect_interval_ = DEFAULT_WRITE_FAILURE_DETECT_INTERVAL;
  read_failure_black_list_interval_ = DEFAULT_READ_FAILURE_IN_BLACK_LIST_INTERVAL;
-  retry_warn_limit_ = DEFAULT_RETRY_WARN_LIMIT;
-  retry_error_limit_ = DEFAULT_RETRY_ERROR_LIMIT;
+  data_storage_warning_tolerance_time_ = DEFAULT_WARNING_TOLERANCE_TIME;
+  data_storage_error_tolerance_time_ = DEFAULT_ERROR_TOLERANCE_TIME;
  disk_io_thread_count_ = DEFAULT_DISK_IO_THREAD_COUNT;
  callback_thread_count_ = DEFAULT_IO_CALLBACK_THREAD_COUNT;
  large_query_io_percent_ = DEFAULT_LARGE_QUERY_IO_PERCENT;
@ -163,7 +163,8 @@ bool ObIOConfig::is_valid() const
  return sys_io_low_percent_ >= 0 && sys_io_low_percent_ <= 100 && sys_io_high_percent_ > 0 &&
         sys_io_high_percent_ <= 100 && sys_io_low_percent_ <= sys_io_high_percent_ && user_iort_up_percent_ >= 0 &&
         cpu_high_water_level_ > 0 && write_failure_detect_interval_ > 0 && read_failure_black_list_interval_ > 0 &&
-         retry_warn_limit_ > 0 && retry_error_limit_ > retry_warn_limit_ && disk_io_thread_count_ > 0 &&
+         data_storage_warning_tolerance_time_ > 0 &&
+         data_storage_error_tolerance_time_ >= data_storage_warning_tolerance_time_ && disk_io_thread_count_ > 0 &&
         disk_io_thread_count_ <= ObDisk::MAX_DISK_CHANNEL_CNT * 2 && disk_io_thread_count_ % 2 == 0 &&
         callback_thread_count_ > 0 && large_query_io_percent_ >= 0 && large_query_io_percent_ <= 100 &&
         data_storage_io_timeout_ms_ > 0;
@ -177,8 +178,8 @@ void ObIOConfig::reset()
  cpu_high_water_level_ = 0;
  write_failure_detect_interval_ = 0;
  read_failure_black_list_interval_ = 0;
-  retry_warn_limit_ = 0;
-  retry_error_limit_ = 0;
+  data_storage_warning_tolerance_time_ = 0;
+  data_storage_error_tolerance_time_ = 0;
  disk_io_thread_count_ = 0;
  callback_thread_count_ = 0;
  large_query_io_percent_ = 0;
--- a/deps/oblib/src/lib/io/ob_io_common.h
+++ b/deps/oblib/src/lib/io/ob_io_common.h
@ -98,8 +98,8 @@ public:
  static const int64_t DEFAULT_CPU_HIGH_WATER_LEVEL = 4800;
  static const int64_t DEFAULT_WRITE_FAILURE_DETECT_INTERVAL = 60 * 1000 * 1000;         // 1 min
  static const int64_t DEFAULT_READ_FAILURE_IN_BLACK_LIST_INTERVAL = 300 * 1000 * 1000;  // 5 min
-  static const int32_t DEFAULT_RETRY_WARN_LIMIT = 2;
-  static const int32_t DEFAULT_RETRY_ERROR_LIMIT = 5;
+  static const int32_t DEFAULT_WARNING_TOLERANCE_TIME = 30L * 1000L * 1000L;             // 30s
+  static const int32_t DEFAULT_ERROR_TOLERANCE_TIME = 300L * 1000L * 1000L;              // 300s
  static const int64_t DEFAULT_DISK_IO_THREAD_COUNT = 8;
  static const int64_t DEFAULT_IO_CALLBACK_THREAD_COUNT = 8;
  static const int64_t DEFAULT_LARGE_QUERY_IO_PERCENT = 0;                 // 0 means unlimited
@ -113,19 +113,22 @@ public:
  bool is_valid() const;
  void reset();
  TO_STRING_KV(K_(sys_io_low_percent), K_(sys_io_high_percent), K_(user_iort_up_percent), K_(cpu_high_water_level),
-      K_(write_failure_detect_interval), K_(read_failure_black_list_interval), K_(retry_warn_limit),
-      K_(retry_error_limit), K_(disk_io_thread_count), K_(callback_thread_count), K_(large_query_io_percent),
-      K_(data_storage_io_timeout_ms));
+      K_(write_failure_detect_interval), K_(read_failure_black_list_interval), K_(data_storage_warning_tolerance_time),
+      K_(data_storage_error_tolerance_time), K_(disk_io_thread_count), K_(callback_thread_count),
+      K_(large_query_io_percent), K_(data_storage_io_timeout_ms));

 public:
+  // schedule related
  int64_t sys_io_low_percent_;
  int64_t sys_io_high_percent_;
  int64_t user_iort_up_percent_;
  int64_t cpu_high_water_level_;
+  // diagnose related
  int64_t write_failure_detect_interval_;
  int64_t read_failure_black_list_interval_;
-  int64_t retry_warn_limit_;
-  int64_t retry_error_limit_;
+  int64_t data_storage_warning_tolerance_time_;
+  int64_t data_storage_error_tolerance_time_;
+  // resource related
  int64_t disk_io_thread_count_;
  int64_t callback_thread_count_;
  int64_t large_query_io_percent_;
--- a/deps/oblib/src/lib/io/ob_io_disk.cpp
+++ b/deps/oblib/src/lib/io/ob_io_disk.cpp
@ -41,28 +41,23 @@ void ObDiskDiagnose::reset()
  MEMSET(write_failure_event_ts_, 0, sizeof(write_failure_event_ts_));
 }

-void ObDiskDiagnose::record_read_fail(const int64_t retry_cnt)
+void ObDiskDiagnose::record_read_fail(const int64_t diagnose_begin_ts)
 {
  const ObIOConfig io_config = OB_IO_MANAGER.get_io_config();
-  // in oder to reduce the misjudgement, here is the rules:
-  // watch the continuous read timeout with the exponential growth of timeout
-  // 1. for more than 3 times, record as dick warning,
-  //    after that, this server is not allowed to be the paxos leader for a period,
-  //    which is indicated by READ_FAILURE_IN_BLACK_LIST_INTERVAL, usually 300s.
-  //
-  // 2. for more than 6 times, record as disk error
-  //    if the disk is confirmed normal, the administrator can reset the disk error by
-  //    alter system set disk valid server [=] 'ip:port'
-  //
-  if (retry_cnt < io_config.retry_warn_limit_) {
-    // do nothing
-  } else if (retry_cnt < io_config.retry_error_limit_) {
-    last_read_failure_warn_ts_ = ObTimeUtility::current_time();
-  } else {
-    if (!is_disk_error_) {
-      disk_error_begin_ts_ = ObTimeUtility::current_time();
+  const int64_t current_ts = ObTimeUtility::current_time();
+  if (current_ts >= diagnose_begin_ts + io_config.data_storage_warning_tolerance_time_) {
+    // set disk warning and record warn_ts
+    // until warn_ts + READ_FAILURE_IN_BLACK_LIST_INTERVAL, this server is not allowed to be partition leader
+    last_read_failure_warn_ts_ = current_ts;
  }
-    disk_error_last_ts_ = ObTimeUtility::current_time();
+  if (current_ts >= diagnose_begin_ts + io_config.data_storage_error_tolerance_time_) {
+    // set disk error and record error_ts
+    // if the disk is confirmed normal, the administrator can reset disk status by:
+    // alter system set disk valid server [=] 'ip:port'
+    if (!is_disk_error_) {
+      disk_error_begin_ts_ = current_ts;
+    }
+    disk_error_last_ts_ = current_ts;
    is_disk_error_ = true;
    COMMON_LOG(ERROR, "set_disk_error: attention!!!");
  }
@ -119,18 +114,6 @@ int64_t ObDiskDiagnose::get_last_io_failure_ts() const
  return MAX(disk_error_last_ts_, last_read_failure_warn_ts_);
 }

-int64_t ObDiskDiagnose::get_max_retry_cnt() const
-{
-  const ObIOConfig io_config = OB_IO_MANAGER.get_io_config();
-  return io_config.retry_error_limit_;
-}
-
-int64_t ObDiskDiagnose::get_warn_retry_cnt() const
-{
-  const ObIOConfig io_config = OB_IO_MANAGER.get_io_config();
-  return io_config.retry_warn_limit_;
-}
-
 /**
 * ---------------------------------------------- ObDisk ---------------------------------------------
 */
@ -648,33 +631,41 @@ void ObIOFaultDetector::handle(void* t)
    const ObIOInfo& info = task->info_;
    ObIOHandle handle;
    uint64_t timeout_ms = task->timeout_ms_;
-    int64_t retry_cnt = 0;
-    const int64_t MIN_IO_WAIT_TIME_MS = 30000;  // 30s
-
-    for (retry_cnt = 0; retry_cnt < disk_diagnose.get_max_retry_cnt(); ++retry_cnt) {
+    // keep a 1s margin below read_failure_black_list_interval to avoid a race condition with the black list expiry
+    const int64_t retry_black_list_interval_ms =
+        OB_IO_MANAGER.get_io_config().read_failure_black_list_interval_ / 1000L - 1000L;
+    // the retry io timeout must be less than the black list interval
+    const int64_t MIN_IO_RETRY_TIMEOUT_MS = min(10L * 1000L /* 10s */, retry_black_list_interval_ms);
+    const int64_t MAX_IO_RETRY_TIMEOUT_MS = min(180L * 1000L /* 180s*/, retry_black_list_interval_ms);
+    const int64_t diagnose_begin_ts = ObTimeUtility::current_time();
+    bool is_retry_succ = false;
+    while (OB_SUCC(ret) && !is_retry_succ && !disk_diagnose.is_disk_error()) {
      handle.reset();
-      // timeout grows exponentially
-      if (retry_cnt >= disk_diagnose.get_warn_retry_cnt() - 1) {
-        timeout_ms = max(timeout_ms * 2, MIN_IO_WAIT_TIME_MS);
-      } else {
-        timeout_ms = timeout_ms * 2;
-      }
-
-      if (retry_cnt == disk_diagnose.get_warn_retry_cnt()) {
-        disk_diagnose.record_read_fail(retry_cnt);
-      }
-
+      const ObIOConfig io_conf = OB_IO_MANAGER.get_io_config();
+      const int64_t current_retry_ts = ObTimeUtility::current_time();
+      const int64_t warn_ts = diagnose_begin_ts + io_conf.data_storage_warning_tolerance_time_;
+      const int64_t error_ts = diagnose_begin_ts + io_conf.data_storage_error_tolerance_time_;
+      const int64_t left_timeout_ms =
+          !disk_diagnose.is_disk_warning() ? (warn_ts - current_retry_ts) / 1000 : (error_ts - current_retry_ts) / 1000;
+      // the retry io timeout doubles on each attempt, clamped to [MIN, MAX]_IO_RETRY_TIMEOUT_MS and to the time left
+      timeout_ms = min(left_timeout_ms, min(MAX_IO_RETRY_TIMEOUT_MS, max(timeout_ms * 2, MIN_IO_RETRY_TIMEOUT_MS)));
+      if (timeout_ms > 0) {
+        // do retry io
        if (disk->get_admin_status() != DISK_USING) {
          ret = OB_STATE_NOT_MATCH;
          COMMON_LOG(WARN, "check_admin_status failed, disk is deleting", K(ret), "status", disk->get_admin_status());
          break;
        } else if (OB_FAIL(OB_IO_MANAGER.read(info, handle, timeout_ms))) {
          COMMON_LOG(WARN, "ObIOManager::read failed", K(ret), K(info), K(timeout_ms));
+          ret = OB_SUCCESS;
        } else {
-        break;  // stop retry if success
+          is_retry_succ = true;
+        }
+      }
+      if (OB_SUCC(ret) && !is_retry_succ) {
+        disk_diagnose.record_read_fail(diagnose_begin_ts);
      }
    }
-    disk_diagnose.record_read_fail(retry_cnt);

    op_free(task);
    task = NULL;
--- a/deps/oblib/src/lib/io/ob_io_disk.h
+++ b/deps/oblib/src/lib/io/ob_io_disk.h
@ -56,13 +56,11 @@ class ObDiskDiagnose {
 public:
  ObDiskDiagnose();
  virtual ~ObDiskDiagnose();
-  void record_read_fail(const int64_t retry_cnt);
+  void record_read_fail(const int64_t diagnose_begin_ts);
  void record_write_fail();
  bool is_disk_warning() const;
  bool is_disk_error() const;
  void reset_disk_health();
-  int64_t get_max_retry_cnt() const;
-  int64_t get_warn_retry_cnt() const;
  int64_t get_disk_error_begin_ts() const
  {
    return disk_error_begin_ts_;
--- a/src/observer/ob_server_reload_config.cpp
+++ b/src/observer/ob_server_reload_config.cpp
@ -111,6 +111,8 @@ int ObServerReloadConfig::operator()()
    // In the 2.x version, reuse the sys_bkgd_io_timeout configuration item to indicate the data disk io timeout time
    // After version 3.1, use the data_storage_io_timeout configuration item.
    io_config.data_storage_io_timeout_ms_ = GCONF._data_storage_io_timeout / 1000L;
+    io_config.data_storage_warning_tolerance_time_ = GCONF.data_storage_warning_tolerance_time;
+    io_config.data_storage_error_tolerance_time_ = GCONF.data_storage_error_tolerance_time;
    if (OB_FAIL(ObIOManager::get_instance().set_io_config(io_config))) {
      real_ret = ret;
      LOG_WARN("reload io manager config fail, ", K(ret));
--- a/src/share/config/ob_config_helper.cpp
+++ b/src/share/config/ob_config_helper.cpp
@ -302,6 +302,17 @@ bool ObConfigPartitionBalanceStrategyFuncChecker::check(const ObConfigItem& t) c
  return is_valid;
 }

+bool ObDataStorageErrorToleranceTimeChecker::check(const ObConfigItem& t) const  // accepts t only if it parses as a time value not below the warning tolerance
+{
+  bool is_valid = false;  // stays false unless the parser below reports success
+  int64_t value = ObConfigTimeParser::get(t.str(), is_valid);  // parse the candidate string; is_valid receives the parse result
+  if (is_valid) {
+    const int64_t warning_value = GCONF.data_storage_warning_tolerance_time;  // warning tolerance currently held in GCONF
+    is_valid = value >= warning_value;  // error tolerance must be >= warning tolerance (same rule as ObIOConfig::is_valid)
+  }
+  return is_valid;
+}
+
 int64_t ObConfigIntParser::get(const char* str, bool& valid)
 {
  char* p_end = NULL;
--- a/src/share/config/ob_config_helper.h
+++ b/src/share/config/ob_config_helper.h
@ -394,6 +394,18 @@ private:
  DISALLOW_COPY_AND_ASSIGN(ObConfigPartitionBalanceStrategyFuncChecker);
 };

+class ObDataStorageErrorToleranceTimeChecker : public ObConfigChecker {  // checker attached to data_storage_error_tolerance_time
+public:
+  ObDataStorageErrorToleranceTimeChecker()
+  {}
+  virtual ~ObDataStorageErrorToleranceTimeChecker()
+  {}
+  bool check(const ObConfigItem& t) const;  // true only if t parses as a time value >= data_storage_warning_tolerance_time
+
+private:
+  DISABLE_COPY_ASSIGN(ObDataStorageErrorToleranceTimeChecker);  // NOTE(review): presumably forbids copying; the preceding checker uses DISALLOW_COPY_AND_ASSIGN - confirm
+};
+
 // config item container
 class ObConfigStringKey {
 public:
--- a/src/share/parameter/ob_parameter_seed.ipp
+++ b/src/share/parameter/ob_parameter_seed.ipp
@ -881,6 +881,15 @@ DEF_TIME(_data_storage_io_timeout, OB_CLUSTER_PARAMETER, "120s", "[5s,600s]",
    "io timeout for data storage, Range [5s,600s]. "
    "The default value is 120s",
    ObParameterAttr(Section::OBSERVER, Source::DEFAULT, EditLevel::DYNAMIC_EFFECTIVE));
+DEF_TIME(data_storage_warning_tolerance_time, OB_CLUSTER_PARAMETER, "30s", "[10s,300s]",  // NOTE(review): no checker here, so raising this above data_storage_error_tolerance_time does not appear to be rejected at set time - confirm
+    "time to tolerate disk read failure, after that, the disk status will be set warning. Range [10s,300s]. The "
+    "default value is 30s",
+    ObParameterAttr(Section::OBSERVER, Source::DEFAULT, EditLevel::DYNAMIC_EFFECTIVE));
+DEF_TIME_WITH_CHECKER(data_storage_error_tolerance_time, OB_CLUSTER_PARAMETER, "300s",
+    common::ObDataStorageErrorToleranceTimeChecker, "[10s,7200s]",  // checker additionally requires value >= data_storage_warning_tolerance_time
+    "time to tolerate disk read failure, after that, the disk status will be set error. Range [10s,7200s]. The default "
+    "value is 300s",
+    ObParameterAttr(Section::OBSERVER, Source::DEFAULT, EditLevel::DYNAMIC_EFFECTIVE));
 DEF_INT(data_disk_usage_limit_percentage, OB_CLUSTER_PARAMETER, "90", "[50,100]",
    "the safe use percentage of data disk"
    "Range: [50,100] in integer",