From 439931433241139804fe4b4b263e6124635147c1 Mon Sep 17 00:00:00 2001
From: obdev
Date: Mon, 23 Oct 2023 08:10:05 +0000
Subject: [PATCH] BUGFIX: check data disk at dml interface

ObFailureDetector gains detect_data_disk_full_(), which adds a
RESOURCE_NOT_ENOUGH/STORAGE failure event when
THE_IO_DEVICE->check_space_full() returns OB_SERVER_OUTOF_DISK_SPACE and
removes it once the check passes again. ObAccessService::check_write_allowed_
rejects writes with OB_SERVER_OUTOF_DISK_SPACE while that event is registered.
ObTenantFreezer exposes get_tenant_memstore_used() for the detector; the
shared logic moves into the private get_tenant_memstore_cond_().

The detector checks the MTL tenant freezer pointer before using it, and
is_data_disk_full() reads the flag with ATOMIC_LOAD to match the
ATOMIC_SET/ATOMIC_LOAD accesses in detect_data_disk_full_().
---
Review note: the post-image blob ids on the "index" lines of
ob_failure_detector.cpp and ob_failure_detector.h were not regenerated after
the null-check / ATOMIC_LOAD changes; re-run git format-patch once applied.

 .../ob_failure_detector.cpp                  | 58 ++++++++++++++++++++
 .../leader_coordinator/ob_failure_detector.h |  7 +++
 src/storage/tx_storage/ob_access_service.cpp | 21 ++++++++
 src/storage/tx_storage/ob_access_service.h   |  1 +
 src/storage/tx_storage/ob_tenant_freezer.cpp | 53 ++++++++++++++++---
 src/storage/tx_storage/ob_tenant_freezer.h   |  9 ++++
 6 files changed, 143 insertions(+), 6 deletions(-)

diff --git a/src/logservice/leader_coordinator/ob_failure_detector.cpp b/src/logservice/leader_coordinator/ob_failure_detector.cpp
index 3f1090233..7466f8514 100644
--- a/src/logservice/leader_coordinator/ob_failure_detector.cpp
+++ b/src/logservice/leader_coordinator/ob_failure_detector.cpp
@@ -30,6 +30,7 @@
 #include "logservice/ob_log_service.h"
 #include "observer/ob_server_event_history_table_operator.h"
 #include "storage/slog/ob_storage_logger.h"
+#include "storage/tx_storage/ob_tenant_freezer.h"
 #include "share/schema/ob_multi_version_schema_service.h"
 
 namespace oceanbase
@@ -48,6 +49,7 @@ ObFailureDetector::ObFailureDetector()
       has_add_data_disk_hang_event_(false),
       has_add_clog_full_event_(false),
       has_schema_error_(false),
+      has_add_disk_full_event_(false),
       lock_(common::ObLatchIds::ELECTION_LOCK)
 {
   COORDINATOR_LOG(INFO, "ObFailureDetector constructed");
@@ -126,6 +128,7 @@ void ObFailureDetector::destroy()
   has_add_data_disk_hang_event_ = false;
   has_add_clog_full_event_ = false;
   has_schema_error_ = false;
+  has_add_disk_full_event_ = false;
   COORDINATOR_LOG(INFO, "ObFailureDetector mtl destroy");
 }
 
@@ -166,6 +169,8 @@ void ObFailureDetector::detect_failure()
   detect_palf_disk_full_();
   // schema refreshed check
   detect_schema_not_refreshed_();
+  // data disk full check
+  detect_data_disk_full_();
 }
 
 int ObFailureDetector::add_failure_event(const FailureEvent &event)
@@ -461,6 +466,59 @@ void ObFailureDetector::detect_schema_not_refreshed_()
   }
 }
 
+// Called from detect_failure(). Adds the "data disk full event" failure event when
+// THE_IO_DEVICE->check_space_full() returns OB_SERVER_OUTOF_DISK_SPACE and removes it once
+// the check passes again; has_add_disk_full_event_ records whether the event is registered.
+void ObFailureDetector::detect_data_disk_full_()
+{
+  LC_TIME_GUARD(1_s);
+  int ret = OB_SUCCESS;
+  const int64_t now = ObTimeUtility::current_time();
+  ObTenantFreezer *freezer = MTL(ObTenantFreezer*);
+  int64_t memstore_used = 0;
+  const bool force_refresh = true;
+  bool is_disk_enough = true;
+  FailureEvent data_disk_full_event(FailureType::RESOURCE_NOT_ENOUGH, FailureModule::STORAGE, FailureLevel::NOTICE);
+  if (OB_FAIL(data_disk_full_event.set_info("data disk full event"))) {
+    COORDINATOR_LOG(ERROR, "data_disk_full_event set_info failed", K(ret));
+  } else if (OB_ISNULL(freezer)) {
+    ret = OB_ERR_UNEXPECTED;
+    COORDINATOR_LOG(WARN, "mtl module tenant freezer is null", K(ret), KP(freezer));
+  } else if (OB_FAIL(freezer->get_tenant_memstore_used(memstore_used, force_refresh))) {
+    COORDINATOR_LOG(WARN, "get tenant memstore used failed", K(ret));
+  } else if (OB_FAIL(THE_IO_DEVICE->check_space_full(memstore_used)) &&
+             OB_SERVER_OUTOF_DISK_SPACE != ret) {
+    COORDINATOR_LOG(WARN, "check space full failed", K(ret));
+  } else if (OB_SERVER_OUTOF_DISK_SPACE == ret) {
+    is_disk_enough = false;
+    ret = OB_SUCCESS;
+  } else {
+    // do nothing
+  }
+
+  if (OB_FAIL(ret)) {
+  } else if (false == ATOMIC_LOAD(&has_add_disk_full_event_)) {
+    if (is_disk_enough) {
+      // data disk is not full, skip.
+    } else if (OB_FAIL(add_failure_event(data_disk_full_event))) {
+      COORDINATOR_LOG(ERROR, "add_failure_event failed", K(ret), K(data_disk_full_event));
+    } else {
+      ATOMIC_SET(&has_add_disk_full_event_, true);
+      LOG_DBA_ERROR(OB_LOG_OUTOF_DISK_SPACE, "msg", "data disk is full, add failure event",
+                    K(data_disk_full_event), K(now));
+    }
+  } else {
+    if (!is_disk_enough) {
+      // data disk is still full, cannot remove failure_event.
+    } else if (OB_FAIL(remove_failure_event(data_disk_full_event))) {
+      COORDINATOR_LOG(ERROR, "remove_failure_event failed", K(ret), K(data_disk_full_event));
+    } else {
+      ATOMIC_SET(&has_add_disk_full_event_, false);
+      COORDINATOR_LOG(INFO, "data disk has left space, remove failure event", K(ret), K(data_disk_full_event));
+    }
+  }
+}
+
 int ObFailureDetector::FailureEventWithRecoverOp::init(const FailureEvent &event,
                                                        const ObFunction<bool()> &recover_detect_operation)
 {
diff --git a/src/logservice/leader_coordinator/ob_failure_detector.h b/src/logservice/leader_coordinator/ob_failure_detector.h
index 434cff861..238b5fe7a 100644
--- a/src/logservice/leader_coordinator/ob_failure_detector.h
+++ b/src/logservice/leader_coordinator/ob_failure_detector.h
@@ -94,6 +94,11 @@ public:
   bool is_clog_disk_has_fatal_error();
   bool is_data_disk_has_fatal_error();
   bool is_schema_not_refreshed();
+  // atomic read: the flag is updated with ATOMIC_SET in detect_data_disk_full_()
+  bool is_data_disk_full() const
+  {
+    return ATOMIC_LOAD(&has_add_disk_full_event_);
+  }
 private:
   bool check_is_running_() const { return is_running_; }
   int insert_event_to_table_(const FailureEvent &event, const ObFunction<bool()> &recover_operation, ObString info);
@@ -101,6 +106,7 @@ private:
   void detect_data_disk_io_failure_();
   void detect_palf_disk_full_();
   void detect_schema_not_refreshed_();
+  void detect_data_disk_full_();
 private:
   struct FailureEventWithRecoverOp {
     int init(const FailureEvent &event, const ObFunction<bool()> &recover_detect_operation);
@@ -119,6 +125,7 @@ private:
   bool has_add_data_disk_hang_event_;
   bool has_add_clog_full_event_;
   bool has_schema_error_;
+  bool has_add_disk_full_event_;
   ObSpinLock lock_;
 };
 
diff --git a/src/storage/tx_storage/ob_access_service.cpp b/src/storage/tx_storage/ob_access_service.cpp
index 3c11edcd0..dd1726ece 100755
--- a/src/storage/tx_storage/ob_access_service.cpp
+++ b/src/storage/tx_storage/ob_access_service.cpp
@@ -14,6 +14,7 @@
 
 #include "lib/ob_errno.h"
 #include "lib/objectpool/ob_server_object_pool.h"
+#include "logservice/leader_coordinator/ob_failure_detector.h"
 #include "share/ob_ls_id.h"
 #include "storage/ob_query_iterator_factory.h"
 #include "storage/access/ob_table_scan_iterator.h"
@@ -29,6 +30,7 @@ namespace oceanbase
 {
 using namespace common;
 using namespace share;
+using namespace logservice::coordinator;
 namespace storage
 {
 
@@ -99,6 +101,19 @@ int ObAccessService::check_tenant_out_of_memstore_limit_(bool &is_out_of_mem)
   return ret;
 }
 
+int ObAccessService::check_data_disk_full_(bool &is_full)
+{
+  int ret = OB_SUCCESS;
+  ObFailureDetector* detector = MTL(ObFailureDetector*);
+  if (OB_ISNULL(detector)) {
+    ret = OB_ERR_UNEXPECTED;
+    LOG_WARN("mtl module detector is null", K(ret), KP(detector));
+  } else {
+    is_full = detector->is_data_disk_full();
+  }
+  return ret;
+}
+
 int ObAccessService::pre_check_lock(
     const share::ObLSID &ls_id,
     transaction::ObTxDesc &tx_desc,
@@ -546,6 +561,7 @@ int ObAccessService::check_write_allowed_(
 {
   int ret = OB_SUCCESS;
   bool is_out_of_mem = false;
+  bool is_disk_full = false;
   ObLS *ls = nullptr;
   ObLockID lock_id;
   ObLockParam lock_param;
@@ -559,6 +575,11 @@ int ObAccessService::check_write_allowed_(
   } else if (is_out_of_mem && !tablet_id.is_inner_tablet()) {
     ret = OB_TENANT_OUT_OF_MEM;
     LOG_WARN("this tenant is already out of memstore limit", K(ret), K_(tenant_id));
+  } else if (OB_FAIL(check_data_disk_full_(is_disk_full))) {
+    LOG_WARN("fail to check data disk full", K(ret));
+  } else if (is_disk_full) {
+    ret = OB_SERVER_OUTOF_DISK_SPACE;
+    LOG_WARN("data disk full, you should not do io now", K(ret));
   } else if (OB_FAIL(get_write_store_ctx_guard_(ls_id,
                                                 dml_param.timeout_,
                                                 tx_desc,
diff --git a/src/storage/tx_storage/ob_access_service.h b/src/storage/tx_storage/ob_access_service.h
index 8635dc798..6ede47978 100644
--- a/src/storage/tx_storage/ob_access_service.h
+++ b/src/storage/tx_storage/ob_access_service.h
@@ -198,6 +198,7 @@ public:
       int64_t &memtable_row_count) const;
 protected:
   int check_tenant_out_of_memstore_limit_(bool &is_out_of_mem);
+  int check_data_disk_full_(bool &is_full);
 
   int get_write_store_ctx_guard_(
       const share::ObLSID &ls_id,
diff --git a/src/storage/tx_storage/ob_tenant_freezer.cpp b/src/storage/tx_storage/ob_tenant_freezer.cpp
index d2583414c..9b48bf54e 100755
--- a/src/storage/tx_storage/ob_tenant_freezer.cpp
+++ b/src/storage/tx_storage/ob_tenant_freezer.cpp
@@ -917,7 +917,51 @@ bool ObTenantFreezer::is_replay_pending_log_too_large(const int64_t pending_size
   return bool_ret;
 }
 
-int ObTenantFreezer::get_tenant_memstore_cond(
+int ObTenantFreezer::get_tenant_memstore_used(int64_t &total_memstore_used,
+                                              const bool force_refresh)
+{
+  int ret = OB_SUCCESS;
+  int64_t unused_active_memstore_used = 0;
+  int64_t unused_memstore_freeze_trigger = 0;
+  int64_t unused_memstore_limit = 0;
+  int64_t unused_freeze_cnt = 0;
+  if (!is_inited_) {
+    ret = OB_NOT_INIT;
+    LOG_WARN("[TenantFreezer] tenant manager not init", KR(ret));
+  } else if (OB_FAIL(get_tenant_memstore_cond_(unused_active_memstore_used,
+                                               total_memstore_used,
+                                               unused_memstore_freeze_trigger,
+                                               unused_memstore_limit,
+                                               unused_freeze_cnt,
+                                               force_refresh))) {
+    LOG_WARN("get tenant memstore used failed", K(ret));
+  }
+  return ret;
+}
+
+int ObTenantFreezer::get_tenant_memstore_cond(int64_t &active_memstore_used,
+                                              int64_t &total_memstore_used,
+                                              int64_t &memstore_freeze_trigger,
+                                              int64_t &memstore_limit,
+                                              int64_t &freeze_cnt,
+                                              const bool force_refresh)
+{
+  int ret = OB_SUCCESS;
+  if (!is_inited_) {
+    ret = OB_NOT_INIT;
+    LOG_WARN("[TenantFreezer] tenant manager not init", KR(ret));
+  } else if (OB_FAIL(get_tenant_memstore_cond_(active_memstore_used,
+                                               total_memstore_used,
+                                               memstore_freeze_trigger,
+                                               memstore_limit,
+                                               freeze_cnt,
+                                               force_refresh))) {
+    LOG_WARN("get tenant memstore used failed", K(ret));
+  }
+  return ret;
+}
+
+int ObTenantFreezer::get_tenant_memstore_cond_(
     int64_t &active_memstore_used,
     int64_t &total_memstore_used,
     int64_t &memstore_freeze_trigger,
@@ -941,11 +985,8 @@ int ObTenantFreezer::get_tenant_memstore_cond(
   memstore_freeze_trigger = 0;
   memstore_limit = 0;
 
-  if (!is_inited_) {
-    ret = OB_NOT_INIT;
-    LOG_WARN("[TenantFreezer] tenant manager not init", KR(ret));
-  } else if (!force_refresh &&
-             current_time - last_refresh_timestamp < MEMSTORE_USED_CACHE_REFRESH_INTERVAL) {
+  if (!force_refresh &&
+      current_time - last_refresh_timestamp < MEMSTORE_USED_CACHE_REFRESH_INTERVAL) {
     active_memstore_used = last_active_memstore_used;
     total_memstore_used = last_total_memstore_used;
     memstore_freeze_trigger = last_memstore_freeze_trigger;
diff --git a/src/storage/tx_storage/ob_tenant_freezer.h b/src/storage/tx_storage/ob_tenant_freezer.h
index 9af7b8f24..5f3ada1d6 100755
--- a/src/storage/tx_storage/ob_tenant_freezer.h
+++ b/src/storage/tx_storage/ob_tenant_freezer.h
@@ -121,6 +121,9 @@ public:
                                int64_t &memstore_limit,
                                int64_t &freeze_cnt,
                                const bool force_refresh = true);
+  // get the tenant memstore used
+  int get_tenant_memstore_used(int64_t &total_memstore_used,
+                               const bool force_refresh = true);
   // get the tenant memstore limit.
   int get_tenant_memstore_limit(int64_t &mem_limit);
   // this is used to check if the tenant's memstore is out at user side.
@@ -149,6 +152,12 @@ public:
   static int64_t get_freeze_trigger_interval() { return FREEZE_TRIGGER_INTERVAL; }
   bool exist_ls_freezing();
 private:
+  int get_tenant_memstore_cond_(int64_t &active_memstore_used,
+                                int64_t &total_memstore_used,
+                                int64_t &memstore_freeze_trigger,
+                                int64_t &memstore_limit,
+                                int64_t &freeze_cnt,
+                                const bool force_refresh = true);
   int check_memstore_full_(bool &last_result,
                            int64_t &last_check_timestamp,
                            bool &is_out_of_mem,