diff --git a/src/storage/compaction/ob_compaction_diagnose.cpp b/src/storage/compaction/ob_compaction_diagnose.cpp
index d69edcb825..163108724a 100644
--- a/src/storage/compaction/ob_compaction_diagnose.cpp
+++ b/src/storage/compaction/ob_compaction_diagnose.cpp
@@ -379,6 +379,36 @@ int ObCompactionDiagnoseMgr::get_suspect_info(
   return ret;
 }
 
+// Look up the ls-level schedule suspect info (the entry keyed by ls_id and
+// ObTabletID(INT64_MAX)) for merge_type and, if one exists, append it to info_array_.
+// OB_HASH_NOT_EXIST from the lookup means "nothing to report" and is mapped to
+// OB_SUCCESS, so callers only see (and log) real failures.
+int ObCompactionDiagnoseMgr::diagnose_ls_merge(
+    const ObMergeType merge_type,
+    const ObLSID &ls_id)
+{
+  int ret = OB_SUCCESS;
+  ObScheduleSuspectInfo ret_info;
+  if (OB_FAIL(get_suspect_info(merge_type, ls_id, ObTabletID(INT64_MAX), ret_info))) {
+    if (OB_HASH_NOT_EXIST != ret) {
+      LOG_WARN("failed get ls merge suspect info", K(ret), K(ls_id));
+    } else {
+      ret = OB_SUCCESS; // no suspect info recorded for this ls, not an error
+    }
+  } else if (can_add_diagnose_info()) {
+    SET_DIAGNOSE_INFO(
+        info_array_[idx_++],
+        merge_type,
+        ret_info.tenant_id_,
+        ls_id,
+        ObTabletID(INT64_MAX),
+        ObCompactionDiagnoseInfo::DIA_STATUS_FAILED,
+        ret_info.add_time_,
+        "schedule_suspect_info", ret_info.suspect_info_);
+  }
+  return ret;
+}
+
 int ObCompactionDiagnoseMgr::diagnose_tenant_tablet()
 {
   int ret = OB_SUCCESS;
@@ -476,21 +506,13 @@ int ObCompactionDiagnoseMgr::diagnose_tenant_tablet()
               compaction_scn);
         }
         // check ls suspect info for memtable freezing
-        ObScheduleSuspectInfo ret_info;
-        if (OB_TMP_FAIL(get_suspect_info(MINI_MERGE, ls_id, ObTabletID(INT64_MAX), ret_info))) {
-          if (OB_HASH_NOT_EXIST != tmp_ret) {
-            LOG_WARN("failed get ls merge suspect info", K(tmp_ret), K(ls_id));
-          }
-        } else if (can_add_diagnose_info()) {
-          SET_DIAGNOSE_INFO(
-              info_array_[idx_++],
-              MINI_MERGE,
-              ret_info.tenant_id_,
-              ls_id,
-              ObTabletID(INT64_MAX),
-              ObCompactionDiagnoseInfo::DIA_STATUS_FAILED,
-              ret_info.add_time_,
-              "schedule_suspect_info", ret_info.suspect_info_);
+        if (OB_TMP_FAIL(diagnose_ls_merge(MINI_MERGE, ls_id))) {
+          LOG_WARN("failed to diagnose about memtable freezing", K(tmp_ret));
+        }
+
+        // check ls locality change and leader change
+        if (OB_TMP_FAIL(diagnose_ls_merge(MEDIUM_MERGE, ls_id))) {
+          LOG_WARN("failed to diagnose about ls locality change", K(tmp_ret));
         }
         ObLSTabletIterator tablet_iter(ObTabletCommon::NO_CHECK_GET_TABLET_TIMEOUT_US);
         ObLSVTInfo ls_info;
@@ -553,7 +575,7 @@ int ObCompactionDiagnoseMgr::diagnose_tenant_tablet()
             ObTabletID(INT64_MAX),
             ObCompactionDiagnoseInfo::DIA_STATUS_FINISH,
             ObTimeUtility::fast_current_time(),
-            "test: compaction has finished in storage, please check RS. compaction_scn", compaction_scn);
+            "compaction has finished in storage, please check RS. compaction_scn", compaction_scn);
         if (!abnormal_ls_id.empty()) {
           char * buf = info.diagnose_info_;
           const int64_t buf_len = common::OB_DIAGNOSE_INFO_LENGTH;
diff --git a/src/storage/compaction/ob_compaction_diagnose.h b/src/storage/compaction/ob_compaction_diagnose.h
index 61099b245d..58b0f5b796 100644
--- a/src/storage/compaction/ob_compaction_diagnose.h
+++ b/src/storage/compaction/ob_compaction_diagnose.h
@@ -130,6 +130,9 @@ public:
       ObDiagnoseTabletCompProgress &input_progress);
   static int check_system_compaction_config(char *tmp_str, const int64_t buf_len);
 private:
+  int diagnose_ls_merge(
+      const ObMergeType merge_type,
+      const ObLSID &ls_id);
   int diagnose_tablet_mini_merge(const ObLSID &ls_id, ObTablet &tablet);
   int diagnose_tablet_minor_merge(const ObLSID &ls_id, ObTablet &tablet);
   int diagnose_tablet_medium_merge(
@@ -209,20 +212,21 @@ private:
 
 #define DEL_SUSPECT_INFO(type, ls_id, tablet_id) \
 { \
+      int tmp_ret = OB_SUCCESS; \
       compaction::ObMergeDagHash dag_hash; \
       dag_hash.merge_type_ = type; \
       dag_hash.ls_id_ = ls_id; \
       dag_hash.tablet_id_ = tablet_id; \
       int64_t tenant_id = MTL_ID(); \
       int64_t hash_value = ObScheduleSuspectInfo::gen_hash(tenant_id, dag_hash.inner_hash()); \
-      if (OB_FAIL(ObScheduleSuspectInfoMgr::get_instance().del_suspect_info(hash_value))) { \
-        if (OB_HASH_NOT_EXIST != ret) { \
-          STORAGE_LOG(WARN, "failed to add suspect info", K(ret), K(dag_hash), K(tenant_id)); \
+      if (OB_TMP_FAIL(ObScheduleSuspectInfoMgr::get_instance().del_suspect_info(hash_value))) { \
+        if (OB_HASH_NOT_EXIST != tmp_ret) { \
+          STORAGE_LOG(WARN, "failed to del suspect info", K(tmp_ret), K(dag_hash), K(tenant_id)); \
         } else { \
-          ret = OB_SUCCESS; \
+          tmp_ret = OB_SUCCESS; \
         } \
       } else { \
-        STORAGE_LOG(DEBUG, "success to add suspect info", K(ret), K(dag_hash), K(tenant_id)); \
+        STORAGE_LOG(DEBUG, "success to del suspect info", K(tmp_ret), K(dag_hash), K(tenant_id)); \
       } \
 }
 
diff --git a/src/storage/compaction/ob_tenant_tablet_scheduler.cpp b/src/storage/compaction/ob_tenant_tablet_scheduler.cpp
index 8f4230a034..3338bb97f2 100755
--- a/src/storage/compaction/ob_tenant_tablet_scheduler.cpp
+++ b/src/storage/compaction/ob_tenant_tablet_scheduler.cpp
@@ -921,7 +921,6 @@ int ObTenantTabletScheduler::schedule_ls_minor_merge(
 int ObTenantTabletScheduler::schedule_ls_medium_merge(
     int64_t &merge_version,
     ObLSHandle &ls_handle,
-    bool &ls_merge_finish,
     bool &all_ls_weak_read_ts_ready,
     int64_t &schedule_tablet_cnt)
 {
@@ -968,6 +967,11 @@ int ObTenantTabletScheduler::schedule_ls_medium_merge(
         is_leader = true;
         if (OB_FAIL(ls_locality_cache_.get_ls_locality(ls_id, ls_locality))) {
           LOG_WARN("failed to get ls locality", K(ret), K(ls_id));
+        } else if (0 == ls_locality.svr_addr_list_.count()) {
+          ADD_SUSPECT_INFO(MEDIUM_MERGE, ls_id, ObTabletID(INT64_MAX),
+              "maybe bad case: locality change and leader change", K(ls_locality));
+        } else {
+          DEL_SUSPECT_INFO(MEDIUM_MERGE, ls_id, ObTabletID(INT64_MAX));
         }
       }
     } else {
@@ -1048,7 +1052,7 @@ int ObTenantTabletScheduler::schedule_ls_medium_merge(
             LOG_WARN("failed to schedule medium", K(tmp_ret), K(ls_id), K(tablet_id));
           }
         }
-        ls_merge_finish &= tablet_merge_finish;
+        medium_ls_tablet_iter_.update_merge_finish(tablet_merge_finish);
       }
     } // end of while
   } // else
@@ -1076,7 +1080,6 @@ int ObTenantTabletScheduler::schedule_all_tablets_medium()
   } else if (!medium_ls_tablet_iter_.is_valid() && OB_FAIL(medium_ls_tablet_iter_.build_iter())) {
     LOG_WARN("failed to init iterator", K(ret));
   } else {
-    bool tenant_merge_finish = true;
     bool all_ls_weak_read_ts_ready = true;
     bool check_report_scn_flag = false;
     int64_t merge_version = get_frozen_version();
@@ -1109,7 +1112,6 @@ int ObTenantTabletScheduler::schedule_all_tablets_medium()
 #endif
 
     while (OB_SUCC(ret) && schedule_tablet_cnt < SCHEDULE_TABLET_BATCH_CNT) {
-      bool ls_merge_finish = true;
       if (OB_FAIL(medium_ls_tablet_iter_.get_next_ls(ls_handle))) {
         if (OB_ITER_END == ret) {
           ret = OB_SUCCESS;
@@ -1121,18 +1123,16 @@ int ObTenantTabletScheduler::schedule_all_tablets_medium()
         ret = OB_ERR_UNEXPECTED;
         LOG_WARN("ls is null", K(ret), K(ls));
       } else if (OB_TMP_FAIL(schedule_ls_medium_merge(
-          merge_version, ls_handle, ls_merge_finish,
+          merge_version, ls_handle,
           all_ls_weak_read_ts_ready, schedule_tablet_cnt))) {
         medium_ls_tablet_iter_.skip_cur_ls(); // for any errno, skip cur ls
-        tenant_merge_finish = false;
+        medium_ls_tablet_iter_.update_merge_finish(false);
         if (OB_SIZE_OVERFLOW == tmp_ret) {
           break;
         } else if (!schedule_ignore_error(tmp_ret)) {
           LOG_WARN("failed to schedule ls merge", K(tmp_ret), KPC(ls));
         }
       } else {
-        tenant_merge_finish &= ls_merge_finish;
-
         // loop tablet_meta table to update smaller report_scn because of migration
         if (check_report_scn_flag) {
           (void) update_report_scn_as_ls_leader(*ls);
@@ -1140,9 +1140,9 @@ int ObTenantTabletScheduler::schedule_all_tablets_medium()
       }
     } // end while
 
-    if (!tenant_merge_finish) { // wait major compaction
+    if (!medium_ls_tablet_iter_.tenant_merge_finish()) { // wait major compaction
       if (all_ls_weak_read_ts_ready) { // check schedule Timer Task
-        if (schedule_stats_.add_weak_read_ts_event_flag_) {
+        if (schedule_stats_.add_weak_read_ts_event_flag_ && medium_ls_tablet_iter_.is_scan_finish()) { // all ls scan finish
           schedule_stats_.add_weak_read_ts_event_flag_ = false;
           ADD_COMPACTION_EVENT(
               MTL_ID(),
@@ -1159,7 +1159,7 @@ int ObTenantTabletScheduler::schedule_all_tablets_medium()
         }
       }
 
-      if (REACH_TENANT_TIME_INTERVAL(ADD_LOOP_EVENT_INTERVAL)) {
+      if (medium_ls_tablet_iter_.is_scan_finish() && REACH_TENANT_TIME_INTERVAL(ADD_LOOP_EVENT_INTERVAL)) {
         ADD_COMPACTION_EVENT(
             MTL_ID(),
             MAJOR_MERGE,
@@ -1171,7 +1171,7 @@ int ObTenantTabletScheduler::schedule_all_tablets_medium()
       }
     }
 
-    if (OB_SUCC(ret) && tenant_merge_finish && merge_version > merged_version_) {
+    if (OB_SUCC(ret) && medium_ls_tablet_iter_.tenant_merge_finish() && merge_version > merged_version_) {
       merged_version_ = merge_version;
       LOG_INFO("all tablet major merge finish", K(merged_version_), K(merge_version));
       DEL_SUSPECT_INFO(MEDIUM_MERGE, share::ObLSID(INT64_MAX), ObTabletID(INT64_MAX));
@@ -1194,9 +1194,12 @@ int ObTenantTabletScheduler::schedule_all_tablets_medium()
       reload_tenant_config(); // tenant merge finish, use tenant default config to loop
     }
 
-    LOG_INFO("finish schedule all tablet merge", K(merge_version), K(schedule_stats_), K(tenant_merge_finish),
+    LOG_INFO("finish schedule all tablet merge", K(merge_version), K(schedule_stats_),
+        "tenant_merge_finish", medium_ls_tablet_iter_.tenant_merge_finish(),
         K(merged_version_), K(schedule_tablet_cnt));
-    schedule_stats_.clear_tablet_cnt();
+    if (medium_ls_tablet_iter_.is_scan_finish()) {
+      schedule_stats_.clear_tablet_cnt();
+    }
   }
   return ret;
 }
@@ -1275,6 +1278,8 @@ int ObCompactionScheduleIterator::build_iter()
     ls_idx_ = -1;
     tablet_idx_ = 0;
     tablet_ids_.reuse();
+    scan_finish_ = false; // a new scan round starts here
+    merge_finish_ = true; // update_merge_finish() can only clear it from now on
     LOG_TRACE("build iter", K(ret), K(ls_ids_));
   }
   return ret;
@@ -1290,6 +1295,7 @@ int ObCompactionScheduleIterator::get_next_ls(ObLSHandle &ls_handle)
   }
   do {
     if (ls_idx_ >= ls_ids_.count()) {
+      scan_finish_ = true; // ls_idx_ ran past the last ls id of this round
       ret = OB_ITER_END;
     } else if (OB_FAIL((MTL(storage::ObLSService *)->get_ls(ls_ids_[ls_idx_], ls_handle, mod_)))) {
       if (OB_LS_NOT_EXIST == ret) {
@@ -1311,6 +1317,8 @@ void ObCompactionScheduleIterator::reset()
   tablet_idx_ = 0;
   ls_ids_.reuse();
   tablet_ids_.reuse();
+  scan_finish_ = false;
+  merge_finish_ = false;
 }
 
 bool ObCompactionScheduleIterator::is_valid() const
diff --git a/src/storage/compaction/ob_tenant_tablet_scheduler.h b/src/storage/compaction/ob_tenant_tablet_scheduler.h
index c4c7328170..e3c56f265e 100644
--- a/src/storage/compaction/ob_tenant_tablet_scheduler.h
+++ b/src/storage/compaction/ob_tenant_tablet_scheduler.h
@@ -66,6 +66,8 @@ public:
       const int64_t timeout_us = ObTabletCommon::DIRECT_GET_COMMITTED_TABLET_TIMEOUT_US)
     : mod_(mod),
       is_major_(is_major),
+      scan_finish_(false),
+      merge_finish_(false),
       timeout_us_(timeout_us),
       ls_idx_(0),
       tablet_idx_(0),
@@ -76,6 +78,14 @@ public:
   int build_iter();
   int get_next_ls(ObLSHandle &ls_handle);
   int get_next_tablet(ObLSHandle &ls_handle, ObTabletHandle &tablet_handle);
+  // true once get_next_ls() has walked past the last ls of the current round
+  bool is_scan_finish() const { return scan_finish_; }
+  // finished only if the whole round was scanned and nothing reported unfinished
+  bool tenant_merge_finish() const { return merge_finish_ && scan_finish_; }
+  // AND-accumulate: a single unfinished tablet/ls keeps the round unfinished
+  void update_merge_finish(bool merge_finish) {
+    merge_finish_ &= merge_finish;
+  }
   void reset();
   bool is_valid() const;
   void skip_cur_ls()
@@ -89,6 +99,8 @@ private:
   static const int64_t TABLET_ID_ARRAY_CNT = 2000;
   ObLSGetMod mod_;
   bool is_major_;
+  bool scan_finish_;
+  bool merge_finish_;
   int64_t timeout_us_;
   int64_t ls_idx_;
   uint64_t tablet_idx_;
@@ -219,7 +231,6 @@ private:
   int schedule_ls_medium_merge(
       int64_t &merge_version,
       ObLSHandle &ls_handle,
-      bool &ls_merge_finish,
       bool &all_ls_weak_read_ts_ready,
       int64_t &schedule_tablet_cnt);
   int schedule_ls_minor_merge(