enhance compaction diagnose

This commit is contained in:
a1iive
2023-04-03 19:45:01 +00:00
committed by ob-robot
parent 03809198e1
commit 33923859fc
8 changed files with 59 additions and 58 deletions

View File

@ -218,8 +218,7 @@ const char *ObCompactionDiagnoseInfo::ObDiagnoseStatusStr[DIA_STATUS_MAX] = {
"RUNNING", "RUNNING",
"FAILED", "FAILED",
"FINISH", "FINISH",
"RS_UNCOMPACTED", "RS_UNCOMPACTED"
"DIA_FAILED"
}; };
const char * ObCompactionDiagnoseInfo::get_diagnose_status_str(ObDiagnoseStatus status) const char * ObCompactionDiagnoseInfo::get_diagnose_status_str(ObDiagnoseStatus status)
@ -430,6 +429,7 @@ int ObCompactionDiagnoseMgr::diagnose_tenant_tablet()
bool tenant_major_finish = true; bool tenant_major_finish = true;
bool ls_major_finish = true; bool ls_major_finish = true;
bool tablet_major_finish = true; bool tablet_major_finish = true;
ObSEArray<ObLSID, 64> abnormal_ls_id;
while (OB_SUCC(ret) && can_add_diagnose_info()) { // loop all log_stream while (OB_SUCC(ret) && can_add_diagnose_info()) { // loop all log_stream
bool need_merge = false; bool need_merge = false;
if (OB_FAIL(ls_iter_guard.get_ptr()->get_next(ls))) { if (OB_FAIL(ls_iter_guard.get_ptr()->get_next(ls))) {
@ -481,17 +481,7 @@ int ObCompactionDiagnoseMgr::diagnose_tenant_tablet()
if (OB_FAIL(ls->get_ls_info(ls_info))) { if (OB_FAIL(ls->get_ls_info(ls_info))) {
LOG_WARN("failed to get ls info", K(ret), K(ls)); LOG_WARN("failed to get ls info", K(ret), K(ls));
} else if (MAX_LS_TABLET_CNT < ls_info.tablet_count_) { } else if (MAX_LS_TABLET_CNT < ls_info.tablet_count_) {
if (can_add_diagnose_info()) { LOG_INFO("there is too many tablets, skip diagnose in this ls", K(ls_id), "tablet_count", ls_info.tablet_count_);
SET_DIAGNOSE_INFO(
info_array_[idx_++],
MERGE_TYPE_MAX,
MTL_ID(),
ls_id,
ObTabletID(INT64_MAX),
ObCompactionDiagnoseInfo::DIA_STATUS_DIA_FAILED,
ObTimeUtility::fast_current_time(),
"there is too many tablets. tablet count", ls_info.tablet_count_);
}
} else if (OB_FAIL(ls->build_tablet_iter(tablet_iter))) { } else if (OB_FAIL(ls->build_tablet_iter(tablet_iter))) {
LOG_WARN("failed to build ls tablet iter", K(ret), K(ls)); LOG_WARN("failed to build ls tablet iter", K(ret), K(ls));
} else { } else {
@ -525,7 +515,7 @@ int ObCompactionDiagnoseMgr::diagnose_tenant_tablet()
if (OB_TMP_FAIL(diagnose_tablet_minor_merge(ls_id, *tablet_handle.get_obj()))) { if (OB_TMP_FAIL(diagnose_tablet_minor_merge(ls_id, *tablet_handle.get_obj()))) {
LOG_WARN("failed to get diagnose minor merge", K(tmp_ret)); LOG_WARN("failed to get diagnose minor merge", K(tmp_ret));
} }
if (OB_TMP_FAIL(diagnose_tablet_medium_merge(ls_id, *tablet_handle.get_obj()))) { if (OB_TMP_FAIL(diagnose_tablet_medium_merge(compaction_scn, ls_id, *tablet_handle.get_obj()))) {
LOG_WARN("failed to get diagnose medium merge", K(tmp_ret)); LOG_WARN("failed to get diagnose medium merge", K(tmp_ret));
} }
} }
@ -533,18 +523,27 @@ int ObCompactionDiagnoseMgr::diagnose_tenant_tablet()
tenant_major_finish &= ls_major_finish; tenant_major_finish &= ls_major_finish;
LOG_INFO("finish ls merge diagnose", K(ret), K(ls_id)); LOG_INFO("finish ls merge diagnose", K(ret), K(ls_id));
} }
} else {
(void)abnormal_ls_id.push_back(ls->get_ls_id());
} }
} // end of while } // end of while
if (diagnose_major_flag && tenant_major_finish && can_add_diagnose_info()) { if (diagnose_major_flag && tenant_major_finish && can_add_diagnose_info()) {
ObCompactionDiagnoseInfo &info = info_array_[idx_++];
SET_DIAGNOSE_INFO( SET_DIAGNOSE_INFO(
info_array_[idx_++], info,
MEDIUM_MERGE, MEDIUM_MERGE,
MTL_ID(), MTL_ID(),
share::ObLSID(INT64_MAX), share::ObLSID(INT64_MAX),
ObTabletID(INT64_MAX), ObTabletID(INT64_MAX),
ObCompactionDiagnoseInfo::DIA_STATUS_FINISH, ObCompactionDiagnoseInfo::DIA_STATUS_FINISH,
ObTimeUtility::fast_current_time(), ObTimeUtility::fast_current_time(),
"compaction has finished in storage. compaction_scn", compaction_scn); "test: compaction has finished in storage, please check RS. compaction_scn", compaction_scn);
if (!abnormal_ls_id.empty()) {
char * buf = info.diagnose_info_;
const int64_t buf_len = common::OB_DIAGNOSE_INFO_LENGTH;
ADD_COMPACTION_INFO_PARAM(buf, buf_len,
"some ls may offline", abnormal_ls_id);
}
} }
} }
} }
@ -733,6 +732,7 @@ int ObCompactionDiagnoseMgr::diagnose_tablet_minor_merge(const ObLSID &ls_id, Ob
} }
int ObCompactionDiagnoseMgr::diagnose_tablet_medium_merge( int ObCompactionDiagnoseMgr::diagnose_tablet_medium_merge(
const int64_t compaction_scn,
const ObLSID &ls_id, const ObLSID &ls_id,
ObTablet &tablet) ObTablet &tablet)
{ {
@ -763,7 +763,7 @@ int ObCompactionDiagnoseMgr::diagnose_tablet_medium_merge(
"tablet_snapshot", tablet.get_snapshot_version()))) { "tablet_snapshot", tablet.get_snapshot_version()))) {
LOG_WARN("failed to add diagnose info", K(ret), K(ls_id), K(tablet_id)); LOG_WARN("failed to add diagnose info", K(ret), K(ls_id), K(tablet_id));
} }
} else { } else if (max_sync_medium_scn != compaction_scn) {
ObTabletMajorMergeDag dag; ObTabletMajorMergeDag dag;
if (OB_FAIL(diagnose_tablet_merge( if (OB_FAIL(diagnose_tablet_merge(
dag, dag,
@ -851,6 +851,7 @@ int ObCompactionDiagnoseMgr::diagnose_tablet_merge(
} else if (progress.is_valid()) { // dag exist, means compaction is running } else if (progress.is_valid()) { // dag exist, means compaction is running
// check progress is normal // check progress is normal
if (progress.is_suspect_abormal_) { // progress is abnomal if (progress.is_suspect_abormal_) { // progress is abnomal
const char* current_status = progress.is_waiting_schedule_ ? "dag may be waiting for schedule" : "dag may hang";
if (can_add_diagnose_info() if (can_add_diagnose_info()
&& OB_FAIL(SET_DIAGNOSE_INFO( && OB_FAIL(SET_DIAGNOSE_INFO(
info_array_[idx_++], info_array_[idx_++],
@ -860,7 +861,7 @@ int ObCompactionDiagnoseMgr::diagnose_tablet_merge(
tablet_id, tablet_id,
ObCompactionDiagnoseInfo::DIA_STATUS_RUNNING, ObCompactionDiagnoseInfo::DIA_STATUS_RUNNING,
ObTimeUtility::fast_current_time(), ObTimeUtility::fast_current_time(),
"current_status", "dag may hang", K(current_status),
"merge_progress", progress))) { "merge_progress", progress))) {
LOG_WARN("failed to add diagnose info", K(ret), K(ls_id), K(tablet_id), K(progress)); LOG_WARN("failed to add diagnose info", K(ret), K(ls_id), K(tablet_id), K(progress));
} }

View File

@ -94,7 +94,6 @@ struct ObCompactionDiagnoseInfo
DIA_STATUS_FAILED = 2, DIA_STATUS_FAILED = 2,
DIA_STATUS_FINISH = 3, DIA_STATUS_FINISH = 3,
DIA_STATUS_RS_UNCOMPACTED = 4, // RS diagnose DIA_STATUS_RS_UNCOMPACTED = 4, // RS diagnose
DIA_STATUS_DIA_FAILED = 5,
DIA_STATUS_MAX DIA_STATUS_MAX
}; };
const static char *ObDiagnoseStatusStr[DIA_STATUS_MAX]; const static char *ObDiagnoseStatusStr[DIA_STATUS_MAX];
@ -134,6 +133,7 @@ private:
int diagnose_tablet_mini_merge(const ObLSID &ls_id, ObTablet &tablet); int diagnose_tablet_mini_merge(const ObLSID &ls_id, ObTablet &tablet);
int diagnose_tablet_minor_merge(const ObLSID &ls_id, ObTablet &tablet); int diagnose_tablet_minor_merge(const ObLSID &ls_id, ObTablet &tablet);
int diagnose_tablet_medium_merge( int diagnose_tablet_medium_merge(
const int64_t compaction_scn,
const ObLSID &ls_id, const ObLSID &ls_id,
ObTablet &tablet); ObTablet &tablet);
int diagnose_tablet_major_merge( int diagnose_tablet_major_merge(

View File

@ -45,6 +45,7 @@ ObPartitionMergeProgress::ObPartitionMergeProgress(common::ObIAllocator &allocat
pre_scanned_row_cnt_(0), pre_scanned_row_cnt_(0),
pre_output_block_cnt_(0), pre_output_block_cnt_(0),
is_updating_(false), is_updating_(false),
is_waiting_schedule_(true),
is_inited_(false) is_inited_(false)
{ {
} }
@ -72,6 +73,7 @@ void ObPartitionMergeProgress::reset()
pre_output_block_cnt_ = 0; pre_output_block_cnt_ = 0;
concurrent_cnt_ = 0; concurrent_cnt_ = 0;
is_updating_ = false; is_updating_ = false;
is_waiting_schedule_ = true;
} }
@ -112,6 +114,7 @@ int ObPartitionMergeProgress::init(ObTabletMergeCtx *ctx, const ObTableReadInfo
MEMSET(buf, 0, sizeof(int64_t) * concurrent_cnt * 2); MEMSET(buf, 0, sizeof(int64_t) * concurrent_cnt * 2);
scanned_row_cnt_arr_ = buf; scanned_row_cnt_arr_ = buf;
output_block_cnt_arr_ = buf + concurrent_cnt; output_block_cnt_arr_ = buf + concurrent_cnt;
is_waiting_schedule_ = false;
concurrent_cnt_ = concurrent_cnt; concurrent_cnt_ = concurrent_cnt;
merge_dag_ = merge_dag; merge_dag_ = merge_dag;
@ -224,21 +227,6 @@ int ObPartitionMergeProgress::estimate(ObTabletMergeCtx *ctx)
return ret; return ret;
} }
int ObPartitionMergeProgress::update_row_count(const int64_t idx, const int64_t incre_row_cnt)
{
int ret = OB_SUCCESS;
if (IS_NOT_INIT) {
ret = OB_NOT_INIT;
LOG_WARN("ObPartitionMergeProgress not inited", K(ret));
} else if (incre_row_cnt > 0) {
scanned_row_cnt_arr_[idx] += incre_row_cnt;
if (REACH_TENANT_TIME_INTERVAL(UPDATE_INTERVAL)) {
latest_update_ts_ = ObTimeUtility::fast_current_time();
}
}
return ret;
}
int ObPartitionMergeProgress::update_merge_progress( int ObPartitionMergeProgress::update_merge_progress(
const int64_t idx, const int64_t idx,
const int64_t scanned_row_cnt, const int64_t scanned_row_cnt,
@ -254,7 +242,6 @@ int ObPartitionMergeProgress::update_merge_progress(
} else if (scanned_row_cnt > scanned_row_cnt_arr_[idx] || output_block_cnt > output_block_cnt_arr_[idx]) { } else if (scanned_row_cnt > scanned_row_cnt_arr_[idx] || output_block_cnt > output_block_cnt_arr_[idx]) {
scanned_row_cnt_arr_[idx] = MAX(scanned_row_cnt_arr_[idx], scanned_row_cnt); scanned_row_cnt_arr_[idx] = MAX(scanned_row_cnt_arr_[idx], scanned_row_cnt);
output_block_cnt_arr_[idx] = MAX(output_block_cnt_arr_[idx], output_block_cnt); output_block_cnt_arr_[idx] = MAX(output_block_cnt_arr_[idx], output_block_cnt);
if (REACH_TENANT_TIME_INTERVAL(UPDATE_INTERVAL)) { if (REACH_TENANT_TIME_INTERVAL(UPDATE_INTERVAL)) {
if (!ATOMIC_CAS(&is_updating_, false, true)) { if (!ATOMIC_CAS(&is_updating_, false, true)) {
latest_update_ts_ = ObTimeUtility::fast_current_time(); latest_update_ts_ = ObTimeUtility::fast_current_time();
@ -351,7 +338,9 @@ int ObPartitionMergeProgress::diagnose_progress(ObDiagnoseTabletCompProgress &in
int ret = OB_SUCCESS; int ret = OB_SUCCESS;
if (ObTimeUtility::fast_current_time() - latest_update_ts_ > UPDATE_INTERVAL * NORMAL_UPDATE_PARAM) { if (ObTimeUtility::fast_current_time() - latest_update_ts_ > UPDATE_INTERVAL * NORMAL_UPDATE_PARAM) {
input_progress.is_suspect_abormal_ = true; input_progress.is_suspect_abormal_ = true;
input_progress.is_waiting_schedule_ = is_waiting_schedule_;
} }
input_progress.latest_update_ts_ = latest_update_ts_;
return ret; return ret;
} }

View File

@ -37,7 +37,6 @@ public:
void reset(); void reset();
OB_INLINE bool is_inited() const { return is_inited_; } OB_INLINE bool is_inited() const { return is_inited_; }
int init(ObTabletMergeCtx *ctx, const storage::ObTableReadInfo &read_info); int init(ObTabletMergeCtx *ctx, const storage::ObTableReadInfo &read_info);
int update_row_count(const int64_t idx, const int64_t incre_row_cnt);
virtual int update_merge_progress(const int64_t idx, const int64_t scanned_row_count, const int64_t output_block_cnt); virtual int update_merge_progress(const int64_t idx, const int64_t scanned_row_count, const int64_t output_block_cnt);
virtual int finish_merge_progress(const int64_t output_cnt); virtual int finish_merge_progress(const int64_t output_cnt);
int update_merge_info(storage::ObSSTableMergeInfo &merge_info); int update_merge_info(storage::ObSSTableMergeInfo &merge_info);
@ -73,6 +72,7 @@ protected:
int64_t pre_scanned_row_cnt_; // for smooth the progress curve int64_t pre_scanned_row_cnt_; // for smooth the progress curve
int64_t pre_output_block_cnt_; int64_t pre_output_block_cnt_;
bool is_updating_; // atomic lock bool is_updating_; // atomic lock
bool is_waiting_schedule_;
bool is_inited_; bool is_inited_;
}; };

View File

@ -251,7 +251,7 @@ int ObBasicTabletMergeDag::get_tablet_and_compat_mode()
++inc_sstable_cnt; ++inc_sstable_cnt;
} }
if (OB_SUCC(ret) && inc_sstable_cnt >= MAX_SSTABLE_CNT_IN_STORAGE) { if (OB_SUCC(ret) && inc_sstable_cnt >= MAX_SSTABLE_CNT_IN_STORAGE) {
ret = OB_EAGAIN; ret = OB_TOO_MANY_SSTABLE;
LOG_WARN("Too many sstables in tablet, cannot schdule mini compaction, retry later", LOG_WARN("Too many sstables in tablet, cannot schdule mini compaction, retry later",
K(ret), K_(ls_id), K_(tablet_id), K(inc_sstable_cnt), K(tmp_tablet_handle.get_obj())); K(ret), K_(ls_id), K_(tablet_id), K(inc_sstable_cnt), K(tmp_tablet_handle.get_obj()));
} }

View File

@ -120,8 +120,10 @@ void ObDiagnoseTabletCompProgress::reset()
{ {
ObCompactionProgress::reset(); ObCompactionProgress::reset();
is_suspect_abormal_ = false; is_suspect_abormal_ = false;
is_waiting_schedule_ = false;
dag_id_.reset(); dag_id_.reset();
create_time_ = 0; create_time_ = 0;
latest_update_ts_ = 0;
base_version_ = 0; base_version_ = 0;
snapshot_version_ = 0; snapshot_version_ = 0;
} }
@ -398,6 +400,9 @@ int ObTenantCompactionProgressMgr::update_progress(
} else if (array_[pos].estimated_finish_time_ < estimate_finish_time) { } else if (array_[pos].estimated_finish_time_ < estimate_finish_time) {
array_[pos].estimated_finish_time_ = estimate_finish_time; array_[pos].estimated_finish_time_ = estimate_finish_time;
} }
if (ObPartitionMergeProgress::MAX_ESTIMATE_SPEND_TIME < array_[pos].estimated_finish_time_ - array_[pos].start_time_) {
array_[pos].estimated_finish_time_ = array_[pos].start_time_ + ObPartitionMergeProgress::MAX_ESTIMATE_SPEND_TIME;
}
LOG_DEBUG("success to update progress", K(ret), K(total_data_size_delta), K(output_block_cnt_delta), LOG_DEBUG("success to update progress", K(ret), K(total_data_size_delta), K(output_block_cnt_delta),
K(scanned_data_size_delta), K(array_[pos])); K(scanned_data_size_delta), K(array_[pos]));
} }

View File

@ -179,6 +179,7 @@ struct ObDiagnoseTabletCompProgress : public ObCompactionProgress
ObDiagnoseTabletCompProgress() ObDiagnoseTabletCompProgress()
: ObCompactionProgress(), : ObCompactionProgress(),
is_suspect_abormal_(false), is_suspect_abormal_(false),
is_waiting_schedule_(false),
dag_id_(), dag_id_(),
create_time_(0), create_time_(0),
latest_update_ts_(0), latest_update_ts_(0),
@ -188,10 +189,11 @@ struct ObDiagnoseTabletCompProgress : public ObCompactionProgress
} }
bool is_valid() const; bool is_valid() const;
void reset(); void reset();
INHERIT_TO_STRING_KV("ObCompactionProgress", ObCompactionProgress, K_(is_suspect_abormal), K_(create_time), INHERIT_TO_STRING_KV("ObCompactionProgress", ObCompactionProgress, K_(is_suspect_abormal), K_(is_waiting_schedule),
K_(dag_id), K_(base_version), K_(snapshot_version), K_(status)); K_(create_time), K_(latest_update_ts), K_(dag_id), K_(base_version), K_(snapshot_version), K_(status));
bool is_suspect_abormal_; bool is_suspect_abormal_;
bool is_waiting_schedule_;
share::ObDagId dag_id_; share::ObDagId dag_id_;
int64_t create_time_; int64_t create_time_;
int64_t latest_update_ts_; int64_t latest_update_ts_;

View File

@ -2089,19 +2089,6 @@ int ObMemtable::flush(share::ObLSID ls_id)
if (is_flushed_) { if (is_flushed_) {
ret = OB_NO_NEED_UPDATE; ret = OB_NO_NEED_UPDATE;
} else { } else {
if (mt_stat_.create_flush_dag_time_ == 0 &&
mt_stat_.ready_for_flush_time_ != 0 &&
cur_time - mt_stat_.ready_for_flush_time_ > 30 * 1000 * 1000) {
STORAGE_LOG(WARN, "memtable can not create dag successfully for long time",
K(ls_id), K(*this), K(mt_stat_.ready_for_flush_time_));
ADD_SUSPECT_INFO(MINI_MERGE,
ls_id, get_tablet_id(),
"memtable can not create dag successfully",
"has been ready for flush time:",
cur_time - mt_stat_.ready_for_flush_time_,
"ready for flush time:",
mt_stat_.ready_for_flush_time_);
}
ObTabletMergeDagParam param; ObTabletMergeDagParam param;
param.ls_id_ = ls_id; param.ls_id_ = ls_id;
param.tablet_id_ = key_.tablet_id_; param.tablet_id_ = key_.tablet_id_;
@ -2116,6 +2103,23 @@ int ObMemtable::flush(share::ObLSID ls_id)
mt_stat_.create_flush_dag_time_ = cur_time; mt_stat_.create_flush_dag_time_ = cur_time;
TRANS_LOG(INFO, "schedule tablet merge dag successfully", K(ret), K(param), KPC(this)); TRANS_LOG(INFO, "schedule tablet merge dag successfully", K(ret), K(param), KPC(this));
} }
if (OB_FAIL(ret) && mt_stat_.create_flush_dag_time_ == 0 &&
mt_stat_.ready_for_flush_time_ != 0 &&
cur_time - mt_stat_.ready_for_flush_time_ > 30 * 1000 * 1000) {
STORAGE_LOG(WARN, "memtable can not create dag successfully for long time",
K(ls_id), K(*this), K(mt_stat_.ready_for_flush_time_));
const char *str_user_error = ob_errpkt_str_user_error(ret, false);
ADD_SUSPECT_INFO(MINI_MERGE,
ls_id, get_tablet_id(),
"memtable can not create dag successfully",
"extra info",
str_user_error,
"has been ready for flush time:",
cur_time - mt_stat_.ready_for_flush_time_,
"ready for flush time:",
mt_stat_.ready_for_flush_time_);
}
} }
return ret; return ret;