Enhance compaction diagnosis

This commit is contained in:
a1iive
2023-03-14 14:11:06 +00:00
committed by ob-robot
parent f08a873f6a
commit 8794de5125
7 changed files with 76 additions and 55 deletions

View File

@ -30,13 +30,14 @@ using namespace oceanbase::common::sqlclient;
// update status of all rows // update status of all rows
int ObTabletMetaTableCompactionOperator::set_info_status( int ObTabletMetaTableCompactionOperator::set_info_status(
const ObTabletCompactionScnInfo &input_info, const ObTabletCompactionScnInfo &input_info,
ObTabletCompactionScnInfo &ret_info) ObTabletCompactionScnInfo &ret_info,
int64_t &affected_rows)
{ {
int ret = OB_SUCCESS; int ret = OB_SUCCESS;
ObMySQLTransaction trans; ObMySQLTransaction trans;
ObSqlString sql; ObSqlString sql;
ObDMLSqlSplicer dml; ObDMLSqlSplicer dml;
int64_t affected_rows = 0; affected_rows = 0;
if (OB_UNLIKELY(!input_info.is_valid())) { if (OB_UNLIKELY(!input_info.is_valid())) {
ret = OB_INVALID_ARGUMENT; ret = OB_INVALID_ARGUMENT;
LOG_WARN("invalid argument", K(ret), K(input_info)); LOG_WARN("invalid argument", K(ret), K(input_info));
@ -46,6 +47,8 @@ int ObTabletMetaTableCompactionOperator::set_info_status(
LOG_WARN("fail to start transaction", KR(ret), K(input_info), K(meta_tenant_id)); LOG_WARN("fail to start transaction", KR(ret), K(input_info), K(meta_tenant_id));
} else if (OB_FAIL(do_select(trans, true/*select_with_update*/, input_info, ret_info))) { } else if (OB_FAIL(do_select(trans, true/*select_with_update*/, input_info, ret_info))) {
LOG_WARN("failed to do select", K(ret), K(input_info)); LOG_WARN("failed to do select", K(ret), K(input_info));
} else if (ObTabletReplica::ScnStatus::SCN_STATUS_ERROR == ret_info.status_) {
// do nothing
} else if (OB_FAIL(dml.add_pk_column("tenant_id", input_info.tenant_id_)) } else if (OB_FAIL(dml.add_pk_column("tenant_id", input_info.tenant_id_))
|| OB_FAIL(dml.add_pk_column("ls_id", input_info.ls_id_)) || OB_FAIL(dml.add_pk_column("ls_id", input_info.ls_id_))
|| OB_FAIL(dml.add_pk_column("tablet_id", input_info.tablet_id_)) || OB_FAIL(dml.add_pk_column("tablet_id", input_info.tablet_id_))
@ -87,38 +90,6 @@ int ObTabletMetaTableCompactionOperator::get_status(
return ret; return ret;
} }
// Counts the rows of OB_ALL_TABLET_META_TABLE_TNAME that match @tenant_id and whose
// status equals ObTabletReplica::SCN_STATUS_ERROR, and stores the count in
// @error_tablet_cnt.
//
// @param [in]  tenant_id         value used in the tenant_id predicate; the query itself
//                                is sent to gen_meta_tenant_id(tenant_id)
// @param [out] error_tablet_cnt  receives the "c" column of the result; it is only
//                                written by the final get_int() call
// @return OB_SUCCESS on success, OB_ERR_UNEXPECTED when GCTX.sql_proxy_ or the query
//         result is null, otherwise the error code of the call that failed
int ObTabletMetaTableCompactionOperator::diagnose_compaction_scn(
    const int64_t tenant_id,
    int64_t &error_tablet_cnt)
{
  int ret = OB_SUCCESS;
  if (OB_ISNULL(GCTX.sql_proxy_)) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("sql proxy is unexpected null", K(ret));
  } else {
    // The query runs only in this branch, so a null proxy is never dereferenced.
    SMART_VAR(ObMySQLProxy::MySQLResult, res) {
      ObMySQLResult *result = nullptr;
      ObSqlString sql;
      const uint64_t meta_tenant_id = gen_meta_tenant_id(tenant_id);
      if (OB_FAIL(sql.append_fmt(
          "SELECT count(1) as c FROM %s WHERE tenant_id = '%ld' AND status = '%ld'",
          OB_ALL_TABLET_META_TABLE_TNAME,
          tenant_id,
          static_cast<int64_t>(ObTabletReplica::SCN_STATUS_ERROR)))) {
        LOG_WARN("failed to append fmt", K(ret), K(tenant_id));
      } else if (OB_FAIL(GCTX.sql_proxy_->read(res, meta_tenant_id, sql.ptr()))) {
        LOG_WARN("fail to do read", KR(ret), K(meta_tenant_id), K(sql.ptr()));
      } else if (OB_ISNULL(result = res.get_result())) {
        ret = OB_ERR_UNEXPECTED;
        LOG_WARN("fail to get result", KR(ret), K(meta_tenant_id), K(sql.ptr()));
      // NOTE(review): the row cursor is not advanced before get_int(); confirm whether
      // ObMySQLResult needs a next() call before column access.
      } else if (OB_FAIL(result->get_int("c", error_tablet_cnt))) {
        LOG_WARN("failed to get int", KR(ret));
      }
    }
  }
  return ret;
}
void ObTabletMetaTableCompactionOperator::handle_trans_stat( void ObTabletMetaTableCompactionOperator::handle_trans_stat(
ObMySQLTransaction &trans, ObMySQLTransaction &trans,
int &ret) int &ret)

View File

@ -88,13 +88,11 @@ class ObTabletMetaTableCompactionOperator
public: public:
static int set_info_status( static int set_info_status(
const ObTabletCompactionScnInfo &input_info, const ObTabletCompactionScnInfo &input_info,
ObTabletCompactionScnInfo &ret_info); ObTabletCompactionScnInfo &ret_info,
int64_t &affected_rows);
static int get_status( static int get_status(
const ObTabletCompactionScnInfo &input_info, const ObTabletCompactionScnInfo &input_info,
ObTabletCompactionScnInfo &ret_info); ObTabletCompactionScnInfo &ret_info);
static int diagnose_compaction_scn(
const int64_t tenant_id,
int64_t &error_tablet_cnt);
// update report_scn of all tablets which belong to @tablet_pairs // update report_scn of all tablets which belong to @tablet_pairs
static int batch_update_report_scn( static int batch_update_report_scn(
const uint64_t tenant_id, const uint64_t tenant_id,

View File

@ -217,7 +217,9 @@ const char *ObCompactionDiagnoseInfo::ObDiagnoseStatusStr[DIA_STATUS_MAX] = {
"NOT_SCHEDULE", "NOT_SCHEDULE",
"RUNNING", "RUNNING",
"FAILED", "FAILED",
"UNCOMPACTED", "FINISH",
"RS_UNCOMPACTED",
"DIA_FAILED"
}; };
const char * ObCompactionDiagnoseInfo::get_diagnose_status_str(ObDiagnoseStatus status) const char * ObCompactionDiagnoseInfo::get_diagnose_status_str(ObDiagnoseStatus status)
@ -381,9 +383,9 @@ int ObCompactionDiagnoseMgr::diagnose_tenant_tablet()
int64_t compaction_scn = MAX(scheduler->get_frozen_version(), MTL(ObTenantFreezeInfoMgr*)->get_latest_frozen_version()); int64_t compaction_scn = MAX(scheduler->get_frozen_version(), MTL(ObTenantFreezeInfoMgr*)->get_latest_frozen_version());
ObTenantFreezeInfoMgr::FreezeInfo freeze_info; ObTenantFreezeInfoMgr::FreezeInfo freeze_info;
if (compaction_scn > scheduler->get_merged_version()) { // check major merge if (compaction_scn > scheduler->get_inner_table_merged_scn()) { // check major merge
diagnose_major_flag = true; diagnose_major_flag = true;
const int64_t merged_version = scheduler->get_merged_version(); const int64_t merged_version = scheduler->get_inner_table_merged_scn();
if (merged_version == ObTenantTabletScheduler::INIT_COMPACTION_SCN) { if (merged_version == ObTenantTabletScheduler::INIT_COMPACTION_SCN) {
// do nothing // do nothing
} else if (OB_TMP_FAIL(MTL(ObTenantFreezeInfoMgr *)->get_freeze_info_behind_snapshot_version(merged_version, freeze_info))) { } else if (OB_TMP_FAIL(MTL(ObTenantFreezeInfoMgr *)->get_freeze_info_behind_snapshot_version(merged_version, freeze_info))) {
@ -422,6 +424,9 @@ int ObCompactionDiagnoseMgr::diagnose_tenant_tablet()
} }
} }
bool tenant_major_finish = true;
bool ls_major_finish = true;
bool tablet_major_finish = true;
while (OB_SUCC(ret) && can_add_diagnose_info()) { // loop all log_stream while (OB_SUCC(ret) && can_add_diagnose_info()) { // loop all log_stream
bool need_merge = false; bool need_merge = false;
if (OB_FAIL(ls_iter_guard.get_ptr()->get_next(ls))) { if (OB_FAIL(ls_iter_guard.get_ptr()->get_next(ls))) {
@ -466,9 +471,25 @@ int ObCompactionDiagnoseMgr::diagnose_tenant_tablet()
"schedule_suspect_info", ret_info.suspect_info_); "schedule_suspect_info", ret_info.suspect_info_);
} }
ObLSTabletIterator tablet_iter(ObTabletCommon::NO_CHECK_GET_TABLET_TIMEOUT_US); ObLSTabletIterator tablet_iter(ObTabletCommon::NO_CHECK_GET_TABLET_TIMEOUT_US);
if (OB_FAIL(ls->build_tablet_iter(tablet_iter))) { ObLSVTInfo ls_info;
if (OB_FAIL(ls->get_ls_info(ls_info))) {
LOG_WARN("failed to get ls info", K(ret), K(ls));
} else if (MAX_LS_TABLET_CNT < ls_info.tablet_count_) {
if (can_add_diagnose_info()) {
SET_DIAGNOSE_INFO(
info_array_[idx_++],
MERGE_TYPE_MAX,
MTL_ID(),
ls_id,
ObTabletID(INT64_MAX),
ObCompactionDiagnoseInfo::DIA_STATUS_DIA_FAILED,
ObTimeUtility::fast_current_time(),
"there is too many tablets. tablet count", ls_info.tablet_count_);
}
} else if (OB_FAIL(ls->build_tablet_iter(tablet_iter))) {
LOG_WARN("failed to build ls tablet iter", K(ret), K(ls)); LOG_WARN("failed to build ls tablet iter", K(ret), K(ls));
} else { } else {
ls_major_finish = true;
ObTabletHandle tablet_handle; ObTabletHandle tablet_handle;
while (OB_SUCC(ret) && can_add_diagnose_info()) { // loop all tablets in ls while (OB_SUCC(ret) && can_add_diagnose_info()) { // loop all tablets in ls
if (OB_FAIL(tablet_iter.get_next_tablet(tablet_handle))) { if (OB_FAIL(tablet_iter.get_next_tablet(tablet_handle))) {
@ -487,9 +508,11 @@ int ObCompactionDiagnoseMgr::diagnose_tenant_tablet()
&& OB_TMP_FAIL(diagnose_tablet_major_merge( && OB_TMP_FAIL(diagnose_tablet_major_merge(
compaction_scn, compaction_scn,
ls_id, ls_id,
*tablet_handle.get_obj()))) { *tablet_handle.get_obj(),
tablet_major_finish))) {
LOG_WARN("failed to get diagnose major merge", K(tmp_ret)); LOG_WARN("failed to get diagnose major merge", K(tmp_ret));
} }
ls_major_finish &= tablet_major_finish;
if (OB_TMP_FAIL(diagnose_tablet_mini_merge(ls_id, *tablet_handle.get_obj()))) { if (OB_TMP_FAIL(diagnose_tablet_mini_merge(ls_id, *tablet_handle.get_obj()))) {
LOG_WARN("failed to get diagnose mini merge", K(tmp_ret)); LOG_WARN("failed to get diagnose mini merge", K(tmp_ret));
} }
@ -501,10 +524,22 @@ int ObCompactionDiagnoseMgr::diagnose_tenant_tablet()
} }
} }
} // end of while } // end of while
tenant_major_finish &= ls_major_finish;
LOG_INFO("finish ls merge diagnose", K(ret), K(ls_id)); LOG_INFO("finish ls merge diagnose", K(ret), K(ls_id));
} }
} }
} // end of while } // end of while
if (diagnose_major_flag && tenant_major_finish && can_add_diagnose_info()) {
SET_DIAGNOSE_INFO(
info_array_[idx_++],
MEDIUM_MERGE,
MTL_ID(),
share::ObLSID(INT64_MAX),
ObTabletID(INT64_MAX),
ObCompactionDiagnoseInfo::DIA_STATUS_FINISH,
ObTimeUtility::fast_current_time(),
"compaction has finished in storage. compaction_scn", compaction_scn);
}
} }
} }
return ret; return ret;
@ -593,7 +628,7 @@ int ObCompactionDiagnoseMgr::do_tenant_major_merge_diagnose(
info_array_[idx_++], MAJOR_MERGE, MTL_ID(), info_array_[idx_++], MAJOR_MERGE, MTL_ID(),
uncompacted_tablets.at(i).get_ls_id(), uncompacted_tablets.at(i).get_ls_id(),
uncompacted_tablets.at(i).get_tablet_id(), uncompacted_tablets.at(i).get_tablet_id(),
ObCompactionDiagnoseInfo::DIA_STATUS_UNCOMPACTED, ObCompactionDiagnoseInfo::DIA_STATUS_RS_UNCOMPACTED,
ObTimeUtility::fast_current_time(), "server", ObTimeUtility::fast_current_time(), "server",
uncompacted_tablets.at(i).get_server(), "status", status, uncompacted_tablets.at(i).get_server(), "status", status,
"frozen_scn", frozen_scn, "frozen_scn", frozen_scn,
@ -738,9 +773,11 @@ int ObCompactionDiagnoseMgr::diagnose_tablet_medium_merge(
int ObCompactionDiagnoseMgr::diagnose_tablet_major_merge( int ObCompactionDiagnoseMgr::diagnose_tablet_major_merge(
const int64_t compaction_scn, const int64_t compaction_scn,
const ObLSID &ls_id, const ObLSID &ls_id,
ObTablet &tablet) ObTablet &tablet,
bool &tablet_major_finish)
{ {
int ret = OB_SUCCESS; int ret = OB_SUCCESS;
tablet_major_finish = true;
const ObTabletTableStore &table_store = tablet.get_table_store(); const ObTabletTableStore &table_store = tablet.get_table_store();
const ObTabletID &tablet_id = tablet.get_tablet_meta().tablet_id_; const ObTabletID &tablet_id = tablet.get_tablet_meta().tablet_id_;
const ObMergeType merge_type = MEDIUM_MERGE; const ObMergeType merge_type = MEDIUM_MERGE;
@ -756,6 +793,7 @@ int ObCompactionDiagnoseMgr::diagnose_tablet_major_merge(
int tmp_ret = OB_SUCCESS; int tmp_ret = OB_SUCCESS;
if (nullptr == latest_major_sstable if (nullptr == latest_major_sstable
|| latest_major_sstable->get_snapshot_version() < compaction_scn) { || latest_major_sstable->get_snapshot_version() < compaction_scn) {
tablet_major_finish = false;
if (max_sync_medium_scn < compaction_scn) { if (max_sync_medium_scn < compaction_scn) {
if (can_add_diagnose_info() if (can_add_diagnose_info()
&& ObTimeUtility::fast_current_time() > compaction_scn + WAIT_MEDIUM_SCHEDULE_INTERVAL && ObTimeUtility::fast_current_time() > compaction_scn + WAIT_MEDIUM_SCHEDULE_INTERVAL
@ -940,10 +978,8 @@ int ObCompactionDiagnoseMgr::diagnose_no_dag(
int ObCompactionDiagnoseMgr::diagnose_medium_scn_table(const int64_t compaction_scn) int ObCompactionDiagnoseMgr::diagnose_medium_scn_table(const int64_t compaction_scn)
{ {
int ret = OB_SUCCESS; int ret = OB_SUCCESS;
int64_t error_tablet_cnt = 0; int64_t error_tablet_cnt = MTL(ObTenantTabletScheduler*)->get_error_tablet_cnt();
if (OB_FAIL(ObTabletMetaTableCompactionOperator::diagnose_compaction_scn(MTL_ID(), error_tablet_cnt))) { if (0 != error_tablet_cnt
LOG_WARN("failed to diagnose compaction scn", K(ret));
} else if (0 != error_tablet_cnt
&& can_add_diagnose_info() && can_add_diagnose_info()
&& OB_FAIL(SET_DIAGNOSE_INFO( && OB_FAIL(SET_DIAGNOSE_INFO(
info_array_[idx_++], info_array_[idx_++],
@ -953,7 +989,7 @@ int ObCompactionDiagnoseMgr::diagnose_medium_scn_table(const int64_t compaction_
ObTabletID(INT64_MAX), ObTabletID(INT64_MAX),
ObCompactionDiagnoseInfo::DIA_STATUS_FAILED, ObCompactionDiagnoseInfo::DIA_STATUS_FAILED,
ObTimeUtility::fast_current_time(), ObTimeUtility::fast_current_time(),
"error_tablet_cnt", error_tablet_cnt))) { "checksum may error. error_tablet_cnt", error_tablet_cnt))) {
LOG_WARN("failed to add diagnose info", K(ret)); LOG_WARN("failed to add diagnose info", K(ret));
} }
return ret; return ret;

View File

@ -92,7 +92,9 @@ struct ObCompactionDiagnoseInfo
DIA_STATUS_NOT_SCHEDULE = 0, DIA_STATUS_NOT_SCHEDULE = 0,
DIA_STATUS_RUNNING = 1, DIA_STATUS_RUNNING = 1,
DIA_STATUS_FAILED = 2, DIA_STATUS_FAILED = 2,
DIA_STATUS_UNCOMPACTED = 3, DIA_STATUS_FINISH = 3,
DIA_STATUS_RS_UNCOMPACTED = 4, // RS diagnose
DIA_STATUS_DIA_FAILED = 5,
DIA_STATUS_MAX DIA_STATUS_MAX
}; };
const static char *ObDiagnoseStatusStr[DIA_STATUS_MAX]; const static char *ObDiagnoseStatusStr[DIA_STATUS_MAX];
@ -137,7 +139,8 @@ private:
int diagnose_tablet_major_merge( int diagnose_tablet_major_merge(
const int64_t compaction_scn, const int64_t compaction_scn,
const ObLSID &ls_id, const ObLSID &ls_id,
ObTablet &tablet); ObTablet &tablet,
bool &tablet_major_finish);
int diagnose_tablet_merge( int diagnose_tablet_merge(
ObTabletMergeDag &dag, ObTabletMergeDag &dag,
const ObMergeType type, const ObMergeType type,
@ -170,6 +173,7 @@ private:
private: private:
static const int64_t WAIT_MEDIUM_SCHEDULE_INTERVAL = 1000L * 1000L * 120L; // 120 seconds static const int64_t WAIT_MEDIUM_SCHEDULE_INTERVAL = 1000L * 1000L * 120L; // 120 seconds
static const int64_t SUSPECT_INFO_WARNING_THRESHOLD = 1000L * 1000L * 60L * 5; // 5 mins static const int64_t SUSPECT_INFO_WARNING_THRESHOLD = 1000L * 1000L * 60L * 5; // 5 mins
static const int64_t MAX_LS_TABLET_CNT = 10 * 10000; // TODO(@jingshui): tmp solution
bool is_inited_; bool is_inited_;
ObCompactionDiagnoseInfo *info_array_; ObCompactionDiagnoseInfo *info_array_;
int64_t max_cnt_; int64_t max_cnt_;

View File

@ -775,10 +775,13 @@ int ObMediumCompactionScheduleFunc::check_medium_checksum_table(
tablet_id, tablet_id,
ObTabletReplica::SCN_STATUS_ERROR); ObTabletReplica::SCN_STATUS_ERROR);
ObTabletCompactionScnInfo unused_ret_info; ObTabletCompactionScnInfo unused_ret_info;
int64_t affected_rows = 0;
// TODO(@lixia.yq) delete status when data_checksum_error is a inner_table // TODO(@lixia.yq) delete status when data_checksum_error is a inner_table
if (OB_TMP_FAIL(ObTabletMetaTableCompactionOperator::set_info_status( if (OB_TMP_FAIL(ObTabletMetaTableCompactionOperator::set_info_status(
medium_snapshot_info, unused_ret_info))) { medium_snapshot_info, unused_ret_info, affected_rows))) {
LOG_WARN("failed to set info status", K(tmp_ret), K(medium_snapshot_info)); LOG_WARN("failed to set info status", K(tmp_ret), K(medium_snapshot_info));
} else {
MTL(ObTenantTabletScheduler*)->update_error_tablet_cnt(affected_rows);
} }
} }
} }

View File

@ -167,7 +167,8 @@ ObTenantTabletScheduler::ObTenantTabletScheduler()
medium_loop_task_(), medium_loop_task_(),
sstable_gc_task_(), sstable_gc_task_(),
fast_freeze_checker_(), fast_freeze_checker_(),
enable_adaptive_compaction_(false) enable_adaptive_compaction_(false),
error_tablet_cnt_(0)
{ {
STATIC_ASSERT(static_cast<int64_t>(NO_MAJOR_MERGE_TYPE_CNT) == ARRAYSIZEOF(MERGE_TYPES), "merge type array len is mismatch"); STATIC_ASSERT(static_cast<int64_t>(NO_MAJOR_MERGE_TYPE_CNT) == ARRAYSIZEOF(MERGE_TYPES), "merge type array len is mismatch");
} }
@ -520,6 +521,7 @@ int ObTenantTabletScheduler::schedule_merge(const int64_t broadcast_version)
if (OB_TMP_FAIL(MTL(ObTenantCompactionProgressMgr *)->add_progress(broadcast_version))) { if (OB_TMP_FAIL(MTL(ObTenantCompactionProgressMgr *)->add_progress(broadcast_version))) {
LOG_WARN("failed to add progress", K(tmp_ret), K(broadcast_version)); LOG_WARN("failed to add progress", K(tmp_ret), K(broadcast_version));
} }
clear_error_tablet_cnt();
schedule_stats_.start_merge(); // set all statistics schedule_stats_.start_merge(); // set all statistics
ADD_COMPACTION_EVENT( ADD_COMPACTION_EVENT(

View File

@ -104,6 +104,12 @@ public:
bool is_stop() const { return is_stop_; } bool is_stop() const { return is_stop_; }
int reload_tenant_config(); int reload_tenant_config();
bool enable_adaptive_compaction() const { return enable_adaptive_compaction_; } bool enable_adaptive_compaction() const { return enable_adaptive_compaction_; }
// Returns the value of error_tablet_cnt_ as read through ATOMIC_LOAD.
// Declared const because it only reads state, matching the other getters declared next to it.
int64_t get_error_tablet_cnt() const { return ATOMIC_LOAD(&error_tablet_cnt_); }
// Writes 0 into error_tablet_cnt_ through ATOMIC_STORE.
void clear_error_tablet_cnt()
{
  ATOMIC_STORE(&error_tablet_cnt_, 0);
}
// Applies ATOMIC_AAF(&error_tablet_cnt_, delta_cnt) and discards the result.
// NOTE(review): ATOMIC_AAF is presumably an atomic add-and-fetch; confirm against the macro definition.
void update_error_tablet_cnt(const int64_t delta_cnt) { (void)ATOMIC_AAF(&error_tablet_cnt_, delta_cnt); }
// major merge status control // major merge status control
void stop_major_merge(); void stop_major_merge();
@ -243,6 +249,7 @@ private:
SSTableGCTask sstable_gc_task_; SSTableGCTask sstable_gc_task_;
ObFastFreezeChecker fast_freeze_checker_; ObFastFreezeChecker fast_freeze_checker_;
bool enable_adaptive_compaction_; bool enable_adaptive_compaction_;
int64_t error_tablet_cnt_; // for diagnose
}; };
} // namespace storage } // namespace storage