From 4e2bd1d9a52e5f6b4bb88a8f08dcae608986a8f4 Mon Sep 17 00:00:00 2001 From: obdev Date: Tue, 8 Nov 2022 03:40:13 +0000 Subject: [PATCH] fix major_freeze bug: improve last_merged_scn & all_merged_scn logic --- .../freeze/ob_major_merge_scheduler.cpp | 62 +++++++++---------- .../freeze/ob_zone_merge_manager.cpp | 41 ++++++------ src/rootserver/freeze/ob_zone_merge_manager.h | 8 +-- 3 files changed, 54 insertions(+), 57 deletions(-) diff --git a/src/rootserver/freeze/ob_major_merge_scheduler.cpp b/src/rootserver/freeze/ob_major_merge_scheduler.cpp index 6090cf6ce2..ff7b5891bb 100644 --- a/src/rootserver/freeze/ob_major_merge_scheduler.cpp +++ b/src/rootserver/freeze/ob_major_merge_scheduler.cpp @@ -572,50 +572,48 @@ int ObMajorMergeScheduler::update_merge_status(const int64_t expected_epoch) int64_t cur_all_merged_scn = 0; const uint64_t ori_all_merged_scn = info.all_merged_scn_.value_; - int64_t last_merged_scn = (merged ? info.broadcast_scn_ : info.last_merged_scn_); + int64_t cur_merged_scn = (merged ? info.broadcast_scn_ : info.last_merged_scn_); if (progress->smallest_snapshot_version_ <= 0) { cur_all_merged_scn = info.broadcast_scn_; } else { cur_all_merged_scn = progress->smallest_snapshot_version_; } + LOG_INFO("check updating merge status", KR(ret), K_(tenant_id), K(zone), K(merged), K(cur_all_merged_scn), + K(cur_merged_scn), "smallest_snapshot_version", progress->smallest_snapshot_version_, K(info)); - // cur_all_merged_scn >= last_merged_scn - // 1. Equal: snapshot_version of all tablets change to frozen_scn after major compaction - // 2. Greater: In backup-restore situation, tablets may have higher snapshot_version, which - // is larger than current frozen_scn. https://work.aone.alibaba-inc.com/issue/45933591 - if ((cur_all_merged_scn < last_merged_scn) || (ori_all_merged_scn > cur_all_merged_scn)) { - ret = OB_ERR_UNEXPECTED; - LOG_ERROR("unexpect merge scn", KR(ret), K(last_merged_scn), K(cur_all_merged_scn), - K(ori_all_merged_scn)); - } else { - // TODO 'zone merge finish' & 'zone all tablet merge finish' should be handled in same procedure. - if (info.last_merged_scn_ != last_merged_scn) { - LOG_INFO("this zone merge finished", K(zone), K_(tenant_id), K(cur_all_merged_scn), - K(ori_all_merged_scn), K(last_merged_scn)); - // update last_merged_scn in all_merge_info table - if (OB_FAIL(zone_merge_mgr_->finish_zone_merge(zone, expected_epoch, last_merged_scn, - ori_all_merged_scn))) { - LOG_WARN("fail to finish zone merge", KR(ret), K(zone), K(expected_epoch)); + if (OB_SUCC(ret) && merged) { + // cur_all_merged_scn >= cur_merged_scn + // 1. Equal: snapshot_version of all tablets change to frozen_scn after major compaction + // 2. Greater: In backup-restore situation, tablets may have higher snapshot_version, which + // is larger than current frozen_scn. https://work.aone.alibaba-inc.com/issue/45933591 + // + // cur_all_merged_scn >= ori_all_merged_scn + // 1. Greater: all_merged_scn will increase like last_merged_scn after major compaction + // 2. Equal: In backup-restore situation, tablets may have higher snapshot_version(eg. version=10). + // If major_freeze with version=4, all_merged_scn will be updated to 10; if major_freeze with version=5, + // all_merged_scn will still be 10. + if ((cur_all_merged_scn < cur_merged_scn) || (cur_all_merged_scn < ori_all_merged_scn) + || (cur_merged_scn < info.last_merged_scn_)) { + ret = OB_ERR_UNEXPECTED; + LOG_ERROR("unexpected merged scn", KR(ret), K(merged), K(cur_merged_scn), K(cur_all_merged_scn), + K(ori_all_merged_scn), K(info)); + } else if (cur_merged_scn > info.last_merged_scn_) { + LOG_INFO("this zone merge finished", K(zone), K_(tenant_id), K(cur_all_merged_scn), K(ori_all_merged_scn), + K(cur_merged_scn), "last_merged_scn", info.last_merged_scn_); + // update last_merged_scn & all_merged_scn in all_zone_merge_info + if (OB_FAIL(zone_merge_mgr_->finish_zone_merge(zone, expected_epoch, cur_merged_scn, cur_all_merged_scn))) { + LOG_WARN("fail to finish zone merge", KR(ret), K(zone), K(expected_epoch), K(cur_merged_scn), + K(cur_all_merged_scn), K(info)); } else { ROOTSERVICE_EVENT_ADD("daily_merge", "zone_merge_finish", K_(tenant_id), - "last_merged_scn", info.last_merged_scn_, K(zone)); - } - } - - if (OB_SUCC(ret) && (ori_all_merged_scn != cur_all_merged_scn)) { - LOG_INFO("this zone all tablet merge finished", K(zone), K_(tenant_id), K(cur_all_merged_scn), - K(ori_all_merged_scn), K(last_merged_scn)); - // update all_merged_scn in all_merge_info table - if (OB_FAIL(zone_merge_mgr_->finish_zone_merge(zone, expected_epoch, last_merged_scn, - cur_all_merged_scn))) { - LOG_WARN("fail to finish zone merge", KR(ret), K(zone), K(expected_epoch)); - } else { - ROOTSERVICE_EVENT_ADD("daily_merge", "zone_all_tablet_merged", K_(tenant_id), - "all_merged_scn", cur_all_merged_scn, K(zone)); + "last_merged_scn", cur_merged_scn, + "all_merged_scn", cur_all_merged_scn, + K(zone)); } } } + } } } diff --git a/src/rootserver/freeze/ob_zone_merge_manager.cpp b/src/rootserver/freeze/ob_zone_merge_manager.cpp index b8145d8d68..87b6e6a240 100644 --- a/src/rootserver/freeze/ob_zone_merge_manager.cpp +++ b/src/rootserver/freeze/ob_zone_merge_manager.cpp @@ -361,8 +361,8 @@ int ObZoneMergeManagerBase::start_zone_merge( int ObZoneMergeManagerBase::finish_zone_merge( const ObZone &zone, const int64_t expected_epoch, - const int64_t last_merged_scn, - const int64_t all_merged_scn) + const int64_t new_last_merged_scn, + const int64_t new_all_merged_scn) { int ret = OB_SUCCESS; int64_t idx = OB_INVALID_INDEX; @@ -372,15 +372,16 @@ int ObZoneMergeManagerBase::finish_zone_merge( if (OB_FAIL(check_valid(zone, idx))) { LOG_WARN("fail to check valid", KR(ret), K(zone), K_(tenant_id)); - } else if ((last_merged_scn <= 0) || (all_merged_scn <= 0)) { + } else if ((new_last_merged_scn <= 0) || (new_all_merged_scn <= 0)) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument", KR(ret), K(zone), K_(tenant_id), - K(last_merged_scn), K(all_merged_scn)); - } else if ((last_merged_scn != zone_merge_infos_[idx].broadcast_scn_)) { + K(new_last_merged_scn), K(new_all_merged_scn)); + } else if ((new_last_merged_scn != zone_merge_infos_[idx].broadcast_scn_) + || (new_last_merged_scn <= zone_merge_infos_[idx].last_merged_scn_)) { ret = OB_INVALID_ARGUMENT; LOG_ERROR("invalid merged_scn", KR(ret), K(zone), K_(tenant_id), - K(last_merged_scn), K(all_merged_scn), "zone broadcast_scn", - zone_merge_infos_[idx].broadcast_scn_); + K(new_last_merged_scn), K(new_all_merged_scn), + "zone_merge_info", zone_merge_infos_[idx]); } else if (OB_FAIL(trans.start(proxy_, meta_tenant_id))) { LOG_WARN("fail to start transaction", KR(ret), K_(tenant_id), K(meta_tenant_id)); } else if (OB_FAIL(check_freeze_service_epoch(trans, expected_epoch))) { @@ -392,17 +393,15 @@ int ObZoneMergeManagerBase::finish_zone_merge( } else { ObZoneMergeInfo::MergeStatus status = static_cast( zone_merge_infos_[idx].merge_status_.value_); - if (last_merged_scn > zone_merge_infos_[idx].last_merged_scn_) { - const int64_t is_merging = 0; - tmp_info.is_merging_.set_val(is_merging, true); - tmp_info.last_merged_scn_.set_val(last_merged_scn, true); - tmp_info.last_merged_time_.set_val(cur_time, true); - status = ObZoneMergeInfo::MERGE_STATUS_IDLE; - tmp_info.merge_status_.set_val(status, true); - } + const int64_t is_merging = 0; + tmp_info.is_merging_.set_val(is_merging, true); + tmp_info.last_merged_scn_.set_val(new_last_merged_scn, true); + tmp_info.last_merged_time_.set_val(cur_time, true); + status = ObZoneMergeInfo::MERGE_STATUS_IDLE; + tmp_info.merge_status_.set_val(status, true); - if (all_merged_scn > zone_merge_infos_[idx].all_merged_scn_) { - tmp_info.all_merged_scn_.set_val(all_merged_scn, true); + if (new_all_merged_scn > zone_merge_infos_[idx].all_merged_scn_) { + tmp_info.all_merged_scn_.set_val(new_all_merged_scn, true); } if (OB_FAIL(ObZoneMergeTableOperator::update_partial_zone_merge_info(trans, tenant_id_, tmp_info))) { @@ -419,7 +418,7 @@ int ObZoneMergeManagerBase::finish_zone_merge( } } - LOG_INFO("finish zone merge", KR(ret), K_(tenant_id), K(zone), K(last_merged_scn), K(all_merged_scn)); + LOG_INFO("finish zone merge", KR(ret), K_(tenant_id), K(zone), K(new_last_merged_scn), K(new_all_merged_scn)); return ret; } @@ -1217,8 +1216,8 @@ int ObZoneMergeManager::start_zone_merge(const ObZone &zone, const int64_t expec int ObZoneMergeManager::finish_zone_merge( const ObZone &zone, const int64_t expected_epoch, - const int64_t last_merged_scn, - const int64_t all_merged_scn) + const int64_t new_last_merged_scn, + const int64_t new_all_merged_scn) { int ret = OB_SUCCESS; SpinWLockGuard guard(write_lock_); @@ -1228,7 +1227,7 @@ int ObZoneMergeManager::finish_zone_merge( ObZoneMergeMgrGuard shadow_guard(lock_, *(static_cast (this)), shadow_, ret); if (OB_SUCC(ret)) { - ret = shadow_.finish_zone_merge(zone, expected_epoch, last_merged_scn, all_merged_scn); + ret = shadow_.finish_zone_merge(zone, expected_epoch, new_last_merged_scn, new_all_merged_scn); } } return ret; diff --git a/src/rootserver/freeze/ob_zone_merge_manager.h b/src/rootserver/freeze/ob_zone_merge_manager.h index 58c3d8efac..ad937f64c0 100644 --- a/src/rootserver/freeze/ob_zone_merge_manager.h +++ b/src/rootserver/freeze/ob_zone_merge_manager.h @@ -48,8 +48,8 @@ public: virtual int start_zone_merge(const common::ObZone &zone, const int64_t expected_epoch); virtual int finish_zone_merge(const common::ObZone &zone, const int64_t expected_epoch, - const int64_t last_merged_scn, - const int64_t all_merged_scn); + const int64_t new_last_merged_scn, + const int64_t new_all_merged_scn); int suspend_merge(const int64_t expected_epoch); int resume_merge(const int64_t expected_epoch); int set_merge_error(const int64_t merge_error, const int64_t expected_epoch); @@ -119,8 +119,8 @@ public: virtual int start_zone_merge(const common::ObZone &zone, const int64_t expected_epoch); virtual int finish_zone_merge(const common::ObZone &zone, const int64_t expected_epoch, - const int64_t last_merged_scn, - const int64_t all_merged_scn); + const int64_t new_last_merged_scn, + const int64_t new_all_merged_scn); virtual int suspend_merge(const int64_t expected_epoch); virtual int resume_merge(const int64_t expected_epoch); virtual int set_merge_error(const int64_t merge_error, const int64_t expected_epoch);