//Copyright (c) 2021 OceanBase // OceanBase is licensed under Mulan PubL v2. // You can use this software according to the terms and conditions of the Mulan PubL v2. // You may obtain a copy of Mulan PubL v2 at: // http://license.coscl.org.cn/MulanPubL-2.0 // THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, // EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, // MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. // See the Mulan PubL v2 for more details. #define USING_LOG_PREFIX STORAGE_COMPACTION #include "storage/compaction/ob_medium_compaction_func.h" #include "storage/compaction/ob_medium_compaction_mgr.h" #include "storage/compaction/ob_tablet_merge_ctx.h" #include "storage/compaction/ob_partition_merge_policy.h" #include "share/tablet/ob_tablet_info.h" #include "share/tablet/ob_tablet_table_operator.h" #include "share/ob_tablet_replica_checksum_operator.h" #include "share/ob_ls_id.h" #include "share/schema/ob_tenant_schema_service.h" #include "logservice/ob_log_service.h" #include "storage/meta_mem/ob_tenant_meta_mem_mgr.h" #include "storage/tx_storage/ob_tenant_freezer.h" #include "storage/compaction/ob_tenant_tablet_scheduler.h" #include "lib/oblog/ob_log_module.h" #include "lib/utility/ob_tracepoint.h" #include "storage/ob_partition_range_spliter.h" #include "storage/compaction/ob_compaction_diagnose.h" #include "src/storage/column_store/ob_column_oriented_sstable.h" #include "storage/tablet/ob_tablet_medium_info_reader.h" namespace oceanbase { using namespace storage; using namespace share; using namespace common; namespace compaction { int64_t ObMediumCompactionScheduleFunc::to_string(char *buf, const int64_t buf_len) const { int64_t pos = 0; if (OB_ISNULL(buf) || buf_len <= 0) { } else { J_OBJ_START(); J_KV("ls_id", ls_.get_ls_id()); J_COMMA(); if (tablet_handle_.is_valid()) { J_KV("tablet_id", tablet_handle_.get_obj()->get_tablet_meta().tablet_id_); } else { J_KV("tablet_handle", tablet_handle_); } 
J_OBJ_END(); } return pos; } int ObMediumCompactionScheduleFunc::choose_medium_snapshot( const int64_t max_sync_medium_scn, ObMediumCompactionInfo &medium_info, ObGetMergeTablesResult &result, int64_t &schema_version) { int ret = OB_SUCCESS; ObGetMergeTablesParam param; param.merge_type_ = MEDIUM_MERGE; int64_t max_reserved_snapshot = 0; ObTablet &tablet = *tablet_handle_.get_obj(); if (OB_FAIL(ObAdaptiveMergePolicy::get_meta_merge_tables(param, ls_, tablet, result))) { if (OB_NO_NEED_MERGE != ret) { LOG_WARN("failed to get meta merge tables", K(ret), K(param)); } } else if (FALSE_IT(medium_info.medium_snapshot_ = result.version_range_.snapshot_version_)) { } else if (OB_FAIL(get_max_reserved_snapshot(max_reserved_snapshot))) { LOG_WARN("failed to get reserved snapshot", K(ret), KPC(this)); } else if (medium_info.medium_snapshot_ < max_reserved_snapshot || medium_info.medium_snapshot_ > tablet.get_snapshot_version()) { // chosen medium snapshot is far too old if (OB_FAIL(choose_new_medium_snapshot(max_reserved_snapshot, medium_info, result, schema_version))) { LOG_WARN("failed to choose new medium snapshot", KR(ret), K(medium_info), K(max_reserved_snapshot)); } } else if (OB_FAIL(tablet.get_schema_version_from_storage_schema(schema_version))) { LOG_WARN("failed to get schema version from tablet", KR(ret), K(tablet)); } if (OB_FAIL(ret)) { } else if (medium_info.medium_snapshot_ <= max_sync_medium_scn) { ret = OB_NO_NEED_MERGE; } else if (OB_FAIL(check_frequency(max_reserved_snapshot, medium_info.medium_snapshot_))) { // check schedule interval if (OB_NO_NEED_MERGE != ret) { LOG_WARN("failed to check medium scn valid", K(ret), KPC(this)); } } else { medium_info.set_basic_info( ObMediumCompactionInfo::MEDIUM_COMPACTION, merge_reason_, medium_info.medium_snapshot_); LOG_TRACE("choose_medium_snapshot", K(ret), KPC(this), K(result), K(medium_info)); } return ret; } int ObMediumCompactionScheduleFunc::find_valid_freeze_info( ObMediumCompactionInfo &medium_info, 
share::ObFreezeInfo &freeze_info, bool &force_schedule_medium_merge) { int ret = OB_SUCCESS; force_schedule_medium_merge = false; ObTablet &tablet = *tablet_handle_.get_obj(); const ObTabletID &tablet_id = tablet.get_tablet_meta().tablet_id_; int64_t schedule_snapshot = 0; bool schedule_with_newer_info = false; const int64_t scheduler_frozen_version = MTL(ObTenantTabletScheduler*)->get_frozen_version(); ObTabletMemberWrapper table_store_wrapper; ObSSTable *last_major = nullptr; int64_t last_sstable_schema_version = 0; ObMultiVersionSchemaService *schema_service = nullptr; if (OB_ISNULL(schema_service = MTL(ObTenantSchemaService *)->get_schema_service())) { ret = OB_ERR_UNEXPECTED; LOG_WARN("failed to get schema service from MTL", K(ret)); } else if (OB_FAIL(tablet.fetch_table_store(table_store_wrapper))) { LOG_WARN("failed to fetch table store", K(ret), K(tablet)); } else { last_major = static_cast(table_store_wrapper.get_member()->get_major_sstables().get_boundary_table(true/*last*/)); if (OB_ISNULL(last_major)) { ret = OB_ERR_UNEXPECTED; LOG_WARN("major sstable is unexpected null", K(ret), K(tablet_id), KPC(last_major)); } else if (OB_FAIL(last_major->get_frozen_schema_version(last_sstable_schema_version))) { LOG_WARN("failed to get frozen schema version", KR(ret), KPC(last_major)); } else { schedule_snapshot = last_major->get_snapshot_version(); } } while (OB_SUCC(ret)) { if (OB_FAIL(MTL_CALL_FREEZE_INFO_MGR(get_freeze_info_behind_snapshot_version, schedule_snapshot, freeze_info))) { if (OB_ENTRY_NOT_EXIST != ret) { LOG_WARN("failed to get freeze info", K(ret), K(tablet_id), K(schedule_snapshot)); } else { ret = OB_NO_NEED_MERGE; } } else if (OB_UNLIKELY(freeze_info.schema_version_ <= 0)) { ret = OB_ERR_UNEXPECTED; LOG_WARN("schema version is invalid", K(ret), K(freeze_info)); } else if (OB_UNLIKELY(freeze_info.schema_version_ < last_sstable_schema_version)) { force_schedule_medium_merge = true; FLOG_INFO("schema version in freeze info is too small, try to 
schedule medium compaction instead", K(ret), K(tablet_id), K(last_sstable_schema_version), K(freeze_info)); break; } else if (OB_FAIL(get_table_schema_to_merge( *schema_service, tablet, freeze_info.schema_version_, medium_info.medium_compat_version_, allocator_, medium_info.storage_schema_))) { if (OB_TABLE_IS_DELETED == ret) { // do nothing, end loop } else if (OB_ERR_SCHEMA_HISTORY_EMPTY == ret) { if (freeze_info.frozen_scn_.get_val_for_tx() <= scheduler_frozen_version) { FLOG_INFO("table schema may recycled, use newer freeze info instead", K(ret), KPC(last_major), K(freeze_info), K(scheduler_frozen_version)); schedule_snapshot = freeze_info.frozen_scn_.get_val_for_tx(); schedule_with_newer_info = true; ret = OB_SUCCESS; } } else { LOG_WARN("failed to get table schema", K(ret), K(medium_info)); } } if (OB_SUCC(ret)) { // success to get table schema if (schedule_with_newer_info) { FLOG_INFO("schedule with newer freeze info", K(ret), K(freeze_info)); } break; } } // end of while return ret; } int ObMediumCompactionScheduleFunc::choose_major_snapshot( const int64_t max_sync_medium_scn, ObMediumCompactionInfo &medium_info, ObGetMergeTablesResult &result, int64_t &schema_version) { int ret = OB_SUCCESS; ObTablet &tablet = *tablet_handle_.get_obj(); share::ObFreezeInfo freeze_info; bool force_schedule_medium_merge = false; if (OB_FAIL(find_valid_freeze_info(medium_info, freeze_info, force_schedule_medium_merge))) { LOG_WARN("failed to find valid freeze info", KR(ret)); } else if (force_schedule_medium_merge) { if (OB_FAIL(switch_to_choose_medium_snapshot(freeze_info.frozen_scn_.get_val_for_tx(), medium_info, schema_version))) { if (OB_EAGAIN != ret) { LOG_WARN("failed to switch to choose medium snapshot", K(ret), KPC(this)); } } } else { medium_info.set_basic_info( ObMediumCompactionInfo::MAJOR_COMPACTION, ObAdaptiveMergePolicy::AdaptiveMergeReason::TENANT_MAJOR, freeze_info.frozen_scn_.get_val_for_tx()); schema_version = freeze_info.schema_version_; } if 
(FAILEDx(ObPartitionMergePolicy::get_result_by_snapshot(tablet, medium_info.medium_snapshot_, result))) { LOG_WARN("failed get result for major", K(ret), K(medium_info)); } else { LOG_TRACE("choose_major_snapshot", K(ret), KPC(this), K(medium_info), K(freeze_info), K(result), K(medium_info), K(schema_version)); #ifdef ERRSIM if (tablet.get_tablet_meta().tablet_id_.id() == 1) { ret = OB_E(EventTable::EN_SPECIAL_TABLE_HAVE_LARGER_SCN) ret; if (OB_FAIL(ret)) { ret = OB_SUCCESS; medium_info.compaction_type_ = ObMediumCompactionInfo::MEDIUM_COMPACTION; medium_info.medium_snapshot_ += 100; FLOG_INFO("ERRSIM EN_SPECIAL_TABLE_HAVE_LARGER_SCN", KPC(this),K(medium_info)); } } #endif } return ret; } int ObMediumCompactionScheduleFunc::switch_to_choose_medium_snapshot( const int64_t freeze_version, ObMediumCompactionInfo &medium_info, int64_t &schema_version) { int ret = OB_SUCCESS; int64_t medium_snapshot = 0; if (weak_read_ts_ < freeze_version + 1) { ret = OB_EAGAIN; LOG_WARN("weak read ts is smaller than new medium snapshot, try later", K(ret), KPC(this), K(freeze_version)); } else if (FALSE_IT(medium_snapshot = MAX(weak_read_ts_, freeze_version + 1))) { } else if (OB_FAIL(tablet_handle_.get_obj()->get_newest_schema_version(schema_version))) { LOG_WARN("fail to choose medium schema version", K(ret), KPC(this)); } else { medium_info.set_basic_info( ObMediumCompactionInfo::MEDIUM_COMPACTION, ObAdaptiveMergePolicy::AdaptiveMergeReason::NONE, medium_snapshot); } return ret; } int ObMediumCompactionScheduleFunc::get_status_from_inner_table( const ObLSID &ls_id, const ObTabletID &tablet_id, ObTabletCompactionScnInfo &ret_info) { int ret = OB_SUCCESS; ret_info.reset(); ObTabletCompactionScnInfo snapshot_info( MTL_ID(), ls_id, tablet_id, ObTabletReplica::SCN_STATUS_IDLE); if (OB_FAIL(ObTabletMetaTableCompactionOperator::get_status(snapshot_info, ret_info))) { if (OB_ENTRY_NOT_EXIST == ret) { ret = OB_SUCCESS; // first schedule medium snapshot ret_info.status_ = 
ObTabletReplica::SCN_STATUS_IDLE; } else { LOG_WARN("failed to get cur medium snapshot", K(ret), K(ret_info)); } } return ret; } // cal this func with PLAF LEADER ROLE && last_medium_scn_ = 0 int ObMediumCompactionScheduleFunc::schedule_next_medium_for_leader( const int64_t major_snapshot) { int ret = OB_SUCCESS; ObRole role = INVALID_ROLE; if (OB_FAIL(ls_.get_ls_role(role))) { LOG_WARN("failed to get ls role", K(ret), KPC(this)); } else if (LEADER == role) { // only log_handler_leader can schedule #ifdef ERRSIM ret = OB_E(EventTable::EN_SKIP_INDEX_MAJOR) ret; // skip schedule major for user index table ObTablet *tablet = nullptr; if (OB_FAIL(ret) && tablet_handle_.is_valid() && OB_NOT_NULL(tablet = tablet_handle_.get_obj())) { if (tablet->get_tablet_meta().tablet_id_.id() > ObTabletID::MIN_USER_TABLET_ID && tablet->get_tablet_meta().tablet_id_ != tablet->get_tablet_meta().data_tablet_id_) { LOG_INFO("ERRSIM EN_SKIP_INDEX_MAJOR", K(ret), KPC(tablet)); return ret; } else { ret = OB_SUCCESS; } } #endif ret = schedule_next_medium_primary_cluster(major_snapshot); } else { LOG_TRACE("not leader", K(ret), K(role), K(ls_.get_ls_id())); } return ret; } int ObMediumCompactionScheduleFunc::get_adaptive_reason( const int64_t schedule_major_snapshot) { int ret = OB_SUCCESS; int64_t max_sync_medium_scn = 0; ObTablet *tablet = tablet_handle_.get_obj(); if (OB_FAIL(ObMediumCompactionScheduleFunc::get_max_sync_medium_scn( *tablet, *medium_info_list_, max_sync_medium_scn))) { LOG_WARN("failed to get max received medium scn", KR(ret), KPC(this)); } else if (schedule_major_snapshot > max_sync_medium_scn) { merge_reason_ = ObAdaptiveMergePolicy::AdaptiveMergeReason::TENANT_MAJOR; } else if (OB_FAIL(ObAdaptiveMergePolicy::get_adaptive_merge_reason(*tablet, merge_reason_))) { if (OB_HASH_NOT_EXIST != ret) { LOG_WARN("failed to get meta merge priority", K(ret), KPC(this)); } else { ret = OB_SUCCESS; } } #ifdef ERRSIM if (OB_SUCC(ret)) { if (tablet->get_tablet_meta().tablet_id_.id() > 
ObTabletID::MIN_USER_TABLET_ID) { ret = OB_E(EventTable::EN_SCHEDULE_MEDIUM_COMPACTION) ret; LOG_INFO("errsim", K(ret), KPC(this)); if (OB_FAIL(ret)) { FLOG_INFO("errsim EN_SCHEDULE_MEDIUM_COMPACTION", KPC(this)); ret = OB_SUCCESS; merge_reason_ = ObAdaptiveMergePolicy::AdaptiveMergeReason::LOAD_DATA_SCENE; } } } #endif return ret; } int ObMediumCompactionScheduleFunc::schedule_next_medium_primary_cluster( const int64_t schedule_major_snapshot) { int ret = OB_SUCCESS; ObTabletCompactionScnInfo ret_info; // check last medium type, select inner table for last major bool schedule_medium_flag = false; int64_t max_sync_medium_scn = 0; int64_t last_major_snapshot_version = 0; ObTablet *tablet = nullptr; if (OB_UNLIKELY(!tablet_handle_.is_valid())) { ret = OB_ERR_UNEXPECTED; LOG_WARN("invalid tablet_handle", K(ret), K(tablet_handle_)); } else if (FALSE_IT(tablet = tablet_handle_.get_obj())) { } else if (FALSE_IT(last_major_snapshot_version = tablet->get_last_major_snapshot_version())) { } else if (0 >= last_major_snapshot_version) { // no major, do nothing } else if (OB_ISNULL(medium_info_list_)) { ret = OB_ERR_UNEXPECTED; LOG_WARN("medium info list is unexpected null", K(ret), KPC(this), KPC_(medium_info_list)); } else if (!medium_info_list_->could_schedule_next_round(last_major_snapshot_version)) { // check serialized list // do nothing } else if (!ObAdaptiveMergePolicy::is_valid_merge_reason(merge_reason_) && OB_FAIL(get_adaptive_reason(schedule_major_snapshot))) { LOG_WARN("failed to get adaptive reason", KR(ret), K(schedule_major_snapshot)); } else if (ObAdaptiveMergePolicy::is_valid_merge_reason(merge_reason_)) { schedule_medium_flag = true; } LOG_TRACE("schedule next medium in primary cluster", K(ret), KPC(this), K(schedule_medium_flag), K(schedule_major_snapshot), K(merge_reason_), K(last_major_snapshot_version), KPC_(medium_info_list), K(max_sync_medium_scn)); #ifdef ERRSIM if (OB_SUCC(ret)) { if (tablet->get_tablet_meta().tablet_id_.id() > 
ObTabletID::MIN_USER_TABLET_ID) { ret = OB_E(EventTable::EN_SCHEDULE_MEDIUM_COMPACTION) ret; if (OB_FAIL(ret)) { FLOG_INFO("ERRSIM EN_SCHEDULE_MEDIUM_COMPACTION", KPC(this)); ret = OB_SUCCESS; schedule_medium_flag = true; } } } #endif bool schedule_flag = false; if (OB_FAIL(ret) || !schedule_medium_flag) { } else if (MTL(ObTenantTabletScheduler*)->get_inner_table_merged_scn() >= last_major_snapshot_version) { schedule_flag = true; } else if (ObMediumCompactionInfo::MAJOR_COMPACTION == medium_info_list_->get_last_compaction_type()) { // for normal medium, checksum error happened, wait_check_medium_scn_ will never = 0 // for major, need select inner_table to check RS status if (OB_FAIL(get_status_from_inner_table(ls_.get_ls_id(), tablet->get_tablet_meta().tablet_id_, ret_info))) { LOG_WARN("failed to get status from inner tablet", K(ret), KPC(this)); } else if (ret_info.could_schedule_next_round(medium_info_list_->get_last_compaction_scn())) { LOG_INFO("success to check RS major checksum validation finished", K(ret), KPC(this), K(ret_info)); schedule_flag = true; } else if (OB_NOT_NULL(schedule_stat_)) { ++schedule_stat_->wait_rs_validate_cnt_; LOG_TRACE("cannot schedule next round merge now", K(ret), K(ret_info)); } } else { schedule_flag = true; } if (OB_SUCC(ret) && schedule_flag) { ret = decide_medium_snapshot(); } return ret; } int ObMediumCompactionScheduleFunc::choose_scn_for_user_request( const int64_t max_sync_medium_scn, ObMediumCompactionInfo &medium_info, ObGetMergeTablesResult &result, int64_t &schema_version) { int ret = OB_SUCCESS; // check exist not finish freeze info schema_version = 0; int64_t max_reserved_snapshot = 0; const int64_t latest_frozen_version = MTL(ObTenantFreezeInfoMgr*)->get_latest_frozen_version(); const int64_t last_major_snapshot_version = tablet_handle_.get_obj()->get_last_major_snapshot_version(); ObTablet *tablet = nullptr; if (OB_UNLIKELY(!tablet_handle_.is_valid())) { ret = OB_ERR_UNEXPECTED; LOG_WARN("invalid tablet_handle", 
K(ret), K(tablet_handle_)); } else if (FALSE_IT(tablet = tablet_handle_.get_obj())) { LOG_WARN("major sstable should not be empty", K(ret), KPC(this)); } else if (latest_frozen_version > last_major_snapshot_version) { ret = OB_NO_NEED_MERGE; LOG_WARN("unfinished freeze info exist, can't schedule another medium", K(ret)); } else if (OB_FAIL(get_max_reserved_snapshot(max_reserved_snapshot))) { LOG_WARN("failed to get reserved snapshot", K(ret), KPC(this)); } else if (FALSE_IT(medium_info.medium_snapshot_ = MAX(max_reserved_snapshot, weak_read_ts_))) { } else if (medium_info.medium_snapshot_ < max_sync_medium_scn) { ret = OB_NO_NEED_MERGE; LOG_WARN("chosen medium snapshot is synced before", K(ret), K(medium_info), K(max_sync_medium_scn)); } else { medium_info.compaction_type_ = ObMediumCompactionInfo::MEDIUM_COMPACTION; medium_info.medium_merge_reason_ = merge_reason_; if (OB_FAIL(ObPartitionMergePolicy::get_result_by_snapshot(*tablet, medium_info.medium_snapshot_, result))) { LOG_WARN("failed to get result for major", K(ret), K(last_major_snapshot_version), K(medium_info)); } else if (OB_FAIL(tablet->get_newest_schema_version(schema_version))) { LOG_WARN("failed to get schema version from tablet", K(ret), KPC(tablet)); } else { LOG_INFO("choose medium_scn for user request", K(ret), K(result), K(schema_version), K(medium_info), K(max_sync_medium_scn), K(max_reserved_snapshot)); } } return ret; } int ObMediumCompactionScheduleFunc::check_frequency( const int64_t max_reserved_snapshot, const int64_t medium_snapshot) { int ret = OB_SUCCESS; ObTablet *tablet = tablet_handle_.get_obj(); const int64_t current_time = ObTimeUtility::current_time_ns(); if (max_reserved_snapshot < current_time) { const int64_t time_interval = (current_time - max_reserved_snapshot) / 2; const int64_t last_major_snapshot_version = tablet->get_last_major_snapshot_version(); if (0 >= last_major_snapshot_version) { ret = OB_ERR_UNEXPECTED; LOG_WARN("major sstable should not be empty", K(ret), 
K(last_major_snapshot_version)); } else if (last_major_snapshot_version + time_interval > medium_snapshot) { ret = OB_NO_NEED_MERGE; LOG_TRACE("schedule medium frequently", K(ret), K(last_major_snapshot_version), K(medium_snapshot), K(time_interval)); } } return ret; } int ObMediumCompactionScheduleFunc::get_max_reserved_snapshot(int64_t &max_reserved_snapshot) { int ret = OB_SUCCESS; max_reserved_snapshot = INT64_MAX; ObStorageSnapshotInfo snapshot_info; int64_t last_major_snapshot_version = 0; ObTablet *tablet = nullptr; if (OB_UNLIKELY(!tablet_handle_.is_valid())) { ret = OB_ERR_UNEXPECTED; LOG_WARN("invalid tablet_handle", K(ret), K(tablet_handle_)); } else if (FALSE_IT(tablet = tablet_handle_.get_obj())) { } else if (FALSE_IT(last_major_snapshot_version = tablet->get_last_major_snapshot_version())) { } else if (0 >= last_major_snapshot_version) { ret = OB_ERR_UNEXPECTED; LOG_WARN("major sstable should not be empty", K(ret), K(last_major_snapshot_version)); } else if (0 == ls_.get_min_reserved_snapshot()) { ret = OB_NO_NEED_MERGE; // not sync reserved snapshot yet, should not schedule now } else if (OB_FAIL(MTL(ObTenantFreezeInfoMgr*)->get_min_reserved_snapshot( tablet->get_tablet_meta().tablet_id_, last_major_snapshot_version, snapshot_info))) { LOG_WARN("failed to get reserved snapshot from freeze info mgr", K(ret), "tablet_id", tablet->get_tablet_meta().tablet_id_); } else { max_reserved_snapshot = MAX(ls_.get_min_reserved_snapshot(), snapshot_info.snapshot_); LOG_TRACE("get max reserved snapshot", KR(ret), K(max_reserved_snapshot), K(snapshot_info)); } return ret; } int ObMediumCompactionScheduleFunc::choose_new_medium_snapshot( const int64_t max_reserved_snapshot, ObMediumCompactionInfo &medium_info, ObGetMergeTablesResult &result, int64_t &schema_version) { int ret = OB_SUCCESS; ObTablet *tablet = tablet_handle_.get_obj(); int64_t snapshot_gc_ts = 0; if (medium_info.medium_snapshot_ == tablet->get_snapshot_version() // no uncommitted sstable && 
weak_read_ts_ + DEFAULT_SCHEDULE_MEDIUM_INTERVAL < ObTimeUtility::current_time_ns()) { snapshot_gc_ts = MTL(ObTenantFreezeInfoMgr *)->get_snapshot_gc_ts(); // data before weak_read_ts & latest storage schema on memtable is match for schedule medium medium_info.medium_snapshot_ = MAX(max_reserved_snapshot, MIN(weak_read_ts_, snapshot_gc_ts)); } if (medium_info.medium_snapshot_ < max_reserved_snapshot) { // may not rewrite medium_snapshot above ret = OB_NO_NEED_MERGE; } else { LOG_INFO("use weak_read_ts to schedule medium", K(ret), KPC(this), K(medium_info), K(max_reserved_snapshot), K_(weak_read_ts), K(snapshot_gc_ts)); } // update schema version for cur medium scn if (FAILEDx(tablet->get_newest_schema_version(schema_version))) { LOG_WARN("failed to get schema version from tablet", K(ret), KPC(tablet)); } else { LOG_INFO("chosen new medium snapshot", K(ret), KPC(this), K(medium_info), K(max_reserved_snapshot), K(result), K(schema_version), K(medium_info), K(max_reserved_snapshot), K_(weak_read_ts), K(snapshot_gc_ts)); } return ret; } int ObMediumCompactionScheduleFunc::decide_medium_snapshot() { int ret = OB_SUCCESS; int tmp_ret = OB_SUCCESS; int64_t max_sync_medium_scn = 0; uint64_t compat_version = 0; ObTablet *tablet = nullptr; if (OB_UNLIKELY(!tablet_handle_.is_valid())) { ret = OB_ERR_UNEXPECTED; LOG_WARN("invalid tablet_handle", K(ret), K(tablet_handle_)); } else if (OB_FAIL(MTL(ObTenantTabletScheduler*)->get_min_data_version(compat_version))) { LOG_WARN("failed to get min data version", KR(ret)); } else if (OB_UNLIKELY(compat_version < DATA_VERSION_4_1_0_0)) { } else if (FALSE_IT(tablet = tablet_handle_.get_obj())) { } else if (OB_FAIL(ObMediumCompactionScheduleFunc::get_max_sync_medium_scn( *tablet, *medium_info_list_, max_sync_medium_scn))) { LOG_WARN("failed to get max sync medium scn", KR(ret), KPC(this)); } else if (OB_FAIL(ls_.add_dependent_medium_tablet(tablet->get_tablet_meta().tablet_id_))) { // add dependent_id in ObLSReservedSnapshotMgr 
LOG_WARN("failed to add dependent tablet", K(ret), KPC(this)); } else { const ObTabletID &tablet_id = tablet->get_tablet_meta().tablet_id_; LOG_TRACE("decide_medium_snapshot", K(ret), KPC(this), K(compat_version), K(tablet_id), K(max_sync_medium_scn), K_(merge_reason)); int64_t schema_version = 0; ObGetMergeTablesResult result; ObMediumCompactionInfo medium_info; if (OB_FAIL(medium_info.init_data_version(compat_version))) { LOG_WARN("fail to set data version", K(ret), K(tablet_id), K(compat_version)); } else if (is_user_request(merge_reason_)) { if (OB_FAIL(choose_scn_for_user_request(max_sync_medium_scn, medium_info, result, schema_version))) { LOG_WARN("failed to choose medium scn for user request", K(ret), KPC(this)); } } else if (ObAdaptiveMergePolicy::TENANT_MAJOR == merge_reason_) { if (OB_FAIL(choose_major_snapshot(max_sync_medium_scn, medium_info, result, schema_version))) { LOG_WARN("failed to choose medium scn for major", K(ret), KPC(this)); } } else if (OB_FAIL(choose_medium_snapshot(max_sync_medium_scn, medium_info, result, schema_version))) { LOG_WARN("failed to choose medium scn for medium", K(ret), KPC(this)); } if (OB_FAIL(ret)) { } else if (medium_info.medium_snapshot_ <= max_sync_medium_scn) { ret = OB_NO_NEED_MERGE; } #ifdef ERRSIM if (OB_SUCC(ret) || OB_NO_NEED_MERGE == ret) { ret = errsim_choose_medium_snapshot(max_sync_medium_scn, medium_info); } if (OB_SUCC(ret)) { ret = OB_E(EventTable::EN_SCHEDULE_MEDIUM_FAILED) ret; if (OB_FAIL(ret)) { LOG_INFO("ERRSIM EN_SCHEDULE_MEDIUM_FAILED", K(ret)); } } #endif if (FAILEDx(prepare_medium_info(result, schema_version, medium_info))) { if (OB_TABLE_IS_DELETED == ret || OB_NO_NEED_MERGE == ret) { ret = OB_SUCCESS; } else { LOG_WARN("failed to prepare medium info", K(ret), K(result)); } } else if (OB_FAIL(submit_medium_clog(medium_info))) { LOG_WARN("failed to submit medium clog and update inner table", K(ret), KPC(this)); } else if (OB_NOT_NULL(schedule_stat_)) { ++schedule_stat_->submit_clog_cnt_; } // 
delete tablet_id in ObLSReservedSnapshotMgr even if submit clog or update inner table failed if (OB_TMP_FAIL(ls_.del_dependent_medium_tablet(tablet_id))) { LOG_ERROR("failed to delete dependent medium tablet", K(tmp_ret), KPC(this)); ob_abort(); } ret = OB_NO_NEED_MERGE == ret ? OB_SUCCESS : ret; if (OB_FAIL(ret)) { // add schedule suspect info if (OB_TMP_FAIL(ADD_SUSPECT_INFO(MEDIUM_MERGE, ObDiagnoseTabletType::TYPE_MEDIUM_MERGE, ls_.get_ls_id(), tablet_id, ObSuspectInfoType::SUSPECT_SCHEDULE_MEDIUM_FAILED, medium_info.medium_snapshot_, medium_info.storage_schema_.store_column_cnt_, static_cast(ret)))) { LOG_WARN("failed to add suspect info", K(tmp_ret)); } } } return ret; } int ObMediumCompactionScheduleFunc::errsim_choose_medium_snapshot( const int64_t max_sync_medium_scn, ObMediumCompactionInfo &medium_info) { int ret = OB_SUCCESS; if (tablet_handle_.get_obj()->get_tablet_meta().tablet_id_.id() > ObTabletID::MIN_USER_TABLET_ID) { ret = OB_E(EventTable::EN_SCHEDULE_MEDIUM_COMPACTION) ret; } if (OB_FAIL(ret)) { LOG_INFO("ERRSIM EN_SCHEDULE_MEDIUM_COMPACTION", K(ret), KPC(this)); const int64_t snapshot_gc_ts = MTL(ObTenantFreezeInfoMgr *)->get_snapshot_gc_ts(); medium_info.medium_snapshot_ = MIN(weak_read_ts_, snapshot_gc_ts); medium_info.compaction_type_ = ObMediumCompactionInfo::MEDIUM_COMPACTION; int64_t max_reserved_snapshot = 0; if (OB_FAIL(get_max_reserved_snapshot(max_reserved_snapshot))) { LOG_WARN("failed to get reserved snapshot", K(ret), KPC(this)); } else if (medium_info.medium_snapshot_ > max_sync_medium_scn && medium_info.medium_snapshot_ >= max_reserved_snapshot) { FLOG_INFO("ERRSIM EN_SCHEDULE_MEDIUM_COMPACTION", KPC(this)); ret = OB_SUCCESS; } else { ret = OB_NO_NEED_MERGE; } } return ret; } int ObMediumCompactionScheduleFunc::init_schema_changed( ObMediumCompactionInfo &medium_info) { int ret = OB_SUCCESS; int64_t full_stored_col_cnt = 0; const ObStorageSchema &schema = medium_info.storage_schema_; if (OB_UNLIKELY(!schema.is_inited())) { ret = 
OB_ERR_UNEXPECTED; LOG_WARN("schema is not inited", KR(ret), K(schema)); } else if (OB_FAIL(schema.get_stored_column_count_in_sstable(full_stored_col_cnt))) { LOG_WARN("failed to get stored column count in sstable", K(ret), K(schema)); } else if (OB_UNLIKELY(tablet_handle_.get_obj()->get_last_major_column_count() > full_stored_col_cnt)) { ret = OB_ERR_UNEXPECTED; LOG_ERROR("stored col cnt in curr schema is less than old major sstable", K(ret), "col_cnt_in_sstable", tablet_handle_.get_obj()->get_last_major_column_count(), "col_cnt_in_schema", full_stored_col_cnt, KPC(this)); } else if (tablet_handle_.get_obj()->get_last_major_column_count() != full_stored_col_cnt || tablet_handle_.get_obj()->get_last_major_compressor_type() != schema.get_compressor_type() || (ObRowStoreType::DUMMY_ROW_STORE != tablet_handle_.get_obj()->get_last_major_latest_row_store_type() && tablet_handle_.get_obj()->get_last_major_latest_row_store_type() != schema.row_store_type_)) { medium_info.is_schema_changed_ = true; LOG_INFO("schema changed", K(schema), "col_cnt_in_sstable", tablet_handle_.get_obj()->get_last_major_column_count(), "compressor_type_in_sstable", tablet_handle_.get_obj()->get_last_major_compressor_type(), "latest_row_store_type_in_sstable", tablet_handle_.get_obj()->get_last_major_latest_row_store_type()); } else { medium_info.is_schema_changed_ = false; } return ret; } int ObMediumCompactionScheduleFunc::init_parallel_range_and_schema_changed_and_co_merge_type( const ObGetMergeTablesResult &result, ObMediumCompactionInfo &medium_info) { int ret = OB_SUCCESS; int64_t expected_task_count = 0; const int64_t tablet_size = medium_info.storage_schema_.get_tablet_size(); const ObSSTable *first_sstable = static_cast(result.handle_.get_table(0)); ObTablet *tablet = nullptr; if (OB_UNLIKELY(!tablet_handle_.is_valid())) { ret = OB_ERR_UNEXPECTED; LOG_WARN("invalid tablet_handle", K(ret), K(tablet_handle_)); } else if (FALSE_IT(tablet = tablet_handle_.get_obj())) { } else if 
(OB_ISNULL(first_sstable)) { ret = OB_ERR_UNEXPECTED; LOG_WARN("sstable is unexpected null", K(ret), K(result)); } else { const int64_t macro_block_cnt = first_sstable->get_data_macro_block_count(); int64_t inc_row_cnt = 0; int64_t inc_macro_cnt = 0; for (int64_t i = 0; OB_SUCC(ret) && i < result.handle_.get_count(); ++i) { inc_row_cnt += static_cast(result.handle_.get_table(i))->get_row_count(); inc_macro_cnt += static_cast(result.handle_.get_table(i))->get_data_macro_block_count(); } if (OB_FAIL(ret)) { } else if ((0 == macro_block_cnt && inc_row_cnt > SCHEDULE_RANGE_ROW_COUNT_THRESHOLD) || (first_sstable->get_row_count() >= SCHEDULE_RANGE_ROW_COUNT_THRESHOLD && inc_row_cnt >= first_sstable->get_row_count() * SCHEDULE_RANGE_INC_ROW_COUNT_PERCENRAGE_THRESHOLD)) { const int64_t estimate_macro_cnt = macro_block_cnt + inc_macro_cnt / 5; if (OB_FAIL(ObParallelMergeCtx::get_concurrent_cnt(tablet_size, estimate_macro_cnt, expected_task_count))) { STORAGE_LOG(WARN, "failed to get concurrent cnt", K(ret), K(tablet_size), K(expected_task_count), KPC(first_sstable)); } } #ifdef ERRSIM if (OB_SUCC(ret)) { ret = OB_E(EventTable::EN_COMPACTION_MEDIUM_INIT_PARALLEL_RANGE) ret; if (OB_FAIL(ret)) { expected_task_count = 2; LOG_INFO("ERRSIM EN_COMPACTION_MEDIUM_INIT_PARALLEL_RANGE", KPC(this), K(expected_task_count)); ret = OB_SUCCESS; } else { ret = OB_E(EventTable::EN_COMPACTION_MEDIUM_INIT_LARGE_PARALLEL_RANGE) ret; if (OB_FAIL(ret)) { expected_task_count = 64; LOG_INFO("ERRSIM EN_COMPACTION_MEDIUM_INIT_LARGE_PARALLEL_RANGE", KPC(this), K(expected_task_count)); ret = OB_SUCCESS; } } } #endif // init is_schema_changed if (FAILEDx(init_schema_changed(medium_info))) { STORAGE_LOG(WARN, "failed to init schema changed", KR(ret), K(first_sstable)); } else if (!tablet->is_row_store() && OB_FAIL(init_co_major_merge_type(result, medium_info))) { STORAGE_LOG(WARN, "failed to init co major merge type"); } if (OB_FAIL(ret)) { } else if (expected_task_count <= 1) { 
medium_info.clear_parallel_range(); } else { ObTableStoreIterator table_iter; ObArrayArray range_array; ObPartitionMultiRangeSpliter range_spliter; ObSEArray input_range_array; ObStoreRange range; range.set_start_key(ObStoreRowkey::MIN_STORE_ROWKEY); range.set_end_key(ObStoreRowkey::MAX_STORE_ROWKEY); const bool is_major = medium_info.is_major_compaction(); lib::CompatModeGuard g(tablet->get_tablet_meta().compat_mode_); if (OB_FAIL(prepare_iter(result, table_iter))) { LOG_WARN("failed to get table iter", K(ret), K(range_array)); } else if (OB_FAIL(input_range_array.push_back(range))) { LOG_WARN("failed to push back range", K(ret), K(range)); } else { bool recalc_count_flag = false; do { if (OB_FAIL(range_spliter.get_split_multi_ranges( input_range_array, expected_task_count, tablet->get_rowkey_read_info(), table_iter, allocator_, range_array))) { LOG_WARN("failed to get split multi range", K(ret), K(range_array)); } else if (OB_FAIL(medium_info.gene_parallel_info(allocator_, range_array))) { LOG_WARN("failed to get parallel ranges", K(ret), K(range_array)); } else { int64_t buf_len = ObTabletMediumCompactionInfoRecorder::cal_buf_len(tablet->get_tablet_meta().tablet_id_, medium_info, nullptr/*log_header*/); #ifdef ERRSIM ret = OB_E(EventTable::EN_COMPACTION_MEDIUM_INIT_LARGE_PARALLEL_RANGE) ret; if (OB_FAIL(ret)) { ret = OB_SUCCESS; if (!recalc_count_flag) { buf_len = common::OB_MAX_LOG_ALLOWED_SIZE; } } #endif if (buf_len < common::OB_MAX_LOG_ALLOWED_SIZE) { LOG_TRACE("success to split ranges", KR(ret), K(buf_len), K(medium_info.parallel_merge_info_), K(range_array), K(medium_info.parallel_merge_info_.get_serialize_size())); break; } else if (recalc_count_flag) { expected_task_count -= MAX(1, expected_task_count / 5); } else { recalc_count_flag = true; // get parallel info serialize size const int64_t parallel_size = medium_info.parallel_merge_info_.get_serialize_size(); const double avg_range_size = (parallel_size + 0.0) / range_array.count(); const int64_t 
rest_info_size = buf_len - parallel_size; expected_task_count = MAX(1, (common::OB_MAX_LOG_ALLOWED_SIZE - 1 - rest_info_size) / avg_range_size); expected_task_count = MIN(expected_task_count, MAX_MERGE_THREAD); LOG_INFO("success to recalc ranges", KR(ret), K(buf_len), K(expected_task_count), K(avg_range_size), K(rest_info_size)); } medium_info.clear_parallel_range(); table_iter.resume(); range_array.reuse(); } } while (OB_SUCC(ret) && !medium_info.contain_parallel_range_ && expected_task_count > 1); } } } return ret; } int ObMediumCompactionScheduleFunc::init_co_major_merge_type( const ObGetMergeTablesResult &result, ObMediumCompactionInfo &medium_info) { int ret = OB_SUCCESS; ObCOMajorMergePolicy::ObCOMajorMergeType major_merge_type = ObCOMajorMergePolicy::INVALID_CO_MAJOR_MERGE_TYPE; if (OB_FAIL(ObCOMajorMergePolicy::decide_co_major_merge_type( result, medium_info.storage_schema_, tablet_handle_, major_merge_type))) { LOG_WARN("failed to decide co major merge type", K(ret)); } else { medium_info.co_major_merge_type_ = major_merge_type; LOG_DEBUG("chengkong debug: success to get ", "major_merge_type", ObCOMajorMergePolicy::co_major_merge_type_to_str(major_merge_type)); } return ret; } int ObMediumCompactionScheduleFunc::get_result_for_major( ObTablet &tablet, const ObMediumCompactionInfo &medium_info, ObGetMergeTablesResult &result) { int ret = OB_SUCCESS; ObSSTable *base_table = nullptr; ObTabletMemberWrapper table_store_wrapper; if (OB_FAIL(tablet.fetch_table_store(table_store_wrapper))) { LOG_WARN("fail to fetch table store", K(ret)); } else if (OB_UNLIKELY(!table_store_wrapper.get_member()->is_valid())) { ret = OB_INVALID_ARGUMENT; LOG_WARN("get invalid argument", K(ret), K(medium_info), KPC(table_store_wrapper.get_member())); } else if (OB_ISNULL(base_table = static_cast( table_store_wrapper.get_member()->get_major_sstables().get_boundary_table(true/*last*/)))) { ret = OB_ENTRY_NOT_EXIST; LOG_WARN("major sstable not exist", K(ret), 
KPC(table_store_wrapper.get_member()));
  } else if (base_table->get_snapshot_version() >= medium_info.medium_snapshot_) {
    // the newest major sstable already covers the requested snapshot
    ret = OB_NO_NEED_MERGE;
  } else if (OB_FAIL(result.handle_.add_sstable(base_table, table_store_wrapper.get_meta_handle()))) {
    LOG_WARN("failed to add table into iterator", K(ret), KP(base_table));
  } else {
    const ObSSTableArray &minor_tables = table_store_wrapper.get_member()->get_minor_sstables();
    // becomes true at the first minor sstable whose upper_trans_version reaches the base
    // table's snapshot version; that table and every later one are added to the result
    bool start_add_table_flag = false;
    for (int64_t i = 0; OB_SUCC(ret) && i < minor_tables.count(); ++i) {
      if (OB_ISNULL(minor_tables[i])) {
        ret = OB_ERR_UNEXPECTED;
        LOG_ERROR("table must not null", K(ret), K(i), K(minor_tables));
      } else if (!start_add_table_flag
          && minor_tables[i]->get_upper_trans_version() >= base_table->get_snapshot_version()) {
        start_add_table_flag = true;
      }
      if (OB_SUCC(ret) && start_add_table_flag) {
        if (OB_FAIL(result.handle_.add_sstable(minor_tables[i], table_store_wrapper.get_meta_handle()))) {
          LOG_WARN("failed to add table", K(ret));
        }
      }
    }
  }
  return ret;
}

// Copies every table held in result.handle_ into table_iter, in handle order.
// @return OB_ERR_UNEXPECTED if the handle is empty, otherwise the first add_table() error.
int ObMediumCompactionScheduleFunc::prepare_iter(
    const ObGetMergeTablesResult &result,
    ObTableStoreIterator &table_iter)
{
  int ret = OB_SUCCESS;
  if (OB_UNLIKELY(result.handle_.empty())) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("handle is invalid", K(ret), K(result));
  }
  for (int i = 0; OB_SUCC(ret) && i < result.handle_.get_count(); ++i) {
    if (OB_FAIL(table_iter.add_table(result.handle_.get_table(i)))) {
      LOG_WARN("failed to add table into table_iter", K(ret), K(i));
    }
  }
  return ret;
}

// Fills the remaining fields of medium_info before it is submitted.
// cluster_id_ and tenant_id_ are written unconditionally, before any validation.
// @param result          merge tables chosen for this compaction, must not be empty
// @param schema_version  schema version to build the storage schema from; 0 yields OB_NO_NEED_MERGE
// @param medium_info     compaction info being prepared
int ObMediumCompactionScheduleFunc::prepare_medium_info(
    const ObGetMergeTablesResult &result,
    const int64_t schema_version,
    ObMediumCompactionInfo &medium_info)
{
  int ret = OB_SUCCESS;
  medium_info.cluster_id_ = GCONF.cluster_id;
  medium_info.tenant_id_ = MTL_ID();
  if (OB_UNLIKELY(!tablet_handle_.is_valid())) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("invalid tablet_handle", K(ret), K(tablet_handle_));
  } else if (OB_UNLIKELY(result.handle_.empty())) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("table handle in result is
empty", KR(ret), K(result)); } else if (0 == schema_version) { // not formal schema version ret = OB_NO_NEED_MERGE; LOG_TRACE("not formal schema version", KR(ret), KPC(this), K(schema_version)); } else if (medium_info.is_medium_compaction()) { ObMultiVersionSchemaService *schema_service = nullptr; ObTablet *tablet = tablet_handle_.get_obj(); if (OB_ISNULL(schema_service = MTL(ObTenantSchemaService *)->get_schema_service())) { ret = OB_ERR_UNEXPECTED; LOG_WARN("failed to get schema service from MTL", K(ret)); } else if (FALSE_IT(medium_info.storage_schema_.reset())) { } else if (OB_FAIL(get_table_schema_to_merge(*schema_service, *tablet, schema_version, medium_info.medium_compat_version_, allocator_, medium_info.storage_schema_))) { // for major compaction, storage schema is inited in choose_major_snapshot if (OB_TABLE_IS_DELETED != ret) { LOG_WARN("failed to get table schema", KR(ret), KPC(this), K(medium_info)); } } } if (FAILEDx(init_parallel_range_and_schema_changed_and_co_merge_type(result, medium_info))) { LOG_WARN("failed to init parallel range", K(ret), K(medium_info)); } else { medium_info.last_medium_snapshot_ = result.handle_.get_table(0)->get_snapshot_version(); LOG_TRACE("success to prepare medium info", K(ret), K(medium_info)); } return ret; } int ObMediumCompactionScheduleFunc::get_table_id( ObMultiVersionSchemaService &schema_service, const ObTabletID &tablet_id, const int64_t schema_version, uint64_t &table_id) { int ret = OB_SUCCESS; table_id = OB_INVALID_ID; ObSEArray tablet_ids; ObSEArray table_ids; if (OB_FAIL(tablet_ids.push_back(tablet_id))) { LOG_WARN("failed to add tablet id", K(ret)); } else if (OB_FAIL(schema_service.get_tablet_to_table_history(MTL_ID(), tablet_ids, schema_version, table_ids))) { LOG_WARN("failed to get table id according to tablet id", K(ret), K(schema_version)); } else if (OB_UNLIKELY(table_ids.empty())) { ret = OB_ERR_UNEXPECTED; LOG_WARN("get unexpected empty table id", K(ret), K(table_ids)); } else if (table_ids.at(0) 
== OB_INVALID_ID){ ret = OB_TABLE_IS_DELETED; LOG_WARN("table is deleted", K(ret), K(tablet_id), K(schema_version)); } else { table_id = table_ids.at(0); } return ret; } int ObMediumCompactionScheduleFunc::get_table_schema_to_merge( ObMultiVersionSchemaService &schema_service, const ObTablet &tablet, const int64_t schema_version, const int64_t medium_compat_version, ObIAllocator &allocator, ObStorageSchema &storage_schema) { int ret = OB_SUCCESS; const uint64_t tenant_id = MTL_ID(); const ObTabletID &tablet_id = tablet.get_tablet_meta().tablet_id_; uint64_t table_id = OB_INVALID_ID; schema::ObSchemaGetterGuard schema_guard; const ObTableSchema *table_schema = nullptr; int64_t save_schema_version = schema_version; if (OB_FAIL(get_table_id(schema_service, tablet_id, schema_version, table_id))) { if (OB_TABLE_IS_DELETED != ret) { LOG_WARN("failed to get table id", K(ret), K(tablet_id)); } } else if (OB_FAIL(schema_service.retry_get_schema_guard(tenant_id, schema_version, table_id, schema_guard, save_schema_version))) { if (OB_TABLE_IS_DELETED == ret) { LOG_WARN("table is deleted", K(ret), K(table_id)); } else if (OB_ERR_SCHEMA_HISTORY_EMPTY == ret) { LOG_WARN("schema history may recycle", K(ret)); } else { LOG_WARN("Fail to get schema", K(ret), K(tenant_id), K(schema_version), K(table_id)); } } else if (OB_UNLIKELY(save_schema_version < schema_version)) { ret = OB_SCHEMA_ERROR; LOG_WARN("can not use older schema version", K(ret), K(schema_version), K(save_schema_version), K(table_id)); } else if (OB_FAIL(schema_guard.get_table_schema(tenant_id, table_id, table_schema))) { LOG_WARN("Fail to get table schema", K(ret), K(table_id)); } else if (NULL == table_schema) { ret = OB_TABLE_IS_DELETED; LOG_WARN("table is deleted", K(ret), K(table_id)); } #ifdef ERRSIM if (OB_SUCC(ret)) { static bool have_set_errno = false; static ObTabletID errno_tablet_id; ret = OB_E(EventTable::EN_SCHEDULE_MAJOR_GET_TABLE_SCHEMA) ret; if (OB_FAIL(ret)) { if (tablet_id.id() > 
ObTabletID::MIN_USER_TABLET_ID && tablet_id != tablet.get_tablet_meta().data_tablet_id_ && ATOMIC_BCAS(&have_set_errno, false, true)) { LOG_INFO("ERRSIM EN_SCHEDULE_MAJOR_GET_TABLE_SCHEMA", K(ret), K(table_id), K(tablet_id)); errno_tablet_id = tablet_id; return ret; } else { ret = OB_SUCCESS; } } } #endif int64_t storage_schema_version = ObStorageSchema::STORAGE_SCHEMA_VERSION_V3; if (medium_compat_version < ObMediumCompactionInfo::MEDIUM_COMPAT_VERSION_V2) { storage_schema_version = ObStorageSchema::STORAGE_SCHEMA_VERSION; } else if (medium_compat_version < ObMediumCompactionInfo::MEDIUM_COMPAT_VERSION_V3) { storage_schema_version = ObStorageSchema::STORAGE_SCHEMA_VERSION_V2; } // for old version medium info, need generate old version schema if (FAILEDx(storage_schema.init( allocator, *table_schema, tablet.get_tablet_meta().compat_mode_, false/*skip_column_info*/, storage_schema_version))) { LOG_WARN("failed to init storage schema", K(ret), K(schema_version)); } else { LOG_INFO("get schema to merge", K(tablet_id), K(table_id), K(schema_version), K(save_schema_version), K(storage_schema), K(*reinterpret_cast(table_schema))); } return ret; } int ObMediumCompactionScheduleFunc::submit_medium_clog( ObMediumCompactionInfo &medium_info) { int ret = OB_SUCCESS; #ifdef ERRSIM ret = OB_E(EventTable::EN_MEDIUM_COMPACTION_SUBMIT_CLOG_FAILED) ret; if (OB_FAIL(ret)) { LOG_INFO("ERRSIM EN_MEDIUM_COMPACTION_SUBMIT_CLOG_FAILED", KPC(this)); return ret; } #endif ObTablet *tablet = nullptr; if (OB_UNLIKELY(!tablet_handle_.is_valid())) { ret = OB_ERR_UNEXPECTED; LOG_WARN("invalid tablet_handle", K(ret), K(tablet_handle_)); } else if (FALSE_IT(tablet = tablet_handle_.get_obj())) { } else if (OB_FAIL(tablet->submit_medium_compaction_clog(medium_info, allocator_))) { LOG_WARN("failed to submit medium compaction clog", K(ret), K(medium_info)); } else { LOG_INFO("success to submit medium compaction clog", K(ret), KPC(this), K(medium_info)); } return ret; } int 
ObMediumCompactionScheduleFunc::batch_check_medium_meta_table(
    const ObIArray &tablet_ls_infos,
    const hash::ObHashMap &ls_info_map,
    ObIArray &finish_tablet_ls,
    ObCompactionTimeGuard &time_guard)
{
  // Reads the tablet infos of all entries in tablet_ls_infos in one batch and appends to
  // finish_tablet_ls every entry for which check_medium_meta_table reports merge_finish.
  // A failure for a single tablet is only recorded in tmp_ret and logged; it does not
  // change the returned code or stop the loop.
  // NOTE(review): ObIArray / hash::ObHashMap / ObArray appear without template arguments
  // in this copy of the file -- confirm against the upstream source.
  int ret = OB_SUCCESS;
  int tmp_ret = OB_SUCCESS;
  if (tablet_ls_infos.empty()) {
  } else {
    share::ObTabletReplicaFilterHolder filters;
    ObArray tablet_infos;
    tablet_infos.set_attr(ObMemAttr(MTL_ID(), "TabletInfos"));
    if (OB_FAIL(tablet_infos.reserve(tablet_ls_infos.count()))) {
      LOG_WARN("failed to reserve array", KR(ret), "array_cnt", tablet_ls_infos.count());
    } else if (OB_FAIL(init_tablet_filters(filters))) {
      LOG_WARN("failed to init tablet filters", K(ret));
    } else if (OB_FAIL(ObTabletTableOperator::batch_get_tablet_info(GCTX.sql_proxy_, MTL_ID(),
        tablet_ls_infos, share::OBCG_STORAGE /*group_list*/, tablet_infos))) {
      LOG_WARN("failed to get tablet info", K(ret), K(tablet_ls_infos));
    } else {
      time_guard.click(ObCompactionScheduleTimeGuard::SEARCH_META_TABLE);
      // two cursors: idx walks the requested entries, i walks the rows that were read.
      // idx always advances; i only advances when the current row matches the current request,
      // so a request without a matching row is skipped.
      // presumably both arrays are in the same order -- verify against batch_get_tablet_info
      for (int64_t i = 0, idx = 0; OB_SUCC(ret) && i < tablet_infos.count() && idx < tablet_ls_infos.count(); ++idx) {
        bool merge_finish = false;
        const ObTabletCheckInfo &tablet_ls_info = tablet_ls_infos.at(idx);
        const ObTabletInfo &tablet_info = tablet_infos.at(i);
        const ObLSID &ls_id = tablet_info.get_ls_id();
        const ObTabletID &tablet_id = tablet_info.get_tablet_id();
        const int64_t check_medium_scn = tablet_ls_info.get_medium_scn();
        if (tablet_ls_info.get_ls_id() != ls_id || tablet_ls_info.get_tablet_id() != tablet_id) {
          LOG_INFO("tablet_ls_info has been deleted", K(tablet_ls_info), K(tablet_info));
        } else {
          if (OB_TMP_FAIL(check_medium_meta_table(check_medium_scn, tablet_info, filters, ls_info_map, merge_finish))) {
            LOG_WARN("failed to check medium meta table", K(tmp_ret), K(check_medium_scn), K(tablet_info));
          } else if (merge_finish
              && OB_TMP_FAIL(finish_tablet_ls.push_back(ObTabletCheckInfo(tablet_id, ls_id, check_medium_scn)))) {
            LOG_WARN("fail to push back tablet_ls_infos", K(tmp_ret), K(tablet_info));
          }
          ++i;
        }
      }
time_guard.click(ObCompactionScheduleTimeGuard::CHECK_META_TABLE); } } return ret; } int ObMediumCompactionScheduleFunc::check_medium_meta_table( const int64_t check_medium_snapshot, const ObTabletInfo &tablet_info, const share::ObTabletReplicaFilterHolder &filters, const hash::ObHashMap &ls_info_map, bool &merge_finish) { int ret = OB_SUCCESS; int tmp_ret = OB_SUCCESS; merge_finish = false; const ObLSID &ls_id = tablet_info.get_ls_id(); if (OB_UNLIKELY(check_medium_snapshot <= 0)) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument", K(ret), K(check_medium_snapshot), K(tablet_info)); } else if (OB_UNLIKELY(!tablet_info.is_valid())) { ret = OB_ERR_UNEXPECTED; LOG_WARN("tabled_id is invalid", K(ret), K(tablet_info)); } else { const ObArray &replica_array = tablet_info.get_replicas(); int64_t unfinish_cnt = 0; int64_t filter_cnt = 0; bool pass = true; const ObLSReplica *ls_replica = nullptr; for (int i = 0; OB_SUCC(ret) && i < replica_array.count(); ++i) { const ObTabletReplica &replica = replica_array.at(i); if (OB_UNLIKELY(!replica.is_valid())) { ret = OB_ERR_UNEXPECTED; LOG_WARN("replica info is invalid", K(ret), K(tablet_info), K(replica)); } else if (OB_FAIL(filters.check(replica, pass))) { LOG_WARN("filter replica failed", K(ret), K(replica), K(filters)); } else if (!pass) { // do nothing filter_cnt++; } else if (replica.get_snapshot_version() >= check_medium_snapshot) { // replica may have check_medium_snapshot = 2, but have received medium info of 3, // when this replica is elected as leader, this will happened } else { share::ObLSInfo ls_info; const ObLSReplica *ls_replica = nullptr; if (OB_TMP_FAIL(ls_info_map.get_refactored(ls_id, ls_info))) { LOG_WARN("failed to get map", K(tmp_ret), K(ls_id)); unfinish_cnt++; } else if (OB_ENTRY_NOT_EXIST == (tmp_ret = ls_info.find(replica.get_server(), ls_replica))) { filter_cnt++; LOG_TRACE("filter by ls locality", K(tmp_ret), K(replica), K(ls_info)); } else { LOG_TRACE("tablet unfinish", K(tmp_ret), K(ls_info), 
K(replica)); unfinish_cnt++; } } } // end of for LOG_INFO("check_medium_compaction_finish", K(ret), K(tablet_info), K(check_medium_snapshot), K(unfinish_cnt), K(filter_cnt), "total_cnt", replica_array.count()); if (0 == unfinish_cnt) { // merge finish merge_finish = true; } } return ret; } int ObMediumCompactionScheduleFunc::init_tablet_filters(share::ObTabletReplicaFilterHolder &filters) { int ret = OB_SUCCESS; if (OB_FAIL(filters.set_filter_not_exist_server(ObAllServerTracer::get_instance()))) { LOG_WARN("fail to set not exist server filter", KR(ret)); } else if (OB_FAIL(filters.set_filter_permanent_offline(ObAllServerTracer::get_instance()))) { LOG_WARN("fail to set filter", KR(ret)); } return ret; } int ObMediumCompactionScheduleFunc::check_medium_checksum( const ObIArray &checksum_items, ObIArray &error_pairs, int64_t &item_idx, int &check_ret) { int ret = OB_SUCCESS; int tmp_ret = OB_SUCCESS; check_ret = OB_SUCCESS; int64_t items_cnt = checksum_items.count(); ObTabletReplicaChecksumItem prev_item; ObTabletReplicaChecksumItem curr_item; while (OB_SUCC(ret) && item_idx < items_cnt) { curr_item.reset(); if (OB_FAIL(curr_item.assign(checksum_items.at(item_idx)))) { LOG_WARN("fail to assign tablet replica checksum item", KR(ret), K(item_idx), "item", checksum_items.at(item_idx)); } else { if (prev_item.is_key_valid()) { if (curr_item.is_same_tablet(prev_item)) { // same tablet if (OB_TMP_FAIL(curr_item.verify_checksum(prev_item))) { LOG_DBA_ERROR(OB_CHECKSUM_ERROR, "msg", "checksum error in tablet replica checksum", KR(tmp_ret), K(curr_item), K(prev_item)); if (OB_SUCCESS == check_ret) { if (OB_TMP_FAIL(error_pairs.push_back(ObTabletLSPair(curr_item.tablet_id_, curr_item.ls_id_)))) { LOG_WARN("fail to push back error pair", K(tmp_ret), "tablet_id", curr_item.tablet_id_, "ls_id", curr_item.ls_id_); } check_ret = OB_CHECKSUM_ERROR; } } #ifdef ERRSIM if (OB_SUCC(ret)) { ret = OB_E(EventTable::EN_MEDIUM_REPLICA_CHECKSUM_ERROR) OB_SUCCESS; if (OB_FAIL(ret)) { 
STORAGE_LOG(INFO, "ERRSIM EN_MEDIUM_REPLICA_CHECKSUM_ERROR", K(ret), "tablet_id", curr_item.tablet_id_, "ls_id", curr_item.ls_id_); if (OB_TMP_FAIL(error_pairs.push_back(ObTabletLSPair(curr_item.tablet_id_, curr_item.ls_id_)))) { LOG_WARN("fail to push back error pair", K(tmp_ret), "tablet_id", curr_item.tablet_id_, "ls_id", curr_item.ls_id_); } check_ret = OB_CHECKSUM_ERROR; } } #endif } else { break; } } else if (OB_FAIL(prev_item.assign(curr_item))) { LOG_WARN("fail to assign tablet replica checksum item", KR(ret), K(item_idx), K(curr_item)); } ++item_idx; } } // end for while return ret; } int ObMediumCompactionScheduleFunc::batch_check_medium_checksum( const ObIArray &checksum_items) { int ret = OB_SUCCESS; int tmp_ret = OB_SUCCESS; int check_ret = OB_SUCCESS; int64_t pair_idx = 0; int64_t item_idx = 0; int64_t items_cnt = checksum_items.count(); int64_t affected_rows = 0; ObSEArray error_pairs; while (OB_SUCC(ret) && item_idx < items_cnt) { const ObTabletReplicaChecksumItem &tmp_item = checksum_items.at(item_idx); const ObTabletID &tablet_id = tmp_item.tablet_id_; const ObLSID &ls_id = tmp_item.ls_id_; if (OB_FAIL(check_medium_checksum(checksum_items, error_pairs, item_idx, check_ret))) { LOG_WARN("failed to check medium checksum", K(ret), K(item_idx)); } else if (OB_SUCCESS == check_ret) { ObLSHandle ls_handle; ObTabletHandle unused_handle; if (OB_TMP_FAIL((MTL(storage::ObLSService *)->get_ls(ls_id, ls_handle, ObLSGetMod::STORAGE_MOD)))) { if (OB_LS_NOT_EXIST == tmp_ret) { LOG_TRACE("ls not exist", K(tmp_ret), K(ls_id)); } else { LOG_WARN("failed to get ls", K(tmp_ret), K(ls_id)); } } else if (OB_TMP_FAIL(ls_handle.get_ls()->update_medium_compaction_info(tablet_id, unused_handle))) { LOG_WARN("failed to update medium compaction info", K(tmp_ret), K(ls_id), K(tablet_id)); } else { FLOG_INFO("finish check medium compaction info", K(tmp_ret), K(ls_id), K(tablet_id)); } } ++pair_idx; check_ret = OB_SUCCESS; } // end for while if (!error_pairs.empty()) { if 
(OB_TMP_FAIL(ObTabletMetaTableCompactionOperator::batch_set_info_status(MTL_ID(), error_pairs, affected_rows))) { LOG_WARN("fail to batch set info status", KR(tmp_ret)); } else { LOG_INFO("succ to batch set info status", K(ret), K(affected_rows), K(error_pairs)); } } if (affected_rows > 0) { MTL(ObTenantTabletScheduler*)->update_error_tablet_cnt(affected_rows); } return ret; } // for Leader, clean wait_check_medium_scn int ObMediumCompactionScheduleFunc::batch_check_medium_finish( const hash::ObHashMap &ls_info_map, ObIArray &finish_tablet_ls_infos, const ObIArray &tablet_ls_infos, ObCompactionTimeGuard &time_guard) { int ret = OB_SUCCESS; if (tablet_ls_infos.empty()) { } else { // different ObTabletCheckInfo have different medium_check_scn ObArray checksum_items; checksum_items.set_attr(ObMemAttr(MTL_ID(), "CkmItems")); if (OB_FAIL(batch_check_medium_meta_table(tablet_ls_infos, ls_info_map, finish_tablet_ls_infos, time_guard))) { LOG_WARN("failed to check inner table", K(ret), K(tablet_ls_infos)); } else if (!finish_tablet_ls_infos.empty()) { if (OB_FAIL(checksum_items.reserve(finish_tablet_ls_infos.count()))) { LOG_WARN("failed to reserve array", KR(ret), "array_cnt", finish_tablet_ls_infos.count()); } else if (OB_FAIL(ObTabletReplicaChecksumOperator::get_tablets_replica_checksum( MTL_ID(), finish_tablet_ls_infos, checksum_items))) { LOG_WARN("failed to get tablet checksum", K(ret)); } else if (FALSE_IT(time_guard.click(ObCompactionScheduleTimeGuard::SEARCH_CHECKSUM))) { } else if (OB_FAIL(batch_check_medium_checksum(checksum_items))) { LOG_WARN("failed to check medium tablets checksum", K(ret)); } else if (FALSE_IT(time_guard.click(ObCompactionScheduleTimeGuard::CHECK_CHECKSUM))) { } } // TODO, sort tablet ls pair first } return ret; } // scheduler_called = false : called in compaction, DO NOT visit inner table in this func // scheduler_called = true : in scheduler loop, could visit inner table int ObMediumCompactionScheduleFunc::schedule_tablet_medium_merge( 
ObLS &ls, ObTablet &tablet, ObTabletSchedulePair &schedule_pair, bool &create_dag_flag, const int64_t input_major_snapshot, const bool scheduler_called) { int ret = OB_SUCCESS; create_dag_flag = false; #ifdef ERRSIM ret = OB_E(EventTable::EN_MEDIUM_CREATE_DAG) ret; if (OB_FAIL(ret)) { LOG_INFO("ERRSIM EN_MEDIUM_CREATE_DAG", K(ret)); return ret; } #endif const int64_t last_major_snapshot = tablet.get_last_major_snapshot_version(); // current compaction scn if (MTL(ObTenantTabletScheduler *)->could_major_merge_start() && last_major_snapshot > 0) { bool is_standy_tenant = !MTL_TENANT_ROLE_CACHE_IS_PRIMARY_OR_INVALID(); if (is_standy_tenant && !scheduler_called) { // standy tenant should not visit inner table if it is not scheduler_called, wait for scheduler loop } else { const ObTabletID &tablet_id = tablet.get_tablet_meta().tablet_id_; const ObLSID &ls_id = ls.get_ls_id(); ObArenaAllocator temp_allocator("GetMediumInfo", OB_MALLOC_NORMAL_BLOCK_SIZE, MTL_ID()); // for load medium info const ObMediumCompactionInfoList *medium_list = nullptr; bool schedule_flag = false; const int64_t major_frozen_snapshot = 0 == input_major_snapshot ? 
MTL(ObTenantTabletScheduler *)->get_frozen_version() : input_major_snapshot; // broadcast scn ObMediumCompactionInfo::ObCompactionType compaction_type = ObMediumCompactionInfo::COMPACTION_TYPE_MAX; int64_t schedule_scn = 0; // medium_snapshot in medium info bool tablet_need_freeze_flag = false; if (OB_FAIL(tablet.read_medium_info_list(temp_allocator, medium_list))) { LOG_WARN("failed to load medium info list", K(ret), K(tablet)); } else if (OB_FAIL(read_medium_info_from_list(*medium_list, last_major_snapshot, major_frozen_snapshot, compaction_type, schedule_scn))) { LOG_WARN("failed to read medium info from list", K(ret), K(ls_id), K(tablet_id), KPC(medium_list), K(last_major_snapshot)); } else if (is_standy_tenant) { // for STANDBY/RESTORE TENANT if (OB_FAIL(decide_standy_tenant_schedule(ls_id, tablet_id, compaction_type, schedule_scn, major_frozen_snapshot, *medium_list, schedule_flag))) { LOG_WARN("failed to decide whehter to schedule standy schedule", K(ret), K(ls_id), K(tablet_id), K(compaction_type), K(schedule_scn), K(major_frozen_snapshot), K(medium_list)); } } else { schedule_flag = true; } if (OB_FAIL(ret) || !schedule_flag) { } else if (schedule_scn > 0 && OB_FAIL(check_need_merge_and_schedule(ls, tablet, schedule_scn, tablet_need_freeze_flag, create_dag_flag))) { LOG_WARN("failed to check medium merge", K(ret), K(ls_id), K(tablet_id), K(schedule_scn)); } if (OB_SUCC(ret) && tablet_need_freeze_flag) { schedule_pair.tablet_id_ = tablet_id; schedule_pair.schedule_merge_scn_ = schedule_scn; } else { schedule_pair.reset(); } } } return ret; } int ObMediumCompactionScheduleFunc::decide_standy_tenant_schedule( const ObLSID &ls_id, const ObTabletID &tablet_id, const ObMediumCompactionInfo::ObCompactionType &compaction_type, const int64_t schedule_scn, const int64_t major_frozen_snapshot, const ObMediumCompactionInfoList &medium_list, bool &schedule_flag) { int ret = OB_SUCCESS; schedule_flag = false; if (ObMediumCompactionInfo::MAJOR_COMPACTION == 
compaction_type) { if (schedule_scn > major_frozen_snapshot) { ret = OB_ERR_UNEXPECTED; LOG_WARN("schedule_scn of current major is bigger than broadcast scn, wait for next round loop", K(ret), K(schedule_scn), K(major_frozen_snapshot)); } else { schedule_flag = true; } } else if (ObMediumCompactionInfo::MEDIUM_COMPACTION == compaction_type) { if (schedule_scn > major_frozen_snapshot && ObMediumCompactionInfo::MAJOR_COMPACTION == medium_list.get_last_compaction_type()) { ObTabletCompactionScnInfo ret_info; if (OB_FAIL(get_status_from_inner_table(ls_id, tablet_id, ret_info))) { LOG_WARN("failed to get status from inner tablet", K(ret), K(ls_id), K(tablet_id)); } else if (ret_info.could_schedule_next_round(medium_list.get_last_compaction_scn())) { LOG_INFO("success to check RS major checksum validation finished", K(ret), K(ls_id), K(tablet_id)); schedule_flag = true; } } else { schedule_flag = true; } } else { // does not read valid medium info, wait for next round scheduler loop } return ret; } int ObMediumCompactionScheduleFunc::read_medium_info_from_list( const ObMediumCompactionInfoList &medium_list, const int64_t last_major_snapshot, const int64_t major_frozen_snapshot, ObMediumCompactionInfo::ObCompactionType &compaction_type, int64_t &schedule_scn) { int ret = OB_SUCCESS; DLIST_FOREACH_X(info, medium_list.get_list(), OB_SUCC(ret)) { if (info->medium_snapshot_ <= last_major_snapshot) { // finished, this medium info could recycle } else { if (info->is_medium_compaction() || info->medium_snapshot_ <= major_frozen_snapshot) { schedule_scn = info->medium_snapshot_; compaction_type = (ObMediumCompactionInfo::ObCompactionType)info->compaction_type_; } break; // found one unfinish medium info, loop end } } return ret; } int ObMediumCompactionScheduleFunc::is_election_leader(const ObLSID &ls_id, bool &is_election_leader) { int ret = OB_SUCCESS; ObRole role = INVALID_ROLE; int64_t unused_proposal_id = 0; palf::PalfHandleGuard palf_handle_guard; if 
(OB_FAIL(MTL(logservice::ObLogService*)->open_palf(ls_id, palf_handle_guard))) { if (OB_LS_NOT_EXIST != ret) { LOG_WARN("failed to open palf", K(ret), K(ls_id)); } } else if (OB_FAIL(palf_handle_guard.get_role(role, unused_proposal_id))) { LOG_WARN("failed to get palf handle role", K(ret), K(ls_id)); } else { is_election_leader = is_leader_by_election(role); } return ret; } int ObMediumCompactionScheduleFunc::check_need_merge_and_schedule( ObLS &ls, ObTablet &tablet, const int64_t schedule_scn, bool &tablet_need_freeze_flag, bool &create_dag_flag) { int ret = OB_SUCCESS; int tmp_ret = OB_SUCCESS; bool need_merge = false; bool can_merge = false; bool need_force_freeze = false; const ObLSID &ls_id = ls.get_ls_id(); const ObTabletID &tablet_id = tablet.get_tablet_meta().tablet_id_; if (OB_UNLIKELY(0 == schedule_scn)) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument", K(ret), K(schedule_scn)); } else if (OB_FAIL(ObPartitionMergePolicy::check_need_medium_merge( ls, tablet, schedule_scn, need_merge, can_merge, need_force_freeze))) { // check merge finish LOG_WARN("failed to check medium merge", K(ret), K(ls_id), K(tablet_id)); } else if (need_merge && can_merge) { if (OB_TMP_FAIL(ObTenantTabletScheduler::schedule_merge_dag( ls.get_ls_id(), tablet, MEDIUM_MERGE, schedule_scn))) { if (OB_SIZE_OVERFLOW != tmp_ret && OB_EAGAIN != tmp_ret) { ret = tmp_ret; LOG_WARN("failed to schedule medium merge dag", K(ret), K(ls_id), K(tablet_id)); } } else { create_dag_flag = true; LOG_DEBUG("success to schedule medium merge dag", K(ret), K(schedule_scn)); } } else if (need_force_freeze) { tablet_need_freeze_flag = true; } return ret; } int ObMediumCompactionScheduleFunc::get_max_sync_medium_scn( const ObTablet &tablet, const ObMediumCompactionInfoList &medium_list, int64_t &max_sync_medium_scn) { int ret = OB_SUCCESS; max_sync_medium_scn = 0; int64_t max_sync_medium_scn_on_tablet = 0; int64_t max_sync_medium_scn_from_list = 0; if 
(OB_FAIL(tablet.get_max_sync_medium_scn(max_sync_medium_scn_on_tablet))) {
    LOG_WARN("failed to get max sync medium scn from tablet", KR(ret), K(tablet));
  } else if (OB_FAIL(medium_list.get_max_sync_medium_scn(max_sync_medium_scn_from_list))) {
    LOG_WARN("failed to get max sync medium scn from medium_list", KR(ret), K(medium_list));
  } else {
    // whichever source has seen the newer medium info wins
    max_sync_medium_scn = MAX(max_sync_medium_scn_on_tablet, max_sync_medium_scn_from_list);
    LOG_TRACE("get max sync medium scn", KR(ret), K(max_sync_medium_scn),
        K(max_sync_medium_scn_on_tablet), K(max_sync_medium_scn_from_list));
  }
  return ret;
}

} //namespace compaction
} // namespace oceanbase