From f06dba66da03754a4d48ee59265ba996137c7cc0 Mon Sep 17 00:00:00 2001 From: obdev Date: Fri, 6 Dec 2024 13:45:17 +0000 Subject: [PATCH] fix old minors lost when data split orthogonal migration. --- src/objit/src/ob_llvm_di_helper.cpp | 6 +- src/share/ob_debug_sync_point.h | 1 + src/storage/ddl/ob_tablet_split_task.cpp | 85 ++++++++++++++++--- src/storage/ddl/ob_tablet_split_task.h | 4 + .../ob_tablet_copy_finish_task.cpp | 22 ++++- .../ob_mds_table_merge_task.cpp | 9 ++ 6 files changed, 110 insertions(+), 17 deletions(-) diff --git a/src/objit/src/ob_llvm_di_helper.cpp b/src/objit/src/ob_llvm_di_helper.cpp index f7421cc7f..7209ae490 100644 --- a/src/objit/src/ob_llvm_di_helper.cpp +++ b/src/objit/src/ob_llvm_di_helper.cpp @@ -356,9 +356,9 @@ int ObLLVMDIHelper::create_array_type(ObLLVMDIType &base_type, int64_t count, int ret = OB_SUCCESS; ObDIType *type_ptr = NULL; DISubrange *subrange = NULL; - if (OB_ISNULL(jc_)) { - ret = OB_NOT_INIT; - LOG_WARN("jc is NULL", K(ret)); + if (OB_ISNULL(jc_)) { + ret = OB_NOT_INIT; + LOG_WARN("jc is NULL", K(ret)); } else if (OB_ISNULL(subrange = jc_->dbuilder_.getOrCreateSubrange(0, count))) { ret = OB_ERR_UNEXPECTED; LOG_WARN("subrange is NULL", K(ret)); diff --git a/src/share/ob_debug_sync_point.h b/src/share/ob_debug_sync_point.h index 2757a0d82..a0eda3062 100755 --- a/src/share/ob_debug_sync_point.h +++ b/src/share/ob_debug_sync_point.h @@ -678,6 +678,7 @@ class ObString; ACT(BEFOR_PREPARE_CREATE_TFS_INDEX_DOC_WORD,)\ ACT(AFTER_JOIN_LEARNER_LIST_FOR_SPECIFIED_SERVER,)\ ACT(BEFORE_MV_FINISH_COMPLETE_REFRESH,)\ + ACT(BEFORE_MIGRATION_CREATE_TABLE_STORE,)\ ACT(MAX_DEBUG_SYNC_POINT,) DECLARE_ENUM(ObDebugSyncPoint, debug_sync_point, OB_DEBUG_SYNC_POINT_DEF); diff --git a/src/storage/ddl/ob_tablet_split_task.cpp b/src/storage/ddl/ob_tablet_split_task.cpp index e95bcaf56..3ada0842e 100644 --- a/src/storage/ddl/ob_tablet_split_task.cpp +++ b/src/storage/ddl/ob_tablet_split_task.cpp @@ -381,7 +381,11 @@ int ObTabletSplitDag::init_by_param(const share::ObIDagInitParam *param) ret = OB_ERR_UNEXPECTED; LOG_WARN("unexpected err", K(ret), K(param_)); } else if (OB_FAIL(context_.init(param_))) { - LOG_WARN("init failed", K(ret)); + if (OB_NEED_RETRY != ret) { + LOG_WARN("init failed", K(ret)); + } else if (REACH_COUNT_INTERVAL(1000L)) { + LOG_INFO("wait conditions satisfied", K(ret), KPC(tmp_param)); + } } else { consumer_group_id_ = tmp_param->consumer_group_id_; is_inited_ = true; @@ -663,7 +667,8 @@ int ObTabletSplitPrepareTask::process() } else { #ifdef ERRSIM ret = OB_E(EventTable::EN_BLOCK_SPLIT_BEFORE_SSTABLES_SPLIT) OB_SUCCESS; - if (OB_FAIL(ret)) { // errsim trigger. + if (OB_SUCC(ret)) { + } else if (OB_EAGAIN == ret) { // ret=-4023, errsim trigger to test orthogonal ls rebuild. common::ObZone self_zone; ObString zone1_str("z1"); if (OB_FAIL(SVR_TRACER.get_server_zone(GCTX.self_addr(), self_zone))) { // overwrite ret is expected. @@ -671,8 +676,22 @@ int ObTabletSplitPrepareTask::process() } else if (0 != ObCharset::instr(ObCollationType::CS_TYPE_UTF8MB4_GENERAL_CI, self_zone.str().ptr(), self_zone.str().length(), zone1_str.ptr(), zone1_str.length())) { ret = OB_EAGAIN; - LOG_INFO("set eagain for tablet split", K(ret)); + LOG_INFO("[ERRSIM] set eagain for tablet split", K(ret)); } + } else if (OB_DDL_TASK_EXECUTE_TOO_MUCH_TIME == ret) { // ret=-4192, errsim trigger to test orthogonal ls migration. + common::ObAddr addr; + const ObAddr &my_addr = GCONF.self_addr_; + const ObString &errsim_migration_src_server_addr = GCONF.errsim_migration_src_server_addr.str(); + if (!errsim_migration_src_server_addr.empty() && OB_FAIL(addr.parse_from_string(errsim_migration_src_server_addr))) { + LOG_WARN("failed to parse from string to addr", K(ret), K(errsim_migration_src_server_addr)); + } else if (addr == my_addr) { + ret = OB_EAGAIN; + LOG_INFO("[ERRSIM] stuck split task", K(ret)); + } else { + LOG_INFO("[ERRSIM] skip stuck split task", K(errsim_migration_src_server_addr), K(my_addr)); + } + } else { + LOG_WARN("[ERRSIM] unknown errsim type", K(ret)); } #endif } @@ -1695,11 +1714,15 @@ int ObTabletSplitMergeTask::update_table_store_with_batch_tables( } if (OB_SUCC(ret)) { + ObLSHandle new_ls_handle; + const share::ObLSID &ls_id = ls_handle.get_ls()->get_ls_id(); param.tablet_split_param_.snapshot_version_ = src_tablet_handle.get_obj()->get_tablet_meta().snapshot_version_; param.tablet_split_param_.multi_version_start_ = src_tablet_handle.get_obj()->get_multi_version_start(); param.tablet_split_param_.merge_type_ = merge_type; - param.rebuild_seq_ = ls_handle.get_ls()->get_rebuild_seq(); - if (OB_FAIL(ls_handle.get_ls()->build_tablet_with_batch_tables(dst_tablet_id, param))) { + param.rebuild_seq_ = ls_handle.get_ls()->get_rebuild_seq(); // old rebuild seq. + if (OB_FAIL(MTL(ObLSService *)->get_ls(ls_id, new_ls_handle, ObLSGetMod::DDL_MOD))) { + LOG_WARN("failed to get log stream", K(ret), K(param)); + } else if (OB_FAIL(new_ls_handle.get_ls()->build_tablet_with_batch_tables(dst_tablet_id, param))) { LOG_WARN("failed to update tablet table store", K(ret), K(dst_tablet_id), K(param)); } FLOG_INFO("update batch sstables", K(ret), K(dst_tablet_id), K(batch_tables), K(param)); @@ -2663,8 +2686,8 @@ int ObTabletSplitUtil::check_data_split_finished( } else if (OB_FAIL(tmp_tablet_handle.get_obj()->fetch_table_store(table_store_wrapper))) { LOG_WARN("fail to fetch table store", K(ret)); } else if (OB_ISNULL(table_store_wrapper.get_member()->get_major_sstables().get_boundary_table(false/*first*/))) { - ret = OB_ERR_SYS; - LOG_WARN("split finished but major is lost", K(ret), K(ls_id), K(tablet_id), K(tmp_tablet_handle)); + ret = OB_EAGAIN; + LOG_WARN("wait for migration to copy sstable finished", K(ret), K(ls_id), K(tablet_id), K(tmp_tablet_handle)); } } } @@ -2699,6 +2722,10 @@ int ObTabletSplitUtil::check_satisfy_split_condition( } else if (OB_UNLIKELY(!is_tablet_status_need_to_split)) { ret = OB_TABLET_STATUS_NO_NEED_TO_SPLIT; LOG_WARN("there is no need to split, because of the special restore status of src tablet or des tablets", K(ret), K(source_tablet_handle), K(dest_tablets_id)); + } else if (OB_FAIL(check_tablet_ha_status(ls_handle, source_tablet_handle, dest_tablets_id))) { + if (OB_NEED_RETRY != ret) { + LOG_WARN("check tablet ha status failed", K(ls_handle), K(source_tablet_handle), K(dest_tablets_id)); + } } else if (OB_FAIL(tablet->get_all_memtables(memtable_handles))) { LOG_WARN("failed to get_memtable_mgr for get all memtable", K(ret), KPC(tablet)); } else if (!memtable_handles.empty()) { @@ -2706,12 +2733,6 @@ int ObTabletSplitUtil::check_satisfy_split_condition( if (REACH_COUNT_INTERVAL(1000L)) { LOG_INFO("should wait memtable dump", K(ret), "tablet_id", tablet->get_tablet_meta().tablet_id_, K(memtable_handles)); } - } else if (!source_tablet_handle.get_obj()->get_tablet_meta().ha_status_.check_allow_read()) { - ret = OB_NEED_RETRY; - if (REACH_COUNT_INTERVAL(1000L)) { - LOG_INFO("should wait data complete", K(ret), "tablet_id", tablet->get_tablet_meta().tablet_id_, - "tablet_meta", source_tablet_handle.get_obj()->get_tablet_meta()); - } } else if (OB_FAIL(ls_handle.get_ls()->get_max_decided_scn(max_decided_scn))) { LOG_WARN("get max decided log ts failed", K(ret), "ls_id", ls_handle.get_ls()->get_ls_id(), "source_tablet_id", tablet->get_tablet_meta().tablet_id_); @@ -3104,5 +3125,43 @@ int ObTabletSplitUtil::check_sstables_skip_data_split( return ret; } +// For migration, wait data status COMPLETE. +// For restore, wait restore status FULL/REMOTE. +int ObTabletSplitUtil::check_tablet_ha_status( + const ObLSHandle &ls_handle, + const ObTabletHandle &source_tablet_handle, + const ObIArray &dest_tablets_id) +{ + int ret = OB_SUCCESS; + if (OB_UNLIKELY(!ls_handle.is_valid() || !source_tablet_handle.is_valid() || dest_tablets_id.empty())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid arg", K(ret), K(source_tablet_handle), K(dest_tablets_id)); + } else if (!source_tablet_handle.get_obj()->get_tablet_meta().ha_status_.check_allow_read()) { + ret = OB_NEED_RETRY; + if (REACH_COUNT_INTERVAL(1000L)) { + LOG_INFO("should wait data complete", K(ret), + "tablet_meta", source_tablet_handle.get_obj()->get_tablet_meta()); + } + } else { + for (int64_t i = 0; OB_SUCC(ret) && i < dest_tablets_id.count(); i++) { + ObTabletHandle tmp_tablet_handle; + const ObTabletID &tablet_id = dest_tablets_id.at(i); + if ((OB_FAIL(ObDDLUtil::ddl_get_tablet(ls_handle, tablet_id, tmp_tablet_handle, ObMDSGetTabletMode::READ_ALL_COMMITED)))) { + LOG_WARN("get tablet failed", K(ret), K(tablet_id)); + } else if (OB_UNLIKELY(!tmp_tablet_handle.is_valid())) { + ret = OB_ERR_SYS; + LOG_WARN("invalid tablet", K(ret), K(tablet_id), K(tmp_tablet_handle)); + } else if (!tmp_tablet_handle.get_obj()->get_tablet_meta().ha_status_.check_allow_read()) { + ret = OB_NEED_RETRY; + if (REACH_COUNT_INTERVAL(1000L)) { + LOG_INFO("should wait data complete", K(ret), + "tablet_meta", tmp_tablet_handle.get_obj()->get_tablet_meta()); + } + } + } + } + return ret; +} + } //end namespace stroage } //end namespace oceanbase diff --git a/src/storage/ddl/ob_tablet_split_task.h b/src/storage/ddl/ob_tablet_split_task.h index c50398ce8..7f65b4d5c 100644 --- a/src/storage/ddl/ob_tablet_split_task.h +++ b/src/storage/ddl/ob_tablet_split_task.h @@ -476,6 +476,10 @@ private: static int check_and_determine_mds_end_scn( const ObTabletHandle &dest_tablet_handle, share::SCN &end_scn); + static int check_tablet_ha_status( + const ObLSHandle &ls_handle, + const ObTabletHandle &source_tablet_handle, + const ObIArray &dest_tablets_id); }; } // end namespace storage diff --git a/src/storage/high_availability/ob_tablet_copy_finish_task.cpp b/src/storage/high_availability/ob_tablet_copy_finish_task.cpp index a1b87dea5..83f133149 100644 --- a/src/storage/high_availability/ob_tablet_copy_finish_task.cpp +++ b/src/storage/high_availability/ob_tablet_copy_finish_task.cpp @@ -102,7 +102,27 @@ int ObTabletCopyFinishTask::process() ObCopyTabletStatus::STATUS status = ObCopyTabletStatus::MAX_STATUS; ObCopyTabletRecordExtraInfo *extra_info = nullptr; - if (!is_inited_) { +#ifdef ERRSIM + ret = OB_E(EventTable::EN_BLOCK_SPLIT_BEFORE_SSTABLES_SPLIT) OB_SUCCESS; + if (OB_SUCC(ret)) { + // do nothing. + } else if (OB_DDL_TASK_EXECUTE_TOO_MUCH_TIME == ret ) { // ret=-4192, errsim trigger to test ddl-split orthogonal ls-migration. + ret = OB_SUCCESS; + if (tablet_id_.is_inner_tablet() || tablet_id_.is_ls_inner_tablet()) { + } else if (GCONF.errsim_test_tablet_id.get_value() > 0 && tablet_id_.id() == GCONF.errsim_test_tablet_id.get_value()){ + LOG_INFO("[ERRSIM] stuck before create table store", K(tablet_id_), KPC(this)); + DEBUG_SYNC(BEFORE_MIGRATION_CREATE_TABLE_STORE); + } else { + LOG_INFO("start to process copy finish task", K(tablet_id_), KPC(this)); + } + } else { + ret = OB_SUCCESS; // other errsim errors of ddl split, ignored here. + } +#endif + + if (OB_FAIL(ret)) { + LOG_WARN("error found", K(ret)); + } else if (!is_inited_) { ret = OB_NOT_INIT; LOG_WARN("tablet copy finish task do not init", K(ret)); } else if (ha_dag_->get_ha_dag_net_ctx()->is_failed()) { diff --git a/src/storage/multi_data_source/ob_mds_table_merge_task.cpp b/src/storage/multi_data_source/ob_mds_table_merge_task.cpp index 69aede6f8..1652ded98 100644 --- a/src/storage/multi_data_source/ob_mds_table_merge_task.cpp +++ b/src/storage/multi_data_source/ob_mds_table_merge_task.cpp @@ -36,6 +36,7 @@ namespace storage { namespace mds { +ERRSIM_POINT_DEF(EN_SKIP_MDS_MINI_MERGE); ObMdsTableMergeTask::ObMdsTableMergeTask() : ObITask(ObITaskType::TASK_TYPE_MDS_MINI_MERGE), @@ -79,6 +80,14 @@ int ObMdsTableMergeTask::process() DEBUG_SYNC(AFTER_EMPTY_SHELL_TABLET_CREATE); bool need_schedule_mds_minor = true; +#ifdef ERRSIM + if (OB_SUCCESS != EN_SKIP_MDS_MINI_MERGE) { + ret = OB_NO_NEED_MERGE; + LOG_INFO("[ERRSIM] mds mini merge, skip", KR(ret), KPC_(mds_merge_dag)); + return ret; + } +#endif + if (OB_UNLIKELY(!is_inited_)) { ret = OB_NOT_INIT; LOG_WARN("not inited", K(ret), K_(is_inited), KPC(mds_merge_dag_));