diff --git a/src/storage/backup/ob_backup_task.cpp b/src/storage/backup/ob_backup_task.cpp index cf10c30bad..c42acf1050 100755 --- a/src/storage/backup/ob_backup_task.cpp +++ b/src/storage/backup/ob_backup_task.cpp @@ -46,6 +46,8 @@ #include "share/backup/ob_archive_store.h" #include "share/backup/ob_backup_data_table_operator.h" #include "share/backup/ob_backup_connectivity.h" +#include "share/rc/ob_tenant_base.h" +#include "observer/omt/ob_tenant.h" #include using namespace oceanbase::blocksstable; @@ -3569,6 +3571,9 @@ int ObLSBackupMetaTask::backup_ls_meta_and_tablet_metas_(const uint64_t tenant_i int ret = OB_SUCCESS; storage::ObLSHandle ls_handle; storage::ObLS *ls = NULL; + ObTenantDagScheduler *scheduler = NULL; + share::ObTenantBase *tenant_base = MTL_CTX(); + omt::ObTenant *tenant = NULL; ObBackupLSMetaInfo ls_meta_info; ObExternTabletMetaWriter writer; ObBackupDest backup_set_dest; @@ -3617,7 +3622,14 @@ int ObLSBackupMetaTask::backup_ls_meta_and_tablet_metas_(const uint64_t tenant_i return ret; }; - if (OB_FAIL(get_ls_handle(tenant_id, ls_id, ls_handle))) { + if (OB_ISNULL(scheduler = MTL(ObTenantDagScheduler*))) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("failed to get ObTenantDagScheduler from MTL", K(ret)); + } else if (OB_ISNULL(tenant_base)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("tenant base should not be NULL", K(ret), KP(tenant_base)); + } else if (FALSE_IT(tenant = static_cast(tenant_base))) { + } else if (OB_FAIL(get_ls_handle(tenant_id, ls_id, ls_handle))) { LOG_WARN("failed to get ls", K(ret), K(tenant_id), K(ls_id)); } else if (OB_ISNULL(ls = ls_handle.get_ls())) { ret = OB_ERR_UNEXPECTED; @@ -3627,12 +3639,43 @@ int ObLSBackupMetaTask::backup_ls_meta_and_tablet_metas_(const uint64_t tenant_i } else if (OB_FAIL(writer.init(backup_set_dest, param_.ls_id_, param_.turn_id_, param_.retry_id_))) { LOG_WARN("failed to init tablet info writer", K(ret)); } else { - if (OB_FAIL(ls->get_ls_meta_package_and_tablet_metas( + const int64_t WAIT_GC_LOCK_TIMEOUT = 3 * 60 * 1000 * 1000; // 3 min + const int64_t CHECK_GC_LOCK_INTERVAL = 1000000; // 1s + const int64_t wait_gc_lock_start_ts = ObTimeUtility::current_time(); + int64_t cost_ts = 0; + do { + if (ls->is_stopped()) { + ret = OB_NOT_RUNNING; + LOG_WARN("ls is not running, stop backup", K(ret), KPC(ls)); + } else if (scheduler->has_set_stop()) { + ret = OB_SERVER_IS_STOPPING; + LOG_WARN("tenant dag scheduler has set stop, stop backup", K(ret), KPC(ls)); + } else if (tenant->has_stopped()) { + ret = OB_TENANT_HAS_BEEN_DROPPED; + LOG_WARN("tenant has been stopped, stop backup", K(ret), KPC(ls)); + } else if (OB_FAIL(ls->get_ls_meta_package_and_tablet_metas( false/* check_archive */, save_ls_meta_f, backup_tablet_meta_f))) { - LOG_WARN("failed to get ls meta package and tablet meta", K(ret), KPC(ls)); - } else if (OB_FAIL(backup_ls_meta_package_(ls_meta_info))) { + if (OB_TABLET_GC_LOCK_CONFLICT != ret) { + LOG_WARN("failed to get ls meta package and tablet meta", K(ret), KPC(ls)); + } else { + cost_ts = ObTimeUtility::current_time() - wait_gc_lock_start_ts; + if (WAIT_GC_LOCK_TIMEOUT <= cost_ts) { + ret = OB_EAGAIN; + LOG_WARN("get ls meta package and tablet meta timeout, need try again.", K(ret), K(ls_id)); + } else { + ob_usleep(CHECK_GC_LOCK_INTERVAL); + } + } + } else { + cost_ts = ObTimeUtility::current_time() - wait_gc_lock_start_ts; + LOG_INFO("succeed to get ls meta package and tablet meta", K(ls_id), K(cost_ts)); + } + } while (OB_TABLET_GC_LOCK_CONFLICT == ret); + + + if (FAILEDx(backup_ls_meta_package_(ls_meta_info))) { LOG_WARN("failed to backup ls meta package", K(ret), K(ls_meta_info)); } else if (OB_FAIL(ObBackupLSTaskOperator::update_max_tablet_checkpoint_scn( *report_ctx_.sql_proxy_, diff --git a/src/storage/high_availability/ob_storage_ha_reader.cpp b/src/storage/high_availability/ob_storage_ha_reader.cpp index 9ce0217e55..5f82fe6d35 100644 --- a/src/storage/high_availability/ob_storage_ha_reader.cpp +++ b/src/storage/high_availability/ob_storage_ha_reader.cpp @@ -20,6 +20,7 @@ #include "storage/tablet/ob_tablet.h" #include "storage/high_availability/ob_storage_ha_utils.h" #include "storage/tablet/ob_tablet_iterator.h" +#include "observer/omt/ob_tenant.h" namespace oceanbase { @@ -2591,6 +2592,12 @@ int ObCopyLSViewInfoObReader::init( common::ObInOutBandwidthThrottle &bandwidth_throttle) { int ret = OB_SUCCESS; + ObTenantDagScheduler *scheduler = NULL; + share::ObTenantBase *tenant_base = MTL_CTX(); + omt::ObTenant *tenant = NULL; + ObLSService *ls_service = NULL; + ObLS *ls = NULL; + ObLSHandle ls_handle; if (IS_INIT) { ret = OB_INIT_TWICE; @@ -2599,22 +2606,71 @@ int ObCopyLSViewInfoObReader::init( || OB_UNLIKELY(!rpc_arg.is_valid())) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument", K(ret), K(src_info), K(rpc_arg)); + } else if (OB_ISNULL(scheduler = MTL(ObTenantDagScheduler*))) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("failed to get ObTenantDagScheduler from MTL", K(ret)); + } else if (OB_ISNULL(tenant_base)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("tenant base should not be NULL", K(ret), KP(tenant_base)); + } else if (FALSE_IT(tenant = static_cast(tenant_base))) { + } else if (OB_ISNULL(ls_service = MTL(ObLSService*))) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("failed to get ObLSService from MTL", K(ret), KP(ls_service)); + } else if (OB_FAIL(ls_service->get_ls(rpc_arg.ls_id_, ls_handle, ObLSGetMod::HA_MOD))) { + LOG_WARN("failed to get ls", K(ret), K(rpc_arg)); + } else if (OB_ISNULL(ls = ls_handle.get_ls())) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("ls should not be NULL", K(ret), K(rpc_arg)); } else if (OB_FAIL(rpc_reader_.init(bandwidth_throttle))) { LOG_WARN("fail to init tablet info rpc reader", K(ret)); - } else if (OB_FAIL(srv_rpc_proxy.to(src_info.src_addr_).by(OB_DATA_TENANT_ID) + } else { + const int64_t WAIT_GC_LOCK_TIMEOUT = 3 * 60 * 1000 * 1000; // 3 min + const int64_t CHECK_GC_LOCK_INTERVAL = 1000000; // 1s + const int64_t wait_gc_lock_start_ts = ObTimeUtility::current_time(); + int64_t cost_ts = 0; + do { + if (ls->is_stopped()) { + ret = OB_NOT_RUNNING; + LOG_WARN("ls is not running, stop send get ls view rpc", K(ret), KPC(ls)); + } else if (scheduler->has_set_stop()) { + ret = OB_SERVER_IS_STOPPING; + LOG_WARN("tenant dag scheduler has set stop, stop send get ls view rpc", K(ret), KPC(ls)); + } else if (tenant->has_stopped()) { + ret = OB_TENANT_HAS_BEEN_DROPPED; + LOG_WARN("tenant has been stopped, stop send get ls view rpc", K(ret), KPC(ls)); + } else if (OB_FAIL(srv_rpc_proxy.to(src_info.src_addr_).by(OB_DATA_TENANT_ID) .timeout(FETCH_LS_VIEW_INFO_TIMEOUT).dst_cluster_id(src_info.cluster_id_) .ratelimit(true).bg_flow(obrpc::ObRpcProxy::BACKGROUND_FLOW) .fetch_ls_view(rpc_arg, rpc_reader_.get_rpc_buffer(), rpc_reader_.get_handle()))) { - LOG_WARN("failed to send fetch ls view info rpc", K(ret), K(src_info), K(rpc_arg)); - } else if (OB_FAIL(rpc_reader_.fetch_and_decode(ls_meta_))) { - LOG_WARN("fail to fetch and decode ls meta", K(ret)); - } else if (!ls_meta_.is_valid()) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("invalid ls meta", K(ret), K_(ls_meta)); - } else { - is_inited_ = true; - LOG_INFO("succeed to init fetch ls view info reader", K(src_info), K(rpc_arg)); + if (OB_TABLET_GC_LOCK_CONFLICT != ret) { + LOG_WARN("failed to send fetch ls view info rpc", K(ret), K(src_info), K(rpc_arg)); + } else { + cost_ts = ObTimeUtility::current_time() - wait_gc_lock_start_ts; + if (WAIT_GC_LOCK_TIMEOUT <= cost_ts) { + ret = OB_EAGAIN; + LOG_WARN("copy ls view wait gc lock timeout, need try again.", K(ret), K(src_info), K(rpc_arg)); + } else { + ob_usleep(CHECK_GC_LOCK_INTERVAL); + } + } + } else { + cost_ts = ObTimeUtility::current_time() - wait_gc_lock_start_ts; + LOG_INFO("succeed to init copy ls view", K(src_info), K(rpc_arg), K(cost_ts)); + } + } while (OB_TABLET_GC_LOCK_CONFLICT == ret); + + + if (FAILEDx(rpc_reader_.fetch_and_decode(ls_meta_))) { + LOG_WARN("fail to fetch and decode ls meta", K(ret)); + } else if (!ls_meta_.is_valid()) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("invalid ls meta", K(ret), K_(ls_meta)); + } else { + is_inited_ = true; + LOG_INFO("succeed to init fetch ls view info reader", K(src_info), K(rpc_arg)); + } } + return ret; } diff --git a/src/storage/restore/ob_ls_restore_handler.cpp b/src/storage/restore/ob_ls_restore_handler.cpp index 401f096e5d..8ddc4a261a 100644 --- a/src/storage/restore/ob_ls_restore_handler.cpp +++ b/src/storage/restore/ob_ls_restore_handler.cpp @@ -2829,7 +2829,7 @@ void ObLSRestoreResultMgr::set_result(const int result, const share::ObTaskId &t // 1. result_ is OB_SUCCESS; // 2. result_ is retrieable err, but input result is non retrieable err. lib::ObMutexGuard guard(mtx_); - if (result == OB_EAGAIN) { + if (OB_EAGAIN == result) { } else { if (retry_cnt_ >= OB_MAX_RESTORE_RETRY_TIMES) { // avoiding overwrite error code } else if ((!can_retrieable_err(result) && can_retrieable_err(result_)) diff --git a/src/storage/tx_storage/ob_tablet_gc_service.cpp b/src/storage/tx_storage/ob_tablet_gc_service.cpp index 70cd9d1c31..fb91302b3d 100755 --- a/src/storage/tx_storage/ob_tablet_gc_service.cpp +++ b/src/storage/tx_storage/ob_tablet_gc_service.cpp @@ -286,9 +286,13 @@ uint8_t ObTabletGCHandler::get_tablet_persist_trigger_and_reset() int ObTabletGCHandler::disable_gc() { int ret = OB_SUCCESS; - if (OB_FAIL(gc_lock_.trylock())) { + if (OB_FAIL(gc_lock_.lock(GC_LOCK_TIMEOUT))) { ret = OB_TABLET_GC_LOCK_CONFLICT; LOG_WARN("try lock failed, please retry later", K(ret)); + } else if (check_stop()) { + gc_lock_.unlock(); + ret = OB_NOT_RUNNING; + LOG_WARN("gc handler has already been offline", K(ret)); } return ret; @@ -302,7 +306,7 @@ void ObTabletGCHandler::enable_gc() int ObTabletGCHandler::set_tablet_change_checkpoint_scn(const share::SCN &scn) { int ret = OB_SUCCESS; - if (OB_FAIL(gc_lock_.trylock())) { + if (OB_FAIL(gc_lock_.lock(GC_LOCK_TIMEOUT))) { ret = OB_TABLET_GC_LOCK_CONFLICT; LOG_WARN("try lock failed, please retry later", K(ret)); } else { @@ -699,7 +703,7 @@ int ObTabletGCHandler::offline() if (!is_finish()) { ret = OB_EAGAIN; STORAGE_LOG(INFO, "tablet gc handler not finish, retry", KR(ret), KPC(this), KPC(ls_), K(ls_->get_ls_meta())); - } else if (OB_FAIL(gc_lock_.trylock())) { + } else if (OB_FAIL(gc_lock_.lock(GC_LOCK_TIMEOUT))) { // make sure 'gc_lock_' is not using. ret = OB_TABLET_GC_LOCK_CONFLICT; LOG_WARN("tablet gc handler not finish, retry", K(ret)); diff --git a/src/storage/tx_storage/ob_tablet_gc_service.h b/src/storage/tx_storage/ob_tablet_gc_service.h index 16b577fc09..673760ef9d 100755 --- a/src/storage/tx_storage/ob_tablet_gc_service.h +++ b/src/storage/tx_storage/ob_tablet_gc_service.h @@ -100,6 +100,7 @@ private: void set_start() { ATOMIC_STORE(&update_enabled_, true); } public: + static const int64_t GC_LOCK_TIMEOUT = 100_ms; // 100ms obsys::ObRWLock wait_lock_; lib::ObMutex gc_lock_;