From 9964d8504e119897f7b2c579d0a6290014e1fde8 Mon Sep 17 00:00:00 2001 From: wanyue-wy <345657357@qq.com> Date: Thu, 8 Feb 2024 07:40:06 +0000 Subject: [PATCH] modify wait logic of clone executor and retry logic of clone scheduler in CLONE_SYS_RELEASE_RESOURCE --- src/rootserver/restore/ob_clone_scheduler.cpp | 90 +++++++++++++------ .../restore/ob_tenant_clone_util.cpp | 10 ++- .../ob_tenant_snapshot_scheduler.cpp | 18 ++-- .../ob_tenant_snapshot_util.cpp | 22 ++--- .../tenant_snapshot/ob_tenant_snapshot_util.h | 14 +-- .../ob_tenant_clone_table_operator.cpp | 14 +++ .../restore/ob_tenant_clone_table_operator.h | 1 + .../ob_tenant_snapshot_table_operator.cpp | 4 +- .../ob_tenant_snapshot_table_operator.h | 6 +- src/sql/engine/cmd/ob_clone_executor.cpp | 50 ++++------- .../tenant_snapshot/ob_ls_snapshot_defs.cpp | 2 +- .../ob_tenant_snapshot_meta_table.cpp | 2 +- 12 files changed, 137 insertions(+), 96 deletions(-) diff --git a/src/rootserver/restore/ob_clone_scheduler.cpp b/src/rootserver/restore/ob_clone_scheduler.cpp index f93bcff00a..8204b84bc2 100644 --- a/src/rootserver/restore/ob_clone_scheduler.cpp +++ b/src/rootserver/restore/ob_clone_scheduler.cpp @@ -28,6 +28,7 @@ #ifdef OB_BUILD_TDE_SECURITY #include "share/ob_master_key_getter.h" #endif +#include "lib/utility/ob_tracepoint.h" namespace oceanbase { @@ -261,6 +262,7 @@ int ObCloneScheduler::process_user_clone_job(const share::ObCloneJob &job) return ret; } +ERRSIM_POINT_DEF(ERRSIM_CLONE_LOCK_ERROR); int ObCloneScheduler::clone_lock(const share::ObCloneJob &job) { int ret = OB_SUCCESS; @@ -273,7 +275,10 @@ int ObCloneScheduler::clone_lock(const share::ObCloneJob &job) const ObTenantSnapshotUtil::TenantSnapshotOp op = ObTenantCloneJobType::RESTORE == job_type ? ObTenantSnapshotUtil::RESTORE_OP : ObTenantSnapshotUtil::FORK_OP; - if (IS_NOT_INIT) { + if (OB_UNLIKELY(ERRSIM_CLONE_LOCK_ERROR)) { + ret = ERRSIM_CLONE_LOCK_ERROR; + LOG_WARN("mock clone lock failed", KR(ret), K(job)); + } else if (IS_NOT_INIT) { ret = OB_NOT_INIT; LOG_WARN("not inited", KR(ret)); } else if (has_set_stop()) { @@ -306,7 +311,7 @@ int ObCloneScheduler::clone_lock(const share::ObCloneJob &job) if (ObTenantSnapStatus::CREATING == original_global_state_status) { ret = OB_SUCCESS; need_wait = true; - LOG_INFO("need wait for current tenant restore operation", KR(ret), K(source_tenant_id)); + LOG_INFO("need wait for current tenant snapshot creation", KR(ret), K(source_tenant_id)); } else { LOG_WARN("GLOBAL_STATE snapshot lock conflict", KR(ret), K(source_tenant_id), K(original_global_state_status)); @@ -315,10 +320,10 @@ int ObCloneScheduler::clone_lock(const share::ObCloneJob &job) } else if (OB_FAIL(ObTenantSnapshotUtil::check_tenant_has_no_conflict_tasks(source_tenant_id))) { LOG_WARN("fail to check tenant has conflict tasks", KR(ret), K(source_tenant_id)); } else if (ObTenantCloneJobType::RESTORE == job_type && - OB_FAIL(ObTenantSnapshotUtil::add_restore_tenant_task(trans, source_tenant_id, - snapshot_id))) { - // if job_type is FORK, the snapshot will be updated as RESTORE when it is created successful - LOG_WARN("failed to add restore tenant snapshot task", KR(ret), K(source_tenant_id), K(snapshot_id)); + OB_FAIL(ObTenantSnapshotUtil::add_clone_tenant_task(trans, source_tenant_id, + snapshot_id))) { + // if job_type is FORK, the snapshot will be updated as CLONING when it is created successful + LOG_WARN("failed to add clone tenant snapshot task", KR(ret), K(source_tenant_id), K(snapshot_id)); } if (trans.is_started()) { int tmp_ret = OB_SUCCESS; @@ -342,10 +347,6 @@ int ObCloneScheduler::clone_lock(const share::ObCloneJob &job) } } - // if (FAILEDx(wait_source_relative_task_finished_(source_tenant_id))) { - // LOG_WARN("wait source relative task finished failed", KR(ret), KR(source_tenant_id)); - // } - int tmp_ret = OB_SUCCESS; if (OB_TMP_FAIL(try_update_job_status_(ret, job))) { LOG_WARN("fail to update job status", KR(ret), KR(tmp_ret), K(job)); @@ -354,6 +355,7 @@ int ObCloneScheduler::clone_lock(const share::ObCloneJob &job) return ret; } +ERRSIM_POINT_DEF(ERRSIM_CLONE_RESOURCE_POOL_ERROR); int ObCloneScheduler::clone_create_resource_pool(const share::ObCloneJob &job) { int ret = OB_SUCCESS; @@ -362,7 +364,10 @@ int ObCloneScheduler::clone_create_resource_pool(const share::ObCloneJob &job) uint64_t resource_pool_id = job.get_resource_pool_id(); const int64_t job_id = job.get_job_id(); - if (IS_NOT_INIT) { + if (OB_UNLIKELY(ERRSIM_CLONE_RESOURCE_POOL_ERROR)) { + ret = ERRSIM_CLONE_RESOURCE_POOL_ERROR; + LOG_WARN("mock clone resource pool failed", KR(ret), K(job)); + } else if (IS_NOT_INIT) { ret = OB_NOT_INIT; LOG_WARN("not inited", KR(ret)); } else if (has_set_stop()) { @@ -402,6 +407,7 @@ int ObCloneScheduler::clone_create_resource_pool(const share::ObCloneJob &job) return ret; } +ERRSIM_POINT_DEF(ERRSIM_CLONE_CREATE_SNAPSHOT_ERROR); int ObCloneScheduler::clone_create_snapshot_for_fork_tenant(const share::ObCloneJob &job) { int ret = OB_SUCCESS; @@ -416,7 +422,10 @@ int ObCloneScheduler::clone_create_snapshot_for_fork_tenant(const share::ObClone ObSqlString snapshot_name; ObTenantSnapshotID tenant_snapshot_id; - if (IS_NOT_INIT) { + if (OB_UNLIKELY(ERRSIM_CLONE_CREATE_SNAPSHOT_ERROR)) { + ret = ERRSIM_CLONE_CREATE_SNAPSHOT_ERROR; + LOG_WARN("mock clone create snapshot failed", KR(ret), K(job)); + } else if (IS_NOT_INIT) { ret = OB_NOT_INIT; LOG_WARN("not inited", KR(ret)); } else if (has_set_stop()) { @@ -498,6 +507,7 @@ int ObCloneScheduler::clone_create_snapshot_for_fork_tenant(const share::ObClone return ret; } +ERRSIM_POINT_DEF(ERRSIM_CLONE_WAIT_CREATE_SNAPSHOT_ERROR); int ObCloneScheduler::clone_wait_create_snapshot_for_fork_tenant(const share::ObCloneJob &job) { int ret = OB_SUCCESS; @@ -510,7 +520,10 @@ int ObCloneScheduler::clone_wait_create_snapshot_for_fork_tenant(const share::Ob ObMySQLTransaction trans; bool need_wait = false; - if (IS_NOT_INIT) { + if (OB_UNLIKELY(ERRSIM_CLONE_WAIT_CREATE_SNAPSHOT_ERROR)) { + ret = ERRSIM_CLONE_WAIT_CREATE_SNAPSHOT_ERROR; + LOG_WARN("mock clone wait create snapshot failed", KR(ret), K(job)); + } else if (IS_NOT_INIT) { ret = OB_NOT_INIT; LOG_WARN("not inited", KR(ret)); } else if (has_set_stop()) { @@ -534,14 +547,14 @@ int ObCloneScheduler::clone_wait_create_snapshot_for_fork_tenant(const share::Ob } else if (ObTenantSnapStatus::CREATING == item.get_status() || ObTenantSnapStatus::DECIDED == item.get_status()) { need_wait = true; - } else if (ObTenantSnapStatus::RESTORING == item.get_status()) { + } else if (ObTenantSnapStatus::CLONING == item.get_status()) { // no need to update snapshot status } else if (ObTenantSnapStatus::NORMAL != item.get_status()) { ret = OB_ERR_UNEXPECTED; LOG_WARN("invalid status for fork tenant snapshot", KR(ret), K(source_tenant_id), K(tenant_snapshot_id), K(item)); - } else if (OB_FAIL(rootserver::ObTenantSnapshotUtil::add_restore_tenant_task(trans, item))) { - LOG_WARN("fail to update fork tenant snapshot to restoring", KR(ret), K(item)); + } else if (OB_FAIL(rootserver::ObTenantSnapshotUtil::add_clone_tenant_task(trans, item))) { + LOG_WARN("fail to update fork tenant snapshot to cloning", KR(ret), K(item)); } if (trans.is_started()) { @@ -572,6 +585,7 @@ int ObCloneScheduler::clone_wait_create_snapshot_for_fork_tenant(const share::Ob return ret; } +ERRSIM_POINT_DEF(ERRSIM_CLONE_CREATE_TENANT_ERROR); int ObCloneScheduler::clone_create_tenant(const share::ObCloneJob &job) { int ret = OB_SUCCESS; @@ -583,7 +597,10 @@ int ObCloneScheduler::clone_create_tenant(const share::ObCloneJob &job) const int64_t timeout = GCONF._ob_ddl_timeout; ObTenantCloneTableOperator clone_op; - if (IS_NOT_INIT) { + if (OB_UNLIKELY(ERRSIM_CLONE_CREATE_TENANT_ERROR)) { + ret = ERRSIM_CLONE_CREATE_TENANT_ERROR; + LOG_WARN("mock clone create tenant failed", KR(ret), K(job)); + } else if (IS_NOT_INIT) { ret = OB_NOT_INIT; LOG_WARN("not inited", KR(ret)); } else if (has_set_stop()) { @@ -621,6 +638,7 @@ int ObCloneScheduler::clone_create_tenant(const share::ObCloneJob &job) return ret; } +ERRSIM_POINT_DEF(ERRSIM_CLONE_WAIT_CREATE_TENANT_ERROR); int ObCloneScheduler::clone_wait_tenant_restore_finish(const ObCloneJob &job) { int ret = OB_SUCCESS; @@ -629,7 +647,11 @@ int ObCloneScheduler::clone_wait_tenant_restore_finish(const ObCloneJob &job) const uint64_t clone_tenant_id = job.get_clone_tenant_id(); bool need_wait = false; - if (IS_NOT_INIT) { + if (OB_UNLIKELY(ERRSIM_CLONE_WAIT_CREATE_TENANT_ERROR)) { + ret = ERRSIM_CLONE_WAIT_CREATE_TENANT_ERROR; + need_wait = OB_EAGAIN == ret ? true : false; + LOG_WARN("mock clone wait create tenant failed", KR(ret), K(job)); + } else if (IS_NOT_INIT) { ret = OB_NOT_INIT; LOG_WARN("not inited", KR(ret)); } else if (has_set_stop()) { @@ -678,11 +700,12 @@ int ObCloneScheduler::clone_wait_tenant_restore_finish(const ObCloneJob &job) if (OB_TMP_FAIL(try_update_job_status_(ret, job))) { LOG_WARN("fail to update job status", KR(ret), KR(tmp_ret), K(job)); } - LOG_INFO("[RESTORE] clone wait tenant restore finish", KR(ret), K(job)); } + LOG_INFO("[RESTORE] clone wait tenant restore finish", KR(ret), K(job)); return ret; } +ERRSIM_POINT_DEF(ERRSIM_CLONE_RELEASE_RESOURCE_ERROR); int ObCloneScheduler::clone_release_resource(const share::ObCloneJob &job) { int ret = OB_SUCCESS; @@ -691,8 +714,13 @@ int ObCloneScheduler::clone_release_resource(const share::ObCloneJob &job) const ObTenantSnapshotID snapshot_id = job.get_tenant_snapshot_id(); const ObTenantCloneJobType job_type = job.get_job_type(); int tmp_ret = OB_SUCCESS; + bool need_retry = false; - if (IS_NOT_INIT) { + if (OB_UNLIKELY(ERRSIM_CLONE_RELEASE_RESOURCE_ERROR)) { + ret = ERRSIM_CLONE_RELEASE_RESOURCE_ERROR; + need_retry = OB_EAGAIN == ret ? true : false; + LOG_WARN("mock clone release resource failed", KR(ret), K(job)); + } else if (IS_NOT_INIT) { ret = OB_NOT_INIT; LOG_WARN("not inited", KR(ret)); } else if (has_set_stop()) { @@ -708,16 +736,19 @@ int ObCloneScheduler::clone_release_resource(const share::ObCloneJob &job) ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument", KR(ret), K(job)); } else if (OB_FAIL(ObTenantCloneUtil::release_source_tenant_resource_of_clone_job(*sql_proxy_, job))) { + need_retry = true; LOG_WARN("failed to release source tenant resource", KR(ret), K(job)); } - if (OB_TMP_FAIL(try_update_job_status_(ret, job))) { + if (OB_FAIL(ret) && need_retry) { + } else if (OB_TMP_FAIL(try_update_job_status_(ret, job))) { LOG_WARN("fail to update job status", KR(ret), KR(tmp_ret), K(job)); } - LOG_INFO("[RESTORE] clone_release_resource", KR(ret), K(job)); + LOG_INFO("[RESTORE] clone_release_resource", KR(ret), K(need_retry), K(job)); return ret; } +ERRSIM_POINT_DEF(ERRSIM_CLONE_SYS_FINISH_ERROR); int ObCloneScheduler::clone_sys_finish(const share::ObCloneJob &job) { int ret = OB_SUCCESS; @@ -727,7 +758,10 @@ int ObCloneScheduler::clone_sys_finish(const share::ObCloneJob &job) const uint64_t source_tenant_id = job.get_source_tenant_id(); const ObTenantSnapshotID &snapshot_id = job.get_tenant_snapshot_id(); - if (IS_NOT_INIT) { + if (OB_UNLIKELY(ERRSIM_CLONE_SYS_FINISH_ERROR)) { + ret = ERRSIM_CLONE_SYS_FINISH_ERROR; + LOG_WARN("mock clone sys finish failed", KR(ret), K(job)); + } else if (IS_NOT_INIT) { ret = OB_NOT_INIT; LOG_WARN("not inited", KR(ret)); } else if (has_set_stop()) { @@ -1116,6 +1150,7 @@ int ObCloneScheduler::clone_user_finish(const share::ObCloneJob &job) // 1. for clone_tenant, gc the resource of resource_pool and clone_tenant // 2. for source_tenant, release global_lock and tenant snapshot // 3. for sys_tenant, finish the clone job +ERRSIM_POINT_DEF(ERRSIM_CLONE_RECYCLE_FAILED_JOB_ERROR); int ObCloneScheduler::clone_recycle_failed_job(const share::ObCloneJob &job) { int ret = OB_SUCCESS; @@ -1123,7 +1158,10 @@ int ObCloneScheduler::clone_recycle_failed_job(const share::ObCloneJob &job) const uint64_t source_tenant_id = job.get_source_tenant_id(); const ObTenantCloneStatus job_status = job.get_status(); - if (IS_NOT_INIT) { + if (OB_UNLIKELY(ERRSIM_CLONE_RECYCLE_FAILED_JOB_ERROR)) { + ret = ERRSIM_CLONE_RECYCLE_FAILED_JOB_ERROR; + LOG_WARN("mock clone recycle failed job failed", KR(ret), K(job)); + } else if (IS_NOT_INIT) { ret = OB_NOT_INIT; LOG_WARN("not inited", KR(ret)); } else if (has_set_stop()) { @@ -1139,10 +1177,8 @@ int ObCloneScheduler::clone_recycle_failed_job(const share::ObCloneJob &job) } else if (!job_status.is_sys_failed_status()) { ret = OB_ERR_UNEXPECTED; LOG_WARN("try to recycle a processing or successful job", KR(ret), K(job)); - } else if (ObTenantCloneStatus::Status::CLONE_SYS_RELEASE_RESOURCE_FAIL != job_status && + } else if (job_status.is_sys_release_clone_resource_status() && OB_FAIL(ObTenantCloneUtil::release_clone_tenant_resource_of_clone_job(job))) { - // CLONE_SYS_RELEASE_RESOURCE means the clone_tenant has been created and restored successful. - // thus, if the clone_job is failed in this status, we just need to release the according snapshot. LOG_WARN("fail to release resource of clone tenant", KR(ret), K(job)); } else if (OB_FAIL(ObTenantCloneUtil::release_source_tenant_resource_of_clone_job(*sql_proxy_, job))) { LOG_WARN("fail to release resource of source tenant", KR(ret), K(job)); diff --git a/src/rootserver/restore/ob_tenant_clone_util.cpp b/src/rootserver/restore/ob_tenant_clone_util.cpp index d907615ca2..4e2c105a2e 100644 --- a/src/rootserver/restore/ob_tenant_clone_util.cpp +++ b/src/rootserver/restore/ob_tenant_clone_util.cpp @@ -421,14 +421,14 @@ int ObTenantCloneUtil::release_source_tenant_resource_of_clone_job(common::ObISQ } else { LOG_WARN("fail to get global_lock", KR(ret), K(clone_job)); } - } else if (ObTenantSnapStatus::RESTORING != global_lock.get_status()) { + } else if (ObTenantSnapStatus::CLONING != global_lock.get_status()) { is_already_unlocked = true; LOG_INFO("global lock has been released", KR(ret), K(clone_job)); } else if (OB_FAIL(ObTenantSnapshotUtil::unlock_tenant_snapshot_simulated_mutex_from_clone_release_task( trans, source_tenant_id, job_id, - ObTenantSnapStatus::RESTORING, + ObTenantSnapStatus::CLONING, is_already_unlocked))) { LOG_WARN("fail to unlock", KR(ret), K(clone_job), K(global_lock)); } @@ -532,7 +532,7 @@ int ObTenantCloneUtil::cancel_clone_job(common::ObISQLClient &sql_client, ObTenantCloneTableOperator clone_op; ObCloneJob clone_job; ObMySQLTransaction trans; - static const char *err_msg = "clone job has been canceled"; + ObSqlString err_msg; const ObTenantCloneStatus next_status(ObTenantCloneStatus::Status::CLONE_SYS_CANCELED); if (OB_UNLIKELY(clone_tenant_name.empty())) { @@ -560,7 +560,9 @@ int ObTenantCloneUtil::cancel_clone_job(common::ObISQLClient &sql_client, clone_job.get_status(), /*old_status*/ next_status))) { LOG_WARN("fail to update job status", KR(ret), K(clone_tenant_name), K(clone_job)); - } else if (OB_FAIL(clone_op.update_job_failed_info(clone_job.get_job_id(), OB_CANCELED, err_msg))) { + } else if (OB_FAIL(err_msg.append_fmt("clone job has been canceled in %s status", + ObTenantCloneStatus::get_clone_status_str(clone_job.get_status())))) { + } else if (OB_FAIL(clone_op.update_job_failed_info(clone_job.get_job_id(), OB_CANCELED, err_msg.string()))) { LOG_WARN("fail to update job failed info", KR(ret), K(clone_job)); } diff --git a/src/rootserver/tenant_snapshot/ob_tenant_snapshot_scheduler.cpp b/src/rootserver/tenant_snapshot/ob_tenant_snapshot_scheduler.cpp index 8d2cf813d4..28aba08cb7 100644 --- a/src/rootserver/tenant_snapshot/ob_tenant_snapshot_scheduler.cpp +++ b/src/rootserver/tenant_snapshot/ob_tenant_snapshot_scheduler.cpp @@ -190,7 +190,7 @@ int ObTenantSnapshotScheduler::get_tenant_snapshot_jobs_( uint64_t user_tenant_id = gen_user_tenant_id(MTL_ID()); ObArbitrationServiceStatus arbitration_service_status; int64_t paxos_replica_num = OB_INVALID_COUNT; - int64_t restore_job_num = 0; + int64_t clone_job_num = 0; if (OB_ISNULL(GCTX.schema_service_)) { ret = OB_INVALID_ARGUMENT; @@ -242,8 +242,8 @@ int ObTenantSnapshotScheduler::get_tenant_snapshot_jobs_( } else if (OB_FAIL(delete_jobs.push_back(delete_job))) { LOG_WARN("push back failed", KR(ret), K(item), K(delete_job)); } - } else if (ObTenantSnapStatus::RESTORING == items.at(i).get_status()) { - restore_job_num++; + } else if (ObTenantSnapStatus::CLONING == items.at(i).get_status()) { + clone_job_num++; } else if (ObTenantSnapStatus::FAILED == items.at(i).get_status()) { // when a tenant snapshot is created failed, // for the normal tenant snapshot, it will be setted as DELETING and be deleted directly; @@ -259,10 +259,10 @@ int ObTenantSnapshotScheduler::get_tenant_snapshot_jobs_( if (OB_FAIL(ret)) { } else if ((create_jobs.count() > 1) - || (create_jobs.count() + restore_job_num > 1)) { + || (create_jobs.count() + clone_job_num > 1)) { //only one creation job/restoration job can exist at a time, num > 1 is illegal! ret = OB_ERR_UNEXPECTED; - LOG_ERROR("unexpected tenant snapshot count", KR(ret), K(create_jobs), K(restore_job_num)); + LOG_ERROR("unexpected tenant snapshot count", KR(ret), K(create_jobs), K(clone_job_num)); } return ret; @@ -1028,8 +1028,8 @@ int ObTenantSnapshotScheduler::finish_create_tenant_snapshot_( true /*for update*/, global_lock))) { LOG_WARN("failed to get special tenant snapshot item", KR(ret), K(user_tenant_id)); - } else if (ObTenantSnapStatus::RESTORING == global_lock.get_status()) { - // For fork tenant (a job type of tenant cloning), the status of global_lock is set as RESTORING at beginning. + } else if (ObTenantSnapStatus::CLONING == global_lock.get_status()) { + // For fork tenant (a job type of tenant cloning), the status of global_lock is set as CLONING at beginning. // in this case, the global_lock should be unlocked after cloning tenant is finished } else if (OB_FAIL(ObTenantSnapshotUtil::unlock_tenant_snapshot_simulated_mutex_from_snapshot_task( trans, @@ -1077,8 +1077,8 @@ int ObTenantSnapshotScheduler::create_tenant_snapshot_fail_(const ObCreateSnapsh global_lock))) { LOG_WARN("failed to get special tenant snapshot item", KR(ret), K(user_tenant_id)); } else { - if (ObTenantSnapStatus::RESTORING == global_lock.get_status()) { - // For fork tenant (a job type of tenant cloning), the status of global_lock is set as RESTORING at beginning. + if (ObTenantSnapStatus::CLONING == global_lock.get_status()) { + // For fork tenant (a job type of tenant cloning), the status of global_lock is set as CLONING at beginning. // in this case, when creating snapshot failed, // the snapshot and global_lock should only be released by clone job if (OB_FAIL(table_op.update_tenant_snap_item(tenant_snapshot_id, diff --git a/src/rootserver/tenant_snapshot/ob_tenant_snapshot_util.cpp b/src/rootserver/tenant_snapshot/ob_tenant_snapshot_util.cpp index c6d7f10bb6..9c07b9e113 100644 --- a/src/rootserver/tenant_snapshot/ob_tenant_snapshot_util.cpp +++ b/src/rootserver/tenant_snapshot/ob_tenant_snapshot_util.cpp @@ -672,10 +672,10 @@ int ObTenantSnapshotUtil::lock_(ObMySQLTransaction &trans, if (CREATE_OP == op) { new_status = ObTenantSnapStatus::CREATING; } else { // FORK_OP == op - new_status = ObTenantSnapStatus::RESTORING; + new_status = ObTenantSnapStatus::CLONING; } } else if (RESTORE_OP == op) { - new_status = ObTenantSnapStatus::RESTORING; + new_status = ObTenantSnapStatus::CLONING; } else { ret = OB_ERR_UNEXPECTED; LOG_WARN("unexpected tenant snapshot operation", KR(ret), K(tenant_id), K(op)); @@ -835,9 +835,9 @@ int ObTenantSnapshotUtil::get_tenant_snapshot_info(common::ObISQLClient &sql_cli return ret; } -int ObTenantSnapshotUtil::add_restore_tenant_task(ObMySQLTransaction &trans, - const uint64_t tenant_id, - const share::ObTenantSnapshotID &tenant_snapshot_id) +int ObTenantSnapshotUtil::add_clone_tenant_task(ObMySQLTransaction &trans, + const uint64_t tenant_id, + const share::ObTenantSnapshotID &tenant_snapshot_id) { int ret = OB_SUCCESS; ObTenantSnapshotTableOperator table_op; @@ -853,18 +853,18 @@ int ObTenantSnapshotUtil::add_restore_tenant_task(ObMySQLTransaction &trans, } else if (ObTenantSnapStatus::NORMAL != snap_item.get_status()) { ret = OB_OP_NOT_ALLOW; LOG_WARN("not allowed for current snapshot operation", KR(ret), K(tenant_id), K(snap_item.get_status())); - LOG_USER_ERROR(OB_OP_NOT_ALLOW, "there may be other operation on the same tenant snapshot, restore tenant"); + LOG_USER_ERROR(OB_OP_NOT_ALLOW, "there may be other operation on the same tenant snapshot, clone tenant"); } else if (OB_FAIL(table_op.update_tenant_snap_item(tenant_snapshot_id, ObTenantSnapStatus::NORMAL, /*old_status*/ - ObTenantSnapStatus::RESTORING /*new_status*/))) { + ObTenantSnapStatus::CLONING /*new_status*/))) { LOG_WARN("update tenant snapshot status failed", KR(ret), K(tenant_id), K(tenant_snapshot_id)); } return ret; } -int ObTenantSnapshotUtil::add_restore_tenant_task(ObMySQLTransaction &trans, - const ObTenantSnapItem &snap_item) +int ObTenantSnapshotUtil::add_clone_tenant_task(ObMySQLTransaction &trans, + const ObTenantSnapItem &snap_item) { int ret = OB_SUCCESS; ObTenantSnapshotTableOperator table_op; @@ -875,12 +875,12 @@ int ObTenantSnapshotUtil::add_restore_tenant_task(ObMySQLTransaction &trans, } else if (ObTenantSnapStatus::NORMAL != snap_item.get_status()) { ret = OB_OP_NOT_ALLOW; LOG_WARN("not allowed for current snapshot operation", KR(ret), K(snap_item)); - LOG_USER_ERROR(OB_OP_NOT_ALLOW, "there may be other operation on the same tenant snapshot, restore tenant"); + LOG_USER_ERROR(OB_OP_NOT_ALLOW, "there may be other operation on the same tenant snapshot, clone tenant"); } else if (OB_FAIL(table_op.init(snap_item.get_tenant_id(), &trans))) { LOG_WARN("failed to init table op", KR(ret), K(snap_item)); } else if (OB_FAIL(table_op.update_tenant_snap_item(snap_item.get_tenant_snapshot_id(), ObTenantSnapStatus::NORMAL, /*old_status*/ - ObTenantSnapStatus::RESTORING /*new_status*/))) { + ObTenantSnapStatus::CLONING /*new_status*/))) { LOG_WARN("update tenant snapshot status failed", KR(ret), K(snap_item)); } diff --git a/src/rootserver/tenant_snapshot/ob_tenant_snapshot_util.h b/src/rootserver/tenant_snapshot/ob_tenant_snapshot_util.h index 39ba8f438f..e23e9096dc 100644 --- a/src/rootserver/tenant_snapshot/ob_tenant_snapshot_util.h +++ b/src/rootserver/tenant_snapshot/ob_tenant_snapshot_util.h @@ -82,8 +82,8 @@ public: { CREATE_OP = 0, DROP_OP = 1, - RESTORE_OP = 2, - FORK_OP = 3, + RESTORE_OP = 2, // a type of clone job + FORK_OP = 3, // a type of clone job MAX, }; public: @@ -137,11 +137,11 @@ public: const uint64_t source_tenant_id, const share::ObTenantSnapshotID &snapshot_id, share::ObTenantSnapItem &item); - static int add_restore_tenant_task(ObMySQLTransaction &trans, - const uint64_t tenant_id, - const share::ObTenantSnapshotID &tenant_snapshot_id); - static int add_restore_tenant_task(ObMySQLTransaction &trans, - const share::ObTenantSnapItem &snap_item); + static int add_clone_tenant_task(ObMySQLTransaction &trans, + const uint64_t tenant_id, + const share::ObTenantSnapshotID &tenant_snapshot_id); + static int add_clone_tenant_task(ObMySQLTransaction &trans, + const share::ObTenantSnapItem &snap_item); static int generate_tenant_snapshot_name(const uint64_t tenant_id, ObSqlString &tenant_snapshot_name, bool is_inner = false); diff --git a/src/share/restore/ob_tenant_clone_table_operator.cpp b/src/share/restore/ob_tenant_clone_table_operator.cpp index 343f7f2363..3ed99de43e 100644 --- a/src/share/restore/ob_tenant_clone_table_operator.cpp +++ b/src/share/restore/ob_tenant_clone_table_operator.cpp @@ -216,6 +216,20 @@ bool ObTenantCloneStatus::is_sys_release_resource_status() const return b_ret; } +bool ObTenantCloneStatus::is_sys_release_clone_resource_status() const +{ + bool b_ret = false; + + if (ObTenantCloneStatus::Status::CLONE_SYS_CREATE_INNER_RESOURCE_POOL_FAIL <= status_ && + ObTenantCloneStatus::Status::CLONE_SYS_RELEASE_RESOURCE_FAIL > status_) { + // CLONE_SYS_RELEASE_RESOURCE means the clone_tenant has been created and restored successful. + // thus, if the clone_job is in or is failed in this status, we just need to release the according snapshot. + b_ret = true; + } + + return b_ret; +} + ObCloneJob::ObCloneJob() : trace_id_(), tenant_id_(OB_INVALID_TENANT_ID), diff --git a/src/share/restore/ob_tenant_clone_table_operator.h b/src/share/restore/ob_tenant_clone_table_operator.h index 48f473fbbf..971ae11027 100644 --- a/src/share/restore/ob_tenant_clone_table_operator.h +++ b/src/share/restore/ob_tenant_clone_table_operator.h @@ -93,6 +93,7 @@ public: bool is_sys_processing_status() const; bool is_sys_valid_snapshot_status_for_fork() const; bool is_sys_release_resource_status() const; + bool is_sys_release_clone_resource_status() const; TO_STRING_KV(K_(status)); diff --git a/src/share/tenant_snapshot/ob_tenant_snapshot_table_operator.cpp b/src/share/tenant_snapshot/ob_tenant_snapshot_table_operator.cpp index 1a192220ae..88d618205e 100644 --- a/src/share/tenant_snapshot/ob_tenant_snapshot_table_operator.cpp +++ b/src/share/tenant_snapshot/ob_tenant_snapshot_table_operator.cpp @@ -357,7 +357,7 @@ const char* ObTenantSnapshotTableOperator::TENANT_SNAP_STATUS_ARRAY[] = "CREATING", "DECIDED", "NORMAL", - "RESTORING", + "CLONING", "DELETING", "FAILED", }; @@ -366,7 +366,7 @@ const char* ObTenantSnapshotTableOperator::LS_SNAP_STATUS_ARRAY[] = { "CREATING", "NORMAL", - "RESTORING", + "CLONING", "FAILED", }; diff --git a/src/share/tenant_snapshot/ob_tenant_snapshot_table_operator.h b/src/share/tenant_snapshot/ob_tenant_snapshot_table_operator.h index fba066003e..f64a5dcc6e 100644 --- a/src/share/tenant_snapshot/ob_tenant_snapshot_table_operator.h +++ b/src/share/tenant_snapshot/ob_tenant_snapshot_table_operator.h @@ -30,7 +30,7 @@ enum class ObTenantSnapStatus : int64_t CREATING = 0, DECIDED, NORMAL, - RESTORING, + CLONING, DELETING, FAILED, MAX, @@ -40,7 +40,7 @@ enum class ObLSSnapStatus : int64_t { CREATING = 0, NORMAL, - RESTORING, + CLONING, FAILED, MAX, }; @@ -233,7 +233,7 @@ private: ObTenantSnapType type_; int64_t create_time_; uint64_t data_version_; - // when the status_ is RESTORING, the clone_job id will be owner_job_id of "global_lock"(snapshot_id == 0) + // when the status_ is CLONING, the clone_job id will be owner_job_id of "global_lock"(snapshot_id == 0) // for the other status or the other snapshot, the owner_job_id always be OB_INVALID_ID int64_t owner_job_id_; }; diff --git a/src/sql/engine/cmd/ob_clone_executor.cpp b/src/sql/engine/cmd/ob_clone_executor.cpp index 6a15e7fb9c..772f1010c9 100644 --- a/src/sql/engine/cmd/ob_clone_executor.cpp +++ b/src/sql/engine/cmd/ob_clone_executor.cpp @@ -79,9 +79,7 @@ int ObCloneTenantExecutor::wait_clone_tenant_finished_(ObExecContext &ctx, const int64_t abs_timeout = ObTimeUtility::current_time() + OB_MAX_USER_SPECIFIED_TIMEOUT; // 102 years THIS_WORKER.set_timeout_ts(abs_timeout); - if (OB_UNLIKELY(ERRSIM_WAIT_CLONE_TENANT_FINISHED_ERROR)) { - ret = ERRSIM_WAIT_CLONE_TENANT_FINISHED_ERROR; - } else if (OB_UNLIKELY(job_id < 0)) { + if (OB_UNLIKELY(job_id < 0)) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument", KR(ret), K(job_id)); } else if (OB_ISNULL(ctx.get_physical_plan_ctx())) { @@ -100,16 +98,17 @@ int ObCloneTenantExecutor::wait_clone_tenant_finished_(ObExecContext &ctx, // the according record will be moved to __all_clone_job_history from __all_clone_job; // if the clone job is failed, // the according record will be set as failed status in __all_clone_job and - // will be moved to __all_clone_job_history after user executes the "recycle" sql + // will be moved to __all_clone_job_history after the related resource is recycled bool clone_over = false; while (OB_SUCC(ret) && !clone_over) { job.reset(); ob_usleep(2 * 1000 * 1000L); // 2s ObTenantCloneTableOperator table_op; ObMySQLTransaction trans; - bool exist_in_history = false; - if (THIS_WORKER.is_timeout()) { + if (OB_UNLIKELY(ERRSIM_WAIT_CLONE_TENANT_FINISHED_ERROR)) { + ret = ERRSIM_WAIT_CLONE_TENANT_FINISHED_ERROR; + } else if (THIS_WORKER.is_timeout()) { ret = OB_TIMEOUT; LOG_WARN("wait clone tenant timeout", KR(ret), K(job_id)); } else if (OB_FAIL(ctx.check_status())) { @@ -119,36 +118,25 @@ int ObCloneTenantExecutor::wait_clone_tenant_finished_(ObExecContext &ctx, } else if (OB_FAIL(table_op.init(OB_SYS_TENANT_ID, &trans))) { LOG_WARN("failed to init table op", KR(ret)); } else if (OB_FAIL(table_op.get_sys_clone_job_history(job_id, job))) { - if (OB_ENTRY_NOT_EXIST == ret) { + if (OB_ENTRY_NOT_EXIST == ret) { // clone job is running ret = OB_SUCCESS; + + int tmp_ret = OB_SUCCESS; + if (OB_TMP_FAIL(ObTenantCloneUtil::notify_clone_scheduler(OB_SYS_TENANT_ID))) { + LOG_WARN("notify clone scheduler failed", KR(tmp_ret)); + } } else { LOG_WARN("failed to get clone job history", KR(ret), K(job_id)); } + } else if (job.get_status().is_sys_success_status()) { + clone_over = true; + LOG_INFO("clone tenant successful", K(job)); + } else if (job.get_status().is_sys_failed_status()) { + ret = OB_ERR_CLONE_TENANT; + LOG_WARN("clone tenant failed", KR(ret), K(job)); } else { - exist_in_history = true; - if (job.get_status().is_sys_success_status()) { - clone_over = true; - LOG_INFO("clone tenant successful", K(job)); - } else if (job.get_status().is_sys_failed_status()) { - ret = OB_ERR_CLONE_TENANT; - LOG_WARN("clone tenant failed", KR(ret), K(job)); - } else { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("unexpected status", KR(ret), K(job)); - } - } - - if (OB_SUCC(ret) && !exist_in_history) { - int tmp_ret = OB_SUCCESS; - if (OB_FAIL(table_op.get_clone_job_by_job_id(job_id, job))) { - LOG_WARN("failed to get clone job", KR(ret), K(job)); - } else if (job.get_status().is_sys_failed_status()) { - ret = OB_ERR_CLONE_TENANT; - LOG_WARN("clone tenant failed", KR(ret), K(job)); - } else if (OB_TMP_FAIL(ObTenantCloneUtil::notify_clone_scheduler(OB_SYS_TENANT_ID))) { - // clone job is running - LOG_WARN("notify clone scheduler failed", KR(tmp_ret)); - } + ret = OB_ERR_UNEXPECTED; + LOG_WARN("unexpected status", KR(ret), K(job)); } if (OB_UNLIKELY(OB_TIMEOUT == ret)) { diff --git a/src/storage/tenant_snapshot/ob_ls_snapshot_defs.cpp b/src/storage/tenant_snapshot/ob_ls_snapshot_defs.cpp index 4de881541a..a10120b748 100644 --- a/src/storage/tenant_snapshot/ob_ls_snapshot_defs.cpp +++ b/src/storage/tenant_snapshot/ob_ls_snapshot_defs.cpp @@ -298,7 +298,7 @@ int ObLSSnapshot::get_tablet_meta_entry(blocksstable::MacroBlockId &tablet_meta_ if (!meta_existed_) { ret = OB_STATE_NOT_MATCH; - LOG_WARN("ObLSSnapshot's meta not exsited", KR(ret), KPC(this)); + LOG_WARN("ObLSSnapshot's meta not existed", KR(ret), KPC(this)); } else if (OB_FAIL(meta_handler_->get_ls_snapshot(tenant_snapshot_id_, ls_id_, tablet_meta_entry))) { diff --git a/src/storage/tenant_snapshot/ob_tenant_snapshot_meta_table.cpp b/src/storage/tenant_snapshot/ob_tenant_snapshot_meta_table.cpp index 2ff09b631e..104435a0ef 100644 --- a/src/storage/tenant_snapshot/ob_tenant_snapshot_meta_table.cpp +++ b/src/storage/tenant_snapshot/ob_tenant_snapshot_meta_table.cpp @@ -83,7 +83,7 @@ bool ObLSSnapshotReportInfo::scn_range_is_valid(const ObTenantSnapItem &tenant_s int ret = OB_SUCCESS; bool bret = true; - if ((ObTenantSnapStatus::RESTORING == tenant_snap_item.get_status() || + if ((ObTenantSnapStatus::CLONING == tenant_snap_item.get_status() || ObTenantSnapStatus::NORMAL == tenant_snap_item.get_status())) { if (begin_interval_scn_ < tenant_snap_item.get_clog_start_scn() || end_interval_scn_ > tenant_snap_item.get_snapshot_scn()) {