From d137b6ba0007e01e348e41c0577048ce3d64a09d Mon Sep 17 00:00:00 2001 From: obdev Date: Tue, 29 Nov 2022 09:07:07 +0000 Subject: [PATCH] break the indefinite retry in del_tenant when get tenant lock failed --- src/observer/omt/ob_multi_tenant.cpp | 45 ++++++++++++++++++++-------- src/observer/omt/ob_multi_tenant.h | 2 +- src/observer/omt/ob_tenant.cpp | 6 ++++ src/observer/omt/ob_tenant.h | 1 + 4 files changed, 40 insertions(+), 14 deletions(-) diff --git a/src/observer/omt/ob_multi_tenant.cpp b/src/observer/omt/ob_multi_tenant.cpp index 67f087f06..f01e044cf 100644 --- a/src/observer/omt/ob_multi_tenant.cpp +++ b/src/observer/omt/ob_multi_tenant.cpp @@ -374,12 +374,13 @@ void ObMultiTenant::stop() TenantIdList ids; ids.set_label(ObModIds::OMT); get_tenant_ids(ids); + bool lock_succ = false; while (ids.size() > 0) { LOG_INFO("there're some tenants need destroy", "count", ids.size()); for (TenantIdList::iterator it = ids.begin(); it != ids.end(); it++) { uint64_t id = *it; - remove_tenant(id); + remove_tenant(id, lock_succ); } get_tenant_ids(ids); } @@ -1267,11 +1268,12 @@ int ObMultiTenant::mark_del_tenant(const uint64_t tenant_id) // 确保remove_tenant函数可以重复调用, 因为在删除租户时失败会不断重试, // 这里只是删除内存结构,持久化的数据还在。 -int ObMultiTenant::remove_tenant(const uint64_t tenant_id) +int ObMultiTenant::remove_tenant(const uint64_t tenant_id, bool &try_clock_succ) { int ret = OB_SUCCESS; int tmp_ret = OB_SUCCESS; ObTenant *removed_tenant = nullptr; + try_clock_succ = false; if (IS_NOT_INIT) { ret = OB_NOT_INIT; @@ -1291,18 +1293,18 @@ int ObMultiTenant::remove_tenant(const uint64_t tenant_id) LOG_ERROR("fail to kill tenant session", K(ret), K(tenant_id)); } else { LOG_INFO("removed_tenant begin to try wlock", K(tenant_id)); - bool locked = false; ObLDHandle handle; - for (int i = 0; i < DEL_TRY_TIMES && !locked; ++i) { + for (int i = 0; i < DEL_TRY_TIMES && !try_clock_succ; ++i) { if (OB_SUCC(removed_tenant->try_wrlock(handle))) { - locked = true; + try_clock_succ = true; } else { ob_usleep(TIME_SLICE_PERIOD); } } if (OB_FAIL(ret)) { - LOG_WARN("can't get tenant wlock to remove tenant", K(tenant_id), K(ret)); + LOG_WARN("can't get tenant wlock to remove tenant", K(ret), K(tenant_id), + KP(removed_tenant), K(removed_tenant->lock_)); removed_tenant->lock_.ld_.print(); } else { LOG_INFO("removed_tenant begin to stop", K(tenant_id)); @@ -1454,20 +1456,36 @@ int ObMultiTenant::del_tenant(const uint64_t tenant_id) } else if (tenant->is_hidden()) { ret = OB_ERR_UNEXPECTED; LOG_WARN("hidden tenant can't be deleted", K(ret), K(tenant_id)); - } else if (FALSE_IT(tenant->set_create_status(ObTenantCreateStatus::DELETING))) { - } else if (FALSE_IT(tenant->set_unit_status(ObUnitInfoGetter::UNIT_DELETING_IN_OBSERVER))) { - } else if (OB_FAIL(write_delete_tenant_prepare_slog(tenant_id))) { - LOG_ERROR("fail to write delete tenant slog", K(ret), K(tenant_id)); + } else { + // Ensure to write delete_tenant_prepare_slog only once + ObUnitInfoGetter::ObUnitStatus old_unit_status = tenant->get_unit_status(); + if (old_unit_status != ObUnitInfoGetter::UNIT_DELETING_IN_OBSERVER) { + tenant->set_unit_status(ObUnitInfoGetter::UNIT_DELETING_IN_OBSERVER); + tenant->set_create_status(ObTenantCreateStatus::DELETING); + if (OB_FAIL(write_delete_tenant_prepare_slog(tenant_id))) { + LOG_ERROR("fail to write delete tenant slog", K(ret), K(tenant_id), K(old_unit_status)); + tenant->set_unit_status(old_unit_status); + } + } } if (OB_SUCC(ret)) { do { // 保证remove_tenant, clear_persistent_data可以幂等重试, - // 如果失败会一直无限重试, 这样可以保证如果prepare log写成功一定会有commit日志, + // 如果失败会但不是加锁失败会一直无限重试, 保证如果prepare log写成功一定会有commit日志, // 即使这个过程中宕机重启, 重启回放日志时会继续删除并且补一条delete commit log - if (OB_FAIL(remove_tenant(tenant_id))) { + bool lock_tenant_succ = false; + if (OB_FAIL(remove_tenant(tenant_id, lock_tenant_succ))) { LOG_WARN("fail to remove tenant", K(ret), K(tenant_id)); - SLEEP(1); + // If lock failed, the tenant is not removed from tenants_list, + // Here can break and leave ObTenantNodeBalancer::check_del_tenant to retry again, + // in this case, the deletion of other tenants does not get stuck. + // Otherwise it will have to retry indefinitely here, because the tenant cannot be obtained + if (false == lock_tenant_succ) { + break; + } else { + SLEEP(1); + } } else if (OB_FAIL(clear_persistent_data(tenant_id))) { LOG_ERROR("fail to clear persistent_data", K(ret), K(tenant_id)); SLEEP(1); @@ -1476,6 +1494,7 @@ int ObMultiTenant::del_tenant(const uint64_t tenant_id) } } while (OB_FAIL(ret)); } + if (lock_succ) { bucket_lock_.unlock(bucket_lock_idx); } diff --git a/src/observer/omt/ob_multi_tenant.h b/src/observer/omt/ob_multi_tenant.h index c5c59e873..cc982c892 100644 --- a/src/observer/omt/ob_multi_tenant.h +++ b/src/observer/omt/ob_multi_tenant.h @@ -180,7 +180,7 @@ protected: const int64_t mem_limit, ObTenantMeta &meta); int create_virtual_tenants(); - int remove_tenant(const uint64_t tenant_id); + int remove_tenant(const uint64_t tenant_id, bool &lock_succ); uint32_t get_tenant_lock_bucket_idx(const uint64_t tenant_id); int update_tenant_unit_no_lock(const share::ObUnitInfoGetter::ObTenantConfig &unit); diff --git a/src/observer/omt/ob_tenant.cpp b/src/observer/omt/ob_tenant.cpp index 8c0e993d6..af868262c 100644 --- a/src/observer/omt/ob_tenant.cpp +++ b/src/observer/omt/ob_tenant.cpp @@ -745,6 +745,12 @@ void ObTenant::set_unit_status(const ObUnitInfoGetter::ObUnitStatus status) tenant_meta_.unit_.unit_status_ = status; } +ObUnitInfoGetter::ObUnitStatus ObTenant::get_unit_status() +{ + TCRLockGuard guard(meta_lock_); + return tenant_meta_.unit_.unit_status_; +} + void ObTenant::mark_tenant_is_removed() { TCWLockGuard guard(meta_lock_); diff --git a/src/observer/omt/ob_tenant.h b/src/observer/omt/ob_tenant.h index 290e4c1c8..fdca1c7c0 100644 --- a/src/observer/omt/ob_tenant.h +++ b/src/observer/omt/ob_tenant.h @@ -427,6 +427,7 @@ public: void set_tenant_super_block(const storage::ObTenantSuperBlock &super_block); void mark_tenant_is_removed(); void set_unit_status(const share::ObUnitInfoGetter::ObUnitStatus status); + share::ObUnitInfoGetter::ObUnitStatus get_unit_status(); void set_unit_max_cpu(double cpu); double unit_max_cpu() const;