Break the indefinite retry in del_tenant when acquiring the tenant lock fails
This commit is contained in:
parent
f0afbf2d66
commit
d137b6ba00
@ -374,12 +374,13 @@ void ObMultiTenant::stop()
|
||||
TenantIdList ids;
|
||||
ids.set_label(ObModIds::OMT);
|
||||
get_tenant_ids(ids);
|
||||
bool lock_succ = false;
|
||||
while (ids.size() > 0) {
|
||||
LOG_INFO("there're some tenants need destroy", "count", ids.size());
|
||||
|
||||
for (TenantIdList::iterator it = ids.begin(); it != ids.end(); it++) {
|
||||
uint64_t id = *it;
|
||||
remove_tenant(id);
|
||||
remove_tenant(id, lock_succ);
|
||||
}
|
||||
get_tenant_ids(ids);
|
||||
}
|
||||
@ -1267,11 +1268,12 @@ int ObMultiTenant::mark_del_tenant(const uint64_t tenant_id)
|
||||
|
||||
// 确保remove_tenant函数可以重复调用, 因为在删除租户时失败会不断重试,
|
||||
// 这里只是删除内存结构,持久化的数据还在。
|
||||
int ObMultiTenant::remove_tenant(const uint64_t tenant_id)
|
||||
int ObMultiTenant::remove_tenant(const uint64_t tenant_id, bool &try_clock_succ)
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
int tmp_ret = OB_SUCCESS;
|
||||
ObTenant *removed_tenant = nullptr;
|
||||
try_clock_succ = false;
|
||||
|
||||
if (IS_NOT_INIT) {
|
||||
ret = OB_NOT_INIT;
|
||||
@ -1291,18 +1293,18 @@ int ObMultiTenant::remove_tenant(const uint64_t tenant_id)
|
||||
LOG_ERROR("fail to kill tenant session", K(ret), K(tenant_id));
|
||||
} else {
|
||||
LOG_INFO("removed_tenant begin to try wlock", K(tenant_id));
|
||||
bool locked = false;
|
||||
ObLDHandle handle;
|
||||
for (int i = 0; i < DEL_TRY_TIMES && !locked; ++i) {
|
||||
for (int i = 0; i < DEL_TRY_TIMES && !try_clock_succ; ++i) {
|
||||
if (OB_SUCC(removed_tenant->try_wrlock(handle))) {
|
||||
locked = true;
|
||||
try_clock_succ = true;
|
||||
} else {
|
||||
ob_usleep(TIME_SLICE_PERIOD);
|
||||
}
|
||||
}
|
||||
|
||||
if (OB_FAIL(ret)) {
|
||||
LOG_WARN("can't get tenant wlock to remove tenant", K(tenant_id), K(ret));
|
||||
LOG_WARN("can't get tenant wlock to remove tenant", K(ret), K(tenant_id),
|
||||
KP(removed_tenant), K(removed_tenant->lock_));
|
||||
removed_tenant->lock_.ld_.print();
|
||||
} else {
|
||||
LOG_INFO("removed_tenant begin to stop", K(tenant_id));
|
||||
@ -1454,20 +1456,36 @@ int ObMultiTenant::del_tenant(const uint64_t tenant_id)
|
||||
} else if (tenant->is_hidden()) {
|
||||
ret = OB_ERR_UNEXPECTED;
|
||||
LOG_WARN("hidden tenant can't be deleted", K(ret), K(tenant_id));
|
||||
} else if (FALSE_IT(tenant->set_create_status(ObTenantCreateStatus::DELETING))) {
|
||||
} else if (FALSE_IT(tenant->set_unit_status(ObUnitInfoGetter::UNIT_DELETING_IN_OBSERVER))) {
|
||||
} else if (OB_FAIL(write_delete_tenant_prepare_slog(tenant_id))) {
|
||||
LOG_ERROR("fail to write delete tenant slog", K(ret), K(tenant_id));
|
||||
} else {
|
||||
// Ensure to write delete_tenant_prepare_slog only once
|
||||
ObUnitInfoGetter::ObUnitStatus old_unit_status = tenant->get_unit_status();
|
||||
if (old_unit_status != ObUnitInfoGetter::UNIT_DELETING_IN_OBSERVER) {
|
||||
tenant->set_unit_status(ObUnitInfoGetter::UNIT_DELETING_IN_OBSERVER);
|
||||
tenant->set_create_status(ObTenantCreateStatus::DELETING);
|
||||
if (OB_FAIL(write_delete_tenant_prepare_slog(tenant_id))) {
|
||||
LOG_ERROR("fail to write delete tenant slog", K(ret), K(tenant_id), K(old_unit_status));
|
||||
tenant->set_unit_status(old_unit_status);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (OB_SUCC(ret)) {
|
||||
do {
|
||||
// 保证remove_tenant, clear_persistent_data可以幂等重试,
|
||||
// 如果失败会一直无限重试, 这样可以保证如果prepare log写成功一定会有commit日志,
|
||||
// 如果失败但不是加锁失败, 会一直无限重试, 保证如果prepare log写成功一定会有commit日志,
|
||||
// 即使这个过程中宕机重启, 重启回放日志时会继续删除并且补一条delete commit log
|
||||
if (OB_FAIL(remove_tenant(tenant_id))) {
|
||||
bool lock_tenant_succ = false;
|
||||
if (OB_FAIL(remove_tenant(tenant_id, lock_tenant_succ))) {
|
||||
LOG_WARN("fail to remove tenant", K(ret), K(tenant_id));
|
||||
SLEEP(1);
|
||||
// If acquiring the lock failed, the tenant has not been removed from tenants_list,
|
||||
// so we can break here and leave ObTenantNodeBalancer::check_del_tenant to retry later;
|
||||
// this way, the deletion of other tenants does not get stuck.
|
||||
// Otherwise we have to retry indefinitely here, because the tenant cannot be obtained.
|
||||
if (false == lock_tenant_succ) {
|
||||
break;
|
||||
} else {
|
||||
SLEEP(1);
|
||||
}
|
||||
} else if (OB_FAIL(clear_persistent_data(tenant_id))) {
|
||||
LOG_ERROR("fail to clear persistent_data", K(ret), K(tenant_id));
|
||||
SLEEP(1);
|
||||
@ -1476,6 +1494,7 @@ int ObMultiTenant::del_tenant(const uint64_t tenant_id)
|
||||
}
|
||||
} while (OB_FAIL(ret));
|
||||
}
|
||||
|
||||
if (lock_succ) {
|
||||
bucket_lock_.unlock(bucket_lock_idx);
|
||||
}
|
||||
|
@ -180,7 +180,7 @@ protected:
|
||||
const int64_t mem_limit,
|
||||
ObTenantMeta &meta);
|
||||
int create_virtual_tenants();
|
||||
int remove_tenant(const uint64_t tenant_id);
|
||||
int remove_tenant(const uint64_t tenant_id, bool &lock_succ);
|
||||
uint32_t get_tenant_lock_bucket_idx(const uint64_t tenant_id);
|
||||
int update_tenant_unit_no_lock(const share::ObUnitInfoGetter::ObTenantConfig &unit);
|
||||
|
||||
|
@ -745,6 +745,12 @@ void ObTenant::set_unit_status(const ObUnitInfoGetter::ObUnitStatus status)
|
||||
tenant_meta_.unit_.unit_status_ = status;
|
||||
}
|
||||
|
||||
// Returns the unit status currently stored in tenant_meta_.unit_.unit_status_.
// The read is performed while a TCRLockGuard on meta_lock_ is held
// (presumably a shared/read guard, the counterpart of the TCWLockGuard taken
// by mark_tenant_is_removed on the same lock -- confirm against the lock's definition).
ObUnitInfoGetter::ObUnitStatus ObTenant::get_unit_status()
|
||||
{
|
||||
TCRLockGuard guard(meta_lock_);
|
||||
return tenant_meta_.unit_.unit_status_;
|
||||
}
|
||||
|
||||
void ObTenant::mark_tenant_is_removed()
|
||||
{
|
||||
TCWLockGuard guard(meta_lock_);
|
||||
|
@ -427,6 +427,7 @@ public:
|
||||
void set_tenant_super_block(const storage::ObTenantSuperBlock &super_block);
|
||||
void mark_tenant_is_removed();
|
||||
void set_unit_status(const share::ObUnitInfoGetter::ObUnitStatus status);
|
||||
share::ObUnitInfoGetter::ObUnitStatus get_unit_status();
|
||||
|
||||
void set_unit_max_cpu(double cpu);
|
||||
double unit_max_cpu() const;
|
||||
|
Loading…
x
Reference in New Issue
Block a user