From 47ee66e349ea72cc0156cb07f5e37a0b892ccdec Mon Sep 17 00:00:00 2001 From: obdev Date: Wed, 18 Sep 2024 08:29:33 +0000 Subject: [PATCH] [bugfix] fix bug that new created unit is set WAIT_GC --- src/observer/omt/ob_tenant_node_balancer.cpp | 149 +++++++++++-------- src/observer/omt/ob_tenant_node_balancer.h | 1 + src/rootserver/ob_unit_manager.cpp | 1 + 3 files changed, 90 insertions(+), 61 deletions(-) diff --git a/src/observer/omt/ob_tenant_node_balancer.cpp b/src/observer/omt/ob_tenant_node_balancer.cpp index 553acc70c5..5db3250821 100644 --- a/src/observer/omt/ob_tenant_node_balancer.cpp +++ b/src/observer/omt/ob_tenant_node_balancer.cpp @@ -516,12 +516,11 @@ void ObTenantNodeBalancer::periodically_check_tenant() int ObTenantNodeBalancer::fetch_effective_tenants(const TenantUnits &old_tenants, TenantUnits &new_tenants) { int ret = OB_SUCCESS; - bool found = false; - bool is_released = false; + // tenants that are not in inner-table but CAN NOT be deleted locally yet TenantUnits tenants; for (int64_t i = 0; OB_SUCC(ret) && i < old_tenants.count(); i++) { - found = false; + bool found = false; const ObUnitInfoGetter::ObTenantConfig &tenant_config = old_tenants.at(i); const ObUnitInfoGetter::ObUnitStatus local_unit_status = tenant_config.unit_status_; for (int64_t j = 0; j < new_tenants.count(); j++) { @@ -534,70 +533,50 @@ int ObTenantNodeBalancer::fetch_effective_tenants(const TenantUnits &old_tenants } if (!found) { - ObTenant *tenant = nullptr; - MTL_SWITCH(tenant_config.tenant_id_) { - if (OB_FAIL(MTL(ObTenantMetaMemMgr*)->check_all_meta_mem_released(is_released, "[DELETE_TENANT]"))) { - LOG_WARN("fail to check_all_meta_mem_released", K(ret), K(tenant_config)); - } else if (!is_released) { - // can not release now. dump some debug info - const uint64_t interval = 180 * 1000 * 1000; // 180s - if (!is_released && REACH_TIME_INTERVAL(interval)) { - MTL(ObTenantMetaMemMgr*)->dump_tablet_info(); - MTL(ObLSService *)->dump_ls_info(); - PRINT_OBJ_LEAK(MTL_ID(), share::LEAK_CHECK_OBJ_MAX_NUM); - } + const int64_t now_time = ObTimeUtility::current_time(); + const int64_t life_time = now_time - tenant_config.create_timestamp_; + if (life_time < RECYCLE_LATENCY && !tenant_config.is_removed_) { + // tenant-unit is only allowed to update to WAIT_GC after reaching RECYCLE_LATENCY, + // to avoid accidentally deleting tenants that haven't yet been persisted in inner-table. + // UNLESS the tenant is marked removed during this period. + if (OB_FAIL(tenants.push_back(tenant_config))) { + LOG_WARN("failed to push back tenant", KR(ret)); } else { - // check ls service is empty. - is_released = MTL(ObLSService *)->is_empty(); + LOG_INFO("[DELETE_TENANT] tenant has not reached the RECYCLE_LATENCY yet " + "and not marked removed, can not delete tenant", + "create_timestamp", tenant_config.create_timestamp_, + "is_removed", tenant_config.is_removed_, + "local_unit_status", ObUnitInfoGetter::get_unit_status_str(local_unit_status), + K(life_time), K(tenant_config)); } - - bool is_tenant_snapshot_released = false; + } else { + // tenant-unit has been deleted in inner-table or marked removed, and ready to be recycled. + // Notify tenant snapshot can start gc if (is_user_tenant(tenant_config.tenant_id_)) { - const int64_t now_time = ObTimeUtility::current_time(); - const int64_t life_time = now_time - tenant_config.create_timestamp_; - if (tenant_config.is_removed_ || life_time >= RECYCLE_LATENCY) { + MTL_SWITCH(tenant_config.tenant_id_) { MTL(ObTenantSnapshotService*)->notify_unit_is_deleting(); } - if (OB_FAIL(MTL(ObTenantSnapshotService*)-> - check_all_tenant_snapshot_released(is_tenant_snapshot_released))) { - LOG_WARN("fail to check_all_tenant_snapshot_released", K(ret), K(tenant_config)); - } else if (!is_tenant_snapshot_released) { - // can not release now. dump some debug info - const uint64_t interval = 180 * 1000 * 1000; // 180s - if (!is_tenant_snapshot_released && REACH_TIME_INTERVAL(interval)) { - MTL(ObTenantSnapshotService*)->dump_all_tenant_snapshot_info(); - } - LOG_INFO("[DELETE_TENANT] tenant has been dropped, tenant snapshot is still waiting for gc", - K(tenant_config)); - } - if (OB_SUCC(ret)) { - is_released = is_released && is_tenant_snapshot_released; - } else { - is_released = false; - } } - } - - if (OB_SUCC(ret)) { - // remove local units after RECYCLE_LATENCY to avoid removing by mistake - // but if marked removed, remove it directly without waiting - const int64_t now_time = ObTimeUtility::current_time(); - const int64_t life_time = now_time - tenant_config.create_timestamp_; - if ((!tenant_config.is_removed_ && life_time < RECYCLE_LATENCY) || !is_released) { + // Check if resources are already released: + // 1. if not released, update status to WAIT_GC if status is not WAIT_GC or DELETING, + // and push it back to effective tenants to avoid starting deleting. + // 2. if released, ignore this tenant, let it be deleted in subsequent process. + bool is_released = false; + if (FAILEDx(check_tenant_resource_released(tenant_config.tenant_id_, is_released))) { + LOG_WARN("failed to check_tenant_resource_released", KR(ret), K(tenant_config)); + } else if (!is_released) { if (OB_FAIL(tenants.push_back(tenant_config))) { LOG_WARN("failed to push back tenant", KR(ret)); } else { // update tenant unit status which need be deleted // need wait gc in observer // NOTE: only update unit status when can not release resource - if (!is_released) { - tenants.at(tenants.count() - 1).unit_status_ = ObUnitInfoGetter::UNIT_WAIT_GC_IN_OBSERVER; - // add a event when try to gc for the first time - if (local_unit_status != ObUnitInfoGetter::ObUnitStatus::UNIT_WAIT_GC_IN_OBSERVER && - local_unit_status != ObUnitInfoGetter::ObUnitStatus::UNIT_DELETING_IN_OBSERVER) { - SERVER_EVENT_ADD("unit", "start unit gc", "tenant_id", tenant_config.tenant_id_, - "unit_id", tenant_config.unit_id_, "unit_status", "WAIT GC"); - } + tenants.at(tenants.count() - 1).unit_status_ = ObUnitInfoGetter::UNIT_WAIT_GC_IN_OBSERVER; + // add a event when try to gc for the first time + if (local_unit_status != ObUnitInfoGetter::ObUnitStatus::UNIT_WAIT_GC_IN_OBSERVER && + local_unit_status != ObUnitInfoGetter::ObUnitStatus::UNIT_DELETING_IN_OBSERVER) { + SERVER_EVENT_ADD("unit", "start unit gc", "tenant_id", tenant_config.tenant_id_, + "unit_id", tenant_config.unit_id_, "unit_status", "WAIT GC"); } LOG_INFO("[DELETE_TENANT] tenant has been dropped. can not delete tenant", @@ -606,12 +585,12 @@ int ObTenantNodeBalancer::fetch_effective_tenants(const TenantUnits &old_tenants "create_timestamp", tenant_config.create_timestamp_, K(life_time), K(tenant_config)); } - } else { - LOG_INFO("[DELETE_TENANT] tenant has been dropped. can delete tenant", - K(is_released), "local_unit_status", ObUnitInfoGetter::get_unit_status_str(local_unit_status), - "is_removed", tenant_config.is_removed_, - "create_timestamp", tenant_config.create_timestamp_, - K(life_time), K(tenant_config)); + } else { // released + LOG_INFO("[DELETE_TENANT] tenant has been dropped. can delete tenant", + K(is_released), "local_unit_status", ObUnitInfoGetter::get_unit_status_str(local_unit_status), + "is_removed", tenant_config.is_removed_, + "create_timestamp", tenant_config.create_timestamp_, + K(life_time), K(tenant_config)); } } } @@ -626,6 +605,54 @@ int ObTenantNodeBalancer::fetch_effective_tenants(const TenantUnits &old_tenants return ret; } +int ObTenantNodeBalancer::check_tenant_resource_released(const uint64_t tenant_id, bool &is_released) const +{ + int ret = OB_SUCCESS; + const int64_t dump_info_interval = 180 * 1000 * 1000; // 180s + if (OB_UNLIKELY(!is_valid_tenant_id(tenant_id))) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid tenant_id", KR(ret), K(tenant_id)); + } else { + is_released = false; + MTL_SWITCH(tenant_id) { + if (OB_FAIL(MTL(ObTenantMetaMemMgr*)->check_all_meta_mem_released(is_released, "[DELETE_TENANT]"))) { + LOG_WARN("fail to check_all_meta_mem_released", K(ret), K(tenant_id)); + } else if (!is_released) { + // can not release now. dump some debug info + if (!is_released && REACH_TIME_INTERVAL(dump_info_interval)) { + MTL(ObTenantMetaMemMgr*)->dump_tablet_info(); + MTL(ObLSService *)->dump_ls_info(); + PRINT_OBJ_LEAK(MTL_ID(), share::LEAK_CHECK_OBJ_MAX_NUM); + } + } else { + // check ls service is empty. + is_released = MTL(ObLSService *)->is_empty(); + } + + if (is_user_tenant(tenant_id)) { + bool is_tenant_snapshot_released = false; + if (OB_FAIL(MTL(ObTenantSnapshotService*)-> + check_all_tenant_snapshot_released(is_tenant_snapshot_released))) { + LOG_WARN("fail to check_all_tenant_snapshot_released", K(ret), K(tenant_id)); + } else if (!is_tenant_snapshot_released) { + // can not release now. dump some debug info + if (!is_tenant_snapshot_released && REACH_TIME_INTERVAL(dump_info_interval)) { + MTL(ObTenantSnapshotService*)->dump_all_tenant_snapshot_info(); + } + LOG_INFO("[DELETE_TENANT] tenant has been dropped, tenant snapshot is still waiting for gc", + K(tenant_id)); + } + if (OB_SUCC(ret)) { + is_released = is_released && is_tenant_snapshot_released; + } else { + is_released = false; + } + } + } + } + return ret; +} + int ObTenantNodeBalancer::refresh_tenant(TenantUnits &units) { int ret = OB_SUCCESS; diff --git a/src/observer/omt/ob_tenant_node_balancer.h b/src/observer/omt/ob_tenant_node_balancer.h index cdb1c1b6df..c46090f244 100644 --- a/src/observer/omt/ob_tenant_node_balancer.h +++ b/src/observer/omt/ob_tenant_node_balancer.h @@ -89,6 +89,7 @@ private: int refresh_hidden_sys_memory(); void periodically_check_tenant(); int fetch_effective_tenants(const share::TenantUnits &old_tenants, share::TenantUnits &new_tenants); + int check_tenant_resource_released(const uint64_t tenant_id, bool &is_released) const; int refresh_tenant(share::TenantUnits &units); DISALLOW_COPY_AND_ASSIGN(ObTenantNodeBalancer); diff --git a/src/rootserver/ob_unit_manager.cpp b/src/rootserver/ob_unit_manager.cpp index 7926b49a12..f8283adb07 100644 --- a/src/rootserver/ob_unit_manager.cpp +++ b/src/rootserver/ob_unit_manager.cpp @@ -9656,6 +9656,7 @@ int ObUnitManager::do_migrate_unit_in_trans_(const share::ObResourcePool &pool, LOG_WARN("update_unit failed", K(new_unit), K(ret)); } // End this transaction + DEBUG_SYNC(BEFORE_MIGRATE_FINISH); if (trans.is_started()) { int tmp_ret = OB_SUCCESS; if (OB_SUCCESS != (tmp_ret = trans.end(OB_SUCC(ret)))) {