Update GC lock timeout and retry logic

This commit is contained in:
wxhwang
2023-08-09 07:12:52 +00:00
committed by ob-robot
parent 11f908e65e
commit 265c501f02
5 changed files with 122 additions and 18 deletions

View File

@ -46,6 +46,8 @@
#include "share/backup/ob_archive_store.h"
#include "share/backup/ob_backup_data_table_operator.h"
#include "share/backup/ob_backup_connectivity.h"
#include "share/rc/ob_tenant_base.h"
#include "observer/omt/ob_tenant.h"
#include <algorithm>
using namespace oceanbase::blocksstable;
@ -3569,6 +3571,9 @@ int ObLSBackupMetaTask::backup_ls_meta_and_tablet_metas_(const uint64_t tenant_i
int ret = OB_SUCCESS;
storage::ObLSHandle ls_handle;
storage::ObLS *ls = NULL;
ObTenantDagScheduler *scheduler = NULL;
share::ObTenantBase *tenant_base = MTL_CTX();
omt::ObTenant *tenant = NULL;
ObBackupLSMetaInfo ls_meta_info;
ObExternTabletMetaWriter writer;
ObBackupDest backup_set_dest;
@ -3617,7 +3622,14 @@ int ObLSBackupMetaTask::backup_ls_meta_and_tablet_metas_(const uint64_t tenant_i
return ret;
};
if (OB_FAIL(get_ls_handle(tenant_id, ls_id, ls_handle))) {
if (OB_ISNULL(scheduler = MTL(ObTenantDagScheduler*))) {
ret = OB_ERR_UNEXPECTED;
LOG_WARN("failed to get ObTenantDagScheduler from MTL", K(ret));
} else if (OB_ISNULL(tenant_base)) {
ret = OB_ERR_UNEXPECTED;
LOG_WARN("tenant base should not be NULL", K(ret), KP(tenant_base));
} else if (FALSE_IT(tenant = static_cast<omt::ObTenant *>(tenant_base))) {
} else if (OB_FAIL(get_ls_handle(tenant_id, ls_id, ls_handle))) {
LOG_WARN("failed to get ls", K(ret), K(tenant_id), K(ls_id));
} else if (OB_ISNULL(ls = ls_handle.get_ls())) {
ret = OB_ERR_UNEXPECTED;
@ -3627,12 +3639,43 @@ int ObLSBackupMetaTask::backup_ls_meta_and_tablet_metas_(const uint64_t tenant_i
} else if (OB_FAIL(writer.init(backup_set_dest, param_.ls_id_, param_.turn_id_, param_.retry_id_))) {
LOG_WARN("failed to init tablet info writer", K(ret));
} else {
if (OB_FAIL(ls->get_ls_meta_package_and_tablet_metas(
const int64_t WAIT_GC_LOCK_TIMEOUT = 3 * 60 * 1000 * 1000; // 3 min
const int64_t CHECK_GC_LOCK_INTERVAL = 1000000; // 1s
const int64_t wait_gc_lock_start_ts = ObTimeUtility::current_time();
int64_t cost_ts = 0;
do {
if (ls->is_stopped()) {
ret = OB_NOT_RUNNING;
LOG_WARN("ls is not running, stop backup", K(ret), KPC(ls));
} else if (scheduler->has_set_stop()) {
ret = OB_SERVER_IS_STOPPING;
LOG_WARN("tenant dag scheduler has set stop, stop backup", K(ret), KPC(ls));
} else if (tenant->has_stopped()) {
ret = OB_TENANT_HAS_BEEN_DROPPED;
LOG_WARN("tenant has been stopped, stop backup", K(ret), KPC(ls));
} else if (OB_FAIL(ls->get_ls_meta_package_and_tablet_metas(
false/* check_archive */,
save_ls_meta_f,
backup_tablet_meta_f))) {
if (OB_TABLET_GC_LOCK_CONFLICT != ret) {
LOG_WARN("failed to get ls meta package and tablet meta", K(ret), KPC(ls));
} else if (OB_FAIL(backup_ls_meta_package_(ls_meta_info))) {
} else {
cost_ts = ObTimeUtility::current_time() - wait_gc_lock_start_ts;
if (WAIT_GC_LOCK_TIMEOUT <= cost_ts) {
ret = OB_EAGAIN;
LOG_WARN("get ls meta package and tablet meta timeout, need try again.", K(ret), K(ls_id));
} else {
ob_usleep(CHECK_GC_LOCK_INTERVAL);
}
}
} else {
cost_ts = ObTimeUtility::current_time() - wait_gc_lock_start_ts;
LOG_INFO("succeed to get ls meta package and tablet meta", K(ls_id), K(cost_ts));
}
} while (OB_TABLET_GC_LOCK_CONFLICT == ret);
if (FAILEDx(backup_ls_meta_package_(ls_meta_info))) {
LOG_WARN("failed to backup ls meta package", K(ret), K(ls_meta_info));
} else if (OB_FAIL(ObBackupLSTaskOperator::update_max_tablet_checkpoint_scn(
*report_ctx_.sql_proxy_,

View File

@ -20,6 +20,7 @@
#include "storage/tablet/ob_tablet.h"
#include "storage/high_availability/ob_storage_ha_utils.h"
#include "storage/tablet/ob_tablet_iterator.h"
#include "observer/omt/ob_tenant.h"
namespace oceanbase
{
@ -2591,6 +2592,12 @@ int ObCopyLSViewInfoObReader::init(
common::ObInOutBandwidthThrottle &bandwidth_throttle)
{
int ret = OB_SUCCESS;
ObTenantDagScheduler *scheduler = NULL;
share::ObTenantBase *tenant_base = MTL_CTX();
omt::ObTenant *tenant = NULL;
ObLSService *ls_service = NULL;
ObLS *ls = NULL;
ObLSHandle ls_handle;
if (IS_INIT) {
ret = OB_INIT_TWICE;
@ -2599,14 +2606,61 @@ int ObCopyLSViewInfoObReader::init(
|| OB_UNLIKELY(!rpc_arg.is_valid())) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("invalid argument", K(ret), K(src_info), K(rpc_arg));
} else if (OB_ISNULL(scheduler = MTL(ObTenantDagScheduler*))) {
ret = OB_ERR_UNEXPECTED;
LOG_WARN("failed to get ObTenantDagScheduler from MTL", K(ret));
} else if (OB_ISNULL(tenant_base)) {
ret = OB_ERR_UNEXPECTED;
LOG_WARN("tenant base should not be NULL", K(ret), KP(tenant_base));
} else if (FALSE_IT(tenant = static_cast<omt::ObTenant *>(tenant_base))) {
} else if (OB_ISNULL(ls_service = MTL(ObLSService*))) {
ret = OB_ERR_UNEXPECTED;
LOG_WARN("failed to get ObLSService from MTL", K(ret), KP(ls_service));
} else if (OB_FAIL(ls_service->get_ls(rpc_arg.ls_id_, ls_handle, ObLSGetMod::HA_MOD))) {
LOG_WARN("failed to get ls", K(ret), K(rpc_arg));
} else if (OB_ISNULL(ls = ls_handle.get_ls())) {
ret = OB_ERR_UNEXPECTED;
LOG_WARN("ls should not be NULL", K(ret), K(rpc_arg));
} else if (OB_FAIL(rpc_reader_.init(bandwidth_throttle))) {
LOG_WARN("fail to init tablet info rpc reader", K(ret));
} else {
const int64_t WAIT_GC_LOCK_TIMEOUT = 3 * 60 * 1000 * 1000; // 3 min
const int64_t CHECK_GC_LOCK_INTERVAL = 1000000; // 1s
const int64_t wait_gc_lock_start_ts = ObTimeUtility::current_time();
int64_t cost_ts = 0;
do {
if (ls->is_stopped()) {
ret = OB_NOT_RUNNING;
LOG_WARN("ls is not running, stop send get ls view rpc", K(ret), KPC(ls));
} else if (scheduler->has_set_stop()) {
ret = OB_SERVER_IS_STOPPING;
LOG_WARN("tenant dag scheduler has set stop, stop send get ls view rpc", K(ret), KPC(ls));
} else if (tenant->has_stopped()) {
ret = OB_TENANT_HAS_BEEN_DROPPED;
LOG_WARN("tenant has been stopped, stop send get ls view rpc", K(ret), KPC(ls));
} else if (OB_FAIL(srv_rpc_proxy.to(src_info.src_addr_).by(OB_DATA_TENANT_ID)
.timeout(FETCH_LS_VIEW_INFO_TIMEOUT).dst_cluster_id(src_info.cluster_id_)
.ratelimit(true).bg_flow(obrpc::ObRpcProxy::BACKGROUND_FLOW)
.fetch_ls_view(rpc_arg, rpc_reader_.get_rpc_buffer(), rpc_reader_.get_handle()))) {
if (OB_TABLET_GC_LOCK_CONFLICT != ret) {
LOG_WARN("failed to send fetch ls view info rpc", K(ret), K(src_info), K(rpc_arg));
} else if (OB_FAIL(rpc_reader_.fetch_and_decode(ls_meta_))) {
} else {
cost_ts = ObTimeUtility::current_time() - wait_gc_lock_start_ts;
if (WAIT_GC_LOCK_TIMEOUT <= cost_ts) {
ret = OB_EAGAIN;
LOG_WARN("copy ls view wait gc lock timeout, need try again.", K(ret), K(src_info), K(rpc_arg));
} else {
ob_usleep(CHECK_GC_LOCK_INTERVAL);
}
}
} else {
cost_ts = ObTimeUtility::current_time() - wait_gc_lock_start_ts;
LOG_INFO("succeed to init copy ls view", K(src_info), K(rpc_arg), K(cost_ts));
}
} while (OB_TABLET_GC_LOCK_CONFLICT == ret);
if (FAILEDx(rpc_reader_.fetch_and_decode(ls_meta_))) {
LOG_WARN("fail to fetch and decode ls meta", K(ret));
} else if (!ls_meta_.is_valid()) {
ret = OB_ERR_UNEXPECTED;
@ -2615,6 +2669,8 @@ int ObCopyLSViewInfoObReader::init(
is_inited_ = true;
LOG_INFO("succeed to init fetch ls view info reader", K(src_info), K(rpc_arg));
}
}
return ret;
}

View File

@ -2829,7 +2829,7 @@ void ObLSRestoreResultMgr::set_result(const int result, const share::ObTaskId &t
// 1. result_ is OB_SUCCESS;
// 2. result_ is a retryable error, but the input result is a non-retryable error.
lib::ObMutexGuard guard(mtx_);
if (result == OB_EAGAIN) {
if (OB_EAGAIN == result) {
} else {
if (retry_cnt_ >= OB_MAX_RESTORE_RETRY_TIMES) { // avoiding overwrite error code
} else if ((!can_retrieable_err(result) && can_retrieable_err(result_))

View File

@ -286,9 +286,13 @@ uint8_t ObTabletGCHandler::get_tablet_persist_trigger_and_reset()
int ObTabletGCHandler::disable_gc()
{
int ret = OB_SUCCESS;
if (OB_FAIL(gc_lock_.trylock())) {
if (OB_FAIL(gc_lock_.lock(GC_LOCK_TIMEOUT))) {
ret = OB_TABLET_GC_LOCK_CONFLICT;
LOG_WARN("try lock failed, please retry later", K(ret));
} else if (check_stop()) {
gc_lock_.unlock();
ret = OB_NOT_RUNNING;
LOG_WARN("gc handler has already been offline", K(ret));
}
return ret;
@ -302,7 +306,7 @@ void ObTabletGCHandler::enable_gc()
int ObTabletGCHandler::set_tablet_change_checkpoint_scn(const share::SCN &scn)
{
int ret = OB_SUCCESS;
if (OB_FAIL(gc_lock_.trylock())) {
if (OB_FAIL(gc_lock_.lock(GC_LOCK_TIMEOUT))) {
ret = OB_TABLET_GC_LOCK_CONFLICT;
LOG_WARN("try lock failed, please retry later", K(ret));
} else {
@ -699,7 +703,7 @@ int ObTabletGCHandler::offline()
if (!is_finish()) {
ret = OB_EAGAIN;
STORAGE_LOG(INFO, "tablet gc handler not finish, retry", KR(ret), KPC(this), KPC(ls_), K(ls_->get_ls_meta()));
} else if (OB_FAIL(gc_lock_.trylock())) {
} else if (OB_FAIL(gc_lock_.lock(GC_LOCK_TIMEOUT))) {
// make sure 'gc_lock_' is not in use.
ret = OB_TABLET_GC_LOCK_CONFLICT;
LOG_WARN("tablet gc handler not finish, retry", K(ret));

View File

@ -100,6 +100,7 @@ private:
void set_start() { ATOMIC_STORE(&update_enabled_, true); }
public:
static const int64_t GC_LOCK_TIMEOUT = 100_ms; // 100ms
obsys::ObRWLock wait_lock_;
lib::ObMutex gc_lock_;