update gc lock timeout and retry
This commit is contained in:
@ -46,6 +46,8 @@
|
||||
#include "share/backup/ob_archive_store.h"
|
||||
#include "share/backup/ob_backup_data_table_operator.h"
|
||||
#include "share/backup/ob_backup_connectivity.h"
|
||||
#include "share/rc/ob_tenant_base.h"
|
||||
#include "observer/omt/ob_tenant.h"
|
||||
#include <algorithm>
|
||||
|
||||
using namespace oceanbase::blocksstable;
|
||||
@ -3569,6 +3571,9 @@ int ObLSBackupMetaTask::backup_ls_meta_and_tablet_metas_(const uint64_t tenant_i
|
||||
int ret = OB_SUCCESS;
|
||||
storage::ObLSHandle ls_handle;
|
||||
storage::ObLS *ls = NULL;
|
||||
ObTenantDagScheduler *scheduler = NULL;
|
||||
share::ObTenantBase *tenant_base = MTL_CTX();
|
||||
omt::ObTenant *tenant = NULL;
|
||||
ObBackupLSMetaInfo ls_meta_info;
|
||||
ObExternTabletMetaWriter writer;
|
||||
ObBackupDest backup_set_dest;
|
||||
@ -3617,7 +3622,14 @@ int ObLSBackupMetaTask::backup_ls_meta_and_tablet_metas_(const uint64_t tenant_i
|
||||
return ret;
|
||||
};
|
||||
|
||||
if (OB_FAIL(get_ls_handle(tenant_id, ls_id, ls_handle))) {
|
||||
if (OB_ISNULL(scheduler = MTL(ObTenantDagScheduler*))) {
|
||||
ret = OB_ERR_UNEXPECTED;
|
||||
LOG_WARN("failed to get ObTenantDagScheduler from MTL", K(ret));
|
||||
} else if (OB_ISNULL(tenant_base)) {
|
||||
ret = OB_ERR_UNEXPECTED;
|
||||
LOG_WARN("tenant base should not be NULL", K(ret), KP(tenant_base));
|
||||
} else if (FALSE_IT(tenant = static_cast<omt::ObTenant *>(tenant_base))) {
|
||||
} else if (OB_FAIL(get_ls_handle(tenant_id, ls_id, ls_handle))) {
|
||||
LOG_WARN("failed to get ls", K(ret), K(tenant_id), K(ls_id));
|
||||
} else if (OB_ISNULL(ls = ls_handle.get_ls())) {
|
||||
ret = OB_ERR_UNEXPECTED;
|
||||
@ -3627,12 +3639,43 @@ int ObLSBackupMetaTask::backup_ls_meta_and_tablet_metas_(const uint64_t tenant_i
|
||||
} else if (OB_FAIL(writer.init(backup_set_dest, param_.ls_id_, param_.turn_id_, param_.retry_id_))) {
|
||||
LOG_WARN("failed to init tablet info writer", K(ret));
|
||||
} else {
|
||||
if (OB_FAIL(ls->get_ls_meta_package_and_tablet_metas(
|
||||
const int64_t WAIT_GC_LOCK_TIMEOUT = 3 * 60 * 1000 * 1000; // 3 min
|
||||
const int64_t CHECK_GC_LOCK_INTERVAL = 1000000; // 1s
|
||||
const int64_t wait_gc_lock_start_ts = ObTimeUtility::current_time();
|
||||
int64_t cost_ts = 0;
|
||||
do {
|
||||
if (ls->is_stopped()) {
|
||||
ret = OB_NOT_RUNNING;
|
||||
LOG_WARN("ls is not running, stop backup", K(ret), KPC(ls));
|
||||
} else if (scheduler->has_set_stop()) {
|
||||
ret = OB_SERVER_IS_STOPPING;
|
||||
LOG_WARN("tenant dag scheduler has set stop, stop backup", K(ret), KPC(ls));
|
||||
} else if (tenant->has_stopped()) {
|
||||
ret = OB_TENANT_HAS_BEEN_DROPPED;
|
||||
LOG_WARN("tenant has been stopped, stop backup", K(ret), KPC(ls));
|
||||
} else if (OB_FAIL(ls->get_ls_meta_package_and_tablet_metas(
|
||||
false/* check_archive */,
|
||||
save_ls_meta_f,
|
||||
backup_tablet_meta_f))) {
|
||||
if (OB_TABLET_GC_LOCK_CONFLICT != ret) {
|
||||
LOG_WARN("failed to get ls meta package and tablet meta", K(ret), KPC(ls));
|
||||
} else if (OB_FAIL(backup_ls_meta_package_(ls_meta_info))) {
|
||||
} else {
|
||||
cost_ts = ObTimeUtility::current_time() - wait_gc_lock_start_ts;
|
||||
if (WAIT_GC_LOCK_TIMEOUT <= cost_ts) {
|
||||
ret = OB_EAGAIN;
|
||||
LOG_WARN("get ls meta package and tablet meta timeout, need try again.", K(ret), K(ls_id));
|
||||
} else {
|
||||
ob_usleep(CHECK_GC_LOCK_INTERVAL);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
cost_ts = ObTimeUtility::current_time() - wait_gc_lock_start_ts;
|
||||
LOG_INFO("succeed to get ls meta package and tablet meta", K(ls_id), K(cost_ts));
|
||||
}
|
||||
} while (OB_TABLET_GC_LOCK_CONFLICT == ret);
|
||||
|
||||
|
||||
if (FAILEDx(backup_ls_meta_package_(ls_meta_info))) {
|
||||
LOG_WARN("failed to backup ls meta package", K(ret), K(ls_meta_info));
|
||||
} else if (OB_FAIL(ObBackupLSTaskOperator::update_max_tablet_checkpoint_scn(
|
||||
*report_ctx_.sql_proxy_,
|
||||
|
||||
@ -20,6 +20,7 @@
|
||||
#include "storage/tablet/ob_tablet.h"
|
||||
#include "storage/high_availability/ob_storage_ha_utils.h"
|
||||
#include "storage/tablet/ob_tablet_iterator.h"
|
||||
#include "observer/omt/ob_tenant.h"
|
||||
|
||||
namespace oceanbase
|
||||
{
|
||||
@ -2591,6 +2592,12 @@ int ObCopyLSViewInfoObReader::init(
|
||||
common::ObInOutBandwidthThrottle &bandwidth_throttle)
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
ObTenantDagScheduler *scheduler = NULL;
|
||||
share::ObTenantBase *tenant_base = MTL_CTX();
|
||||
omt::ObTenant *tenant = NULL;
|
||||
ObLSService *ls_service = NULL;
|
||||
ObLS *ls = NULL;
|
||||
ObLSHandle ls_handle;
|
||||
|
||||
if (IS_INIT) {
|
||||
ret = OB_INIT_TWICE;
|
||||
@ -2599,14 +2606,61 @@ int ObCopyLSViewInfoObReader::init(
|
||||
|| OB_UNLIKELY(!rpc_arg.is_valid())) {
|
||||
ret = OB_INVALID_ARGUMENT;
|
||||
LOG_WARN("invalid argument", K(ret), K(src_info), K(rpc_arg));
|
||||
} else if (OB_ISNULL(scheduler = MTL(ObTenantDagScheduler*))) {
|
||||
ret = OB_ERR_UNEXPECTED;
|
||||
LOG_WARN("failed to get ObTenantDagScheduler from MTL", K(ret));
|
||||
} else if (OB_ISNULL(tenant_base)) {
|
||||
ret = OB_ERR_UNEXPECTED;
|
||||
LOG_WARN("tenant base should not be NULL", K(ret), KP(tenant_base));
|
||||
} else if (FALSE_IT(tenant = static_cast<omt::ObTenant *>(tenant_base))) {
|
||||
} else if (OB_ISNULL(ls_service = MTL(ObLSService*))) {
|
||||
ret = OB_ERR_UNEXPECTED;
|
||||
LOG_WARN("failed to get ObLSService from MTL", K(ret), KP(ls_service));
|
||||
} else if (OB_FAIL(ls_service->get_ls(rpc_arg.ls_id_, ls_handle, ObLSGetMod::HA_MOD))) {
|
||||
LOG_WARN("failed to get ls", K(ret), K(rpc_arg));
|
||||
} else if (OB_ISNULL(ls = ls_handle.get_ls())) {
|
||||
ret = OB_ERR_UNEXPECTED;
|
||||
LOG_WARN("ls should not be NULL", K(ret), K(rpc_arg));
|
||||
} else if (OB_FAIL(rpc_reader_.init(bandwidth_throttle))) {
|
||||
LOG_WARN("fail to init tablet info rpc reader", K(ret));
|
||||
} else {
|
||||
const int64_t WAIT_GC_LOCK_TIMEOUT = 3 * 60 * 1000 * 1000; // 3 min
|
||||
const int64_t CHECK_GC_LOCK_INTERVAL = 1000000; // 1s
|
||||
const int64_t wait_gc_lock_start_ts = ObTimeUtility::current_time();
|
||||
int64_t cost_ts = 0;
|
||||
do {
|
||||
if (ls->is_stopped()) {
|
||||
ret = OB_NOT_RUNNING;
|
||||
LOG_WARN("ls is not running, stop send get ls view rpc", K(ret), KPC(ls));
|
||||
} else if (scheduler->has_set_stop()) {
|
||||
ret = OB_SERVER_IS_STOPPING;
|
||||
LOG_WARN("tenant dag scheduler has set stop, stop send get ls view rpc", K(ret), KPC(ls));
|
||||
} else if (tenant->has_stopped()) {
|
||||
ret = OB_TENANT_HAS_BEEN_DROPPED;
|
||||
LOG_WARN("tenant has been stopped, stop send get ls view rpc", K(ret), KPC(ls));
|
||||
} else if (OB_FAIL(srv_rpc_proxy.to(src_info.src_addr_).by(OB_DATA_TENANT_ID)
|
||||
.timeout(FETCH_LS_VIEW_INFO_TIMEOUT).dst_cluster_id(src_info.cluster_id_)
|
||||
.ratelimit(true).bg_flow(obrpc::ObRpcProxy::BACKGROUND_FLOW)
|
||||
.fetch_ls_view(rpc_arg, rpc_reader_.get_rpc_buffer(), rpc_reader_.get_handle()))) {
|
||||
if (OB_TABLET_GC_LOCK_CONFLICT != ret) {
|
||||
LOG_WARN("failed to send fetch ls view info rpc", K(ret), K(src_info), K(rpc_arg));
|
||||
} else if (OB_FAIL(rpc_reader_.fetch_and_decode(ls_meta_))) {
|
||||
} else {
|
||||
cost_ts = ObTimeUtility::current_time() - wait_gc_lock_start_ts;
|
||||
if (WAIT_GC_LOCK_TIMEOUT <= cost_ts) {
|
||||
ret = OB_EAGAIN;
|
||||
LOG_WARN("copy ls view wait gc lock timeout, need try again.", K(ret), K(src_info), K(rpc_arg));
|
||||
} else {
|
||||
ob_usleep(CHECK_GC_LOCK_INTERVAL);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
cost_ts = ObTimeUtility::current_time() - wait_gc_lock_start_ts;
|
||||
LOG_INFO("succeed to init copy ls view", K(src_info), K(rpc_arg), K(cost_ts));
|
||||
}
|
||||
} while (OB_TABLET_GC_LOCK_CONFLICT == ret);
|
||||
|
||||
|
||||
if (FAILEDx(rpc_reader_.fetch_and_decode(ls_meta_))) {
|
||||
LOG_WARN("fail to fetch and decode ls meta", K(ret));
|
||||
} else if (!ls_meta_.is_valid()) {
|
||||
ret = OB_ERR_UNEXPECTED;
|
||||
@ -2615,6 +2669,8 @@ int ObCopyLSViewInfoObReader::init(
|
||||
is_inited_ = true;
|
||||
LOG_INFO("succeed to init fetch ls view info reader", K(src_info), K(rpc_arg));
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
@ -2829,7 +2829,7 @@ void ObLSRestoreResultMgr::set_result(const int result, const share::ObTaskId &t
|
||||
// 1. result_ is OB_SUCCESS;
|
||||
// 2. result_ is a retryable error, but the input result is a non-retryable error.
|
||||
lib::ObMutexGuard guard(mtx_);
|
||||
if (result == OB_EAGAIN) {
|
||||
if (OB_EAGAIN == result) {
|
||||
} else {
|
||||
if (retry_cnt_ >= OB_MAX_RESTORE_RETRY_TIMES) { // avoiding overwrite error code
|
||||
} else if ((!can_retrieable_err(result) && can_retrieable_err(result_))
|
||||
|
||||
@ -286,9 +286,13 @@ uint8_t ObTabletGCHandler::get_tablet_persist_trigger_and_reset()
|
||||
int ObTabletGCHandler::disable_gc()
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
if (OB_FAIL(gc_lock_.trylock())) {
|
||||
if (OB_FAIL(gc_lock_.lock(GC_LOCK_TIMEOUT))) {
|
||||
ret = OB_TABLET_GC_LOCK_CONFLICT;
|
||||
LOG_WARN("try lock failed, please retry later", K(ret));
|
||||
} else if (check_stop()) {
|
||||
gc_lock_.unlock();
|
||||
ret = OB_NOT_RUNNING;
|
||||
LOG_WARN("gc handler has already been offline", K(ret));
|
||||
}
|
||||
|
||||
return ret;
|
||||
@ -302,7 +306,7 @@ void ObTabletGCHandler::enable_gc()
|
||||
int ObTabletGCHandler::set_tablet_change_checkpoint_scn(const share::SCN &scn)
|
||||
{
|
||||
int ret = OB_SUCCESS;
|
||||
if (OB_FAIL(gc_lock_.trylock())) {
|
||||
if (OB_FAIL(gc_lock_.lock(GC_LOCK_TIMEOUT))) {
|
||||
ret = OB_TABLET_GC_LOCK_CONFLICT;
|
||||
LOG_WARN("try lock failed, please retry later", K(ret));
|
||||
} else {
|
||||
@ -699,7 +703,7 @@ int ObTabletGCHandler::offline()
|
||||
if (!is_finish()) {
|
||||
ret = OB_EAGAIN;
|
||||
STORAGE_LOG(INFO, "tablet gc handler not finish, retry", KR(ret), KPC(this), KPC(ls_), K(ls_->get_ls_meta()));
|
||||
} else if (OB_FAIL(gc_lock_.trylock())) {
|
||||
} else if (OB_FAIL(gc_lock_.lock(GC_LOCK_TIMEOUT))) {
|
||||
// make sure 'gc_lock_' is not in use.
|
||||
ret = OB_TABLET_GC_LOCK_CONFLICT;
|
||||
LOG_WARN("tablet gc handler not finish, retry", K(ret));
|
||||
|
||||
@ -100,6 +100,7 @@ private:
|
||||
void set_start() { ATOMIC_STORE(&update_enabled_, true); }
|
||||
|
||||
public:
|
||||
static const int64_t GC_LOCK_TIMEOUT = 100_ms; // 100ms
|
||||
obsys::ObRWLock wait_lock_;
|
||||
lib::ObMutex gc_lock_;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user