Fix restore hang and tenant freeze failure while data_checkpoint is flushing

This commit is contained in:
obdev 2023-05-17 15:11:21 +00:00 committed by ob-robot
parent 36608dd357
commit ffb515a326
7 changed files with 26 additions and 8 deletions

View File

@ -193,7 +193,9 @@ int ObCheckpointExecutor::update_clog_checkpoint()
return ret;
}
int ObCheckpointExecutor::advance_checkpoint_by_flush(SCN recycle_scn) {
int ObCheckpointExecutor::advance_checkpoint_by_flush(
SCN recycle_scn)
{
int ret = OB_SUCCESS;
RLockGuard guard(rwlock_);
@ -278,7 +280,7 @@ int64_t ObCheckpointExecutor::get_cannot_recycle_log_size()
LSN end_lsn;
if (OB_FAIL(loghandler_->get_end_lsn(end_lsn))) {
STORAGE_LOG(WARN, "get end lsn failed", K(ret), K(ls_->get_ls_id()));
} else {
} else if (!ls_->get_data_checkpoint()->is_flushing()) {
cannot_recycle_log_size =
end_lsn.val_ - ls_->get_clog_base_lsn().val_;
}

View File

@ -75,7 +75,8 @@ public:
// the service will flush and advance checkpoint
// after the flush, checkpoint_scn will be equal to or greater than recycle_scn
int advance_checkpoint_by_flush(share::SCN recycle_scn = share::SCN::invalid_scn());
int advance_checkpoint_by_flush(
share::SCN recycle_scn = share::SCN::invalid_scn());
// for __all_virtual_checkpoint
int get_checkpoint_info(ObIArray<ObCheckpointVTInfo> &checkpoint_array);
@ -87,6 +88,7 @@ public:
int diagnose(CheckpointDiagnoseInfo &diagnose_info) const;
int traversal_flush() const;
private:
static const int64_t CLOG_GC_PERCENT = 60;

View File

@ -24,6 +24,8 @@ namespace storage
namespace checkpoint
{
// Out-of-class definition of ObDataCheckpoint's static thread-local flag;
// it is initialized to false.
__thread bool ObDataCheckpoint::is_tenant_freeze_for_flush_ = false;
// ** ObCheckpointDList **
void ObCheckpointDList::reset()
{
@ -770,7 +772,7 @@ int ObDataCheckpoint::freeze_base_on_needs_(share::SCN recycle_scn)
{
int ret = OB_SUCCESS;
if (get_rec_scn() <= recycle_scn) {
if (!is_flushing() && prepare_list_.is_empty()) {
if (is_tenant_freeze() || (!is_flushing() && prepare_list_.is_empty())) {
int64_t wait_flush_num =
new_create_list_.checkpoint_list_.get_size()
+ active_list_.checkpoint_list_.get_size();
@ -787,7 +789,7 @@ int ObDataCheckpoint::freeze_base_on_needs_(share::SCN recycle_scn)
}
if (logstream_freeze) {
if (OB_FAIL(ls_->logstream_freeze(true/*is_sync*/))) {
if (OB_FAIL(ls_->logstream_freeze(false /* !is_sync */))) {
STORAGE_LOG(WARN, "minor freeze failed", K(ret), K(ls_->get_ls_id()));
}
} else if (OB_FAIL(ls_->batch_tablet_freeze(need_flush_tablets, true/*is_sync*/))) {

View File

@ -120,6 +120,10 @@ public:
bool is_empty();
// Raises the thread-local tenant-freeze flag for the calling thread.
static void set_tenant_freeze()
{
  is_tenant_freeze_for_flush_ = true;
}
// Clears the thread-local tenant-freeze flag for the calling thread.
static void reset_tenant_freeze()
{
  is_tenant_freeze_for_flush_ = false;
}
// Returns the current value of the calling thread's tenant-freeze flag.
static bool is_tenant_freeze()
{
  const bool tenant_freeze_flag = is_tenant_freeze_for_flush_;
  return tenant_freeze_flag;
}
private:
// traverse prepare_list to flush memtables
// case 1: some memtable flushes failed during ls freeze
@ -179,6 +183,8 @@ private:
// avoid blocking other lists while traversing ls_frozen_list
common::ObSpinLock ls_frozen_list_lock_;
bool ls_freeze_finished_;
// Thread-local flag written by set_tenant_freeze()/reset_tenant_freeze()
// and read by is_tenant_freeze().
static __thread bool is_tenant_freeze_for_flush_;
};
static const ObTabletID LS_DATA_CHECKPOINT_TABLET(ObDataCheckpoint::LS_DATA_CHECKPOINT_TABLET_ID);

View File

@ -1505,7 +1505,7 @@ int ObLS::batch_tablet_freeze(const ObIArray<ObTabletID> &tablet_ids,
return ret;
}
int ObLS::advance_checkpoint_by_flush(SCN recycle_scn, const int64_t abs_timeout_ts)
int ObLS::advance_checkpoint_by_flush(SCN recycle_scn, const int64_t abs_timeout_ts, const bool is_tennat_freeze)
{
int ret = OB_SUCCESS;
int64_t read_lock = LSLOCKALL;
@ -1515,7 +1515,12 @@ int ObLS::advance_checkpoint_by_flush(SCN recycle_scn, const int64_t abs_timeout
ret = OB_TIMEOUT;
LOG_WARN("lock failed, please retry later", K(ret), K(ls_meta_));
} else {
if (is_tennat_freeze) {
ObDataCheckpoint::set_tenant_freeze();
LOG_INFO("set tenant_freeze", K(ls_meta_.ls_id_));
}
ret = checkpoint_executor_.advance_checkpoint_by_flush(recycle_scn);
ObDataCheckpoint::reset_tenant_freeze();
}
return ret;
}

View File

@ -694,7 +694,8 @@ public:
// advance the checkpoint of this ls
// @param [in] abs_timeout_ts, wait until timeout if lock conflict
int advance_checkpoint_by_flush(share::SCN recycle_scn,
const int64_t abs_timeout_ts = INT64_MAX);
const int64_t abs_timeout_ts = INT64_MAX,
const bool is_tenant_freeze = false);
// ObDataCheckpoint interface:
DELEGATE_WITH_RET(data_checkpoint_, get_freezecheckpoint_info, int);

View File

@ -245,7 +245,7 @@ int ObTenantFreezer::ls_freeze_all_unit_(ObLS *ls, const int64_t abs_timeout_ts)
do {
need_retry = false;
retry_times++;
if (OB_SUCC(ls->advance_checkpoint_by_flush(SCN::max_scn(), abs_timeout_ts))) {
if (OB_SUCC(ls->advance_checkpoint_by_flush(SCN::max_scn(), abs_timeout_ts, true /* is_tennat_freeze */))) {
} else {
current_ts = ObTimeUtil::current_time();
is_timeout = (current_ts >= abs_timeout_ts);