cherrypick to 3.1_opensource_release

This commit is contained in:
handora
2021-08-11 15:33:27 +08:00
committed by wangzelin.wzl
parent a6d5591ddf
commit 4a58022b57
18 changed files with 249 additions and 46 deletions

View File

@ -3409,7 +3409,8 @@ int ObPartitionGroup::get_freeze_cut_(ObMemtable& frozen_memtable, const bool is
K(*this));
}
} else {
// 2. The freeze_id of follower is the right boundary of replay queue.
// 2. The freeze_id of follower is the maximum log id of the right
// boundary of replay queue and the max majority log id
// The follower will block the replay, wait it to be empty and then get the freeze_id.
if (OB_FAIL(wait_follower_no_pending_task_())) {
STORAGE_LOG(WARN, "wait follower no pending task failed", K(is_leader), K(freeze_id), K(*this));
@ -3421,6 +3422,53 @@ int ObPartitionGroup::get_freeze_cut_(ObMemtable& frozen_memtable, const bool is
K(freeze_id),
K(freeze_ts),
K(*this));
} else {
// The logic below is subtle:
//
// Recall the semantics of the end_log_ts and max_log_ts that belong to
// the memstore: all data written by logs before end_log_ts is within the
// memstore, while data written by a log between end_log_ts and
// max_log_ts may or may not exist in the memstore
//
// For a minor freeze, the follower needs to wait until replay reaches a
// continuous log point and then fetch the freeze point. However, the
// follower cannot use the min replayed log ts as both the end_log_ts and
// the max_log_ts.
//
// To see why the more sophisticated max_log_ts calculation is required,
// consider the following example:
// 1. The leader submits logs 5, 6 and 7; only log 7 reaches quorum via
// paxos, and its data is already filled in the memstore
// 2. The leader switches to follower and the min replayed log ts is
// smaller than log 5's log_ts
// 3. If we just use the min replayed log ts as both the end_log_ts and
// max_log_ts, the semantics specified above are broken
//
// So we need to maintain the max_log_ts using log 7's timestamp. In
// terms of the implementation, we use the max_majority_log_ts, which is
// updated after each log is synchronized by the leader.
//
// What's more, we need to mark all data whose log is between end_log_ts
// and max_log_ts as overflow (a requirement from the storage layer).
// However, the data may already be synced, and we have no chance to mark
// it except by traversing all data in the memtable. So we choose to set
// the end_log_ts to the max_majority_log_ts as well. The detailed issue
// can be found in https://work.aone.alibaba-inc.com/issue/33865988
//
// NB: we never maintain the max_majority_log_ts for the follower, so we
// just use the variable for the corner case of leader transfer.
uint64_t max_majority_log_id = OB_INVALID_ID;
int64_t max_majority_log_ts = OB_INVALID_TIMESTAMP;
(void)pls_->get_max_majority_log(max_majority_log_id, max_majority_log_ts);
if (max_majority_log_ts > freeze_ts) {
TRANS_LOG(WARN,
"max majority log ts is larger than freeze timestamp",
K(max_majority_log_ts),
K(freeze_ts),
K(*this));
ret = OB_EAGAIN;
}
}
}
if (OB_FAIL(ret)) {
@ -3616,7 +3664,7 @@ int ObPartitionGroup::wait_follower_no_pending_task_()
int64_t cnt = 0;
int64_t task_cnt = replay_status_->get_pending_task_count();
while (replay_status_->has_pending_task(pkey_) && OB_SUCC(ret)) {
while (replay_status_->has_pending_task(pkey_) && !replay_status_->has_encount_fatal_error() && OB_SUCC(ret)) {
usleep(FREEZE_WAIT_RETRY_SLEEP_TS);
cnt++;
@ -3633,6 +3681,11 @@ int ObPartitionGroup::wait_follower_no_pending_task_()
}
}
if (replay_status_->has_encount_fatal_error()) {
TRANS_LOG(ERROR, "encounter fatal error", K(*replay_status_), K(ret), K(pkey_));
ret = OB_ERR_UNEXPECTED;
}
return ret;
}
@ -3661,7 +3714,6 @@ int ObPartitionGroup::check_range_changed_(ObTableHandle& handle, const bool is_
base_version = mt->get_base_version();
if (tmp_freeze_ts < start_log_ts || tmp_snapshot_version < base_version) {
ret = OB_EAGAIN;
STORAGE_LOG(INFO,
"skip freeze, maybe in the process of restarting",
K(ret),
@ -3818,7 +3870,7 @@ int ObPartitionGroup::freeze_log_and_data_v2_(const bool emergency, const bool f
if (OB_STATE_NOT_MATCH == ret) {
STORAGE_LOG(INFO, "skip freeze due to clog state", K(ret), K(pkey_));
ret = OB_SUCCESS;
} else if (OB_EAGAIN != ret) {
} else {
STORAGE_LOG(WARN, "failed to check log_id or version range changed", K(ret), K(old_handle));
}
} else if (!changed) {
@ -3837,6 +3889,10 @@ int ObPartitionGroup::freeze_log_and_data_v2_(const bool emergency, const bool f
}
}
if (OB_FAIL(ret) || !effected) {
freeze_record_.clear();
}
return ret;
}