Files
oceanbase/src/logservice/palf/election/algorithm/election_acceptor.cpp

365 lines
18 KiB
C++

/**
* Copyright (c) 2021 OceanBase
* OceanBase CE is licensed under Mulan PubL v2.
* You can use this software according to the terms and conditions of the Mulan PubL v2.
* You may obtain a copy of Mulan PubL v2 at:
* http://license.coscl.org.cn/MulanPubL-2.0
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
* See the Mulan PubL v2 for more details.
*/
#include "share/ob_occam_time_guard.h"
#include "election_acceptor.h"
#include "common/ob_clock_generator.h"
#include "election_impl.h"
#include "lib/net/ob_addr.h"
#include "logservice/palf/election/interface/election_priority.h"
#include "logservice/palf/election/utils/election_common_define.h"
#include "logservice/palf/election/utils/election_event_recorder.h"
namespace oceanbase
{
namespace palf
{
namespace election
{
// CHECK_SILENCE(): guard used at the top of the acceptor's message handlers.
// It makes the enclosing function return immediately (the message is not handled) when either
//   1. INIT_TS is negative; this is logged as ERROR, and the log text suggests that
//      GLOBAL_INIT_ELECTION_MODULE has not been called yet, or
//   2. the monotonic clock has not yet reached INIT_TS + MAX_LEASE_TIME; this is logged as INFO.
// Usage constraints that follow from the expansion:
//   - it expands to a bare `return;`, so the enclosing function must return void;
//   - it logs `*this`, so it must be used inside a non-static member function.
#define CHECK_SILENCE()\
do {\
if (ATOMIC_LOAD(&INIT_TS) < 0) {\
ELECT_LOG_RET(ERROR, common::OB_ERROR, "INIT_TS is less than 0, may not call GLOBAL_INIT_ELECTION_MODULE yet!", K(*this));\
return;\
} else if (OB_UNLIKELY(get_monotonic_ts() < ATOMIC_LOAD(&INIT_TS) + MAX_LEASE_TIME)) {\
ELECT_LOG(INFO, "keep silence for safty, won't send response", K(*this));\
return;\
}\
} while(0)
// Compile-time mapping from a request message type to the response message type used to
// answer it. The primary template is empty, so only the request types specialized below
// expose a nested `type`.
template <typename RequestMsg>
struct ResponseType
{
};

// A prepare request is answered with a prepare response.
template <>
struct ResponseType<ElectionPrepareRequestMsg>
{
  using type = ElectionPrepareResponseMsg;
};

// An accept request is answered with an accept response.
template <>
struct ResponseType<ElectionAcceptRequestMsg>
{
  using type = ElectionAcceptResponseMsg;
};
// Stateless helper that compares the ballot number carried by an incoming request with the
// acceptor's current ballot number and answers stale requests with a reject message.
class RequestChecker
{
public:
  // Checks whether `msg` carries a ballot number that is not older than the acceptor's.
  //
  // @param msg        incoming prepare or accept request.
  // @param p_acceptor acceptor whose ballot_number_ is compared against; it is dereferenced
  //                   without a null check.
  // @param phase      log phase used for the warning printed when the request is refused.
  // @return true when msg.get_ballot_number() >= p_acceptor->ballot_number_.
  //         false otherwise; in that case a reject response carrying the acceptor's ballot
  //         number has been passed to p_acceptor->p_election_->send_() (the send result is
  //         not checked).
  template <typename RequestMsg, typename Acceptor>
  static bool check_ballot_valid(const RequestMsg &msg, Acceptor *p_acceptor, const LogPhase phase)
  {
    ELECT_TIME_GUARD(500_ms);
    #define PRINT_WRAPPER K(msg), K(*p_acceptor)
    bool ret = false;
    if (OB_UNLIKELY(msg.get_ballot_number() < p_acceptor->ballot_number_)) {
      // The response type is derived from the request type, so the same code path serves
      // both prepare and accept requests.
      using T = typename ResponseType<RequestMsg>::type;
      T reject_msg = create_reject_message_(p_acceptor->p_election_->get_self_addr(),
                                            p_acceptor->p_election_->inner_priority_seed_,
                                            p_acceptor->p_election_->get_membership_version_(),
                                            p_acceptor->p_election_->get_ls_biggest_min_cluster_version_ever_seen_(),
                                            msg);
      reject_msg.set_rejected(p_acceptor->ballot_number_);
      p_acceptor->p_election_->send_(reject_msg);
      LOG_PHASE(WARN, phase, "receive old ballot request, refused");
    } else {
      ret = true;
    }
    return ret;
    #undef PRINT_WRAPPER
  }
private:
  // Builds the (not yet rejected) response for a prepare request. The prepare response
  // constructor does not take the priority seed or the membership version, so both are
  // accepted only to keep the two overloads call-compatible.
  static ElectionPrepareResponseMsg create_reject_message_(const common::ObAddr &addr,
                                                           const uint64_t inner_priority_seed,
                                                           const LogConfigVersion &membership_version,
                                                           const LsBiggestMinClusterVersionEverSeen &version,
                                                           const ElectionPrepareRequestMsg &msg)
  {
    UNUSED(inner_priority_seed);
    UNUSED(membership_version);
    return ElectionPrepareResponseMsg(addr, version, msg);
  }
  // Builds the (not yet rejected) response for an accept request.
  static ElectionAcceptResponseMsg create_reject_message_(const common::ObAddr &addr,
                                                          const uint64_t inner_priority_seed,
                                                          const LogConfigVersion &membership_version,
                                                          const LsBiggestMinClusterVersionEverSeen &version,
                                                          const ElectionAcceptRequestMsg &msg)
  {
    return ElectionAcceptResponseMsg(addr, inner_priority_seed, membership_version, version, msg);
  }
};
// Builds an acceptor bound to `p_election`; the pointer is stored as-is and is not checked
// for null here.
// Both ballot numbers and both timestamps start at INVALID_VALUE and the time window starts
// closed. Members that do not appear in the initializer list are default-constructed.
ElectionAcceptor::ElectionAcceptor(ElectionImpl *p_election) :
ballot_number_(INVALID_VALUE),
ballot_of_time_window_(INVALID_VALUE),
is_time_window_opened_(false),
p_election_(p_election),
last_time_window_open_ts_(INVALID_VALUE),
last_dump_acceptor_info_ts_(INVALID_VALUE) {}
// Raises the acceptor's ballot number to `new_ballot_number` and brings the state tied to it
// in line: the time window's ballot follows the new value and the time-window state is reset
// through reset_time_window_states_().
// The new value must be strictly greater than the current ballot number; otherwise nothing
// is modified and an ERROR (OB_INVALID_ARGUMENT) is logged under `phase`.
void ElectionAcceptor::advance_ballot_number_and_reset_related_states_(const int64_t new_ballot_number,
                                                                       const LogPhase phase)
{
  ELECT_TIME_GUARD(500_ms);
  #define PRINT_WRAPPER K(new_ballot_number), K(*this)
  const bool is_advancing = (new_ballot_number > ballot_number_);
  if (!is_advancing) {
    LOG_PHASE_RET(ERROR, OB_INVALID_ARGUMENT, phase, "invalid argument");
  } else {
    ballot_number_ = new_ballot_number;
    ballot_of_time_window_ = ballot_number_;
    reset_time_window_states_(phase);
  }
  #undef PRINT_WRAPPER
}
// Registers the acceptor's periodic task on the election timer (period: 250_ms) and returns
// whatever schedule_task_repeat() returns.
//
// Each run of the task, under p_election_->lock_:
//   1. dumps the acceptor state to the log at most once every 3 seconds;
//   2. logs/reports when the lease validity flips, and resets the lease once it has expired;
//   3. logs/reports when the lease owner changes from one valid address to another;
//   4. if a time window is open and it is time to vote, sends the prepare response for the
//      cached highest-priority prepare request and closes the window.
//
// `last_record_lease_owner` and `last_record_lease_valid_state` are captured by copy into the
// mutable lambda and updated by it, so they carry the values seen by the previous run.
// The task always returns false.
// NOTE(review): presumably `false` tells the timer to keep repeating the task — confirm
// against schedule_task_repeat().
int ElectionAcceptor::start()
{
  ObAddr last_record_lease_owner;
  bool last_record_lease_valid_state = false;
  return p_election_->timer_->schedule_task_repeat(time_window_task_handle_,
                                                   250_ms,
                                                   [this,
                                                    last_record_lease_owner,
                                                    last_record_lease_valid_state]() mutable {
    ELECT_TIME_GUARD(500_ms);
    #define PRINT_WRAPPER KR(ret), K(*this)
    int ret = OB_SUCCESS;
    LockGuard lock_guard(p_election_->lock_);
    // Periodically print the election state.
    if (ObClockGenerator::getClock() > last_dump_acceptor_info_ts_ + 3_s) {
      last_dump_acceptor_info_ts_ = ObClockGenerator::getClock();
      ELECT_LOG(INFO, "dump acceptor info", K(*this));
    }
    // When the validity of the acceptor's lease changes, print a log and report an event.
    bool lease_valid_state = !lease_.is_expired();
    if (last_record_lease_valid_state != lease_valid_state) {// recorded validity differs from the current one
      if (lease_valid_state) {// invalid -> valid: this acceptor has [possibly] witnessed the birth of a new leader
        LOG_ELECT_LEADER(INFO, "witness new leader");
      } else {// valid -> invalid: this replica may have lost its network connection to the leader
        LOG_RENEW_LEASE(WARN, "lease expired");
        p_election_->event_recorder_.report_acceptor_lease_expired_event(lease_);
        lease_.reset();
      }
    }
    // When the acceptor finds that the lease owner changed, print a log and report an event.
    if (last_record_lease_owner != lease_.get_owner()) {
      // Only a change between two valid owners is reported as a leader change.
      if (last_record_lease_owner.is_valid() && lease_.get_owner().is_valid()) {
        LOG_CHANGE_LEADER(INFO, "lease owner changed");
        p_election_->event_recorder_.report_acceptor_witness_change_leader_event(last_record_lease_owner, lease_.get_owner());
      }
      last_record_lease_owner = lease_.get_owner();
    }
    if (is_time_window_opened_) {
      ElectionPrepareResponseMsg prepare_res_accept(p_election_->get_self_addr(),
                                                    p_election_->get_ls_biggest_min_cluster_version_ever_seen_(),
                                                    highest_priority_prepare_req_);
      bool can_vote = false;
      if (last_record_lease_valid_state && !lease_valid_state) {// this task may have been deferred until lease expiry, so that the vote is cast as soon as the lease expires
        can_vote = true;
        LOG_ELECT_LEADER(INFO, "vote when lease expired");
      } else if (ObClockGenerator::getClock() - last_time_window_open_ts_ >= CALCULATE_TIME_WINDOW_SPAN_TS()) {
        can_vote = true;
      } else {
        LOG_ELECT_LEADER(INFO, "can't vote now", K(last_record_lease_valid_state),
                         K(lease_valid_state), K(CALCULATE_TIME_WINDOW_SPAN_TS()),
                         KTIME_(last_time_window_open_ts));
      }
      if (can_vote) {
        // 1. When the task fires, check whether the window has been reopened (to avoid closing
        //    a new window too early) and whether a prepare request was recorded for this round.
        if (ballot_of_time_window_ == highest_priority_prepare_req_.get_ballot_number() &&
            ballot_of_time_window_ > ballot_number_) {
          // 1.1 About to vote, so the ballot number can be advanced now.
          ballot_number_ = ballot_of_time_window_;
          // 1.2 An expired lease must be reset, because the proposer cannot judge lease validity.
          if (lease_.is_expired()) {// expired lease: reply with an empty lease (expiry can only be judged locally)
            lease_.reset();
          }
          // 1.3 Build the prepare-ok message.
          prepare_res_accept.set_accepted(ballot_number_, lease_);
          if (CLICK_FAIL(p_election_->send_(prepare_res_accept))) {
            LOG_ELECT_LEADER(ERROR, "fail to send prepare ok", K(prepare_res_accept));
          } else {
            p_election_->event_recorder_.report_vote_event(prepare_res_accept.get_receiver(), vote_reason_);
            LOG_ELECT_LEADER(INFO, "time window closed, send vote", K(prepare_res_accept));
          }
        } else {
          LOG_ELECT_LEADER(ERROR, "give up sending prepare response, casuse ballot number not match", K(prepare_res_accept));
        }
        is_time_window_opened_ = false;// close the time window once the vote decision has been made
      }
    }
    last_record_lease_valid_state = lease_valid_state;
    #undef PRINT_WRAPPER
    return false;
  });
}
// Stops the task behind time_window_task_handle_ via stop_and_wait().
// The time guard is set to 3_s here, longer than the 500_ms used by the other methods in
// this file.
void ElectionAcceptor::stop()
{
ELECT_TIME_GUARD(3_s);
time_window_task_handle_.stop_and_wait();
}
// Closes the currently open time window without voting: clears the cached highest-priority
// prepare request and the vote reason, then logs the closure under `phase`.
// Does nothing when no time window is open.
void ElectionAcceptor::reset_time_window_states_(const LogPhase phase)
{
  ELECT_TIME_GUARD(500_ms);
  #define PRINT_WRAPPER K(*this)
  if (!is_time_window_opened_) {
    return;
  }
  // The window must be closed whenever the ballot number is advanced.
  vote_reason_.reset();
  highest_priority_prepare_req_.reset();
  is_time_window_opened_ = false;
  LOG_PHASE(INFO, phase, "time window closed, not vote");
  #undef PRINT_WRAPPER
}
void ElectionAcceptor::on_prepare_request(const ElectionPrepareRequestMsg &prepare_req)
{
ELECT_TIME_GUARD(500_ms);
#define PRINT_WRAPPER KR(ret), K(prepare_req), K(*this)
CHECK_SILENCE();// 启动后的要维持一段静默时间,acceptor假装看不到任何消息,以维护lease的正确语义
int ret = OB_SUCCESS;
LogPhase phase = (prepare_req.get_role() == common::ObRole::FOLLOWER ? LogPhase::ELECT_LEADER : LogPhase::RENEW_LEASE);
LOG_PHASE(INFO, phase, "handle prepare request");
if (OB_UNLIKELY(false == p_election_->is_member_list_valid_())) {
LOG_PHASE(INFO, phase, "ignore prepare when member_list is invalid");
} else if (prepare_req.get_membership_version() < p_election_->get_membership_version_()) {
LOG_PHASE(INFO, phase, "ignore lower membership version request");
} else if (OB_LIKELY(RequestChecker::check_ballot_valid(prepare_req, this, phase))) {
// 0. 收到leader prepare的时候无须比较优先级,直接返回投票结果
if (prepare_req.get_role() == common::ObRole::LEADER) {
if (prepare_req.get_ballot_number() <= ballot_number_) {
LOG_PHASE(WARN, phase, "leader prepare message's ballot number is smaller than self");
} else {
advance_ballot_number_and_reset_related_states_(prepare_req.get_ballot_number(), phase);
ElectionPrepareResponseMsg prepare_res_accept(p_election_->get_self_addr(),
p_election_->get_ls_biggest_min_cluster_version_ever_seen_(),
prepare_req);
prepare_res_accept.set_accepted(ballot_number_, lease_);
if (CLICK_FAIL(p_election_->msg_handler_->send(prepare_res_accept))) {
LOG_PHASE(WARN, phase, "send prepare response to leader prepare failed");
} else {
LOG_PHASE(INFO, phase, "receive valid leader prepare message, send vote to him");
}
}
} else {
// 1. 遇到比时间窗口更大的ballot number的时候需要关闭当前的时间窗口
if (is_time_window_opened_ && prepare_req.get_ballot_number() > ballot_of_time_window_) {
reset_time_window_states_(phase);
}
// 2. 若时间窗口未开启,需要开启时间窗口
if (!is_time_window_opened_ && prepare_req.get_ballot_number() > ballot_of_time_window_) {
ballot_of_time_window_ = prepare_req.get_ballot_number();
highest_priority_prepare_req_.reset();
LOG_PHASE(DEBUG, phase, "advance ballot_of_time_window_");
int64_t ballot_of_time_window_when_registered = ballot_of_time_window_;
int64_t timewindow_span = 0;
if (!lease_.is_expired()) {// 当前Lease有效时,如果有效的时间超过一个最大单程消息延迟,则窗口关闭时机以Lease到期时间为准
timewindow_span = std::max(lease_.get_lease_end_ts() - get_monotonic_ts(), CALCULATE_TIME_WINDOW_SPAN_TS() / 2);
} else {// 否则视为普通的无主选举流程,窗口需要覆盖两个最大单程消息延迟
timewindow_span = CALCULATE_TIME_WINDOW_SPAN_TS();
}
if (CLICK_FAIL(time_window_task_handle_.reschedule_after(timewindow_span))) {
LOG_PHASE(ERROR, phase, "open time window failed");
} else {
is_time_window_opened_ = true;// 定时任务注册成功,打开时间窗口
last_time_window_open_ts_ = ObClockGenerator::getClock();
LOG_PHASE(INFO, phase, "open time window success", K(timewindow_span));
}
}
// 3. 成员版本号是第一优先级,在成员版本号不小于自己的基础上要比较成员版本号的大小,否则将导致分票
if (OB_SUCC(ret) && is_time_window_opened_) {// 在时间窗口内,进行计票
if (prepare_req.get_ballot_number() != ballot_of_time_window_) {
LOG_PHASE(INFO, phase, "prepare request's ballot is not same as time window, just ignore");
} else if (!highest_priority_prepare_req_.is_valid()) {
highest_priority_prepare_req_ = prepare_req;
LOG_PHASE(INFO, phase, "highest priority prepare message will be replaced casuse cached highest prioriy message is invalid");
vote_reason_.assign("the only request");
} else if (prepare_req.get_membership_version() > highest_priority_prepare_req_.get_membership_version()) {
highest_priority_prepare_req_ = prepare_req;
LOG_PHASE(INFO, phase, "highest priority prepare message will be replaced casuse new message's membership version is higher");
vote_reason_.assign("membership_version is higher");
} else if (prepare_req.get_membership_version() < highest_priority_prepare_req_.get_membership_version()) {
LOG_PHASE(INFO, phase, "prepare message's membership version not less than self, but not greater than cached highest priority prepare message");
} else {
// 4. 比较消息和缓存的最高优先级之间的高低
if (p_election_->is_rhs_message_higher_(highest_priority_prepare_req_, prepare_req, vote_reason_, true, LogPhase::ELECT_LEADER)) {
LOG_PHASE(INFO, phase, "highest priority prepare request will be replaced", K(vote_reason_));
highest_priority_prepare_req_ = prepare_req;
} else {
LOG_PHASE(INFO, phase, "ignore prepare request, cause it has lower priority", K(vote_reason_));
}
}
}
}
}
#undef PRINT_WRAPPER
}
// Handles an accept request.
//
// @param accept_req    the incoming accept request.
// @param us_to_expired optional out-parameter; when non-null and the request is accepted it
//                      receives lease_.get_lease_end_ts() - get_monotonic_ts(), computed right
//                      after the lease has been updated. It is left untouched when the request
//                      is dropped by CHECK_SILENCE or refused by the ballot check.
//
// A request with a ballot number older than the acceptor's is answered with a reject message
// by RequestChecker. Otherwise the ballot number is advanced if needed, the lease is updated
// from the request, and an accept response carrying the refreshed local priority is sent.
void ElectionAcceptor::on_accept_request(const ElectionAcceptRequestMsg &accept_req,
                                         int64_t *us_to_expired)
{
  ELECT_TIME_GUARD(500_ms);
  #define PRINT_WRAPPER KR(ret), K(accept_req), K(*this)
  CHECK_SILENCE();// after start-up a silence period is kept: the acceptor pretends not to see any message, to keep the lease semantics
  int ret = OB_SUCCESS;
  if (OB_LIKELY(RequestChecker::check_ballot_valid(accept_req,
                                                   this,
                                                   LogPhase::RENEW_LEASE))) {
    // 1. Advance the ballot number, so that the ballot number of an accepted lease never goes back.
    if (accept_req.get_ballot_number() > ballot_number_) {
      advance_ballot_number_and_reset_related_states_(accept_req.get_ballot_number(), LogPhase::RENEW_LEASE);
    }
    // 2. Update the lease unconditionally.
    lease_.update_from(accept_req);
    // The out-parameter is only written when the caller supplied one.
    if (nullptr != us_to_expired) {
      *us_to_expired = lease_.get_lease_end_ts() - get_monotonic_ts();
    }
    // 3. Build the accept-ok message.
    ElectionAcceptResponseMsg accept_res_accept(p_election_->get_self_addr(),
                                                p_election_->inner_priority_seed_,
                                                p_election_->get_membership_version_(),
                                                p_election_->get_ls_biggest_min_cluster_version_ever_seen_(),
                                                accept_req);
    // The refresh result is deliberately ignored; whatever priority is available is sent.
    (void) p_election_->refresh_priority_();
    if (CLICK_FAIL(accept_res_accept.set_accepted(ballot_number_,
                                                  p_election_->get_priority_()))) {
      LOG_RENEW_LEASE(ERROR, "fail to copy priority", K(accept_res_accept));
    } else if (CLICK_FAIL(p_election_->send_(accept_res_accept))) {
      LOG_RENEW_LEASE(ERROR, "fail to send msg", K(accept_res_accept));
    }
  }
  #undef PRINT_WRAPPER
}
// Writes a human-readable description of the acceptor into `buf` (at most `buf_len` bytes are
// made available to databuff_printf) and returns the position reached in the buffer.
// The text starts with the log stream id and own address when p_election_ is set, or with a
// "p_election:NULL" marker otherwise, followed by the ballot numbers, the lease, the time-window
// state, the vote reason, the cached highest-priority prepare request (only when it is valid)
// and finally the raw p_election_ pointer value.
// The return values of databuff_printf are not checked.
int64_t ElectionAcceptor::to_string(char *buf, const int64_t buf_len) const
{
  int64_t pos = 0;
  if (!OB_ISNULL(p_election_)) {
    common::databuff_printf(buf, buf_len, pos, "{ls_id:{id:%ld}", p_election_->id_);
    common::databuff_printf(buf, buf_len, pos, ", addr:%s", to_cstring(p_election_->get_self_addr()));
  } else {
    common::databuff_printf(buf, buf_len, pos, "{p_election:NULL");
  }
  common::databuff_printf(buf, buf_len, pos, ", ballot_number:%ld", ballot_number_);
  common::databuff_printf(buf, buf_len, pos, ", ballot_of_time_window:%ld", ballot_of_time_window_);
  common::databuff_printf(buf, buf_len, pos, ", lease:%s", to_cstring(lease_));
  common::databuff_printf(buf, buf_len, pos, ", is_time_window_opened:%s", to_cstring(is_time_window_opened_));
  common::databuff_printf(buf, buf_len, pos, ", vote_reason:%s", to_cstring(vote_reason_));
  common::databuff_printf(buf, buf_len, pos, ", last_time_window_open_ts:%s",
                          ObTime2Str::ob_timestamp_str_range<YEAR, USECOND>(last_time_window_open_ts_));
  // The cached request is printed only while it holds a valid message.
  if (highest_priority_prepare_req_.is_valid()) {
    common::databuff_printf(buf, buf_len, pos, ", highest_priority_prepare_req:%s",
                            to_cstring(highest_priority_prepare_req_));
  }
  common::databuff_printf(buf, buf_len, pos, ", p_election:0x%lx}",
                          reinterpret_cast<unsigned long>(p_election_));
  return pos;
}
}
}
}