MXS-1775 Check disk space warning bit when selecting a new master for failover

This also applies to autoselect switchover. The disk space warning has the least
priority, as the other criteria could lead to replication failures. Also, print
the reason the new master was selected over the second best candidate.
This commit is contained in:
Esa Korhonen
2018-06-14 11:32:34 +03:00
parent 019d62bbb8
commit 58207ec414
2 changed files with 49 additions and 14 deletions

View File

@ -1071,6 +1071,7 @@ MariaDBServer* MariaDBMonitor::select_new_master(ServerArray* slaves_out, json_t
/* Select a new master candidate. Selects the one with the latest event in relay log. /* Select a new master candidate. Selects the one with the latest event in relay log.
* If multiple slaves have same number of events, select the one with most processed events. */ * If multiple slaves have same number of events, select the one with most processed events. */
MariaDBServer* current_best = NULL; MariaDBServer* current_best = NULL;
string current_best_reason;
// Servers that cannot be selected because of exclusion, but seem otherwise ok. // Servers that cannot be selected because of exclusion, but seem otherwise ok.
ServerArray valid_but_excluded; ServerArray valid_but_excluded;
// Index of the current best candidate in slaves_out // Index of the current best candidate in slaves_out
@ -1097,7 +1098,8 @@ MariaDBServer* MariaDBMonitor::select_new_master(ServerArray* slaves_out, json_t
else if (cand->check_replication_settings()) else if (cand->check_replication_settings())
{ {
// If no new master yet, accept any valid candidate. Otherwise check. // If no new master yet, accept any valid candidate. Otherwise check.
if (current_best == NULL || is_candidate_better(current_best, cand, m_master_gtid_domain)) if (current_best == NULL ||
is_candidate_better(current_best, cand, m_master_gtid_domain, &current_best_reason))
{ {
// The server has been selected for promotion, for now. // The server has been selected for promotion, for now.
current_best = cand; current_best = cand;
@ -1142,6 +1144,12 @@ MariaDBServer* MariaDBMonitor::select_new_master(ServerArray* slaves_out, json_t
{ {
PRINT_MXS_JSON_ERROR(err_out, "No suitable promotion candidate found."); PRINT_MXS_JSON_ERROR(err_out, "No suitable promotion candidate found.");
} }
else if (!current_best_reason.empty())
{
// If there was a specific reason this server was selected, print it now. It's possible that all
// were equally good, in that case no need to print.
MXS_NOTICE("%s", current_best_reason.c_str());
}
return current_best; return current_best;
} }
@ -1169,37 +1177,64 @@ bool MariaDBMonitor::server_is_excluded(const MariaDBServer* server)
* @param current_best_info Server info of current best choice * @param current_best_info Server info of current best choice
* @param candidate_info Server info of new candidate * @param candidate_info Server info of new candidate
* @param gtid_domain Which domain to compare * @param gtid_domain Which domain to compare
* @param reason_out Why is the candidate better than current_best
* @return True if candidate is better * @return True if candidate is better
*/ */
bool MariaDBMonitor::is_candidate_better(const MariaDBServer* current_best, const MariaDBServer* candidate, bool MariaDBMonitor::is_candidate_better(const MariaDBServer* current_best, const MariaDBServer* candidate,
uint32_t gtid_domain) uint32_t gtid_domain, std::string* reason_out)
{ {
uint64_t cand_io = candidate->m_slave_status[0].gtid_io_pos.get_gtid(gtid_domain).m_sequence; string reason = string("'") + candidate->name() + "' is the best candidate because it ";
uint64_t cand_processed = candidate->m_gtid_current_pos.get_gtid(gtid_domain).m_sequence;
uint64_t curr_io = current_best->m_slave_status[0].gtid_io_pos.get_gtid(gtid_domain).m_sequence;
uint64_t curr_processed = current_best->m_gtid_current_pos.get_gtid(gtid_domain).m_sequence;
bool cand_updates = candidate->m_rpl_settings.log_slave_updates;
bool curr_updates = current_best->m_rpl_settings.log_slave_updates;
bool is_better = false; bool is_better = false;
// Accept a slave with a later event in relay log. uint64_t cand_io = candidate->m_slave_status[0].gtid_io_pos.get_gtid(gtid_domain).m_sequence;
uint64_t curr_io = current_best->m_slave_status[0].gtid_io_pos.get_gtid(gtid_domain).m_sequence;
// A slave with a later event in relay log is always preferred.
if (cand_io > curr_io) if (cand_io > curr_io)
{ {
is_better = true; is_better = true;
reason += "has received more events.";
} }
// If io sequences are identical, the slave with more events processed wins. // If io sequences are identical ...
else if (cand_io == curr_io) else if (cand_io == curr_io)
{ {
uint64_t cand_processed = candidate->m_gtid_current_pos.get_gtid(gtid_domain).m_sequence;
uint64_t curr_processed = current_best->m_gtid_current_pos.get_gtid(gtid_domain).m_sequence;
// ... the slave with more events processed wins.
if (cand_processed > curr_processed) if (cand_processed > curr_processed)
{ {
is_better = true; is_better = true;
reason += "has processed more events.";
} }
// Finally, if binlog positions are identical, prefer a slave with log_slave_updates. // If gtid positions are identical ...
else if (cand_processed == curr_processed && cand_updates && !curr_updates) else if (cand_processed == curr_processed)
{ {
is_better = true; bool cand_updates = candidate->m_rpl_settings.log_slave_updates;
bool curr_updates = current_best->m_rpl_settings.log_slave_updates;
// ... prefer a slave with log_slave_updates.
if (cand_updates && !curr_updates)
{
is_better = true;
reason += "has 'log_slave_updates' on.";
}
// If both have log_slave_updates on ...
else if (cand_updates && curr_updates)
{
bool cand_disk_ok = !SERVER_IS_DISK_SPACE_EXHAUSTED(candidate->m_server_base->server);
bool curr_disk_ok = !SERVER_IS_DISK_SPACE_EXHAUSTED(current_best->m_server_base->server);
// ... prefer a slave without disk space issues.
if (cand_disk_ok && !curr_disk_ok)
{
is_better = true;
reason += "is not low on disk space.";
}
}
} }
} }
if (reason_out && is_better)
{
*reason_out = reason;
}
return is_better; return is_better;
} }

View File

@ -242,7 +242,7 @@ private:
MariaDBServer* select_new_master(ServerArray* slaves_out, json_t** err_out); MariaDBServer* select_new_master(ServerArray* slaves_out, json_t** err_out);
bool server_is_excluded(const MariaDBServer* server); bool server_is_excluded(const MariaDBServer* server);
bool is_candidate_better(const MariaDBServer* current_best, const MariaDBServer* candidate, bool is_candidate_better(const MariaDBServer* current_best, const MariaDBServer* candidate,
uint32_t gtid_domain); uint32_t gtid_domain, std::string* reason_out = NULL);
bool promote_new_master(MariaDBServer* new_master, json_t** err_out); bool promote_new_master(MariaDBServer* new_master, json_t** err_out);
int redirect_slaves(MariaDBServer* new_master, const ServerArray& slaves, int redirect_slaves(MariaDBServer* new_master, const ServerArray& slaves,
ServerArray* redirected_slaves); ServerArray* redirected_slaves);