A master which is down for longer than failcount is considered an invalid master

If auto_failover is disabled and an alternative master exists, the
monitor will swap the master. This may break replication, but the
situation requires that the dba has set up a cluster with multiple
masters.
This commit is contained in:
Esa Korhonen 2018-07-19 19:23:16 +03:00
parent c9570ff616
commit 382a017518
3 changed files with 88 additions and 38 deletions

View File

@ -20,6 +20,7 @@
#include <algorithm>
using std::string;
using maxscale::string_printf;
static bool check_replicate_ignore_table(MXS_MONITORED_SERVER* database);
static bool check_replicate_do_table(MXS_MONITORED_SERVER* database);
@ -783,7 +784,7 @@ void MariaDBMonitor::assign_slave_and_relay_master(MariaDBServer* node)
/**
* Is the current master server still valid or should a new one be selected?
*
* @param reason_out Output for a text description
* @param reason_out If master is not valid, the reason is printed here.
* @return True, if master is ok. False if the current master has changed in a way that
* a new master should be selected.
*/
@ -791,8 +792,8 @@ bool MariaDBMonitor::master_is_valid(std::string* reason_out)
{
// The master server of the cluster needs to be re-calculated in the following four cases:
bool rval = true;
// 1) There is no master.
if (m_master == NULL || m_master->is_down())
// 1) There is no master. This typically only applies when MaxScale is first ran.
if (m_master == NULL)
{
rval = false;
}
@ -802,6 +803,13 @@ bool MariaDBMonitor::master_is_valid(std::string* reason_out)
rval = false;
*reason_out = "it is in read-only mode";
}
// 3) The master has been down for failcount iterations and auto_failover is not on.
else if (m_master->is_down() && !m_auto_failover && m_master->m_server_base->mon_err_count >= m_failcount)
{
rval = false;
*reason_out = string_printf("it has been down over %d (failcount) monitor updates and failover "
"is not on", m_failcount);
}
// 3) The master was a non-replicating master (not in a cycle) but now has a slave connection.
else if (m_master_cycle_status.cycle_id == NodeData::CYCLE_NONE)
{
@ -897,51 +905,87 @@ void MariaDBMonitor::update_topology()
// Find the server that looks like it would be the best master. It does not yet overwrite the
// current master.
string topology_messages;
MariaDBServer* root_master = find_topology_master_server(&topology_messages);
MariaDBServer* master_candidate = find_topology_master_server(&topology_messages);
// If the 'master_candidate' is a valid server but different from the current master,
// a change may be necessary. It will only happen if the current master is no longer usable.
bool have_better = (master_candidate && master_candidate != m_master);
// Check if current master is still valid.
string reason;
if (master_is_valid(&reason))
string reason_not_valid;
bool current_is_ok = master_is_valid(&reason_not_valid);
if (current_is_ok)
{
// Update master cycle info in case it has changed
m_warn_current_master_invalid = true;
// Update master cycle info in case it has changed.
update_master_cycle_info();
if (root_master && m_master != root_master)
if (have_better)
{
// Master is still valid but it is no longer the best master. Print a warning.
MXS_WARNING("'%s' is a better master candidate than the current master '%s'. "
"Master will change if '%s' is no longer a valid master.",
root_master->name(), m_master->name(), m_master->name());
// Master is still valid but it is no longer the best master. Print a warning. This
// may be a continuous situation so only print once.
if (m_warn_have_better_master)
{
MXS_WARNING("'%s' is a better master candidate than the current master '%s'. "
"Master will change when '%s' is no longer a valid master.",
master_candidate->name(), m_master->name(), m_master->name());
m_warn_have_better_master = false;
}
}
}
else
{
if (m_master && !reason.empty())
// Current master is faulty or does not exist
m_warn_have_better_master = true;
if (have_better)
{
MXS_WARNING("The previous master server '%s' is no longer a valid master because %s. "
"Selecting new master.", m_master->name(), reason.c_str());
// We have an alternative. Swap master. The messages give the impression
// that new master selection has not yet happened, but this is just for clarity.
const char sel_new_master[] = "Selecting new master server.";
if (m_master)
{
ss_dassert(!reason_not_valid.empty());
MXS_WARNING("The current master server '%s' is no longer valid because %s. %s",
m_master->name(), reason_not_valid.c_str(), sel_new_master);
}
else
{
// This typically happens only when starting from scratch.
MXS_NOTICE("%s", sel_new_master);
}
// At this point, print messages explaining why any/other possible master servers weren't picked.
if (!topology_messages.empty())
{
MXS_WARNING("%s", topology_messages.c_str());
}
MXS_NOTICE("Setting '%s' as master.", master_candidate->name());
// Change the master, even though this may break replication.
assign_new_master(master_candidate);
}
else
{
MXS_NOTICE("Selecting master server.");
}
// No alternative master. Keep current status and print warnings.
// This situation may stick so only print the messages once.
if (m_warn_current_master_invalid)
{
if (m_master)
{
ss_dassert(!reason_not_valid.empty());
MXS_WARNING("The current master server '%s' is no longer valid because %s, "
"but there is no better alternative to swap to.",
m_master->name(), reason_not_valid.c_str());
}
else
{
MXS_WARNING("No valid master server found.");
}
// The current master is no longer ok (or it never was). Change the master, even though this may
// break replication. Master changes are logged by the log_master_changes()-method.
if (!topology_messages.empty())
{
MXS_WARNING("%s", topology_messages.c_str());
}
assign_new_master(root_master);
if (m_master)
{
MXS_NOTICE("'%s' is the best master candidate.", m_master->name());
}
else
{
MXS_WARNING("No valid master servers found.");
if (!topology_messages.empty())
{
MXS_WARNING("%s", topology_messages.c_str());
}
m_warn_current_master_invalid = false;
}
}
}
m_cluster_topology_changed = false;
}

View File

@ -67,6 +67,8 @@ MariaDBMonitor::MariaDBMonitor(MXS_MONITOR* monitor)
, m_log_no_master(true)
, m_warn_failover_precond(true)
, m_warn_cannot_rejoin(true)
, m_warn_current_master_invalid(true)
, m_warn_have_better_master(true)
{}
MariaDBMonitor::~MariaDBMonitor()
@ -485,11 +487,11 @@ void MariaDBMonitor::tick()
}
}
if (m_cluster_topology_changed)
// Topology needs to be rechecked if it has changed or if master is down.
if (m_cluster_topology_changed || (m_master && m_master->is_down()))
{
// This means that a server id or a slave connection has changed, or read_only was set.
// Various things need to be checked and updated.
update_topology();
m_cluster_topology_changed = false;
}
// Always re-assign master, slave etc bits as these depend on other factors outside topology
@ -740,6 +742,8 @@ void MariaDBMonitor::assign_new_master(MariaDBServer* new_master)
{
m_master = new_master;
update_master_cycle_info();
m_warn_current_master_invalid = true;
m_warn_have_better_master = true;
}
/**

View File

@ -179,6 +179,8 @@ private:
* outside of a cycle. */
bool m_warn_failover_precond; /**< Print failover preconditions error message? */
bool m_warn_cannot_rejoin; /**< Print warning if auto_rejoin fails because of invalid gtid:s? */
bool m_warn_current_master_invalid; /**< Print warning if current master is not valid? */
bool m_warn_have_better_master; /**< Print warning if the current master is not the best one? */
// Base methods
MariaDBMonitor(MXS_MONITOR* monitor_base);