MXS-1886 Auto-failover error tolerance

Contains changes from commit 9e68d8ec3ddf1621f533067021c4b3042f695e80
adapted to the 2.3 branch.
This commit is contained in:
Esa Korhonen 2018-06-15 15:59:35 +03:00
parent 72bfc73706
commit d3e9cc9a4f
5 changed files with 63 additions and 30 deletions

View File

@ -92,7 +92,8 @@ bool MariaDBMonitor::manual_failover(json_t** output)
}
bool rv = true;
rv = failover_check(output);
string failover_error;
rv = failover_check(&failover_error);
if (rv)
{
rv = do_failover(output);
@ -105,6 +106,11 @@ bool MariaDBMonitor::manual_failover(json_t** output)
PRINT_MXS_JSON_ERROR(output, "Failover failed.");
}
}
else
{
PRINT_MXS_JSON_ERROR(output, "Failover not performed due to the following errors: \n%s",
failover_error.c_str());
}
if (running)
{
@ -659,12 +665,6 @@ bool MariaDBMonitor::do_switchover(MariaDBServer** current_master, MariaDBServer
*/
bool MariaDBMonitor::do_failover(json_t** err_out)
{
// Topology has already been tested to be simple.
if (m_master_gtid_domain == GTID_DOMAIN_UNKNOWN)
{
PRINT_MXS_JSON_ERROR(err_out, "Cluster gtid domain is unknown. Cannot failover.");
return false;
}
// Total time limit on how long this operation may take. Checked and modified after significant steps are
// completed.
int seconds_remaining = m_failover_timeout;
@ -1262,15 +1262,24 @@ bool MariaDBMonitor::switchover_check_new(const MariaDBServer* new_master_cand,
/**
* Check that preconditions for a failover are met.
*
* @param error_out JSON error out
* @param error_out Error output
* @return True if failover may proceed
*/
bool MariaDBMonitor::failover_check(json_t** error_out)
bool MariaDBMonitor::failover_check(string* error_out)
{
// Check that there is no running master and that there is at least one running server in the cluster.
// Also, all slaves must be using gtid-replication.
// Check that there is no running master and that there is at least one running slave in the cluster.
// Also, all slaves must be using gtid-replication and the gtid-domain of the cluster must be known.
int slaves = 0;
bool error = false;
string separator;
// Topology has already been tested to be simple.
if (m_master_gtid_domain < 0)
{
*error_out += "Cluster gtid domain is unknown. This is usually caused by the cluster never having "
"a master server while MaxScale was running.";
separator = "\n";
error = true;
}
for (auto iter = m_servers.begin(); iter != m_servers.end(); iter++)
{
@ -1285,31 +1294,32 @@ bool MariaDBMonitor::failover_check(json_t** error_out)
master_up_msg += ", although in maintenance mode";
}
master_up_msg += ".";
PRINT_MXS_JSON_ERROR(error_out, "%s", master_up_msg.c_str());
*error_out += separator + master_up_msg;
separator = "\n";
error = true;
}
else if (server->is_slave())
else if (server->is_slave() && !server_is_excluded(server))
{
if (server->uses_gtid(error_out))
string gtid_error;
if (server->uses_gtid(&gtid_error))
{
slaves++;
}
else
{
*error_out += separator + gtid_error;
separator = "\n";
error = true;
}
}
}
if (error)
if (slaves == 0)
{
PRINT_MXS_JSON_ERROR(error_out, "Failover not allowed due to errors.");
*error_out += separator + "No valid slaves to promote.";
error = true;
}
else if (slaves == 0)
{
PRINT_MXS_JSON_ERROR(error_out, "No running slaves, cannot failover.");
}
return !error && slaves > 0;
return !error;
}
/**
@ -1388,10 +1398,14 @@ bool MariaDBMonitor::handle_auto_failover()
}
else if (failed_master->m_server_base->mon_err_count >= m_failcount)
{
MXS_NOTICE("Performing automatic failover to replace failed master '%s'.", failed_master->name());
failed_master->m_server_base->new_event = false;
if (failover_check(NULL))
// Failover is required, but first we should check if preconditions are met.
string error_msg;
if (failover_check(&error_msg))
{
m_warn_failover_precond = true;
MXS_NOTICE("Performing automatic failover to replace failed master '%s'.",
failed_master->name());
failed_master->m_server_base->new_event = false;
if (!do_failover(NULL))
{
const char FAILED[] = "Failed to perform failover, disabling automatic failover.";
@ -1401,8 +1415,23 @@ bool MariaDBMonitor::handle_auto_failover()
}
cluster_modified = true;
}
else
{
// Failover was not attempted because of errors, however these errors are not permanent.
// Servers were not modified, so it's ok to try this again.
if (m_warn_failover_precond)
{
MXS_WARNING("Not performing automatic failover. Will keep retrying with this message "
"suppressed. Errors: \n%s", error_msg.c_str());
m_warn_failover_precond = false;
}
}
}
}
else
{
m_warn_failover_precond = true;
}
return cluster_modified;
}
@ -1546,9 +1575,12 @@ bool MariaDBMonitor::switchover_check(SERVER* new_master, SERVER* current_master
bool gtid_ok = true;
for (auto iter = m_servers.begin(); iter != m_servers.end(); iter++)
{
if ((*iter)->is_slave() && !(*iter)->uses_gtid(error_out))
MariaDBServer* server = *iter;
string gtid_error;
if (server->is_slave() && !server->uses_gtid(&gtid_error))
{
gtid_ok = false;
PRINT_MXS_JSON_ERROR(error_out, "%s", gtid_error.c_str());
}
}

View File

@ -59,6 +59,7 @@ MariaDBMonitor::MariaDBMonitor(MXS_MONITOR* monitor)
, m_switchover_on_low_disk_space(false)
, m_warn_set_standalone_master(true)
, m_log_no_master(true)
, m_warn_failover_precond(true)
{}
MariaDBMonitor::~MariaDBMonitor()

View File

@ -158,6 +158,7 @@ private:
bool m_warn_no_valid_in_cycle; /**< Log a warning when a replication cycle has no valid master */
bool m_warn_no_valid_outside_cycle; /**< Log a warning when a replication topology has no valid master
* outside of a cycle. */
bool m_warn_failover_precond; /**< Print failover preconditions error message? */
enum slave_down_setting_t
{
@ -226,7 +227,7 @@ private:
bool handle_auto_failover();
bool failover_not_possible();
bool slave_receiving_events();
bool failover_check(json_t** error_out);
bool failover_check(std::string* error_out);
bool do_failover(json_t** err_out);
// Rejoin methods

View File

@ -520,13 +520,12 @@ json_t* MariaDBServer::diagnostics_json(bool multimaster) const
return srv;
}
bool MariaDBServer::uses_gtid(json_t** error_out)
bool MariaDBServer::uses_gtid(std::string* error_out)
{
bool using_gtid = !m_slave_status.empty() && !m_slave_status[0].gtid_io_pos.empty();
if (!using_gtid)
{
string slave_not_gtid_msg = string("Slave server ") + name() + " is not using gtid replication.";
PRINT_MXS_JSON_ERROR(error_out, "%s", slave_not_gtid_msg.c_str());
*error_out = string("Slave server ") + name() + " is not using gtid replication.";
}
return using_gtid;
}

View File

@ -298,7 +298,7 @@ public:
* @return True if using gtid-replication. False if not, or if server is not a slave or otherwise does
* not have a gtid_IO_Pos.
*/
bool uses_gtid(json_t** error_out);
bool uses_gtid(std::string* error_out);
/**
* Update replication settings, gtid:s and slave status of the server.