MXS-1886 Better auto-rejoin error description and tolerance

Contains changes from commit 09df01752812444c6e7c409a8957d292f7de63cf
adapted to the 2.3 branch.
This commit is contained in:
Esa Korhonen 2018-06-15 17:29:00 +03:00
parent d3e9cc9a4f
commit 019d62bbb8
5 changed files with 47 additions and 16 deletions

View File

@ -146,7 +146,8 @@ bool MariaDBMonitor::manual_rejoin(SERVER* rejoin_server, json_t** output)
{
if (m_master->update_gtids())
{
if (slave_cand->can_replicate_from(m_master))
string no_rejoin_reason;
if (slave_cand->can_replicate_from(m_master, &no_rejoin_reason))
{
ServerArray joinable_server;
joinable_server.push_back(slave_cand);
@ -162,9 +163,9 @@ bool MariaDBMonitor::manual_rejoin(SERVER* rejoin_server, json_t** output)
}
else
{
PRINT_MXS_JSON_ERROR(output, "Server '%s' cannot replicate from cluster master '%s' "
"or it could not be queried.", rejoin_serv_name,
m_master->name());
PRINT_MXS_JSON_ERROR(output, "Server '%s' cannot replicate from cluster master '%s': "
"%s.", rejoin_serv_name, m_master->name(),
no_rejoin_reason.c_str());
}
}
else
@ -398,10 +399,20 @@ bool MariaDBMonitor::get_joinable_servers(ServerArray* output)
{
for (size_t i = 0; i < suspects.size(); i++)
{
if (suspects[i]->can_replicate_from(m_master))
string rejoin_err_msg;
if (suspects[i]->can_replicate_from(m_master, &rejoin_err_msg))
{
output->push_back(suspects[i]);
}
else if (m_warn_cannot_rejoin)
{
// Print a message explaining why an auto-rejoin was not attempted, then suppress repeats of it.
MXS_WARNING("Automatic rejoin was not attempted on server '%s' even though it is a "
"valid candidate. Will keep retrying with this message suppressed for all "
"servers. Errors: \n%s",
suspects[i]->name(), rejoin_err_msg.c_str());
m_warn_cannot_rejoin = false;
}
}
}
else
@ -409,6 +420,10 @@ bool MariaDBMonitor::get_joinable_servers(ServerArray* output)
comm_ok = false;
}
}
else
{
m_warn_cannot_rejoin = true;
}
return comm_ok;
}

View File

@ -60,6 +60,7 @@ MariaDBMonitor::MariaDBMonitor(MXS_MONITOR* monitor)
, m_warn_set_standalone_master(true)
, m_log_no_master(true)
, m_warn_failover_precond(true)
, m_warn_cannot_rejoin(true)
{}
MariaDBMonitor::~MariaDBMonitor()
@ -670,14 +671,6 @@ void MariaDBMonitor::handle_auto_rejoin()
MXS_NOTICE("%d server(s) redirected or rejoined the cluster.", joins);
m_cluster_modified = true;
}
if (joins < joinable_servers.size())
{
MXS_ERROR("A cluster join operation failed, disabling automatic rejoining. "
"To re-enable, manually set '%s' to 'true' for monitor '%s' via MaxAdmin or "
"the REST API.", CN_AUTO_REJOIN, m_monitor->name);
m_auto_rejoin = false;
disable_setting(CN_AUTO_REJOIN);
}
}
else
{

View File

@ -159,6 +159,7 @@ private:
bool m_warn_no_valid_outside_cycle; /**< Log a warning when a replication topology has no valid master
* outside of a cycle. */
bool m_warn_failover_precond; /**< Print failover preconditions error message? */
bool m_warn_cannot_rejoin; /**< Print a warning when auto_rejoin is not attempted because a candidate cannot replicate from the master (e.g. invalid GTIDs)? */
enum slave_down_setting_t
{

View File

@ -537,12 +537,33 @@ bool MariaDBServer::update_slave_info()
update_gtids() && do_show_slave_status());
}
bool MariaDBServer::can_replicate_from(MariaDBServer* master)
bool MariaDBServer::can_replicate_from(MariaDBServer* master, string* error_out)
{
bool rval = false;
if (update_gtids())
{
rval = m_gtid_current_pos.can_replicate_from(master->m_gtid_binlog_pos);
if (m_gtid_current_pos.empty())
{
*error_out = string("'") + name() + "' does not have a valid 'gtid_current_pos'.";
}
else if (master->m_gtid_binlog_pos.empty())
{
*error_out = string("'") + master->name() + "' does not have a valid 'gtid_binlog_pos'.";
}
else
{
rval = m_gtid_current_pos.can_replicate_from(master->m_gtid_binlog_pos);
if (!rval)
{
*error_out = string("gtid_current_pos of '") + name() + "' (" +
m_gtid_current_pos.to_string() + ") is incompatible with gtid_binlog_pos of '" +
master->name() + "' (" + master->m_gtid_binlog_pos.to_string() + ").";
}
}
}
else
{
*error_out = string("Server '") + name() + "' could not be queried.";
}
return rval;
}

View File

@ -313,9 +313,10 @@ public:
* The non-detected errors will mostly be detected once the slave tries to start replicating.
*
* @param master Master server
* @param error_out Details the reason for a negative result
* @return True if slave can replicate from master
*/
bool can_replicate_from(MariaDBServer* master);
bool can_replicate_from(MariaDBServer* master, std::string* error_out);
/**
* Redirect one slave server to another master