MXS-1886 Better auto-rejoin error description and tolerance
Contains changes from commit 09df01752812444c6e7c409a8957d292f7de63cf adapted to the 2.3 branch.
This commit is contained in:
parent
d3e9cc9a4f
commit
019d62bbb8
@ -146,7 +146,8 @@ bool MariaDBMonitor::manual_rejoin(SERVER* rejoin_server, json_t** output)
|
||||
{
|
||||
if (m_master->update_gtids())
|
||||
{
|
||||
if (slave_cand->can_replicate_from(m_master))
|
||||
string no_rejoin_reason;
|
||||
if (slave_cand->can_replicate_from(m_master, &no_rejoin_reason))
|
||||
{
|
||||
ServerArray joinable_server;
|
||||
joinable_server.push_back(slave_cand);
|
||||
@ -162,9 +163,9 @@ bool MariaDBMonitor::manual_rejoin(SERVER* rejoin_server, json_t** output)
|
||||
}
|
||||
else
|
||||
{
|
||||
PRINT_MXS_JSON_ERROR(output, "Server '%s' cannot replicate from cluster master '%s' "
|
||||
"or it could not be queried.", rejoin_serv_name,
|
||||
m_master->name());
|
||||
PRINT_MXS_JSON_ERROR(output, "Server '%s' cannot replicate from cluster master '%s': "
|
||||
"%s.", rejoin_serv_name, m_master->name(),
|
||||
no_rejoin_reason.c_str());
|
||||
}
|
||||
}
|
||||
else
|
||||
@ -398,10 +399,20 @@ bool MariaDBMonitor::get_joinable_servers(ServerArray* output)
|
||||
{
|
||||
for (size_t i = 0; i < suspects.size(); i++)
|
||||
{
|
||||
if (suspects[i]->can_replicate_from(m_master))
|
||||
string rejoin_err_msg;
|
||||
if (suspects[i]->can_replicate_from(m_master, &rejoin_err_msg))
|
||||
{
|
||||
output->push_back(suspects[i]);
|
||||
}
|
||||
else if (m_warn_cannot_rejoin)
|
||||
{
|
||||
// Print a message explaining why an auto-rejoin is not done. Suppress printing.
|
||||
MXS_WARNING("Automatic rejoin was not attempted on server '%s' even though it is a "
|
||||
"valid candidate. Will keep retrying with this message suppressed for all "
|
||||
"servers. Errors: \n%s",
|
||||
suspects[i]->name(), rejoin_err_msg.c_str());
|
||||
m_warn_cannot_rejoin = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
@ -409,6 +420,10 @@ bool MariaDBMonitor::get_joinable_servers(ServerArray* output)
|
||||
comm_ok = false;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
m_warn_cannot_rejoin = true;
|
||||
}
|
||||
return comm_ok;
|
||||
}
|
||||
|
||||
|
@ -60,6 +60,7 @@ MariaDBMonitor::MariaDBMonitor(MXS_MONITOR* monitor)
|
||||
, m_warn_set_standalone_master(true)
|
||||
, m_log_no_master(true)
|
||||
, m_warn_failover_precond(true)
|
||||
, m_warn_cannot_rejoin(true)
|
||||
{}
|
||||
|
||||
MariaDBMonitor::~MariaDBMonitor()
|
||||
@ -670,14 +671,6 @@ void MariaDBMonitor::handle_auto_rejoin()
|
||||
MXS_NOTICE("%d server(s) redirected or rejoined the cluster.", joins);
|
||||
m_cluster_modified = true;
|
||||
}
|
||||
if (joins < joinable_servers.size())
|
||||
{
|
||||
MXS_ERROR("A cluster join operation failed, disabling automatic rejoining. "
|
||||
"To re-enable, manually set '%s' to 'true' for monitor '%s' via MaxAdmin or "
|
||||
"the REST API.", CN_AUTO_REJOIN, m_monitor->name);
|
||||
m_auto_rejoin = false;
|
||||
disable_setting(CN_AUTO_REJOIN);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -159,6 +159,7 @@ private:
|
||||
bool m_warn_no_valid_outside_cycle; /**< Log a warning when a replication topology has no valid master
|
||||
* outside of a cycle. */
|
||||
bool m_warn_failover_precond; /**< Print failover preconditions error message? */
|
||||
bool m_warn_cannot_rejoin; /**< Print warning if auto_rejoin fails because of invalid gtid:s? */
|
||||
|
||||
enum slave_down_setting_t
|
||||
{
|
||||
|
@ -537,12 +537,33 @@ bool MariaDBServer::update_slave_info()
|
||||
update_gtids() && do_show_slave_status());
|
||||
}
|
||||
|
||||
bool MariaDBServer::can_replicate_from(MariaDBServer* master)
|
||||
bool MariaDBServer::can_replicate_from(MariaDBServer* master, string* error_out)
|
||||
{
|
||||
bool rval = false;
|
||||
if (update_gtids())
|
||||
{
|
||||
rval = m_gtid_current_pos.can_replicate_from(master->m_gtid_binlog_pos);
|
||||
if (m_gtid_current_pos.empty())
|
||||
{
|
||||
*error_out = string("'") + name() + "' does not have a valid 'gtid_current_pos'.";
|
||||
}
|
||||
else if (master->m_gtid_binlog_pos.empty())
|
||||
{
|
||||
*error_out = string("'") + master->name() + "' does not have a valid 'gtid_binlog_pos'.";
|
||||
}
|
||||
else
|
||||
{
|
||||
rval = m_gtid_current_pos.can_replicate_from(master->m_gtid_binlog_pos);
|
||||
if (!rval)
|
||||
{
|
||||
*error_out = string("gtid_current_pos of '") + name() + "' (" +
|
||||
m_gtid_current_pos.to_string() + ") is incompatible with gtid_binlog_pos of '" +
|
||||
master->name() + "' (" + master->m_gtid_binlog_pos.to_string() + ").";
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
*error_out = string("Server '") + name() + "' could not be queried.";
|
||||
}
|
||||
return rval;
|
||||
}
|
||||
|
@ -313,9 +313,10 @@ public:
|
||||
* The non-detected errors will mostly be detected once the slave tries to start replicating.
|
||||
*
|
||||
* @param master_info Master server
|
||||
* @param error_out Details the reason for a negative result
|
||||
* @return True if slave can replicate from master
|
||||
*/
|
||||
bool can_replicate_from(MariaDBServer* master);
|
||||
bool can_replicate_from(MariaDBServer* master, std::string* error_out);
|
||||
|
||||
/**
|
||||
* Redirect one slave server to another master
|
||||
|
Loading…
x
Reference in New Issue
Block a user