MXS-1886 Better auto-rejoin error description and tolerance
Contains changes from commit 09df01752812444c6e7c409a8957d292f7de63cf adapted to the 2.3 branch.
This commit is contained in:
@ -146,7 +146,8 @@ bool MariaDBMonitor::manual_rejoin(SERVER* rejoin_server, json_t** output)
|
|||||||
{
|
{
|
||||||
if (m_master->update_gtids())
|
if (m_master->update_gtids())
|
||||||
{
|
{
|
||||||
if (slave_cand->can_replicate_from(m_master))
|
string no_rejoin_reason;
|
||||||
|
if (slave_cand->can_replicate_from(m_master, &no_rejoin_reason))
|
||||||
{
|
{
|
||||||
ServerArray joinable_server;
|
ServerArray joinable_server;
|
||||||
joinable_server.push_back(slave_cand);
|
joinable_server.push_back(slave_cand);
|
||||||
@ -162,9 +163,9 @@ bool MariaDBMonitor::manual_rejoin(SERVER* rejoin_server, json_t** output)
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
PRINT_MXS_JSON_ERROR(output, "Server '%s' cannot replicate from cluster master '%s' "
|
PRINT_MXS_JSON_ERROR(output, "Server '%s' cannot replicate from cluster master '%s': "
|
||||||
"or it could not be queried.", rejoin_serv_name,
|
"%s.", rejoin_serv_name, m_master->name(),
|
||||||
m_master->name());
|
no_rejoin_reason.c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@ -398,10 +399,20 @@ bool MariaDBMonitor::get_joinable_servers(ServerArray* output)
|
|||||||
{
|
{
|
||||||
for (size_t i = 0; i < suspects.size(); i++)
|
for (size_t i = 0; i < suspects.size(); i++)
|
||||||
{
|
{
|
||||||
if (suspects[i]->can_replicate_from(m_master))
|
string rejoin_err_msg;
|
||||||
|
if (suspects[i]->can_replicate_from(m_master, &rejoin_err_msg))
|
||||||
{
|
{
|
||||||
output->push_back(suspects[i]);
|
output->push_back(suspects[i]);
|
||||||
}
|
}
|
||||||
|
else if (m_warn_cannot_rejoin)
|
||||||
|
{
|
||||||
|
// Print a message explaining why an auto-rejoin is not done, then suppress further printing of this warning.
|
||||||
|
MXS_WARNING("Automatic rejoin was not attempted on server '%s' even though it is a "
|
||||||
|
"valid candidate. Will keep retrying with this message suppressed for all "
|
||||||
|
"servers. Errors: \n%s",
|
||||||
|
suspects[i]->name(), rejoin_err_msg.c_str());
|
||||||
|
m_warn_cannot_rejoin = false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@ -409,6 +420,10 @@ bool MariaDBMonitor::get_joinable_servers(ServerArray* output)
|
|||||||
comm_ok = false;
|
comm_ok = false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
m_warn_cannot_rejoin = true;
|
||||||
|
}
|
||||||
return comm_ok;
|
return comm_ok;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -60,6 +60,7 @@ MariaDBMonitor::MariaDBMonitor(MXS_MONITOR* monitor)
|
|||||||
, m_warn_set_standalone_master(true)
|
, m_warn_set_standalone_master(true)
|
||||||
, m_log_no_master(true)
|
, m_log_no_master(true)
|
||||||
, m_warn_failover_precond(true)
|
, m_warn_failover_precond(true)
|
||||||
|
, m_warn_cannot_rejoin(true)
|
||||||
{}
|
{}
|
||||||
|
|
||||||
MariaDBMonitor::~MariaDBMonitor()
|
MariaDBMonitor::~MariaDBMonitor()
|
||||||
@ -670,14 +671,6 @@ void MariaDBMonitor::handle_auto_rejoin()
|
|||||||
MXS_NOTICE("%d server(s) redirected or rejoined the cluster.", joins);
|
MXS_NOTICE("%d server(s) redirected or rejoined the cluster.", joins);
|
||||||
m_cluster_modified = true;
|
m_cluster_modified = true;
|
||||||
}
|
}
|
||||||
if (joins < joinable_servers.size())
|
|
||||||
{
|
|
||||||
MXS_ERROR("A cluster join operation failed, disabling automatic rejoining. "
|
|
||||||
"To re-enable, manually set '%s' to 'true' for monitor '%s' via MaxAdmin or "
|
|
||||||
"the REST API.", CN_AUTO_REJOIN, m_monitor->name);
|
|
||||||
m_auto_rejoin = false;
|
|
||||||
disable_setting(CN_AUTO_REJOIN);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
|||||||
@ -159,6 +159,7 @@ private:
|
|||||||
bool m_warn_no_valid_outside_cycle; /**< Log a warning when a replication topology has no valid master
|
bool m_warn_no_valid_outside_cycle; /**< Log a warning when a replication topology has no valid master
|
||||||
* outside of a cycle. */
|
* outside of a cycle. */
|
||||||
bool m_warn_failover_precond; /**< Print failover preconditions error message? */
|
bool m_warn_failover_precond; /**< Print failover preconditions error message? */
|
||||||
|
bool m_warn_cannot_rejoin; /**< Print warning if auto_rejoin fails because of invalid GTIDs? */
|
||||||
|
|
||||||
enum slave_down_setting_t
|
enum slave_down_setting_t
|
||||||
{
|
{
|
||||||
|
|||||||
@ -537,12 +537,33 @@ bool MariaDBServer::update_slave_info()
|
|||||||
update_gtids() && do_show_slave_status());
|
update_gtids() && do_show_slave_status());
|
||||||
}
|
}
|
||||||
|
|
||||||
bool MariaDBServer::can_replicate_from(MariaDBServer* master)
|
bool MariaDBServer::can_replicate_from(MariaDBServer* master, string* error_out)
|
||||||
{
|
{
|
||||||
bool rval = false;
|
bool rval = false;
|
||||||
if (update_gtids())
|
if (update_gtids())
|
||||||
{
|
{
|
||||||
rval = m_gtid_current_pos.can_replicate_from(master->m_gtid_binlog_pos);
|
if (m_gtid_current_pos.empty())
|
||||||
|
{
|
||||||
|
*error_out = string("'") + name() + "' does not have a valid 'gtid_current_pos'.";
|
||||||
|
}
|
||||||
|
else if (master->m_gtid_binlog_pos.empty())
|
||||||
|
{
|
||||||
|
*error_out = string("'") + master->name() + "' does not have a valid 'gtid_binlog_pos'.";
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
rval = m_gtid_current_pos.can_replicate_from(master->m_gtid_binlog_pos);
|
||||||
|
if (!rval)
|
||||||
|
{
|
||||||
|
*error_out = string("gtid_current_pos of '") + name() + "' (" +
|
||||||
|
m_gtid_current_pos.to_string() + ") is incompatible with gtid_binlog_pos of '" +
|
||||||
|
master->name() + "' (" + master->m_gtid_binlog_pos.to_string() + ").";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
*error_out = string("Server '") + name() + "' could not be queried.";
|
||||||
}
|
}
|
||||||
return rval;
|
return rval;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -313,9 +313,10 @@ public:
|
|||||||
* The non-detected errors will mostly be detected once the slave tries to start replicating.
|
* The non-detected errors will mostly be detected once the slave tries to start replicating.
|
||||||
*
|
*
|
||||||
* @param master_info Master server
|
* @param master_info Master server
|
||||||
|
* @param error_out Details the reason for a negative result
|
||||||
* @return True if slave can replicate from master
|
* @return True if slave can replicate from master
|
||||||
*/
|
*/
|
||||||
bool can_replicate_from(MariaDBServer* master);
|
bool can_replicate_from(MariaDBServer* master, std::string* error_out);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Redirect one slave server to another master
|
* Redirect one slave server to another master
|
||||||
|
|||||||
Reference in New Issue
Block a user