MXS-1886 Better auto-rejoin error description and tolerance
Auto-rejoin now explains more accurately if a server cannot be joined due to conflicting gtid. Also, auto-rejoin is no longer disabled if a join fails. Usually the fail is due to the server not replying fast enough with query completion. The query is often completed anyways. This can lead to some log spam.
This commit is contained in:
parent
9e68d8ec3d
commit
09df017528
@ -125,7 +125,8 @@ static void disable_setting(MYSQL_MONITOR* mon, const char* setting);
|
||||
static bool cluster_can_be_joined(MYSQL_MONITOR* mon);
|
||||
static bool can_replicate_from(MYSQL_MONITOR* mon,
|
||||
MXS_MONITORED_SERVER* slave, MySqlServerInfo* slave_info,
|
||||
MXS_MONITORED_SERVER* master, MySqlServerInfo* master_info);
|
||||
MXS_MONITORED_SERVER* master, MySqlServerInfo* master_info,
|
||||
string* err_msg);
|
||||
static bool wait_cluster_stabilization(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* new_master,
|
||||
const ServerVector& slaves, int seconds_remaining);
|
||||
static string get_connection_errors(const ServerVector& servers);
|
||||
@ -761,7 +762,9 @@ bool mysql_rejoin(MXS_MONITOR* mon, SERVER* rejoin_server, json_t** output)
|
||||
{
|
||||
if (update_gtids(handle, master, master_info))
|
||||
{
|
||||
if (can_replicate_from(handle, mon_server, server_info, master, master_info))
|
||||
string no_rejoin_reason;
|
||||
if (can_replicate_from(handle, mon_server, server_info, master, master_info,
|
||||
&no_rejoin_reason))
|
||||
{
|
||||
ServerVector joinable_server;
|
||||
joinable_server.push_back(mon_server);
|
||||
@ -777,8 +780,8 @@ bool mysql_rejoin(MXS_MONITOR* mon, SERVER* rejoin_server, json_t** output)
|
||||
}
|
||||
else
|
||||
{
|
||||
PRINT_MXS_JSON_ERROR(output, "Server '%s' cannot replicate from cluster master '%s' "
|
||||
"or it could not be queried.", rejoin_serv_name, master_name);
|
||||
PRINT_MXS_JSON_ERROR(output, "Server '%s' cannot replicate from cluster master '%s': "
|
||||
"%s", rejoin_serv_name, master_name, no_rejoin_reason.c_str());
|
||||
}
|
||||
}
|
||||
else
|
||||
@ -1101,6 +1104,7 @@ startMonitor(MXS_MONITOR *monitor, const MXS_CONFIG_PARAMETER* params)
|
||||
handle->id = config_get_global_options()->id;
|
||||
handle->warn_set_standalone_master = true;
|
||||
handle->warn_failover_precond = true;
|
||||
handle->warn_cannot_rejoin = true;
|
||||
handle->master_gtid_domain = -1;
|
||||
handle->external_master_host[0] = '\0';
|
||||
handle->external_master_port = PORT_UNKNOWN;
|
||||
@ -2636,14 +2640,6 @@ monitorMain(void *arg)
|
||||
MXS_NOTICE("%d server(s) redirected or rejoined the cluster.", joins);
|
||||
cluster_modified = true;
|
||||
}
|
||||
if (joins < joinable_servers.size())
|
||||
{
|
||||
MXS_ERROR("A cluster join operation failed, disabling automatic rejoining. "
|
||||
"To re-enable, manually set '%s' to 'true' for monitor '%s' via MaxAdmin or "
|
||||
"the REST API.", CN_AUTO_REJOIN, mon->name);
|
||||
handle->auto_rejoin = false;
|
||||
disable_setting(handle, CN_AUTO_REJOIN);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -4697,13 +4693,19 @@ static void read_server_variables(MXS_MONITORED_SERVER* database, MySqlServerInf
|
||||
* @param slave_info Slave info
|
||||
* @param master Replication master
|
||||
* @param master_info Master info
|
||||
* @param err_msg Error output. Details the reason for invalid replication.
|
||||
* @return True if slave can replicate from master
|
||||
*/
|
||||
static bool can_replicate_from(MYSQL_MONITOR* mon,
|
||||
MXS_MONITORED_SERVER* slave, MySqlServerInfo* slave_info,
|
||||
MXS_MONITORED_SERVER* master, MySqlServerInfo* master_info)
|
||||
MXS_MONITORED_SERVER* master, MySqlServerInfo* master_info,
|
||||
string* err_msg)
|
||||
{
|
||||
ss_dassert(err_msg);
|
||||
bool rval = false;
|
||||
const char* master_name = master->server->unique_name;
|
||||
const char* slave_name = slave->server->unique_name;
|
||||
|
||||
if (update_gtids(mon, slave, slave_info))
|
||||
{
|
||||
Gtid slave_gtid = slave_info->gtid_current_pos;
|
||||
@ -4711,13 +4713,30 @@ static bool can_replicate_from(MYSQL_MONITOR* mon,
|
||||
// The following are not sufficient requirements for replication to work, they only cover the basics.
|
||||
// If the servers have diverging histories, the redirection will seem to succeed but the slave IO
|
||||
// thread will stop in error.
|
||||
if (slave_gtid.server_id != SERVER_ID_UNKNOWN && master_gtid.server_id != SERVER_ID_UNKNOWN &&
|
||||
slave_gtid.domain == master_gtid.domain &&
|
||||
slave_gtid.sequence <= master_info->gtid_current_pos.sequence)
|
||||
if (slave_gtid.server_id == SERVER_ID_UNKNOWN)
|
||||
{
|
||||
*err_msg = string("'") + slave_name + "' does not have a valid 'gtid_current_pos'.";
|
||||
}
|
||||
else if (master_gtid.server_id == SERVER_ID_UNKNOWN)
|
||||
{
|
||||
*err_msg = string("'") + master_name + "' does not have a valid 'gtid_binlog_pos'.";
|
||||
}
|
||||
else if (slave_gtid.domain != master_gtid.domain ||
|
||||
slave_gtid.sequence > master_info->gtid_current_pos.sequence)
|
||||
{
|
||||
*err_msg = string("gtid_current_pos of '") + slave_name + "' (" + slave_gtid.to_string() +
|
||||
") is incompatible with gtid_binlog_pos of '" + master_name + "' (" +
|
||||
master_gtid.to_string() + ").";
|
||||
}
|
||||
else
|
||||
{
|
||||
rval = true;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
*err_msg = string("Server '") + slave_name + "' could not be queried.";
|
||||
}
|
||||
return rval;
|
||||
}
|
||||
|
||||
@ -4809,10 +4828,20 @@ static bool get_joinable_servers(MYSQL_MONITOR* mon, ServerVector* output)
|
||||
{
|
||||
MXS_MONITORED_SERVER* suspect = suspects[i];
|
||||
MySqlServerInfo* suspect_info = get_server_info(mon, suspect);
|
||||
if (can_replicate_from(mon, suspect, suspect_info, master, master_info))
|
||||
string rejoin_err_msg;
|
||||
if (can_replicate_from(mon, suspect, suspect_info, master, master_info, &rejoin_err_msg))
|
||||
{
|
||||
output->push_back(suspect);
|
||||
}
|
||||
else if (mon->warn_cannot_rejoin)
|
||||
{
|
||||
// Print a message explaining why an auto-rejoin is not done. Suppress printing.
|
||||
MXS_WARNING("Automatic rejoin was not attempted on server '%s' even though it is a "
|
||||
"valid candidate. Will keep retrying with this message suppressed for all "
|
||||
"servers. Errors: \n%s",
|
||||
suspect->server->unique_name, rejoin_err_msg.c_str());
|
||||
mon->warn_cannot_rejoin = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
@ -4820,6 +4849,10 @@ static bool get_joinable_servers(MYSQL_MONITOR* mon, ServerVector* output)
|
||||
comm_ok = false;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
mon->warn_cannot_rejoin = true;
|
||||
}
|
||||
return comm_ok;
|
||||
}
|
||||
|
||||
|
@ -76,6 +76,7 @@ typedef struct
|
||||
int external_master_port; /**< External master port */
|
||||
bool auto_rejoin; /**< Attempt to start slave replication on standalone servers or servers
|
||||
replicating from the wrong master. */
|
||||
bool warn_cannot_rejoin; /**< Print warning if auto_rejoin fails because of invalid gtid:s? */
|
||||
bool enforce_read_only_slaves; /**< Should the monitor set read-only=1 on any slave servers. */
|
||||
int n_excluded; /**< Number of excluded servers */
|
||||
MXS_MONITORED_SERVER** excluded_servers; /**< Servers banned for master promotion during auto-failover. */
|
||||
|
Loading…
x
Reference in New Issue
Block a user