MXS-1886 Better auto-rejoin error description and tolerance

Auto-rejoin now explains more accurately why a server cannot be joined due
to a conflicting gtid.

Also, auto-rejoin is no longer disabled if a join fails. Usually the failure
is due to the server not replying fast enough with query completion. The
query is often completed anyway. Leaving auto-rejoin enabled can lead to some log spam.
This commit is contained in:
Esa Korhonen
2018-06-13 16:29:14 +03:00
parent 9e68d8ec3d
commit 09df017528
2 changed files with 51 additions and 17 deletions

View File

@ -125,7 +125,8 @@ static void disable_setting(MYSQL_MONITOR* mon, const char* setting);
static bool cluster_can_be_joined(MYSQL_MONITOR* mon); static bool cluster_can_be_joined(MYSQL_MONITOR* mon);
static bool can_replicate_from(MYSQL_MONITOR* mon, static bool can_replicate_from(MYSQL_MONITOR* mon,
MXS_MONITORED_SERVER* slave, MySqlServerInfo* slave_info, MXS_MONITORED_SERVER* slave, MySqlServerInfo* slave_info,
MXS_MONITORED_SERVER* master, MySqlServerInfo* master_info); MXS_MONITORED_SERVER* master, MySqlServerInfo* master_info,
string* err_msg);
static bool wait_cluster_stabilization(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* new_master, static bool wait_cluster_stabilization(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* new_master,
const ServerVector& slaves, int seconds_remaining); const ServerVector& slaves, int seconds_remaining);
static string get_connection_errors(const ServerVector& servers); static string get_connection_errors(const ServerVector& servers);
@ -761,7 +762,9 @@ bool mysql_rejoin(MXS_MONITOR* mon, SERVER* rejoin_server, json_t** output)
{ {
if (update_gtids(handle, master, master_info)) if (update_gtids(handle, master, master_info))
{ {
if (can_replicate_from(handle, mon_server, server_info, master, master_info)) string no_rejoin_reason;
if (can_replicate_from(handle, mon_server, server_info, master, master_info,
&no_rejoin_reason))
{ {
ServerVector joinable_server; ServerVector joinable_server;
joinable_server.push_back(mon_server); joinable_server.push_back(mon_server);
@ -777,8 +780,8 @@ bool mysql_rejoin(MXS_MONITOR* mon, SERVER* rejoin_server, json_t** output)
} }
else else
{ {
PRINT_MXS_JSON_ERROR(output, "Server '%s' cannot replicate from cluster master '%s' " PRINT_MXS_JSON_ERROR(output, "Server '%s' cannot replicate from cluster master '%s': "
"or it could not be queried.", rejoin_serv_name, master_name); "%s", rejoin_serv_name, master_name, no_rejoin_reason.c_str());
} }
} }
else else
@ -1101,6 +1104,7 @@ startMonitor(MXS_MONITOR *monitor, const MXS_CONFIG_PARAMETER* params)
handle->id = config_get_global_options()->id; handle->id = config_get_global_options()->id;
handle->warn_set_standalone_master = true; handle->warn_set_standalone_master = true;
handle->warn_failover_precond = true; handle->warn_failover_precond = true;
handle->warn_cannot_rejoin = true;
handle->master_gtid_domain = -1; handle->master_gtid_domain = -1;
handle->external_master_host[0] = '\0'; handle->external_master_host[0] = '\0';
handle->external_master_port = PORT_UNKNOWN; handle->external_master_port = PORT_UNKNOWN;
@ -2636,14 +2640,6 @@ monitorMain(void *arg)
MXS_NOTICE("%d server(s) redirected or rejoined the cluster.", joins); MXS_NOTICE("%d server(s) redirected or rejoined the cluster.", joins);
cluster_modified = true; cluster_modified = true;
} }
if (joins < joinable_servers.size())
{
MXS_ERROR("A cluster join operation failed, disabling automatic rejoining. "
"To re-enable, manually set '%s' to 'true' for monitor '%s' via MaxAdmin or "
"the REST API.", CN_AUTO_REJOIN, mon->name);
handle->auto_rejoin = false;
disable_setting(handle, CN_AUTO_REJOIN);
}
} }
else else
{ {
@ -4697,13 +4693,19 @@ static void read_server_variables(MXS_MONITORED_SERVER* database, MySqlServerInf
* @param slave_info Slave info * @param slave_info Slave info
* @param master Replication master * @param master Replication master
* @param master_info Master info * @param master_info Master info
* @param err_msg Error output. Details the reason for invalid replication.
* @return True if slave can replicate from master * @return True if slave can replicate from master
*/ */
static bool can_replicate_from(MYSQL_MONITOR* mon, static bool can_replicate_from(MYSQL_MONITOR* mon,
MXS_MONITORED_SERVER* slave, MySqlServerInfo* slave_info, MXS_MONITORED_SERVER* slave, MySqlServerInfo* slave_info,
MXS_MONITORED_SERVER* master, MySqlServerInfo* master_info) MXS_MONITORED_SERVER* master, MySqlServerInfo* master_info,
string* err_msg)
{ {
ss_dassert(err_msg);
bool rval = false; bool rval = false;
const char* master_name = master->server->unique_name;
const char* slave_name = slave->server->unique_name;
if (update_gtids(mon, slave, slave_info)) if (update_gtids(mon, slave, slave_info))
{ {
Gtid slave_gtid = slave_info->gtid_current_pos; Gtid slave_gtid = slave_info->gtid_current_pos;
@ -4711,13 +4713,30 @@ static bool can_replicate_from(MYSQL_MONITOR* mon,
// The following are not sufficient requirements for replication to work, they only cover the basics. // The following are not sufficient requirements for replication to work, they only cover the basics.
// If the servers have diverging histories, the redirection will seem to succeed but the slave IO // If the servers have diverging histories, the redirection will seem to succeed but the slave IO
// thread will stop in error. // thread will stop in error.
if (slave_gtid.server_id != SERVER_ID_UNKNOWN && master_gtid.server_id != SERVER_ID_UNKNOWN && if (slave_gtid.server_id == SERVER_ID_UNKNOWN)
slave_gtid.domain == master_gtid.domain && {
slave_gtid.sequence <= master_info->gtid_current_pos.sequence) *err_msg = string("'") + slave_name + "' does not have a valid 'gtid_current_pos'.";
}
else if (master_gtid.server_id == SERVER_ID_UNKNOWN)
{
*err_msg = string("'") + master_name + "' does not have a valid 'gtid_binlog_pos'.";
}
else if (slave_gtid.domain != master_gtid.domain ||
slave_gtid.sequence > master_info->gtid_current_pos.sequence)
{
*err_msg = string("gtid_current_pos of '") + slave_name + "' (" + slave_gtid.to_string() +
") is incompatible with gtid_binlog_pos of '" + master_name + "' (" +
master_gtid.to_string() + ").";
}
else
{ {
rval = true; rval = true;
} }
} }
else
{
*err_msg = string("Server '") + slave_name + "' could not be queried.";
}
return rval; return rval;
} }
@ -4809,10 +4828,20 @@ static bool get_joinable_servers(MYSQL_MONITOR* mon, ServerVector* output)
{ {
MXS_MONITORED_SERVER* suspect = suspects[i]; MXS_MONITORED_SERVER* suspect = suspects[i];
MySqlServerInfo* suspect_info = get_server_info(mon, suspect); MySqlServerInfo* suspect_info = get_server_info(mon, suspect);
if (can_replicate_from(mon, suspect, suspect_info, master, master_info)) string rejoin_err_msg;
if (can_replicate_from(mon, suspect, suspect_info, master, master_info, &rejoin_err_msg))
{ {
output->push_back(suspect); output->push_back(suspect);
} }
else if (mon->warn_cannot_rejoin)
{
// Print a message explaining why an auto-rejoin is not done. Suppress printing.
MXS_WARNING("Automatic rejoin was not attempted on server '%s' even though it is a "
"valid candidate. Will keep retrying with this message suppressed for all "
"servers. Errors: \n%s",
suspect->server->unique_name, rejoin_err_msg.c_str());
mon->warn_cannot_rejoin = false;
}
} }
} }
else else
@ -4820,6 +4849,10 @@ static bool get_joinable_servers(MYSQL_MONITOR* mon, ServerVector* output)
comm_ok = false; comm_ok = false;
} }
} }
else
{
mon->warn_cannot_rejoin = true;
}
return comm_ok; return comm_ok;
} }

View File

@ -76,6 +76,7 @@ typedef struct
int external_master_port; /**< External master port */ int external_master_port; /**< External master port */
bool auto_rejoin; /**< Attempt to start slave replication on standalone servers or servers bool auto_rejoin; /**< Attempt to start slave replication on standalone servers or servers
replicating from the wrong master. */ replicating from the wrong master. */
bool warn_cannot_rejoin; /**< Print warning if auto_rejoin fails because of invalid gtid:s? */
bool enforce_read_only_slaves; /**< Should the monitor set read-only=1 on any slave servers. */ bool enforce_read_only_slaves; /**< Should the monitor set read-only=1 on any slave servers. */
int n_excluded; /**< Number of excluded servers */ int n_excluded; /**< Number of excluded servers */
MXS_MONITORED_SERVER** excluded_servers; /**< Servers banned for master promotion during auto-failover. */ MXS_MONITORED_SERVER** excluded_servers; /**< Servers banned for master promotion during auto-failover. */