Add config parameter for excluding servers from failover

"servers_no_promotion" is a comma-separated list of servers
which cannot be chosen when selecting a new master during failover
(auto or manual), or when automatically selecting a new master
for switchover (currently disabled).

The servers in the list are redirected normally and can be promoted
by switchover when manually selecting a new master.
This commit is contained in:
Esa Korhonen 2018-01-30 15:18:01 +02:00
parent 6f6c11e6a3
commit 1cf3de4a74
5 changed files with 200 additions and 56 deletions

View File

@ -382,6 +382,19 @@ void load_server_journal(MXS_MONITOR *monitor, MXS_MONITORED_SERVER **master);
* @param search_server Server to search for
* @return Found monitored server or NULL if not found
*/
MXS_MONITORED_SERVER* mon_get_monitored_server(MXS_MONITOR* mon, SERVER* search_server);
MXS_MONITORED_SERVER* mon_get_monitored_server(const MXS_MONITOR* mon, SERVER* search_server);
/**
 * Get an array of monitored servers. All the servers defined in the config setting must be monitored by
 * the given monitor.
 *
 * @param params Config parameters
 * @param key Setting name
 * @param mon Monitor which should monitor the servers
 * @param monitored_servers_out Where to save output. The pointed-to value must be NULL on entry and is
 * only written on success. The caller should free the array, but not the elements.
 * @return Output array size if successful, negative value otherwise
 */
int mon_config_get_servers(const MXS_CONFIG_PARAMETER* params, const char* key, const MXS_MONITOR* mon,
                           MXS_MONITORED_SERVER*** monitored_servers_out);
MXS_END_DECLS

View File

@ -3643,6 +3643,7 @@ void fix_serverlist(char* value)
dest += sep;
dest += start;
sep = ",";
start = strtok_r(NULL, ",", &end);
}
/** The value will always be smaller than the original one or of equal size */

View File

@ -2428,7 +2428,7 @@ static bool journal_is_stale(MXS_MONITOR *monitor, time_t max_age)
return is_stale;
}
MXS_MONITORED_SERVER* mon_get_monitored_server(MXS_MONITOR* mon, SERVER* search_server)
MXS_MONITORED_SERVER* mon_get_monitored_server(const MXS_MONITOR* mon, SERVER* search_server)
{
ss_dassert(mon && search_server);
for (MXS_MONITORED_SERVER* iter = mon->monitored_servers; iter != NULL; iter = iter->next)
@ -2439,4 +2439,45 @@ MXS_MONITORED_SERVER* mon_get_monitored_server(MXS_MONITOR* mon, SERVER* search_
}
}
return NULL;
}
/**
 * Resolve a server-list setting into an array of servers monitored by the given monitor.
 *
 * Every server named in the setting must be monitored by @c mon; otherwise an error is logged and
 * the call fails. An empty list is a valid setting and yields 0 with the output left untouched.
 *
 * @param params Config parameters
 * @param key Setting name
 * @param mon Monitor which should monitor the servers
 * @param monitored_servers_out Where to save output. Must point to a NULL pointer on entry. Written only
 * when at least one server was resolved. The caller should free the array, but not the elements.
 * @return Output array size if successful (0 for an empty list), negative value otherwise
 */
int mon_config_get_servers(const MXS_CONFIG_PARAMETER* params, const char* key, const MXS_MONITOR* mon,
                           MXS_MONITORED_SERVER*** monitored_servers_out)
{
    ss_dassert(monitored_servers_out && *monitored_servers_out == NULL);
    SERVER** servers = NULL;
    int servers_size = config_get_server_list(params, key, &servers);
    int rval = -1;
    // All servers in the array must be monitored by the given monitor.
    if (servers_size > 0)
    {
        MXS_MONITORED_SERVER** monitored_array =
            (MXS_MONITORED_SERVER**)MXS_CALLOC(servers_size, sizeof(MXS_MONITORED_SERVER*));
        // An allocation failure is treated like any other error: skip the loop and return negative.
        bool error = (monitored_array == NULL);
        for (int i = 0; i < servers_size && !error; i++)
        {
            MXS_MONITORED_SERVER* mon_serv = mon_get_monitored_server(mon, servers[i]);
            if (mon_serv != NULL)
            {
                monitored_array[i] = mon_serv;
            }
            else
            {
                MXS_ERROR("Server '%s' is not monitored by monitor '%s'.", servers[i]->unique_name, mon->name);
                error = true;
            }
        }
        MXS_FREE(servers);

        if (error)
        {
            MXS_FREE(monitored_array);
        }
        else
        {
            *monitored_servers_out = monitored_array;
            rval = servers_size;
        }
    }
    else if (servers_size == 0)
    {
        // The setting is optional, so an empty list must not be reported as a failure.
        // NOTE(review): assumes config_get_server_list() returns 0 only for an empty/unset list and
        // leaves 'servers' unallocated in that case - confirm against its implementation.
        rval = 0;
    }
    return rval;
}

View File

@ -85,6 +85,12 @@ enum slave_down_setting_t
REJECT_DOWN
};
/** Selects whether check_replication_settings() logs warnings about the settings it inspects. */
enum print_repl_warnings_t
{
    WARNINGS_ON = 0, /**< Log warnings */
    WARNINGS_OFF = 1 /**< Stay silent */
};
static void monitorMain(void *);
static void *startMonitor(MXS_MONITOR *, const MXS_CONFIG_PARAMETER*);
static void stopMonitor(MXS_MONITOR *);
@ -132,6 +138,7 @@ static const char CN_FAILOVER_TIMEOUT[] = "failover_timeout";
static const char CN_SWITCHOVER_TIMEOUT[] = "switchover_timeout";
static const char CN_AUTO_REJOIN[] = "auto_rejoin";
static const char CN_FAILCOUNT[] = "failcount";
static const char CN_NO_PROMOTE_SERVERS[] = "servers_no_promotion";
// Parameters for master failure verification and timeout
static const char CN_VERIFY_MASTER_FAILURE[] = "verify_master_failure";
@ -893,6 +900,7 @@ extern "C"
{CN_VERIFY_MASTER_FAILURE, MXS_MODULE_PARAM_BOOL, "true"},
{CN_MASTER_FAILURE_TIMEOUT, MXS_MODULE_PARAM_COUNT, DEFAULT_MASTER_FAILURE_TIMEOUT},
{CN_AUTO_REJOIN, MXS_MODULE_PARAM_BOOL, "false"},
{CN_NO_PROMOTE_SERVERS, MXS_MODULE_PARAM_SERVERLIST},
{MXS_END_MODULE_PARAMS}
}
};
@ -985,7 +993,24 @@ static bool set_replication_credentials(MYSQL_MONITOR *handle, const MXS_CONFIG_
return rval;
}
/*lint +e14 */
/**
 * Check whether a server appears on the monitor's exclusion list.
 *
 * The list is searched by pointer identity over the first n_excluded entries of excluded_servers.
 *
 * @param handle Cluster monitor whose exclusion list is searched
 * @param server Server to look for
 * @return True if the server is found in the list, false otherwise
 */
static bool server_is_excluded(const MYSQL_MONITOR *handle, const MXS_MONITORED_SERVER* server)
{
    bool found = false;
    int idx = 0;
    // Stop at the first match; a non-positive n_excluded means the loop never runs.
    while (!found && idx < handle->n_excluded)
    {
        found = (handle->excluded_servers[idx] == server);
        idx++;
    }
    return found;
}
/**
* Start the instance of the monitor, returning a handle on the monitor.
@ -999,14 +1024,17 @@ static bool set_replication_credentials(MYSQL_MONITOR *handle, const MXS_CONFIG_
static void *
startMonitor(MXS_MONITOR *monitor, const MXS_CONFIG_PARAMETER* params)
{
bool error = false;
MYSQL_MONITOR *handle = (MYSQL_MONITOR*) monitor->handle;
if (handle)
{
handle->shutdown = 0;
MXS_FREE(handle->script);
MXS_FREE(handle->replication_user);
MXS_FREE(handle->replication_password);
MXS_FREE(handle->excluded_servers);
handle->excluded_servers = NULL;
handle->n_excluded = 0;
}
else
{
@ -1052,7 +1080,13 @@ startMonitor(MXS_MONITOR *monitor, const MXS_CONFIG_PARAMETER* params)
handle->master_failure_timeout = config_get_integer(params, CN_MASTER_FAILURE_TIMEOUT);
handle->auto_rejoin = config_get_bool(params, CN_AUTO_REJOIN);
bool error = false;
handle->excluded_servers = NULL;
handle->n_excluded = mon_config_get_servers(params, CN_NO_PROMOTE_SERVERS, monitor,
&handle->excluded_servers);
if (handle->n_excluded < 0)
{
error = true;
}
if (!set_replication_credentials(handle, params))
{
@ -1075,6 +1109,7 @@ startMonitor(MXS_MONITOR *monitor, const MXS_CONFIG_PARAMETER* params)
{
hashtable_free(handle->server_info);
MXS_FREE(handle->script);
MXS_FREE(handle->excluded_servers);
MXS_FREE(handle);
handle = NULL;
}
@ -3274,20 +3309,25 @@ static MySqlServerInfo* update_slave_info(MYSQL_MONITOR* mon, MXS_MONITORED_SERV
*
* @param server Server to check
* @param server_info Server info
* @param print_on Print warnings or not
* @return True if log_bin is on
*/
static bool check_replication_settings(const MXS_MONITORED_SERVER* server, MySqlServerInfo* server_info)
static bool check_replication_settings(const MXS_MONITORED_SERVER* server, MySqlServerInfo* server_info,
print_repl_warnings_t print_warnings = WARNINGS_ON)
{
bool rval = true;
const char* servername = server->server->unique_name;
if (server_info->rpl_settings.log_bin == false)
{
const char NO_BINLOG[] =
"Slave '%s' has binary log disabled and is not a valid promotion candidate.";
MXS_WARNING(NO_BINLOG, servername);
if (print_warnings == WARNINGS_ON)
{
const char NO_BINLOG[] =
"Slave '%s' has binary log disabled and is not a valid promotion candidate.";
MXS_WARNING(NO_BINLOG, servername);
}
rval = false;
}
else
else if (print_warnings == WARNINGS_ON)
{
if (server_info->rpl_settings.gtid_strict_mode == false)
{
@ -3346,86 +3386,132 @@ bool switchover_check_preferred_master(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER*
return rval;
}
/**
 * Decide whether a promotion candidate outranks the current best choice.
 *
 * The comparison is tiered:
 *  1. The server with the later event in its relay log (gtid_io_pos sequence) wins.
 *  2. On a tie, the server with more events processed (gtid_current_pos sequence) wins.
 *  3. On a further tie, the candidate wins only if it has log_slave_updates on and the
 *     current best does not.
 *
 * @param current_best_info Server info of current best choice
 * @param candidate_info Server info of new candidate
 * @return True if candidate is strictly better, false otherwise (including full ties)
 */
bool is_candidate_better(const MySqlServerInfo* current_best_info, const MySqlServerInfo* candidate_info)
{
    // Tier 1: latest event received into the relay log.
    const uint64_t best_io_seq = current_best_info->slave_status.gtid_io_pos.sequence;
    const uint64_t new_io_seq = candidate_info->slave_status.gtid_io_pos.sequence;
    if (new_io_seq != best_io_seq)
    {
        return new_io_seq > best_io_seq;
    }

    // Tier 2: io sequences are identical, so compare the number of processed events.
    const uint64_t best_done_seq = current_best_info->gtid_current_pos.sequence;
    const uint64_t new_done_seq = candidate_info->gtid_current_pos.sequence;
    if (new_done_seq != best_done_seq)
    {
        return new_done_seq > best_done_seq;
    }

    // Tier 3: binlog positions are identical, prefer a slave with log_slave_updates.
    const bool best_logs_updates = current_best_info->rpl_settings.log_slave_updates;
    const bool new_logs_updates = candidate_info->rpl_settings.log_slave_updates;
    return new_logs_updates && !best_logs_updates;
}
/**
* Select a new master. Also add slaves which should be redirected to an array.
*
* @param mon The monitor
* @param out_slaves Vector for storing slave servers, can be NULL
* @param out_slaves Vector for storing slave servers.
* @param err_out json object for error printing. Can be NULL.
* @return The found master, or NULL if not found
*/
MXS_MONITORED_SERVER* select_new_master(MYSQL_MONITOR* mon,
ServerVector* slaves_out,
json_t** err_out)
MXS_MONITORED_SERVER* select_new_master(MYSQL_MONITOR* mon, ServerVector* slaves_out, json_t** err_out)
{
ss_dassert(slaves_out && slaves_out->size() == 0);
/* Select a new master candidate. Selects the one with the latest event in relay log.
* If multiple slaves have same number of events, select the one with most processed events. */
MXS_MONITORED_SERVER* new_master = NULL;
MySqlServerInfo* new_master_info = NULL;
MXS_MONITORED_SERVER* current_best = NULL;
MySqlServerInfo* current_best_info = NULL;
// Servers that cannot be selected because of exclusion, but seem otherwise ok.
ServerVector valid_but_excluded;
// Index of the current best candidate in slaves_out
int master_vector_index = -1;
for (MXS_MONITORED_SERVER *cand = mon->monitor->monitored_servers; cand; cand = cand->next)
{
// If a server cannot be connected to, it won't be considered for promotion or redirected.
// Do not worry about the exclusion list yet, querying the excluded servers is ok.
MySqlServerInfo* cand_info = update_slave_info(mon, cand);
if (cand_info)
{
if (slaves_out)
slaves_out->push_back(cand);
// Check that server is not in the exclusion list while still being a valid choice.
if (server_is_excluded(mon, cand) && check_replication_settings(cand, cand_info, WARNINGS_OFF))
{
slaves_out->push_back(cand);
valid_but_excluded.push_back(cand);
const char CANNOT_SELECT[] = "Promotion candidate '%s' is excluded from new "
"master selection.";
MXS_INFO(CANNOT_SELECT, cand->server->unique_name);
}
if (check_replication_settings(cand, cand_info))
else if (check_replication_settings(cand, cand_info))
{
bool select_this = false;
// If no candidate yet, accept any slave. Slaves have already been checked to use gtid.
if (new_master == NULL)
// If no new master yet, accept any valid candidate. Otherwise check.
if (current_best == NULL || is_candidate_better(current_best_info, cand_info))
{
select_this = true;
}
else
{
uint64_t cand_io = cand_info->slave_status.gtid_io_pos.sequence;
uint64_t cand_processed = cand_info->gtid_current_pos.sequence;
uint64_t master_io = new_master_info->slave_status.gtid_io_pos.sequence;
uint64_t master_processed = new_master_info->gtid_current_pos.sequence;
bool cand_updates = cand_info->rpl_settings.log_slave_updates;
bool master_updates = new_master_info->rpl_settings.log_slave_updates;
// Otherwise accept a slave with a later event in relay log.
if (cand_io > master_io ||
// If io sequences are identical, the slave with more events processed wins.
(cand_io == master_io && (cand_processed > master_processed ||
// Finally, if binlog positions are identical,
// prefer a slave with log_slave_updates.
(cand_processed == master_processed &&
cand_updates && !master_updates))))
{
select_this = true;
}
}
if (select_this)
{
new_master = cand;
new_master_info = cand_info;
if (slaves_out)
{
master_vector_index = slaves_out->size() - 1;
}
// The server has been selected for promotion, for now.
current_best = cand;
current_best_info = cand_info;
master_vector_index = slaves_out->size() - 1;
}
}
}
}
if (new_master && slaves_out)
if (current_best)
{
// Remove the selected master from the vector.
ServerVector::iterator remove_this = slaves_out->begin();
remove_this += master_vector_index;
slaves_out->erase(remove_this);
}
if (new_master == NULL)
// Check if any of the excluded servers would be better than the best candidate.
for (ServerVector::const_iterator iter = valid_but_excluded.begin();
iter != valid_but_excluded.end();
iter++)
{
MySqlServerInfo* excluded_info = get_server_info(mon, *iter);
const char* excluded_name = (*iter)->server->unique_name;
if (current_best == NULL)
{
const char EXCLUDED_ONLY_CAND[] = "Server '%s' is a viable choice for new master, "
"but cannot be selected as it's excluded.";
MXS_WARNING(EXCLUDED_ONLY_CAND, excluded_name);
break;
}
else if (is_candidate_better(current_best_info, excluded_info))
{
// Print a warning if this server is actually a better candidate than the previous
// best.
const char EXCLUDED_CAND[] = "Server '%s' is superior to current "
"best candidate '%s', but cannot be selected as it's excluded. This may lead to "
"loss of data if '%s' is ahead of other servers.";
MXS_WARNING(EXCLUDED_CAND, excluded_name, current_best->server->unique_name, excluded_name);
break;
}
}
if (current_best == NULL)
{
PRINT_MXS_JSON_ERROR(err_out, "No suitable promotion candidate found.");
}
return new_master;
return current_best;
}
/**

View File

@ -73,6 +73,9 @@ typedef struct
int64_t master_gtid_domain; /**< Gtid domain currently used by the master */
bool auto_rejoin; /**< Attempt to start slave replication on standalone servers or servers
replicating from the wrong master. */
int n_excluded; /**< Number of excluded servers */
MXS_MONITORED_SERVER** excluded_servers; /**< Servers banned for master promotion during auto-failover. */
MXS_MONITOR* monitor;
} MYSQL_MONITOR;