diff --git a/include/maxscale/monitor.h b/include/maxscale/monitor.h index 7b5a66916..e041e28b3 100644 --- a/include/maxscale/monitor.h +++ b/include/maxscale/monitor.h @@ -382,6 +382,19 @@ void load_server_journal(MXS_MONITOR *monitor, MXS_MONITORED_SERVER **master); * @param search_server Server to search for * @return Found monitored server or NULL if not found */ -MXS_MONITORED_SERVER* mon_get_monitored_server(MXS_MONITOR* mon, SERVER* search_server); +MXS_MONITORED_SERVER* mon_get_monitored_server(const MXS_MONITOR* mon, SERVER* search_server); + +/** + * Get an array of monitored servers. All the servers defined in the config setting must be monitored by + * the given monitor. + * + * @param params Config parameters + * @param key Setting name + * @param mon Monitor which should monitor the servers + * @param monitored_servers_out Where to save output. The caller should free the array, but not the elements. + * @return Output array size if successful, negative value otherwise + */ +int mon_config_get_servers(const MXS_CONFIG_PARAMETER* params, const char* key, const MXS_MONITOR* mon, + MXS_MONITORED_SERVER*** monitored_array_out); MXS_END_DECLS diff --git a/server/core/config.cc b/server/core/config.cc index d331763ac..5c51d48c4 100644 --- a/server/core/config.cc +++ b/server/core/config.cc @@ -3643,6 +3643,7 @@ void fix_serverlist(char* value) dest += sep; dest += start; sep = ","; + start = strtok_r(NULL, ",", &end); } /** The value will always be smaller than the original one or of equal size */ diff --git a/server/core/monitor.cc b/server/core/monitor.cc index c8f0b450b..98875a05d 100644 --- a/server/core/monitor.cc +++ b/server/core/monitor.cc @@ -2428,7 +2428,7 @@ static bool journal_is_stale(MXS_MONITOR *monitor, time_t max_age) return is_stale; } -MXS_MONITORED_SERVER* mon_get_monitored_server(MXS_MONITOR* mon, SERVER* search_server) +MXS_MONITORED_SERVER* mon_get_monitored_server(const MXS_MONITOR* mon, SERVER* search_server) { ss_dassert(mon && search_server); for (MXS_MONITORED_SERVER* iter = mon->monitored_servers; iter != NULL; iter = iter->next) @@ -2439,4 +2439,45 @@ MXS_MONITORED_SERVER* mon_get_monitored_server(MXS_MONITOR* mon, SERVER* search_ } } return NULL; +} + +int mon_config_get_servers(const MXS_CONFIG_PARAMETER* params, const char* key, const MXS_MONITOR* mon, + MXS_MONITORED_SERVER*** monitored_servers_out) +{ + ss_dassert(*monitored_servers_out == NULL); + SERVER** servers = NULL; + int servers_size = config_get_server_list(params, key, &servers); + int rval = -1; + // All servers in the array must be monitored by the given monitor. + if (servers_size > 0) + { + MXS_MONITORED_SERVER** monitored_array = + (MXS_MONITORED_SERVER**)MXS_CALLOC(servers_size, sizeof(MXS_MONITORED_SERVER*)); + bool error = false; + for (int i = 0; i < servers_size && !error; i++) + { + MXS_MONITORED_SERVER* mon_serv = mon_get_monitored_server(mon, servers[i]); + if (mon_serv != NULL) + { + monitored_array[i] = mon_serv; + } + else + { + MXS_ERROR("Server '%s' is not monitored by monitor '%s'.", servers[i]->unique_name, mon->name); + error = true; + } + } + MXS_FREE(servers); + + if (error) + { + MXS_FREE(monitored_array); + } + else + { + *monitored_servers_out = monitored_array; + rval = servers_size; + } + } + return rval; } \ No newline at end of file diff --git a/server/modules/monitor/mariadbmon/mysql_mon.cc b/server/modules/monitor/mariadbmon/mysql_mon.cc index 09abb6c08..b0d3cf40e 100644 --- a/server/modules/monitor/mariadbmon/mysql_mon.cc +++ b/server/modules/monitor/mariadbmon/mysql_mon.cc @@ -85,6 +85,12 @@ enum slave_down_setting_t REJECT_DOWN }; +enum print_repl_warnings_t +{ + WARNINGS_ON, + WARNINGS_OFF +}; + static void monitorMain(void *); static void *startMonitor(MXS_MONITOR *, const MXS_CONFIG_PARAMETER*); static void stopMonitor(MXS_MONITOR *); @@ -132,6 +138,7 @@ static const char CN_FAILOVER_TIMEOUT[] = "failover_timeout"; static const char CN_SWITCHOVER_TIMEOUT[] = "switchover_timeout"; static const char CN_AUTO_REJOIN[] = "auto_rejoin"; static const char CN_FAILCOUNT[] = "failcount"; +static const char CN_NO_PROMOTE_SERVERS[] = "servers_no_promotion"; // Parameters for master failure verification and timeout static const char CN_VERIFY_MASTER_FAILURE[] = "verify_master_failure"; @@ -893,6 +900,7 @@ extern "C" {CN_VERIFY_MASTER_FAILURE, MXS_MODULE_PARAM_BOOL, "true"}, {CN_MASTER_FAILURE_TIMEOUT, MXS_MODULE_PARAM_COUNT, DEFAULT_MASTER_FAILURE_TIMEOUT}, {CN_AUTO_REJOIN, MXS_MODULE_PARAM_BOOL, "false"}, + {CN_NO_PROMOTE_SERVERS, MXS_MODULE_PARAM_SERVERLIST}, {MXS_END_MODULE_PARAMS} } }; @@ -985,7 +993,24 @@ static bool set_replication_credentials(MYSQL_MONITOR *handle, const MXS_CONFIG_ return rval; } -/*lint +e14 */ +/** + * Is the server in the excluded list + * + * @param handle Cluster monitor + * @param server Server to test + * @return True if server is in the excluded-list of the monitor. + */ +static bool server_is_excluded(const MYSQL_MONITOR *handle, const MXS_MONITORED_SERVER* server) +{ + for (int i = 0; i < handle->n_excluded; i++) + { + if (handle->excluded_servers[i] == server) + { + return true; + } + } + return false; +} /** * Start the instance of the monitor, returning a handle on the monitor. @@ -999,14 +1024,17 @@ static bool set_replication_credentials(MYSQL_MONITOR *handle, const MXS_CONFIG_ static void * startMonitor(MXS_MONITOR *monitor, const MXS_CONFIG_PARAMETER* params) { + bool error = false; MYSQL_MONITOR *handle = (MYSQL_MONITOR*) monitor->handle; - if (handle) { handle->shutdown = 0; MXS_FREE(handle->script); MXS_FREE(handle->replication_user); MXS_FREE(handle->replication_password); + MXS_FREE(handle->excluded_servers); + handle->excluded_servers = NULL; + handle->n_excluded = 0; } else { @@ -1052,7 +1080,13 @@ startMonitor(MXS_MONITOR *monitor, const MXS_CONFIG_PARAMETER* params) handle->master_failure_timeout = config_get_integer(params, CN_MASTER_FAILURE_TIMEOUT); handle->auto_rejoin = config_get_bool(params, CN_AUTO_REJOIN); - bool error = false; + handle->excluded_servers = NULL; + handle->n_excluded = mon_config_get_servers(params, CN_NO_PROMOTE_SERVERS, monitor, + &handle->excluded_servers); + if (handle->n_excluded < 0) + { + error = true; + } if (!set_replication_credentials(handle, params)) { @@ -1075,6 +1109,7 @@ startMonitor(MXS_MONITOR *monitor, const MXS_CONFIG_PARAMETER* params) { hashtable_free(handle->server_info); MXS_FREE(handle->script); + MXS_FREE(handle->excluded_servers); MXS_FREE(handle); handle = NULL; } @@ -3274,20 +3309,25 @@ static MySqlServerInfo* update_slave_info(MYSQL_MONITOR* mon, MXS_MONITORED_SERV * * @param server Server to check * @param server_info Server info + * @param print_on Print warnings or not * @return True if log_bin is on */ -static bool check_replication_settings(const MXS_MONITORED_SERVER* server, MySqlServerInfo* server_info) +static bool check_replication_settings(const MXS_MONITORED_SERVER* server, MySqlServerInfo* server_info, + print_repl_warnings_t print_warnings = WARNINGS_ON) { bool rval = true; const char* servername = server->server->unique_name; if (server_info->rpl_settings.log_bin == false) { - const char NO_BINLOG[] = - "Slave '%s' has binary log disabled and is not a valid promotion candidate."; - MXS_WARNING(NO_BINLOG, servername); + if (print_warnings == WARNINGS_ON) + { + const char NO_BINLOG[] = + "Slave '%s' has binary log disabled and is not a valid promotion candidate."; + MXS_WARNING(NO_BINLOG, servername); + } rval = false; } - else + else if (print_warnings == WARNINGS_ON) { if (server_info->rpl_settings.gtid_strict_mode == false) { @@ -3346,86 +3386,132 @@ bool switchover_check_preferred_master(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* return rval; } +/** + * Is the candidate a better choice for master than the previous best? + * + * @param current_best_info Server info of current best choice + * @param candidate_info Server info of new candidate + * @return True if candidate is better + */ +bool is_candidate_better(const MySqlServerInfo* current_best_info, const MySqlServerInfo* candidate_info) +{ + uint64_t cand_io = candidate_info->slave_status.gtid_io_pos.sequence; + uint64_t cand_processed = candidate_info->gtid_current_pos.sequence; + uint64_t curr_io = current_best_info->slave_status.gtid_io_pos.sequence; + uint64_t curr_processed = current_best_info->gtid_current_pos.sequence; + bool cand_updates = candidate_info->rpl_settings.log_slave_updates; + bool curr_updates = current_best_info->rpl_settings.log_slave_updates; + bool is_better = false; + // Accept a slave with a later event in relay log. + if (cand_io > curr_io) + { + is_better = true; + } + // If io sequences are identical, the slave with more events processed wins. + else if (cand_io == curr_io) + { + if (cand_processed > curr_processed) + { + is_better = true; + } + // Finally, if binlog positions are identical, prefer a slave with log_slave_updates. + else if (cand_processed == curr_processed && cand_updates && !curr_updates) + { + is_better = true; + } + } + return is_better; +} + /** * Select a new master. Also add slaves which should be redirected to an array. * * @param mon The monitor - * @param out_slaves Vector for storing slave servers, can be NULL + * @param out_slaves Vector for storing slave servers. * @param err_out json object for error printing. Can be NULL. * @return The found master, or NULL if not found */ -MXS_MONITORED_SERVER* select_new_master(MYSQL_MONITOR* mon, - ServerVector* slaves_out, - json_t** err_out) +MXS_MONITORED_SERVER* select_new_master(MYSQL_MONITOR* mon, ServerVector* slaves_out, json_t** err_out) { + ss_dassert(slaves_out && slaves_out->size() == 0); /* Select a new master candidate. Selects the one with the latest event in relay log. * If multiple slaves have same number of events, select the one with most processed events. */ - MXS_MONITORED_SERVER* new_master = NULL; - MySqlServerInfo* new_master_info = NULL; + MXS_MONITORED_SERVER* current_best = NULL; + MySqlServerInfo* current_best_info = NULL; + // Servers that cannot be selected because of exclusion, but seem otherwise ok. + ServerVector valid_but_excluded; + // Index of the current best candidate in slaves_out int master_vector_index = -1; + for (MXS_MONITORED_SERVER *cand = mon->monitor->monitored_servers; cand; cand = cand->next) { + // If a server cannot be connected to, it won't be considered for promotion or redirected. + // Do not worry about the exclusion list yet, querying the excluded servers is ok. MySqlServerInfo* cand_info = update_slave_info(mon, cand); if (cand_info) { - if (slaves_out) + slaves_out->push_back(cand); + // Check that server is not in the exclusion list while still being a valid choice. + if (server_is_excluded(mon, cand) && check_replication_settings(cand, cand_info, WARNINGS_OFF)) { - slaves_out->push_back(cand); + valid_but_excluded.push_back(cand); + const char CANNOT_SELECT[] = "Promotion candidate '%s' is excluded from new " + "master selection."; + MXS_INFO(CANNOT_SELECT, cand->server->unique_name); } - if (check_replication_settings(cand, cand_info)) + else if (check_replication_settings(cand, cand_info)) { - bool select_this = false; - // If no candidate yet, accept any slave. Slaves have already been checked to use gtid. - if (new_master == NULL) + // If no new master yet, accept any valid candidate. Otherwise check. + if (current_best == NULL || is_candidate_better(current_best_info, cand_info)) { - select_this = true; - } - else - { - uint64_t cand_io = cand_info->slave_status.gtid_io_pos.sequence; - uint64_t cand_processed = cand_info->gtid_current_pos.sequence; - uint64_t master_io = new_master_info->slave_status.gtid_io_pos.sequence; - uint64_t master_processed = new_master_info->gtid_current_pos.sequence; - bool cand_updates = cand_info->rpl_settings.log_slave_updates; - bool master_updates = new_master_info->rpl_settings.log_slave_updates; - // Otherwise accept a slave with a later event in relay log. - if (cand_io > master_io || - // If io sequences are identical, the slave with more events processed wins. - (cand_io == master_io && (cand_processed > master_processed || - // Finally, if binlog positions are identical, - // prefer a slave with log_slave_updates. - (cand_processed == master_processed && - cand_updates && !master_updates)))) - { - select_this = true; - } - } - - if (select_this) - { - new_master = cand; - new_master_info = cand_info; - if (slaves_out) - { - master_vector_index = slaves_out->size() - 1; - } + // The server has been selected for promotion, for now. + current_best = cand; + current_best_info = cand_info; + master_vector_index = slaves_out->size() - 1; } } } } - if (new_master && slaves_out) + if (current_best) { // Remove the selected master from the vector. ServerVector::iterator remove_this = slaves_out->begin(); remove_this += master_vector_index; slaves_out->erase(remove_this); } - if (new_master == NULL) + + // Check if any of the excluded servers would be better than the best candidate. + for (ServerVector::const_iterator iter = valid_but_excluded.begin(); + iter != valid_but_excluded.end(); + iter++) + { + MySqlServerInfo* excluded_info = get_server_info(mon, *iter); + const char* excluded_name = (*iter)->server->unique_name; + if (current_best == NULL) + { + const char EXCLUDED_ONLY_CAND[] = "Server '%s' is a viable choice for new master, " + "but cannot be selected as it's excluded."; + MXS_WARNING(EXCLUDED_ONLY_CAND, excluded_name); + break; + } + else if (is_candidate_better(current_best_info, excluded_info)) + { + // Print a warning if this server is actually a better candidate than the previous + // best. + const char EXCLUDED_CAND[] = "Server '%s' is superior to current " + "best candidate '%s', but cannot be selected as it's excluded. This may lead to " + "loss of data if '%s' is ahead of other servers."; + MXS_WARNING(EXCLUDED_CAND, excluded_name, current_best->server->unique_name, excluded_name); + break; + } + } + + if (current_best == NULL) { PRINT_MXS_JSON_ERROR(err_out, "No suitable promotion candidate found."); } - return new_master; + return current_best; } /** diff --git a/server/modules/monitor/mysqlmon.h b/server/modules/monitor/mysqlmon.h index 445218f6b..35f571b68 100644 --- a/server/modules/monitor/mysqlmon.h +++ b/server/modules/monitor/mysqlmon.h @@ -73,6 +73,9 @@ typedef struct int64_t master_gtid_domain; /**< Gtid domain currently used by the master */ bool auto_rejoin; /**< Attempt to start slave replication on standalone servers or servers replicating from the wrong master. */ + int n_excluded; /**< Number of excluded servers */ + MXS_MONITORED_SERVER** excluded_servers; /**< Servers banned for master promotion during auto-failover. */ + MXS_MONITOR* monitor; } MYSQL_MONITOR;