MXS-2426 Do not permanently disable automatic cluster operations when they fail
After a failure, automatic cluster operations are now disabled only for "failcount" monitor ticks instead of permanently. Also turns some related log messages into notices.
This commit is contained in:
parent
b9aec60d7a
commit
5ba305c2c1
@ -33,8 +33,6 @@
|
||||
#include <maxscale/modutil.h>
|
||||
#include <maxscale/mysql_utils.h>
|
||||
#include <maxscale/utils.h>
|
||||
// TODO: For monitorAddParameters
|
||||
#include "../../../core/internal/monitor.h"
|
||||
|
||||
/** Column positions for SHOW SLAVE STATUS */
|
||||
#define MYSQL55_STATUS_MASTER_LOG_POS 5
|
||||
@ -95,7 +93,7 @@ static int add_slave_to_master(long *, int, long);
|
||||
static bool isMySQLEvent(mxs_monitor_event_t event);
|
||||
void check_maxscale_schema_replication(MXS_MONITOR *monitor);
|
||||
static MySqlServerInfo* get_server_info(const MYSQL_MONITOR* handle, const MXS_MONITORED_SERVER* db);
|
||||
static bool mon_process_failover(MYSQL_MONITOR*, uint32_t, bool*);
|
||||
static void mon_process_failover(MYSQL_MONITOR* monitor);
|
||||
static bool do_failover(MYSQL_MONITOR* mon, json_t** output);
|
||||
static bool do_switchover(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* current_master,
|
||||
MXS_MONITORED_SERVER* new_master, json_t** err_out);
|
||||
@ -109,7 +107,8 @@ static bool server_is_rejoin_suspect(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* s
|
||||
static bool get_joinable_servers(MYSQL_MONITOR* mon, ServerVector* output);
|
||||
static uint32_t do_rejoin(MYSQL_MONITOR* mon, const ServerVector& servers, json_t** output);
|
||||
static bool join_cluster(MXS_MONITORED_SERVER* server, const char* change_cmd);
|
||||
static void disable_setting(MYSQL_MONITOR* mon, const char* setting);
|
||||
static void delay_auto_cluster_ops(MYSQL_MONITOR* mon);
|
||||
static bool can_perform_cluster_ops(MYSQL_MONITOR* mon);
|
||||
static bool cluster_can_be_joined(MYSQL_MONITOR* mon);
|
||||
static bool can_replicate_from(MYSQL_MONITOR* mon,
|
||||
MXS_MONITORED_SERVER* slave, MySqlServerInfo* slave_info,
|
||||
@ -552,15 +551,9 @@ bool mysql_switchover(MXS_MONITOR* mon, MXS_MONITORED_SERVER* new_master,
|
||||
}
|
||||
else
|
||||
{
|
||||
string format = "Switchover %s -> %s failed";
|
||||
bool failover = config_get_bool(mon->parameters, CN_AUTO_FAILOVER);
|
||||
if (failover)
|
||||
{
|
||||
disable_setting(handle, CN_AUTO_FAILOVER);
|
||||
format += ", failover has been disabled.";
|
||||
}
|
||||
format += ".";
|
||||
PRINT_MXS_JSON_ERROR(error_out, format.c_str(), curr_master_name, new_master_name);
|
||||
const char format[] = "Switchover %s -> %s failed.";
|
||||
PRINT_MXS_JSON_ERROR(error_out, format, curr_master_name, new_master_name);
|
||||
delay_auto_cluster_ops(handle);
|
||||
}
|
||||
}
|
||||
|
||||
@ -2206,7 +2199,8 @@ monitorMain(void *arg)
|
||||
size_t nrounds = 0;
|
||||
int log_no_master = 1;
|
||||
bool heartbeat_checked = false;
|
||||
|
||||
handle->cluster_op_performed = false;
|
||||
handle->cluster_operation_disable_timer = 0;
|
||||
replication_heartbeat = handle->replicationHeartbeat;
|
||||
detect_stale_master = handle->detectStaleMaster;
|
||||
|
||||
@ -2255,6 +2249,12 @@ monitorMain(void *arg)
|
||||
num_servers = 0;
|
||||
|
||||
atomic_add_uint64(&mon->ticks, 1);
|
||||
handle->cluster_op_performed = false;
|
||||
if (handle->cluster_operation_disable_timer > 0)
|
||||
{
|
||||
handle->cluster_operation_disable_timer--;
|
||||
}
|
||||
|
||||
lock_monitor_servers(mon);
|
||||
servers_status_pending_to_current(mon);
|
||||
|
||||
@ -2559,34 +2559,25 @@ monitorMain(void *arg)
|
||||
* need to be launched.
|
||||
*/
|
||||
mon_process_state_changes(mon, handle->script, handle->events);
|
||||
bool cluster_modified = false; // Has an automatic failover/rejoin been performed this loop?
|
||||
|
||||
if (handle->auto_failover)
|
||||
if (handle->auto_failover && can_perform_cluster_ops(handle))
|
||||
{
|
||||
const char RE_ENABLE_FMT[] = "%s To re-enable failover, manually set '%s' to 'true' for monitor "
|
||||
"'%s' via MaxAdmin or the REST API, or restart MaxScale.";
|
||||
if (failover_not_possible(handle))
|
||||
{
|
||||
const char PROBLEMS[] = "Failover is not possible due to one or more problems in the "
|
||||
"replication configuration, disabling automatic failover. Failover "
|
||||
"should only be enabled after the replication configuration has been "
|
||||
"fixed.";
|
||||
MXS_ERROR(RE_ENABLE_FMT, PROBLEMS, CN_AUTO_FAILOVER, mon->name);
|
||||
handle->auto_failover = false;
|
||||
disable_setting(handle, CN_AUTO_FAILOVER);
|
||||
"replication configuration.";
|
||||
MXS_ERROR(PROBLEMS);
|
||||
delay_auto_cluster_ops(handle);
|
||||
}
|
||||
// If master seems to be down, check if slaves are receiving events.
|
||||
else if (handle->verify_master_failure && handle->master &&
|
||||
SERVER_IS_DOWN(handle->master->server) && slave_receiving_events(handle))
|
||||
{
|
||||
MXS_INFO("Master failure not yet confirmed by slaves, delaying failover.");
|
||||
MXS_NOTICE("A slave is still receiving data from master, delaying failover.");
|
||||
}
|
||||
else if (!mon_process_failover(handle, handle->failover_timeout, &cluster_modified))
|
||||
else
|
||||
{
|
||||
const char FAILED[] = "Failed to perform failover, disabling automatic failover.";
|
||||
MXS_ERROR(RE_ENABLE_FMT, FAILED, CN_AUTO_FAILOVER, mon->name);
|
||||
handle->auto_failover = false;
|
||||
disable_setting(handle, CN_AUTO_FAILOVER);
|
||||
mon_process_failover(handle);
|
||||
}
|
||||
}
|
||||
|
||||
@ -2651,10 +2642,9 @@ monitorMain(void *arg)
|
||||
|
||||
// Do not auto-join servers on this monitor loop if a failover (or any other cluster modification)
|
||||
// has been performed, as server states have not been updated yet. It will happen next iteration.
|
||||
if (!config_get_global_options()->passive && handle->auto_rejoin &&
|
||||
!cluster_modified && cluster_can_be_joined(handle))
|
||||
if (handle->auto_rejoin && cluster_can_be_joined(handle) && can_perform_cluster_ops(handle))
|
||||
{
|
||||
// Check if any servers should be autojoined to the cluster
|
||||
// Check if any servers should be autojoined to the cluster.
|
||||
ServerVector joinable_servers;
|
||||
if (get_joinable_servers(handle, &joinable_servers))
|
||||
{
|
||||
@ -2662,7 +2652,7 @@ monitorMain(void *arg)
|
||||
if (joins > 0)
|
||||
{
|
||||
MXS_NOTICE("%d server(s) redirected or rejoined the cluster.", joins);
|
||||
cluster_modified = true;
|
||||
handle->cluster_op_performed = true;
|
||||
}
|
||||
}
|
||||
else
|
||||
@ -2675,7 +2665,7 @@ monitorMain(void *arg)
|
||||
/* Check if any slave servers have read-only off and turn it on if user so wishes. Again, do not
|
||||
* perform this if cluster has been modified this loop since it may not be clear which server
|
||||
* should be a slave. */
|
||||
if (!config_get_global_options()->passive && handle->enforce_read_only_slaves && !cluster_modified)
|
||||
if (handle->enforce_read_only_slaves && can_perform_cluster_ops(handle))
|
||||
{
|
||||
enforce_read_only_on_slaves(handle);
|
||||
}
|
||||
@ -3384,30 +3374,19 @@ void check_maxscale_schema_replication(MXS_MONITOR *monitor)
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Process possible failover event
|
||||
* If a master has failed, performs failover. This function only works with flat replication topologies
|
||||
* Should be called immediately after @c mon_process_state_changes.
|
||||
*
|
||||
* If a master failure has occurred and MaxScale is configured with failover
|
||||
* functionality, this fuction executes an external failover program to elect
|
||||
* a new master server.
|
||||
*
|
||||
* This function should be called immediately after @c mon_process_state_changes.
|
||||
*
|
||||
* @param monitor Monitor whose cluster is processed
|
||||
* @param failover_timeout Timeout in seconds for the failover
|
||||
* @param cluster_modified_out Set to true if modifying cluster
|
||||
* @return True on success, false on error
|
||||
*
|
||||
* @todo Currently this only works with flat replication topologies and
|
||||
* needs to be moved inside mysqlmon as it is MariaDB specific code.
|
||||
* @param monitor Monitor whose cluster is processed
|
||||
*/
|
||||
bool mon_process_failover(MYSQL_MONITOR* monitor, uint32_t failover_timeout, bool* cluster_modified_out)
|
||||
void mon_process_failover(MYSQL_MONITOR* monitor)
|
||||
{
|
||||
ss_dassert(*cluster_modified_out == false);
|
||||
if (config_get_global_options()->passive ||
|
||||
(monitor->master && SERVER_IS_MASTER(monitor->master->server)))
|
||||
ss_dassert(monitor->cluster_op_performed == false);
|
||||
if (monitor->master && SERVER_IS_MASTER(monitor->master->server))
|
||||
{
|
||||
return true;
|
||||
return;
|
||||
}
|
||||
int failover_timeout = monitor->failover_timeout;
|
||||
|
||||
MXS_MONITORED_SERVER* failed_master = NULL;
|
||||
for (MXS_MONITORED_SERVER *ptr = monitor->monitor->monitored_servers; ptr; ptr = ptr->next)
|
||||
@ -3416,12 +3395,11 @@ bool mon_process_failover(MYSQL_MONITOR* monitor, uint32_t failover_timeout, boo
|
||||
{
|
||||
if (failed_master)
|
||||
{
|
||||
MXS_ALERT("Multiple failed master servers detected: "
|
||||
"'%s' is the first master to fail but server "
|
||||
"'%s' has also triggered a master_down event.",
|
||||
failed_master->server->unique_name,
|
||||
ptr->server->unique_name);
|
||||
return false;
|
||||
MXS_ERROR("Multiple failed master servers detected: '%s' is the first master to fail "
|
||||
"but server '%s' has also triggered a master_down event.",
|
||||
failed_master->server->unique_name, ptr->server->unique_name);
|
||||
delay_auto_cluster_ops(monitor);
|
||||
return;
|
||||
}
|
||||
|
||||
if (ptr->server->active_event)
|
||||
@ -3442,7 +3420,7 @@ bool mon_process_failover(MYSQL_MONITOR* monitor, uint32_t failover_timeout, boo
|
||||
if (t > timeout)
|
||||
{
|
||||
MXS_WARNING("Failover of server '%s' did not take place within "
|
||||
"%u seconds, failover needs to be re-triggered",
|
||||
"%i seconds, failover needs to be re-triggered",
|
||||
ptr->server->unique_name, failover_timeout);
|
||||
failed_master = ptr;
|
||||
}
|
||||
@ -3450,7 +3428,6 @@ bool mon_process_failover(MYSQL_MONITOR* monitor, uint32_t failover_timeout, boo
|
||||
}
|
||||
}
|
||||
|
||||
bool rval = true;
|
||||
if (failed_master)
|
||||
{
|
||||
int failcount = monitor->failcount;
|
||||
@ -3468,12 +3445,15 @@ bool mon_process_failover(MYSQL_MONITOR* monitor, uint32_t failover_timeout, boo
|
||||
monitor->warn_failover_precond = true;
|
||||
MXS_NOTICE("Performing automatic failover to replace failed master '%s'.",
|
||||
failed_master->server->unique_name);
|
||||
failed_master->new_event = false;
|
||||
// If this fails, auto_failover is disabled.
|
||||
rval = do_failover(monitor, NULL);
|
||||
if (rval)
|
||||
if (do_failover(monitor, NULL))
|
||||
{
|
||||
*cluster_modified_out = true;
|
||||
monitor->cluster_op_performed = true;
|
||||
failed_master->new_event = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
MXS_ERROR("Automatic failover failed.");
|
||||
delay_auto_cluster_ops(monitor);
|
||||
}
|
||||
}
|
||||
else
|
||||
@ -3493,8 +3473,6 @@ bool mon_process_failover(MYSQL_MONITOR* monitor, uint32_t failover_timeout, boo
|
||||
{
|
||||
monitor->warn_failover_precond = true;
|
||||
}
|
||||
|
||||
return rval;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -3755,8 +3733,8 @@ bool failover_wait_relay_log(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* new_maste
|
||||
io_pos_stable &&
|
||||
difftime(time(NULL), begin) < seconds_remaining)
|
||||
{
|
||||
MXS_INFO("Relay log of server '%s' not yet empty, waiting to clear %" PRId64 " events.",
|
||||
new_master->server->unique_name, master_info->relay_log_events());
|
||||
MXS_NOTICE("Relay log of server '%s' not yet empty, waiting to clear %" PRId64 " events.",
|
||||
new_master->server->unique_name, master_info->relay_log_events());
|
||||
thread_millisleep(1000); // Sleep for a while before querying server again.
|
||||
// Todo: check server version before entering failover.
|
||||
Gtid old_gtid_io_pos = master_info->slave_status.gtid_io_pos;
|
||||
@ -4916,6 +4894,7 @@ static uint32_t do_rejoin(MYSQL_MONITOR* mon, const ServerVector& joinable_serve
|
||||
{
|
||||
SERVER* master = mon->master->server;
|
||||
uint32_t servers_joined = 0;
|
||||
bool rejoin_error = false;
|
||||
if (!joinable_servers.empty())
|
||||
{
|
||||
string change_cmd = generate_change_master_cmd(mon, master->name, master->port);
|
||||
@ -4947,6 +4926,10 @@ static uint32_t do_rejoin(MYSQL_MONITOR* mon, const ServerVector& joinable_serve
|
||||
MXS_NOTICE("Server '%s' is replicating from a server other than '%s', "
|
||||
"redirecting it to '%s'.", name, master_name, master_name);
|
||||
op_success = redirect_one_slave(joinable, change_cmd.c_str());
|
||||
if (!op_success)
|
||||
{
|
||||
rejoin_error = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (op_success)
|
||||
@ -4955,6 +4938,11 @@ static uint32_t do_rejoin(MYSQL_MONITOR* mon, const ServerVector& joinable_serve
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (rejoin_error)
|
||||
{
|
||||
delay_auto_cluster_ops(mon);
|
||||
}
|
||||
return servers_joined;
|
||||
}
|
||||
|
||||
@ -4993,19 +4981,21 @@ static bool join_cluster(MXS_MONITORED_SERVER* server, const char* change_cmd)
|
||||
return success;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set a monitor config parameter to "false". The effect persists over stopMonitor/startMonitor but not
|
||||
* MaxScale restart. Only use on boolean config settings.
|
||||
*
|
||||
* @param mon Cluster monitor
|
||||
* @param setting_name Setting to disable
|
||||
*/
|
||||
static void disable_setting(MYSQL_MONITOR* mon, const char* setting)
|
||||
static void delay_auto_cluster_ops(MYSQL_MONITOR* mon)
|
||||
{
|
||||
MXS_CONFIG_PARAMETER p = {};
|
||||
p.name = const_cast<char*>(setting);
|
||||
p.value = const_cast<char*>("false");
|
||||
monitorAddParameters(mon->monitor, &p);
|
||||
if (mon->auto_failover || mon->auto_rejoin || mon->enforce_read_only_slaves)
|
||||
{
|
||||
const char DISABLING_AUTO_OPS[] = "Disabling automatic cluster operations for %i monitor ticks.";
|
||||
MXS_NOTICE(DISABLING_AUTO_OPS, mon->failcount);
|
||||
}
|
||||
// + 1 is because the start of the next tick subtracts 1.
|
||||
mon->cluster_operation_disable_timer = mon->failcount + 1;
|
||||
}
|
||||
|
||||
static bool can_perform_cluster_ops(MYSQL_MONITOR* mon)
|
||||
{
|
||||
return (!config_get_global_options()->passive && mon->cluster_operation_disable_timer <= 0 &&
|
||||
!mon->cluster_op_performed);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -5132,6 +5122,7 @@ static bool check_sql_files(MYSQL_MONITOR* mon)
|
||||
static void enforce_read_only_on_slaves(MYSQL_MONITOR* mon)
|
||||
{
|
||||
const char QUERY[] = "SET GLOBAL read_only=1;";
|
||||
bool error = false;
|
||||
for (MXS_MONITORED_SERVER* mon_srv = mon->monitor->monitored_servers; mon_srv; mon_srv = mon_srv->next)
|
||||
{
|
||||
MySqlServerInfo* serv_info = get_server_info(mon, mon_srv);
|
||||
@ -5145,7 +5136,13 @@ static void enforce_read_only_on_slaves(MYSQL_MONITOR* mon)
|
||||
else
|
||||
{
|
||||
MXS_ERROR("Setting read_only on server '%s' failed: '%s.", name, mysql_error(mon_srv->con));
|
||||
error = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (error)
|
||||
{
|
||||
delay_auto_cluster_ops(mon);
|
||||
}
|
||||
}
|
||||
|
@ -82,6 +82,8 @@ typedef struct
|
||||
MXS_MONITORED_SERVER** excluded_servers; /**< Servers banned for master promotion during auto-failover. */
|
||||
const char* promote_sql_file; /**< File with sql commands which are ran to a server being promoted. */
|
||||
const char* demote_sql_file; /**< File with sql commands which are ran to a server being demoted. */
|
||||
int cluster_operation_disable_timer; /**< Counter for temporary automatic cluster operation disabling. */
|
||||
bool cluster_op_performed; /**< Has an automatic failover/rejoin been performed this loop? */
|
||||
|
||||
MXS_MONITOR* monitor;
|
||||
} MYSQL_MONITOR;
|
||||
|
Loading…
x
Reference in New Issue
Block a user