MXS-1491: Failover can be executed manually

Also, renamed config setting "failover" to "auto_failover". Removed
setting "switchover" as it is now always enabled.
This commit is contained in:
Esa Korhonen
2017-11-28 17:31:13 +02:00
parent 90f6d78a58
commit 508ce3a703
3 changed files with 166 additions and 59 deletions

View File

@ -63,9 +63,8 @@ typedef struct
bool allow_cluster_recovery; /**< Allow failed servers to rejoin the cluster */
bool warn_set_standalone_master; /**< Log a warning when setting standalone master */
bool allow_external_slaves; /**< Whether to allow usage of external slave servers */
bool failover; /**< If master failover is enabled */
bool auto_failover; /**< If automatic master failover is enabled */
uint32_t failover_timeout; /**< Timeout in seconds for the master failover */
bool switchover; /**< If master switchover is enabled */
uint32_t switchover_timeout; /**< Timeout in seconds for the master switchover */
char* replication_user; /**< Replication user for failover */
char* replication_password; /**< Replication password for failover*/

View File

@ -99,7 +99,7 @@ static int add_slave_to_master(long *, int, long);
static bool isMySQLEvent(mxs_monitor_event_t event);
void check_maxscale_schema_replication(MXS_MONITOR *monitor);
static bool mon_process_failover(MYSQL_MONITOR*, uint32_t, bool*);
static bool do_failover(MYSQL_MONITOR* mon);
static bool do_failover(MYSQL_MONITOR* mon, json_t** output);
static bool do_switchover(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* current_master,
MXS_MONITORED_SERVER* new_master,json_t** err_out);
static bool update_gtids(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER *database, MySqlServerInfo* info);
@ -114,9 +114,8 @@ static void disable_setting(MYSQL_MONITOR* mon, const char* setting);
static bool report_version_err = true;
static const char* hb_table_name = "maxscale_schema.replication_heartbeat";
static const char CN_FAILOVER[] = "failover";
static const char CN_AUTO_FAILOVER[] = "auto_failover";
static const char CN_FAILOVER_TIMEOUT[] = "failover_timeout";
static const char CN_SWITCHOVER[] = "switchover";
static const char CN_SWITCHOVER_TIMEOUT[] = "switchover_timeout";
static const char CN_AUTO_JOIN[] = "auto_join";
@ -124,7 +123,7 @@ static const char CN_AUTO_JOIN[] = "auto_join";
static const char CN_VERIFY_MASTER_FAILURE[] = "verify_master_failure";
static const char CN_MASTER_FAILURE_TIMEOUT[] = "master_failure_timeout";
// Replication credentials parameters for failover
// Replication credentials parameters for failover/switchover/join
static const char CN_REPLICATION_USER[] = "replication_user";
static const char CN_REPLICATION_PASSWORD[] = "replication_password";
@ -285,6 +284,47 @@ bool mysql_switchover_check(MXS_MONITOR* mon,
return rv;
}
/**
* Check that preconditions for a failover are met.
*
* @param mon Cluster monitor
* @param error_out JSON error out
* @return True if failover may proceed
*/
bool mysql_failover_check(MYSQL_MONITOR* mon, json_t** error_out)
{
// Check that there is no running master and that there is at least one running server in the cluster.
int slaves = 0;
for (MXS_MONITORED_SERVER* mon_server = mon->monitor->monitored_servers;
mon_server != NULL;
mon_server = mon_server->next)
{
uint64_t status_bits = mon_server->server->status;
uint64_t master_up = (SERVER_MASTER | SERVER_RUNNING);
if ((status_bits & master_up) == master_up)
{
string master_up_msg = string("Master server '") + mon_server->server->unique_name +
"' is running";
if (status_bits & SERVER_MAINT)
{
master_up_msg += ", although in maintenance mode";
}
master_up_msg += ".";
PRINT_MXS_JSON_ERROR(error_out, "%s Failover not allowed.", master_up_msg.c_str());
return false;
}
else if (SERVER_IS_SLAVE(mon_server->server))
{
slaves++;
}
}
if (slaves == 0)
{
PRINT_MXS_JSON_ERROR(error_out, "No running slaves, cannot failover.");
}
return slaves > 0;
}
/**
* Handle switchover
*
@ -324,7 +364,7 @@ bool mysql_switchover(MXS_MONITOR* mon, SERVER* new_master, SERVER* current_mast
if (rv)
{
bool failover = config_get_bool(mon->parameters, CN_FAILOVER);
bool failover = config_get_bool(mon->parameters, CN_AUTO_FAILOVER);
rv = do_switchover(handle, monitored_current_master, monitored_new_master, output);
if (rv)
@ -344,7 +384,7 @@ bool mysql_switchover(MXS_MONITOR* mon, SERVER* new_master, SERVER* current_mast
{
// TODO: There could be a more convenient way for this.
MXS_CONFIG_PARAMETER p = {};
p.name = const_cast<char*>(CN_FAILOVER);
p.name = const_cast<char*>(CN_AUTO_FAILOVER);
p.value = const_cast<char*>("false");
monitorAddParameters(mon, &p);
@ -396,24 +436,10 @@ bool mysql_handle_switchover(const MODULECMD_ARG* args, json_t** output)
bool rv = false;
if (!config_get_global_options()->passive)
{
if (mysql_mon->switchover)
{
rv = mysql_switchover(mon, new_master, current_master, output);
}
else
{
MXS_WARNING("Attempt to perform switchover %s -> %s, even though "
"switchover is not enabled.",
current_master ? current_master->unique_name : "none",
new_master->unique_name);
*output = mxs_json_error("Switchover %s -> %s not performed, as switchover is not enabled.",
current_master ? current_master->unique_name : "none",
new_master->unique_name);
}
}
else
{
MXS_WARNING("Attempt to perform switchover %s -> %s, even though "
"MaxScale is in passive mode.",
@ -427,6 +453,80 @@ bool mysql_handle_switchover(const MODULECMD_ARG* args, json_t** output)
return rv;
}
/**
* Perform user-activated failover
*
* @param mon Cluster monitor
* @param output Json error output
* @return True on success
*/
bool mysql_failover(MXS_MONITOR* mon, json_t** output)
{
bool rv = true;
MYSQL_MONITOR *handle = static_cast<MYSQL_MONITOR*>(mon->handle);
bool stopped = stop_monitor(mon);
if (stopped)
{
MXS_NOTICE("Stopped monitor %s for the duration of failover.", mon->name);
}
else
{
MXS_NOTICE("Monitor %s already stopped, failover can proceed.", mon->name);
}
rv = mysql_failover_check(handle, output);
if (rv)
{
rv = do_failover(handle, output);
if (rv)
{
MXS_NOTICE("Failover performed.");
if (stopped)
{
startMonitor(mon, mon->parameters);
}
}
else
{
PRINT_MXS_JSON_ERROR(output, "Failover failed.");
}
}
else
{
if (stopped)
{
startMonitor(mon, mon->parameters);
}
}
return rv;
}
/**
* Command handler for 'failover'
*
* @param args Arguments given by user
* @param output Json error output
* @return True on success
*/
bool mysql_handle_failover(const MODULECMD_ARG* args, json_t** output)
{
ss_dassert(args->argc == 1);
ss_dassert(MODULECMD_GET_TYPE(&args->argv[0].type) == MODULECMD_ARG_MONITOR);
MXS_MONITOR* mon = args->argv[0].value.monitor;
bool rv = false;
if (!config_get_global_options()->passive)
{
rv = mysql_failover(mon, output);
}
else
{
PRINT_MXS_JSON_ERROR(output, "Failover attempted but not performed, as MaxScale is in passive mode.");
}
return rv;
}
/**
* The module entry point routine. It is this routine that
* must populate the structure that is referred to as the
@ -441,12 +541,12 @@ extern "C"
MXS_MODULE* MXS_CREATE_MODULE()
{
MXS_NOTICE("Initialise the MySQL Monitor module.");
const char ARG_MONITOR_DESC[] = "MySQL Monitor name (from configuration file)";
static modulecmd_arg_type_t switchover_argv[] =
{
{
MODULECMD_ARG_MONITOR | MODULECMD_ARG_NAME_MATCHES_DOMAIN,
"MySQL Monitor name (from configuration file)"
ARG_MONITOR_DESC
},
{ MODULECMD_ARG_SERVER, "New master" },
{ MODULECMD_ARG_SERVER | MODULECMD_ARG_OPTIONAL, "Current master (obligatory if exists)" }
@ -456,6 +556,18 @@ MXS_MODULE* MXS_CREATE_MODULE()
mysql_handle_switchover, MXS_ARRAY_NELEMS(switchover_argv), switchover_argv,
"Perform master switchover");
static modulecmd_arg_type_t failover_argv[] =
{
{
MODULECMD_ARG_MONITOR | MODULECMD_ARG_NAME_MATCHES_DOMAIN,
ARG_MONITOR_DESC
},
};
modulecmd_register_command(MXS_MODULE_NAME, "failover", MODULECMD_TYPE_ACTIVE,
mysql_handle_failover, MXS_ARRAY_NELEMS(failover_argv), failover_argv,
"Perform master failover");
static MXS_MONITOR_OBJECT MyObject =
{
startMonitor,
@ -500,9 +612,8 @@ MXS_MODULE* MXS_CREATE_MODULE()
MXS_MODULE_OPT_NONE,
mxs_monitor_event_enum_values
},
{CN_FAILOVER, MXS_MODULE_PARAM_BOOL, "false"},
{CN_AUTO_FAILOVER, MXS_MODULE_PARAM_BOOL, "false"},
{CN_FAILOVER_TIMEOUT, MXS_MODULE_PARAM_COUNT, DEFAULT_FAILOVER_TIMEOUT},
{CN_SWITCHOVER, MXS_MODULE_PARAM_BOOL, "false"},
{CN_SWITCHOVER_TIMEOUT, MXS_MODULE_PARAM_COUNT, DEFAULT_SWITCHOVER_TIMEOUT},
{CN_REPLICATION_USER, MXS_MODULE_PARAM_STRING},
{CN_REPLICATION_PASSWORD, MXS_MODULE_PARAM_STRING},
@ -820,9 +931,8 @@ startMonitor(MXS_MONITOR *monitor, const MXS_CONFIG_PARAMETER* params)
handle->script = config_copy_string(params, "script");
handle->events = config_get_enum(params, "events", mxs_monitor_event_enum_values);
handle->allow_external_slaves = config_get_bool(params, "allow_external_slaves");
handle->failover = config_get_bool(params, CN_FAILOVER);
handle->auto_failover = config_get_bool(params, CN_AUTO_FAILOVER);
handle->failover_timeout = config_get_integer(params, CN_FAILOVER_TIMEOUT);
handle->switchover = config_get_bool(params, CN_SWITCHOVER);
handle->switchover_timeout = config_get_integer(params, CN_SWITCHOVER_TIMEOUT);
handle->verify_master_failure = config_get_bool(params, CN_VERIFY_MASTER_FAILURE);
handle->master_failure_timeout = config_get_integer(params, CN_MASTER_FAILURE_TIMEOUT);
@ -921,9 +1031,8 @@ static void diagnostics(DCB *dcb, const MXS_MONITOR *mon)
{
const MYSQL_MONITOR *handle = (const MYSQL_MONITOR *)mon->handle;
dcb_printf(dcb, "Failover:\t%s\n", handle->failover ? "Enabled" : "Disabled");
dcb_printf(dcb, "Automatic failover:\t%s\n", handle->auto_failover ? "Enabled" : "Disabled");
dcb_printf(dcb, "Failover Timeout:\t%u\n", handle->failover_timeout);
dcb_printf(dcb, "Switchover:\t%s\n", handle->switchover ? "Enabled" : "Disabled");
dcb_printf(dcb, "Switchover Timeout:\t%u\n", handle->switchover_timeout);
dcb_printf(dcb, "Auto join:\t%s\n", handle->auto_join_cluster ? "Enabled" : "Disabled");
dcb_printf(dcb, "MaxScale MonitorId:\t%lu\n", handle->id);
@ -972,9 +1081,8 @@ static json_t* diagnostics_json(const MXS_MONITOR *mon)
json_object_set_new(rval, "failcount", json_integer(handle->failcount));
json_object_set_new(rval, "allow_cluster_recovery", json_boolean(handle->allow_cluster_recovery));
json_object_set_new(rval, "mysql51_replication", json_boolean(handle->mysql51_replication));
json_object_set_new(rval, CN_FAILOVER, json_boolean(handle->failover));
json_object_set_new(rval, CN_AUTO_FAILOVER, json_boolean(handle->auto_failover));
json_object_set_new(rval, CN_FAILOVER_TIMEOUT, json_integer(handle->failover_timeout));
json_object_set_new(rval, CN_SWITCHOVER, json_boolean(handle->switchover));
json_object_set_new(rval, CN_SWITCHOVER_TIMEOUT, json_integer(handle->switchover_timeout));
json_object_set_new(rval, CN_AUTO_JOIN, json_boolean(handle->auto_join_cluster));
@ -2100,17 +2208,18 @@ monitorMain(void *arg)
mon_process_state_changes(mon, handle->script, handle->events);
bool failover_performed = false; // Has an automatic failover been performed this loop?
if (handle->failover)
if (handle->auto_failover)
{
const char RE_ENABLE_FMT[] = "%s To re-enable failover, manually set '%s' to 'true' for monitor "
"'%s' via MaxAdmin or the REST API, or restart MaxScale.";
if (failover_not_possible(handle))
{
MXS_ERROR("Failover is not possible due to one or more problems in "
"the replication configuration, disabling failover. "
"Failover should only be enabled after the replication "
"configuration has been fixed. To re-enable failover "
"functionality, manually set '%s' to 'true' for monitor "
"'%s' via MaxAdmin or the REST API.", CN_FAILOVER, mon->name);
handle->failover = false;
const char PROBLEMS[] = "Failover is not possible due to one or more problems in the "
"replication configuration, disabling automatic failover. Failover should only be "
"enabled after the replication configuration has been fixed.";
MXS_ERROR(RE_ENABLE_FMT, PROBLEMS, CN_AUTO_FAILOVER, mon->name);
handle->auto_failover = false;
disable_setting(handle, CN_AUTO_FAILOVER);
}
else if (master_maybe_dead(handle) && master_still_alive(handle))
{
@ -2118,12 +2227,10 @@ monitorMain(void *arg)
}
else if (!mon_process_failover(handle, handle->failover_timeout, &failover_performed))
{
MXS_ALERT("Failed to perform failover, disabling failover functionality. "
"To enable failover functionality, manually set 'failover' to "
"'true' for monitor '%s' via MaxAdmin or the REST API.", mon->name);
mon_alter_parameter(handle->monitor, CN_FAILOVER, "false");
handle->failover = false;
const char FAILED[] = "Failed to perform failover, disabling automatic failover.";
MXS_ERROR(RE_ENABLE_FMT, FAILED, CN_AUTO_FAILOVER, mon->name);
handle->auto_failover = false;
disable_setting(handle, CN_AUTO_FAILOVER);
}
}
@ -2977,7 +3084,7 @@ bool mon_process_failover(MYSQL_MONITOR* monitor, uint32_t failover_timeout, boo
MXS_NOTICE("Performing automatic failover to replace failed master '%s'.",
failed_master->server->unique_name);
failed_master->new_event = false;
rval = do_failover(monitor);
rval = mysql_failover_check(monitor, NULL) && do_failover(monitor, NULL);
if (rval)
{
*cluster_modified_out = true;
@ -3170,9 +3277,10 @@ MXS_MONITORED_SERVER* failover_select_new_master(MYSQL_MONITOR* mon,
*
* @param mon The monitor
* @param new_master The new master
* @param err_out Json error output
* @return True if relay log was processed within time limit, or false if time ran out or an error occurred.
*/
bool failover_wait_relay_log(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* new_master)
bool failover_wait_relay_log(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* new_master, json_t** err_out)
{
MySqlServerInfo* master_info = get_server_info(mon, new_master);
time_t begin = time(NULL);
@ -3216,7 +3324,8 @@ bool failover_wait_relay_log(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* new_maste
reason = "Invalid Gtid(s) (current_pos: " + master_info->gtid_current_pos.to_string() +
", io_pos: " + master_info->slave_status.gtid_io_pos.to_string() + ")";
}
MXS_ERROR("Failover: %s while waiting for server '%s' to process relay log. Cancelling failover.",
PRINT_MXS_JSON_ERROR(err_out, "Failover: %s while waiting for server '%s' to process relay log. "
"Cancelling failover.",
reason.c_str(), new_master->server->unique_name);
rval = false;
}
@ -3318,28 +3427,29 @@ int failover_redirect_slaves(MYSQL_MONITOR* mon, ServerVector& slaves, MXS_MONIT
* Performs failover for a simple topology (1 master, N slaves, no intermediate masters).
*
* @param mon Server cluster monitor
* @param err_out Json output
* @return True if successful
*/
static bool do_failover(MYSQL_MONITOR* mon)
static bool do_failover(MYSQL_MONITOR* mon, json_t** err_out)
{
// Topology has already been tested to be simple.
if (mon->master_gtid_domain < 0)
{
MXS_ERROR("Cluster gtid domain is unknown. Cannot failover.");
PRINT_MXS_JSON_ERROR(err_out, "Cluster gtid domain is unknown. Cannot failover.");
return false;
}
// Step 1: Select new master. Also populate a vector with all slaves not the selected master.
ServerVector slaves;
MXS_MONITORED_SERVER* new_master = failover_select_new_master(mon, &slaves, NULL);
MXS_MONITORED_SERVER* new_master = failover_select_new_master(mon, &slaves, err_out);
if (new_master == NULL)
{
return false;
}
bool rval = false;
// Step 2: Wait until relay log consumed.
if (failover_wait_relay_log(mon, new_master) &&
if (failover_wait_relay_log(mon, new_master, err_out) &&
// Step 3: Stop and reset slave, set read-only to 0.
failover_promote_new_master(mon, new_master, NULL))
failover_promote_new_master(mon, new_master, err_out))
{
// Step 4: Redirect slaves.
int redirects = failover_redirect_slaves(mon, slaves, new_master);

View File

@ -1585,8 +1585,6 @@ struct subcommand alteroptions[] =
"backend_connect_attempts Number of re-connection attempts\n"
"journal_max_age Maximum age of server state journal\n"
"script_timeout Timeout in seconds for monitor scripts\n"
"failover Enable or disable failover\n"
"failover_timeout Failover timeout in seconds\n"
"\n"
"This will alter an existing parameter of a monitor. To remove parameters,\n"
"pass an empty value for a key e.g. 'maxadmin alter monitor my-monitor my-key='\n"