diff --git a/Documentation/Monitors/Monitor-Common.md b/Documentation/Monitors/Monitor-Common.md index e7107dc8a..f85c1a546 100644 --- a/Documentation/Monitors/Monitor-Common.md +++ b/Documentation/Monitors/Monitor-Common.md @@ -115,19 +115,6 @@ If the script execution exceeds the configured timeout, it is stopped by sending a SIGTERM signal to it. If the process does not stop, a SIGKILL signal will be sent to it once the execution time is greater than twice the configured timeout. -### `failover_timeout` - -The timeout for the cluster failover in seconds. The default value is 90 -seconds. - -If no successful failover takes place within the configured time period, a -message is logged and the failover functionality is disabled. - -This parameter also controls how long a MaxScale instance that has transitioned -from passive to active will wait for a failover to take place after an apparent -loss of a master server. If no new master server is detected within the -configured time period, the failover will be initiated again. - ### `events` A list of event names which cause the script to be executed. If this option is not defined, all events cause the script to be executed. The list must contain a comma separated list of event names. diff --git a/Documentation/Monitors/MySQL-Monitor.md b/Documentation/Monitors/MySQL-Monitor.md index a22d93d6d..a0a3fc859 100644 --- a/Documentation/Monitors/MySQL-Monitor.md +++ b/Documentation/Monitors/MySQL-Monitor.md @@ -214,6 +214,35 @@ assigned the _Slave_ status which allows them to be used like normal slave servers. When the option is disabled, the servers will only receive the _Slave of External Server_ status and they will not be used. +### `failover` + +Enable automated master failover. This parameter expects a boolean value and the +default value is false. + +When the failover functionality is enabled, traditional MariaDB Master-Slave +clusters will automatically elect a new master if the old master goes down. 
The +failover functionality will not take place when MaxScale is configured as a +passive instance. For details on how MaxScale behaves in passive mode, see the +following documentation of `failover_timeout`. + +If an attempt at failover fails or multiple master servers are detected, an +error is logged and the failover functionality is disabled. If this happens, the +cluster must be fixed manually and the failover needs to be re-enabled via the +REST API or MaxAdmin. + +### `failover_timeout` + +The timeout for the cluster failover in seconds. The default value is 90 +seconds. + +If no successful failover takes place within the configured time period, a +message is logged and the failover functionality is disabled. + +This parameter also controls how long a MaxScale instance that has transitioned +from passive to active will wait for a failover to take place after an apparent +loss of a master server. If no new master server is detected within the +configured time period, the failover will be initiated again. + ## Using the MySQL Monitor With Binlogrouter Since MaxScale 2.2 it's possible to detect a replication setup @@ -252,5 +281,7 @@ script=mail_to_admin.sh events=master_down,slave_down ``` -When a master or a slave server goes down, the script is executed, a mail is sent and the administrator will be immediately notified of any possible problems. -This is just a simple example showing what you can do with MaxScale and monitor scripts. \ No newline at end of file +When a master or a slave server goes down, the script is executed, a mail is +sent and the administrator will be immediately notified of any possible +problems. This is just a simple example showing what you can do with MaxScale +and monitor scripts. 
diff --git a/include/maxscale/monitor.h b/include/maxscale/monitor.h index 50b4468c8..51f4d2b82 100644 --- a/include/maxscale/monitor.h +++ b/include/maxscale/monitor.h @@ -289,6 +289,22 @@ void release_monitor_servers(MXS_MONITOR *monitor); */ void mon_process_state_changes(MXS_MONITOR *monitor, const char *script, uint64_t events); +/** + * @brief Process possible failover event + * + * If a master failure has occurred and MaxScale is configured with failover + * functionality, this function executes an external failover program to elect + * a new master server. + * + * This function should be called immediately after @c mon_process_state_changes. + * + * @param monitor Monitor whose cluster is processed + * + * @todo Currently this only works with flat replication topologies and + * needs to be moved inside mysqlmon as it is MariaDB specific code. + */ +void mon_process_failover(MXS_MONITOR *monitor); + /** * @brief Hangup connections to failed servers * diff --git a/server/core/monitor.cc b/server/core/monitor.cc index 689a3b570..44227b09b 100644 --- a/server/core/monitor.cc +++ b/server/core/monitor.cc @@ -1728,9 +1728,6 @@ void servers_status_current_to_pending(MXS_MONITOR *monitor) void mon_process_state_changes(MXS_MONITOR *monitor, const char *script, uint64_t events) { - MXS_CONFIG* cnf = config_get_global_options(); - MXS_MONITOR_SERVERS* failed_master = NULL; - for (MXS_MONITOR_SERVERS *ptr = monitor->databases; ptr; ptr = ptr->next) { if (mon_status_changed(ptr)) @@ -1752,11 +1749,6 @@ void mon_process_state_changes(MXS_MONITOR *monitor, const char *script, uint64_ if (event == MASTER_DOWN_EVENT) { monitor->last_master_down = hkheartbeat; - - if (monitor->failover && !cnf->passive) - { - failed_master = ptr; - } } else if (event == MASTER_UP_EVENT || event == NEW_MASTER_EVENT) { @@ -1768,6 +1760,40 @@ void mon_process_state_changes(MXS_MONITOR *monitor, const char *script, uint64_ monitor_launch_script(monitor, ptr, script, monitor->script_timeout); } } 
+ } +} + +void mon_process_failover(MXS_MONITOR *monitor) +{ + MXS_CONFIG* cnf = config_get_global_options(); + MXS_MONITOR_SERVERS* failed_master = NULL; + + for (MXS_MONITOR_SERVERS *ptr = monitor->databases; ptr; ptr = ptr->next) + { + if (mon_status_changed(ptr)) + { + if (ptr->server->last_event == MASTER_DOWN_EVENT) + { + if (monitor->failover && !cnf->passive) + { + if (failed_master) + { + MXS_ALERT("Multiple failed master servers detected: " + "'%s' is the first master to fail but server " + "'%s' has also triggered a master_down event." + "Aborting and disabling failover.", + failed_master->server->unique_name, + ptr->server->unique_name); + monitorSetFailover(monitor, false); + return; + } + else + { + failed_master = ptr; + } + } + } + } else { /** diff --git a/server/modules/monitor/mysqlmon/mysql_mon.c b/server/modules/monitor/mysqlmon/mysql_mon.c index aa920a5be..77b14f94e 100644 --- a/server/modules/monitor/mysqlmon/mysql_mon.c +++ b/server/modules/monitor/mysqlmon/mysql_mon.c @@ -1401,6 +1401,7 @@ monitorMain(void *arg) * need to be launched. */ mon_process_state_changes(mon, handle->script, handle->events); + mon_process_failover(mon); /* log master detection failure of first master becomes available after failure */ if (root_master &&