Implement simple failover mode into mysqlmon

The mysqlmon simple failover mode allows it to direct write traffic to a secondary node. This enables a very simple failover mode with MaxScale when it is used in a two node master-slave setup.
2016-09-09 07:03:11 +03:00
parent a4aa03a1fb
commit c919511ba7
4 changed files with 153 additions and 0 deletions
--- a/Documentation/Monitors/MySQL-Monitor.md
+++ b/Documentation/Monitors/MySQL-Monitor.md
@ -119,6 +119,51 @@ This functionality is similar to the [Multi-Master Monitor](MM-Monitor.md)
 functionality. The only difference is that the MySQL monitor will also detect
 traditional Master-Slave topologies.

+### `failover`
+
+Failover mode. This feature takes a boolean parameter and is disabled by default.
+
+This parameter is intended to be used with simple, two node master-slave pairs
+where the failure of the master can be resolved by "promoting" the slave as the
+new master. Normally this is done by using an external agent of some sort
+(possibly triggered by MaxScale's monitor scripts), like
+[MariaDB Replication Manager](https://github.com/tanji/replication-manager)
+or [MHA](https://code.google.com/p/mysql-master-ha/).
+
+The failover mode in mysqlmon is completely passive in the sense that it does
+not modify the cluster or any servers in it. It labels a slave server as a
+master server when there is only one running server. Before a failover can be
+initiated, the following conditions must have been met:
+
+- The monitor has repeatedly failed to connect to the failed servers
+- There is only one running server among the monitored servers
+- @@read_only is not enabled on the last running server
+
+When these conditions are met, the monitor assigns the last remaining server the
+master status and puts all other servers into maintenance mode. This is done to
+prevent accidental use of the failed servers if they come back online.
+
+When the failed servers come back up, the maintenance mode needs to be manually
+cleared once replication has been set up.
+
+**Note**: A failover will cause permanent changes in the data of the promoted
+  server. Only use this feature if you know that the slave servers are capable
+  of acting as master servers.
+
+### `failcount`
+
+Number of failures that must occur on all failed servers before a failover is
+initiated. The default value is 5 failures.
+
+The monitor will attempt to contact all servers once per monitoring cycle. When
+_failover_ mode is enabled, all of the failed servers must fail _failcount_
+number of connection attempts before a failover is initiated.
+
+The formula for calculating the actual number of milliseconds before failover
+can start is `monitor_interval * failcount`. This means that to trigger a
+failover after 10 seconds of master failure with a _monitor_interval_ of 1000
+milliseconds, the value of _failcount_ must be 10.
+
 ## Example 1 - Monitor script

 Here is an example shell script which sends an email to an admin when a server goes down.
--- a/server/core/config.c
+++ b/server/core/config.c
@ -185,6 +185,8 @@ static char *monitor_params[] =
    "disable_master_role_setting",
    "use_priority",
    "multimaster",
+    "failover",
+    "failcount",
    NULL
 };

--- a/server/modules/monitor/mysqlmon.h
+++ b/server/modules/monitor/mysqlmon.h
@ -50,6 +50,8 @@
 * @endverbatim
 */

+#define MYSQLMON_DEFAULT_FAILCOUNT 5
+
 /**
 * The handle for an instance of a MySQL Monitor module
 */
@ -72,6 +74,9 @@ typedef struct
    char* script; /*< Script to call when state changes occur on servers */
    bool events[MAX_MONITOR_EVENT]; /*< enabled events */
    HASHTABLE *server_info; /**< Contains server specific information */
+    bool failover; /**< If simple failover is enabled */
+    int failcount; /**< How many monitoring cycles servers must be
+                                   down before failover is initiated */
 } MYSQL_MONITOR;

 #endif
--- a/server/modules/monitor/mysqlmon/mysql_mon.c
+++ b/server/modules/monitor/mysqlmon/mysql_mon.c
@ -273,6 +273,8 @@ startMonitor(MONITOR *monitor, const CONFIG_PARAMETER* params)
        handle->script = NULL;
        handle->multimaster = false;
        handle->mysql51_replication = false;
+        handle->failover = false;
+        handle->failcount = MYSQLMON_DEFAULT_FAILCOUNT;
        memset(handle->events, false, sizeof(handle->events));
        spinlock_init(&handle->lock);
    }
@ -295,6 +297,19 @@ startMonitor(MONITOR *monitor, const CONFIG_PARAMETER* params)
        {
            handle->multimaster = config_truth_value(params->value);
        }
+        else if (!strcmp(params->name, "failover"))
+        {
+            handle->failover = config_truth_value(params->value);
+        }
+        else if (!strcmp(params->name, "failcount"))
+        {
+            handle->failcount = atoi(params->value);
+            if (handle->failcount <= 0)
+            {
+                MXS_ERROR("[%s] Invalid value for 'failcount': %s", monitor->name, params->value);
+                error = true;
+            }
+        }
        else if (!strcmp(params->name, "script"))
        {
            if (externcmd_can_execute(params->value))
@ -352,6 +367,7 @@ startMonitor(MONITOR *monitor, const CONFIG_PARAMETER* params)
        hashtable_free(handle->server_info);
        MXS_FREE(handle->script);
        MXS_FREE(handle);
+        handle = NULL;
    }
    else if (thread_start(&handle->thread, monitorMain, monitor) == NULL)
    {
@ -1021,6 +1037,80 @@ void find_graph_cycles(MYSQL_MONITOR *handle, MONITOR_SERVERS *database, int nse
    }
 }

+/**
+ * @brief Check whether failover conditions have been met
+ *
+ * This function checks whether all the conditions to trigger a failover have
+ * been met. For a failover to happen, exactly one server must be running, that
+ * server must not have read_only set in its server info, and every server that
+ * is not running must have a mon_err_count of at least handle->failcount.
+ *
+ * A running server that has no entry in handle->server_info is treated as not
+ * eligible: its read_only state cannot be checked, so no failover is started.
+ *
+ * @param handle Monitor instance
+ * @param db     Monitor servers
+ *
+ * @return True if failover is required
+ */
+bool failover_required(MYSQL_MONITOR *handle, MONITOR_SERVERS *db)
+{
+    int candidates = 0;
+
+    while (db)
+    {
+        if (SERVER_IS_RUNNING(db->server))
+        {
+            candidates++;
+            MYSQL_SERVER_INFO *server_info = hashtable_fetch(handle->server_info, db->server->unique_name);
+
+            /* A missing entry must not be dereferenced; without it the
+             * read_only state is unknown, so refuse to fail over. */
+            if (!server_info || server_info->read_only || candidates > 1)
+            {
+                return false;
+            }
+        }
+        else if (db->mon_err_count < handle->failcount)
+        {
+            /* Error count of this server is still below the threshold */
+            return false;
+        }
+
+        db = db->next;
+    }
+
+    return candidates == 1;
+}
+
+/**
+ * @brief Initiate simple failover
+ *
+ * This function does the actual failover by assigning the last remaining server
+ * the master status and setting all other servers into maintenance mode. By
+ * setting the servers into maintenance mode, we prevent any possible conflicts
+ * when the failed servers come back up.
+ *
+ * The whole server list is walked once. For every running server the pending
+ * status gets SERVER_MASTER set and SERVER_SLAVE cleared; for every server that
+ * is not running the pending status gets SERVER_MAINT set. Only pending status
+ * bits are changed here. The function does not itself check how many servers
+ * are running, so every running server in the list is labelled master.
+ *
+ * A warning is logged for a running server only when it does not already have
+ * the master status.
+ *
+ * @param handle Monitor instance (not referenced in the function body)
+ * @param db     Monitor servers
+ */
+void do_failover(MYSQL_MONITOR *handle, MONITOR_SERVERS *db)
+{
+    while (db)
+    {
+        if (SERVER_IS_RUNNING(db->server))
+        {
+            if (!SERVER_IS_MASTER(db->server)) /* log only when the master role is new */
+            {
+                MXS_WARNING("Failover initiated, server '%s' is now the master. "
+                            "All other servers are set into maintenance mode.",
+                            db->server->unique_name);
+            }
+
+            monitor_set_pending_status(db, SERVER_MASTER);
+            monitor_clear_pending_status(db, SERVER_SLAVE);
+        }
+        else /* server is not running */
+        {
+            monitor_set_pending_status(db, SERVER_MAINT);
+        }
+        db = db->next;
+    }
+}
+
 /**
 * The entry point for the monitoring module thread
 *
@ -1296,6 +1386,17 @@ monitorMain(void *arg)
            ptr = ptr->next;
        }

+        /** Now that all servers have their status correctly set, we can check
+            if we need to do a failover */
+        if (handle->failover)
+        {
+            if (failover_required(handle, mon->databases))
+            {
+                /** Other servers have died, initiate a failover to the last remaining server */
+                do_failover(handle, mon->databases);
+            }
+        }
+
        ptr = mon->databases;
        monitor_event_t evtype;
        while (ptr)