Implement simple failover mode into mysqlmon

The mysqlmon simple failover mode allows it to direct write traffic to a
secondary node. This enables a very simple failover mode with MaxScale
when it is used in a two node master-slave setup.
This commit is contained in:
Markus Makela 2016-09-09 07:03:11 +03:00
parent a4aa03a1fb
commit c919511ba7
4 changed files with 153 additions and 0 deletions

View File

@ -119,6 +119,51 @@ This functionality is similar to the [Multi-Master Monitor](MM-Monitor.md)
functionality. The only difference is that the MySQL monitor will also detect
traditional Master-Slave topologies.
### `failover`
Failover mode. This feature takes a boolean parameter and is disabled by default.
This parameter is intended to be used with simple, two node master-slave pairs
where the failure of the master can be resolved by "promoting" the slave as the
new master. Normally this is done by using an external agent of some sort
(possibly triggered by MaxScale's monitor scripts), like
[MariaDB Replication Manager](https://github.com/tanji/replication-manager)
or [MHA](https://code.google.com/p/mysql-master-ha/).
The failover mode in mysqlmon is completely passive in the sense that it does
not modify the cluster or any servers in it. It labels a slave server as a
master server when there is only one running server. Before a failover can be
initiated, the following conditions must have been met:
- The monitor has repeatedly failed to connect to the failed servers
- There is only one running server among the monitored servers
- @@read_only is not enabled on the last running server
When these conditions are met, the monitor assigns the last remaining server the
master status and puts all other servers into maintenance mode. This is done to
prevent accidental use of the failed servers if they come back online.
When the failed servers come back up, the maintenance mode needs to be manually
cleared once replication has been set up.
**Note**: A failover will cause permanent changes in the data of the promoted
server. Only use this feature if you know that the slave servers are capable
of acting as master servers.
### `failcount`
Number of failures that must occur on all failed servers before a failover is
initiated. The default value is 5 failures.
The monitor will attempt to contact all servers once per monitoring cycle. When
_failover_ mode is enabled, all of the failed servers must fail _failcount_
number of connection attempts before a failover is initiated.
The formula for calculating the actual number of milliseconds before failover
can start is `monitor_interval * failcount`. This means that to trigger a
failover after 10 seconds of master failure with a _monitor_interval_ of 1000
milliseconds, the value of _failcount_ must be 10.
## Example 1 - Monitor script
Here is an example shell script which sends an email to an admin when a server goes down.

View File

@ -185,6 +185,8 @@ static char *monitor_params[] =
"disable_master_role_setting",
"use_priority",
"multimaster",
"failover",
"failcount",
NULL
};

View File

@ -50,6 +50,8 @@
* @endverbatim
*/
/** Default value of the monitor's failcount setting: how many monitoring
 * cycles servers must be down before a failover is initiated */
#define MYSQLMON_DEFAULT_FAILCOUNT 5
/**
* The handle for an instance of a MySQL Monitor module
*/
@ -72,6 +74,9 @@ typedef struct
char* script; /*< Script to call when state changes occur on servers */
bool events[MAX_MONITOR_EVENT]; /*< enabled events */
HASHTABLE *server_info; /**< Contains server specific information */
bool failover; /**< If simple failover is enabled */
int failcount; /**< How many monitoring cycles servers must be
down before failover is initiated */
} MYSQL_MONITOR;
#endif

View File

@ -273,6 +273,8 @@ startMonitor(MONITOR *monitor, const CONFIG_PARAMETER* params)
handle->script = NULL;
handle->multimaster = false;
handle->mysql51_replication = false;
handle->failover = false;
handle->failcount = MYSQLMON_DEFAULT_FAILCOUNT;
memset(handle->events, false, sizeof(handle->events));
spinlock_init(&handle->lock);
}
@ -295,6 +297,19 @@ startMonitor(MONITOR *monitor, const CONFIG_PARAMETER* params)
{
handle->multimaster = config_truth_value(params->value);
}
else if (!strcmp(params->name, "failover"))
{
handle->failover = config_truth_value(params->value);
}
else if (!strcmp(params->name, "failcount"))
{
handle->failcount = atoi(params->value);
if (handle->failcount <= 0)
{
MXS_ERROR("[%s] Invalid value for 'failcount': %s", monitor->name, params->value);
error = true;
}
}
else if (!strcmp(params->name, "script"))
{
if (externcmd_can_execute(params->value))
@ -352,6 +367,7 @@ startMonitor(MONITOR *monitor, const CONFIG_PARAMETER* params)
hashtable_free(handle->server_info);
MXS_FREE(handle->script);
MXS_FREE(handle);
handle = NULL;
}
else if (thread_start(&handle->thread, monitorMain, monitor) == NULL)
{
@ -1021,6 +1037,80 @@ void find_graph_cycles(MYSQL_MONITOR *handle, MONITOR_SERVERS *database, int nse
}
}
/**
 * @brief Check whether failover conditions have been met
 *
 * Walks the list of monitored servers and decides whether a simple failover
 * should be triggered. A failover is required only when all of the following
 * hold:
 *
 * - exactly one server in the list is running
 * - the server info stored for that server does not have read_only set
 * - every server that is not running has a mon_err_count of at least
 *   handle->failcount
 *
 * If no server info entry can be found for the running server, its read_only
 * state cannot be verified and no failover is requested.
 *
 * @param handle Monitor instance
 * @param db     Head of the list of monitored servers
 *
 * @return True if failover is required
 */
bool failover_required(MYSQL_MONITOR *handle, MONITOR_SERVERS *db)
{
    int candidates = 0;

    for (; db != NULL; db = db->next)
    {
        if (SERVER_IS_RUNNING(db->server))
        {
            if (++candidates > 1)
            {
                /** More than one server is still running, nothing to fail over to */
                return false;
            }

            MYSQL_SERVER_INFO *server_info = hashtable_fetch(handle->server_info,
                                                             db->server->unique_name);

            /** Guard against a missing entry instead of dereferencing a null
             * pointer; without the entry we cannot tell whether the server
             * is writable, so the safe answer is to not fail over. */
            if (server_info == NULL || server_info->read_only)
            {
                return false;
            }
        }
        else if (db->mon_err_count < handle->failcount)
        {
            /** This server has not yet failed often enough */
            return false;
        }
    }

    return candidates == 1;
}
/**
 * @brief Initiate simple failover
 *
 * Goes through every monitored server and updates its pending status. Each
 * running server gets the master status set and the slave status cleared;
 * every server that is not running is put into maintenance mode. A warning is
 * logged for a running server that does not already have the master status.
 *
 * @param handle Monitor instance (not read by this function)
 * @param db     Head of the list of monitored servers
 */
void do_failover(MYSQL_MONITOR *handle, MONITOR_SERVERS *db)
{
    for (MONITOR_SERVERS *node = db; node != NULL; node = node->next)
    {
        if (!SERVER_IS_RUNNING(node->server))
        {
            /** Not running: put the server into maintenance mode */
            monitor_set_pending_status(node, SERVER_MAINT);
            continue;
        }

        /** Only log when the server is not already labeled as the master */
        if (!SERVER_IS_MASTER(node->server))
        {
            MXS_WARNING("Failover initiated, server '%s' is now the master. "
                        "All other servers are set into maintenance mode.",
                        node->server->unique_name);
        }

        monitor_set_pending_status(node, SERVER_MASTER);
        monitor_clear_pending_status(node, SERVER_SLAVE);
    }
}
/**
* The entry point for the monitoring module thread
*
@ -1296,6 +1386,17 @@ monitorMain(void *arg)
ptr = ptr->next;
}
/** Now that all servers have their status correctly set, we can check
if we need to do a failover */
if (handle->failover)
{
if (failover_required(handle, mon->databases))
{
/** Other servers have died, initiate a failover to the last remaining server */
do_failover(handle, mon->databases);
}
}
ptr = mon->databases;
monitor_event_t evtype;
while (ptr)