MXS-1493: Improve master failure detection
The master failure can now be verified by checking when the slaves are connected to the master. If the slaves do not receive any events from the master, the connections are considered as down after a configurable limit. Added two parameters for controlling whether the check is done and for how long the monitor waits before doing the failover.
This commit is contained in:
parent
26b47d0b90
commit
0be39b8545
@ -358,6 +358,35 @@ The password of the replication user. This is given as the value for
|
||||
See `replication_user` parameter documentation for details about the use of this
|
||||
parameter.
|
||||
|
||||
### `verify_master_failure`
|
||||
|
||||
Enable master failure verification for failover. This parameter expects a
|
||||
boolean value and the feature is enabled by default.
|
||||
|
||||
The failure of a master can be verified by checking whether the slaves are still
|
||||
connected to the master. The timeout for master failure verification is
|
||||
controlled by the `master_failure_timeout` parameter.
|
||||
|
||||
### `master_failure_timeout`
|
||||
|
||||
This parameter controls the period of time, in seconds, that the monitor must
|
||||
wait before it can declare that the master has failed. The default value is 10
|
||||
seconds.
|
||||
|
||||
The failure of a master is verified by tracking when the last change to the
|
||||
relay log was done and when the last replication heartbeat was received. If the
|
||||
period of time between the last received event and the time of the check exceeds
|
||||
the configured value, the slave's connection to the master is considered to be
|
||||
broken.
|
||||
|
||||
When all slaves of a failed master are no longer connected to the master, the
|
||||
master failure is verified and the failover can be safely performed.
|
||||
|
||||
If the slaves lose their connections to the master before the configured timeout
|
||||
is exceeded, the failover is performed immediately. This allows a faster
|
||||
failover when the master server crashes causing immediate disconnection of the
|
||||
network connections.
|
||||
|
||||
## Using the MySQL Monitor With Binlogrouter
|
||||
|
||||
Since MaxScale 2.2 it's possible to detect a replication setup
|
||||
|
@ -71,6 +71,8 @@ typedef struct
|
||||
uint32_t switchover_timeout; /**< Timeout in seconds for the master switchover */
|
||||
char* replication_user; /**< Replication user for failover */
|
||||
char* replication_password; /**< Replication password for failover*/
|
||||
bool verify_master_failure; /**< Whether master failure is verified via slaves */
|
||||
int master_failure_timeout; /**< Time in seconds to wait before doing failover */
|
||||
MXS_MONITOR* monitor;
|
||||
} MYSQL_MONITOR;
|
||||
|
||||
|
@ -81,16 +81,25 @@ static const char CN_SWITCHOVER[] = "switchover";
|
||||
static const char CN_SWITCHOVER_SCRIPT[] = "switchover_script";
|
||||
static const char CN_SWITCHOVER_TIMEOUT[] = "switchover_timeout";
|
||||
|
||||
// Parameters for master failure verification and timeout
|
||||
static const char CN_VERIFY_MASTER_FAILURE[] = "verify_master_failure";
|
||||
static const char CN_MASTER_FAILURE_TIMEOUT[] = "master_failure_timeout";
|
||||
|
||||
// Replication credentials parameters for failover
|
||||
static const char CN_REPLICATION_USER[] = "replication_user";
|
||||
static const char CN_REPLICATION_PASSWORD[] = "replication_password";
|
||||
|
||||
/** Default failover timeout */
|
||||
#define DEFAULT_FAILOVER_TIMEOUT "90"
|
||||
|
||||
/** Default switchover timeout */
|
||||
#define DEFAULT_SWITCHOVER_TIMEOUT "90"
|
||||
|
||||
/** Default master failure verification timeout */
|
||||
#define DEFAULT_MASTER_FAILURE_TIMEOUT "10"
|
||||
|
||||
typedef std::vector<MXS_MONITORED_SERVER*> ServerVector;
|
||||
|
||||
// TODO: Specify the real default failover script.
|
||||
static const char DEFAULT_FAILOVER_SCRIPT[] =
|
||||
"/usr/bin/echo INITIATOR=$INITIATOR "
|
||||
@ -549,6 +558,8 @@ MXS_MODULE* MXS_CREATE_MODULE()
|
||||
{CN_SWITCHOVER_TIMEOUT, MXS_MODULE_PARAM_COUNT, DEFAULT_SWITCHOVER_TIMEOUT},
|
||||
{CN_REPLICATION_USER, MXS_MODULE_PARAM_STRING},
|
||||
{CN_REPLICATION_PASSWORD, MXS_MODULE_PARAM_STRING},
|
||||
{CN_VERIFY_MASTER_FAILURE, MXS_MODULE_PARAM_BOOL, "true"},
|
||||
{CN_MASTER_FAILURE_TIMEOUT, MXS_MODULE_PARAM_COUNT, DEFAULT_MASTER_FAILURE_TIMEOUT},
|
||||
{MXS_END_MODULE_PARAMS}
|
||||
}
|
||||
};
|
||||
@ -742,6 +753,8 @@ startMonitor(MXS_MONITOR *monitor, const MXS_CONFIG_PARAMETER* params)
|
||||
handle->switchover = config_get_bool(params, CN_SWITCHOVER);
|
||||
handle->switchover_script = config_copy_string(params, CN_SWITCHOVER_SCRIPT);
|
||||
handle->switchover_timeout = config_get_integer(params, CN_SWITCHOVER_TIMEOUT);
|
||||
handle->verify_master_failure = config_get_bool(params, CN_VERIFY_MASTER_FAILURE);
|
||||
handle->master_failure_timeout = config_get_integer(params, CN_MASTER_FAILURE_TIMEOUT);
|
||||
|
||||
bool error = false;
|
||||
|
||||
@ -1112,6 +1125,36 @@ static bool update_slave_status(MYSQL_MONITOR* handle, MXS_MONITORED_SERVER* db)
|
||||
return do_show_slave_status(info, db, version);
|
||||
}
|
||||
|
||||
/**
 * Decide whether the current master should still be treated as alive.
 *
 * The master is treated as alive when no master is set or when its server is
 * not flagged as down. If it is flagged as down, the monitored servers are
 * scanned for one that is configured as a slave, whose master_id matches the
 * master's node_id and whose latest_event is less than master_failure_timeout
 * seconds old.
 *
 * @param handle Monitor instance
 *
 * @return True if the master is not considered failed, false if it is flagged
 *         as down and no matching slave has a recent enough latest_event
 */
static bool master_still_alive(MYSQL_MONITOR* handle)
{
    if (!handle->master || !SERVER_IS_DOWN(handle->master->server))
    {
        // Either there is no master or it does not appear to be dead
        return true;
    }

    // We have a master and it appears to be dead: look for a slave that disagrees
    for (MXS_MONITORED_SERVER* srv = handle->monitor->monitored_servers; srv; srv = srv->next)
    {
        MYSQL_SERVER_INFO* slave_info = get_server_info(handle, srv);

        if (!slave_info->slave_configured ||
            slave_info->master_id != handle->master->server->node_id)
        {
            // Not a slave of the current master, it cannot vouch for it
            continue;
        }

        double since_last_event = difftime(time(NULL), slave_info->latest_event);

        if (since_last_event < handle->master_failure_timeout)
        {
            /**
             * This slave still points at the correct master and has received
             * events recently. The master is therefore not dead, the monitor
             * just can't connect to it.
             */
            return true;
        }
    }

    return false;
}
|
||||
|
||||
static inline void monitor_mysql_db(MXS_MONITORED_SERVER* database, MYSQL_SERVER_INFO *serv_info,
|
||||
enum mysql_server_version server_version)
|
||||
{
|
||||
@ -2031,6 +2074,10 @@ monitorMain(void *arg)
|
||||
"'%s' via MaxAdmin or the REST API.", CN_FAILOVER, mon->name);
|
||||
handle->failover = false;
|
||||
}
|
||||
else if (handle->verify_master_failure && master_still_alive(handle))
|
||||
{
|
||||
MXS_INFO("Master failure not yet confirmed by slaves, delaying failover.");
|
||||
}
|
||||
else if (!mon_process_failover(handle, failover_script, handle->failover_timeout))
|
||||
{
|
||||
MXS_ALERT("Failed to perform failover, disabling failover functionality. "
|
||||
|
Loading…
x
Reference in New Issue
Block a user