MXS-1490 Prepare for failover functionality addition
Moved mon_process_failover() from monitor.cc to mysql_mon.cc. Renamed some functions and variables related to previous failover functionality to avoid confusion.
This commit is contained in:
@ -300,28 +300,6 @@ void mon_alter_parameter(MXS_MONITOR* monitor, const char* key, const char* valu
|
|||||||
*/
|
*/
|
||||||
void mon_process_state_changes(MXS_MONITOR *monitor, const char *script, uint64_t events);
|
void mon_process_state_changes(MXS_MONITOR *monitor, const char *script, uint64_t events);
|
||||||
|
|
||||||
/**
|
|
||||||
* @brief Process possible failover event
|
|
||||||
*
|
|
||||||
* If a master failure has occurred and MaxScale is configured with failover
|
|
||||||
* functionality, this fuction executes an external failover program to elect
|
|
||||||
* a new master server.
|
|
||||||
*
|
|
||||||
* This function should be called immediately after @c mon_process_state_changes.
|
|
||||||
*
|
|
||||||
* @param monitor Monitor whose cluster is processed
|
|
||||||
* @param failover_script The script to be used for performing the failover.
|
|
||||||
* @param failover_timeout Timeout in seconds for the failover
|
|
||||||
*
|
|
||||||
* @return True on success, false on error
|
|
||||||
*
|
|
||||||
* @todo Currently this only works with flat replication topologies and
|
|
||||||
* needs to be moved inside mysqlmon as it is MariaDB specific code.
|
|
||||||
*/
|
|
||||||
bool mon_process_failover(MXS_MONITOR *monitor,
|
|
||||||
const char* failover_script,
|
|
||||||
uint32_t failover_timeout);
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Hangup connections to failed servers
|
* @brief Hangup connections to failed servers
|
||||||
*
|
*
|
||||||
|
@ -1775,79 +1775,6 @@ void mon_process_state_changes(MXS_MONITOR *monitor, const char *script, uint64_
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool mon_process_failover(MXS_MONITOR *monitor, const char* failover_script, uint32_t failover_timeout)
|
|
||||||
{
|
|
||||||
bool rval = true;
|
|
||||||
MXS_CONFIG* cnf = config_get_global_options();
|
|
||||||
MXS_MONITORED_SERVER* failed_master = NULL;
|
|
||||||
|
|
||||||
for (MXS_MONITORED_SERVER *ptr = monitor->monitored_servers; ptr; ptr = ptr->next)
|
|
||||||
{
|
|
||||||
if (mon_status_changed(ptr))
|
|
||||||
{
|
|
||||||
if (ptr->server->last_event == MASTER_DOWN_EVENT)
|
|
||||||
{
|
|
||||||
if (!cnf->passive)
|
|
||||||
{
|
|
||||||
if (failed_master)
|
|
||||||
{
|
|
||||||
MXS_ALERT("Multiple failed master servers detected: "
|
|
||||||
"'%s' is the first master to fail but server "
|
|
||||||
"'%s' has also triggered a master_down event.",
|
|
||||||
failed_master->server->unique_name,
|
|
||||||
ptr->server->unique_name);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
failed_master = ptr;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
/**
|
|
||||||
* If a master_down event was triggered when this MaxScale was
|
|
||||||
* passive, we need to execute the failover script again if no new
|
|
||||||
* masters have appeared and this MaxScale has been set as active
|
|
||||||
* since the event took place.
|
|
||||||
*/
|
|
||||||
|
|
||||||
if (!cnf->passive && // This is not a passive MaxScale
|
|
||||||
ptr->server->last_event == MASTER_DOWN_EVENT && // This is a master that went down
|
|
||||||
cnf->promoted_at >= ptr->server->triggered_at && // Promoted to active after the event took place
|
|
||||||
ptr->new_event && // Event has not yet been processed
|
|
||||||
monitor->last_master_down > monitor->last_master_up) // Latest relevant event
|
|
||||||
{
|
|
||||||
int64_t timeout = SEC_TO_HB(failover_timeout);
|
|
||||||
int64_t t = hkheartbeat - ptr->server->triggered_at;
|
|
||||||
|
|
||||||
if (t > timeout)
|
|
||||||
{
|
|
||||||
MXS_WARNING("Failover of server '%s' did not take place within "
|
|
||||||
"%u seconds, failover needs to be re-triggered",
|
|
||||||
ptr->server->unique_name, failover_timeout);
|
|
||||||
failed_master = ptr;
|
|
||||||
ptr->new_event = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (failed_master)
|
|
||||||
{
|
|
||||||
MXS_NOTICE("Performing failover of server '%s'", failed_master->server->unique_name);
|
|
||||||
|
|
||||||
if (monitor_launch_script(monitor, failed_master, failover_script, failover_timeout))
|
|
||||||
{
|
|
||||||
rval = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return rval;
|
|
||||||
}
|
|
||||||
|
|
||||||
static const char* monitor_state_to_string(int state)
|
static const char* monitor_state_to_string(int state)
|
||||||
{
|
{
|
||||||
switch (state)
|
switch (state)
|
||||||
|
@ -61,7 +61,7 @@ typedef struct
|
|||||||
int failcount; /**< How many monitoring cycles servers must be
|
int failcount; /**< How many monitoring cycles servers must be
|
||||||
down before failover is initiated */
|
down before failover is initiated */
|
||||||
bool allow_cluster_recovery; /**< Allow failed servers to rejoin the cluster */
|
bool allow_cluster_recovery; /**< Allow failed servers to rejoin the cluster */
|
||||||
bool warn_failover; /**< Log a warning when failover happens */
|
bool warn_set_standalone_master; /**< Log a warning when setting standalone master */
|
||||||
bool allow_external_slaves; /**< Whether to allow usage of external slave servers */
|
bool allow_external_slaves; /**< Whether to allow usage of external slave servers */
|
||||||
bool failover; /**< If master failover is enabled */
|
bool failover; /**< If master failover is enabled */
|
||||||
char* failover_script; /**< Script to call for performing master failover */
|
char* failover_script; /**< Script to call for performing master failover */
|
||||||
|
@ -22,6 +22,7 @@
|
|||||||
#include <maxscale/alloc.h>
|
#include <maxscale/alloc.h>
|
||||||
#include <maxscale/dcb.h>
|
#include <maxscale/dcb.h>
|
||||||
#include <maxscale/debug.h>
|
#include <maxscale/debug.h>
|
||||||
|
#include <maxscale/hk_heartbeat.h>
|
||||||
#include <maxscale/json_api.h>
|
#include <maxscale/json_api.h>
|
||||||
#include <maxscale/modulecmd.h>
|
#include <maxscale/modulecmd.h>
|
||||||
#include <maxscale/modutil.h>
|
#include <maxscale/modutil.h>
|
||||||
@ -64,6 +65,8 @@ static void set_slave_heartbeat(MXS_MONITOR *, MXS_MONITORED_SERVER *);
|
|||||||
static int add_slave_to_master(long *, int, long);
|
static int add_slave_to_master(long *, int, long);
|
||||||
static bool isMySQLEvent(mxs_monitor_event_t event);
|
static bool isMySQLEvent(mxs_monitor_event_t event);
|
||||||
void check_maxscale_schema_replication(MXS_MONITOR *monitor);
|
void check_maxscale_schema_replication(MXS_MONITOR *monitor);
|
||||||
|
static bool mon_process_failover(MYSQL_MONITOR* monitor, const char* failover_script, uint32_t failover_timeout);
|
||||||
|
static bool do_failover(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* failed_master);
|
||||||
static bool report_version_err = true;
|
static bool report_version_err = true;
|
||||||
static const char* hb_table_name = "maxscale_schema.replication_heartbeat";
|
static const char* hb_table_name = "maxscale_schema.replication_heartbeat";
|
||||||
|
|
||||||
@ -705,7 +708,7 @@ startMonitor(MXS_MONITOR *monitor, const MXS_CONFIG_PARAMETER* params)
|
|||||||
handle->server_info = server_info;
|
handle->server_info = server_info;
|
||||||
handle->shutdown = 0;
|
handle->shutdown = 0;
|
||||||
handle->id = config_get_global_options()->id;
|
handle->id = config_get_global_options()->id;
|
||||||
handle->warn_failover = true;
|
handle->warn_set_standalone_master = true;
|
||||||
handle->monitor = monitor;
|
handle->monitor = monitor;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1586,18 +1589,18 @@ void find_graph_cycles(MYSQL_MONITOR *handle, MXS_MONITORED_SERVER *database, in
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Check whether failover conditions have been met
|
* @brief Check whether standalone master conditions have been met
|
||||||
*
|
*
|
||||||
* This function checks whether all the conditions to trigger a failover have
|
* This function checks whether all the conditions to use a standalone master have
|
||||||
* been met. For a failover to happen, only one server must be available and
|
* been met. For this to happen, only one server must be available and
|
||||||
* other servers must have passed the configured tolerance level of failures.
|
* other servers must have passed the configured tolerance level of failures.
|
||||||
*
|
*
|
||||||
* @param handle Monitor instance
|
* @param handle Monitor instance
|
||||||
* @param db Monitor servers
|
* @param db Monitor servers
|
||||||
*
|
*
|
||||||
* @return True if failover is required
|
* @return True if standalone master should be used
|
||||||
*/
|
*/
|
||||||
bool failover_required(MYSQL_MONITOR *handle, MXS_MONITORED_SERVER *db)
|
bool standalone_master_required(MYSQL_MONITOR *handle, MXS_MONITORED_SERVER *db)
|
||||||
{
|
{
|
||||||
int candidates = 0;
|
int candidates = 0;
|
||||||
|
|
||||||
@ -1625,29 +1628,28 @@ bool failover_required(MYSQL_MONITOR *handle, MXS_MONITORED_SERVER *db)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Initiate simple failover
|
* @brief Use standalone master
|
||||||
*
|
*
|
||||||
* This function does the actual failover by assigning the last remaining server
|
* This function assigns the last remaining server the master status and sets all other
|
||||||
* the master status and setting all other servers into maintenance mode. By
|
* servers into maintenance mode. By setting the servers into maintenance mode, we
|
||||||
* setting the servers into maintenance mode, we prevent any possible conflicts
|
* prevent any possible conflicts when the failed servers come back up.
|
||||||
* when the failed servers come back up.
|
|
||||||
*
|
*
|
||||||
* @param handle Monitor instance
|
* @param handle Monitor instance
|
||||||
* @param db Monitor servers
|
* @param db Monitor servers
|
||||||
*/
|
*/
|
||||||
void do_failover(MYSQL_MONITOR *handle, MXS_MONITORED_SERVER *db)
|
void set_standalone_master(MYSQL_MONITOR *handle, MXS_MONITORED_SERVER *db)
|
||||||
{
|
{
|
||||||
while (db)
|
while (db)
|
||||||
{
|
{
|
||||||
if (SERVER_IS_RUNNING(db->server))
|
if (SERVER_IS_RUNNING(db->server))
|
||||||
{
|
{
|
||||||
if (!SERVER_IS_MASTER(db->server) && handle->warn_failover)
|
if (!SERVER_IS_MASTER(db->server) && handle->warn_set_standalone_master)
|
||||||
{
|
{
|
||||||
MXS_WARNING("Failover initiated, server '%s' is now the master.%s",
|
MXS_WARNING("Setting standalone master, server '%s' is now the master.%s",
|
||||||
db->server->unique_name,
|
db->server->unique_name,
|
||||||
handle->allow_cluster_recovery ?
|
handle->allow_cluster_recovery ?
|
||||||
"" : " All other servers are set into maintenance mode.");
|
"" : " All other servers are set into maintenance mode.");
|
||||||
handle->warn_failover = false;
|
handle->warn_set_standalone_master = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
server_clear_set_status(db->server, SERVER_SLAVE, SERVER_MASTER | SERVER_STALE_STATUS);
|
server_clear_set_status(db->server, SERVER_SLAVE, SERVER_MASTER | SERVER_STALE_STATUS);
|
||||||
@ -1956,17 +1958,17 @@ monitorMain(void *arg)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/** Now that all servers have their status correctly set, we can check
|
/** Now that all servers have their status correctly set, we can check
|
||||||
if we need to do a failover */
|
if we need to use standalone master. */
|
||||||
if (handle->detect_standalone_master)
|
if (handle->detect_standalone_master)
|
||||||
{
|
{
|
||||||
if (failover_required(handle, mon->monitored_servers))
|
if (standalone_master_required(handle, mon->monitored_servers))
|
||||||
{
|
{
|
||||||
/** Other servers have died, initiate a failover to the last remaining server */
|
/** Other servers have died, set last remaining server as master */
|
||||||
do_failover(handle, mon->monitored_servers);
|
set_standalone_master(handle, mon->monitored_servers);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
handle->warn_failover = true;
|
handle->warn_set_standalone_master = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1995,7 +1997,7 @@ monitorMain(void *arg)
|
|||||||
"'%s' via MaxAdmin or the REST API.", CN_FAILOVER, mon->name);
|
"'%s' via MaxAdmin or the REST API.", CN_FAILOVER, mon->name);
|
||||||
handle->failover = false;
|
handle->failover = false;
|
||||||
}
|
}
|
||||||
else if (!mon_process_failover(mon, failover_script, handle->failover_timeout))
|
else if (!mon_process_failover(handle, failover_script, handle->failover_timeout))
|
||||||
{
|
{
|
||||||
MXS_ALERT("Failed to perform failover, disabling failover functionality. "
|
MXS_ALERT("Failed to perform failover, disabling failover functionality. "
|
||||||
"To enable failover functionality, manually set 'failover' to "
|
"To enable failover functionality, manually set 'failover' to "
|
||||||
@ -2759,3 +2761,96 @@ void check_maxscale_schema_replication(MXS_MONITOR *monitor)
|
|||||||
"the table is replicated to all slaves.", hb_table_name);
|
"the table is replicated to all slaves.", hb_table_name);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Process possible failover event
|
||||||
|
*
|
||||||
|
* If a master failure has occurred and MaxScale is configured with failover
|
||||||
|
* functionality, this fuction executes an external failover program to elect
|
||||||
|
* a new master server.
|
||||||
|
*
|
||||||
|
* This function should be called immediately after @c mon_process_state_changes.
|
||||||
|
*
|
||||||
|
* @param monitor Monitor whose cluster is processed
|
||||||
|
* @param failover_script The script to be used for performing the failover.
|
||||||
|
* @param failover_timeout Timeout in seconds for the failover
|
||||||
|
*
|
||||||
|
* @return True on success, false on error
|
||||||
|
*
|
||||||
|
* @todo Currently this only works with flat replication topologies and
|
||||||
|
* needs to be moved inside mysqlmon as it is MariaDB specific code.
|
||||||
|
*/
|
||||||
|
bool mon_process_failover(MYSQL_MONITOR* monitor, const char* failover_script, uint32_t failover_timeout)
|
||||||
|
{
|
||||||
|
bool rval = true;
|
||||||
|
MXS_CONFIG* cnf = config_get_global_options();
|
||||||
|
MXS_MONITORED_SERVER* failed_master = NULL;
|
||||||
|
|
||||||
|
for (MXS_MONITORED_SERVER *ptr = monitor->monitor->monitored_servers; ptr; ptr = ptr->next)
|
||||||
|
{
|
||||||
|
if (mon_status_changed(ptr))
|
||||||
|
{
|
||||||
|
if (ptr->server->last_event == MASTER_DOWN_EVENT)
|
||||||
|
{
|
||||||
|
if (!cnf->passive)
|
||||||
|
{
|
||||||
|
if (failed_master)
|
||||||
|
{
|
||||||
|
MXS_ALERT("Multiple failed master servers detected: "
|
||||||
|
"'%s' is the first master to fail but server "
|
||||||
|
"'%s' has also triggered a master_down event.",
|
||||||
|
failed_master->server->unique_name,
|
||||||
|
ptr->server->unique_name);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
failed_master = ptr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/**
|
||||||
|
* If a master_down event was triggered when this MaxScale was
|
||||||
|
* passive, we need to execute the failover script again if no new
|
||||||
|
* masters have appeared and this MaxScale has been set as active
|
||||||
|
* since the event took place.
|
||||||
|
*/
|
||||||
|
|
||||||
|
if (!cnf->passive && // This is not a passive MaxScale
|
||||||
|
ptr->server->last_event == MASTER_DOWN_EVENT && // This is a master that went down
|
||||||
|
cnf->promoted_at >= ptr->server->triggered_at && // Promoted to active after the event took place
|
||||||
|
ptr->new_event && // Event has not yet been processed
|
||||||
|
monitor->monitor->last_master_down > monitor->monitor->last_master_up) // Latest relevant event
|
||||||
|
{
|
||||||
|
int64_t timeout = SEC_TO_HB(failover_timeout);
|
||||||
|
int64_t t = hkheartbeat - ptr->server->triggered_at;
|
||||||
|
|
||||||
|
if (t > timeout)
|
||||||
|
{
|
||||||
|
MXS_WARNING("Failover of server '%s' did not take place within "
|
||||||
|
"%u seconds, failover needs to be re-triggered",
|
||||||
|
ptr->server->unique_name, failover_timeout);
|
||||||
|
failed_master = ptr;
|
||||||
|
ptr->new_event = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (failed_master)
|
||||||
|
{
|
||||||
|
MXS_NOTICE("Performing failover of server '%s'", failed_master->server->unique_name);
|
||||||
|
rval = do_failover(monitor, failed_master);
|
||||||
|
}
|
||||||
|
|
||||||
|
return rval;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool do_failover(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* failed_master)
|
||||||
|
{
|
||||||
|
// Implement here a simple failover script
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
Reference in New Issue
Block a user