MXS-1703 Failover launch code cleanup
Removed one field from MXS_MONITOR, as it was only used by mariadbmon and is unnecessary (the case it handled was impossible).
This commit is contained in:
@ -219,7 +219,6 @@ struct mxs_monitor
|
|||||||
bool active; /**< True if monitor is active */
|
bool active; /**< True if monitor is active */
|
||||||
time_t journal_max_age; /**< Maximum age of journal file */
|
time_t journal_max_age; /**< Maximum age of journal file */
|
||||||
uint32_t script_timeout; /**< Timeout in seconds for the monitor scripts */
|
uint32_t script_timeout; /**< Timeout in seconds for the monitor scripts */
|
||||||
bool master_has_failed; /**< Set to true when the latest event is a master_down event */
|
|
||||||
uint8_t journal_hash[SHA_DIGEST_LENGTH]; /**< SHA1 hash of the latest written journal */
|
uint8_t journal_hash[SHA_DIGEST_LENGTH]; /**< SHA1 hash of the latest written journal */
|
||||||
struct mxs_monitor *next; /**< Next monitor in the linked list */
|
struct mxs_monitor *next; /**< Next monitor in the linked list */
|
||||||
};
|
};
|
||||||
|
@ -1780,21 +1780,9 @@ void mon_process_state_changes(MXS_MONITOR *monitor, const char *script, uint64_
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (master_down != master_up)
|
if (master_down && master_up)
|
||||||
{
|
{
|
||||||
// We either lost the master or gained a new one
|
MXS_NOTICE("Master switch detected: lost a master and gained a new one");
|
||||||
if (master_down)
|
|
||||||
{
|
|
||||||
monitor->master_has_failed = true;
|
|
||||||
}
|
|
||||||
else if (master_up)
|
|
||||||
{
|
|
||||||
monitor->master_has_failed = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else if (master_down && master_up)
|
|
||||||
{
|
|
||||||
MXS_INFO("Master switch detected: lost a master and gained a new one");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1450,56 +1450,63 @@ bool MariaDBMonitor::can_replicate_from(MariaDBServer* slave_cand, MariaDBServer
|
|||||||
*
|
*
|
||||||
* If a master failure has occurred and MaxScale is configured with failover functionality, this fuction
|
* If a master failure has occurred and MaxScale is configured with failover functionality, this fuction
|
||||||
* executes failover to select and promote a new master server. This function should be called immediately
|
* executes failover to select and promote a new master server. This function should be called immediately
|
||||||
* after @c mon_process_state_changes.
|
* after @c mon_process_state_changes. If an error occurs, this method disables automatic failover.
|
||||||
*
|
*
|
||||||
* @param cluster_modified_out Set to true if modifying cluster
|
* @return True if failover was performed, or at least attempted
|
||||||
* @return True on success, false on error
|
|
||||||
*/
|
*/
|
||||||
bool MariaDBMonitor::mon_process_failover(bool* cluster_modified_out)
|
bool MariaDBMonitor::handle_auto_failover()
|
||||||
{
|
{
|
||||||
ss_dassert(*cluster_modified_out == false);
|
const char RE_ENABLE_FMT[] = "%s To re-enable failover, manually set '%s' to 'true' for monitor "
|
||||||
|
"'%s' via MaxAdmin or the REST API, or restart MaxScale.";
|
||||||
|
bool cluster_modified = false;
|
||||||
if (config_get_global_options()->passive || (m_master && m_master->is_master()))
|
if (config_get_global_options()->passive || (m_master && m_master->is_master()))
|
||||||
{
|
{
|
||||||
return true;
|
return cluster_modified;
|
||||||
}
|
|
||||||
bool rval = true;
|
|
||||||
MXS_MONITORED_SERVER* failed_master = NULL;
|
|
||||||
|
|
||||||
for (MXS_MONITORED_SERVER *ptr = m_monitor_base->monitored_servers; ptr; ptr = ptr->next)
|
|
||||||
{
|
|
||||||
if (ptr->new_event && ptr->server->last_event == MASTER_DOWN_EVENT)
|
|
||||||
{
|
|
||||||
if (failed_master)
|
|
||||||
{
|
|
||||||
MXS_ALERT("Multiple failed master servers detected: "
|
|
||||||
"'%s' is the first master to fail but server "
|
|
||||||
"'%s' has also triggered a master_down event.",
|
|
||||||
failed_master->server->unique_name,
|
|
||||||
ptr->server->unique_name);
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ptr->server->active_event)
|
if (failover_not_possible())
|
||||||
|
{
|
||||||
|
const char PROBLEMS[] = "Failover is not possible due to one or more problems in the "
|
||||||
|
"replication configuration, disabling automatic failover. Failover "
|
||||||
|
"should only be enabled after the replication configuration has been "
|
||||||
|
"fixed.";
|
||||||
|
MXS_ERROR(RE_ENABLE_FMT, PROBLEMS, CN_AUTO_FAILOVER, m_monitor_base->name);
|
||||||
|
m_auto_failover = false;
|
||||||
|
disable_setting(CN_AUTO_FAILOVER);
|
||||||
|
return cluster_modified;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If master seems to be down, check if slaves are receiving events.
|
||||||
|
if (m_verify_master_failure && m_master && m_master->is_down() && slave_receiving_events())
|
||||||
|
{
|
||||||
|
MXS_INFO("Master failure not yet confirmed by slaves, delaying failover.");
|
||||||
|
return cluster_modified;
|
||||||
|
}
|
||||||
|
|
||||||
|
MariaDBServer* failed_master = NULL;
|
||||||
|
for (auto iter = m_servers.begin(); iter != m_servers.end(); iter++)
|
||||||
|
{
|
||||||
|
MariaDBServer* server = *iter;
|
||||||
|
MXS_MONITORED_SERVER* mon_server = server->server_base;
|
||||||
|
if (mon_server->new_event && mon_server->server->last_event == MASTER_DOWN_EVENT)
|
||||||
|
{
|
||||||
|
if (mon_server->server->active_event)
|
||||||
{
|
{
|
||||||
// MaxScale was active when the event took place
|
// MaxScale was active when the event took place
|
||||||
failed_master = ptr;
|
failed_master = server;
|
||||||
}
|
}
|
||||||
else if (m_monitor_base->master_has_failed)
|
else
|
||||||
{
|
{
|
||||||
/**
|
/* If a master_down event was triggered when this MaxScale was passive, we need to execute
|
||||||
* If a master_down event was triggered when this MaxScale was
|
* the failover script again if no new masters have appeared. */
|
||||||
* passive, we need to execute the failover script again if no new
|
|
||||||
* masters have appeared.
|
|
||||||
*/
|
|
||||||
int64_t timeout = MXS_SEC_TO_CLOCK(m_failover_timeout);
|
int64_t timeout = MXS_SEC_TO_CLOCK(m_failover_timeout);
|
||||||
int64_t t = mxs_clock() - ptr->server->triggered_at;
|
int64_t t = mxs_clock() - mon_server->server->triggered_at;
|
||||||
|
|
||||||
if (t > timeout)
|
if (t > timeout)
|
||||||
{
|
{
|
||||||
MXS_WARNING("Failover of server '%s' did not take place within "
|
MXS_WARNING("Failover of server '%s' did not take place within %u seconds, "
|
||||||
"%u seconds, failover needs to be re-triggered",
|
"failover needs to be re-triggered", server->name(), m_failover_timeout);
|
||||||
ptr->server->unique_name, m_failover_timeout);
|
failed_master = server;
|
||||||
failed_master = ptr;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1507,24 +1514,30 @@ bool MariaDBMonitor::mon_process_failover(bool* cluster_modified_out)
|
|||||||
|
|
||||||
if (failed_master)
|
if (failed_master)
|
||||||
{
|
{
|
||||||
if (m_failcount > 1 && failed_master->mon_err_count == 1)
|
if (m_failcount > 1 && failed_master->server_base->mon_err_count == 1)
|
||||||
{
|
{
|
||||||
MXS_WARNING("Master has failed. If master status does not change in %d monitor passes, failover "
|
MXS_WARNING("Master has failed. If master status does not change in %d monitor passes, failover "
|
||||||
"begins.", m_failcount - 1);
|
"begins.", m_failcount - 1);
|
||||||
}
|
}
|
||||||
else if (failed_master->mon_err_count >= m_failcount)
|
else if (failed_master->server_base->mon_err_count >= m_failcount)
|
||||||
{
|
{
|
||||||
MXS_NOTICE("Performing automatic failover to replace failed master '%s'.",
|
MXS_NOTICE("Performing automatic failover to replace failed master '%s'.", failed_master->name());
|
||||||
failed_master->server->unique_name);
|
failed_master->server_base->new_event = false;
|
||||||
failed_master->new_event = false;
|
if (failover_check(NULL))
|
||||||
rval = failover_check(NULL) && do_failover(NULL);
|
|
||||||
if (rval)
|
|
||||||
{
|
{
|
||||||
*cluster_modified_out = true;
|
if (!do_failover(NULL))
|
||||||
|
{
|
||||||
|
const char FAILED[] = "Failed to perform failover, disabling automatic failover.";
|
||||||
|
MXS_ERROR(RE_ENABLE_FMT, FAILED, CN_AUTO_FAILOVER, m_monitor_base->name);
|
||||||
|
m_auto_failover = false;
|
||||||
|
disable_setting(CN_AUTO_FAILOVER);
|
||||||
|
}
|
||||||
|
cluster_modified = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return rval;
|
|
||||||
|
return cluster_modified;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -480,11 +480,11 @@ void MariaDBMonitor::main_loop()
|
|||||||
* need to be launched.
|
* need to be launched.
|
||||||
*/
|
*/
|
||||||
mon_process_state_changes(m_monitor_base, m_script.c_str(), m_events);
|
mon_process_state_changes(m_monitor_base, m_script.c_str(), m_events);
|
||||||
bool failover_performed = false; // Has an automatic failover been performed this loop?
|
bool failover_performed = false; // Has an automatic failover been performed (or attempted) this loop?
|
||||||
|
|
||||||
if (m_auto_failover)
|
if (m_auto_failover)
|
||||||
{
|
{
|
||||||
handle_auto_failover(&failover_performed);
|
failover_performed = handle_auto_failover();
|
||||||
}
|
}
|
||||||
|
|
||||||
/* log master detection failure of first master becomes available after failure */
|
/* log master detection failure of first master becomes available after failure */
|
||||||
@ -603,34 +603,6 @@ void MariaDBMonitor::measure_replication_lag(MariaDBServer* root_master)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void MariaDBMonitor::handle_auto_failover(bool* failover_performed)
|
|
||||||
{
|
|
||||||
const char RE_ENABLE_FMT[] = "%s To re-enable failover, manually set '%s' to 'true' for monitor "
|
|
||||||
"'%s' via MaxAdmin or the REST API, or restart MaxScale.";
|
|
||||||
if (failover_not_possible())
|
|
||||||
{
|
|
||||||
const char PROBLEMS[] = "Failover is not possible due to one or more problems in the "
|
|
||||||
"replication configuration, disabling automatic failover. Failover "
|
|
||||||
"should only be enabled after the replication configuration has been "
|
|
||||||
"fixed.";
|
|
||||||
MXS_ERROR(RE_ENABLE_FMT, PROBLEMS, CN_AUTO_FAILOVER, m_monitor_base->name);
|
|
||||||
m_auto_failover = false;
|
|
||||||
disable_setting(CN_AUTO_FAILOVER);
|
|
||||||
}
|
|
||||||
// If master seems to be down, check if slaves are receiving events.
|
|
||||||
else if (m_verify_master_failure && m_master && m_master->is_down() && slave_receiving_events())
|
|
||||||
{
|
|
||||||
MXS_INFO("Master failure not yet confirmed by slaves, delaying failover.");
|
|
||||||
}
|
|
||||||
else if (!mon_process_failover(failover_performed))
|
|
||||||
{
|
|
||||||
const char FAILED[] = "Failed to perform failover, disabling automatic failover.";
|
|
||||||
MXS_ERROR(RE_ENABLE_FMT, FAILED, CN_AUTO_FAILOVER, m_monitor_base->name);
|
|
||||||
m_auto_failover = false;
|
|
||||||
disable_setting(CN_AUTO_FAILOVER);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void MariaDBMonitor::log_master_changes(MariaDBServer* root_master_server, int* log_no_master)
|
void MariaDBMonitor::log_master_changes(MariaDBServer* root_master_server, int* log_no_master)
|
||||||
{
|
{
|
||||||
MXS_MONITORED_SERVER* root_master = root_master_server ? root_master_server->server_base : NULL;
|
MXS_MONITORED_SERVER* root_master = root_master_server ? root_master_server->server_base : NULL;
|
||||||
|
@ -199,10 +199,9 @@ private:
|
|||||||
bool switchover_start_slave(MariaDBServer* old_master, MariaDBServer* new_master);
|
bool switchover_start_slave(MariaDBServer* old_master, MariaDBServer* new_master);
|
||||||
|
|
||||||
// Failover methods
|
// Failover methods
|
||||||
void handle_auto_failover(bool* failover_performed);
|
bool handle_auto_failover();
|
||||||
bool failover_not_possible();
|
bool failover_not_possible();
|
||||||
bool slave_receiving_events();
|
bool slave_receiving_events();
|
||||||
bool mon_process_failover(bool* cluster_modified_out);
|
|
||||||
bool failover_check(json_t** error_out);
|
bool failover_check(json_t** error_out);
|
||||||
bool do_failover(json_t** err_out);
|
bool do_failover(json_t** err_out);
|
||||||
bool failover_wait_relay_log(MariaDBServer* new_master, int seconds_remaining, json_t** err_out);
|
bool failover_wait_relay_log(MariaDBServer* new_master, int seconds_remaining, json_t** err_out);
|
||||||
|
Reference in New Issue
Block a user