From 91e6874ac05f2780549766382158664418816b5d Mon Sep 17 00:00:00 2001 From: Esa Korhonen Date: Mon, 16 Apr 2018 10:18:40 +0300 Subject: [PATCH] MXS-1703 Failover launch code cleanup Removed one field from MXS_MONITOR, as it was only used by mariadbmon and is unnecessary (the case it handled was impossible). --- include/maxscale/monitor.h | 1 - server/core/monitor.cc | 16 +-- .../mariadbmon/cluster_manipulation.cc | 99 +++++++++++-------- .../modules/monitor/mariadbmon/mariadbmon.cc | 32 +----- .../modules/monitor/mariadbmon/mariadbmon.hh | 3 +- 5 files changed, 61 insertions(+), 90 deletions(-) diff --git a/include/maxscale/monitor.h b/include/maxscale/monitor.h index e23cea728..acffd762d 100644 --- a/include/maxscale/monitor.h +++ b/include/maxscale/monitor.h @@ -219,7 +219,6 @@ struct mxs_monitor bool active; /**< True if monitor is active */ time_t journal_max_age; /**< Maximum age of journal file */ uint32_t script_timeout; /**< Timeout in seconds for the monitor scripts */ - bool master_has_failed; /**< Set to true when the latest event is a master_down event */ uint8_t journal_hash[SHA_DIGEST_LENGTH]; /**< SHA1 hash of the latest written journal */ struct mxs_monitor *next; /**< Next monitor in the linked list */ }; diff --git a/server/core/monitor.cc b/server/core/monitor.cc index 1b98b2689..d8d0fc414 100644 --- a/server/core/monitor.cc +++ b/server/core/monitor.cc @@ -1780,21 +1780,9 @@ void mon_process_state_changes(MXS_MONITOR *monitor, const char *script, uint64_ } } - if (master_down != master_up) + if (master_down && master_up) { - // We either lost the master or gained a new one - if (master_down) - { - monitor->master_has_failed = true; - } - else if (master_up) - { - monitor->master_has_failed = false; - } - } - else if (master_down && master_up) - { - MXS_INFO("Master switch detected: lost a master and gained a new one"); + MXS_NOTICE("Master switch detected: lost a master and gained a new one"); } } diff --git a/server/modules/monitor/mariadbmon/cluster_manipulation.cc b/server/modules/monitor/mariadbmon/cluster_manipulation.cc index 249af53b9..8f1745430 100644 --- a/server/modules/monitor/mariadbmon/cluster_manipulation.cc +++ b/server/modules/monitor/mariadbmon/cluster_manipulation.cc @@ -1450,56 +1450,63 @@ bool MariaDBMonitor::can_replicate_from(MariaDBServer* slave_cand, MariaDBServer * * If a master failure has occurred and MaxScale is configured with failover functionality, this fuction * executes failover to select and promote a new master server. This function should be called immediately - * after @c mon_process_state_changes. + * after @c mon_process_state_changes. If an error occurs, this method disables automatic failover. * - * @param cluster_modified_out Set to true if modifying cluster - * @return True on success, false on error + * @return True if failover was performed, or at least attempted */ -bool MariaDBMonitor::mon_process_failover(bool* cluster_modified_out) +bool MariaDBMonitor::handle_auto_failover() { - ss_dassert(*cluster_modified_out == false); + const char RE_ENABLE_FMT[] = "%s To re-enable failover, manually set '%s' to 'true' for monitor " + "'%s' via MaxAdmin or the REST API, or restart MaxScale."; + bool cluster_modified = false; if (config_get_global_options()->passive || (m_master && m_master->is_master())) { - return true; + return cluster_modified; } - bool rval = true; - MXS_MONITORED_SERVER* failed_master = NULL; - for (MXS_MONITORED_SERVER *ptr = m_monitor_base->monitored_servers; ptr; ptr = ptr->next) + if (failover_not_possible()) { - if (ptr->new_event && ptr->server->last_event == MASTER_DOWN_EVENT) - { - if (failed_master) - { - MXS_ALERT("Multiple failed master servers detected: " - "'%s' is the first master to fail but server " - "'%s' has also triggered a master_down event.", - failed_master->server->unique_name, - ptr->server->unique_name); - return false; - } + const char PROBLEMS[] = "Failover is not possible due to one or more problems in the " + "replication configuration, disabling automatic failover. Failover " + "should only be enabled after the replication configuration has been " + "fixed."; + MXS_ERROR(RE_ENABLE_FMT, PROBLEMS, CN_AUTO_FAILOVER, m_monitor_base->name); + m_auto_failover = false; + disable_setting(CN_AUTO_FAILOVER); + return cluster_modified; + } - if (ptr->server->active_event) + // If master seems to be down, check if slaves are receiving events. + if (m_verify_master_failure && m_master && m_master->is_down() && slave_receiving_events()) + { + MXS_INFO("Master failure not yet confirmed by slaves, delaying failover."); + return cluster_modified; + } + + MariaDBServer* failed_master = NULL; + for (auto iter = m_servers.begin(); iter != m_servers.end(); iter++) + { + MariaDBServer* server = *iter; + MXS_MONITORED_SERVER* mon_server = server->server_base; + if (mon_server->new_event && mon_server->server->last_event == MASTER_DOWN_EVENT) + { + if (mon_server->server->active_event) { // MaxScale was active when the event took place - failed_master = ptr; + failed_master = server; } - else if (m_monitor_base->master_has_failed) + else { - /** - * If a master_down event was triggered when this MaxScale was - * passive, we need to execute the failover script again if no new - * masters have appeared. - */ + /* If a master_down event was triggered when this MaxScale was passive, we need to execute + * the failover script again if no new masters have appeared. */ int64_t timeout = MXS_SEC_TO_CLOCK(m_failover_timeout); - int64_t t = mxs_clock() - ptr->server->triggered_at; + int64_t t = mxs_clock() - mon_server->server->triggered_at; if (t > timeout) { - MXS_WARNING("Failover of server '%s' did not take place within " - "%u seconds, failover needs to be re-triggered", - ptr->server->unique_name, m_failover_timeout); - failed_master = ptr; + MXS_WARNING("Failover of server '%s' did not take place within %u seconds, " + "failover needs to be re-triggered", server->name(), m_failover_timeout); + failed_master = server; } } } @@ -1507,24 +1514,30 @@ bool MariaDBMonitor::mon_process_failover(bool* cluster_modified_out) if (failed_master) { - if (m_failcount > 1 && failed_master->mon_err_count == 1) + if (m_failcount > 1 && failed_master->server_base->mon_err_count == 1) { MXS_WARNING("Master has failed. If master status does not change in %d monitor passes, failover " "begins.", m_failcount - 1); } - else if (failed_master->mon_err_count >= m_failcount) + else if (failed_master->server_base->mon_err_count >= m_failcount) { - MXS_NOTICE("Performing automatic failover to replace failed master '%s'.", - failed_master->server->unique_name); - failed_master->new_event = false; - rval = failover_check(NULL) && do_failover(NULL); - if (rval) + MXS_NOTICE("Performing automatic failover to replace failed master '%s'.", failed_master->name()); + failed_master->server_base->new_event = false; + if (failover_check(NULL)) { - *cluster_modified_out = true; + if (!do_failover(NULL)) + { + const char FAILED[] = "Failed to perform failover, disabling automatic failover."; + MXS_ERROR(RE_ENABLE_FMT, FAILED, CN_AUTO_FAILOVER, m_monitor_base->name); + m_auto_failover = false; + disable_setting(CN_AUTO_FAILOVER); + } + cluster_modified = true; } } } - return rval; + + return cluster_modified; } /** @@ -1697,4 +1710,4 @@ bool MariaDBMonitor::switchover_check(SERVER* new_master, SERVER* current_master } return new_master_ok && current_master_ok && gtid_ok; -} \ No newline at end of file +} diff --git a/server/modules/monitor/mariadbmon/mariadbmon.cc b/server/modules/monitor/mariadbmon/mariadbmon.cc index a7dd85bd4..d4e4400d2 100644 --- a/server/modules/monitor/mariadbmon/mariadbmon.cc +++ b/server/modules/monitor/mariadbmon/mariadbmon.cc @@ -480,11 +480,11 @@ void MariaDBMonitor::main_loop() * need to be launched. */ mon_process_state_changes(m_monitor_base, m_script.c_str(), m_events); - bool failover_performed = false; // Has an automatic failover been performed this loop? + bool failover_performed = false; // Has an automatic failover been performed (or attempted) this loop? if (m_auto_failover) { - handle_auto_failover(&failover_performed); + failover_performed = handle_auto_failover(); } /* log master detection failure of first master becomes available after failure */ @@ -603,34 +603,6 @@ void MariaDBMonitor::measure_replication_lag(MariaDBServer* root_master) } } -void MariaDBMonitor::handle_auto_failover(bool* failover_performed) -{ - const char RE_ENABLE_FMT[] = "%s To re-enable failover, manually set '%s' to 'true' for monitor " - "'%s' via MaxAdmin or the REST API, or restart MaxScale."; - if (failover_not_possible()) - { - const char PROBLEMS[] = "Failover is not possible due to one or more problems in the " - "replication configuration, disabling automatic failover. Failover " - "should only be enabled after the replication configuration has been " - "fixed."; - MXS_ERROR(RE_ENABLE_FMT, PROBLEMS, CN_AUTO_FAILOVER, m_monitor_base->name); - m_auto_failover = false; - disable_setting(CN_AUTO_FAILOVER); - } - // If master seems to be down, check if slaves are receiving events. - else if (m_verify_master_failure && m_master && m_master->is_down() && slave_receiving_events()) - { - MXS_INFO("Master failure not yet confirmed by slaves, delaying failover."); - } - else if (!mon_process_failover(failover_performed)) - { - const char FAILED[] = "Failed to perform failover, disabling automatic failover."; - MXS_ERROR(RE_ENABLE_FMT, FAILED, CN_AUTO_FAILOVER, m_monitor_base->name); - m_auto_failover = false; - disable_setting(CN_AUTO_FAILOVER); - } -} - void MariaDBMonitor::log_master_changes(MariaDBServer* root_master_server, int* log_no_master) { MXS_MONITORED_SERVER* root_master = root_master_server ? root_master_server->server_base : NULL; diff --git a/server/modules/monitor/mariadbmon/mariadbmon.hh b/server/modules/monitor/mariadbmon/mariadbmon.hh index dbfe8b69e..25c6f81f7 100644 --- a/server/modules/monitor/mariadbmon/mariadbmon.hh +++ b/server/modules/monitor/mariadbmon/mariadbmon.hh @@ -199,10 +199,9 @@ private: bool switchover_start_slave(MariaDBServer* old_master, MariaDBServer* new_master); // Failover methods - void handle_auto_failover(bool* failover_performed); + bool handle_auto_failover(); bool failover_not_possible(); bool slave_receiving_events(); - bool mon_process_failover(bool* cluster_modified_out); bool failover_check(json_t** error_out); bool do_failover(json_t** err_out); bool failover_wait_relay_log(MariaDBServer* new_master, int seconds_remaining, json_t** err_out);