From 91e6874ac05f2780549766382158664418816b5d Mon Sep 17 00:00:00 2001
From: Esa Korhonen <esa.korhonen@mariadb.com>
Date: Mon, 16 Apr 2018 10:18:40 +0300
Subject: [PATCH] MXS-1703 Failover launch code cleanup

Removed one field from MXS_MONITOR, as it was only used by mariadbmon and is
unnecessary (the case it handled was impossible).
---
 include/maxscale/monitor.h                    |  1 -
 server/core/monitor.cc                        | 16 +--
 .../mariadbmon/cluster_manipulation.cc        | 99 +++++++++++--------
 .../modules/monitor/mariadbmon/mariadbmon.cc  | 32 +-----
 .../modules/monitor/mariadbmon/mariadbmon.hh  |  3 +-
 5 files changed, 61 insertions(+), 90 deletions(-)

diff --git a/include/maxscale/monitor.h b/include/maxscale/monitor.h
index e23cea728..acffd762d 100644
--- a/include/maxscale/monitor.h
+++ b/include/maxscale/monitor.h
@@ -219,7 +219,6 @@ struct mxs_monitor
     bool active; /**< True if monitor is active */
     time_t journal_max_age; /**< Maximum age of journal file */
     uint32_t script_timeout; /**< Timeout in seconds for the monitor scripts */
-    bool master_has_failed; /**< Set to true when the latest event is a master_down event */
     uint8_t journal_hash[SHA_DIGEST_LENGTH]; /**< SHA1 hash of the latest written journal */
     struct mxs_monitor *next;     /**< Next monitor in the linked list */
 };
diff --git a/server/core/monitor.cc b/server/core/monitor.cc
index 1b98b2689..d8d0fc414 100644
--- a/server/core/monitor.cc
+++ b/server/core/monitor.cc
@@ -1780,21 +1780,9 @@ void mon_process_state_changes(MXS_MONITOR *monitor, const char *script, uint64_
         }
     }
 
-    if (master_down != master_up)
+    if (master_down && master_up)
     {
-        // We either lost the master or gained a new one
-        if (master_down)
-        {
-            monitor->master_has_failed = true;
-        }
-        else if (master_up)
-        {
-            monitor->master_has_failed = false;
-        }
-    }
-    else if (master_down && master_up)
-    {
-        MXS_INFO("Master switch detected: lost a master and gained a new one");
+        MXS_NOTICE("Master switch detected: lost a master and gained a new one");
     }
 }
 
diff --git a/server/modules/monitor/mariadbmon/cluster_manipulation.cc b/server/modules/monitor/mariadbmon/cluster_manipulation.cc
index 249af53b9..8f1745430 100644
--- a/server/modules/monitor/mariadbmon/cluster_manipulation.cc
+++ b/server/modules/monitor/mariadbmon/cluster_manipulation.cc
@@ -1450,56 +1450,63 @@ bool MariaDBMonitor::can_replicate_from(MariaDBServer* slave_cand, MariaDBServer
  *
  * If a master failure has occurred and MaxScale is configured with failover functionality, this fuction
  * executes failover to select and promote a new master server. This function should be called immediately
- * after @c mon_process_state_changes.
+ * after @c mon_process_state_changes. If an error occurs, this method disables automatic failover.
  *
- * @param cluster_modified_out Set to true if modifying cluster
- * @return True on success, false on error
+ * @return True if failover was performed, or at least attempted
 */
-bool MariaDBMonitor::mon_process_failover(bool* cluster_modified_out)
+bool MariaDBMonitor::handle_auto_failover()
 {
-    ss_dassert(*cluster_modified_out == false);
+    const char RE_ENABLE_FMT[] = "%s To re-enable failover, manually set '%s' to 'true' for monitor "
+                                 "'%s' via MaxAdmin or the REST API, or restart MaxScale.";
+    bool cluster_modified = false;
     if (config_get_global_options()->passive || (m_master && m_master->is_master()))
     {
-        return true;
+        return cluster_modified;
     }
-    bool rval = true;
-    MXS_MONITORED_SERVER* failed_master = NULL;
 
-    for (MXS_MONITORED_SERVER *ptr = m_monitor_base->monitored_servers; ptr; ptr = ptr->next)
+    if (failover_not_possible())
     {
-        if (ptr->new_event && ptr->server->last_event == MASTER_DOWN_EVENT)
-        {
-            if (failed_master)
-            {
-                MXS_ALERT("Multiple failed master servers detected: "
-                          "'%s' is the first master to fail but server "
-                          "'%s' has also triggered a master_down event.",
-                          failed_master->server->unique_name,
-                          ptr->server->unique_name);
-                return false;
-            }
+        const char PROBLEMS[] = "Failover is not possible due to one or more problems in the "
+                                "replication configuration, disabling automatic failover. Failover "
+                                "should only be enabled after the replication configuration has been "
+                                "fixed.";
+        MXS_ERROR(RE_ENABLE_FMT, PROBLEMS, CN_AUTO_FAILOVER, m_monitor_base->name);
+        m_auto_failover = false;
+        disable_setting(CN_AUTO_FAILOVER);
+        return cluster_modified;
+    }
 
-            if (ptr->server->active_event)
+    // If master seems to be down, check if slaves are receiving events.
+    if (m_verify_master_failure && m_master && m_master->is_down() && slave_receiving_events())
+    {
+        MXS_INFO("Master failure not yet confirmed by slaves, delaying failover.");
+        return cluster_modified;
+    }
+
+    MariaDBServer* failed_master = NULL;
+    for (auto iter = m_servers.begin(); iter != m_servers.end(); iter++)
+    {
+        MariaDBServer* server = *iter;
+        MXS_MONITORED_SERVER* mon_server = server->server_base;
+        if (mon_server->new_event && mon_server->server->last_event == MASTER_DOWN_EVENT)
+        {
+            if (mon_server->server->active_event)
             {
                 // MaxScale was active when the event took place
-                failed_master = ptr;
+                failed_master = server;
             }
-            else if (m_monitor_base->master_has_failed)
+            else
             {
-                /**
-                 * If a master_down event was triggered when this MaxScale was
-                 * passive, we need to execute the failover script again if no new
-                 * masters have appeared.
-                 */
+                /* If a master_down event was triggered when this MaxScale was passive, we need to execute
+                 * the failover script again if no new masters have appeared. */
                 int64_t timeout = MXS_SEC_TO_CLOCK(m_failover_timeout);
-                int64_t t = mxs_clock() - ptr->server->triggered_at;
+                int64_t t = mxs_clock() - mon_server->server->triggered_at;
 
                 if (t > timeout)
                 {
-                    MXS_WARNING("Failover of server '%s' did not take place within "
-                                "%u seconds, failover needs to be re-triggered",
-                                ptr->server->unique_name, m_failover_timeout);
-                    failed_master = ptr;
+                    MXS_WARNING("Failover of server '%s' did not take place within %u seconds, "
+                                "failover needs to be re-triggered", server->name(), m_failover_timeout);
+                    failed_master = server;
                 }
             }
         }
@@ -1507,24 +1514,30 @@ bool MariaDBMonitor::mon_process_failover(bool* cluster_modified_out)
 
     if (failed_master)
     {
-        if (m_failcount > 1 && failed_master->mon_err_count == 1)
+        if (m_failcount > 1 && failed_master->server_base->mon_err_count == 1)
         {
             MXS_WARNING("Master has failed. If master status does not change in %d monitor passes, failover "
                         "begins.", m_failcount - 1);
         }
-        else if (failed_master->mon_err_count >= m_failcount)
+        else if (failed_master->server_base->mon_err_count >= m_failcount)
         {
-            MXS_NOTICE("Performing automatic failover to replace failed master '%s'.",
-                       failed_master->server->unique_name);
-            failed_master->new_event = false;
-            rval = failover_check(NULL) && do_failover(NULL);
-            if (rval)
+            MXS_NOTICE("Performing automatic failover to replace failed master '%s'.", failed_master->name());
+            failed_master->server_base->new_event = false;
+            if (failover_check(NULL))
             {
-                *cluster_modified_out = true;
+                if (!do_failover(NULL))
+                {
+                    const char FAILED[] = "Failed to perform failover, disabling automatic failover.";
+                    MXS_ERROR(RE_ENABLE_FMT, FAILED, CN_AUTO_FAILOVER, m_monitor_base->name);
+                    m_auto_failover = false;
+                    disable_setting(CN_AUTO_FAILOVER);
+                }
+                cluster_modified = true;
             }
         }
     }
-    return rval;
+
+    return cluster_modified;
 }
 
 /**
@@ -1697,4 +1710,4 @@ bool MariaDBMonitor::switchover_check(SERVER* new_master, SERVER* current_master
     }
 
     return new_master_ok && current_master_ok && gtid_ok;
-}
\ No newline at end of file
+}
diff --git a/server/modules/monitor/mariadbmon/mariadbmon.cc b/server/modules/monitor/mariadbmon/mariadbmon.cc
index a7dd85bd4..d4e4400d2 100644
--- a/server/modules/monitor/mariadbmon/mariadbmon.cc
+++ b/server/modules/monitor/mariadbmon/mariadbmon.cc
@@ -480,11 +480,11 @@ void MariaDBMonitor::main_loop()
          * need to be launched.
          */
         mon_process_state_changes(m_monitor_base, m_script.c_str(), m_events);
-        bool failover_performed = false; // Has an automatic failover been performed this loop?
+        bool failover_performed = false; // Has an automatic failover been performed (or attempted) this loop?
 
         if (m_auto_failover)
         {
-            handle_auto_failover(&failover_performed);
+            failover_performed = handle_auto_failover();
         }
 
         /* log master detection failure of first master becomes available after failure */
@@ -603,34 +603,6 @@ void MariaDBMonitor::measure_replication_lag(MariaDBServer* root_master)
     }
 }
 
-void MariaDBMonitor::handle_auto_failover(bool* failover_performed)
-{
-    const char RE_ENABLE_FMT[] = "%s To re-enable failover, manually set '%s' to 'true' for monitor "
-                                 "'%s' via MaxAdmin or the REST API, or restart MaxScale.";
-    if (failover_not_possible())
-    {
-        const char PROBLEMS[] = "Failover is not possible due to one or more problems in the "
-                                "replication configuration, disabling automatic failover. Failover "
-                                "should only be enabled after the replication configuration has been "
-                                "fixed.";
-        MXS_ERROR(RE_ENABLE_FMT, PROBLEMS, CN_AUTO_FAILOVER, m_monitor_base->name);
-        m_auto_failover = false;
-        disable_setting(CN_AUTO_FAILOVER);
-    }
-    // If master seems to be down, check if slaves are receiving events.
-    else if (m_verify_master_failure && m_master && m_master->is_down() && slave_receiving_events())
-    {
-        MXS_INFO("Master failure not yet confirmed by slaves, delaying failover.");
-    }
-    else if (!mon_process_failover(failover_performed))
-    {
-        const char FAILED[] = "Failed to perform failover, disabling automatic failover.";
-        MXS_ERROR(RE_ENABLE_FMT, FAILED, CN_AUTO_FAILOVER, m_monitor_base->name);
-        m_auto_failover = false;
-        disable_setting(CN_AUTO_FAILOVER);
-    }
-}
-
 void MariaDBMonitor::log_master_changes(MariaDBServer* root_master_server, int* log_no_master)
 {
     MXS_MONITORED_SERVER* root_master = root_master_server ? root_master_server->server_base : NULL;
diff --git a/server/modules/monitor/mariadbmon/mariadbmon.hh b/server/modules/monitor/mariadbmon/mariadbmon.hh
index dbfe8b69e..25c6f81f7 100644
--- a/server/modules/monitor/mariadbmon/mariadbmon.hh
+++ b/server/modules/monitor/mariadbmon/mariadbmon.hh
@@ -199,10 +199,9 @@ private:
     bool switchover_start_slave(MariaDBServer* old_master, MariaDBServer* new_master);
 
     // Failover methods
-    void handle_auto_failover(bool* failover_performed);
+    bool handle_auto_failover();
     bool failover_not_possible();
     bool slave_receiving_events();
-    bool mon_process_failover(bool* cluster_modified_out);
     bool failover_check(json_t** error_out);
     bool do_failover(json_t** err_out);
     bool failover_wait_relay_log(MariaDBServer* new_master, int seconds_remaining, json_t** err_out);