MXS-1886 Auto-failover error tolerance

Contains changes from commit 9e68d8ec3ddf1621f533067021c4b3042f695e80 adapted to the 2.3 branch.
2018-06-15 15:59:35 +03:00 · 2018-06-15 15:59:35 +03:00 · d3e9cc9a4f
commit d3e9cc9a4f
parent 72bfc73706
5 changed files with 63 additions and 30 deletions
--- a/server/modules/monitor/mariadbmon/cluster_manipulation.cc
+++ b/server/modules/monitor/mariadbmon/cluster_manipulation.cc
@ -92,7 +92,8 @@ bool MariaDBMonitor::manual_failover(json_t** output)
    }

    bool rv = true;
-    rv = failover_check(output);
+    string failover_error;
+    rv = failover_check(&failover_error);
    if (rv)
    {
        rv = do_failover(output);
@ -105,6 +106,11 @@ bool MariaDBMonitor::manual_failover(json_t** output)
            PRINT_MXS_JSON_ERROR(output, "Failover failed.");
        }
    }
+    else
+    {
+        PRINT_MXS_JSON_ERROR(output, "Failover not performed due to the following errors: \n%s",
+            failover_error.c_str());
+    }

    if (running)
    {
@ -659,12 +665,6 @@ bool MariaDBMonitor::do_switchover(MariaDBServer** current_master, MariaDBServer
 */
 bool MariaDBMonitor::do_failover(json_t** err_out)
 {
-    // Topology has already been tested to be simple.
-    if (m_master_gtid_domain == GTID_DOMAIN_UNKNOWN)
-    {
-        PRINT_MXS_JSON_ERROR(err_out, "Cluster gtid domain is unknown. Cannot failover.");
-        return false;
-    }
    // Total time limit on how long this operation may take. Checked and modified after significant steps are
    // completed.
    int seconds_remaining = m_failover_timeout;
@ -1262,15 +1262,24 @@ bool MariaDBMonitor::switchover_check_new(const MariaDBServer* new_master_cand,
 /**
 * Check that preconditions for a failover are met.
 *
- * @param error_out JSON error out
+ * @param error_out Error output
 * @return True if failover may proceed
 */
-bool MariaDBMonitor::failover_check(json_t** error_out)
+bool MariaDBMonitor::failover_check(string* error_out)
 {
-    // Check that there is no running master and that there is at least one running server in the cluster.
-    // Also, all slaves must be using gtid-replication.
+    // Check that there is no running master and that there is at least one running slave in the cluster.
+    // Also, all slaves must be using gtid-replication and the gtid-domain of the cluster must be known.
    int slaves = 0;
    bool error = false;
+    string separator;
+    // Topology has already been tested to be simple.
+    if (m_master_gtid_domain < 0)
+    {
+        *error_out += "Cluster gtid domain is unknown. This is usually caused by the cluster never having "
+                      "a master server while MaxScale was running.";
+        separator = "\n";
+        error = true;
+    }

    for (auto iter = m_servers.begin(); iter != m_servers.end(); iter++)
    {
@ -1285,31 +1294,32 @@ bool MariaDBMonitor::failover_check(json_t** error_out)
                master_up_msg += ", although in maintenance mode";
            }
            master_up_msg += ".";
-            PRINT_MXS_JSON_ERROR(error_out, "%s", master_up_msg.c_str());
+            *error_out += separator + master_up_msg;
+            separator = "\n";
            error = true;
        }
-        else if (server->is_slave())
+        else if (server->is_slave() && !server_is_excluded(server))
        {
-            if (server->uses_gtid(error_out))
+            string gtid_error;
+            if (server->uses_gtid(&gtid_error))
            {
                slaves++;
            }
            else
            {
+                *error_out += separator + gtid_error;
+                separator = "\n";
                error = true;
            }
        }
    }

-    if (error)
+    if (slaves == 0)
    {
-        PRINT_MXS_JSON_ERROR(error_out, "Failover not allowed due to errors.");
+        *error_out += separator + "No valid slaves to promote.";
+        error = true;
    }
-    else if (slaves == 0)
-    {
-        PRINT_MXS_JSON_ERROR(error_out, "No running slaves, cannot failover.");
-    }
-    return !error && slaves > 0;
+    return !error;
 }

 /**
@ -1388,10 +1398,14 @@ bool MariaDBMonitor::handle_auto_failover()
        }
        else if (failed_master->m_server_base->mon_err_count >= m_failcount)
        {
-            MXS_NOTICE("Performing automatic failover to replace failed master '%s'.", failed_master->name());
-            failed_master->m_server_base->new_event = false;
-            if (failover_check(NULL))
+            // Failover is required, but first we should check if preconditions are met.
+            string error_msg;
+            if (failover_check(&error_msg))
            {
+                m_warn_failover_precond = true;
+                MXS_NOTICE("Performing automatic failover to replace failed master '%s'.",
+                    failed_master->name());
+                failed_master->m_server_base->new_event = false;
                if (!do_failover(NULL))
                {
                    const char FAILED[] = "Failed to perform failover, disabling automatic failover.";
@ -1401,8 +1415,23 @@ bool MariaDBMonitor::handle_auto_failover()
                }
                cluster_modified = true;
            }
+            else
+            {
+                // Failover was not attempted because of errors, however these errors are not permanent.
+                // Servers were not modified, so it's ok to try this again.
+                if (m_warn_failover_precond)
+                {
+                    MXS_WARNING("Not performing automatic failover. Will keep retrying with this message "
+                        "suppressed. Errors: \n%s", error_msg.c_str());
+                    m_warn_failover_precond = false;
+                }
+            }
        }
    }
+    else
+    {
+        m_warn_failover_precond = true;
+    }

    return cluster_modified;
 }
@ -1546,9 +1575,12 @@ bool MariaDBMonitor::switchover_check(SERVER* new_master, SERVER* current_master
    bool gtid_ok = true;
    for (auto iter = m_servers.begin(); iter != m_servers.end(); iter++)
    {
-        if ((*iter)->is_slave() && !(*iter)->uses_gtid(error_out))
+        MariaDBServer* server = *iter;
+        string gtid_error;
+        if (server->is_slave() && !server->uses_gtid(&gtid_error))
        {
            gtid_ok = false;
+            PRINT_MXS_JSON_ERROR(error_out, "%s", gtid_error.c_str());
        }
    }

--- a/server/modules/monitor/mariadbmon/mariadbmon.cc
+++ b/server/modules/monitor/mariadbmon/mariadbmon.cc
@ -59,6 +59,7 @@ MariaDBMonitor::MariaDBMonitor(MXS_MONITOR* monitor)
    , m_switchover_on_low_disk_space(false)
    , m_warn_set_standalone_master(true)
    , m_log_no_master(true)
+    , m_warn_failover_precond(true)
 {}

 MariaDBMonitor::~MariaDBMonitor()
--- a/server/modules/monitor/mariadbmon/mariadbmon.hh
+++ b/server/modules/monitor/mariadbmon/mariadbmon.hh
@ -158,6 +158,7 @@ private:
    bool m_warn_no_valid_in_cycle;       /**< Log a warning when a replication cycle has no valid master */
    bool m_warn_no_valid_outside_cycle;  /**< Log a warning when a replication topology has no valid master
                                          *   outside of a cycle. */
+    bool m_warn_failover_precond;        /**< Print failover preconditions error message? */

    enum slave_down_setting_t
    {
@ -226,7 +227,7 @@ private:
    bool handle_auto_failover();
    bool failover_not_possible();
    bool slave_receiving_events();
-    bool failover_check(json_t** error_out);
+    bool failover_check(std::string* error_out);
    bool do_failover(json_t** err_out);

    // Rejoin methods
--- a/server/modules/monitor/mariadbmon/mariadbserver.cc
+++ b/server/modules/monitor/mariadbmon/mariadbserver.cc
@ -520,13 +520,12 @@ json_t* MariaDBServer::diagnostics_json(bool multimaster) const
    return srv;
 }

-bool MariaDBServer::uses_gtid(json_t** error_out)
+bool MariaDBServer::uses_gtid(std::string* error_out)
 {
    bool using_gtid = !m_slave_status.empty() && !m_slave_status[0].gtid_io_pos.empty();
    if (!using_gtid)
    {
-        string slave_not_gtid_msg = string("Slave server ") + name() + " is not using gtid replication.";
-        PRINT_MXS_JSON_ERROR(error_out, "%s", slave_not_gtid_msg.c_str());
+        *error_out = string("Slave server ") + name() + " is not using gtid replication.";
    }
    return using_gtid;
 }
--- a/server/modules/monitor/mariadbmon/mariadbserver.hh
+++ b/server/modules/monitor/mariadbmon/mariadbserver.hh
@ -298,7 +298,7 @@ public:
     * @return True if using gtid-replication. False if not, or if server is not a slave or otherwise does
     * not have a gtid_IO_Pos.
     */
-    bool uses_gtid(json_t** error_out);
+    bool uses_gtid(std::string* error_out);

    /**
     * Update replication settings, gtid:s and slave status of the server.