MXS-1845 Cluster stabilization rewrite

No longer writes events to the master, as this creates problems if the promoted server was not the overall master. Instead, the slave status output is inspected.
2018-10-01 10:16:52 +03:00
parent 1ca5d02abb
commit d14b9bfe43
4 changed files with 172 additions and 36 deletions
--- a/server/modules/monitor/mariadbmon/cluster_manipulation.cc
+++ b/server/modules/monitor/mariadbmon/cluster_manipulation.cc
@ -14,7 +14,9 @@
 #include "mariadbmon.hh"

 #include <inttypes.h>
+#include <set>
 #include <sstream>
+#include <maxbase/stopwatch.hh>
 #include <maxscale/clock.h>
 #include <maxscale/mysql_utils.h>
 #include <maxscale/utils.hh>
@ -23,6 +25,7 @@ using std::string;
 using std::unique_ptr;
 using maxscale::string_printf;
 using maxbase::StopWatch;
+using maxbase::Duration;

 static const char RE_ENABLE_FMT[] = "To re-enable automatic %s, manually set '%s' to 'true' "
                                    "for monitor '%s' via MaxAdmin or the REST API, or restart MaxScale.";
@ -837,52 +840,35 @@ bool MariaDBMonitor::failover_perform(ClusterOperation& op)
    ServerArray redirectable_slaves = get_redirectables(promotion_target, op.demotion_target);

    bool rval = false;
-    // Step 2: Stop and reset slave, set read-only to 0.
+    // Step 2: Stop and reset slave, set read-only to OFF.
    if (promotion_target->promote(op))
    {
-        // Point of no return. Even if following steps fail, do not try to undo.
-        m_next_master = promotion_target;
+        // Point of no return. Even if following steps fail, do not try to undo. Failover considered
+        // at least partially successful.
+        rval = true;
        m_cluster_modified = true;
+        if (op.demotion_target_is_master)
+        {
+            // Force a master swap on next tick.
+            m_next_master = promotion_target;
+        }

        // Step 3: Redirect slaves.
        ServerArray redirected_slaves;
-        int redirects = redirect_slaves_ex(op, redirectable_slaves, &redirected_slaves);
-        bool success = redirectable_slaves.empty() ? true : redirects > 0;
-        if (success)
+        redirect_slaves_ex(op, redirectable_slaves, &redirected_slaves);
+        if (!redirected_slaves.empty())
        {
            StopWatch timer;
-            // Step 4: Finally, add an event to the new master to advance gtid and wait for the slaves
-            // to receive it. seconds_remaining can be 0 or less at this point. Even in such a case
-            // wait_cluster_stabilization() may succeed if replication is fast enough. If using external
-            // replication, skip this step. Come up with an alternative later.
-            if (m_external_master_port != PORT_UNKNOWN)
-            {
-                MXS_WARNING("Replicating from external master, skipping final check.");
-                rval = true;
-            }
-            else if (redirected_slaves.empty())
-            {
-                // No slaves to check. Assume success.
-                rval = true;
-                MXS_DEBUG("Failover: no slaves to redirect, skipping stabilization check.");
-            }
-            else if (wait_cluster_stabilization(promotion_target, redirected_slaves,
-                                                op.time_remaining.secs()))
-            {
-                rval = true;
-                auto step4_time = timer.restart();
-                op.time_remaining -= step4_time;
-                MXS_DEBUG("Failover: slave replication confirmation took %.1f seconds with "
-                          "%.1f seconds to spare.",
-                          step4_time.secs(), op.time_remaining.secs());
-            }
-        }
-        else
-        {
-            print_redirect_errors(NULL, redirectable_slaves, op.error_out);
+            /* Step 4: Finally, check that slaves are connected to the new master. Even if
+             * time is out at this point, wait_cluster_stabilization() will check the slaves
+             * once so that latest status is printed. */
+            wait_cluster_stabilization_ex(op, redirected_slaves);
+            auto step4_time = timer.lap();
+            MXS_DEBUG("Failover: slave replication confirmation took %.1f seconds with "
+                      "%.1f seconds to spare.",
+                      step4_time.secs(), op.time_remaining.secs());
        }
    }
-
    return rval;
 }

@ -1142,6 +1128,132 @@ bool MariaDBMonitor::wait_cluster_stabilization(MariaDBServer* new_master,
    return rval;
 }

+/**
+ * Check that the given slaves are connected and replicating from the new master. Only checks
+ * the SLAVE STATUS of the slaves.
+ *
+ * @param op Operation descriptor
+ * @param redirected_slaves Slaves to check
+ */
+void MariaDBMonitor::wait_cluster_stabilization_ex(ClusterOperation& op, const ServerArray& redirected_slaves)
+{
+    if (redirected_slaves.empty())
+    {
+        // No need to check anything or print messages.
+        return;
+    }
+
+    StopWatch timer;
+    const MariaDBServer* new_master = op.promotion_target;
+    // Check all the servers in the list. Using a set because erasing from container.
+    std::set<MariaDBServer*> unconfirmed(redirected_slaves.begin(), redirected_slaves.end());
+    ServerArray successes;
+    ServerArray repl_fails;
+    ServerArray query_fails;
+    bool time_is_up = false;    // Try at least once, even if time is up.
+
+    while (!unconfirmed.empty() && !time_is_up)
+    {
+        auto iter = unconfirmed.begin();
+        while (iter != unconfirmed.end())
+        {
+            MariaDBServer* slave = *iter;
+            if (slave->do_show_slave_status())
+            {
+                auto slave_conn = slave->slave_connection_status_host_port(new_master);
+                if (slave_conn == NULL)
+                {
+                    // Highly unlikely. Maybe someone just removed the slave connection after it was created.
+                    MXS_WARNING("%s does not have a slave connection to %s although one should have "
+                                "been created.",
+                                slave->name(), new_master->name());
+                    repl_fails.push_back(*iter);
+                    iter = unconfirmed.erase(iter);
+                }
+                else if (slave_conn->slave_io_running == SlaveStatus::SLAVE_IO_YES
+                         && slave_conn->slave_sql_running == true)
+                {
+                    // This slave has connected to master and replication seems to be ok.
+                    successes.push_back(*iter);
+                    iter = unconfirmed.erase(iter);
+                }
+                else if (slave_conn->slave_io_running == SlaveStatus::SLAVE_IO_NO)
+                {
+                    // IO error on slave
+                    MXS_WARNING("%s cannot start replication because of IO thread error: '%s'.",
+                                slave_conn->to_short_string(slave->name()).c_str(),
+                                slave_conn->last_error.c_str());
+                    repl_fails.push_back(*iter);
+                    iter = unconfirmed.erase(iter);
+                }
+                else if (slave_conn->slave_sql_running == false)
+                {
+                    // SQL error on slave
+                    MXS_WARNING("%s cannot start replication because of SQL thread error: '%s'.",
+                                slave_conn->to_short_string(slave->name()).c_str(),
+                                slave_conn->last_error.c_str());
+                    repl_fails.push_back(*iter);
+                    iter = unconfirmed.erase(iter);
+                }
+                else
+                {
+                    // Slave IO is still connecting, must wait.
+                    ++iter;
+                }
+            }
+            else
+            {
+                query_fails.push_back(*iter);
+                iter = unconfirmed.erase(iter);
+            }
+        }
+
+        op.time_remaining -= timer.lap();
+        if (!unconfirmed.empty())
+        {
+            if (op.time_remaining.secs() > 0)
+            {
+                double standard_sleep = 0.5;    // In seconds.
+                // If we have unconfirmed slaves and have time remaining, sleep a bit and try again.
+                /* TODO: This sleep is kinda pointless, because whether or not replication begins,
+                 * all operations for failover/switchover are complete. The sleep is only required to
+                 * get correct messages to the user. Think about removing it, or shortening the maximum
+                 * time of this function. */
+                Duration sleep_time = (op.time_remaining.secs() > standard_sleep) ?
+                    Duration(standard_sleep) : op.time_remaining;
+                std::this_thread::sleep_for(sleep_time);
+            }
+            else
+            {
+                // Have undecided slaves and is out of time.
+                time_is_up = true;
+            }
+        }
+    }
+
+    if (successes.size() == redirected_slaves.size())
+    {
+        // Complete success.
+        MXS_NOTICE("All redirected slaves successfully started replication from %s.", new_master->name());
+    }
+    else
+    {
+        if (!successes.empty())
+        {
+            MXS_NOTICE("%s successfully started replication from %s.",
+                       monitored_servers_to_string(successes).c_str(), new_master->name());
+        }
+        // Something went wrong.
+        auto fails = query_fails.size() + repl_fails.size() + unconfirmed.size();
+        const char MSG[] = "%lu slaves did not start replicating from %s. "
+                           "%lu encountered an I/O or SQL error, %lu failed to reply and %lu did not "
+                           "connect to %s within the time limit.";
+        MXS_WARNING(MSG, fails, new_master->name(), repl_fails.size(), query_fails.size(),
+                    unconfirmed.size(), new_master->name());
+    }
+    op.time_remaining -= timer.lap();
+}
+
 /**
 * Prepares a server for the replication master role.
 *
--- a/server/modules/monitor/mariadbmon/mariadbmon.hh
+++ b/server/modules/monitor/mariadbmon/mariadbmon.hh
@ -280,6 +280,7 @@ private:
    bool        wait_cluster_stabilization(MariaDBServer* new_master,
                                           const ServerArray& slaves,
                                           int seconds_remaining);
+    void wait_cluster_stabilization_ex(ClusterOperation& op, const ServerArray& slaves);
    void report_and_disable(const std::string& operation,
                            const std::string& setting_name,
                            bool* setting_var);
--- a/server/modules/monitor/mariadbmon/mariadbserver.cc
+++ b/server/modules/monitor/mariadbmon/mariadbserver.cc
@ -1179,6 +1179,20 @@ const SlaveStatus* MariaDBServer::slave_connection_status(const MariaDBServer* t
    return rval;
 }

+const SlaveStatus* MariaDBServer::slave_connection_status_host_port(const MariaDBServer* target) const
+{
+    string target_host = target->m_server_base->server->address;
+    int target_port = target->m_server_base->server->port;
+    for (const SlaveStatus& ss : m_slave_status)
+    {
+        if (ss.master_host == target_host && ss.master_port == target_port)
+        {
+            return &ss;
+        }
+    }
+    return NULL;
+}
+
 /**
 * Private, non-const version of 'slave_connection_status'.
 */
--- a/server/modules/monitor/mariadbmon/mariadbserver.hh
+++ b/server/modules/monitor/mariadbmon/mariadbserver.hh
@ -281,6 +281,15 @@ public:
     */
    const SlaveStatus* slave_connection_status(const MariaDBServer* target) const;

+    /**
+     * Find slave connection to the target server. Only considers host and port when selecting
+     * the connection. A matching connection is accepted even if its IO or SQL thread is stopped.
+     *
+     * @param target Immediate master or relay server
+     * @return The slave status info of the slave thread, or NULL if not found
+     */
+    const SlaveStatus* slave_connection_status_host_port(const MariaDBServer* target) const;
+
    /**
     * Is binary log on? 'update_replication_settings' should be ran before this function to query the data.
     *