Relay log waiting is part of failover_prepare

Since the servers are not modified before or during the wait, the waiting can be done in the preparation method. This simplifies the actual failover somewhat, and allows the monitor to keep running normally while waiting for the log to clear.
2018-08-28 18:47:38 +03:00
parent 32c4e60516
commit 7cd1cfdb80
5 changed files with 92 additions and 135 deletions
--- a/server/modules/monitor/mariadbmon/cluster_manipulation.cc
+++ b/server/modules/monitor/mariadbmon/cluster_manipulation.cc
@ -598,64 +598,54 @@ bool MariaDBMonitor::failover_perform(MariaDBServer* promotion_target, MariaDBSe
    // completed.
    int seconds_remaining = m_failover_timeout;
    time_t start_time = time(NULL);
+
    // Step 1: Populate a vector with all slaves not the selected master.
    ServerArray redirectable_slaves = get_redirectables(promotion_target, demotion_target);

-    time_t step1_time = time(NULL);
-    seconds_remaining -= difftime(step1_time, start_time);
-
    bool rval = false;
-    // Step 2: Wait until relay log consumed.
-    if (promotion_target->failover_wait_relay_log(demotion_target, seconds_remaining, error_out))
+    // Step 2: Stop and reset slave, set read-only to 0.
+    if (promote_new_master(promotion_target, error_out))
    {
-        time_t step2_time = time(NULL);
-        int seconds_step2 = difftime(step2_time, step1_time);
-        MXS_DEBUG("Failover: relay log processing took %d seconds.", seconds_step2);
-        seconds_remaining -= seconds_step2;
+        m_next_master = promotion_target;
+        m_cluster_modified = true;

-        // Step 3: Stop and reset slave, set read-only to 0.
-        if (promote_new_master(promotion_target, error_out))
+        // Step 3: Redirect slaves.
+        ServerArray redirected_slaves;
+        int redirects = redirect_slaves(promotion_target, redirectable_slaves, &redirected_slaves);
+        bool success = redirectable_slaves.empty() ? true : redirects > 0;
+        if (success)
        {
-            m_next_master = promotion_target;
-            m_cluster_modified = true;
-            // Step 4: Redirect slaves.
-            ServerArray redirected_slaves;
-            int redirects = redirect_slaves(promotion_target, redirectable_slaves, &redirected_slaves);
-            bool success = redirectable_slaves.empty() ? true : redirects > 0;
-            if (success)
-            {
-                time_t step4_time = time(NULL);
-                seconds_remaining -= difftime(step4_time, step2_time);
+            time_t step3_time = time(NULL);
+            seconds_remaining -= difftime(step3_time, start_time);

-                // Step 5: Finally, add an event to the new master to advance gtid and wait for the slaves
-                // to receive it. seconds_remaining can be 0 or less at this point. Even in such a case
-                // wait_cluster_stabilization() may succeed if replication is fast enough. If using external
-                // replication, skip this step. Come up with an alternative later.
-                if (m_external_master_port != PORT_UNKNOWN)
-                {
-                    MXS_WARNING("Replicating from external master, skipping final check.");
-                    rval = true;
-                }
-                else if (redirected_slaves.empty())
-                {
-                    // No slaves to check. Assume success.
-                    rval = true;
-                    MXS_DEBUG("Failover: no slaves to redirect, skipping stabilization check.");
-                }
-                else if (wait_cluster_stabilization(promotion_target, redirected_slaves, seconds_remaining))
-                {
-                    rval = true;
-                    time_t step5_time = time(NULL);
-                    int seconds_step5 = difftime(step5_time, step4_time);
-                    seconds_remaining -= seconds_step5;
-                    MXS_DEBUG("Failover: slave replication confirmation took %d seconds with "
-                              "%d seconds to spare.", seconds_step5, seconds_remaining);
-                }
-            }
-            else
+            // Step 4: Finally, add an event to the new master to advance gtid and wait for the slaves
+            // to receive it. seconds_remaining can be 0 or less at this point. Even in such a case
+            // wait_cluster_stabilization() may succeed if replication is fast enough. If using external
+            // replication, skip this step. Come up with an alternative later.
+            if (m_external_master_port != PORT_UNKNOWN)
            {
-                print_redirect_errors(NULL, redirectable_slaves, error_out);
+                MXS_WARNING("Replicating from external master, skipping final check.");
+                rval = true;
            }
+            else if (redirected_slaves.empty())
+            {
+                // No slaves to check. Assume success.
+                rval = true;
+                MXS_DEBUG("Failover: no slaves to redirect, skipping stabilization check.");
+            }
+            else if (wait_cluster_stabilization(promotion_target, redirected_slaves, seconds_remaining))
+            {
+                rval = true;
+                time_t step4_time = time(NULL);
+                int seconds_step4 = difftime(step4_time, step3_time);
+                seconds_remaining -= seconds_step4;
+                MXS_DEBUG("Failover: slave replication confirmation took %d seconds with "
+                          "%d seconds to spare.", seconds_step4, seconds_remaining);
+            }
+        }
+        else
+        {
+            print_redirect_errors(NULL, redirectable_slaves, error_out);
        }
    }

@ -1222,9 +1212,49 @@ bool MariaDBMonitor::failover_prepare(Log log_mode,

    if (promotion_target && demotion_target && gtid_ok)
    {
-        *promotion_target_out = promotion_target;
-        *demotion_target_out = demotion_target;
-        return true;
+        const SlaveStatus* slave_conn = promotion_target->slave_connection_status(demotion_target);
+        mxb_assert(slave_conn);
+        uint64_t events = promotion_target->relay_log_events(*slave_conn);
+        if (events > 0)
+        {
+            // The relay log of the promotion target is not yet clear. This is not really an error,
+            // but should be communicated to the user in the case of manual failover. For automatic
+            // failover, it's best to just try again during the next monitor iteration. The difference
+            // to a typical prepare-fail is that the relay log status should be logged
+            // repeatedly since it is likely to change continuously.
+            if (error_out)
+            {
+                // Print a bit more helpful error for the user, goes to log too. This should be a very rare
+                // occurrence: either the dba managed to start failover really fast, or the relay log is
+                // massive. In the latter case it's ok that the monitor does not do the waiting since there
+                // is no telling how long the wait will be.
+                const char wait_relay_log[] = "The relay log of '%s' has %" PRIu64 " unprocessed events "
+                "(Gtid_IO_Pos: %s, Gtid_Current_Pos: %s). To avoid data loss, failover should be "
+                "postponed until the log has been processed. Please try again later.";
+                string error_msg = string_printf(wait_relay_log,
+                                                 promotion_target->name(),
+                                                 events,
+                                                 slave_conn->gtid_io_pos.to_string().c_str(),
+                                                 promotion_target->m_gtid_current_pos.to_string().c_str());
+                PRINT_MXS_JSON_ERROR(error_out, "%s", error_msg.c_str());
+            }
+            else if (log_mode == Log::ON)
+            {
+                // For automatic failover the message is more typical. TODO: Think if this message should
+                // be logged more often.
+                MXS_WARNING("The relay log of '%s' has %" PRId64 " unprocessed events "
+                            "(Gtid_IO_Pos: %s, Gtid_Current_Pos: %s). To avoid data loss, "
+                            "failover is postponed until the log has been processed.",
+                            promotion_target->name(), events, slave_conn->gtid_io_pos.to_string().c_str(),
+                            promotion_target->m_gtid_current_pos.to_string().c_str());
+            }
+        }
+        else
+        {
+            *promotion_target_out = promotion_target;
+            *demotion_target_out = demotion_target;
+            return true;
+        }
    }
    return false;
 }
@ -1287,7 +1317,7 @@ void MariaDBMonitor::handle_auto_failover()
            // Servers were not modified, so it's ok to try this again.
            if (m_warn_failover_precond)
            {
-                MXS_WARNING("Not performing automatic failover. Will keep retrying with this message "
+                MXS_WARNING("Not performing automatic failover. Will keep retrying with most error messages "
                            "suppressed.");
                m_warn_failover_precond = false;
            }
--- a/server/modules/monitor/mariadbmon/gtid.cc
+++ b/server/modules/monitor/mariadbmon/gtid.cc
@ -97,12 +97,13 @@ bool GtidList::operator == (const GtidList& rhs) const
    return m_triplets == rhs.m_triplets;
 }

-int64_t GtidList::events_ahead(const GtidList& rhs, substraction_mode_t domain_substraction_mode) const
+uint64_t GtidList::events_ahead(const GtidList& rhs, substraction_mode_t domain_substraction_mode) const
 {
    const size_t n_lhs = m_triplets.size();
    const size_t n_rhs = rhs.m_triplets.size();
    size_t ind_lhs = 0, ind_rhs = 0;
    uint64_t events = 0;
+
    // GtidLists are assumed to be ordered by domain in ascending order.
    while (ind_lhs < n_lhs && ind_rhs < n_rhs)
    {
@ -138,7 +139,7 @@ int64_t GtidList::events_ahead(const GtidList& rhs, substraction_mode_t domain_s
            ind_rhs++;
        }
    }
-    return (events > INT64_MAX) ? INT64_MAX : events;
+    return events;
 }

 Gtid Gtid::from_string(const char* str, char** endptr)
--- a/server/modules/monitor/mariadbmon/gtid.hh
+++ b/server/modules/monitor/mariadbmon/gtid.hh
@ -140,7 +140,7 @@ public:
     * the sequence number on lhs is added to the total difference.
     * @return The number of events between the two gtid:s
     */
-    int64_t events_ahead(const GtidList& rhs, substraction_mode_t domain_substraction_mode) const;
+    uint64_t events_ahead(const GtidList& rhs, substraction_mode_t domain_substraction_mode) const;

    /**
     * Return an individual gtid with the given domain.
--- a/server/modules/monitor/mariadbmon/mariadbserver.cc
+++ b/server/modules/monitor/mariadbmon/mariadbserver.cc
@ -73,19 +73,13 @@ void NodeData::reset_indexes()
    in_stack = false;
 }

-int64_t MariaDBServer::relay_log_events(const MariaDBServer* master)
+uint64_t MariaDBServer::relay_log_events(const SlaveStatus& slave_conn)
 {
    /* The events_ahead-call below ignores domains where current_pos is ahead of io_pos. This situation is
     * rare but is possible (I guess?) if the server is replicating a domain from multiple masters
     * and decides to process events from one relay log before getting new events to the other. In
     * any case, such events are obsolete and the server can be considered to have processed such logs. */
-    int64_t rval = -1;
-    const SlaveStatus* sstatus = slave_connection_status(master);
-    if (sstatus)
-    {
-        rval = sstatus->gtid_io_pos.events_ahead(m_gtid_current_pos, GtidList::MISSING_DOMAIN_IGNORE);
-    }
-    return rval;
+    return slave_conn.gtid_io_pos.events_ahead(m_gtid_current_pos, GtidList::MISSING_DOMAIN_IGNORE);
 }

 std::unique_ptr<QueryResult> MariaDBServer::execute_query(const string& query, std::string* errmsg_out)
@ -660,62 +654,6 @@ bool MariaDBServer::join_cluster(const string& change_cmd)
    return success;
 }

-bool MariaDBServer::failover_wait_relay_log(const MariaDBServer* master, int seconds_remaining,
-                                            json_t** err_out)
-{
-    time_t begin = time(NULL);
-    bool query_ok = true;
-    bool io_pos_stable = true;
-    int64_t events = relay_log_events(master);
-    while (events > 0 && query_ok && io_pos_stable && difftime(time(NULL), begin) < seconds_remaining)
-    {
-        const SlaveStatus* sstatus = slave_connection_status(master);
-        mxb_assert(sstatus);
-        GtidList old_gtid_io_pos = sstatus->gtid_io_pos;
-
-        // Sleep for a while before querying server again.
-        MXS_NOTICE("Relay log of server '%s' not yet empty, waiting to clear %" PRId64 " events.",
-                   name(), events);
-        std::this_thread::sleep_for(std::chrono::seconds(1));
-
-        // Update gtid:s first to make sure Gtid_IO_Pos is the more recent value.
-        // It doesn't matter here, but is a general rule.
-        query_ok = update_gtids() && do_show_slave_status();
-        if (query_ok)
-        {
-            const SlaveStatus* new_sstatus = slave_connection_status(master);
-            io_pos_stable = new_sstatus ? (old_gtid_io_pos == new_sstatus->gtid_io_pos) : false;
-            events = relay_log_events(master);
-        }
-    }
-
-    bool rval = false;
-    if (events == 0 && query_ok && io_pos_stable)
-    {
-        rval = true;
-    }
-    else
-    {
-        string reason = "Timeout";
-        if (!query_ok)
-        {
-            reason = "Query error";
-        }
-        else if (!io_pos_stable)
-        {
-            reason = "Old master sent new event(s)";
-        }
-        else if (events < 0)
-        {
-            reason = string_printf("Slave connection to '%s' was removed", master->name());
-        }
-        PRINT_MXS_JSON_ERROR(err_out, "Failover: %s while waiting for server '%s' to process relay log. "
-                             "Cancelling failover.", reason.c_str(), name());
-        rval = false;
-    }
-    return rval;
-}
-
 bool MariaDBServer::run_sql_from_file(const string& path, json_t** error_out)
 {
    MYSQL* conn = m_server_base->con;
--- a/server/modules/monitor/mariadbmon/mariadbserver.hh
+++ b/server/modules/monitor/mariadbmon/mariadbserver.hh
@ -176,13 +176,12 @@ public:
    void check_permissions();

    /**
-     * Calculate how many events are left in the relay log of the slave connection to 'master'.
+     * Calculate how many events are left in the relay log of the slave connection.
     *
-     * @param master The master server from which the slave connection is replicating from
-     * @return Number of events in relay log according to latest queried info. Negative on error,
-     * e.g. the slave connection didn't exist.
+     * @param slave_conn The slave connection to calculate for
+     * @return Number of events in relay log. Always  0 or greater.
     */
-    int64_t relay_log_events(const MariaDBServer* master);
+    uint64_t relay_log_events(const SlaveStatus& slave_conn);

    /**
     * Execute a query which returns data. The results are returned as a unique pointer to a QueryResult
@ -398,17 +397,6 @@ public:
     */
    bool join_cluster(const std::string& change_cmd);

-    /**
-     * Waits until this server has processed all its relay log, or time is up.
-     *
-     * @param master The master (or relay) whose relay log should be waited on
-     * @param seconds_remaining Maximum wait time
-     * @param err_out Error output
-     * @return True if relay log was processed within time limit, or false if time ran out
-     * or an error occurred.
-     */
-    bool failover_wait_relay_log(const MariaDBServer* master, int seconds_remaining, json_t** err_out);
-
    /**
     * Check if the server can be demoted by switchover.
     *