MXS-1845 Switchover cleanup
Several small changes: the binlog is flushed at the end of old master demotion, only the new master is required to catch up to the old master, and the same replication check method is used as in failover.
parent 49e85d9a28
commit 30eb21914f
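To make the reworked flow easier to follow, here is a self-contained C++ sketch of the switchover sequence the diff below implements: demote the old master (with FLUSH LOGS now at the end of demotion), let only the promotion target catch up, promote it, then redirect the rest and run the same replication check as failover. All names and types in the sketch are hypothetical stand-ins, not MaxScale's actual API.

#include <iostream>

// Hypothetical stand-in for a monitored server; not MaxScale's MariaDBServer.
struct Server { const char* name; };

// Each step is a stub that only reports what the real monitor code would do.
static bool demote(Server& old_master)
{
    // read_only on, events off, demotion sql file, FLUSH LOGS, refresh gtid:s.
    std::cout << "demote " << old_master.name << '\n';
    return true;
}

static bool catchup_to_master(Server& new_master, Server& old_master)
{
    // Only the promotion target waits for the old master's gtid_binlog_pos.
    std::cout << new_master.name << " catches up to " << old_master.name << '\n';
    return true;
}

static bool promote(Server& new_master)
{
    // Remove slave connections, set read_only off, etc.
    std::cout << "promote " << new_master.name << '\n';
    return true;
}

static void redirect_and_check(Server& new_master, Server& old_master)
{
    // Redirect the remaining slaves plus the old master, then verify
    // replication with the same check that failover uses.
    std::cout << "redirect slaves to " << new_master.name
              << " and start replication on " << old_master.name << '\n';
}

int main()
{
    Server old_master{"server1"};
    Server new_master{"server2"};
    if (demote(old_master) && catchup_to_master(new_master, old_master) && promote(new_master))
    {
        redirect_and_check(new_master, old_master);
    }
    return 0;
}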
@@ -733,29 +733,31 @@ bool MariaDBMonitor::switchover_perform(ClusterOperation& op)
ServerArray redirectable_slaves = get_redirectables(promotion_target, demotion_target);

bool rval = false;
// Step 2: Set read-only to on, flush logs, update master gtid:s
// Step 2: Set read-only to on, flush logs, update gtid:s.
if (demotion_target->demote(op))
{
m_cluster_modified = true;
bool catchup_and_promote_success = false;
maxbase::StopWatch timer;
// Step 3: Wait for the slaves (including promotion target) to catch up with master.
ServerArray catchup_slaves = redirectable_slaves;
catchup_slaves.push_back(promotion_target);
if (switchover_wait_slaves_catchup(catchup_slaves,
demotion_target->m_gtid_binlog_pos,
op.time_remaining.secs(),
error_out))
StopWatch timer;
// Step 3: Wait for the promotion target to catch up with the demotion target. Disregard the other
// slaves of the promotion target to avoid needless waiting.
// The gtid:s of the demotion target were updated at the end of demotion.
if (promotion_target->catchup_to_master(op))
{
auto step3_duration = timer.lap();
MXS_DEBUG("Switchover: slave catchup took %.1f seconds.", step3_duration.secs());
op.time_remaining -= step3_duration;

// Step 4: On new master STOP and RESET SLAVE, set read-only to off.
MXS_INFO("Switchover: Catchup took %.1f seconds.", timer.lap().secs());
// Step 4: On new master: remove slave connections, set read-only to OFF etc.
if (promotion_target->promote(op))
{
// Point of no return. Even if following steps fail, do not try to undo.
// Switchover considered at least partially successful.
catchup_and_promote_success = true;
m_next_master = promotion_target;
rval = true;
if (op.demotion_target_is_master)
{
// Force a master swap on next tick.
m_next_master = promotion_target;
}

timer.restart();
// Step 5: Redirect slaves and start replication on old master.
ServerArray redirected_slaves;
@@ -764,35 +766,20 @@ bool MariaDBMonitor::switchover_perform(ClusterOperation& op)
{
redirected_slaves.push_back(demotion_target);
}
op.time_remaining -= timer.lap();

int redirects = redirect_slaves_ex(op, redirectable_slaves, &redirected_slaves);

bool success = redirectable_slaves.empty() ? start_ok : start_ok || redirects > 0;
if (success)
{
op.time_remaining -= timer.lap();
// Step 6: Finally, add an event to the new master to advance gtid and wait for the slaves
// to receive it. If using external replication, skip this step. Come up with an
// alternative later.
if (m_external_master_port != PORT_UNKNOWN)
{
MXS_WARNING("Replicating from external master, skipping final check.");
rval = true;
}
else if (wait_cluster_stabilization(promotion_target,
redirected_slaves,
op.time_remaining.secs()))
{
rval = true;
auto step6_duration = timer.lap();
op.time_remaining -= step6_duration;
MXS_DEBUG("Switchover: slave replication confirmation took %.1f seconds with "
"%.1f seconds to spare.",
step6_duration.secs(), op.time_remaining.secs());
}
}
else
{
print_redirect_errors(demotion_target, redirectable_slaves, error_out);
timer.restart();
// Step 6: Finally, check that slaves are replicating.
wait_cluster_stabilization_ex(op, redirected_slaves);
auto step6_duration = timer.lap();
MXS_INFO("Switchover: slave replication confirmation took %.1f seconds with "
"%.1f seconds to spare.",
step6_duration.secs(), op.time_remaining.secs());
}
}
}
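For context on the new step 6, a minimal sketch of what "check that slaves are replicating" amounts to: every redirected slave should have running IO and SQL threads pointed at the new master. The SlaveStatus struct and its field names are assumptions made for illustration; the real wait_cluster_stabilization_ex is not part of this diff.

#include <string>
#include <vector>

// Hypothetical slave status fields; MaxScale's real structures differ.
struct SlaveStatus
{
    std::string master_host;
    int master_port = 0;
    bool io_running = false;
    bool sql_running = false;
};

// Returns true if every redirected slave replicates from the new master.
// A sketch of the idea behind a stabilization check, not the monitor's code.
static bool slaves_stabilized(const std::vector<SlaveStatus>& slaves,
                              const std::string& master_host, int master_port)
{
    for (const auto& s : slaves)
    {
        if (s.master_host != master_host || s.master_port != master_port
            || !s.io_running || !s.sql_running)
        {
            return false;
        }
    }
    return true;
}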
@@ -994,46 +981,6 @@ bool MariaDBMonitor::switchover_demote_master(MariaDBServer* current_master, jso
return !any_error();
}

/**
* Wait until slave replication catches up with the master gtid for all slaves in the vector.
*
* @param slave Slaves to wait on
* @param gtid Which gtid must be reached
* @param total_timeout Maximum wait time in seconds
* @param err_out json object for error printing. Can be NULL.
* @return True, if target gtid was reached within allotted time for all servers
*/
bool MariaDBMonitor::switchover_wait_slaves_catchup(const ServerArray& slaves,
const GtidList& gtid,
int total_timeout,
json_t** err_out)
{
bool success = true;
int seconds_remaining = total_timeout;

for (auto iter = slaves.begin(); iter != slaves.end() && success; iter++)
{
if (seconds_remaining <= 0)
{
success = false;
}
else
{
time_t begin = time(NULL);
MariaDBServer* slave_server = *iter;
if (slave_server->wait_until_gtid(gtid, seconds_remaining, err_out))
{
seconds_remaining -= difftime(time(NULL), begin);
}
else
{
success = false;
}
}
}
return success;
}

/**
* Send an event to new master and wait for slaves to get the event.
*
@@ -234,10 +234,6 @@ private:
Log log_mode, json_t** error_out);
bool switchover_perform(ClusterOperation& operation);
bool switchover_demote_master(MariaDBServer* current_master, json_t** err_out);
bool switchover_wait_slaves_catchup(const ServerArray& slaves,
const GtidList& gtid,
int total_timeout,
json_t** err_out);
bool switchover_start_slave(MariaDBServer* old_master, MariaDBServer* new_master);
bool manual_switchover(SERVER* new_master, SERVER* current_master, json_t** error_out);
void handle_low_disk_space_master();
@@ -502,20 +502,24 @@ void MariaDBServer::warn_replication_settings() const
}
}

bool MariaDBServer::wait_until_gtid(const GtidList& target, int timeout, json_t** err_out)
bool MariaDBServer::catchup_to_master(ClusterOperation& op)
{
bool gtid_reached = false;
bool error = false;
/* Prefer to use gtid_binlog_pos, as that is more reliable. But if log_slave_updates is not on,
* use gtid_current_pos. */
const bool use_binlog_pos = m_rpl_settings.log_bin && m_rpl_settings.log_slave_updates;
bool time_is_up = false; // Check at least once.
bool gtid_reached = false;
bool error = false;
const GtidList& target = op.demotion_target->m_gtid_binlog_pos;
json_t** error_out = op.error_out;

int seconds_remaining = 1; // Cheat a bit here to allow at least one iteration.
int sleep_ms = 200; // How long to sleep on next iteration. Incremented slowly.
time_t start_time = time(NULL);
while (seconds_remaining > 0 && !gtid_reached && !error)
Duration sleep_time(0.2); // How long to sleep before next iteration. Incremented slowly.
StopWatch timer;

while (!time_is_up && !gtid_reached && !error)
{
if (update_gtids())
string error_msg;
if (update_gtids(&error_msg))
{
const GtidList& compare_to = use_binlog_pos ? m_gtid_binlog_pos : m_gtid_current_pos;
if (target.events_ahead(compare_to, GtidList::MISSING_DOMAIN_IGNORE) == 0)
@@ -524,31 +528,33 @@ bool MariaDBServer::wait_until_gtid(const GtidList& target, int timeout, json_t*
}
else
{
// Query was successful but target gtid not yet reached. Check elapsed time.
seconds_remaining = timeout - difftime(time(NULL), start_time);
if (seconds_remaining > 0)
// Query was successful but target gtid not yet reached. Check how much time left.
op.time_remaining -= timer.lap();
if (op.time_remaining.secs() > 0)
{
// Sleep for a moment, then try again.
std::this_thread::sleep_for(std::chrono::milliseconds(sleep_ms));
sleep_ms += 100; // Sleep a bit more next iteration.
Duration this_sleep = MXS_MIN(sleep_time, op.time_remaining);
std::this_thread::sleep_for(this_sleep);
sleep_time += Duration(0.1); // Sleep a bit more next iteration.
}
else
{
time_is_up = true;
}
}
}
else
{
error = true;
PRINT_MXS_JSON_ERROR(error_out,
"Failed to update gtid on %s while waiting for catchup: %s",
name(), error_msg.c_str());
}
}

if (error)
if (!error && !gtid_reached)
{
PRINT_MXS_JSON_ERROR(err_out,
"Failed to update gtid on server '%s' while waiting for catchup.",
name());
}
else if (!gtid_reached)
{
PRINT_MXS_JSON_ERROR(err_out, "Slave catchup timed out on slave '%s'.", name());
PRINT_MXS_JSON_ERROR(error_out, "Slave catchup timed out on slave '%s'.", name());
}
return gtid_reached;
}
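The loop above boils down to a simple polling pattern: refresh the gtid positions, compare against the target, and sleep with a slowly growing interval that is capped by the remaining time. Below is a standalone sketch of that pattern, with the gtid queries passed in as hypothetical callbacks rather than MaxScale's real methods.

#include <algorithm>
#include <chrono>
#include <functional>
#include <thread>

using Clock = std::chrono::steady_clock;
using Duration = std::chrono::duration<double>;

// A minimal sketch of the timed catchup loop: 'update_gtids' and 'gtid_reached'
// are caller-supplied stand-ins for the real gtid queries.
bool wait_for_catchup(const std::function<bool()>& update_gtids,
                      const std::function<bool()>& gtid_reached,
                      Duration time_remaining)
{
    Duration sleep_time(0.2);   // Grows a little every iteration.
    auto last = Clock::now();
    bool time_is_up = false;

    while (!time_is_up)
    {
        if (!update_gtids())
        {
            return false;       // Query failed, give up.
        }
        if (gtid_reached())
        {
            return true;        // Target gtid reached.
        }
        auto now = Clock::now();
        time_remaining -= now - last;
        last = now;
        if (time_remaining.count() > 0)
        {
            // Sleep for a moment (never longer than the time left), then retry.
            std::this_thread::sleep_for(std::min(sleep_time, time_remaining));
            sleep_time += Duration(0.1);
        }
        else
        {
            time_is_up = true;
        }
    }
    return false;               // Timed out.
}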
@@ -1568,13 +1574,19 @@ bool MariaDBServer::demote(ClusterOperation& op)

if (!stop_slave_error)
{
// Step 2: If this server is master, disable writes and scheduled events.
// Flush logs, update gtid:s, run demotion_sql_file.
// Step 2: If this server is master, disable writes and scheduled events, flush logs,
// update gtid:s, run demotion_sql_file.

// In theory, this part should be ran in the opposite order so it would "reverse"
// the promotion code. However, it's probably better to run the most
// likely part to fail, setting read_only=1, first to make undoing easier. Setting
// read_only may fail if another session has table locks or is doing long writes.
bool demotion_error = false;
if (op.demotion_target_is_master)
{
mxb_assert(is_master());
// Step 2a: Enabling read-only can take time if writes are on or table locks taken.
// TODO: use max_statement_time to be safe!
bool ro_enabled = set_read_only(ReadOnlySetting::ENABLE, op.time_remaining, error_out);
op.time_remaining -= timer.lap();
if (!ro_enabled)
@@ -1597,33 +1609,7 @@ bool MariaDBServer::demote(ClusterOperation& op)
}
}

if (!demotion_error)
{
// Step 2c: FLUSH LOGS to ensure that all events have been written to binlog,
// then update gtid:s.
string error_msg;
bool logs_flushed = execute_cmd_time_limit("FLUSH LOGS;", op.time_remaining, &error_msg);
op.time_remaining -= timer.lap();
if (logs_flushed)
{
if (!update_gtids(&error_msg))
{
demotion_error = true;
PRINT_MXS_JSON_ERROR(error_out,
"Failed to update gtid:s of %s during demotion: %s.",
name(), error_msg.c_str());
}
}
else
{
demotion_error = true;
PRINT_MXS_JSON_ERROR(error_out,
"Failed to flush binary logs of %s during demotion: %s.",
name(), error_msg.c_str());
}
}

// Step 2d: Run demotion_sql_file if no errors so far.
// Step 2c: Run demotion_sql_file if no errors so far.
if (!demotion_error && !op.demotion_sql_file.empty())
{
bool file_ran_ok = run_sql_from_file(op.demotion_sql_file, error_out);
@@ -1637,19 +1623,48 @@ bool MariaDBServer::demote(ClusterOperation& op)
}
}

if (demotion_error)
if (!demotion_error)
{
// Read_only was enabled but a later step failed. Disable read_only. Connection is
// likely broken so use a short time limit.
// TODO: add smarter undo
set_read_only(ReadOnlySetting::DISABLE, Duration((double)0), NULL);
// Step 2d: FLUSH LOGS to ensure that all events have been written to binlog.
string error_msg;
bool logs_flushed = execute_cmd_time_limit("FLUSH LOGS;", op.time_remaining, &error_msg);
op.time_remaining -= timer.lap();
if (!logs_flushed)
{
demotion_error = true;
PRINT_MXS_JSON_ERROR(error_out,
"Failed to flush binary logs of %s during demotion: %s.",
name(), error_msg.c_str());
}
}
}
}

if (!demotion_error)
{
success = true;
// Finally, update gtid:s.
string error_msg;
if (update_gtids(&error_msg))
{
success = true;
}
else
{
demotion_error = true;
PRINT_MXS_JSON_ERROR(error_out,
"Failed to update gtid:s of %s during demotion: %s.",
name(), error_msg.c_str());
}
}

if (demotion_error && op.demotion_target_is_master)
{
// Read_only was enabled (or tried to be enabled) but a later step failed.
// Disable read_only. Connection is likely broken so use a short time limit.
// Even this is insufficient, because the server may still be executing the old
// 'SET GLOBAL read_only=1' query.
// TODO: add smarter undo, KILL QUERY etc.
set_read_only(ReadOnlySetting::DISABLE, Duration((double)0), NULL);
}
}
return success;
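Pulling the demote() changes together, the new ordering is: enable read_only first (the step most likely to fail, so it is the easiest to undo), then disable events and run the demotion sql file, FLUSH LOGS last so every event reaches the binlog, and only then refresh the gtid:s that the promotion target will catch up to. A hedged sketch of that ordering, with every step as a caller-supplied stand-in rather than MaxScale's real API:

#include <functional>

// Hypothetical demotion steps; each returns true on success.
struct DemotionSteps
{
    std::function<bool()> enable_read_only;   // Most likely to fail, so run first.
    std::function<bool()> disable_events_and_run_sql_file;
    std::function<bool()> flush_logs;         // Moved to the end of demotion.
    std::function<bool()> update_gtids;       // Refresh gtid:s after the flush.
    std::function<void()> undo_read_only;     // Best-effort rollback.
};

bool demote_sketch(const DemotionSteps& steps, bool target_is_master)
{
    bool error = false;
    if (target_is_master)
    {
        error = !steps.enable_read_only();
    }
    if (!error)
    {
        error = !steps.disable_events_and_run_sql_file();
    }
    if (!error)
    {
        error = !steps.flush_logs();
    }
    // Gtid:s are refreshed only after the flush, so the position the promotion
    // target catches up to includes everything written to the binlog.
    if (!error)
    {
        error = !steps.update_gtids();
    }
    if (error && target_is_master)
    {
        steps.undo_read_only();
    }
    return !error;
}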
@@ -260,16 +260,17 @@ public:
void warn_replication_settings() const;

/**
* Wait until server catches up to the target gtid. Only considers gtid domains common to this server
* and the target gtid. The gtid compared is the gtid_binlog_pos if this server has both log_bin and
* log_slave_updates on, and gtid_current_pos otherwise.
* Wait until server catches up to demotion target. Only considers gtid domains common
* to this server and the target. The gtid compared to on the demotion target is 'gtid_binlog_pos'.
* It is not updated during this method.
*
* @param target Which gtid must be reached
* @param timeout Maximum wait time in seconds
* @param err_out json object for error printing. Can be NULL.
* @return True, if target gtid was reached within allotted time
* The gtid used for comparison on this server is 'gtid_binlog_pos' if this server has both 'log_bin'
* and 'log_slave_updates' on, and 'gtid_current_pos' otherwise. This server is updated during the
* method.
*
* @return True, if target server gtid was reached within allotted time
*/
bool wait_until_gtid(const GtidList& target, int timeout, json_t** err_out);
bool catchup_to_master(ClusterOperation& op);

/**
* Find slave connection to the target server. If the IO thread is trying to connect