MXS-2426 Do not permanently disable automatic cluster operations when they fail

Instead, the operations are disabled only for "failcount" monitor ticks. Also changes
some related log messages to notices.
Esa Korhonen 2019-04-05 14:30:19 +03:00
parent b9aec60d7a
commit 5ba305c2c1
2 changed files with 82 additions and 83 deletions
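In short: instead of permanently flipping auto_failover (or another setting) to "false" after an error, the monitor now starts a cooldown measured in monitor ticks. The following is a minimal standalone sketch of that gating logic; the Monitor struct, its default values and the printf logging are illustrative stand-ins, and only the field and function names are taken from the diff below.

#include <cstdio>

// Simplified stand-in for the monitor handle; the real MYSQL_MONITOR has many more fields.
struct Monitor
{
    bool passive = false;                      // stand-in for config_get_global_options()->passive
    int  failcount = 5;                        // ticks a master must stay down before failover
    int  cluster_operation_disable_timer = 0;  // cooldown counter, decremented once per monitor tick
    bool cluster_op_performed = false;         // true if a failover/rejoin already ran this tick
};

// Instead of permanently setting e.g. auto_failover=false, start a cooldown of 'failcount' ticks.
// The +1 compensates for the decrement done at the start of the next tick.
static void delay_auto_cluster_ops(Monitor* mon)
{
    std::printf("Disabling automatic cluster operations for %i monitor ticks.\n", mon->failcount);
    mon->cluster_operation_disable_timer = mon->failcount + 1;
}

// Automatic failover, rejoin and read_only enforcement run only when MaxScale is active,
// the cooldown has expired and nothing has modified the cluster during this tick.
static bool can_perform_cluster_ops(const Monitor* mon)
{
    return !mon->passive && mon->cluster_operation_disable_timer <= 0 && !mon->cluster_op_performed;
}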

View File

@@ -33,8 +33,6 @@
#include <maxscale/modutil.h>
#include <maxscale/mysql_utils.h>
#include <maxscale/utils.h>
// TODO: For monitorAddParameters
#include "../../../core/internal/monitor.h"
/** Column positions for SHOW SLAVE STATUS */
#define MYSQL55_STATUS_MASTER_LOG_POS 5
@@ -95,7 +93,7 @@ static int add_slave_to_master(long *, int, long);
static bool isMySQLEvent(mxs_monitor_event_t event);
void check_maxscale_schema_replication(MXS_MONITOR *monitor);
static MySqlServerInfo* get_server_info(const MYSQL_MONITOR* handle, const MXS_MONITORED_SERVER* db);
static bool mon_process_failover(MYSQL_MONITOR*, uint32_t, bool*);
static void mon_process_failover(MYSQL_MONITOR* monitor);
static bool do_failover(MYSQL_MONITOR* mon, json_t** output);
static bool do_switchover(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* current_master,
MXS_MONITORED_SERVER* new_master, json_t** err_out);
@@ -109,7 +107,8 @@ static bool server_is_rejoin_suspect(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* s
static bool get_joinable_servers(MYSQL_MONITOR* mon, ServerVector* output);
static uint32_t do_rejoin(MYSQL_MONITOR* mon, const ServerVector& servers, json_t** output);
static bool join_cluster(MXS_MONITORED_SERVER* server, const char* change_cmd);
static void disable_setting(MYSQL_MONITOR* mon, const char* setting);
static void delay_auto_cluster_ops(MYSQL_MONITOR* mon);
static bool can_perform_cluster_ops(MYSQL_MONITOR* mon);
static bool cluster_can_be_joined(MYSQL_MONITOR* mon);
static bool can_replicate_from(MYSQL_MONITOR* mon,
MXS_MONITORED_SERVER* slave, MySqlServerInfo* slave_info,
@@ -552,15 +551,9 @@ bool mysql_switchover(MXS_MONITOR* mon, MXS_MONITORED_SERVER* new_master,
}
else
{
string format = "Switchover %s -> %s failed";
bool failover = config_get_bool(mon->parameters, CN_AUTO_FAILOVER);
if (failover)
{
disable_setting(handle, CN_AUTO_FAILOVER);
format += ", failover has been disabled.";
}
format += ".";
PRINT_MXS_JSON_ERROR(error_out, format.c_str(), curr_master_name, new_master_name);
const char format[] = "Switchover %s -> %s failed.";
PRINT_MXS_JSON_ERROR(error_out, format, curr_master_name, new_master_name);
delay_auto_cluster_ops(handle);
}
}
@@ -2206,7 +2199,8 @@ monitorMain(void *arg)
size_t nrounds = 0;
int log_no_master = 1;
bool heartbeat_checked = false;
handle->cluster_op_performed = false;
handle->cluster_operation_disable_timer = 0;
replication_heartbeat = handle->replicationHeartbeat;
detect_stale_master = handle->detectStaleMaster;
@@ -2255,6 +2249,12 @@
num_servers = 0;
atomic_add_uint64(&mon->ticks, 1);
handle->cluster_op_performed = false;
if (handle->cluster_operation_disable_timer > 0)
{
handle->cluster_operation_disable_timer--;
}
lock_monitor_servers(mon);
servers_status_pending_to_current(mon);
@@ -2559,34 +2559,25 @@
* need to be launched.
*/
mon_process_state_changes(mon, handle->script, handle->events);
bool cluster_modified = false; // Has an automatic failover/rejoin been performed this loop?
if (handle->auto_failover)
if (handle->auto_failover && can_perform_cluster_ops(handle))
{
const char RE_ENABLE_FMT[] = "%s To re-enable failover, manually set '%s' to 'true' for monitor "
"'%s' via MaxAdmin or the REST API, or restart MaxScale.";
if (failover_not_possible(handle))
{
const char PROBLEMS[] = "Failover is not possible due to one or more problems in the "
"replication configuration, disabling automatic failover. Failover "
"should only be enabled after the replication configuration has been "
"fixed.";
MXS_ERROR(RE_ENABLE_FMT, PROBLEMS, CN_AUTO_FAILOVER, mon->name);
handle->auto_failover = false;
disable_setting(handle, CN_AUTO_FAILOVER);
"replication configuration.";
MXS_ERROR(PROBLEMS);
delay_auto_cluster_ops(handle);
}
// If master seems to be down, check if slaves are receiving events.
else if (handle->verify_master_failure && handle->master &&
SERVER_IS_DOWN(handle->master->server) && slave_receiving_events(handle))
{
MXS_INFO("Master failure not yet confirmed by slaves, delaying failover.");
MXS_NOTICE("A slave is still receiving data from master, delaying failover.");
}
else if (!mon_process_failover(handle, handle->failover_timeout, &cluster_modified))
else
{
const char FAILED[] = "Failed to perform failover, disabling automatic failover.";
MXS_ERROR(RE_ENABLE_FMT, FAILED, CN_AUTO_FAILOVER, mon->name);
handle->auto_failover = false;
disable_setting(handle, CN_AUTO_FAILOVER);
mon_process_failover(handle);
}
}
@@ -2651,10 +2642,9 @@
// Do not auto-join servers on this monitor loop if a failover (or any other cluster modification)
// has been performed, as server states have not been updated yet. It will happen next iteration.
if (!config_get_global_options()->passive && handle->auto_rejoin &&
!cluster_modified && cluster_can_be_joined(handle))
if (handle->auto_rejoin && cluster_can_be_joined(handle) && can_perform_cluster_ops(handle))
{
// Check if any servers should be autojoined to the cluster
// Check if any servers should be autojoined to the cluster.
ServerVector joinable_servers;
if (get_joinable_servers(handle, &joinable_servers))
{
@@ -2662,7 +2652,7 @@
if (joins > 0)
{
MXS_NOTICE("%d server(s) redirected or rejoined the cluster.", joins);
cluster_modified = true;
handle->cluster_op_performed = true;
}
}
else
@@ -2675,7 +2665,7 @@
/* Check if any slave servers have read-only off and turn it on if user so wishes. Again, do not
* perform this if cluster has been modified this loop since it may not be clear which server
* should be a slave. */
if (!config_get_global_options()->passive && handle->enforce_read_only_slaves && !cluster_modified)
if (handle->enforce_read_only_slaves && can_perform_cluster_ops(handle))
{
enforce_read_only_on_slaves(handle);
}
@@ -3384,30 +3374,19 @@ void check_maxscale_schema_replication(MXS_MONITOR *monitor)
}
/**
* @brief Process possible failover event
* If a master has failed, performs failover. This function only works with flat replication topologies
* Should be called immediately after @c mon_process_state_changes.
*
* If a master failure has occurred and MaxScale is configured with failover
* functionality, this fuction executes an external failover program to elect
* a new master server.
*
* This function should be called immediately after @c mon_process_state_changes.
*
* @param monitor Monitor whose cluster is processed
* @param failover_timeout Timeout in seconds for the failover
* @param cluster_modified_out Set to true if modifying cluster
* @return True on success, false on error
*
* @todo Currently this only works with flat replication topologies and
* needs to be moved inside mysqlmon as it is MariaDB specific code.
* @param monitor Monitor whose cluster is processed
*/
bool mon_process_failover(MYSQL_MONITOR* monitor, uint32_t failover_timeout, bool* cluster_modified_out)
void mon_process_failover(MYSQL_MONITOR* monitor)
{
ss_dassert(*cluster_modified_out == false);
if (config_get_global_options()->passive ||
(monitor->master && SERVER_IS_MASTER(monitor->master->server)))
ss_dassert(monitor->cluster_op_performed == false);
if (monitor->master && SERVER_IS_MASTER(monitor->master->server))
{
return true;
return;
}
int failover_timeout = monitor->failover_timeout;
MXS_MONITORED_SERVER* failed_master = NULL;
for (MXS_MONITORED_SERVER *ptr = monitor->monitor->monitored_servers; ptr; ptr = ptr->next)
@@ -3416,12 +3395,11 @@ bool mon_process_failover(MYSQL_MONITOR* monitor, uint32_t failover_timeout, boo
{
if (failed_master)
{
MXS_ALERT("Multiple failed master servers detected: "
"'%s' is the first master to fail but server "
"'%s' has also triggered a master_down event.",
failed_master->server->unique_name,
ptr->server->unique_name);
return false;
MXS_ERROR("Multiple failed master servers detected: '%s' is the first master to fail "
"but server '%s' has also triggered a master_down event.",
failed_master->server->unique_name, ptr->server->unique_name);
delay_auto_cluster_ops(monitor);
return;
}
if (ptr->server->active_event)
@@ -3442,7 +3420,7 @@ bool mon_process_failover(MYSQL_MONITOR* monitor, uint32_t failover_timeout, boo
if (t > timeout)
{
MXS_WARNING("Failover of server '%s' did not take place within "
"%u seconds, failover needs to be re-triggered",
"%i seconds, failover needs to be re-triggered",
ptr->server->unique_name, failover_timeout);
failed_master = ptr;
}
@@ -3450,7 +3428,6 @@ bool mon_process_failover(MYSQL_MONITOR* monitor, uint32_t failover_timeout, boo
}
}
bool rval = true;
if (failed_master)
{
int failcount = monitor->failcount;
@@ -3468,12 +3445,15 @@ bool mon_process_failover(MYSQL_MONITOR* monitor, uint32_t failover_timeout, boo
monitor->warn_failover_precond = true;
MXS_NOTICE("Performing automatic failover to replace failed master '%s'.",
failed_master->server->unique_name);
failed_master->new_event = false;
// If this fails, auto_failover is disabled.
rval = do_failover(monitor, NULL);
if (rval)
if (do_failover(monitor, NULL))
{
*cluster_modified_out = true;
monitor->cluster_op_performed = true;
failed_master->new_event = false;
}
else
{
MXS_ERROR("Automatic failover failed.");
delay_auto_cluster_ops(monitor);
}
}
else
@@ -3493,8 +3473,6 @@ bool mon_process_failover(MYSQL_MONITOR* monitor, uint32_t failover_timeout, boo
{
monitor->warn_failover_precond = true;
}
return rval;
}
/**
@@ -3755,8 +3733,8 @@ bool failover_wait_relay_log(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* new_maste
io_pos_stable &&
difftime(time(NULL), begin) < seconds_remaining)
{
MXS_INFO("Relay log of server '%s' not yet empty, waiting to clear %" PRId64 " events.",
new_master->server->unique_name, master_info->relay_log_events());
MXS_NOTICE("Relay log of server '%s' not yet empty, waiting to clear %" PRId64 " events.",
new_master->server->unique_name, master_info->relay_log_events());
thread_millisleep(1000); // Sleep for a while before querying server again.
// Todo: check server version before entering failover.
Gtid old_gtid_io_pos = master_info->slave_status.gtid_io_pos;
@@ -4916,6 +4894,7 @@ static uint32_t do_rejoin(MYSQL_MONITOR* mon, const ServerVector& joinable_serve
{
SERVER* master = mon->master->server;
uint32_t servers_joined = 0;
bool rejoin_error = false;
if (!joinable_servers.empty())
{
string change_cmd = generate_change_master_cmd(mon, master->name, master->port);
@@ -4947,6 +4926,10 @@ static uint32_t do_rejoin(MYSQL_MONITOR* mon, const ServerVector& joinable_serve
MXS_NOTICE("Server '%s' is replicating from a server other than '%s', "
"redirecting it to '%s'.", name, master_name, master_name);
op_success = redirect_one_slave(joinable, change_cmd.c_str());
if (!op_success)
{
rejoin_error = true;
}
}
if (op_success)
@@ -4955,6 +4938,11 @@ static uint32_t do_rejoin(MYSQL_MONITOR* mon, const ServerVector& joinable_serve
}
}
}
if (rejoin_error)
{
delay_auto_cluster_ops(mon);
}
return servers_joined;
}
@@ -4993,19 +4981,21 @@ static bool join_cluster(MXS_MONITORED_SERVER* server, const char* change_cmd)
return success;
}
/**
* Set a monitor config parameter to "false". The effect persists over stopMonitor/startMonitor but not
* MaxScale restart. Only use on boolean config settings.
*
* @param mon Cluster monitor
* @param setting_name Setting to disable
*/
static void disable_setting(MYSQL_MONITOR* mon, const char* setting)
static void delay_auto_cluster_ops(MYSQL_MONITOR* mon)
{
MXS_CONFIG_PARAMETER p = {};
p.name = const_cast<char*>(setting);
p.value = const_cast<char*>("false");
monitorAddParameters(mon->monitor, &p);
if (mon->auto_failover || mon->auto_rejoin || mon->enforce_read_only_slaves)
{
const char DISABLING_AUTO_OPS[] = "Disabling automatic cluster operations for %i monitor ticks.";
MXS_NOTICE(DISABLING_AUTO_OPS, mon->failcount);
}
// + 1 is because the start of next tick subtracts 1.
mon->cluster_operation_disable_timer = mon->failcount + 1;
}
static bool can_perform_cluster_ops(MYSQL_MONITOR* mon)
{
return (!config_get_global_options()->passive && mon->cluster_operation_disable_timer <= 0 &&
!mon->cluster_op_performed);
}
/**
@@ -5132,6 +5122,7 @@ static bool check_sql_files(MYSQL_MONITOR* mon)
static void enforce_read_only_on_slaves(MYSQL_MONITOR* mon)
{
const char QUERY[] = "SET GLOBAL read_only=1;";
bool error = false;
for (MXS_MONITORED_SERVER* mon_srv = mon->monitor->monitored_servers; mon_srv; mon_srv = mon_srv->next)
{
MySqlServerInfo* serv_info = get_server_info(mon, mon_srv);
@@ -5145,7 +5136,13 @@ static void enforce_read_only_on_slaves(MYSQL_MONITOR* mon)
else
{
MXS_ERROR("Setting read_only on server '%s' failed: '%s.", name, mysql_error(mon_srv->con));
error = true;
}
}
}
if (error)
{
delay_auto_cluster_ops(mon);
}
}

View File

@@ -82,6 +82,8 @@ typedef struct
MXS_MONITORED_SERVER** excluded_servers; /**< Servers banned for master promotion during auto-failover. */
const char* promote_sql_file; /**< File with sql commands which are ran to a server being promoted. */
const char* demote_sql_file; /**< File with sql commands which are ran to a server being demoted. */
int cluster_operation_disable_timer; /**< Counter for temporary automatic cluster operation disabling. */
bool cluster_op_performed; /**< Has an automatic failover/rejoin been performed this loop? */
MXS_MONITOR* monitor;
} MYSQL_MONITOR;
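For reference, these two new members are driven once per monitor tick roughly as sketched below. This is a condensed, hypothetical reconstruction of the monitorMain() flow from the diff above, not the literal loop body: try_failover() and the rejoin step are placeholders, and the real loop does much more. Running it shows failover attempts being skipped for failcount ticks after a failure instead of being disabled for good.

#include <cstdio>

struct Monitor                                 // simplified stand-in for MYSQL_MONITOR
{
    bool passive = false;
    bool auto_failover = true;
    bool auto_rejoin = true;
    int  failcount = 5;
    int  cluster_operation_disable_timer = 0;
    bool cluster_op_performed = false;
};

static void delay_auto_cluster_ops(Monitor* mon)
{
    mon->cluster_operation_disable_timer = mon->failcount + 1;  // +1: the next tick decrements first
}

static bool can_perform_cluster_ops(const Monitor* mon)
{
    return !mon->passive && mon->cluster_operation_disable_timer <= 0 && !mon->cluster_op_performed;
}

// Placeholder for mon_process_failover()/do_failover(); always "fails" to demonstrate the cooldown.
static bool try_failover(Monitor*)
{
    return false;
}

// One monitor tick, reduced to the parts that drive the two new fields.
static void monitor_tick(Monitor* mon)
{
    mon->cluster_op_performed = false;             // reset: no cluster modification yet this tick
    if (mon->cluster_operation_disable_timer > 0)
    {
        mon->cluster_operation_disable_timer--;    // count the cooldown down
    }

    if (mon->auto_failover && can_perform_cluster_ops(mon))
    {
        if (try_failover(mon))
        {
            mon->cluster_op_performed = true;      // blocks rejoin/read_only enforcement until next tick
        }
        else
        {
            delay_auto_cluster_ops(mon);           // back off for 'failcount' ticks instead of disabling
        }
    }

    if (mon->auto_rejoin && can_perform_cluster_ops(mon))
    {
        // rejoin/redirect standalone servers; a failed rejoin also calls delay_auto_cluster_ops()
    }
}

int main()
{
    Monitor mon;
    for (int tick = 0; tick < 8; ++tick)
    {
        monitor_tick(&mon);
        std::printf("tick %d: disable_timer=%d\n", tick, mon.cluster_operation_disable_timer);
    }
    return 0;
}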