MXS-1491: Failover can be executed manually

Also, renamed config setting "failover" to "auto_failover". Removed setting "switchover" as it is now always enabled.
2017-11-28 17:31:13 +02:00
parent 90f6d78a58
commit 508ce3a703
3 changed files with 166 additions and 59 deletions
--- a/server/modules/monitor/mysqlmon.h
+++ b/server/modules/monitor/mysqlmon.h
@ -63,9 +63,8 @@ typedef struct
    bool allow_cluster_recovery;   /**< Allow failed servers to rejoin the cluster */
    bool warn_set_standalone_master; /**< Log a warning when setting standalone master */
    bool allow_external_slaves;    /**< Whether to allow usage of external slave servers */
-    bool failover;                 /**< If master failover is enabled */
+    bool auto_failover;            /**< If automatic master failover is enabled */
    uint32_t failover_timeout;     /**< Timeout in seconds for the master failover */
-    bool switchover;               /**< If master switchover is enabled */
    uint32_t switchover_timeout;   /**< Timeout in seconds for the master switchover */
    char* replication_user;        /**< Replication user for failover */
    char* replication_password;    /**< Replication password for failover*/
--- a/server/modules/monitor/mysqlmon/mysql_mon.cc
+++ b/server/modules/monitor/mysqlmon/mysql_mon.cc
@ -99,7 +99,7 @@ static int add_slave_to_master(long *, int, long);
 static bool isMySQLEvent(mxs_monitor_event_t event);
 void check_maxscale_schema_replication(MXS_MONITOR *monitor);
 static bool mon_process_failover(MYSQL_MONITOR*, uint32_t, bool*);
-static bool do_failover(MYSQL_MONITOR* mon);
+static bool do_failover(MYSQL_MONITOR* mon, json_t** output);
 static bool do_switchover(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* current_master,
                          MXS_MONITORED_SERVER* new_master,json_t** err_out);
 static bool update_gtids(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER *database, MySqlServerInfo* info);
@ -114,9 +114,8 @@ static void disable_setting(MYSQL_MONITOR* mon, const char* setting);
 static bool report_version_err = true;
 static const char* hb_table_name = "maxscale_schema.replication_heartbeat";

-static const char CN_FAILOVER[]           = "failover";
+static const char CN_AUTO_FAILOVER[]      = "auto_failover";
 static const char CN_FAILOVER_TIMEOUT[]   = "failover_timeout";
-static const char CN_SWITCHOVER[]         = "switchover";
 static const char CN_SWITCHOVER_TIMEOUT[] = "switchover_timeout";
 static const char CN_AUTO_JOIN[]          = "auto_join";

@ -124,7 +123,7 @@ static const char CN_AUTO_JOIN[]          = "auto_join";
 static const char CN_VERIFY_MASTER_FAILURE[]    = "verify_master_failure";
 static const char CN_MASTER_FAILURE_TIMEOUT[]   = "master_failure_timeout";

-// Replication credentials parameters for failover
+// Replication credentials parameters for failover/switchover/join
 static const char CN_REPLICATION_USER[]     = "replication_user";
 static const char CN_REPLICATION_PASSWORD[] = "replication_password";

@ -285,6 +284,47 @@ bool mysql_switchover_check(MXS_MONITOR* mon,
    return rv;
 }

+/**
+ * Verify the preconditions of a failover: no server in the cluster may have both the master and
+ * the running status bits set, and at least one server must satisfy SERVER_IS_SLAVE.
+ *
+ * @param mon Cluster monitor
+ * @param error_out JSON error out; receives the reason when the check fails
+ * @return True if failover may proceed
+ */
+bool mysql_failover_check(MYSQL_MONITOR* mon, json_t** error_out)
+{
+    const uint64_t MASTER_UP_BITS = (SERVER_MASTER | SERVER_RUNNING);
+    int usable_slaves = 0;
+    MXS_MONITORED_SERVER* candidate = mon->monitor->monitored_servers;
+    while (candidate != NULL)
+    {
+        const uint64_t bits = candidate->server->status;
+        if ((bits & MASTER_UP_BITS) == MASTER_UP_BITS)
+        {
+            // A master that is still up blocks failover, even if it is in maintenance mode.
+            string master_up_msg = "Master server '";
+            master_up_msg += candidate->server->unique_name;
+            master_up_msg += "' is running";
+            master_up_msg += (bits & SERVER_MAINT) ? ", although in maintenance mode." : ".";
+            PRINT_MXS_JSON_ERROR(error_out, "%s Failover not allowed.", master_up_msg.c_str());
+            return false;
+        }
+        if (SERVER_IS_SLAVE(candidate->server))
+        {
+            usable_slaves++;
+        }
+        candidate = candidate->next;
+    }
+
+    if (usable_slaves > 0)
+    {
+        return true;
+    }
+    PRINT_MXS_JSON_ERROR(error_out, "No running slaves, cannot failover.");
+    return false;
+}
+
 /**
 * Handle switchover
 *
@ -324,7 +364,7 @@ bool mysql_switchover(MXS_MONITOR* mon, SERVER* new_master, SERVER* current_mast

    if (rv)
    {
-        bool failover = config_get_bool(mon->parameters, CN_FAILOVER);
+        bool failover = config_get_bool(mon->parameters, CN_AUTO_FAILOVER);
        rv = do_switchover(handle, monitored_current_master, monitored_new_master, output);

        if (rv)
@ -344,7 +384,7 @@ bool mysql_switchover(MXS_MONITOR* mon, SERVER* new_master, SERVER* current_mast
            {
                // TODO: There could be a more convenient way for this.
                MXS_CONFIG_PARAMETER p = {};
-                p.name = const_cast<char*>(CN_FAILOVER);
+                p.name = const_cast<char*>(CN_AUTO_FAILOVER);
                p.value = const_cast<char*>("false");

                monitorAddParameters(mon, &p);
@ -396,24 +436,10 @@ bool mysql_handle_switchover(const MODULECMD_ARG* args, json_t** output)
    bool rv = false;

    if (!config_get_global_options()->passive)
-    {
-        if (mysql_mon->switchover)
    {
        rv = mysql_switchover(mon, new_master, current_master, output);
    }
    else
-        {
-            MXS_WARNING("Attempt to perform switchover %s -> %s, even though "
-                        "switchover is not enabled.",
-                        current_master ? current_master->unique_name : "none",
-                        new_master->unique_name);
-
-            *output = mxs_json_error("Switchover %s -> %s not performed, as switchover is not enabled.",
-                                     current_master ? current_master->unique_name : "none",
-                                     new_master->unique_name);
-        }
-    }
-    else
    {
        MXS_WARNING("Attempt to perform switchover %s -> %s, even though "
                    "MaxScale is in passive mode.",
@ -427,6 +453,80 @@ bool mysql_handle_switchover(const MODULECMD_ARG* args, json_t** output)
    return rv;
 }

+/**
+ * Perform user-activated failover.
+ *
+ * The monitor is stopped (unless it already was) while the failover preconditions are checked and
+ * the failover itself is carried out. If this function stopped the monitor, it is started again
+ * before returning, whether the failover succeeded, failed or was refused by the precondition check.
+ *
+ * @param mon     Cluster monitor
+ * @param output  Json error output
+ * @return True on success
+ */
+bool mysql_failover(MXS_MONITOR* mon, json_t** output)
+{
+    MYSQL_MONITOR *handle = static_cast<MYSQL_MONITOR*>(mon->handle);
+    bool stopped = stop_monitor(mon);
+    if (stopped)
+    {
+        MXS_NOTICE("Stopped monitor %s for the duration of failover.", mon->name);
+    }
+    else
+    {
+        MXS_NOTICE("Monitor %s already stopped, failover can proceed.", mon->name);
+    }
+
+    bool rv = mysql_failover_check(handle, output);
+    if (rv)
+    {
+        rv = do_failover(handle, output);
+        if (rv)
+        {
+            MXS_NOTICE("Failover performed.");
+        }
+        else
+        {
+            PRINT_MXS_JSON_ERROR(output, "Failover failed.");
+        }
+    }
+
+    // Restart on every exit path. Previously a failed do_failover() left a monitor that this
+    // function had stopped in the stopped state, without telling the user.
+    if (stopped)
+    {
+        startMonitor(mon, mon->parameters);
+    }
+    return rv;
+}
+
+/**
+ * Module command handler for 'failover'.
+ *
+ * Refuses to act when MaxScale is in passive mode; otherwise runs a manual failover on the
+ * monitor given as the only argument.
+ *
+ * @param args Arguments given by user; exactly one, the monitor
+ * @param output Json error output
+ * @return True on success
+ */
+bool mysql_handle_failover(const MODULECMD_ARG* args, json_t** output)
+{
+    ss_dassert(args->argc == 1);
+    ss_dassert(MODULECMD_GET_TYPE(&args->argv[0].type) == MODULECMD_ARG_MONITOR);
+
+    if (config_get_global_options()->passive)
+    {
+        PRINT_MXS_JSON_ERROR(output, "Failover attempted but not performed, as MaxScale is in passive mode.");
+        return false;
+    }
+
+    MXS_MONITOR* target_monitor = args->argv[0].value.monitor;
+    return mysql_failover(target_monitor, output);
+}
+
 /**
 * The module entry point routine. It is this routine that
 * must populate the structure that is referred to as the
@ -441,12 +541,12 @@ extern "C"
 MXS_MODULE* MXS_CREATE_MODULE()
 {
    MXS_NOTICE("Initialise the MySQL Monitor module.");
-
+    // Needs static storage duration: the static argument tables in this function store a pointer
+    // to this text, and that pointer must stay valid after MXS_CREATE_MODULE() has returned.
+    static const char ARG_MONITOR_DESC[] = "MySQL Monitor name (from configuration file)";
    static modulecmd_arg_type_t switchover_argv[] =
    {
        {
            MODULECMD_ARG_MONITOR | MODULECMD_ARG_NAME_MATCHES_DOMAIN,
-            "MySQL Monitor name (from configuration file)"
+            ARG_MONITOR_DESC
        },
        { MODULECMD_ARG_SERVER, "New master" },
        { MODULECMD_ARG_SERVER | MODULECMD_ARG_OPTIONAL, "Current master (obligatory if exists)" }
@ -456,6 +556,18 @@ MXS_MODULE* MXS_CREATE_MODULE()
                               mysql_handle_switchover, MXS_ARRAY_NELEMS(switchover_argv), switchover_argv,
                               "Perform master switchover");

+    static modulecmd_arg_type_t failover_argv[] =
+    {
+        {
+            MODULECMD_ARG_MONITOR | MODULECMD_ARG_NAME_MATCHES_DOMAIN,
+            ARG_MONITOR_DESC
+        },
+    };
+
+    modulecmd_register_command(MXS_MODULE_NAME, "failover", MODULECMD_TYPE_ACTIVE,
+                               mysql_handle_failover, MXS_ARRAY_NELEMS(failover_argv), failover_argv,
+                               "Perform master failover");
+
    static MXS_MONITOR_OBJECT MyObject =
    {
        startMonitor,
@ -500,9 +612,8 @@ MXS_MODULE* MXS_CREATE_MODULE()
                MXS_MODULE_OPT_NONE,
                mxs_monitor_event_enum_values
            },
-            {CN_FAILOVER, MXS_MODULE_PARAM_BOOL, "false"},
+            {CN_AUTO_FAILOVER, MXS_MODULE_PARAM_BOOL, "false"},
            {CN_FAILOVER_TIMEOUT, MXS_MODULE_PARAM_COUNT, DEFAULT_FAILOVER_TIMEOUT},
-            {CN_SWITCHOVER, MXS_MODULE_PARAM_BOOL, "false"},
            {CN_SWITCHOVER_TIMEOUT, MXS_MODULE_PARAM_COUNT, DEFAULT_SWITCHOVER_TIMEOUT},
            {CN_REPLICATION_USER, MXS_MODULE_PARAM_STRING},
            {CN_REPLICATION_PASSWORD, MXS_MODULE_PARAM_STRING},
@ -820,9 +931,8 @@ startMonitor(MXS_MONITOR *monitor, const MXS_CONFIG_PARAMETER* params)
    handle->script = config_copy_string(params, "script");
    handle->events = config_get_enum(params, "events", mxs_monitor_event_enum_values);
    handle->allow_external_slaves = config_get_bool(params, "allow_external_slaves");
-    handle->failover = config_get_bool(params, CN_FAILOVER);
+    handle->auto_failover = config_get_bool(params, CN_AUTO_FAILOVER);
    handle->failover_timeout = config_get_integer(params, CN_FAILOVER_TIMEOUT);
-    handle->switchover = config_get_bool(params, CN_SWITCHOVER);
    handle->switchover_timeout = config_get_integer(params, CN_SWITCHOVER_TIMEOUT);
    handle->verify_master_failure = config_get_bool(params, CN_VERIFY_MASTER_FAILURE);
    handle->master_failure_timeout = config_get_integer(params, CN_MASTER_FAILURE_TIMEOUT);
@ -921,9 +1031,8 @@ static void diagnostics(DCB *dcb, const MXS_MONITOR *mon)
 {
    const MYSQL_MONITOR *handle = (const MYSQL_MONITOR *)mon->handle;

-    dcb_printf(dcb, "Failover:\t%s\n", handle->failover ? "Enabled" : "Disabled");
+    dcb_printf(dcb, "Automatic failover:\t%s\n", handle->auto_failover ? "Enabled" : "Disabled");
    dcb_printf(dcb, "Failover Timeout:\t%u\n", handle->failover_timeout);
-    dcb_printf(dcb, "Switchover:\t%s\n", handle->switchover ? "Enabled" : "Disabled");
    dcb_printf(dcb, "Switchover Timeout:\t%u\n", handle->switchover_timeout);
    dcb_printf(dcb, "Auto join:\t%s\n", handle->auto_join_cluster ? "Enabled" : "Disabled");
    dcb_printf(dcb, "MaxScale MonitorId:\t%lu\n", handle->id);
@ -972,9 +1081,8 @@ static json_t* diagnostics_json(const MXS_MONITOR *mon)
    json_object_set_new(rval, "failcount", json_integer(handle->failcount));
    json_object_set_new(rval, "allow_cluster_recovery", json_boolean(handle->allow_cluster_recovery));
    json_object_set_new(rval, "mysql51_replication", json_boolean(handle->mysql51_replication));
-    json_object_set_new(rval, CN_FAILOVER, json_boolean(handle->failover));
+    json_object_set_new(rval, CN_AUTO_FAILOVER, json_boolean(handle->auto_failover));
    json_object_set_new(rval, CN_FAILOVER_TIMEOUT, json_integer(handle->failover_timeout));
-    json_object_set_new(rval, CN_SWITCHOVER, json_boolean(handle->switchover));
    json_object_set_new(rval, CN_SWITCHOVER_TIMEOUT, json_integer(handle->switchover_timeout));
    json_object_set_new(rval, CN_AUTO_JOIN, json_boolean(handle->auto_join_cluster));

@ -2100,17 +2208,18 @@ monitorMain(void *arg)
        mon_process_state_changes(mon, handle->script, handle->events);
        bool failover_performed = false; // Has an automatic failover been performed this loop?

-        if (handle->failover)
+        if (handle->auto_failover)
        {
+            const char RE_ENABLE_FMT[] = "%s To re-enable failover, manually set '%s' to 'true' for monitor "
+                                         "'%s' via MaxAdmin or the REST API, or restart MaxScale.";
            if (failover_not_possible(handle))
            {
-                MXS_ERROR("Failover is not possible due to one or more problems in "
-                          "the replication configuration, disabling failover. "
-                          "Failover should only be enabled after the replication "
-                          "configuration  has been fixed. To re-enable failover "
-                          "functionality, manually set '%s' to 'true' for monitor "
-                          "'%s' via MaxAdmin or the REST API.", CN_FAILOVER, mon->name);
-                handle->failover = false;
+                const char PROBLEMS[] = "Failover is not possible due to one or more problems in the "
+                    "replication configuration, disabling automatic failover. Failover should only be "
+                    "enabled after the replication configuration has been fixed.";
+                MXS_ERROR(RE_ENABLE_FMT, PROBLEMS, CN_AUTO_FAILOVER, mon->name);
+                handle->auto_failover = false;
+                disable_setting(handle, CN_AUTO_FAILOVER);
            }
            else if (master_maybe_dead(handle) && master_still_alive(handle))
            {
@ -2118,12 +2227,10 @@ monitorMain(void *arg)
            }
            else if (!mon_process_failover(handle, handle->failover_timeout, &failover_performed))
            {
-                MXS_ALERT("Failed to perform failover, disabling failover functionality. "
-                          "To enable failover functionality, manually set 'failover' to "
-                          "'true' for monitor '%s' via MaxAdmin or the REST API.", mon->name);
-
-                mon_alter_parameter(handle->monitor, CN_FAILOVER, "false");
-                handle->failover = false;
+                const char FAILED[] = "Failed to perform failover, disabling automatic failover.";
+                MXS_ERROR(RE_ENABLE_FMT, FAILED, CN_AUTO_FAILOVER, mon->name);
+                handle->auto_failover = false;
+                disable_setting(handle, CN_AUTO_FAILOVER);
            }
        }

@ -2977,7 +3084,7 @@ bool mon_process_failover(MYSQL_MONITOR* monitor, uint32_t failover_timeout, boo
        MXS_NOTICE("Performing automatic failover to replace failed master '%s'.",
                   failed_master->server->unique_name);
        failed_master->new_event = false;
-        rval = do_failover(monitor);
+        rval = mysql_failover_check(monitor, NULL) && do_failover(monitor, NULL);
        if (rval)
        {
            *cluster_modified_out = true;
@ -3170,9 +3277,10 @@ MXS_MONITORED_SERVER* failover_select_new_master(MYSQL_MONITOR* mon,
 *
 * @param mon The monitor
 * @param new_master The new master
+ * @param err_out Json error output
 * @return True if relay log was processed within time limit, or false if time ran out or an error occurred.
 */
-bool failover_wait_relay_log(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* new_master)
+bool failover_wait_relay_log(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* new_master, json_t** err_out)
 {
    MySqlServerInfo* master_info = get_server_info(mon, new_master);
    time_t begin = time(NULL);
@ -3216,7 +3324,8 @@ bool failover_wait_relay_log(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* new_maste
            reason = "Invalid Gtid(s) (current_pos: " + master_info->gtid_current_pos.to_string() +
                ", io_pos: " + master_info->slave_status.gtid_io_pos.to_string() + ")";
        }
-        MXS_ERROR("Failover: %s while waiting for server '%s' to process relay log. Cancelling failover.",
+        PRINT_MXS_JSON_ERROR(err_out, "Failover: %s while waiting for server '%s' to process relay log. "
+                             "Cancelling failover.",
                             reason.c_str(), new_master->server->unique_name);
        rval = false;
    }
@ -3318,28 +3427,29 @@ int failover_redirect_slaves(MYSQL_MONITOR* mon, ServerVector& slaves, MXS_MONIT
 * Performs failover for a simple topology (1 master, N slaves, no intermediate masters).
 *
 * @param mon Server cluster monitor
+ * @param err_out Json output
 * @return True if successful
 */
-static bool do_failover(MYSQL_MONITOR* mon)
+static bool do_failover(MYSQL_MONITOR* mon, json_t** err_out)
 {
    // Topology has already been tested to be simple.
    if (mon->master_gtid_domain < 0)
    {
-        MXS_ERROR("Cluster gtid domain is unknown. Cannot failover.");
+        PRINT_MXS_JSON_ERROR(err_out, "Cluster gtid domain is unknown. Cannot failover.");
        return false;
    }
    // Step 1: Select new master. Also populate a vector with all slaves not the selected master.
    ServerVector slaves;
-    MXS_MONITORED_SERVER* new_master = failover_select_new_master(mon, &slaves, NULL);
+    MXS_MONITORED_SERVER* new_master = failover_select_new_master(mon, &slaves, err_out);
    if (new_master == NULL)
    {
        return false;
    }
    bool rval = false;
    // Step 2: Wait until relay log consumed.
-    if (failover_wait_relay_log(mon, new_master) &&
+    if (failover_wait_relay_log(mon, new_master, err_out) &&
        // Step 3: Stop and reset slave, set read-only to 0.
-        failover_promote_new_master(mon, new_master, NULL))
+        failover_promote_new_master(mon, new_master, err_out))
    {
        // Step 4: Redirect slaves.
        int redirects = failover_redirect_slaves(mon, slaves, new_master);
--- a/server/modules/routing/debugcli/debugcmd.c
+++ b/server/modules/routing/debugcli/debugcmd.c
@ -1585,8 +1585,6 @@ struct subcommand alteroptions[] =
        "backend_connect_attempts Number of re-connection attempts\n"
        "journal_max_age          Maximum age of server state journal\n"
        "script_timeout           Timeout in seconds for monitor scripts\n"
-        "failover                 Enable or disable failover\n"
-        "failover_timeout         Failover timeout in seconds\n"
        "\n"
        "This will alter an existing parameter of a monitor. To remove parameters,\n"
        "pass an empty value for a key e.g. 'maxadmin alter monitor my-monitor my-key='\n"