MXS-1703 Cleanup manual cluster modification command handling

Switchover checks are performed after monitor is stopped to be analogous to other operations.
2018-04-09 14:05:59 +03:00 · 2018-04-09 14:05:59 +03:00 · d8a16dfe21
commit d8a16dfe21
parent 1da33c4423
3 changed files with 143 additions and 134 deletions
--- a/server/modules/monitor/mariadbmon/cluster_manipulation.cc
+++ b/server/modules/monitor/mariadbmon/cluster_manipulation.cc
@ -24,9 +24,7 @@ using std::string;
 static void print_redirect_errors(MXS_MONITORED_SERVER* first_server, const MonServerArray& servers,
                                  json_t** err_out);

-bool MariaDBMonitor::manual_switchover(MXS_MONITORED_SERVER* new_master,
-                                       MXS_MONITORED_SERVER* given_current_master,
-                                       json_t** error_out)
+bool MariaDBMonitor::manual_switchover(SERVER* new_master, SERVER* current_master, json_t** error_out)
 {
    bool stopped = stop();
    if (stopped)
@ -38,43 +36,18 @@ bool MariaDBMonitor::manual_switchover(MXS_MONITORED_SERVER* new_master,
        MXS_NOTICE("Monitor %s already stopped, switchover can proceed.", m_monitor_base->name);
    }

-    // Autoselect current master if not given as parameter.
-    MXS_MONITORED_SERVER* current_master = given_current_master;
-    if (current_master == NULL)
-    {
-        if (m_master)
-        {
-            current_master = m_master;
-        }
-        else
-        {
-            const char NO_MASTER[] = "Monitor '%s' has no master server.";
-            PRINT_MXS_JSON_ERROR(error_out, NO_MASTER, m_monitor_base->name);
-        }
-    }
-
-    bool current_ok = (current_master != NULL) ? switchover_check_current(current_master, error_out) : false;
-    bool new_ok = switchover_check_new(new_master, error_out);
-    // Check that all slaves are using gtid-replication
-    bool gtid_ok = true;
-    for (auto mon_serv = m_monitor_base->monitored_servers; mon_serv != NULL; mon_serv = mon_serv->next)
-    {
-        if (SERVER_IS_SLAVE(mon_serv->server))
-        {
-            if (!uses_gtid(mon_serv, error_out))
-            {
-                gtid_ok = false;
-            }
-        }
-    }
+    MariaDBServer *found_new_master = NULL, *found_curr_master = NULL;
+    auto ok_to_switch = switchover_check(new_master, current_master, &found_new_master, &found_curr_master,
+                                         error_out);

    bool rval = false;
-    if (current_ok && new_ok && gtid_ok)
+    if (ok_to_switch)
    {
-        bool switched = do_switchover(current_master, new_master, error_out);
+        bool switched = do_switchover(found_curr_master->server_base, found_new_master->server_base,
+                                      error_out);

-        const char* curr_master_name = current_master->server->unique_name;
-        const char* new_master_name = new_master->server->unique_name;
+        const char* curr_master_name = found_curr_master->server_base->server->unique_name;
+        const char* new_master_name = found_new_master->server_base->server->unique_name;

        if (switched)
        {
@ -152,21 +125,21 @@ bool MariaDBMonitor::manual_rejoin(SERVER* rejoin_server, json_t** output)
    if (cluster_can_be_joined())
    {
        const char* rejoin_serv_name = rejoin_server->unique_name;
-        MXS_MONITORED_SERVER* mon_server = mon_get_monitored_server(m_monitor_base, rejoin_server);
-        if (mon_server)
+        MXS_MONITORED_SERVER* mon_slave_cand = mon_get_monitored_server(m_monitor_base, rejoin_server);
+        if (mon_slave_cand)
        {
            const char* master_name = m_master->server->unique_name;
-            MariaDBServer* master_info = get_server_info(m_master);
-            MariaDBServer* server_info = get_server_info(mon_server);
+            MariaDBServer* master = get_server_info(m_master);
+            MariaDBServer* slave_cand = get_server_info(mon_slave_cand);

-            if (server_is_rejoin_suspect(mon_server, master_info, output))
+            if (server_is_rejoin_suspect(slave_cand, master, output))
            {
-                if (master_info->update_gtids())
+                if (master->update_gtids())
                {
-                    if (can_replicate_from(mon_server, server_info, master_info))
+                    if (can_replicate_from(slave_cand, master))
                    {
-                        MonServerArray joinable_server;
-                        joinable_server.push_back(mon_server);
+                        ServerRefArray joinable_server;
+                        joinable_server.push_back(slave_cand);
                        if (do_rejoin(joinable_server) == 1)
                        {
                            rval = true;
@ -359,33 +332,30 @@ bool MariaDBMonitor::redirect_one_slave(MXS_MONITORED_SERVER* slave, const char*
 * @param joinable_servers Which servers to rejoin
 * @return The number of servers successfully rejoined
 */
-uint32_t MariaDBMonitor::do_rejoin(const MonServerArray& joinable_servers)
+uint32_t MariaDBMonitor::do_rejoin(const ServerRefArray& joinable_servers)
 {
    SERVER* master_server = m_master->server;
+    const char* master_name = master_server->unique_name;
    uint32_t servers_joined = 0;
    if (!joinable_servers.empty())
    {
        string change_cmd = generate_change_master_cmd(master_server->name, master_server->port);
-        for (MonServerArray::const_iterator iter = joinable_servers.begin();
-             iter != joinable_servers.end();
-             iter++)
+        for (auto iter = joinable_servers.begin(); iter != joinable_servers.end(); iter++)
        {
-            MXS_MONITORED_SERVER* joinable = *iter;
-            const char* name = joinable->server->unique_name;
-            const char* master_name = master_server->unique_name;
-            MariaDBServer* redir_info = get_server_info(joinable);
-
+            auto joinable = *iter;
+            const char* name = joinable->server_base->server->unique_name;
            bool op_success;
-            if (redir_info->n_slaves_configured == 0)
+
+            if (joinable->n_slaves_configured == 0)
            {
                MXS_NOTICE("Directing standalone server '%s' to replicate from '%s'.", name, master_name);
-                op_success = join_cluster(joinable, change_cmd.c_str());
+                op_success = join_cluster(joinable->server_base, change_cmd.c_str());
            }
            else
            {
                MXS_NOTICE("Server '%s' is replicating from a server other than '%s', "
                           "redirecting it to '%s'.", name, master_name, master_name);
-                op_success = redirect_one_slave(joinable, change_cmd.c_str());
+                op_success = redirect_one_slave(joinable->server_base, change_cmd.c_str());
            }

            if (op_success)
@ -416,21 +386,19 @@ bool MariaDBMonitor::cluster_can_be_joined()
 * @return False, if there were possible rejoinable servers but communications error to master server
 * prevented final checks.
 */
-bool MariaDBMonitor::get_joinable_servers(MonServerArray* output)
+bool MariaDBMonitor::get_joinable_servers(ServerRefArray* output)
 {
    ss_dassert(output);
-    MariaDBServer *master_info = get_server_info(m_master);
+    MariaDBServer* master = get_server_info(m_master);

    // Whether a join operation should be attempted or not depends on several criteria. Start with the ones
    // easiest to test. Go though all slaves and construct a preliminary list.
-    MonServerArray suspects;
-    for (MXS_MONITORED_SERVER* server = m_monitor_base->monitored_servers;
-         server != NULL;
-         server = server->next)
+    ServerRefArray suspects;
+    for (size_t i = 0; i < m_servers.size(); i++)
    {
-        if (server_is_rejoin_suspect(server, master_info, NULL))
+        if (server_is_rejoin_suspect(&m_servers[i], master, NULL))
        {
-            suspects.push_back(server);
+            suspects.push_back(&m_servers[i]);
        }
    }

@ -438,15 +406,13 @@ bool MariaDBMonitor::get_joinable_servers(MonServerArray* output)
    bool comm_ok = true;
    if (!suspects.empty())
    {
-        if (master_info->update_gtids())
+        if (master->update_gtids())
        {
            for (size_t i = 0; i < suspects.size(); i++)
            {
-                MXS_MONITORED_SERVER* suspect = suspects[i];
-                MariaDBServer* suspect_info = get_server_info(suspect);
-                if (can_replicate_from(suspect, suspect_info, master_info))
+                if (can_replicate_from(suspects[i], master))
                {
-                    output->push_back(suspect);
+                    output->push_back(suspects[i]);
                }
            }
        }
@ -509,30 +475,30 @@ bool MariaDBMonitor::join_cluster(MXS_MONITORED_SERVER* server, const char* chan
 * Checks if a server is a possible rejoin candidate. A true result from this function is not yet sufficient
 * criteria and another call to can_replicate_from() should be made.
 *
- * @param server Server to check
- * @param master_info Master server info
+ * @param rejoin_cand Server to check
+ * @param master Master server info
 * @param output Error output. If NULL, no error is printed to log.
 * @return True, if server is a rejoin suspect.
 */
-bool MariaDBMonitor::server_is_rejoin_suspect(MXS_MONITORED_SERVER* rejoin_server,
-                                              MariaDBServer* master_info, json_t** output)
+bool MariaDBMonitor::server_is_rejoin_suspect(MariaDBServer* rejoin_cand, MariaDBServer* master,
+                                              json_t** output)
 {
    bool is_suspect = false;
-    if (!SERVER_IS_MASTER(rejoin_server->server) && SERVER_IS_RUNNING(rejoin_server->server))
+    auto rejoin_mon_serv = rejoin_cand->server_base;
+    if (!SERVER_IS_MASTER(rejoin_mon_serv->server) && SERVER_IS_RUNNING(rejoin_mon_serv->server))
    {
-        MariaDBServer* server_info = get_server_info(rejoin_server);
-        SlaveStatusInfo* slave_status = &server_info->slave_status;
+        SlaveStatusInfo* slave_status = &rejoin_cand->slave_status;
        // Has no slave connection, yet is not a master.
-        if (server_info->n_slaves_configured == 0)
+        if (rejoin_cand->n_slaves_configured == 0)
        {
            is_suspect = true;
        }
        // Or has existing slave connection ...
-        else if (server_info->n_slaves_configured == 1)
+        else if (rejoin_cand->n_slaves_configured == 1)
        {
            // which is connected to master but it's the wrong one
            if (slave_status->slave_io_running  &&
-                slave_status->master_server_id != master_info->server_id)
+                slave_status->master_server_id != master->server_id)
            {
                is_suspect = true;
            }
@ -548,7 +514,7 @@ bool MariaDBMonitor::server_is_rejoin_suspect(MXS_MONITORED_SERVER* rejoin_serve
    else if (output != NULL)
    {
        PRINT_MXS_JSON_ERROR(output, "Server '%s' is master or not running.",
-                             rejoin_server->server->unique_name);
+                             rejoin_mon_serv->server->unique_name);
    }
    return is_suspect;
 }
@ -557,6 +523,8 @@ bool MariaDBMonitor::server_is_rejoin_suspect(MXS_MONITORED_SERVER* rejoin_serve
 * Performs switchover for a simple topology (1 master, N slaves, no intermediate masters). If an
 * intermediate step fails, the cluster may be left without a master.
 *
+ * @param current_master Current master server
+ * @param new_master Slave which should be promoted
 * @param err_out json object for error printing. Can be NULL.
 * @return True if successful. If false, the cluster can be in various situations depending on which step
 * failed. In practice, manual intervention is usually required on failure.
@ -1432,21 +1400,19 @@ bool MariaDBMonitor::failover_check(json_t** error_out)
 }

 /**
- * Checks if slave can replicate from master. Only considers gtid:s and only detects obvious errors. The
- * non-detected errors will mostly be detected once the slave tries to start replicating.
+ * Checks if slave candidate can replicate from master. Only considers gtid:s and only detects obvious errors.
+ * The non-detected errors will mostly be detected once the slave tries to start replicating.
 *
- * @param slave Slave server candidate
- * @param slave_info Slave info
- * @param master_info Master info
+ * @param slave_cand Slave candidate server
+ * @param master_info Master server
 * @return True if slave can replicate from master
 */
-bool MariaDBMonitor::can_replicate_from(MXS_MONITORED_SERVER* slave,
-                                        MariaDBServer* slave_info, MariaDBServer* master_info)
+bool MariaDBMonitor::can_replicate_from(MariaDBServer* slave_cand, MariaDBServer* master)
 {
    bool rval = false;
-    if (slave_info->update_gtids())
+    if (slave_cand->update_gtids())
    {
-        rval = slave_info->gtid_current_pos.can_replicate_from(master_info->gtid_binlog_pos);
+        rval = slave_cand->gtid_current_pos.can_replicate_from(master->gtid_binlog_pos);
    }
    return rval;
 }
@ -1629,3 +1595,72 @@ static void print_redirect_errors(MXS_MONITORED_SERVER* first_server, const MonS
                                         "%s Errors: %s.", MSG, combined_error.c_str());
    }
 }
+
+/**
+ * Check cluster and parameters for suitability to switchover. Also writes found servers to output pointers.
+ * In case of error the output server values will not be written to.
+ *
+ * @param new_master New master requested by the user
+ * @param current_master Current master given by user. Can be null for autoselect.
+ * @param found_new_master Where to write found new master
+ * @param found_current_master Where to write found current master
+ * @param error_out Error output, can be null.
+ * @return True if cluster is suitable and server parameters were valid and found.
+ */
+bool MariaDBMonitor::switchover_check(SERVER* new_master, SERVER* current_master,
+                                      MariaDBServer** found_new_master, MariaDBServer** found_current_master,
+                                      json_t** error_out)
+{
+    const char NO_SERVER[] = "Server '%s' is not a member of monitor '%s'.";
+    // Check that both .
+    MXS_MONITORED_SERVER* mon_new_master = mon_get_monitored_server(m_monitor_base, new_master);
+    if (mon_new_master == NULL)
+    {
+        PRINT_MXS_JSON_ERROR(error_out, NO_SERVER, new_master->unique_name, m_monitor_base->name);
+    }
+
+    // Check given current master. If NULL, autoselect.
+    MXS_MONITORED_SERVER* mon_curr_master = NULL;
+    if (current_master)
+    {
+        mon_curr_master = mon_get_monitored_server(m_monitor_base, current_master);
+        if (mon_curr_master == NULL)
+        {
+            PRINT_MXS_JSON_ERROR(error_out, NO_SERVER, current_master->unique_name,
+                                 m_monitor_base->name);
+        }
+    }
+    else
+    {
+        // Autoselect current master.
+        if (m_master)
+        {
+            mon_curr_master = m_master;
+        }
+        else
+        {
+            const char NO_MASTER[] = "Monitor '%s' has no master server.";
+            PRINT_MXS_JSON_ERROR(error_out, NO_MASTER, m_monitor_base->name);
+        }
+    }
+
+    bool current_ok = mon_curr_master ? switchover_check_current(mon_curr_master, error_out) : false;
+    bool new_ok = mon_new_master ? switchover_check_new(mon_new_master, error_out) : false;
+    // Check that all slaves are using gtid-replication
+    bool gtid_ok = true;
+    for (auto mon_serv = m_monitor_base->monitored_servers; mon_serv != NULL; mon_serv = mon_serv->next)
+    {
+        if (SERVER_IS_SLAVE(mon_serv->server) && !uses_gtid(mon_serv, error_out))
+        {
+            gtid_ok = false;
+        }
+    }
+
+    bool rval = current_ok && new_ok && gtid_ok;
+    if (rval)
+    {
+        *found_new_master = get_server_info(mon_new_master);
+        *found_current_master = get_server_info(mon_curr_master);
+    }
+    return rval;
+}
--- a/server/modules/monitor/mariadbmon/mariadbmon.cc
+++ b/server/modules/monitor/mariadbmon/mariadbmon.cc
@ -951,7 +951,7 @@ void MariaDBMonitor::log_master_changes(MariaDBServer* root_master_server, int*

 void MariaDBMonitor::handle_auto_rejoin()
 {
-    MonServerArray joinable_servers;
+    ServerRefArray joinable_servers;
    if (get_joinable_servers(&joinable_servers))
    {
        uint32_t joins = do_rejoin(joinable_servers);
@ -962,8 +962,8 @@ void MariaDBMonitor::handle_auto_rejoin()
        if (joins < joinable_servers.size())
        {
            MXS_ERROR("A cluster join operation failed, disabling automatic rejoining. "
-                "To re-enable, manually set '%s' to 'true' for monitor '%s' via MaxAdmin or "
-                "the REST API.", CN_AUTO_REJOIN, m_monitor_base->name);
+                      "To re-enable, manually set '%s' to 'true' for monitor '%s' via MaxAdmin or "
+                      "the REST API.", CN_AUTO_REJOIN, m_monitor_base->name);
            m_auto_rejoin = false;
            disable_setting(CN_AUTO_REJOIN);
        }
@ -1299,7 +1299,7 @@ static json_t* diagnostics_json(const MXS_MONITOR *mon)
 *
 * @return True, if the command was executed, false otherwise.
 */
-bool mysql_handle_switchover(const MODULECMD_ARG* args, json_t** error_out)
+bool handle_manual_switchover(const MODULECMD_ARG* args, json_t** error_out)
 {
    ss_dassert((args->argc == 2) || (args->argc == 3));
    ss_dassert(MODULECMD_GET_TYPE(&args->argv[0].type) == MODULECMD_ARG_MONITOR);
@ -1318,33 +1318,7 @@ bool mysql_handle_switchover(const MODULECMD_ARG* args, json_t** error_out)
        auto handle = static_cast<MariaDBMonitor*>(mon->handle);
        SERVER* new_master = args->argv[1].value.server;
        SERVER* current_master = (args->argc == 3) ? args->argv[2].value.server : NULL;
-        bool error = false;
-        const char NO_SERVER[] = "Server '%s' is not a member of monitor '%s'.";
-
-        // Check given new master.
-        MXS_MONITORED_SERVER* mon_new_master = mon_get_monitored_server(mon, new_master);
-        if (mon_new_master == NULL)
-        {
-            PRINT_MXS_JSON_ERROR(error_out, NO_SERVER, new_master->unique_name, mon->name);
-            error = true;
-        }
-
-        // Check given old master. If NULL, manual_switchover() will autoselect.
-        MXS_MONITORED_SERVER* mon_curr_master = NULL;
-        if (current_master)
-        {
-            mon_curr_master = mon_get_monitored_server(mon, current_master);
-            if (mon_curr_master == NULL)
-            {
-                PRINT_MXS_JSON_ERROR(error_out, NO_SERVER, current_master->unique_name, mon->name);
-                error = true;
-            }
-        }
-
-        if (!error)
-        {
-            rval = handle->manual_switchover(mon_new_master, mon_curr_master, error_out);
-        }
+        rval = handle->manual_switchover(new_master, current_master, error_out);
    }
    return rval;
 }
@ -1356,7 +1330,7 @@ bool mysql_handle_switchover(const MODULECMD_ARG* args, json_t** error_out)
 * @param output Json error output
 * @return True on success
 */
-bool mysql_handle_failover(const MODULECMD_ARG* args, json_t** output)
+bool handle_manual_failover(const MODULECMD_ARG* args, json_t** output)
 {
    ss_dassert(args->argc == 1);
    ss_dassert(MODULECMD_GET_TYPE(&args->argv[0].type) == MODULECMD_ARG_MONITOR);
@ -1382,7 +1356,7 @@ bool mysql_handle_failover(const MODULECMD_ARG* args, json_t** output)
 * @param output Json error output
 * @return True on success
 */
-bool mysql_handle_rejoin(const MODULECMD_ARG* args, json_t** output)
+bool handle_manual_rejoin(const MODULECMD_ARG* args, json_t** output)
 {
    ss_dassert(args->argc == 2);
    ss_dassert(MODULECMD_GET_TYPE(&args->argv[0].type) == MODULECMD_ARG_MONITOR);
@ -1424,7 +1398,7 @@ MXS_MODULE* MXS_CREATE_MODULE()
    };

    modulecmd_register_command(MXS_MODULE_NAME, "switchover", MODULECMD_TYPE_ACTIVE,
-                               mysql_handle_switchover, MXS_ARRAY_NELEMS(switchover_argv),
+                               handle_manual_switchover, MXS_ARRAY_NELEMS(switchover_argv),
                               switchover_argv, "Perform master switchover");

    static modulecmd_arg_type_t failover_argv[] =
@ -1436,7 +1410,7 @@ MXS_MODULE* MXS_CREATE_MODULE()
    };

    modulecmd_register_command(MXS_MODULE_NAME, "failover", MODULECMD_TYPE_ACTIVE,
-                               mysql_handle_failover, MXS_ARRAY_NELEMS(failover_argv),
+                               handle_manual_failover, MXS_ARRAY_NELEMS(failover_argv),
                               failover_argv, "Perform master failover");

    static modulecmd_arg_type_t rejoin_argv[] =
@ -1449,7 +1423,7 @@ MXS_MODULE* MXS_CREATE_MODULE()
    };

    modulecmd_register_command(MXS_MODULE_NAME, "rejoin", MODULECMD_TYPE_ACTIVE,
-                               mysql_handle_rejoin, MXS_ARRAY_NELEMS(rejoin_argv),
+                               handle_manual_rejoin, MXS_ARRAY_NELEMS(rejoin_argv),
                               rejoin_argv, "Rejoin server to a cluster");

    static MXS_MONITOR_OBJECT MyObject =
--- a/server/modules/monitor/mariadbmon/mariadbmon.hh
+++ b/server/modules/monitor/mariadbmon/mariadbmon.hh
@ -89,8 +89,7 @@ public:
     *
     * @return True, if switchover was performed, false otherwise.
     */
-    bool manual_switchover(MXS_MONITORED_SERVER* new_master, MXS_MONITORED_SERVER* current_master,
-                           json_t** error_out);
+    bool manual_switchover(SERVER* new_master, SERVER* current_master, json_t** error_out);

    /**
     * Perform user-activated failover.
@ -213,7 +212,7 @@ private:
    bool start_external_replication(MXS_MONITORED_SERVER* new_master, json_t** err_out);
    bool switchover_start_slave(MXS_MONITORED_SERVER* old_master, SERVER* new_master);
    bool redirect_one_slave(MXS_MONITORED_SERVER* slave, const char* change_cmd);
-    bool get_joinable_servers(MonServerArray* output);
+    bool get_joinable_servers(ServerRefArray* output);
    bool join_cluster(MXS_MONITORED_SERVER* server, const char* change_cmd);
    void set_master_heartbeat(MXS_MONITORED_SERVER *);
    void set_slave_heartbeat(MXS_MONITORED_SERVER *);
@ -223,18 +222,19 @@ private:
    bool do_switchover(MXS_MONITORED_SERVER* current_master, MXS_MONITORED_SERVER* new_master,
                       json_t** err_out);
    bool do_failover(json_t** err_out);
-    uint32_t do_rejoin(const MonServerArray& joinable_servers);
+    uint32_t do_rejoin(const ServerRefArray& joinable_servers);
    bool mon_process_failover(bool* cluster_modified_out);
-    bool server_is_rejoin_suspect(MXS_MONITORED_SERVER* server, MariaDBServer* master_info,
-                                  json_t** output);
+    bool server_is_rejoin_suspect(MariaDBServer* rejoin_cand, MariaDBServer* master, json_t** output);
    bool cluster_can_be_joined();
    bool failover_check(json_t** error_out);
    void disable_setting(const char* setting);
+    bool switchover_check(SERVER* new_master, SERVER* current_master,
+                          MariaDBServer** found_new_master, MariaDBServer** found_current_master,
+                          json_t** error_out);
    bool switchover_check_new(const MXS_MONITORED_SERVER* monitored_server, json_t** error);
    bool switchover_check_current(const MXS_MONITORED_SERVER* suggested_curr_master,
                                  json_t** error_out) const;
-    bool can_replicate_from(MXS_MONITORED_SERVER* slave, MariaDBServer* slave_info,
-                            MariaDBServer* master_info);
+    bool can_replicate_from(MariaDBServer* slave_cand, MariaDBServer* master);
    void monitor_one_server(MariaDBServer& server);
    MariaDBServer* find_root_master();
    void update_gtid_domain();