MXS-1513: Obay user-given new master server

If given a readily selected master server, Switchover will use it as the new master. If the given server is invalid, nothing will happen and an error is returned.
2017-11-17 16:02:36 +02:00 · 2017-11-17 16:02:36 +02:00 · dc2286c774
commit dc2286c774
parent e0818fdbeb
1 changed files with 158 additions and 69 deletions
--- a/server/modules/monitor/mysqlmon/mysql_mon.cc
+++ b/server/modules/monitor/mysqlmon/mysql_mon.cc
@ -87,7 +87,8 @@ static bool isMySQLEvent(mxs_monitor_event_t event);
 void check_maxscale_schema_replication(MXS_MONITOR *monitor);
 static bool mon_process_failover(MYSQL_MONITOR* monitor, uint32_t failover_timeout);
 static bool do_failover(MYSQL_MONITOR* mon);
-static bool do_switchover(MYSQL_MONITOR* mon, json_t** err_out);
+static bool do_switchover(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* current_master,
+                          MXS_MONITORED_SERVER* new_master,json_t** err_out);
 static bool update_gtids(MXS_MONITORED_SERVER *database, int64_t domain, MySqlServerInfo* info);
 static bool update_replication_settings(MXS_MONITORED_SERVER *database, MySqlServerInfo* info);

@ -304,7 +305,7 @@ bool mysql_switchover(MXS_MONITOR* mon, SERVER* new_master, SERVER* current_mast
    if (rv)
    {
        bool failover = config_get_bool(mon->parameters, CN_FAILOVER);
-        rv = do_switchover(handle, output);
+        rv = do_switchover(handle, monitored_current_master, monitored_new_master, output);

        if (rv)
        {
@ -2949,7 +2950,103 @@ bool mon_process_failover(MYSQL_MONITOR* monitor, uint32_t failover_timeout)
 }

 /**
- * Selects a new master. Also adds slaves which should be redirected to an array.
+ * Update replication settings and gtid:s of the slave server.
+ *
+ * @param mon Cluster monitor
+ * @param server Slave to update
+ * @return Slave server info. NULL on error, or if server is not a slave.
+ */
+static MySqlServerInfo* update_slave_info(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* server)
+{
+    MySqlServerInfo* info = get_server_info(mon, server);
+    if (info->slave_status.slave_sql_running &&
+        update_replication_settings(server, info) &&
+        update_gtids(server, info->slave_status.gtid_io_pos.domain, info))
+    {
+        return info;
+    }
+    return NULL;
+}
+
+/**
+ * Check if server has binary log enabled. Print warnings if gtid_strict_mode or log_slave_updates is off.
+ *
+ * @param server Server to check
+ * @param server_info Server info
+ * @return True if log_bin is on
+ */
+static bool check_replication_settings(const MXS_MONITORED_SERVER* server, MySqlServerInfo* server_info)
+{
+    bool rval = true;
+    const char* servername = server->server->unique_name;
+    if (server_info->rpl_settings.log_bin == false)
+    {
+        const char NO_BINLOG[] =
+            "Slave '%s' has binary log disabled and is not a valid promotion candidate.";
+        MXS_WARNING(NO_BINLOG, servername);
+        rval = false;
+    }
+    else
+    {
+        if (server_info->rpl_settings.gtid_strict_mode == false)
+        {
+            const char NO_STRICT[] =
+                "Slave '%s' has gtid_strict_mode disabled. Enabling this setting is recommended. "
+                "For more information, see https://mariadb.com/kb/en/library/gtid/#gtid_strict_mode";
+            MXS_WARNING(NO_STRICT, servername);
+        }
+        if (server_info->rpl_settings.log_slave_updates == false)
+        {
+            const char NO_SLAVE_UPDATES[] =
+                "Slave '%s' has log_slave_updates disabled. It is a valid candidate but replication "
+                "will break for lagging slaves if '%s' is promoted.";
+            MXS_WARNING(NO_SLAVE_UPDATES, servername, servername);
+        }
+    }
+    return rval;
+}
+
+/**
+ * Check that the given slave is a valid promotion candidate. Update the server info structs of all slaves.
+ * Also populate the output vector with other slave servers.
+ *
+ * @param mon Cluster monitor
+ * @param preferred Preferred new master
+ * @param slaves_out Output array for other slaves. These should be redirected to the new master. Can be NULL.
+ * @param err_out Json object for error printing. Can be NULL.
+ * @return True, if given slave is a valid promotion candidate.
+ */
+bool switchover_check_preferred_master(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* preferred,
+                                       ServerVector* slaves_out, json_t** err_out)
+{
+    ss_dassert(preferred);
+    bool rval = true;
+    MySqlServerInfo* preferred_info = update_slave_info(mon, preferred);
+    if (preferred_info == NULL || !check_replication_settings(preferred, preferred_info))
+    {
+        PRINT_MXS_JSON_ERROR(err_out, "The requested server '%s' is not a valid promotion candidate.",
+                             preferred->server->unique_name);
+        rval = false;
+    }
+    for (MXS_MONITORED_SERVER *slave = mon->monitor->monitored_servers; slave; slave = slave->next)
+    {
+        if (slave != preferred)
+        {
+            // The update_slave_info()-call is not strictly necessary here, but it should be ran to keep this
+            // function analogous with failover_select_new_master(). The later functions can then assume that
+            // slave server info is up to date.
+            MySqlServerInfo* slave_info = update_slave_info(mon, slave);
+            if (slave_info && slaves_out)
+            {
+                slaves_out->push_back(slave);
+            }
+        }
+    }
+    return rval;
+}
+
+/**
+ * Select a new master. Also add slaves which should be redirected to an array.
 *
 * @param mon The monitor
 * @param out_slaves Vector for storing slave servers, can be NULL
@ -2957,7 +3054,7 @@ bool mon_process_failover(MYSQL_MONITOR* monitor, uint32_t failover_timeout)
 * @return The found master, or NULL if not found
 */
 MXS_MONITORED_SERVER* failover_select_new_master(MYSQL_MONITOR* mon,
-                                                 ServerVector* out_slaves,
+                                                 ServerVector* slaves_out,
                                                 json_t** err_out)
 {
    /* Select a new master candidate. Selects the one with the latest event in relay log.
@ -2965,84 +3062,62 @@ MXS_MONITORED_SERVER* failover_select_new_master(MYSQL_MONITOR* mon,
    MXS_MONITORED_SERVER* new_master = NULL;
    MySqlServerInfo* new_master_info = NULL;
    int master_vector_index = -1;
-    for (MXS_MONITORED_SERVER *mon_server = mon->monitor->monitored_servers;
-         mon_server;
-         mon_server = mon_server->next)
+    for (MXS_MONITORED_SERVER *cand = mon->monitor->monitored_servers; cand; cand = cand->next)
    {
-        MySqlServerInfo* cand_info = get_server_info(mon, mon_server);
-        if (cand_info->slave_status.slave_sql_running &&
-                update_replication_settings(mon_server, cand_info) &&
-                update_gtids(mon_server, cand_info->slave_status.gtid_io_pos.domain, cand_info))
+        MySqlServerInfo* cand_info = update_slave_info(mon, cand);
+        if (cand_info)
        {
-            if (out_slaves)
+            if (slaves_out)
            {
-                out_slaves->push_back(mon_server);
+                slaves_out->push_back(cand);
            }
-            char* servername = mon_server->server->unique_name;
-            if (cand_info->rpl_settings.log_bin == false)
+            if (check_replication_settings(cand, cand_info))
            {
-                const char NO_BINLOG[] = "Slave '%s' has binary log disabled and is not a valid promotion "
-                                         "candidate.";
-                MXS_WARNING(NO_BINLOG, servername);
-                continue;
-            }
-            if (cand_info->rpl_settings.gtid_strict_mode == false)
-            {
-                const char NO_STRICT[] = "Slave '%s' has gtid_strict_mode disabled. Enabling this setting is "
-                                         "recommended. For more information, see "
-                                         "https://mariadb.com/kb/en/library/gtid/#gtid_strict_mode";
-                MXS_WARNING(NO_STRICT, servername);
-            }
-            if (cand_info->rpl_settings.log_slave_updates == false)
-            {
-                const char NO_SLAVE_UPDATES[] = "Slave '%s' has log_slave_updates disabled. It is a valid "
-                                                "candidate but replication will break for lagging slaves if "
-                                                "'%s' is promoted.";
-                MXS_WARNING(NO_SLAVE_UPDATES, servername, servername);
-            }
-            bool select_this = false;
-            // If no candidate yet, accept any.
-            if (new_master == NULL)
-            {
-                select_this = true;
-            }
-            else
-            {
-                uint64_t cand_io = cand_info->slave_status.gtid_io_pos.sequence;
-                uint64_t cand_processed = cand_info->gtid_slave_pos.sequence;
-                uint64_t master_io = new_master_info->slave_status.gtid_io_pos.sequence;
-                uint64_t master_processed = new_master_info->gtid_slave_pos.sequence;
-                bool cand_updates = cand_info->rpl_settings.log_slave_updates;
-                bool master_updates = new_master_info->rpl_settings.log_slave_updates;
-                // Otherwise accept a slave with a later event in relay log.
-                if (cand_io > master_io ||
-                        // If io sequences are identical, the slave with more events processed wins.
-                        (cand_io == master_io && (cand_processed > master_processed ||
-                        // Finally, if binlog positions are identical, prefer a slave with log_slave_updates.
-                        (cand_processed == master_processed && cand_updates && !master_updates))))
+                bool select_this = false;
+                // If no candidate yet, accept any.
+                if (new_master == NULL)
                {
                    select_this = true;
                }
-            }
-
-            if (select_this)
-            {
-                new_master = mon_server;
-                new_master_info = cand_info;
-                if (out_slaves)
+                else
                {
-                    master_vector_index = out_slaves->size() - 1;
+                    uint64_t cand_io = cand_info->slave_status.gtid_io_pos.sequence;
+                    uint64_t cand_processed = cand_info->gtid_slave_pos.sequence;
+                    uint64_t master_io = new_master_info->slave_status.gtid_io_pos.sequence;
+                    uint64_t master_processed = new_master_info->gtid_slave_pos.sequence;
+                    bool cand_updates = cand_info->rpl_settings.log_slave_updates;
+                    bool master_updates = new_master_info->rpl_settings.log_slave_updates;
+                    // Otherwise accept a slave with a later event in relay log.
+                    if (cand_io > master_io ||
+                            // If io sequences are identical, the slave with more events processed wins.
+                            (cand_io == master_io && (cand_processed > master_processed ||
+                            // Finally, if binlog positions are identical, prefer a slave with
+                            // log_slave_updates.
+                            (cand_processed == master_processed && cand_updates && !master_updates))))
+                    {
+                        select_this = true;
+                    }
+                }
+
+                if (select_this)
+                {
+                    new_master = cand;
+                    new_master_info = cand_info;
+                    if (slaves_out)
+                    {
+                        master_vector_index = slaves_out->size() - 1;
+                    }
                }
            }
        }
    }

-    if (new_master && out_slaves)
+    if (new_master && slaves_out)
    {
        // Remove the selected master from the vector.
-        ServerVector::iterator remove_this = out_slaves->begin();
+        ServerVector::iterator remove_this = slaves_out->begin();
        remove_this += master_vector_index;
-        out_slaves->erase(remove_this);
+        slaves_out->erase(remove_this);
    }
    if (new_master == NULL)
    {
@ -3481,21 +3556,35 @@ static string get_connection_errors(ServerVector& slaves, MXS_MONITORED_SERVER*
 * @return True if successful. If false, the cluster can be in various situations depending on which step
 * failed. In practice, manual intervention is usually required on failure.
 */
-static bool do_switchover(MYSQL_MONITOR* mon, json_t** err_out)
+static bool do_switchover(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* current_master,
+                          MXS_MONITORED_SERVER* new_master, json_t** err_out)
 {
-    MXS_MONITORED_SERVER* demotion_target = mon->master;
+    MXS_MONITORED_SERVER* demotion_target = current_master ? current_master : mon->master;
    if (demotion_target == NULL)
    {
        PRINT_MXS_JSON_ERROR(err_out, "Cluster does not have a running master. Run failover instead.");
        return false;
    }
+    // Step 1: Select promotion candidate, save all slaves except promotion target to an array. If we have a
+    // user-defined master candidate, check it. Otherwise, autoselect.
+    MXS_MONITORED_SERVER* promotion_target = NULL;
    ServerVector slaves;
-    // Step 1: Select promotion candidate, save all slaves except promotion target to an array.
-    MXS_MONITORED_SERVER* promotion_target = failover_select_new_master(mon, &slaves, err_out);
+    if (new_master)
+    {
+        if (switchover_check_preferred_master(mon, new_master, &slaves, err_out))
+        {
+            promotion_target = new_master;
+        }
+    }
+    else
+    {
+        promotion_target = failover_select_new_master(mon, &slaves, err_out);
+    }
    if (promotion_target == NULL)
    {
        return false;
    }
+
    bool rval = false;
    MySqlServerInfo* curr_master_info = get_server_info(mon, demotion_target);
    uint32_t repl_domain = get_server_info(mon, promotion_target)->slave_status.gtid_io_pos.domain;