MXS-1845 New algorithm for finding the master server

Not yet used, as more is needed to replace the old code. The algorithm is based on counting the total number of slave nodes a server has, possibly in multiple layers and/or cycles.
2018-06-05 18:17:29 +03:00
parent 8094c67ac2
commit 3f82c25c62
5 changed files with 247 additions and 21 deletions
--- a/server/modules/monitor/mariadbmon/cluster_discovery.cc
+++ b/server/modules/monitor/mariadbmon/cluster_discovery.cc
@ -13,15 +13,20 @@

 #include "mariadbmon.hh"
 #include <inttypes.h>
+#include <string>
 #include <maxscale/modutil.h>
 #include <maxscale/mysql_utils.h>

+using std::string;
+
 static bool check_replicate_ignore_table(MXS_MONITORED_SERVER* database);
 static bool check_replicate_do_table(MXS_MONITORED_SERVER* database);
 static bool check_replicate_wild_do_table(MXS_MONITORED_SERVER* database);
 static bool check_replicate_wild_ignore_table(MXS_MONITORED_SERVER* database);

 static const char HB_TABLE_NAME[] = "maxscale_schema.replication_heartbeat";
+static const char SERVER_DISQUALIFIED[] = "Server '%s' was disqualified from new master selection because "
+                                          "it is %s.";

 /**
 * This function computes the replication tree from a set of monitored servers and returns the root server
@ -317,7 +322,8 @@ void MariaDBMonitor::build_replication_graph()
    // First, reset all node data.
    for (auto iter = m_servers.begin(); iter != m_servers.end(); iter++)
    {
-        (*iter)->m_node.reset();
+        (*iter)->m_node.reset_indexes();
+        (*iter)->m_node.reset_results();
    }

    /* Here, all slave connections are added to the graph, even if the IO thread cannot connect. Strictly
@ -382,8 +388,9 @@ void MariaDBMonitor::find_graph_cycles()
    m_cycles.clear();
    // The next items need to be passed around in the recursive calls to keep track of algorithm state.
    ServerArray stack;
-    int index = 1; // Node visit index.
-    int cycle = 1; // If cycles are found, the nodes in the cycle are given an identical cycle index.
+    int index = NodeData::INDEX_FIRST; /* Node visit index */
+    int cycle = NodeData::CYCLE_FIRST; /* If cycles are found, the nodes in the cycle are given an identical
+                                        * cycle index. */

    for (auto iter = m_servers.begin(); iter != m_servers.end(); iter++)
    {
@ -404,7 +411,7 @@ void MariaDBMonitor::assign_cycle_roles(int cycle)
    {
        MariaDBServer& server = **iter;
        MXS_MONITORED_SERVER* mon_srv = server.m_server_base;
-        if (server.m_node.cycle > 0)
+        if (server.m_node.cycle != NodeData::CYCLE_NONE)
        {
            /** We have at least one cycle in the graph */
            if (server.m_read_only)
@ -791,7 +798,7 @@ void MariaDBMonitor::assign_relay_master(MariaDBServer& candidate)
    if (ptr->server->node_id > 0 && ptr->server->master_id > 0 &&
        getSlaveOfNodeId(ptr->server->node_id, REJECT_DOWN) &&
        getServerByNodeId(ptr->server->master_id) &&
-        (!m_detect_multimaster || candidate.m_node.cycle == 0))
+        (!m_detect_multimaster || candidate.m_node.cycle == NodeData::CYCLE_NONE))
    {
        /** This server is both a slave and a master i.e. a relay master */
        monitor_set_pending_status(ptr, SERVER_RELAY_MASTER);
@ -878,3 +885,192 @@ void MariaDBMonitor::update_server_states(MariaDBServer& db_server, MariaDBServe
        }
    }
 }
+
+/**
+ * Select the candidate that reaches the largest number of servers. Whether a candidate is running or in
+ * 'read_only' mode is not considered by this method.
+ *
+ * @param candidates Servers to compare. The 'reach' of every server in the array is calculated.
+ * @return The candidate with the highest reach. If several candidates share the highest value, the one
+ * earliest in the array is returned.
+ */
+MariaDBServer* MariaDBMonitor::find_best_reach_server(const ServerArray& candidates)
+{
+    ss_dassert(!candidates.empty());
+    MariaDBServer* winner = NULL;
+    for (MariaDBServer* contender : candidates)
+    {
+        // Reach is computed for every candidate, also for the ones that end up losing the comparison.
+        calculate_node_reach(contender);
+        const bool is_first = (winner == NULL);
+        if (is_first || winner->m_node.reach < contender->m_node.reach)
+        {
+            winner = contender;
+        }
+    }
+    return winner;
+}
+
+/**
+ * Build a human-readable list of the reasons a server cannot be selected as master.
+ *
+ * @param disqualified The server to describe
+ * @return The applicable reasons joined with " and ". Empty if none of the checked conditions hold.
+ */
+static string disqualify_reasons_to_string(MariaDBServer* disqualified)
+{
+    string result;
+    // Appends one reason, joining it to any earlier reasons with " and ".
+    auto add_reason = [&result](const char* reason)
+    {
+        if (!result.empty())
+        {
+            result += " and ";
+        }
+        result += reason;
+    };
+
+    if (disqualified->is_in_maintenance())
+    {
+        add_reason("in maintenance");
+    }
+    if (disqualified->is_down())
+    {
+        add_reason("down");
+    }
+    if (disqualified->m_read_only)
+    {
+        add_reason("in read_only mode");
+    }
+    return result;
+}
+
+/**
+ * Find the best master server in the cluster. This method should only be called when the monitor
+ * is starting, a cluster operation (e.g. failover) has occurred or the user has changed something on
+ * the current master making it unsuitable. Because of this, the method can be quite vocal and does not
+ * consider the previous master.
+ *
+ * Candidates are the servers without (internal) masters plus one sample server from each replication
+ * cycle. A candidate must be running and writable. A warning is logged for every parentless server that
+ * is rejected, and for every member of a cycle in which no valid server was found.
+ *
+ * @return The eligible candidate with the best reach, or NULL if there are no eligible candidates
+ */
+MariaDBServer* MariaDBMonitor::find_topology_master_server()
+{
+    /* Finding the best master server may get somewhat tricky if the graph is complicated. The general
+     * criteria for the best master is that it reaches the most slaves (possibly in multiple layers and
+     * cycles). To avoid having to calculate this reachability (doable by a recursive search) to all nodes,
+     * let's use the knowledge that the best master is either a server with no masters (external ones don't
+     * count) or is part of a cycle. The server must be running and writable to be eligible. */
+    ServerArray master_candidates;
+    for (auto iter = m_servers.begin(); iter != m_servers.end(); iter++)
+    {
+        MariaDBServer* server = *iter;
+        if (server->m_node.parents.empty())
+        {
+            // Eligible only when running AND writable, i.e. not in read_only mode.
+            if (server->is_running() && !server->m_read_only)
+            {
+                master_candidates.push_back(server);
+            }
+            else
+            {
+                string reasons = disqualify_reasons_to_string(server);
+                MXS_WARNING(SERVER_DISQUALIFIED, server->name(), reasons.c_str());
+            }
+        }
+    }
+
+    // For each cycle, it's enough to take one sample server, as all members of a cycle have the same reach.
+    for (auto iter = m_cycles.begin(); iter != m_cycles.end(); iter++)
+    {
+        // The iterator already refers to the member array, no second lookup by key is needed.
+        ServerArray& cycle_members = iter->second;
+        MariaDBServer* sample_server = find_master_inside_cycle(cycle_members);
+        if (sample_server)
+        {
+            master_candidates.push_back(sample_server);
+        }
+        else
+        {
+            // No single server in the cycle was viable.
+            const char WARN_MSG[] = "No valid master server could be found  in the cycle with servers '%s'.";
+            string server_names = monitored_servers_to_string(cycle_members);
+            MXS_WARNING(WARN_MSG, server_names.c_str());
+
+            // Explain separately for each member why it could not be used.
+            for (auto iter2 = cycle_members.begin(); iter2 != cycle_members.end(); iter2++)
+            {
+                MariaDBServer* disqualified_server = *iter2;
+                string reasons = disqualify_reasons_to_string(disqualified_server);
+                MXS_WARNING(SERVER_DISQUALIFIED, disqualified_server->name(), reasons.c_str());
+            }
+        }
+    }
+
+    MariaDBServer* found_master = NULL;
+    if (!master_candidates.empty())
+    {
+        found_master = find_best_reach_server(master_candidates);
+    }
+    else
+    {
+        MXS_WARNING("No valid master servers in the cluster.");
+    }
+    return found_master;
+}
+
+/**
+ * Calculate the total number of reachable child nodes for the given node. A node can always reach itself.
+ * The result is saved into the node data. Every reachable server is counted exactly once, also when the
+ * starting node is part of a replication cycle.
+ *
+ * The index fields of all monitored servers are reset and then reused as visit markers.
+ *
+ * @param node The starting node. Its reach must not have been calculated yet.
+ */
+void MariaDBMonitor::calculate_node_reach(MariaDBServer* node)
+{
+    ss_dassert(node && node->m_node.reach == NodeData::REACH_UNKNOWN);
+    // Reset indexes since they will be reused.
+    for (auto iter = m_servers.begin(); iter != m_servers.end(); iter++)
+    {
+        (*iter)->m_node.reset_indexes();
+    }
+
+    /* Mark the starting node as visited before descending. Otherwise a path leading back to it (the node
+     * is in a cycle or replicates from itself) would count the starting node a second time. */
+    node->m_node.index = NodeData::INDEX_FIRST;
+
+    int reach = 1; // The starting node can reach itself.
+    for (auto iter = node->m_node.children.begin(); iter != node->m_node.children.end(); iter++)
+    {
+        MariaDBServer* slave = *iter;
+        if (slave->m_node.index == NodeData::INDEX_NOT_VISITED)
+        { // TODO: Think if is_down() should be checked here. Could cause weird behaviour.
+            reach += calc_reach_visit_node(slave);
+        }
+    }
+
+    node->m_node.reach = reach;
+}
+
+/**
+ * Recursive step of the "reach" calculation. Marks the node as visited and descends into those of its
+ * children which have not been visited yet.
+ *
+ * @param node Node to visit
+ * @return Number of nodes visited by this call: the node itself plus all previously unvisited nodes
+ * found below it.
+ */
+int MariaDBMonitor::calc_reach_visit_node(MariaDBServer* node)
+{
+    // The index value carries no ordering information here, it only flags the node as visited.
+    node->m_node.index = NodeData::INDEX_FIRST;
+
+    int visited_count = 1;
+    for (MariaDBServer* child : node->m_node.children)
+    {
+        if (child->m_node.index != NodeData::INDEX_NOT_VISITED)
+        {
+            continue; // Already counted through another path.
+        }
+        visited_count += calc_reach_visit_node(child);
+    }
+    return visited_count;
+}
+
+/**
+ * Pick the server of a cycle that should act as master. Members are examined in array order and the
+ * first one that is running and not in read_only mode is chosen.
+ *
+ * @param cycle_members The servers forming the cycle
+ * @return The selected server, or NULL if no member qualifies
+ */
+MariaDBServer* MariaDBMonitor::find_master_inside_cycle(ServerArray& cycle_members)
+{
+    /* Every member of a cycle is an equally good choice topology-wise, so only availability and
+     * writability decide. */
+    for (MariaDBServer* member : cycle_members)
+    {
+        ss_dassert(member->m_node.cycle != NodeData::CYCLE_NONE);
+        const bool unusable = !member->is_running() || member->m_read_only;
+        if (unusable)
+        {
+            continue;
+        }
+        return member;
+    }
+    return NULL;
+}
--- a/server/modules/monitor/mariadbmon/mariadbmon.hh
+++ b/server/modules/monitor/mariadbmon/mariadbmon.hh
@ -147,6 +147,9 @@ private:
    uint64_t m_events;                   /**< enabled events */
    bool m_warn_set_standalone_master;   /**< Log a warning when setting standalone master */
    bool m_log_no_master;                /**< Should it be logged that there is no master */
+    bool m_warn_no_valid_in_cycle;       /**< Log a warning when a replication cycle has no valid master */
+    bool m_warn_no_valid_outside_cycle;  /**< Log a warning when a replication topology has no valid master
+                                          *   outside of a cycle. */

    enum slave_down_setting_t
    {
@ -183,9 +186,13 @@ private:
    MXS_MONITORED_SERVER* getServerByNodeId(long);
    MXS_MONITORED_SERVER* getSlaveOfNodeId(long, slave_down_setting_t);
    void build_replication_graph();
-    void tarjan_scc_visit_node(MariaDBServer *node, ServerArray* stack, int *index,
-                               int *cycle);
+    void tarjan_scc_visit_node(MariaDBServer *node, ServerArray* stack, int *index, int *cycle);
    void assign_cycle_roles(int cycle);
+    MariaDBServer* find_topology_master_server();
+    MariaDBServer* find_best_reach_server(const ServerArray& candidates);
+    void calculate_node_reach(MariaDBServer* node);
+    int calc_reach_visit_node(MariaDBServer* node);
+    MariaDBServer* find_master_inside_cycle(ServerArray& cycle_servers);

    // Switchover methods
    bool switchover_check(SERVER* new_master, SERVER* current_master,
--- a/server/modules/monitor/mariadbmon/mariadbserver.cc
+++ b/server/modules/monitor/mariadbmon/mariadbserver.cc
@ -56,21 +56,27 @@ MariaDBServer::MariaDBServer(MXS_MONITORED_SERVER* monitored_server)
 NodeData::NodeData()
    : index(INDEX_NOT_VISITED)
    , lowest_index(INDEX_NOT_VISITED)
-    , cycle(INDEX_NOT_VISITED)
    , in_stack(false)
+    , cycle(CYCLE_NONE)
+    , reach(REACH_UNKNOWN)
 {}

-void NodeData::reset()
+void NodeData::reset_results()
 {
-    index = INDEX_NOT_VISITED;
-    lowest_index = INDEX_NOT_VISITED;
-    cycle = INDEX_NOT_VISITED;
-    in_stack = false;
+    cycle = CYCLE_NONE;
+    reach = REACH_UNKNOWN;
    parents.clear();
    children.clear();
    external_masters.clear();
 }

+void NodeData::reset_indexes()
+{
+    // Clear only the graph search bookkeeping fields: visit indexes and the stack flag.
+    in_stack = false;
+    index = lowest_index = INDEX_NOT_VISITED;
+}
+
 int64_t MariaDBServer::relay_log_events()
 {
    /* The events_ahead-call below ignores domains where current_pos is ahead of io_pos. This situation is
--- a/server/modules/monitor/mariadbmon/mariadbserver.hh
+++ b/server/modules/monitor/mariadbmon/mariadbserver.hh
@ -71,16 +71,28 @@ public:
 };

 /**
- * Data required for checking replication topology cycles. Not all of the listed data is used yet.
+ * Data required for checking replication topology cycles and other graph algorithms. This data is mostly
+ * used by the monitor object, as the data only makes sense in relation to other nodes.
 */
 struct NodeData
 {
+    // Default values for index parameters
    static const int INDEX_NOT_VISITED = 0;
+    static const int INDEX_FIRST = 1;
+    // Default values for the cycle
+    static const int CYCLE_NONE = 0;
+    static const int CYCLE_FIRST = 1;
+    // Default value for reach
+    static const int REACH_UNKNOWN = 0;

+    // Bookkeeping for graph searches. May be overwritten by multiple algorithms.
    int index;           /* Marks the order in which this node was visited. */
    int lowest_index;    /* The lowest index node this node has in its subtree. */
-    int cycle;           /* Which cycle is this node part of, if any. */
    bool in_stack;       /* Is this node currently is the search stack. */
+
+    // Results from algorithm runs. Should only be overwritten when server data has been queried.
+    int cycle;           /* Which cycle is this node part of, if any. */
+    int reach;           /* How many servers replicate from this server or its children. */
    ServerArray parents; /* Which nodes is this node replicating from. External masters excluded. */
    ServerArray children;/* Which nodes are replicating from this node. */
    std::vector<int64_t> external_masters; /* Server id:s of external masters. */
@ -88,9 +100,14 @@ struct NodeData
    NodeData();

    /**
-     * Reset the data to default values
+     * Reset result data to default values. Should be run when starting an iteration.
     */
-    void reset();
+    void reset_results();
+
+    /**
+     * Reset index data. Should be run before an algorithm run.
+     */
+    void reset_indexes();
 };

 /**
--- a/server/modules/monitor/mariadbmon/test/test_cycle_find.cc
+++ b/server/modules/monitor/mariadbmon/test/test_cycle_find.cc
@ -211,7 +211,7 @@ int MariaDBMonitor::Test::check_result_cycles(CycleArray expected_cycles)
    std::set<int> used_cycle_ids;
    for (auto iter = 0; iter < MAX_CYCLES; iter++)
    {
-        int cycle_id = 0;
+        int cycle_id = NodeData::CYCLE_NONE;
        CycleMembers cycle_member_ids = expected_cycles.cycles[iter];
        for (auto iter2 = 0; iter2 < MAX_CYCLE_SIZE; iter2++)
        {
@ -221,13 +221,13 @@ int MariaDBMonitor::Test::check_result_cycles(CycleArray expected_cycles)
                break;
            }
            auto cycle_server = m_monitor->get_server(search_id);
-            if (cycle_server->m_node.cycle == 0)
+            if (cycle_server->m_node.cycle == NodeData::CYCLE_NONE)
            {
                cout << test_name << cycle_server->name() << " is not in a cycle when it should.\n";
                errors++;
            }
            // If this is the first element, check what the cycle id should be for all members of the cycle.
-            else if (cycle_id == 0)
+            else if (cycle_id == NodeData::CYCLE_NONE)
            {
                cycle_id = cycle_server->m_node.cycle;
                if (used_cycle_ids.count(cycle_id) > 0)
@ -255,7 +255,7 @@ int MariaDBMonitor::Test::check_result_cycles(CycleArray expected_cycles)
    for (auto iter = no_cycle_servers.begin(); iter != no_cycle_servers.end(); iter++)
    {
        MariaDBServer* server = (*iter).second;
-        if (server->m_node.cycle != 0)
+        if (server->m_node.cycle != NodeData::CYCLE_NONE)
        {
            cout << server->name() << " is in cycle " << server->m_node.cycle << " when none was expected.\n";
            errors++;