Clean up mariadbmon.hh

2018-10-05 13:18:36 +03:00
parent 68d65682b5
commit 5cc4eb08ee
2 changed files with 174 additions and 170 deletions
--- a/server/modules/monitor/mariadbmon/mariadbmon.cc
+++ b/server/modules/monitor/mariadbmon/mariadbmon.cc
@ -59,17 +59,6 @@ static const char DIAG_ERROR[] = "Internal error, could not print diagnostics. "
 /**
  * Construct the monitor instance.
  *
  * Hands the generic monitor object to the maxscale::MonitorInstance base class and gives the
  * bookkeeping members their starting values: the master gtid domain and the external master
  * port start as unknown, the cluster topology is flagged as changed, no cluster modification
  * is recorded, and every log/warning flag listed below starts enabled.
  *
  * @param monitor Generic monitor data, forwarded to the base class
  */
 MariaDBMonitor::MariaDBMonitor(MXS_MONITOR* monitor)
    : maxscale::MonitorInstance(monitor)
    , m_master_gtid_domain(GTID_DOMAIN_UNKNOWN)
    , m_external_master_port(PORT_UNKNOWN)
    , m_cluster_topology_changed(true)
    , m_cluster_modified(false)
    , m_log_no_master(true)
    , m_warn_failover_precond(true)
    , m_warn_switchover_precond(true)
    , m_warn_cannot_rejoin(true)
    , m_warn_current_master_invalid(true)
    , m_warn_have_better_master(true)
    , m_warn_master_down(true)
 {
 }
--- a/server/modules/monitor/mariadbmon/mariadbmon.hh
+++ b/server/modules/monitor/mariadbmon/mariadbmon.hh
@ -12,45 +12,49 @@
 */
 #pragma once
 #include "mariadbmon_common.hh"
 #include <condition_variable>
 #include <functional>
 #include <string>
 #include <unordered_map>
 #include <vector>
-#include <maxscale/monitor.hh>
+
 #include <maxbase/stopwatch.hh>
 #include <maxscale/monitor.hh>
 #include "mariadbserver.hh"
 // Used by multiple source files.
 extern const char* const CN_AUTO_FAILOVER;
 extern const char* const CN_SWITCHOVER_ON_LOW_DISK_SPACE;
 extern const char* const CN_PROMOTION_SQL_FILE;
 extern const char* const CN_DEMOTION_SQL_FILE;
-// Map of base struct to MariaDBServer. Does not own the server objects. May not be needed at the end.
+// Map of base struct to MariaDBServer. Does not own the server objects.
 typedef std::unordered_map<MXS_MONITORED_SERVER*, MariaDBServer*> ServerInfoMap;
 // Map of server id:s to MariaDBServer. Useful when constructing the replication graph.
 typedef std::unordered_map<int64_t, MariaDBServer*> IdToServerMap;
-// Map of cycle number to cycle members. The elements should be in order for predictability when iterating.
+// Map of cycle number to cycle members. The elements should be ordered for predictability when iterating.
 typedef std::map<int, ServerArray> CycleMap;
 // Some methods need a log on/off setting. A value of this type is taken as the 'log_mode'
 // parameter by methods such as switchover_prepare() and failover_prepare().
 enum class Log
 {
    OFF,
    ON
 };
 // MariaDB Monitor instance data
 class MariaDBMonitor : public maxscale::MonitorInstance
 {
 private:
-    MariaDBMonitor(const MariaDBMonitor&);
+    MariaDBMonitor(const MariaDBMonitor&) = delete;
-    MariaDBMonitor& operator=(const MariaDBMonitor&);
+    MariaDBMonitor& operator=(const MariaDBMonitor&) = delete;
 public:
-    // Helper class used for testing
+    // Helper class used for testing.
    class Test;
    friend class Test;
    /**
     * Create the monitor instance and return the instance data.
     *
     * @param monitor Generic monitor data
     * @return MariaDBMonitor instance
     */
    static MariaDBMonitor* create(MXS_MONITOR* monitor);
    ~MariaDBMonitor();
    /**
@ -67,14 +71,6 @@ public:
     */
    json_t* diagnostics_json() const;
    /**
     * Create the monitor instance and return the instance data.
     *
     * @param monitor General monitor data
     * @return A pointer to MariaDBMonitor specific data.
     */
    static MariaDBMonitor* create(MXS_MONITOR* monitor);
    /**
     * Perform user-activated switchover.
     *
@ -102,15 +98,29 @@ public:
     */
    bool run_manual_rejoin(SERVER* rejoin_server, json_t** error_out);
    /**
     * Perform user-activated reset-replication
     *
     * @param master_server The server to promote. If NULL, monitor will select the current master.
     * @param error_out Error output
     * @return True if operation completed successfully
     */
    bool run_manual_reset_replication(SERVER* master_server, json_t** error_out);
 protected:
    void pre_loop();
    void tick();
    void process_state_changes();
 private:
    // Some methods need a log on/off setting.
    enum class Log
    {
        OFF,
        ON
    };
    // Information about a multimaster group (replication cycle)
    struct CycleInfo
    {
        int         cycle_id = NodeData::CYCLE_NONE;
@ -123,176 +133,181 @@ private:
    struct ManualCommand
    {
    public:
-        std::mutex              mutex;                          /**< Mutex used by the condition variables */
+        std::mutex                mutex;        /* Mutex used by the condition variables */
-        std::condition_variable has_command;                    /**< Notified when a command is waiting
+        std::condition_variable   has_command;  /* Notified when a command is waiting execution */
-                                                                 * execution */
+        std::condition_variable   has_result;   /* Notified when the command has run */
-        bool                      command_waiting_exec = false; /**< Guard variable for the above */
+        std::function<void(void)> method;       /* The method to run when executing the command */
-        std::function<void(void)> method;                       /**< The method to run when executing the
+
-                                                                 * command */
+        bool command_waiting_exec = false;  /* Guard variable for has_command */
-        std::condition_variable has_result;                     /**< Notified when the command has ran */
+        bool result_waiting = false;        /* Guard variable for has_result */
        bool                    result_waiting = false;         /**< Guard variable for the above */
    };
-    ServerArray   m_servers;            /**< Servers of the monitor */
+    ManualCommand m_manual_cmd;     /* Communicates manual commands and results */
    ServerInfoMap m_server_info;        /**< Map from server base struct to MariaDBServer */
    ManualCommand m_manual_cmd;         /**< Communicates manual commands and results */
-    // Values updated by monitor
+    // Server containers, mostly constant.
-    MariaDBServer* m_master;                    /**< Master server for Master/Slave replication */
+    ServerArray   m_servers;        /* Servers of the monitor */
-    MariaDBServer* m_next_master;               /**< When master changes because of a failover/switchover, the
+    ServerInfoMap m_server_info;    /* Map from server base struct to MariaDBServer */
-                                                 * new
+    IdToServerMap m_servers_by_id;  /* Map from server id:s to MariaDBServer */
                                                 *  master is written here so the next monitor loop picks it
                                                 * up. */
    IdToServerMap m_servers_by_id;              /**< Map from server id:s to MariaDBServer */
    int64_t       m_master_gtid_domain;         /**< gtid_domain_id most recently seen on the master  */
    std::string   m_external_master_host;       /**< External master host, for fail/switchover */
    int           m_external_master_port;       /**< External master port */
    bool          m_cluster_topology_changed;   /**< Has cluster topology changed since last monitor loop? */
    bool          m_cluster_modified;           /**< Has a failover/switchover/rejoin been performed this
                                                 * loop? */
    CycleMap  m_cycles;                         /**< Map from cycle number to cycle member servers */
    CycleInfo m_master_cycle_status;            /**< Info about master server cycle from previous round */
-    // Replication topology detection settings
+    // Topology related fields
-    bool m_detect_stale_master;         /**< Monitor flag for MySQL replication Stale Master detection */
+    MariaDBServer* m_master = NULL;         /* The most "master-like" server in the cluster. Is the only
-    bool m_detect_stale_slave;          /**< Monitor flag for MySQL replication Stale Slave detection */
+                                             * server which can get the Master status. */
-    bool m_detect_standalone_master;    /**< If standalone master are detected */
+    MariaDBServer* m_next_master = NULL;    /* When a cluster operation changes the master, the new master is
-    bool m_ignore_external_masters;     /**< Ignore masters outside of the monitor configuration */
+                                             * written here so the next monitor tick picks it up. */
    bool m_cluster_topology_changed = true; /* Has cluster topology changed since last monitor loop?
                                             * Causes a topology rebuild on the current tick. */
    bool m_cluster_modified = false;        /* Has a cluster operation been performed this loop? Prevents
                                             * other operations during this tick. */
    CycleMap  m_cycles;                     /* Map from cycle number to cycle member servers */
    CycleInfo m_master_cycle_status;        /* Info about master server cycle from previous round */
-    // Failover, switchover and rejoin settings
+    // Miscellaneous info
-    bool m_auto_failover;                   /**< Is automatic master failover is enabled? */
+    int64_t m_master_gtid_domain = GTID_DOMAIN_UNKNOWN;     /* gtid_domain_id most recently seen on
-    bool m_auto_rejoin;                     /**< Is automatic rejoin enabled? */
+                                                             * the master */
-    int  m_failcount;                       /**< Numer of cycles master must be down before auto-failover
+    std::string m_external_master_host;                     /* External master host, for fail/switchover */
-                                             * begins */
+    int         m_external_master_port = PORT_UNKNOWN;      /* External master port */
    std::string m_replication_user;         /**< Replication user for CHANGE MASTER TO-commands */
    std::string m_replication_password;     /**< Replication password for CHANGE MASTER TO-commands */
    uint32_t    m_failover_timeout;         /**< Time limit in seconds for master failover */
    uint32_t    m_switchover_timeout;       /**< Time limit in seconds for master switchover */
    bool        m_verify_master_failure;    /**< Is master failure is verified via slaves? */
    int         m_master_failure_timeout;   /**< Master failure verification (via slaves) time in seconds
                                             * */
    ServerArray m_excluded_servers;         /**< Servers banned for master promotion during auto-failover
                                             * or
                                             *   autoselect switchover. */
    std::string m_promote_sql_file;         /**< File with sql commands which are ran to a server being
                                             * promoted. */
    std::string m_demote_sql_file;          /**< File with sql commands which are ran to a server being
                                             * demoted. */
    bool m_enforce_read_only_slaves;        /**< Should the monitor set read-only=1 on any slave servers.
                                             * */
    bool m_switchover_on_low_disk_space;    /**< Should the monitor do a switchover on low disk space. */
    bool m_maintenance_on_low_disk_space;   /**< Set slave and unreplicating servers with low disk space
                                             * to
                                             *   maintenance. */
    bool m_handle_event_scheduler;          /**< Should failover/switchover handle any scheduled events on
                                             *  the servers */
-    // Other settings
+    /* The default setting values given here may not be the actual defaults given by
-    bool m_log_no_master;               /**< Should it be logged that there is no master */
+     * the module configuration. */
-    bool m_warn_no_valid_in_cycle;      /**< Log a warning when a replication cycle has no valid master */
+
-    bool m_warn_no_valid_outside_cycle; /**< Log a warning when a replication topology has no valid master
+    // Replication topology detection settings.
-                                         *   outside of a cycle. */
+    bool m_detect_stale_master = true;      /* Allow stale masters. TODO: Remove this */
-    bool m_warn_failover_precond;       /**< Print failover preconditions error message? */
+    bool m_detect_stale_slave = true;       /* Allow stale slaves: a running slave behind a downed
-    bool m_warn_switchover_precond;     /**< Print switchover preconditions error message? */
+                                             * master/relay is still a valid slave */
-    bool m_warn_cannot_rejoin;          /**< Print warning if auto_rejoin fails because of invalid gtid:s? */
+    bool m_detect_standalone_master = true; /* Allow writes to a master without any slaves.
-    bool m_warn_current_master_invalid; /**< Print warning if current master is not valid? */
+                                             * TODO: think about removing */
-    bool m_warn_have_better_master;     /**< Print warning if the current master is not the best one? */
+    bool m_ignore_external_masters = false; /* Ignore masters outside of the monitor configuration.
-    bool m_warn_master_down;            /**< Print warning that failover may happen soon? */
+                                             * TODO: requires work */
    int m_failcount = 1;                    /* Number of ticks master must be down before it's considered
                                             * totally down, allowing failover or master change. */
    // Cluster operations activation settings
    bool m_auto_failover = false;                   /* Automatic master failover enabled? */
    bool m_auto_rejoin = false;                     /* Automatic rejoin enabled? */
    bool m_switchover_on_low_disk_space = false;    /* Automatically switch over a master low on disk space */
    bool m_maintenance_on_low_disk_space = false;   /* Automatically set slave and unreplicating servers low
                                                     * on disk space to maintenance. */
    bool m_enforce_read_only_slaves = false;        /* If true, the monitor checks and enforces every tick
                                                     * that all slaves are in read-only-mode. */
    // Cluster operations additional settings
    std::string m_replication_user;             /* Replication user for CHANGE MASTER TO-commands */
    std::string m_replication_password;         /* Replication password for CHANGE MASTER TO-commands */
    bool        m_handle_event_scheduler = true;/* Should failover/switchover enable/disable any scheduled
                                                 * events on the servers during promote/demote? */
    uint32_t    m_failover_timeout = 10;        /* Time limit in seconds for failover */
    uint32_t    m_switchover_timeout = 10;      /* Time limit in seconds for switchover */
    bool        m_verify_master_failure = true; /* Is master failure verified via slaves? */
    int         m_master_failure_timeout = 10;  /* Master failure verification (via slaves) time in seconds */
    ServerArray m_excluded_servers;             /* Servers which cannot be autoselected when deciding which
                                                 * slave to promote during failover or switchover. */
    std::string m_promote_sql_file;             /* File with sql commands which are run on a server being
                                                 * promoted. */
    std::string m_demote_sql_file;              /* File with sql commands which are run on a server being
                                                 * demoted. */
    // Fields controlling logging of various events. TODO: Check these
    bool m_log_no_master = true;                /* Should it be logged that there is no master? */
    bool m_warn_current_master_invalid = true;  /* Print warning if current master is not valid? */
    bool m_warn_have_better_master = true;      /* Print warning if the current master is not the best one? */
    bool m_warn_master_down = true;             /* Print warning that failover may happen soon? */
    bool m_warn_failover_precond = true;        /* Print failover preconditions error message? */
    bool m_warn_switchover_precond = true;      /* Print switchover preconditions error message? */
    bool m_warn_cannot_rejoin = true;           /* Print warning if auto_rejoin fails because of invalid
                                                 * gtid:s? */
    // Base methods
    MariaDBMonitor(MXS_MONITOR* monitor_base);
-    void           reset_server_info();
+    bool configure(const MXS_CONFIG_PARAMETER* params);
-    void           clear_server_info();
+    bool set_replication_credentials(const MXS_CONFIG_PARAMETER* params);
-    void           reset_node_index_info();
+    void reset_server_info();
-    bool           configure(const MXS_CONFIG_PARAMETER* params);
+    void clear_server_info();
-    bool           set_replication_credentials(const MXS_CONFIG_PARAMETER* params);
+    void reset_node_index_info();
    bool execute_manual_command(std::function<void ()> command, json_t** error_out);
    std::string diagnostics_to_string() const;
    json_t*     to_json() const;
    MariaDBServer* get_server_info(MXS_MONITORED_SERVER* db);
    MariaDBServer* get_server(int64_t id);
    MariaDBServer* get_server(SERVER* server);
    bool           execute_manual_command(std::function<void ()> command, json_t** error_out);
    std::string    diagnostics_to_string() const;
    json_t*        to_json() const;
-    // Cluster discovery and status assignment methods
+    // Cluster discovery and status assignment methods, top levels
-    void           update_server(MariaDBServer* server);
+    void update_server(MariaDBServer* server);
-    void           find_graph_cycles();
+    void update_topology();
-    void           update_topology();
+    void build_replication_graph();
-    void           log_master_changes();
+    void assign_new_master(MariaDBServer* new_master);
-    void           update_gtid_domain();
+    void find_graph_cycles();
-    void           update_external_master();
+    bool master_is_valid(std::string* reason_out);
-    void           build_replication_graph();
+    void assign_server_roles();
-    void           tarjan_scc_visit_node(MariaDBServer* node, ServerArray* stack, int* index, int* cycle);
+    void assign_slave_and_relay_master(MariaDBServer* start_node);
    void check_cluster_operations_support();
    MariaDBServer* find_topology_master_server(std::string* msg_out);
    MariaDBServer* find_best_reach_server(const ServerArray& candidates);
    void           calculate_node_reach(MariaDBServer* search_root);
    int            running_slaves(MariaDBServer* search_root);
    MariaDBServer* find_master_inside_cycle(ServerArray& cycle_servers);
-    void           assign_server_roles();
+    MariaDBServer* find_best_reach_server(const ServerArray& candidates);
    void           assign_slave_and_relay_master(MariaDBServer* start_node);
    bool           master_is_valid(std::string* reason_out);
    bool           cycle_has_master_server(ServerArray& cycle_servers);
    void           update_master_cycle_info();
    void           set_low_disk_slaves_maintenance();
    void           assign_new_master(MariaDBServer* new_master);
    void           check_cluster_operations_support();
-    // Switchover methods
+    // Cluster discovery and status assignment methods, low level
    void tarjan_scc_visit_node(MariaDBServer* node, ServerArray* stack, int* index, int* cycle);
    void calculate_node_reach(MariaDBServer* search_root);
    int  running_slaves(MariaDBServer* search_root);
    bool cycle_has_master_server(ServerArray& cycle_servers);
    void update_gtid_domain();
    void update_external_master();
    void update_master_cycle_info();
    // Cluster operation launchers
    bool manual_switchover(SERVER* new_master, SERVER* current_master, json_t** error_out);
    bool manual_failover(json_t** output);
    bool manual_rejoin(SERVER* rejoin_server, json_t** output);
    void handle_low_disk_space_master();
    void handle_auto_failover();
    void handle_auto_rejoin();
    const MariaDBServer* slave_receiving_events(const MariaDBServer* demotion_target,
                                                maxbase::Duration*   event_age_out,
                                                maxbase::Duration*   delay_out);
    std::unique_ptr<ClusterOperation> switchover_prepare(SERVER* new_master, SERVER* current_master,
                                                         Log log_mode, json_t** error_out);
    bool switchover_perform(ClusterOperation& operation);
    bool switchover_start_slave(MariaDBServer* old_master, MariaDBServer* new_master);
    bool manual_switchover(SERVER* new_master, SERVER* current_master, json_t** error_out);
    void handle_low_disk_space_master();
    // Failover methods
    std::unique_ptr<ClusterOperation> failover_prepare(Log log_mode, json_t** error_out);
    bool                              failover_perform(ClusterOperation& operation);
    const MariaDBServer*              slave_receiving_events(const MariaDBServer* demotion_target,
                                                             maxbase::Duration* event_age_out,
                                                             maxbase::Duration* delay_out);
    bool manual_failover(json_t** output);
    void handle_auto_failover();
-    // Rejoin methods
+    bool switchover_perform(ClusterOperation& operation);
-    bool     manual_rejoin(SERVER* rejoin_server, json_t** output);
+    bool failover_perform(ClusterOperation& operation);
    bool     cluster_can_be_joined();
    void     handle_auto_rejoin();
    bool     get_joinable_servers(ServerArray* output);
    bool     server_is_rejoin_suspect(MariaDBServer* rejoin_cand, json_t** output);
    uint32_t do_rejoin(const ServerArray& joinable_servers, json_t** output);
-    // Methods common to failover/switchover/rejoin
+    // Methods used by failover/switchover/rejoin
-    MariaDBServer* select_promotion_target(MariaDBServer* current_master,
+    MariaDBServer* select_promotion_target(MariaDBServer* current_master, OperationType op,
-                                           OperationType  op,
+                                           Log log_mode, json_t** error_out);
-                                           Log log_mode,
+    bool is_candidate_better(const MariaDBServer* candidate, const MariaDBServer* current_best,
-                                           json_t** error_out);
+                             const MariaDBServer* demotion_target, uint32_t gtid_domain,
    bool server_is_excluded(const MariaDBServer* server);
    bool is_candidate_better(const MariaDBServer* candidate,
                             const MariaDBServer* current_best,
                             const MariaDBServer* demotion_target,
                             uint32_t gtid_domain,
                             std::string* reason_out = NULL);
-    int redirect_slaves(MariaDBServer* new_master,
+    bool server_is_excluded(const MariaDBServer* server);
-                        const ServerArray& slaves,
+    bool check_gtid_replication(Log log_mode, const MariaDBServer* demotion_target,
                                json_t** error_out);
    ServerArray get_redirectables(const MariaDBServer* promotion_target,
                                  const MariaDBServer* demotion_target);
    int redirect_slaves(MariaDBServer* new_master, const ServerArray& slaves,
                        ServerArray* redirected_slaves);
    int redirect_slaves_ex(ClusterOperation& op, const ServerArray& slaves,
                           ServerArray* redirected_slaves);
-    std::string generate_change_master_cmd(const std::string& master_host, int master_port);
+    bool        switchover_start_slave(MariaDBServer* old_master, MariaDBServer* new_master);
    bool        start_external_replication(MariaDBServer* new_master, json_t** err_out);
    std::string generate_change_master_cmd(const std::string& master_host, int master_port);
    void        wait_cluster_stabilization(ClusterOperation& op, const ServerArray& slaves);
-    void        report_and_disable(const std::string& operation,
+    void        report_and_disable(const std::string& operation, const std::string& setting_name,
                                   const std::string& setting_name,
                                   bool* setting_var);
-    bool check_gtid_replication(Log log_mode,
+
-                                const MariaDBServer* demotion_target,
+    // Rejoin methods
-                                json_t** error_out);
+    bool     cluster_can_be_joined();
-    ServerArray get_redirectables(const MariaDBServer* promotion_target,
+    bool     get_joinable_servers(ServerArray* output);
-                                  const MariaDBServer* demotion_target);
+    bool     server_is_rejoin_suspect(MariaDBServer* rejoin_cand, json_t** output);
    uint32_t do_rejoin(const ServerArray& joinable_servers, json_t** output);
    // Other methods
    void disable_setting(const std::string& setting);
    bool check_sql_files();
    void enforce_read_only_on_slaves();
    void log_master_changes();
    void set_low_disk_slaves_maintenance();
    bool manual_reset_replication(SERVER* master_server, json_t** error_out);
 };