Clean up mariadbmon.hh

This commit is contained in:
Esa Korhonen
2018-10-05 13:18:36 +03:00
parent 68d65682b5
commit 5cc4eb08ee
2 changed files with 174 additions and 170 deletions

View File

@ -59,17 +59,6 @@ static const char DIAG_ERROR[] = "Internal error, could not print diagnostics. "
MariaDBMonitor::MariaDBMonitor(MXS_MONITOR* monitor) MariaDBMonitor::MariaDBMonitor(MXS_MONITOR* monitor)
: maxscale::MonitorInstance(monitor) : maxscale::MonitorInstance(monitor)
, m_master_gtid_domain(GTID_DOMAIN_UNKNOWN)
, m_external_master_port(PORT_UNKNOWN)
, m_cluster_topology_changed(true)
, m_cluster_modified(false)
, m_log_no_master(true)
, m_warn_failover_precond(true)
, m_warn_switchover_precond(true)
, m_warn_cannot_rejoin(true)
, m_warn_current_master_invalid(true)
, m_warn_have_better_master(true)
, m_warn_master_down(true)
{ {
} }

View File

@ -12,45 +12,49 @@
*/ */
#pragma once #pragma once
#include "mariadbmon_common.hh" #include "mariadbmon_common.hh"
#include <condition_variable> #include <condition_variable>
#include <functional> #include <functional>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
#include <maxscale/monitor.hh>
#include <maxbase/stopwatch.hh> #include <maxbase/stopwatch.hh>
#include <maxscale/monitor.hh>
#include "mariadbserver.hh" #include "mariadbserver.hh"
// Used by multiple source files.
extern const char* const CN_AUTO_FAILOVER; extern const char* const CN_AUTO_FAILOVER;
extern const char* const CN_SWITCHOVER_ON_LOW_DISK_SPACE; extern const char* const CN_SWITCHOVER_ON_LOW_DISK_SPACE;
extern const char* const CN_PROMOTION_SQL_FILE; extern const char* const CN_PROMOTION_SQL_FILE;
extern const char* const CN_DEMOTION_SQL_FILE; extern const char* const CN_DEMOTION_SQL_FILE;
// Map of base struct to MariaDBServer. Does not own the server objects. May not be needed at the end. // Map of base struct to MariaDBServer. Does not own the server objects.
typedef std::unordered_map<MXS_MONITORED_SERVER*, MariaDBServer*> ServerInfoMap; typedef std::unordered_map<MXS_MONITORED_SERVER*, MariaDBServer*> ServerInfoMap;
// Map of server id:s to MariaDBServer. Useful when constructing the replication graph. // Map of server id:s to MariaDBServer. Useful when constructing the replication graph.
typedef std::unordered_map<int64_t, MariaDBServer*> IdToServerMap; typedef std::unordered_map<int64_t, MariaDBServer*> IdToServerMap;
// Map of cycle number to cycle members. The elements should be in order for predictability when iterating. // Map of cycle number to cycle members. The elements should be ordered for predictability when iterating.
typedef std::map<int, ServerArray> CycleMap; typedef std::map<int, ServerArray> CycleMap;
// Some methods need a log on/off setting.
enum class Log
{
OFF,
ON
};
// MariaDB Monitor instance data // MariaDB Monitor instance data
class MariaDBMonitor : public maxscale::MonitorInstance class MariaDBMonitor : public maxscale::MonitorInstance
{ {
private: private:
MariaDBMonitor(const MariaDBMonitor&); MariaDBMonitor(const MariaDBMonitor&) = delete;
MariaDBMonitor& operator=(const MariaDBMonitor&); MariaDBMonitor& operator=(const MariaDBMonitor&) = delete;
public: public:
// Helper class used for testing // Helper class used for testing.
class Test; class Test;
friend class Test; friend class Test;
/**
* Create the monitor instance and return the instance data.
*
* @param monitor Generic monitor data
* @return MariaDBMonitor instance
*/
static MariaDBMonitor* create(MXS_MONITOR* monitor);
~MariaDBMonitor(); ~MariaDBMonitor();
/** /**
@ -67,14 +71,6 @@ public:
*/ */
json_t* diagnostics_json() const; json_t* diagnostics_json() const;
/**
* Create the monitor instance and return the instance data.
*
* @param monitor General monitor data
* @return A pointer to MariaDBMonitor specific data.
*/
static MariaDBMonitor* create(MXS_MONITOR* monitor);
/** /**
* Perform user-activated switchover. * Perform user-activated switchover.
* *
@ -102,15 +98,29 @@ public:
*/ */
bool run_manual_rejoin(SERVER* rejoin_server, json_t** error_out); bool run_manual_rejoin(SERVER* rejoin_server, json_t** error_out);
/**
* Perform user-activated reset-replication
*
* @param master_server The server to promote. If NULL, monitor will select the current master.
* @param error_out Error output
* @return True if operation completed successfully
*/
bool run_manual_reset_replication(SERVER* master_server, json_t** error_out); bool run_manual_reset_replication(SERVER* master_server, json_t** error_out);
protected: protected:
void pre_loop(); void pre_loop();
void tick(); void tick();
void process_state_changes(); void process_state_changes();
private: private:
// Some methods need a log on/off setting.
enum class Log
{
OFF,
ON
};
// Information about a multimaster group (replication cycle)
struct CycleInfo struct CycleInfo
{ {
int cycle_id = NodeData::CYCLE_NONE; int cycle_id = NodeData::CYCLE_NONE;
@ -123,176 +133,181 @@ private:
struct ManualCommand struct ManualCommand
{ {
public: public:
std::mutex mutex; /**< Mutex used by the condition variables */ std::mutex mutex; /* Mutex used by the condition variables */
std::condition_variable has_command; /**< Notified when a command is waiting std::condition_variable has_command; /* Notified when a command is waiting execution */
* execution */ std::condition_variable has_result; /* Notified when the command has run */
bool command_waiting_exec = false; /**< Guard variable for the above */ std::function<void(void)> method; /* The method to run when executing the command */
std::function<void(void)> method; /**< The method to run when executing the
* command */ bool command_waiting_exec = false; /* Guard variable for has_command */
std::condition_variable has_result; /**< Notified when the command has ran */ bool result_waiting = false; /* Guard variable for has_result */
bool result_waiting = false; /**< Guard variable for the above */
}; };
ServerArray m_servers; /**< Servers of the monitor */ ManualCommand m_manual_cmd; /* Communicates manual commands and results */
ServerInfoMap m_server_info; /**< Map from server base struct to MariaDBServer */
ManualCommand m_manual_cmd; /**< Communicates manual commands and results */
// Values updated by monitor // Server containers, mostly constant.
MariaDBServer* m_master; /**< Master server for Master/Slave replication */ ServerArray m_servers; /* Servers of the monitor */
MariaDBServer* m_next_master; /**< When master changes because of a failover/switchover, the ServerInfoMap m_server_info; /* Map from server base struct to MariaDBServer */
* new IdToServerMap m_servers_by_id; /* Map from server id:s to MariaDBServer */
* master is written here so the next monitor loop picks it
* up. */
IdToServerMap m_servers_by_id; /**< Map from server id:s to MariaDBServer */
int64_t m_master_gtid_domain; /**< gtid_domain_id most recently seen on the master */
std::string m_external_master_host; /**< External master host, for fail/switchover */
int m_external_master_port; /**< External master port */
bool m_cluster_topology_changed; /**< Has cluster topology changed since last monitor loop? */
bool m_cluster_modified; /**< Has a failover/switchover/rejoin been performed this
* loop? */
CycleMap m_cycles; /**< Map from cycle number to cycle member servers */
CycleInfo m_master_cycle_status; /**< Info about master server cycle from previous round */
// Replication topology detection settings // Topology related fields
bool m_detect_stale_master; /**< Monitor flag for MySQL replication Stale Master detection */ MariaDBServer* m_master = NULL; /* The most "master-like" server in the cluster. Is the only
bool m_detect_stale_slave; /**< Monitor flag for MySQL replication Stale Slave detection */ * server which can get the Master status. */
bool m_detect_standalone_master; /**< If standalone master are detected */ MariaDBServer* m_next_master = NULL; /* When a cluster operation changes the master, the new master is
bool m_ignore_external_masters; /**< Ignore masters outside of the monitor configuration */ * written here so the next monitor tick picks it up. */
bool m_cluster_topology_changed = true; /* Has cluster topology changed since last monitor loop?
* Causes a topology rebuild on the current tick. */
bool m_cluster_modified = false; /* Has a cluster operation been performed this loop? Prevents
* other operations during this tick. */
CycleMap m_cycles; /* Map from cycle number to cycle member servers */
CycleInfo m_master_cycle_status; /* Info about master server cycle from previous round */
// Failover, switchover and rejoin settings // Miscellaneous info
bool m_auto_failover; /**< Is automatic master failover is enabled? */ int64_t m_master_gtid_domain = GTID_DOMAIN_UNKNOWN; /* gtid_domain_id most recently seen on
bool m_auto_rejoin; /**< Is automatic rejoin enabled? */ * the master */
int m_failcount; /**< Numer of cycles master must be down before auto-failover std::string m_external_master_host; /* External master host, for fail/switchover */
* begins */ int m_external_master_port = PORT_UNKNOWN; /* External master port */
std::string m_replication_user; /**< Replication user for CHANGE MASTER TO-commands */
std::string m_replication_password; /**< Replication password for CHANGE MASTER TO-commands */
uint32_t m_failover_timeout; /**< Time limit in seconds for master failover */
uint32_t m_switchover_timeout; /**< Time limit in seconds for master switchover */
bool m_verify_master_failure; /**< Is master failure is verified via slaves? */
int m_master_failure_timeout; /**< Master failure verification (via slaves) time in seconds
* */
ServerArray m_excluded_servers; /**< Servers banned for master promotion during auto-failover
* or
* autoselect switchover. */
std::string m_promote_sql_file; /**< File with sql commands which are ran to a server being
* promoted. */
std::string m_demote_sql_file; /**< File with sql commands which are ran to a server being
* demoted. */
bool m_enforce_read_only_slaves; /**< Should the monitor set read-only=1 on any slave servers.
* */
bool m_switchover_on_low_disk_space; /**< Should the monitor do a switchover on low disk space. */
bool m_maintenance_on_low_disk_space; /**< Set slave and unreplicating servers with low disk space
* to
* maintenance. */
bool m_handle_event_scheduler; /**< Should failover/switchover handle any scheduled events on
* the servers */
// Other settings /* The default setting values given here may not be the actual defaults given by
bool m_log_no_master; /**< Should it be logged that there is no master */ * the module configuration. */
bool m_warn_no_valid_in_cycle; /**< Log a warning when a replication cycle has no valid master */
bool m_warn_no_valid_outside_cycle; /**< Log a warning when a replication topology has no valid master // Replication topology detection settings.
* outside of a cycle. */ bool m_detect_stale_master = true; /* Allow stale masters. TODO: Remove this */
bool m_warn_failover_precond; /**< Print failover preconditions error message? */ bool m_detect_stale_slave = true; /* Allow stale slaves: a running slave behind a downed
bool m_warn_switchover_precond; /**< Print switchover preconditions error message? */ * master/relay is still a valid slave */
bool m_warn_cannot_rejoin; /**< Print warning if auto_rejoin fails because of invalid gtid:s? */ bool m_detect_standalone_master = true; /* Allow writes to a master without any slaves.
bool m_warn_current_master_invalid; /**< Print warning if current master is not valid? */ * TODO: think about removing */
bool m_warn_have_better_master; /**< Print warning if the current master is not the best one? */ bool m_ignore_external_masters = false; /* Ignore masters outside of the monitor configuration.
bool m_warn_master_down; /**< Print warning that failover may happen soon? */ * TODO: requires work */
int m_failcount = 1; /* Number of ticks master must be down before it's considered
* totally down, allowing failover or master change. */
// Cluster operations activation settings
bool m_auto_failover = false; /* Automatic master failover enabled? */
bool m_auto_rejoin = false; /* Automatic rejoin enabled? */
bool m_switchover_on_low_disk_space = false; /* Automatically switch over a master low on disk space */
bool m_maintenance_on_low_disk_space = false; /* Automatically set slave and unreplicating servers low
* on disk space to maintenance. */
bool m_enforce_read_only_slaves = false; /* If true, the monitor checks and enforces every tick
* that all slaves are in read-only-mode. */
// Cluster operations additional settings
std::string m_replication_user; /* Replication user for CHANGE MASTER TO-commands */
std::string m_replication_password; /* Replication password for CHANGE MASTER TO-commands */
bool m_handle_event_scheduler = true;/* Should failover/switchover enable/disable any scheduled
* events on the servers during promote/demote? */
uint32_t m_failover_timeout = 10; /* Time limit in seconds for failover */
uint32_t m_switchover_timeout = 10; /* Time limit in seconds for switchover */
bool m_verify_master_failure = true; /* Is master failure verified via slaves? */
int m_master_failure_timeout = 10; /* Master failure verification (via slaves) time in seconds */
ServerArray m_excluded_servers; /* Servers which cannot be autoselected when deciding which
* slave to promote during failover or switchover. */
std::string m_promote_sql_file; /* File with sql commands which are run on a server being
* promoted. */
std::string m_demote_sql_file; /* File with sql commands which are run on a server being
* demoted. */
// Fields controlling logging of various events. TODO: Check these
bool m_log_no_master = true; /* Should it be logged that there is no master? */
bool m_warn_current_master_invalid = true; /* Print warning if current master is not valid? */
bool m_warn_have_better_master = true; /* Print warning if the current master is not the best one? */
bool m_warn_master_down = true; /* Print warning that failover may happen soon? */
bool m_warn_failover_precond = true; /* Print failover preconditions error message? */
bool m_warn_switchover_precond = true; /* Print switchover preconditions error message? */
bool m_warn_cannot_rejoin = true; /* Print warning if auto_rejoin fails because of invalid
* gtid:s? */
// Base methods // Base methods
MariaDBMonitor(MXS_MONITOR* monitor_base); MariaDBMonitor(MXS_MONITOR* monitor_base);
void reset_server_info(); bool configure(const MXS_CONFIG_PARAMETER* params);
void clear_server_info(); bool set_replication_credentials(const MXS_CONFIG_PARAMETER* params);
void reset_node_index_info(); void reset_server_info();
bool configure(const MXS_CONFIG_PARAMETER* params); void clear_server_info();
bool set_replication_credentials(const MXS_CONFIG_PARAMETER* params); void reset_node_index_info();
bool execute_manual_command(std::function<void ()> command, json_t** error_out);
std::string diagnostics_to_string() const;
json_t* to_json() const;
MariaDBServer* get_server_info(MXS_MONITORED_SERVER* db); MariaDBServer* get_server_info(MXS_MONITORED_SERVER* db);
MariaDBServer* get_server(int64_t id); MariaDBServer* get_server(int64_t id);
MariaDBServer* get_server(SERVER* server); MariaDBServer* get_server(SERVER* server);
bool execute_manual_command(std::function<void ()> command, json_t** error_out);
std::string diagnostics_to_string() const;
json_t* to_json() const;
// Cluster discovery and status assignment methods // Cluster discovery and status assignment methods, top levels
void update_server(MariaDBServer* server); void update_server(MariaDBServer* server);
void find_graph_cycles(); void update_topology();
void update_topology(); void build_replication_graph();
void log_master_changes(); void assign_new_master(MariaDBServer* new_master);
void update_gtid_domain(); void find_graph_cycles();
void update_external_master(); bool master_is_valid(std::string* reason_out);
void build_replication_graph(); void assign_server_roles();
void tarjan_scc_visit_node(MariaDBServer* node, ServerArray* stack, int* index, int* cycle); void assign_slave_and_relay_master(MariaDBServer* start_node);
void check_cluster_operations_support();
MariaDBServer* find_topology_master_server(std::string* msg_out); MariaDBServer* find_topology_master_server(std::string* msg_out);
MariaDBServer* find_best_reach_server(const ServerArray& candidates);
void calculate_node_reach(MariaDBServer* search_root);
int running_slaves(MariaDBServer* search_root);
MariaDBServer* find_master_inside_cycle(ServerArray& cycle_servers); MariaDBServer* find_master_inside_cycle(ServerArray& cycle_servers);
void assign_server_roles(); MariaDBServer* find_best_reach_server(const ServerArray& candidates);
void assign_slave_and_relay_master(MariaDBServer* start_node);
bool master_is_valid(std::string* reason_out);
bool cycle_has_master_server(ServerArray& cycle_servers);
void update_master_cycle_info();
void set_low_disk_slaves_maintenance();
void assign_new_master(MariaDBServer* new_master);
void check_cluster_operations_support();
// Switchover methods // Cluster discovery and status assignment methods, low level
void tarjan_scc_visit_node(MariaDBServer* node, ServerArray* stack, int* index, int* cycle);
void calculate_node_reach(MariaDBServer* search_root);
int running_slaves(MariaDBServer* search_root);
bool cycle_has_master_server(ServerArray& cycle_servers);
void update_gtid_domain();
void update_external_master();
void update_master_cycle_info();
// Cluster operation launchers
bool manual_switchover(SERVER* new_master, SERVER* current_master, json_t** error_out);
bool manual_failover(json_t** output);
bool manual_rejoin(SERVER* rejoin_server, json_t** output);
void handle_low_disk_space_master();
void handle_auto_failover();
void handle_auto_rejoin();
const MariaDBServer* slave_receiving_events(const MariaDBServer* demotion_target,
maxbase::Duration* event_age_out,
maxbase::Duration* delay_out);
std::unique_ptr<ClusterOperation> switchover_prepare(SERVER* new_master, SERVER* current_master, std::unique_ptr<ClusterOperation> switchover_prepare(SERVER* new_master, SERVER* current_master,
Log log_mode, json_t** error_out); Log log_mode, json_t** error_out);
bool switchover_perform(ClusterOperation& operation);
bool switchover_start_slave(MariaDBServer* old_master, MariaDBServer* new_master);
bool manual_switchover(SERVER* new_master, SERVER* current_master, json_t** error_out);
void handle_low_disk_space_master();
// Failover methods
std::unique_ptr<ClusterOperation> failover_prepare(Log log_mode, json_t** error_out); std::unique_ptr<ClusterOperation> failover_prepare(Log log_mode, json_t** error_out);
bool failover_perform(ClusterOperation& operation);
const MariaDBServer* slave_receiving_events(const MariaDBServer* demotion_target,
maxbase::Duration* event_age_out,
maxbase::Duration* delay_out);
bool manual_failover(json_t** output);
void handle_auto_failover();
// Rejoin methods bool switchover_perform(ClusterOperation& operation);
bool manual_rejoin(SERVER* rejoin_server, json_t** output); bool failover_perform(ClusterOperation& operation);
bool cluster_can_be_joined();
void handle_auto_rejoin();
bool get_joinable_servers(ServerArray* output);
bool server_is_rejoin_suspect(MariaDBServer* rejoin_cand, json_t** output);
uint32_t do_rejoin(const ServerArray& joinable_servers, json_t** output);
// Methods common to failover/switchover/rejoin // Methods used by failover/switchover/rejoin
MariaDBServer* select_promotion_target(MariaDBServer* current_master, MariaDBServer* select_promotion_target(MariaDBServer* current_master, OperationType op,
OperationType op, Log log_mode, json_t** error_out);
Log log_mode, bool is_candidate_better(const MariaDBServer* candidate, const MariaDBServer* current_best,
json_t** error_out); const MariaDBServer* demotion_target, uint32_t gtid_domain,
bool server_is_excluded(const MariaDBServer* server);
bool is_candidate_better(const MariaDBServer* candidate,
const MariaDBServer* current_best,
const MariaDBServer* demotion_target,
uint32_t gtid_domain,
std::string* reason_out = NULL); std::string* reason_out = NULL);
int redirect_slaves(MariaDBServer* new_master, bool server_is_excluded(const MariaDBServer* server);
const ServerArray& slaves, bool check_gtid_replication(Log log_mode, const MariaDBServer* demotion_target,
json_t** error_out);
ServerArray get_redirectables(const MariaDBServer* promotion_target,
const MariaDBServer* demotion_target);
int redirect_slaves(MariaDBServer* new_master, const ServerArray& slaves,
ServerArray* redirected_slaves); ServerArray* redirected_slaves);
int redirect_slaves_ex(ClusterOperation& op, const ServerArray& slaves, int redirect_slaves_ex(ClusterOperation& op, const ServerArray& slaves,
ServerArray* redirected_slaves); ServerArray* redirected_slaves);
std::string generate_change_master_cmd(const std::string& master_host, int master_port); bool switchover_start_slave(MariaDBServer* old_master, MariaDBServer* new_master);
bool start_external_replication(MariaDBServer* new_master, json_t** err_out); bool start_external_replication(MariaDBServer* new_master, json_t** err_out);
std::string generate_change_master_cmd(const std::string& master_host, int master_port);
void wait_cluster_stabilization(ClusterOperation& op, const ServerArray& slaves); void wait_cluster_stabilization(ClusterOperation& op, const ServerArray& slaves);
void report_and_disable(const std::string& operation, void report_and_disable(const std::string& operation, const std::string& setting_name,
const std::string& setting_name,
bool* setting_var); bool* setting_var);
bool check_gtid_replication(Log log_mode,
const MariaDBServer* demotion_target, // Rejoin methods
json_t** error_out); bool cluster_can_be_joined();
ServerArray get_redirectables(const MariaDBServer* promotion_target, bool get_joinable_servers(ServerArray* output);
const MariaDBServer* demotion_target); bool server_is_rejoin_suspect(MariaDBServer* rejoin_cand, json_t** output);
uint32_t do_rejoin(const ServerArray& joinable_servers, json_t** output);
// Other methods // Other methods
void disable_setting(const std::string& setting); void disable_setting(const std::string& setting);
bool check_sql_files(); bool check_sql_files();
void enforce_read_only_on_slaves(); void enforce_read_only_on_slaves();
void log_master_changes();
void set_low_disk_slaves_maintenance();
bool manual_reset_replication(SERVER* master_server, json_t** error_out); bool manual_reset_replication(SERVER* master_server, json_t** error_out);
}; };