Esa Korhonen c20a17238b MXS-1944 Store failover parameters in an object
Several of the parameters are passed on from function to function. Having them all
in an object cleans things up and makes adding more data easier.
2018-09-26 12:26:35 +03:00

322 lines
16 KiB
C++

/*
* Copyright (c) 2018 MariaDB Corporation Ab
*
* Use of this software is governed by the Business Source License included
* in the LICENSE.TXT file and at www.mariadb.com/bsl11.
*
* Change Date: 2022-01-01
*
* On the date above, in accordance with the Business Source License, use
* of this software will be governed by version 2 or later of the General
* Public License.
*/
#pragma once
#include "mariadbmon_common.hh"
#include <chrono>
#include <condition_variable>
#include <functional>
#include <map>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>
#include <maxscale/monitor.hh>
#include "mariadbserver.hh"
// String constants declared here and defined elsewhere.
// NOTE(review): judging by the CN_ prefix and the names, these are presumably the configuration
// parameter names for automatic failover, switchover on low disk space and the promotion/demotion
// SQL files -- confirm at the definition site.
extern const char* const CN_AUTO_FAILOVER;
extern const char* const CN_SWITCHOVER_ON_LOW_DISK_SPACE;
extern const char* const CN_PROMOTION_SQL_FILE;
extern const char* const CN_DEMOTION_SQL_FILE;
// Lookup from the base server struct to the matching MariaDBServer. Non-owning: the map does not
// own the server objects. May turn out to be unnecessary in the end.
using ServerInfoMap = std::unordered_map<MXS_MONITORED_SERVER*, MariaDBServer*>;

// Lookup from server id to MariaDBServer. Useful when constructing the replication graph.
using IdToServerMap = std::unordered_map<int64_t, MariaDBServer*>;

// Lookup from cycle number to the members of that cycle. An ordered map is used so that the
// iteration order is predictable.
using CycleMap = std::map<int, ServerArray>;
// MariaDB Monitor instance data
class MariaDBMonitor : public maxscale::MonitorInstance
{
private:
MariaDBMonitor(const MariaDBMonitor&);
MariaDBMonitor& operator=(const MariaDBMonitor&);
public:
// Helper class used for testing
class Test;
friend class Test;
~MariaDBMonitor();
/**
* Print diagnostics.
*
* @param dcb DCB to print to
*/
void diagnostics(DCB* dcb) const;
/**
* Print diagnostics to json object.
*
* @return Diagnostics messages
*/
json_t* diagnostics_json() const;
/**
* Create the monitor instance and return the instance data.
*
* @param monitor General monitor data
* @return A pointer to MariaDBMonitor specific data.
*/
static MariaDBMonitor* create(MXS_MONITOR* monitor);
/**
* Perform user-activated switchover.
*
* @param new_master The specified new master. If NULL, monitor will autoselect.
* @param current_master The specified current master. If NULL, monitor will autoselect.
* @param error_out Json error output
* @return True if switchover was performed
*/
bool run_manual_switchover(SERVER* new_master, SERVER* current_master, json_t** error_out);
/**
* Perform user-activated failover.
*
* @param error_out Json error output
* @return True if failover was performed
*/
bool run_manual_failover(json_t** error_out);
/**
* Perform user-activated rejoin
*
* @param rejoin_server Server to join
* @param error_out Json error output
* @return True if rejoin was performed
*/
bool run_manual_rejoin(SERVER* rejoin_server, json_t** error_out);
bool run_manual_reset_replication(SERVER* master_server, json_t** error_out);
protected:
void pre_loop();
void tick();
void process_state_changes();
private:
typedef std::chrono::duration<double> Duration;
struct CycleInfo
{
int cycle_id = NodeData::CYCLE_NONE;
ServerArray cycle_members;
};
/* Structure used to communicate commands and results between the MaxAdmin and monitor threads.
* The monitor can only process one manual command at a time, which is already enforced by
* the admin thread. */
struct ManualCommand
{
public:
std::mutex mutex; /**< Mutex used by the condition variables */
std::condition_variable has_command; /**< Notified when a command is waiting
* execution */
bool command_waiting_exec = false; /**< Guard variable for the above */
std::function<void(void)> method; /**< The method to run when executing the
* command */
std::condition_variable has_result; /**< Notified when the command has ran */
bool result_waiting = false; /**< Guard variable for the above */
};
ServerArray m_servers; /**< Servers of the monitor */
ServerInfoMap m_server_info; /**< Map from server base struct to MariaDBServer */
ManualCommand m_manual_cmd; /**< Communicates manual commands and results */
// Values updated by monitor
MariaDBServer* m_master; /**< Master server for Master/Slave replication */
MariaDBServer* m_next_master; /**< When master changes because of a failover/switchover, the
* new
* master is written here so the next monitor loop picks it
* up. */
IdToServerMap m_servers_by_id; /**< Map from server id:s to MariaDBServer */
int64_t m_master_gtid_domain; /**< gtid_domain_id most recently seen on the master */
std::string m_external_master_host; /**< External master host, for fail/switchover */
int m_external_master_port; /**< External master port */
bool m_cluster_topology_changed; /**< Has cluster topology changed since last monitor loop? */
bool m_cluster_modified; /**< Has a failover/switchover/rejoin been performed this
* loop? */
CycleMap m_cycles; /**< Map from cycle number to cycle member servers */
CycleInfo m_master_cycle_status; /**< Info about master server cycle from previous round */
// Replication topology detection settings
bool m_detect_stale_master; /**< Monitor flag for MySQL replication Stale Master detection */
bool m_detect_stale_slave; /**< Monitor flag for MySQL replication Stale Slave detection */
bool m_detect_standalone_master; /**< If standalone master are detected */
bool m_ignore_external_masters; /**< Ignore masters outside of the monitor configuration */
// Failover, switchover and rejoin settings
bool m_auto_failover; /**< Is automatic master failover is enabled? */
bool m_auto_rejoin; /**< Is automatic rejoin enabled? */
int m_failcount; /**< Numer of cycles master must be down before auto-failover
* begins */
std::string m_replication_user; /**< Replication user for CHANGE MASTER TO-commands */
std::string m_replication_password; /**< Replication password for CHANGE MASTER TO-commands */
uint32_t m_failover_timeout; /**< Time limit in seconds for master failover */
uint32_t m_switchover_timeout; /**< Time limit in seconds for master switchover */
bool m_verify_master_failure; /**< Is master failure is verified via slaves? */
int m_master_failure_timeout; /**< Master failure verification (via slaves) time in seconds
* */
ServerArray m_excluded_servers; /**< Servers banned for master promotion during auto-failover
* or
* autoselect switchover. */
std::string m_promote_sql_file; /**< File with sql commands which are ran to a server being
* promoted. */
std::string m_demote_sql_file; /**< File with sql commands which are ran to a server being
* demoted. */
bool m_enforce_read_only_slaves; /**< Should the monitor set read-only=1 on any slave servers.
* */
bool m_switchover_on_low_disk_space; /**< Should the monitor do a switchover on low disk space. */
bool m_maintenance_on_low_disk_space; /**< Set slave and unreplicating servers with low disk space
* to
* maintenance. */
bool m_handle_event_scheduler; /**< Should failover/switchover handle any scheduled events on
* the servers */
// Other settings
bool m_log_no_master; /**< Should it be logged that there is no master */
bool m_warn_no_valid_in_cycle; /**< Log a warning when a replication cycle has no valid master */
bool m_warn_no_valid_outside_cycle; /**< Log a warning when a replication topology has no valid master
* outside of a cycle. */
bool m_warn_failover_precond; /**< Print failover preconditions error message? */
bool m_warn_switchover_precond; /**< Print switchover preconditions error message? */
bool m_warn_cannot_rejoin; /**< Print warning if auto_rejoin fails because of invalid gtid:s? */
bool m_warn_current_master_invalid; /**< Print warning if current master is not valid? */
bool m_warn_have_better_master; /**< Print warning if the current master is not the best one? */
bool m_warn_master_down; /**< Print warning that failover may happen soon? */
// Base methods
MariaDBMonitor(MXS_MONITOR* monitor_base);
void reset_server_info();
void clear_server_info();
void reset_node_index_info();
bool configure(const MXS_CONFIG_PARAMETER* params);
bool set_replication_credentials(const MXS_CONFIG_PARAMETER* params);
MariaDBServer* get_server_info(MXS_MONITORED_SERVER* db);
MariaDBServer* get_server(int64_t id);
MariaDBServer* get_server(SERVER* server);
bool execute_manual_command(std::function<void ()> command, json_t** error_out);
std::string diagnostics_to_string() const;
json_t* to_json() const;
// Cluster discovery and status assignment methods
void update_server(MariaDBServer* server);
void find_graph_cycles();
void update_topology();
void log_master_changes();
void update_gtid_domain();
void update_external_master();
void build_replication_graph();
void tarjan_scc_visit_node(MariaDBServer* node, ServerArray* stack, int* index, int* cycle);
MariaDBServer* find_topology_master_server(std::string* msg_out);
MariaDBServer* find_best_reach_server(const ServerArray& candidates);
void calculate_node_reach(MariaDBServer* search_root);
int running_slaves(MariaDBServer* search_root);
MariaDBServer* find_master_inside_cycle(ServerArray& cycle_servers);
void assign_server_roles();
void assign_slave_and_relay_master(MariaDBServer* start_node);
bool master_is_valid(std::string* reason_out);
bool cycle_has_master_server(ServerArray& cycle_servers);
void update_master_cycle_info();
void set_low_disk_slaves_maintenance();
void assign_new_master(MariaDBServer* new_master);
void check_cluster_operations_support();
// Switchover methods
bool switchover_prepare(SERVER* new_master,
SERVER* current_master,
Log log_mode,
MariaDBServer** promotion_target_out,
MariaDBServer** demotion_target_out,
json_t** error_out);
bool switchover_perform(MariaDBServer* promotion_target,
MariaDBServer* demotion_target,
json_t** error_out);
bool switchover_demote_master(MariaDBServer* current_master, json_t** err_out);
bool switchover_wait_slaves_catchup(const ServerArray& slaves,
const GtidList& gtid,
int total_timeout,
json_t** err_out);
bool switchover_start_slave(MariaDBServer* old_master, MariaDBServer* new_master);
bool manual_switchover(SERVER* new_master, SERVER* current_master, json_t** error_out);
void handle_low_disk_space_master();
// Failover methods
std::unique_ptr<ClusterOperation> failover_prepare(Log log_mode, json_t** error_out);
bool failover_perform(ClusterOperation& operation);
const MariaDBServer* slave_receiving_events(const MariaDBServer* demotion_target,
Duration* event_age_out);
bool manual_failover(json_t** output);
void handle_auto_failover();
// Rejoin methods
bool manual_rejoin(SERVER* rejoin_server, json_t** output);
bool cluster_can_be_joined();
void handle_auto_rejoin();
bool get_joinable_servers(ServerArray* output);
bool server_is_rejoin_suspect(MariaDBServer* rejoin_cand, json_t** output);
uint32_t do_rejoin(const ServerArray& joinable_servers, json_t** output);
// Methods common to failover/switchover/rejoin
MariaDBServer* select_promotion_target(MariaDBServer* current_master,
OperationType op,
Log log_mode,
json_t** error_out);
bool server_is_excluded(const MariaDBServer* server);
bool is_candidate_better(const MariaDBServer* candidate,
const MariaDBServer* current_best,
const MariaDBServer* demotion_target,
uint32_t gtid_domain,
std::string* reason_out = NULL);
bool promote_new_master(MariaDBServer* new_master, json_t** err_out);
int redirect_slaves(MariaDBServer* new_master,
const ServerArray& slaves,
ServerArray* redirected_slaves);
std::string generate_change_master_cmd(const std::string& master_host, int master_port);
bool start_external_replication(MariaDBServer* new_master, json_t** err_out);
bool wait_cluster_stabilization(MariaDBServer* new_master,
const ServerArray& slaves,
int seconds_remaining);
void report_and_disable(const std::string& operation,
const std::string& setting_name,
bool* setting_var);
bool check_gtid_replication(Log log_mode,
const MariaDBServer* demotion_target,
json_t** error_out);
ServerArray get_redirectables(const MariaDBServer* promotion_target,
const MariaDBServer* demotion_target);
// Other methods
void disable_setting(const std::string& setting);
bool check_sql_files();
void enforce_read_only_on_slaves();
bool manual_reset_replication(SERVER* master_server, json_t** error_out);
};
/**
 * Generates a list of server names separated by ', '.
 *
 * @param servers The servers whose names are listed
 * @return Server names as a single string
 */
std::string monitored_servers_to_string(const ServerArray& servers);
/**
 * Collects the MariaDB connection error strings of all the given servers and combines them into
 * one string.
 *
 * @param servers Servers with errors
 * @return Concatenated string. NOTE(review): the separator placed between the individual errors is
 * not visible from this declaration -- check the definition.
 */
std::string get_connection_errors(const ServerArray& servers);