From 61bb172033d0e98cd2b327acf53a77a7e23b1c08 Mon Sep 17 00:00:00 2001 From: Esa Korhonen Date: Tue, 21 Aug 2018 18:16:49 +0300 Subject: [PATCH] Cleanup failover/switchover Replication settings warnings are printed once more. Changed some parameter names to be more consistent within the monitor. --- .../mariadbmon/cluster_manipulation.cc | 58 ++++++++++++------- .../modules/monitor/mariadbmon/mariadbmon.hh | 20 +++---- .../monitor/mariadbmon/mariadbserver.cc | 41 +++++-------- .../monitor/mariadbmon/mariadbserver.hh | 14 +---- 4 files changed, 64 insertions(+), 69 deletions(-) diff --git a/server/modules/monitor/mariadbmon/cluster_manipulation.cc b/server/modules/monitor/mariadbmon/cluster_manipulation.cc index fd533c843..899459b41 100644 --- a/server/modules/monitor/mariadbmon/cluster_manipulation.cc +++ b/server/modules/monitor/mariadbmon/cluster_manipulation.cc @@ -53,7 +53,7 @@ bool MariaDBMonitor::manual_switchover(SERVER* promotion_server, SERVER* demotio error_out); if (ok_to_switch) { - switchover_done = do_switchover(demotion_target, promotion_target, error_out); + switchover_done = switchover_perform(promotion_target, demotion_target, error_out); if (switchover_done) { MXS_NOTICE("Switchover '%s' -> '%s' performed.", @@ -89,7 +89,7 @@ bool MariaDBMonitor::manual_failover(json_t** output) bool ok_to_failover = failover_prepare(Log::ON, &promotion_target, &demotion_target, output); if (ok_to_failover) { - failover_done = do_failover(promotion_target, demotion_target, output); + failover_done = failover_perform(promotion_target, demotion_target, output); if (failover_done) { MXS_NOTICE("Failover '%s' -> '%s' performed.", @@ -467,15 +467,15 @@ bool MariaDBMonitor::server_is_rejoin_suspect(MariaDBServer* rejoin_cand, json_t * intermediate step fails, the cluster may be left without a master and manual intervention is * required to fix things. * - * @param demotion_target Server to demote * @param promotion_target Server to promote - * @param err_out json object for error printing. Can be NULL. + * @param demotion_target Server to demote + * @param error_out Error output. Can be NULL. * @return True if successful. If false, replication may be broken. */ -bool MariaDBMonitor::do_switchover(MariaDBServer* demotion_target, MariaDBServer* promotion_target, - json_t** err_out) +bool MariaDBMonitor::switchover_perform(MariaDBServer* promotion_target, MariaDBServer* demotion_target, + json_t** error_out) { - mxb_assert(demotion_target && promotion_target); + mxb_assert(promotion_target && demotion_target); // Total time limit on how long this operation may take. Checked and modified after significant steps are // completed. @@ -489,7 +489,7 @@ bool MariaDBMonitor::do_switchover(MariaDBServer* demotion_target, MariaDBServer bool rval = false; // Step 2: Set read-only to on, flush logs, update master gtid:s - if (switchover_demote_master(demotion_target, err_out)) + if (switchover_demote_master(demotion_target, error_out)) { m_cluster_modified = true; bool catchup_and_promote_success = false; @@ -500,7 +500,7 @@ bool MariaDBMonitor::do_switchover(MariaDBServer* demotion_target, MariaDBServer ServerArray catchup_slaves = redirectable_slaves; catchup_slaves.push_back(promotion_target); if (switchover_wait_slaves_catchup(catchup_slaves, demotion_target->m_gtid_binlog_pos, - seconds_remaining, err_out)) + seconds_remaining, error_out)) { time_t step3_time = time(NULL); int seconds_step3 = difftime(step3_time, step2_time); @@ -508,7 +508,7 @@ bool MariaDBMonitor::do_switchover(MariaDBServer* demotion_target, MariaDBServer seconds_remaining -= seconds_step3; // Step 4: On new master STOP and RESET SLAVE, set read-only to off. - if (promote_new_master(promotion_target, err_out)) + if (promote_new_master(promotion_target, error_out)) { catchup_and_promote_success = true; m_next_master = promotion_target; @@ -549,7 +549,7 @@ bool MariaDBMonitor::do_switchover(MariaDBServer* demotion_target, MariaDBServer } else { - print_redirect_errors(demotion_target, redirectable_slaves, err_out); + print_redirect_errors(demotion_target, redirectable_slaves, error_out); } } } @@ -560,11 +560,11 @@ bool MariaDBMonitor::do_switchover(MariaDBServer* demotion_target, MariaDBServer const char QUERY_UNDO[] = "SET GLOBAL read_only=0;"; if (mxs_mysql_query(demotion_target->m_server_base->con, QUERY_UNDO) == 0) { - PRINT_MXS_JSON_ERROR(err_out, "read_only disabled on server %s.", demotion_target->name()); + PRINT_MXS_JSON_ERROR(error_out, "read_only disabled on server %s.", demotion_target->name()); } else { - PRINT_MXS_JSON_ERROR(err_out, "Could not disable read_only on server %s: '%s'.", + PRINT_MXS_JSON_ERROR(error_out, "Could not disable read_only on server %s: '%s'.", demotion_target->name(), mysql_error(demotion_target->m_server_base->con)); } @@ -572,7 +572,7 @@ bool MariaDBMonitor::do_switchover(MariaDBServer* demotion_target, MariaDBServer // Try to reactivate external replication if any. if (m_external_master_port != PORT_UNKNOWN) { - start_external_replication(promotion_target, err_out); + start_external_replication(promotion_target, error_out); } } } @@ -582,14 +582,16 @@ bool MariaDBMonitor::do_switchover(MariaDBServer* demotion_target, MariaDBServer /** * Performs failover for a simple topology (1 master, N slaves, no intermediate masters). * - * @param demotion_target Server to demote * @param promotion_target Server to promote - * @param err_out Error output + * @param demotion_target Server to demote + * @param error_out Error output. Can be NULL. * @return True if successful */ -bool MariaDBMonitor::do_failover(MariaDBServer* promotion_target, MariaDBServer* demotion_target, - json_t** error_out) +bool MariaDBMonitor::failover_perform(MariaDBServer* promotion_target, MariaDBServer* demotion_target, + json_t** error_out) { + mxb_assert(promotion_target && demotion_target); + // Total time limit on how long this operation may take. Checked and modified after significant steps are // completed. int seconds_remaining = m_failover_timeout; @@ -946,6 +948,17 @@ bool MariaDBMonitor::promote_new_master(MariaDBServer* new_master, json_t** err_ return success; } +/** + * Select a promotion target for failover/switchover. Looks at the slaves of 'demotion_target' and selects + * the server with the most up-do-date event or, if events are equal, the one with the best settings and + * status. + * + * @param demotion_target The former master server/relay + * @param op Switchover or failover + * @param log_mode Print log or operate silently + * @param error_out Error output + * @return The selected promotion target or NULL if no valid candidates + */ MariaDBServer* MariaDBMonitor::select_promotion_target(MariaDBServer* demotion_target, ClusterOperation op, Log log_mode, json_t** error_out) @@ -993,6 +1006,11 @@ MariaDBServer* MariaDBMonitor::select_promotion_target(MariaDBServer* demotion_t else { candidates.push_back(cand); + // Print some warnings about the candidate server. + if (log_mode == Log::ON) + { + cand->warn_replication_settings(); + } } } @@ -1297,7 +1315,7 @@ void MariaDBMonitor::handle_auto_failover() MXS_NOTICE("Performing automatic failover to replace failed master '%s'.", failed_master->name()); failed_master->m_server_base->new_event = false; - if (!do_failover(promotion_target, demotion_target, NULL)) + if (!failover_perform(promotion_target, demotion_target, NULL)) { report_and_disable("failover", CN_AUTO_FAILOVER, &m_auto_failover); } @@ -1611,7 +1629,7 @@ void MariaDBMonitor::handle_low_disk_space_master() if (ok_to_switch) { m_warn_switchover_precond = true; - bool switched = do_switchover(demotion_target, promotion_target, NULL); + bool switched = switchover_perform(promotion_target, demotion_target, NULL); if (switched) { MXS_NOTICE("Switchover %s -> %s performed.", diff --git a/server/modules/monitor/mariadbmon/mariadbmon.hh b/server/modules/monitor/mariadbmon/mariadbmon.hh index 434c293e1..2115ed9ca 100644 --- a/server/modules/monitor/mariadbmon/mariadbmon.hh +++ b/server/modules/monitor/mariadbmon/mariadbmon.hh @@ -213,26 +213,26 @@ private: void assign_new_master(MariaDBServer* new_master); // Switchover methods - bool manual_switchover(SERVER* new_master, SERVER* current_master, json_t** error_out); bool switchover_prepare(SERVER* new_master, SERVER* current_master, Log log_mode, - MariaDBServer** new_master_out, MariaDBServer** current_master_out, + MariaDBServer** promotion_target_out, MariaDBServer** demotion_target_out, json_t** error_out); - bool do_switchover(MariaDBServer* demotion_target, MariaDBServer* promotion_target, json_t** error_out); - bool switchover_demote_master(MariaDBServer* current_master, - json_t** err_out); + bool switchover_perform(MariaDBServer* promotion_target, MariaDBServer* demotion_target, json_t** error_out); + bool switchover_demote_master(MariaDBServer* current_master, json_t** err_out); bool switchover_wait_slaves_catchup(const ServerArray& slaves, const GtidList& gtid, int total_timeout, json_t** err_out); bool switchover_start_slave(MariaDBServer* old_master, MariaDBServer* new_master); + bool manual_switchover(SERVER* new_master, SERVER* current_master, json_t** error_out); void handle_low_disk_space_master(); // Failover methods - bool manual_failover(json_t** output); - void handle_auto_failover(); - bool cluster_supports_failover(std::string* reasons_out); - bool slave_receiving_events(); bool failover_prepare(Log log_mode, MariaDBServer** promotion_target_out, MariaDBServer** demotion_target_out, json_t** error_out); - bool do_failover(MariaDBServer* promotion_target, MariaDBServer* demotion_target, json_t** err_out); + bool failover_perform(MariaDBServer* promotion_target, MariaDBServer* demotion_target, + json_t** error_out); + bool cluster_supports_failover(std::string* reasons_out); + bool slave_receiving_events(); + bool manual_failover(json_t** output); + void handle_auto_failover(); // Rejoin methods bool manual_rejoin(SERVER* rejoin_server, json_t** output); diff --git a/server/modules/monitor/mariadbmon/mariadbserver.cc b/server/modules/monitor/mariadbmon/mariadbserver.cc index c74f12f9e..a6ca58325 100644 --- a/server/modules/monitor/mariadbmon/mariadbserver.cc +++ b/server/modules/monitor/mariadbmon/mariadbserver.cc @@ -353,38 +353,23 @@ bool MariaDBServer::read_server_variables(string* errmsg_out) return rval; } -bool MariaDBServer::check_replication_settings(print_repl_warnings_t print_warnings) const +void MariaDBServer::warn_replication_settings() const { - bool rval = true; const char* servername = name(); - if (m_rpl_settings.log_bin == false) + if (m_rpl_settings.gtid_strict_mode == false) { - if (print_warnings == WARNINGS_ON) - { - const char NO_BINLOG[] = - "Slave '%s' has binary log disabled and is not a valid promotion candidate."; - MXS_WARNING(NO_BINLOG, servername); - } - rval = false; + const char NO_STRICT[] = + "Slave '%s' has gtid_strict_mode disabled. Enabling this setting is recommended. " + "For more information, see https://mariadb.com/kb/en/library/gtid/#gtid_strict_mode"; + MXS_WARNING(NO_STRICT, servername); } - else if (print_warnings == WARNINGS_ON) + if (m_rpl_settings.log_slave_updates == false) { - if (m_rpl_settings.gtid_strict_mode == false) - { - const char NO_STRICT[] = - "Slave '%s' has gtid_strict_mode disabled. Enabling this setting is recommended. " - "For more information, see https://mariadb.com/kb/en/library/gtid/#gtid_strict_mode"; - MXS_WARNING(NO_STRICT, servername); - } - if (m_rpl_settings.log_slave_updates == false) - { - const char NO_SLAVE_UPDATES[] = - "Slave '%s' has log_slave_updates disabled. It is a valid candidate but replication " - "will break for lagging slaves if '%s' is promoted."; - MXS_WARNING(NO_SLAVE_UPDATES, servername, servername); - } + const char NO_SLAVE_UPDATES[] = + "Slave '%s' has log_slave_updates disabled. It is a valid candidate but replication " + "will break for lagging slaves if '%s' is promoted."; + MXS_WARNING(NO_SLAVE_UPDATES, servername, servername); } - return rval; } bool MariaDBServer::wait_until_gtid(const GtidList& target, int timeout, json_t** err_out) @@ -1006,7 +991,7 @@ bool MariaDBServer::can_be_demoted_switchover(string* reason_out) } else if (!update_replication_settings(&query_error)) { - reason = string_printf("it could not be queried: '%s'.", query_error.c_str()); + reason = string_printf("it could not be queried: %s", query_error.c_str()); } else if (!binlog_on()) { @@ -1084,7 +1069,7 @@ bool MariaDBServer::can_be_promoted(ClusterOperation op, } else if (!update_replication_settings(&query_error)) { - reason = string_printf("it could not be queried: '%s'.", query_error.c_str()); + reason = string_printf("it could not be queried: %s", query_error.c_str()); } else if (!binlog_on()) { diff --git a/server/modules/monitor/mariadbmon/mariadbserver.hh b/server/modules/monitor/mariadbmon/mariadbserver.hh index c355122d6..fbe467056 100644 --- a/server/modules/monitor/mariadbmon/mariadbserver.hh +++ b/server/modules/monitor/mariadbmon/mariadbserver.hh @@ -17,12 +17,6 @@ #include #include "gtid.hh" -enum print_repl_warnings_t -{ - WARNINGS_ON, - WARNINGS_OFF -}; - class QueryResult; class MariaDBServer; // Server pointer array @@ -230,12 +224,10 @@ public: bool read_server_variables(std::string* errmsg_out = NULL); /** - * Check if server has binary log enabled. Print warnings if gtid_strict_mode or log_slave_updates is off. - * - * @param print_on Print warnings or not - * @return True if log_bin is on + * Print warnings if gtid_strict_mode or log_slave_updates is off. Does not query the server, + * so 'update_replication_settings' should have been called recently to update the values. */ - bool check_replication_settings(print_repl_warnings_t print_warnings = WARNINGS_ON) const; + void warn_replication_settings() const; /** * Wait until server catches up to the target gtid. Only considers gtid domains common to this server