From 019d62bbb8028e23da7be6f40bcc1bce7ab93359 Mon Sep 17 00:00:00 2001 From: Esa Korhonen Date: Fri, 15 Jun 2018 17:29:00 +0300 Subject: [PATCH] MXS-1886 Better auto-rejoin error description and tolerance Contains changes from commit 09df01752812444c6e7c409a8957d292f7de63cf adapted to the 2.3 branch. --- .../mariadbmon/cluster_manipulation.cc | 25 +++++++++++++++---- .../modules/monitor/mariadbmon/mariadbmon.cc | 9 +------ .../modules/monitor/mariadbmon/mariadbmon.hh | 1 + .../monitor/mariadbmon/mariadbserver.cc | 25 +++++++++++++++++-- .../monitor/mariadbmon/mariadbserver.hh | 3 ++- 5 files changed, 47 insertions(+), 16 deletions(-) diff --git a/server/modules/monitor/mariadbmon/cluster_manipulation.cc b/server/modules/monitor/mariadbmon/cluster_manipulation.cc index ce99ecde8..f35001ee1 100644 --- a/server/modules/monitor/mariadbmon/cluster_manipulation.cc +++ b/server/modules/monitor/mariadbmon/cluster_manipulation.cc @@ -146,7 +146,8 @@ bool MariaDBMonitor::manual_rejoin(SERVER* rejoin_server, json_t** output) { if (m_master->update_gtids()) { - if (slave_cand->can_replicate_from(m_master)) + string no_rejoin_reason; + if (slave_cand->can_replicate_from(m_master, &no_rejoin_reason)) { ServerArray joinable_server; joinable_server.push_back(slave_cand); @@ -162,9 +163,9 @@ bool MariaDBMonitor::manual_rejoin(SERVER* rejoin_server, json_t** output) } else { - PRINT_MXS_JSON_ERROR(output, "Server '%s' cannot replicate from cluster master '%s' " - "or it could not be queried.", rejoin_serv_name, - m_master->name()); + PRINT_MXS_JSON_ERROR(output, "Server '%s' cannot replicate from cluster master '%s': " + "%s.", rejoin_serv_name, m_master->name(), + no_rejoin_reason.c_str()); } } else @@ -398,10 +399,20 @@ bool MariaDBMonitor::get_joinable_servers(ServerArray* output) { for (size_t i = 0; i < suspects.size(); i++) { - if (suspects[i]->can_replicate_from(m_master)) + string rejoin_err_msg; + if (suspects[i]->can_replicate_from(m_master, &rejoin_err_msg)) { output->push_back(suspects[i]); } + else if (m_warn_cannot_rejoin) + { + // Print a message explaining why an auto-rejoin is not done. Suppress printing. + MXS_WARNING("Automatic rejoin was not attempted on server '%s' even though it is a " + "valid candidate. Will keep retrying with this message suppressed for all " + "servers. Errors: \n%s", + suspects[i]->name(), rejoin_err_msg.c_str()); + m_warn_cannot_rejoin = false; + } } } else @@ -409,6 +420,10 @@ bool MariaDBMonitor::get_joinable_servers(ServerArray* output) comm_ok = false; } } + else + { + m_warn_cannot_rejoin = true; + } return comm_ok; } diff --git a/server/modules/monitor/mariadbmon/mariadbmon.cc b/server/modules/monitor/mariadbmon/mariadbmon.cc index 1ffe27dd4..7a7bf7dd9 100644 --- a/server/modules/monitor/mariadbmon/mariadbmon.cc +++ b/server/modules/monitor/mariadbmon/mariadbmon.cc @@ -60,6 +60,7 @@ MariaDBMonitor::MariaDBMonitor(MXS_MONITOR* monitor) , m_warn_set_standalone_master(true) , m_log_no_master(true) , m_warn_failover_precond(true) + , m_warn_cannot_rejoin(true) {} MariaDBMonitor::~MariaDBMonitor() @@ -670,14 +671,6 @@ void MariaDBMonitor::handle_auto_rejoin() MXS_NOTICE("%d server(s) redirected or rejoined the cluster.", joins); m_cluster_modified = true; } - if (joins < joinable_servers.size()) - { - MXS_ERROR("A cluster join operation failed, disabling automatic rejoining. " - "To re-enable, manually set '%s' to 'true' for monitor '%s' via MaxAdmin or " - "the REST API.", CN_AUTO_REJOIN, m_monitor->name); - m_auto_rejoin = false; - disable_setting(CN_AUTO_REJOIN); - } } else { diff --git a/server/modules/monitor/mariadbmon/mariadbmon.hh b/server/modules/monitor/mariadbmon/mariadbmon.hh index 64b5ba44c..362087e4f 100644 --- a/server/modules/monitor/mariadbmon/mariadbmon.hh +++ b/server/modules/monitor/mariadbmon/mariadbmon.hh @@ -159,6 +159,7 @@ private: bool m_warn_no_valid_outside_cycle; /**< Log a warning when a replication topology has no valid master * outside of a cycle. */ bool m_warn_failover_precond; /**< Print failover preconditions error message? */ + bool m_warn_cannot_rejoin; /**< Print warning if auto_rejoin fails because of invalid gtid:s? */ enum slave_down_setting_t { diff --git a/server/modules/monitor/mariadbmon/mariadbserver.cc b/server/modules/monitor/mariadbmon/mariadbserver.cc index 9aa4efdc9..782a8f4e8 100644 --- a/server/modules/monitor/mariadbmon/mariadbserver.cc +++ b/server/modules/monitor/mariadbmon/mariadbserver.cc @@ -537,12 +537,33 @@ bool MariaDBServer::update_slave_info() update_gtids() && do_show_slave_status()); } -bool MariaDBServer::can_replicate_from(MariaDBServer* master) +bool MariaDBServer::can_replicate_from(MariaDBServer* master, string* error_out) { bool rval = false; if (update_gtids()) { - rval = m_gtid_current_pos.can_replicate_from(master->m_gtid_binlog_pos); + if (m_gtid_current_pos.empty()) + { + *error_out = string("'") + name() + "' does not have a valid 'gtid_current_pos'."; + } + else if (master->m_gtid_binlog_pos.empty()) + { + *error_out = string("'") + master->name() + "' does not have a valid 'gtid_binlog_pos'."; + } + else + { + rval = m_gtid_current_pos.can_replicate_from(master->m_gtid_binlog_pos); + if (!rval) + { + *error_out = string("gtid_current_pos of '") + name() + "' (" + + m_gtid_current_pos.to_string() + ") is incompatible with gtid_binlog_pos of '" + + master->name() + "' (" + master->m_gtid_binlog_pos.to_string() + ")."; + } + } + } + else + { + *error_out = string("Server '") + name() + "' could not be queried."; } return rval; } diff --git a/server/modules/monitor/mariadbmon/mariadbserver.hh b/server/modules/monitor/mariadbmon/mariadbserver.hh index 56eb01567..b0c1dcf6a 100644 --- a/server/modules/monitor/mariadbmon/mariadbserver.hh +++ b/server/modules/monitor/mariadbmon/mariadbserver.hh @@ -313,9 +313,10 @@ public: * The non-detected errors will mostly be detected once the slave tries to start replicating. * * @param master_info Master server + * @param error_out Details the reason for a negative result * @return True if slave can replicate from master */ - bool can_replicate_from(MariaDBServer* master); + bool can_replicate_from(MariaDBServer* master, std::string* error_out); /** * Redirect one slave server to another master