From 382a017518b03708c94e1be08ac22eb65769d775 Mon Sep 17 00:00:00 2001 From: Esa Korhonen Date: Thu, 19 Jul 2018 19:23:16 +0300 Subject: [PATCH] A master which is down for longer than failcount is considered an invalid master If auto_failover is disabled and an alternative master exists, the monitor will swap the master. This may break replication, but the situation requires that the dba has set up a cluster with multiple masters. --- .../monitor/mariadbmon/cluster_discovery.cc | 114 ++++++++++++------ .../modules/monitor/mariadbmon/mariadbmon.cc | 10 +- .../modules/monitor/mariadbmon/mariadbmon.hh | 2 + 3 files changed, 88 insertions(+), 38 deletions(-) diff --git a/server/modules/monitor/mariadbmon/cluster_discovery.cc b/server/modules/monitor/mariadbmon/cluster_discovery.cc index c758abf35..2d9f69ecf 100644 --- a/server/modules/monitor/mariadbmon/cluster_discovery.cc +++ b/server/modules/monitor/mariadbmon/cluster_discovery.cc @@ -20,6 +20,7 @@ #include using std::string; +using maxscale::string_printf; static bool check_replicate_ignore_table(MXS_MONITORED_SERVER* database); static bool check_replicate_do_table(MXS_MONITORED_SERVER* database); @@ -783,7 +784,7 @@ void MariaDBMonitor::assign_slave_and_relay_master(MariaDBServer* node) /** * Is the current master server still valid or should a new one be selected? * - * @param reason_out Output for a text description + * @param reason_out If master is not valid, the reason is printed here. * @return True, if master is ok. False if the current master has changed in a way that * a new master should be selected. */ @@ -791,8 +792,8 @@ bool MariaDBMonitor::master_is_valid(std::string* reason_out) { // The master server of the cluster needs to be re-calculated in the following four cases: bool rval = true; - // 1) There is no master. - if (m_master == NULL || m_master->is_down()) + // 1) There is no master. This typically only applies when MaxScale is first ran. + if (m_master == NULL) { rval = false; } @@ -802,6 +803,13 @@ bool MariaDBMonitor::master_is_valid(std::string* reason_out) rval = false; *reason_out = "it is in read-only mode"; } + // 3) The master has been down for failcount iterations and auto_failover is not on. + else if (m_master->is_down() && !m_auto_failover && m_master->m_server_base->mon_err_count >= m_failcount) + { + rval = false; + *reason_out = string_printf("it has been down over %d (failcount) monitor updates and failover " + "is not on", m_failcount); + } // 3) The master was a non-replicating master (not in a cycle) but now has a slave connection. else if (m_master_cycle_status.cycle_id == NodeData::CYCLE_NONE) { @@ -897,51 +905,87 @@ void MariaDBMonitor::update_topology() // Find the server that looks like it would be the best master. It does not yet overwrite the // current master. string topology_messages; - MariaDBServer* root_master = find_topology_master_server(&topology_messages); - + MariaDBServer* master_candidate = find_topology_master_server(&topology_messages); + // If the 'master_candidate' is a valid server but different from the current master, + // a change may be necessary. It will only happen if the current master is no longer usable. + bool have_better = (master_candidate && master_candidate != m_master); // Check if current master is still valid. - string reason; - if (master_is_valid(&reason)) + string reason_not_valid; + bool current_is_ok = master_is_valid(&reason_not_valid); + + if (current_is_ok) { - // Update master cycle info in case it has changed + m_warn_current_master_invalid = true; + // Update master cycle info in case it has changed. update_master_cycle_info(); - if (root_master && m_master != root_master) + if (have_better) { - // Master is still valid but it is no longer the best master. Print a warning. - MXS_WARNING("'%s' is a better master candidate than the current master '%s'. " - "Master will change if '%s' is no longer a valid master.", - root_master->name(), m_master->name(), m_master->name()); + // Master is still valid but it is no longer the best master. Print a warning. This + // may be a continuous situation so only print once. + if (m_warn_have_better_master) + { + MXS_WARNING("'%s' is a better master candidate than the current master '%s'. " + "Master will change when '%s' is no longer a valid master.", + master_candidate->name(), m_master->name(), m_master->name()); + m_warn_have_better_master = false; + } } } else { - if (m_master && !reason.empty()) + // Current master is faulty or does not exist + m_warn_have_better_master = true; + if (have_better) { - MXS_WARNING("The previous master server '%s' is no longer a valid master because %s. " - "Selecting new master.", m_master->name(), reason.c_str()); + // We have an alternative. Swap master. The messages give the impression + // that new master selection has not yet happened, but this is just for clarity. + const char sel_new_master[] = "Selecting new master server."; + if (m_master) + { + ss_dassert(!reason_not_valid.empty()); + MXS_WARNING("The current master server '%s' is no longer valid because %s. %s", + m_master->name(), reason_not_valid.c_str(), sel_new_master); + } + else + { + // This typically happens only when starting from scratch. + MXS_NOTICE("%s", sel_new_master); + } + + // At this point, print messages explaining why any/other possible master servers weren't picked. + if (!topology_messages.empty()) + { + MXS_WARNING("%s", topology_messages.c_str()); + } + + MXS_NOTICE("Setting '%s' as master.", master_candidate->name()); + // Change the master, even though this may break replication. + assign_new_master(master_candidate); } else { - MXS_NOTICE("Selecting master server."); - } + // No alternative master. Keep current status and print warnings. + // This situation may stick so only print the messages once. + if (m_warn_current_master_invalid) + { + if (m_master) + { + ss_dassert(!reason_not_valid.empty()); + MXS_WARNING("The current master server '%s' is no longer valid because %s, " + "but there is no better alternative to swap to.", + m_master->name(), reason_not_valid.c_str()); + } + else + { + MXS_WARNING("No valid master server found."); + } - // The current master is no longer ok (or it never was). Change the master, even though this may - // break replication. Master changes are logged by the log_master_changes()-method. - if (!topology_messages.empty()) - { - MXS_WARNING("%s", topology_messages.c_str()); - } - - assign_new_master(root_master); - - if (m_master) - { - MXS_NOTICE("'%s' is the best master candidate.", m_master->name()); - } - else - { - MXS_WARNING("No valid master servers found."); + if (!topology_messages.empty()) + { + MXS_WARNING("%s", topology_messages.c_str()); + } + m_warn_current_master_invalid = false; + } } } - m_cluster_topology_changed = false; } diff --git a/server/modules/monitor/mariadbmon/mariadbmon.cc b/server/modules/monitor/mariadbmon/mariadbmon.cc index 130934f25..50cfb2afb 100644 --- a/server/modules/monitor/mariadbmon/mariadbmon.cc +++ b/server/modules/monitor/mariadbmon/mariadbmon.cc @@ -67,6 +67,8 @@ MariaDBMonitor::MariaDBMonitor(MXS_MONITOR* monitor) , m_log_no_master(true) , m_warn_failover_precond(true) , m_warn_cannot_rejoin(true) + , m_warn_current_master_invalid(true) + , m_warn_have_better_master(true) {} MariaDBMonitor::~MariaDBMonitor() @@ -485,11 +487,11 @@ void MariaDBMonitor::tick() } } - if (m_cluster_topology_changed) + // Topology needs to be rechecked if it has changed or if master is down. + if (m_cluster_topology_changed || (m_master && m_master->is_down())) { - // This means that a server id or a slave connection has changed, or read_only was set. - // Various things need to be checked and updated. update_topology(); + m_cluster_topology_changed = false; } // Always re-assign master, slave etc bits as these depend on other factors outside topology @@ -740,6 +742,8 @@ void MariaDBMonitor::assign_new_master(MariaDBServer* new_master) { m_master = new_master; update_master_cycle_info(); + m_warn_current_master_invalid = true; + m_warn_have_better_master = true; } /** diff --git a/server/modules/monitor/mariadbmon/mariadbmon.hh b/server/modules/monitor/mariadbmon/mariadbmon.hh index 8bf53f2d4..e15e6da33 100644 --- a/server/modules/monitor/mariadbmon/mariadbmon.hh +++ b/server/modules/monitor/mariadbmon/mariadbmon.hh @@ -179,6 +179,8 @@ private: * outside of a cycle. */ bool m_warn_failover_precond; /**< Print failover preconditions error message? */ bool m_warn_cannot_rejoin; /**< Print warning if auto_rejoin fails because of invalid gtid:s? */ + bool m_warn_current_master_invalid; /**< Print warning if current master is not valid? */ + bool m_warn_have_better_master; /**< Print warning if the current master is not the best one? */ // Base methods MariaDBMonitor(MXS_MONITOR* monitor_base);