diff --git a/server/modules/monitor/mariadbmon/cluster_manipulation.cc b/server/modules/monitor/mariadbmon/cluster_manipulation.cc index 29f76196a..27376d3be 100644 --- a/server/modules/monitor/mariadbmon/cluster_manipulation.cc +++ b/server/modules/monitor/mariadbmon/cluster_manipulation.cc @@ -421,10 +421,13 @@ int MariaDBMonitor::redirect_slaves(MariaDBServer* new_master, const ServerArray * * @param op Operation descriptor * @param slaves An array of slaves to redirect + * @param old_master The connections to this server are redirected + * @param new_master The new master for the redirected connections * @param redirected_slaves A vector where to insert successfully redirected slaves * @return The number of slaves successfully redirected */ int MariaDBMonitor::redirect_slaves_ex(ClusterOperation& op, const ServerArray& slaves, + const MariaDBServer* old_master, const MariaDBServer* new_master, ServerArray* redirected_slaves) { mxb_assert(redirected_slaves != NULL); @@ -440,7 +443,7 @@ int MariaDBMonitor::redirect_slaves_ex(ClusterOperation& op, const ServerArray& int successes = 0; for (MariaDBServer* redirectable : slaves) { - if (redirectable->redirect_existing_slave_conn(op)) + if (redirectable->redirect_existing_slave_conn(op, old_master, new_master)) { successes++; redirected_slaves->push_back(redirectable); @@ -689,11 +692,12 @@ bool MariaDBMonitor::switchover_perform(ClusterOperation& op) json_t** const error_out = op.error_out; mxb_assert(promotion_target && demotion_target); - // Step 1: Save all slaves except promotion target to an array. + // Step 1a: Save all slaves except promotion target to an array. // Try to redirect even disconnected slaves. - // TODO: 'switchover_wait_slaves_catchup' needs to be smarter and not bother with such slaves. - ServerArray redirectable_slaves = get_redirectables(promotion_target, demotion_target); - + ServerArray redirect_to_promo_target = get_redirectables(demotion_target, promotion_target); + // Step 1b: The slaves of the promotion target must be redirected to the old master. This + // list contains elements only when promoting a relay. + ServerArray redirect_to_demo_target = get_redirectables(promotion_target, demotion_target); bool rval = false; // Step 2: Set read-only to on, flush logs, update gtid:s. if (demotion_target->demote(op)) @@ -721,23 +725,29 @@ bool MariaDBMonitor::switchover_perform(ClusterOperation& op) } // Step 5: Start replication on old master and redirect slaves. - ServerArray redirected_slaves; + ServerArray redirected_to_promo_target; if (demotion_target->copy_slave_conns(op, op.promotion_target_conns, promotion_target)) { - redirected_slaves.push_back(demotion_target); + redirected_to_promo_target.push_back(demotion_target); } else { MXS_WARNING("Could not copy slave connections from %s to %s.", promotion_target->name(), demotion_target->name()); } - redirect_slaves_ex(op, redirectable_slaves, &redirected_slaves); + redirect_slaves_ex(op, redirect_to_promo_target, demotion_target, promotion_target, + &redirected_to_promo_target); - if (!redirected_slaves.empty()) + ServerArray redirected_to_demo_target; + redirect_slaves_ex(op, redirect_to_demo_target, promotion_target, demotion_target, + &redirected_to_demo_target); + + if (!redirected_to_promo_target.empty() || !redirected_to_demo_target.empty()) { timer.restart(); // Step 6: Finally, check that slaves are replicating. - wait_cluster_stabilization(op, redirected_slaves); + wait_cluster_stabilization(op, redirected_to_promo_target, promotion_target); + wait_cluster_stabilization(op, redirected_to_demo_target, demotion_target); auto step6_duration = timer.lap(); MXS_INFO("Switchover: slave replication confirmation took %.1f seconds with " "%.1f seconds to spare.", @@ -784,7 +794,7 @@ bool MariaDBMonitor::failover_perform(ClusterOperation& op) MariaDBServer* const promotion_target = op.promotion_target; // Step 1: Populate a vector with all slaves not the selected master. - ServerArray redirectable_slaves = get_redirectables(promotion_target, op.demotion_target); + ServerArray redirectable_slaves = get_redirectables(op.demotion_target, promotion_target); bool rval = false; // Step 2: Stop and reset slave, set read-only to OFF. @@ -802,14 +812,14 @@ bool MariaDBMonitor::failover_perform(ClusterOperation& op) // Step 3: Redirect slaves. ServerArray redirected_slaves; - redirect_slaves_ex(op, redirectable_slaves, &redirected_slaves); + redirect_slaves_ex(op, redirectable_slaves, op.demotion_target, promotion_target, &redirected_slaves); if (!redirected_slaves.empty()) { StopWatch timer; /* Step 4: Finally, check that slaves are connected to the new master. Even if * time is out at this point, wait_cluster_stabilization() will check the slaves * once so that latest status is printed. */ - wait_cluster_stabilization(op, redirected_slaves); + wait_cluster_stabilization(op, redirected_slaves, promotion_target); MXS_INFO("Failover: slave replication confirmation took %.1f seconds with " "%.1f seconds to spare.", timer.lap().secs(), op.time_remaining.secs()); @@ -824,8 +834,10 @@ bool MariaDBMonitor::failover_perform(ClusterOperation& op) * * @param op Operation descriptor * @param redirected_slaves Slaves to check + * @param new_master The target server of the slave connections */ -void MariaDBMonitor::wait_cluster_stabilization(ClusterOperation& op, const ServerArray& redirected_slaves) +void MariaDBMonitor::wait_cluster_stabilization(ClusterOperation& op, const ServerArray& redirected_slaves, + const MariaDBServer* new_master) { if (redirected_slaves.empty()) { @@ -834,7 +846,6 @@ void MariaDBMonitor::wait_cluster_stabilization(ClusterOperation& op, const Serv } StopWatch timer; - const MariaDBServer* new_master = op.promotion_target; // Check all the servers in the list. Using a set because erasing from container. std::set unconfirmed(redirected_slaves.begin(), redirected_slaves.end()); ServerArray successes; @@ -1733,19 +1744,19 @@ bool MariaDBMonitor::check_gtid_replication(Log log_mode, const MariaDBServer* d /** * List slaves which should be redirected to the new master. * - * @param promotion_target The server which will be promoted - * @param demotion_target The server which will be demoted + * @param old_master The server whose slaves are listed + * @param ignored_slave A slave which should not be listed even if otherwise valid * @return A list of slaves to redirect */ -ServerArray MariaDBMonitor::get_redirectables(const MariaDBServer* promotion_target, - const MariaDBServer* demotion_target) +ServerArray MariaDBMonitor::get_redirectables(const MariaDBServer* old_master, + const MariaDBServer* ignored_slave) { ServerArray redirectable_slaves; - for (MariaDBServer* slave : demotion_target->m_node.children) + for (MariaDBServer* slave : old_master->m_node.children) { - if (slave->is_usable() && slave != promotion_target) + if (slave->is_usable() && slave != ignored_slave) { - auto sstatus = slave->slave_connection_status(demotion_target); + auto sstatus = slave->slave_connection_status(old_master); if (sstatus && !sstatus->gtid_io_pos.empty()) { redirectable_slaves.push_back(slave); diff --git a/server/modules/monitor/mariadbmon/mariadbmon.hh b/server/modules/monitor/mariadbmon/mariadbmon.hh index 11406f9c6..867c0bb02 100644 --- a/server/modules/monitor/mariadbmon/mariadbmon.hh +++ b/server/modules/monitor/mariadbmon/mariadbmon.hh @@ -283,15 +283,16 @@ private: bool check_gtid_replication(Log log_mode, const MariaDBServer* demotion_target, json_t** error_out); - ServerArray get_redirectables(const MariaDBServer* promotion_target, - const MariaDBServer* demotion_target); + ServerArray get_redirectables(const MariaDBServer* old_master, const MariaDBServer* ignored_slave); int redirect_slaves(MariaDBServer* new_master, const ServerArray& slaves, ServerArray* redirected_slaves); int redirect_slaves_ex(ClusterOperation& op, const ServerArray& slaves, + const MariaDBServer* old_master, const MariaDBServer* new_master, ServerArray* redirected_slaves); bool start_external_replication(MariaDBServer* new_master, json_t** err_out); std::string generate_change_master_cmd(const std::string& master_host, int master_port); - void wait_cluster_stabilization(ClusterOperation& op, const ServerArray& slaves); + void wait_cluster_stabilization(ClusterOperation& op, const ServerArray& slaves, + const MariaDBServer* new_master); void report_and_disable(const std::string& operation, const std::string& setting_name, bool* setting_var); diff --git a/server/modules/monitor/mariadbmon/mariadbserver.cc b/server/modules/monitor/mariadbmon/mariadbserver.cc index 7597b6a7e..712a04953 100644 --- a/server/modules/monitor/mariadbmon/mariadbserver.cc +++ b/server/modules/monitor/mariadbmon/mariadbserver.cc @@ -2041,15 +2041,14 @@ string MariaDBServer::generate_change_master_cmd(ClusterOperation& op, const Sla return change_cmd; } -bool MariaDBServer::redirect_existing_slave_conn(ClusterOperation& op) +bool MariaDBServer::redirect_existing_slave_conn(ClusterOperation& op, const MariaDBServer* old_master, + const MariaDBServer* new_master) { StopWatch timer; - const MariaDBServer* old_master = op.demotion_target; - const MariaDBServer* new_master = op.promotion_target; - auto old_conn = slave_connection_status(old_master); mxb_assert(old_conn); bool success = false; + // First, just stop the slave connection. bool stopped = stop_slave_conn(old_conn->name, StopMode::STOP_ONLY, op.time_remaining, op.error_out); op.time_remaining -= timer.restart(); diff --git a/server/modules/monitor/mariadbmon/mariadbserver.hh b/server/modules/monitor/mariadbmon/mariadbserver.hh index f8d0c2bd1..e4d04b716 100644 --- a/server/modules/monitor/mariadbmon/mariadbserver.hh +++ b/server/modules/monitor/mariadbmon/mariadbserver.hh @@ -368,12 +368,15 @@ public: bool demote(ClusterOperation& op); /** - * Redirect the slave connection going to demotion target to replicate from promotion target. + * Redirect the slave connection going to old master to replicate from new master. * * @param op Operation descriptor + * @param old_master The connection to this server is redirected + * @param new_master The new master for the redirected connection * @return True on success */ - bool redirect_existing_slave_conn(ClusterOperation& op); + bool redirect_existing_slave_conn(ClusterOperation& op, const MariaDBServer* old_master, + const MariaDBServer* new_master); /** * Copy slave connections to this server. This is usually needed during switchover promotion and on