diff --git a/server/modules/monitor/mariadbmon/cluster_manipulation.cc b/server/modules/monitor/mariadbmon/cluster_manipulation.cc index 2173dfb89..0789a8e95 100644 --- a/server/modules/monitor/mariadbmon/cluster_manipulation.cc +++ b/server/modules/monitor/mariadbmon/cluster_manipulation.cc @@ -22,6 +22,7 @@ using std::string; using std::unique_ptr; using maxscale::string_printf; +using maxbase::StopWatch; static const char RE_ENABLE_FMT[] = "To re-enable automatic %s, manually set '%s' to 'true' " "for monitor '%s' via MaxAdmin or the REST API, or restart MaxScale."; @@ -417,6 +418,46 @@ int MariaDBMonitor::redirect_slaves(MariaDBServer* new_master, return successes; } +/** + * Redirect slaves to replicate from the promotion target. + * + * @param op Operation descriptor + * @param slaves An array of slaves to redirect + * @param redirected_slaves A vector where to insert successfully redirected slaves + * @return The number of slaves successfully redirected + */ +int MariaDBMonitor::redirect_slaves_ex(ClusterOperation& op, const ServerArray& slaves, + ServerArray* redirected_slaves) +{ + mxb_assert(redirected_slaves != NULL); + if (slaves.empty()) + { + // This is ok, nothing to do. + return 0; + } + + string slave_names = monitored_servers_to_string(slaves); + MXS_NOTICE("Redirecting %s to replicate from %s instead of %s.", + slave_names.c_str(), op.promotion_target->name(), op.demotion_target->name()); + int successes = 0; + for (MariaDBServer* redirectable : slaves) + { + if (redirectable->redirect_existing_slave_conn(op)) + { + successes++; + redirected_slaves->push_back(redirectable); + } + } + if (size_t(successes) == slaves.size()) + { + MXS_NOTICE("All redirects successful."); + } + else + { + MXS_WARNING("%lu out of %lu redirects failed.", slaves.size() - successes, slaves.size()); + } + return successes; +} /** * Set the new master to replicate from the cluster external master. * @@ -723,13 +764,11 @@ bool MariaDBMonitor::switchover_perform(ClusterOperation& op) { redirected_slaves.push_back(demotion_target); } - int redirects = redirect_slaves(promotion_target, redirectable_slaves, &redirected_slaves); + int redirects = redirect_slaves_ex(op, redirectable_slaves, &redirected_slaves); bool success = redirectable_slaves.empty() ? start_ok : start_ok || redirects > 0; if (success) { - op.time_remaining -= timer.restart(); - // Step 6: Finally, add an event to the new master to advance gtid and wait for the slaves // to receive it. If using external replication, skip this step. Come up with an // alternative later. @@ -793,7 +832,6 @@ bool MariaDBMonitor::failover_perform(ClusterOperation& op) { mxb_assert(op.promotion_target && op.demotion_target); MariaDBServer* const promotion_target = op.promotion_target; - maxbase::StopWatch timer; // Step 1: Populate a vector with all slaves not the selected master. ServerArray redirectable_slaves = get_redirectables(promotion_target, op.demotion_target); @@ -802,17 +840,17 @@ bool MariaDBMonitor::failover_perform(ClusterOperation& op) // Step 2: Stop and reset slave, set read-only to 0. if (promotion_target->promote(op)) { + // Point of no return. Even if following steps fail, do not try to undo. m_next_master = promotion_target; m_cluster_modified = true; // Step 3: Redirect slaves. ServerArray redirected_slaves; - int redirects = redirect_slaves(promotion_target, redirectable_slaves, &redirected_slaves); + int redirects = redirect_slaves_ex(op, redirectable_slaves, &redirected_slaves); bool success = redirectable_slaves.empty() ? true : redirects > 0; if (success) { - op.time_remaining -= timer.restart(); - + StopWatch timer; // Step 4: Finally, add an event to the new master to advance gtid and wait for the slaves // to receive it. seconds_remaining can be 0 or less at this point. Even in such a case // wait_cluster_stabilization() may succeed if replication is fast enough. If using external @@ -1679,9 +1717,9 @@ void MariaDBMonitor::check_cluster_operations_support() * @return The first connected slave or NULL if none found */ const MariaDBServer* MariaDBMonitor::slave_receiving_events(const MariaDBServer* demotion_target, - maxbase::Duration* event_age_out) + maxbase::Duration* event_age_out) { - auto time_now = maxbase::Clock::now(); + auto time_now = maxbase::Clock::now(); maxbase::Clock::time_point alive_after = time_now - std::chrono::seconds(m_master_failure_timeout); const MariaDBServer* connected_slave = NULL; diff --git a/server/modules/monitor/mariadbmon/mariadbmon.hh b/server/modules/monitor/mariadbmon/mariadbmon.hh index 17811300b..29bc47f2f 100644 --- a/server/modules/monitor/mariadbmon/mariadbmon.hh +++ b/server/modules/monitor/mariadbmon/mariadbmon.hh @@ -273,6 +273,8 @@ private: int redirect_slaves(MariaDBServer* new_master, const ServerArray& slaves, ServerArray* redirected_slaves); + int redirect_slaves_ex(ClusterOperation& op, const ServerArray& slaves, + ServerArray* redirected_slaves); std::string generate_change_master_cmd(const std::string& master_host, int master_port); bool start_external_replication(MariaDBServer* new_master, json_t** err_out); bool wait_cluster_stabilization(MariaDBServer* new_master, diff --git a/server/modules/monitor/mariadbmon/mariadbserver.cc b/server/modules/monitor/mariadbmon/mariadbserver.cc index e351253d3..e68239f42 100644 --- a/server/modules/monitor/mariadbmon/mariadbserver.cc +++ b/server/modules/monitor/mariadbmon/mariadbserver.cc @@ -1824,7 +1824,7 @@ bool MariaDBServer::create_start_slave(ClusterOperation& op, const SlaveStatus& /** * Generate a CHANGE MASTER TO-query. * - * @param op Operation descriptor + * @param op Operation descriptor, required for username and password * @param slave_conn Existing slave connection to emulate * @return Generated query */ @@ -1832,8 +1832,8 @@ string MariaDBServer::generate_change_master_cmd(ClusterOperation& op, const Sla { string change_cmd; change_cmd += string_printf("CHANGE MASTER '%s' TO MASTER_HOST = '%s', MASTER_PORT = %i, ", - slave_conn.name.c_str(), slave_conn.master_host.c_str(), - slave_conn.master_port); + slave_conn.name.c_str(), + slave_conn.master_host.c_str(), slave_conn.master_port); change_cmd += "MASTER_USE_GTID = current_pos, "; change_cmd += string_printf("MASTER_USER = '%s', ", op.replication_user.c_str()); const char MASTER_PW[] = "MASTER_PASSWORD = '%s';"; @@ -1846,6 +1846,58 @@ string MariaDBServer::generate_change_master_cmd(ClusterOperation& op, const Sla return change_cmd; } +bool MariaDBServer::redirect_existing_slave_conn(ClusterOperation& op) +{ + StopWatch timer; + const MariaDBServer* old_master = op.demotion_target; + const MariaDBServer* new_master = op.promotion_target; + + auto old_conn = slave_connection_status_mutable(old_master); + mxb_assert(old_conn); + bool success = false; + // First, just stop the slave connection. + bool stopped = stop_slave_conn(old_conn, StopMode::STOP_ONLY, op.time_remaining, op.error_out); + op.time_remaining -= timer.restart(); + if (stopped) + { + SlaveStatus modified_conn = *old_conn; + SERVER* target_server = new_master->m_server_base->server; + modified_conn.master_host = target_server->address; + modified_conn.master_port = target_server->port; + string change_master = generate_change_master_cmd(op, modified_conn); + string error_msg; + bool changed = execute_cmd_time_limit(change_master, op.time_remaining, &error_msg); + op.time_remaining -= timer.restart(); + if (changed) + { + string start = string_printf("START SLAVE '%s';", old_conn->name.c_str()); + bool started = execute_cmd_time_limit(start, op.time_remaining, &error_msg); + op.time_remaining -= timer.restart(); + if (started) + { + success = true; + } + else + { + PRINT_MXS_JSON_ERROR(op.error_out, + "%s could not be started: %s", + modified_conn.to_short_string(name()).c_str(), + error_msg.c_str()); + } + } + else + { + // TODO: This may currently print out passwords. + PRINT_MXS_JSON_ERROR(op.error_out, + "%s could not be redirected to [%s]:%i: %s", + old_conn->to_short_string(name()).c_str(), + modified_conn.master_host.c_str(), modified_conn.master_port, + error_msg.c_str()); + } + } // 'stop_slave_conn' prints its own errors + return success; +} + string SlaveStatus::to_string() const { // Print all of this on the same line to make things compact. Are the widths reasonable? The format is diff --git a/server/modules/monitor/mariadbmon/mariadbserver.hh b/server/modules/monitor/mariadbmon/mariadbserver.hh index 67eec6dd1..3e7b78a6f 100644 --- a/server/modules/monitor/mariadbmon/mariadbserver.hh +++ b/server/modules/monitor/mariadbmon/mariadbserver.hh @@ -502,6 +502,14 @@ public: */ bool promote(ClusterOperation& operation); + /** + * Redirect the slave connection going to demotion target to replicate from promotion target. + * + * @param op Operation descriptor + * @return True on success + */ + bool redirect_existing_slave_conn(ClusterOperation& op); + private: class EventInfo; typedef std::function ManipulatorFunc;