From d0e6921604adf08cb635fadd57fdb44d59bdfcab Mon Sep 17 00:00:00 2001 From: Esa Korhonen Date: Fri, 7 Jun 2019 17:26:17 +0300 Subject: [PATCH] Improve switchover undo when new master fails Now the monitor properly restores the old master by running promotion code on it. Also, binlog is disabled when enabling server events. --- .../mariadbmon/cluster_manipulation.cc | 84 ++------------ .../modules/monitor/mariadbmon/mariadbmon.hh | 3 +- .../monitor/mariadbmon/mariadbserver.cc | 107 +++++++++++++----- .../monitor/mariadbmon/mariadbserver.hh | 3 +- .../monitor/mariadbmon/server_utils.hh | 3 +- 5 files changed, 93 insertions(+), 107 deletions(-) diff --git a/server/modules/monitor/mariadbmon/cluster_manipulation.cc b/server/modules/monitor/mariadbmon/cluster_manipulation.cc index b0fec14b3..4e68c94dd 100644 --- a/server/modules/monitor/mariadbmon/cluster_manipulation.cc +++ b/server/modules/monitor/mariadbmon/cluster_manipulation.cc @@ -330,7 +330,8 @@ bool MariaDBMonitor::manual_reset_replication(SERVER* master_server, json_t** er { if (old_master) { - if (!new_master->enable_events(old_master->m_enabled_events, error_out)) + if (!new_master->enable_events(MariaDBServer::BinlogMode::BINLOG_ON, + old_master->m_enabled_events, error_out)) { error = true; PRINT_MXS_JSON_ERROR(error_out, "Could not enable events on '%s': %s", @@ -411,36 +412,6 @@ bool MariaDBMonitor::manual_reset_replication(SERVER* master_server, json_t** er return rval; } -/** - * Generate a CHANGE MASTER TO-query. TODO: Use the version in MariaDBServer instead. - * - * @param master_host Master hostname/address - * @param master_port Master port - * @return Generated query - */ -string MariaDBMonitor::generate_change_master_cmd(const string& master_host, int master_port) -{ - std::stringstream change_cmd; - change_cmd << "CHANGE MASTER TO MASTER_HOST = '" << master_host << "', "; - change_cmd << "MASTER_PORT = " << master_port << ", "; - change_cmd << "MASTER_USE_GTID = current_pos, "; - if (m_settings.shared.replication_ssl) - { - change_cmd << "MASTER_SSL = 1, "; - } - change_cmd << "MASTER_USER = '" << m_settings.shared.replication_user << "', "; - const char MASTER_PW[] = "MASTER_PASSWORD = '"; - const char END[] = "';"; -#if defined (SS_DEBUG) - std::stringstream change_cmd_nopw; - change_cmd_nopw << change_cmd.str(); - change_cmd_nopw << MASTER_PW << "******" << END; - MXS_DEBUG("Change master command is '%s'.", change_cmd_nopw.str().c_str()); -#endif - change_cmd << MASTER_PW << m_settings.shared.replication_password << END; - return change_cmd.str(); -} - /** * Redirect slave connections from the promotion target to replicate from the demotion target and vice versa. * @@ -569,34 +540,6 @@ int MariaDBMonitor::redirect_slaves_ex(GeneralOpData& general, OperationType typ } return successes; } -/** - * Set the new master to replicate from the cluster external master. - * - * @param new_master The server being promoted - * @param err_out Error output - * @return True if new master accepted commands - */ -bool MariaDBMonitor::start_external_replication(MariaDBServer* new_master, json_t** err_out) -{ - bool rval = false; - MYSQL* new_master_conn = new_master->m_server_base->con; - string change_cmd = generate_change_master_cmd(m_external_master_host, m_external_master_port); - if (mxs_mysql_query(new_master_conn, change_cmd.c_str()) == 0 - && mxs_mysql_query(new_master_conn, "START SLAVE;") == 0) - { - MXS_NOTICE("New master starting replication from external master %s:%d.", - m_external_master_host.c_str(), - m_external_master_port); - rval = true; - } - else - { - PRINT_MXS_JSON_ERROR(err_out, - "Could not start replication from external master: '%s'.", - mysql_error(new_master_conn)); - } - return rval; -} /** * (Re)join given servers to the cluster. The servers in the array are assumed to be joinable. @@ -885,25 +828,20 @@ bool MariaDBMonitor::switchover_perform(SwitchoverParams& op) if (!catchup_and_promote_success) { - // Step 2 or 3 failed, try to undo step 2. - const char QUERY_UNDO[] = "SET GLOBAL read_only=0;"; - if (mxs_mysql_query(demotion_target->m_server_base->con, QUERY_UNDO) == 0) + // Step 2 or 3 failed, try to undo step 1 by promoting the demotion target back to master. + // Reset the time limit since the last part may have used it all. + MXS_NOTICE("Attempting to undo changes to '%s'.", demotion_target->name()); + const mxb::Duration demotion_undo_time_limit((double)m_settings.switchover_timeout); + GeneralOpData general_undo(op.general.error_out, demotion_undo_time_limit); + if (demotion_target->promote(general_undo, op.promotion, OperationType::UNDO_DEMOTION, nullptr)) { - PRINT_MXS_JSON_ERROR(error_out, "read_only disabled on server '%s'.", - demotion_target->name()); + MXS_NOTICE("'%s' restored to original status.", demotion_target->name()); } else { PRINT_MXS_JSON_ERROR(error_out, - "Could not disable read_only on server '%s': '%s'.", - demotion_target->name(), - mysql_error(demotion_target->m_server_base->con)); - } - - // Try to reactivate external replication if any. - if (m_external_master_port != PORT_UNKNOWN) - { - start_external_replication(promotion_target, error_out); + "Restoring of '%s' failed, cluster may be in an invalid state.", + demotion_target->name()); } } } diff --git a/server/modules/monitor/mariadbmon/mariadbmon.hh b/server/modules/monitor/mariadbmon/mariadbmon.hh index 9c14ded07..126e30c7a 100644 --- a/server/modules/monitor/mariadbmon/mariadbmon.hh +++ b/server/modules/monitor/mariadbmon/mariadbmon.hh @@ -336,8 +336,7 @@ private: const MariaDBServer* demotion_target, ServerArray* redirected_to_promo, ServerArray* redirected_to_demo); - bool start_external_replication(MariaDBServer* new_master, json_t** err_out); - std::string generate_change_master_cmd(const std::string& master_host, int master_port); + void wait_cluster_stabilization(GeneralOpData& op, const ServerArray& slaves, const MariaDBServer* new_master); diff --git a/server/modules/monitor/mariadbmon/mariadbserver.cc b/server/modules/monitor/mariadbmon/mariadbserver.cc index 55968d6e2..7dc95c52b 100644 --- a/server/modules/monitor/mariadbmon/mariadbserver.cc +++ b/server/modules/monitor/mariadbmon/mariadbserver.cc @@ -1229,7 +1229,7 @@ const SlaveStatus* MariaDBServer::slave_connection_status_host_port(const MariaD return NULL; } -bool MariaDBServer::enable_events(const EventNameSet& event_names, json_t** error_out) +bool MariaDBServer::enable_events(BinlogMode binlog_mode, const EventNameSet& event_names, json_t** error_out) { int found_disabled_events = 0; int events_enabled = 0; @@ -1248,6 +1248,17 @@ bool MariaDBServer::enable_events(const EventNameSet& event_names, json_t** erro } }; + string error_msg; + if (binlog_mode == BinlogMode::BINLOG_OFF) + { + if (!execute_cmd("SET @@session.sql_log_bin=0;", &error_msg)) + { + const char FMT[] = "Could not disable session binlog on '%s': %s Server events not enabled."; + PRINT_MXS_JSON_ERROR(error_out, FMT, name(), error_msg.c_str()); + return false; + } + } + bool rval = false; if (events_foreach(enabler, error_out)) { @@ -1260,6 +1271,12 @@ bool MariaDBServer::enable_events(const EventNameSet& event_names, json_t** erro rval = true; } } + if (binlog_mode == BinlogMode::BINLOG_OFF) + { + // Failure in re-enabling the session binlog doesn't really matter because we don't want the monitor + // generating binlog events anyway. + execute_cmd("SET @@session.sql_log_bin=1;"); + } return rval; } @@ -1459,36 +1476,40 @@ bool MariaDBServer::reset_all_slave_conns(json_t** error_out) bool MariaDBServer::promote(GeneralOpData& general, ServerOperation& promotion, OperationType type, const MariaDBServer* demotion_target) { - mxb_assert(type == OperationType::SWITCHOVER || type == OperationType::FAILOVER); + mxb_assert(type == OperationType::SWITCHOVER || type == OperationType::FAILOVER + || type == OperationType::UNDO_DEMOTION); json_t** const error_out = general.error_out; - // Function should only be called for a master-slave pair. - auto master_conn = slave_connection_status(demotion_target); - mxb_assert(master_conn); - if (master_conn == NULL) + + StopWatch timer; + bool stopped = false; + if (type == OperationType::SWITCHOVER || type == OperationType::FAILOVER) { - PRINT_MXS_JSON_ERROR(error_out, - "'%s' is not a slave of '%s' and cannot be promoted to its place.", - name(), demotion_target->name()); - return false; + // In normal circumstances, this should only be called for a master-slave pair. + auto master_conn = slave_connection_status(demotion_target); + mxb_assert(master_conn); + if (master_conn == NULL) + { + PRINT_MXS_JSON_ERROR(error_out, + "'%s' is not a slave of '%s' and cannot be promoted to its place.", + name(), demotion_target->name()); + return false; + } + + // Step 1: Stop & reset slave connections. If doing a failover, only remove the connection to demotion + // target. In case of switchover, remove other slave connections as well since the demotion target + // will take them over. + if (type == OperationType::SWITCHOVER) + { + stopped = remove_slave_conns(general, m_slave_status); + } + else if (type == OperationType::FAILOVER) + { + stopped = remove_slave_conns(general, {*master_conn}); + } } bool success = false; - StopWatch timer; - // Step 1: Stop & reset slave connections. If doing a failover, only remove the connection to demotion - // target. In case of switchover, remove other slave connections as well since the demotion target - // will take them over. - bool stopped = false; - if (type == OperationType::SWITCHOVER) - { - stopped = remove_slave_conns(general, m_slave_status); - } - else if (type == OperationType::FAILOVER) - { - stopped = remove_slave_conns(general, {*master_conn}); - master_conn = NULL; // The connection pointed to may no longer exist. - } - - if (stopped) + if (stopped || type == OperationType::UNDO_DEMOTION) { // Step 2: If demotion target is master, meaning this server will become the master, // enable writing and scheduled events. Also, run promotion_sql_file. @@ -1507,7 +1528,8 @@ bool MariaDBServer::promote(GeneralOpData& general, ServerOperation& promotion, if (m_settings.handle_event_scheduler) { // TODO: Add query replying to enable_events - bool events_enabled = enable_events(promotion.events_to_enable, error_out); + bool events_enabled = enable_events(BinlogMode::BINLOG_OFF, promotion.events_to_enable, + error_out); general.time_remaining -= timer.restart(); if (!events_enabled) { @@ -1561,6 +1583,18 @@ bool MariaDBServer::promote(GeneralOpData& general, ServerOperation& promotion, demotion_target->name(), name()); } } + else if (type == OperationType::UNDO_DEMOTION) + { + if (copy_slave_conns(general, promotion.conns_to_copy, nullptr)) + { + success = true; + } + else + { + PRINT_MXS_JSON_ERROR(error_out, "Could not restore slave connections of '%s' when " + "reversing demotion.", name()); + } + } } } return success; @@ -2001,12 +2035,25 @@ bool MariaDBServer::copy_slave_conns(GeneralOpData& op, const SlaveStatusArray& { // Any slave connection that was going to this server itself are instead directed // to the replacement server. + bool ok_to_copy = true; if (slave_conn.master_server_id == m_server_id) { - slave_conn.master_host = replacement->m_server_base->server->address; - slave_conn.master_port = replacement->m_server_base->server->port; + if (replacement) + { + slave_conn.master_host = replacement->m_server_base->server->address; + slave_conn.master_port = replacement->m_server_base->server->port; + } + else + { + // This is only possible if replication is configured wrong and we are + // undoing a switchover demotion. + MXB_WARNING("Server id:s of '%s' and [%s]:%i are identical, not copying the connection " + "to '%s'.", + name(), slave_conn.master_host.c_str(), slave_conn.master_port, name()); + } } - if (!create_start_slave(op, slave_conn)) + + if (ok_to_copy && !create_start_slave(op, slave_conn)) { start_slave_error = true; } diff --git a/server/modules/monitor/mariadbmon/mariadbserver.hh b/server/modules/monitor/mariadbmon/mariadbserver.hh index 913fac7af..64e2f6730 100644 --- a/server/modules/monitor/mariadbmon/mariadbserver.hh +++ b/server/modules/monitor/mariadbmon/mariadbserver.hh @@ -347,11 +347,12 @@ public: * Enable any "SLAVESIDE_DISABLED" or "DISABLED events. Event scheduler is not touched. Only events * with names matching an element in the event_names set are enabled. * + * @param binlog_mode If OFF, binlog event creation is disabled for the session during method execution. * @param event_names Names of events that should be enabled * @param error_out Error output * @return True if all SLAVESIDE_DISABLED events were enabled */ - bool enable_events(const EventNameSet& event_names, json_t** error_out); + bool enable_events(BinlogMode binlog_mode, const EventNameSet& event_names, json_t** error_out); /** * Disable any "ENABLED" events. Event scheduler is not touched. diff --git a/server/modules/monitor/mariadbmon/server_utils.hh b/server/modules/monitor/mariadbmon/server_utils.hh index e87224d0e..752bfefc9 100644 --- a/server/modules/monitor/mariadbmon/server_utils.hh +++ b/server/modules/monitor/mariadbmon/server_utils.hh @@ -222,7 +222,8 @@ using EventNameSet = std::unordered_set; enum class OperationType { SWITCHOVER, - FAILOVER + FAILOVER, + UNDO_DEMOTION // Performed when switchover fails in its first stages. }; class GeneralOpData