Improve switchover undo when new master fails
Now the monitor properly restores the old master by running promotion code on it. Also, binlog is disabled when enabling server events.
This commit is contained in:
parent
53606934a6
commit
d0e6921604
@ -330,7 +330,8 @@ bool MariaDBMonitor::manual_reset_replication(SERVER* master_server, json_t** er
|
||||
{
|
||||
if (old_master)
|
||||
{
|
||||
if (!new_master->enable_events(old_master->m_enabled_events, error_out))
|
||||
if (!new_master->enable_events(MariaDBServer::BinlogMode::BINLOG_ON,
|
||||
old_master->m_enabled_events, error_out))
|
||||
{
|
||||
error = true;
|
||||
PRINT_MXS_JSON_ERROR(error_out, "Could not enable events on '%s': %s",
|
||||
@ -411,36 +412,6 @@ bool MariaDBMonitor::manual_reset_replication(SERVER* master_server, json_t** er
|
||||
return rval;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate a CHANGE MASTER TO-query. TODO: Use the version in MariaDBServer instead.
|
||||
*
|
||||
* @param master_host Master hostname/address
|
||||
* @param master_port Master port
|
||||
* @return Generated query
|
||||
*/
|
||||
string MariaDBMonitor::generate_change_master_cmd(const string& master_host, int master_port)
|
||||
{
|
||||
std::stringstream change_cmd;
|
||||
change_cmd << "CHANGE MASTER TO MASTER_HOST = '" << master_host << "', ";
|
||||
change_cmd << "MASTER_PORT = " << master_port << ", ";
|
||||
change_cmd << "MASTER_USE_GTID = current_pos, ";
|
||||
if (m_settings.shared.replication_ssl)
|
||||
{
|
||||
change_cmd << "MASTER_SSL = 1, ";
|
||||
}
|
||||
change_cmd << "MASTER_USER = '" << m_settings.shared.replication_user << "', ";
|
||||
const char MASTER_PW[] = "MASTER_PASSWORD = '";
|
||||
const char END[] = "';";
|
||||
#if defined (SS_DEBUG)
|
||||
std::stringstream change_cmd_nopw;
|
||||
change_cmd_nopw << change_cmd.str();
|
||||
change_cmd_nopw << MASTER_PW << "******" << END;
|
||||
MXS_DEBUG("Change master command is '%s'.", change_cmd_nopw.str().c_str());
|
||||
#endif
|
||||
change_cmd << MASTER_PW << m_settings.shared.replication_password << END;
|
||||
return change_cmd.str();
|
||||
}
|
||||
|
||||
/**
|
||||
* Redirect slave connections from the promotion target to replicate from the demotion target and vice versa.
|
||||
*
|
||||
@ -569,34 +540,6 @@ int MariaDBMonitor::redirect_slaves_ex(GeneralOpData& general, OperationType typ
|
||||
}
|
||||
return successes;
|
||||
}
|
||||
/**
|
||||
* Set the new master to replicate from the cluster external master.
|
||||
*
|
||||
* @param new_master The server being promoted
|
||||
* @param err_out Error output
|
||||
* @return True if new master accepted commands
|
||||
*/
|
||||
bool MariaDBMonitor::start_external_replication(MariaDBServer* new_master, json_t** err_out)
|
||||
{
|
||||
bool rval = false;
|
||||
MYSQL* new_master_conn = new_master->m_server_base->con;
|
||||
string change_cmd = generate_change_master_cmd(m_external_master_host, m_external_master_port);
|
||||
if (mxs_mysql_query(new_master_conn, change_cmd.c_str()) == 0
|
||||
&& mxs_mysql_query(new_master_conn, "START SLAVE;") == 0)
|
||||
{
|
||||
MXS_NOTICE("New master starting replication from external master %s:%d.",
|
||||
m_external_master_host.c_str(),
|
||||
m_external_master_port);
|
||||
rval = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
PRINT_MXS_JSON_ERROR(err_out,
|
||||
"Could not start replication from external master: '%s'.",
|
||||
mysql_error(new_master_conn));
|
||||
}
|
||||
return rval;
|
||||
}
|
||||
|
||||
/**
|
||||
* (Re)join given servers to the cluster. The servers in the array are assumed to be joinable.
|
||||
@ -885,25 +828,20 @@ bool MariaDBMonitor::switchover_perform(SwitchoverParams& op)
|
||||
|
||||
if (!catchup_and_promote_success)
|
||||
{
|
||||
// Step 2 or 3 failed, try to undo step 2.
|
||||
const char QUERY_UNDO[] = "SET GLOBAL read_only=0;";
|
||||
if (mxs_mysql_query(demotion_target->m_server_base->con, QUERY_UNDO) == 0)
|
||||
// Step 2 or 3 failed, try to undo step 1 by promoting the demotion target back to master.
|
||||
// Reset the time limit since the last part may have used it all.
|
||||
MXS_NOTICE("Attempting to undo changes to '%s'.", demotion_target->name());
|
||||
const mxb::Duration demotion_undo_time_limit((double)m_settings.switchover_timeout);
|
||||
GeneralOpData general_undo(op.general.error_out, demotion_undo_time_limit);
|
||||
if (demotion_target->promote(general_undo, op.promotion, OperationType::UNDO_DEMOTION, nullptr))
|
||||
{
|
||||
PRINT_MXS_JSON_ERROR(error_out, "read_only disabled on server '%s'.",
|
||||
demotion_target->name());
|
||||
MXS_NOTICE("'%s' restored to original status.", demotion_target->name());
|
||||
}
|
||||
else
|
||||
{
|
||||
PRINT_MXS_JSON_ERROR(error_out,
|
||||
"Could not disable read_only on server '%s': '%s'.",
|
||||
demotion_target->name(),
|
||||
mysql_error(demotion_target->m_server_base->con));
|
||||
}
|
||||
|
||||
// Try to reactivate external replication if any.
|
||||
if (m_external_master_port != PORT_UNKNOWN)
|
||||
{
|
||||
start_external_replication(promotion_target, error_out);
|
||||
"Restoring of '%s' failed, cluster may be in an invalid state.",
|
||||
demotion_target->name());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -336,8 +336,7 @@ private:
|
||||
const MariaDBServer* demotion_target,
|
||||
ServerArray* redirected_to_promo,
|
||||
ServerArray* redirected_to_demo);
|
||||
bool start_external_replication(MariaDBServer* new_master, json_t** err_out);
|
||||
std::string generate_change_master_cmd(const std::string& master_host, int master_port);
|
||||
|
||||
void wait_cluster_stabilization(GeneralOpData& op, const ServerArray& slaves,
|
||||
const MariaDBServer* new_master);
|
||||
|
||||
|
@ -1229,7 +1229,7 @@ const SlaveStatus* MariaDBServer::slave_connection_status_host_port(const MariaD
|
||||
return NULL;
|
||||
}
|
||||
|
||||
bool MariaDBServer::enable_events(const EventNameSet& event_names, json_t** error_out)
|
||||
bool MariaDBServer::enable_events(BinlogMode binlog_mode, const EventNameSet& event_names, json_t** error_out)
|
||||
{
|
||||
int found_disabled_events = 0;
|
||||
int events_enabled = 0;
|
||||
@ -1248,6 +1248,17 @@ bool MariaDBServer::enable_events(const EventNameSet& event_names, json_t** erro
|
||||
}
|
||||
};
|
||||
|
||||
string error_msg;
|
||||
if (binlog_mode == BinlogMode::BINLOG_OFF)
|
||||
{
|
||||
if (!execute_cmd("SET @@session.sql_log_bin=0;", &error_msg))
|
||||
{
|
||||
const char FMT[] = "Could not disable session binlog on '%s': %s Server events not enabled.";
|
||||
PRINT_MXS_JSON_ERROR(error_out, FMT, name(), error_msg.c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
bool rval = false;
|
||||
if (events_foreach(enabler, error_out))
|
||||
{
|
||||
@ -1260,6 +1271,12 @@ bool MariaDBServer::enable_events(const EventNameSet& event_names, json_t** erro
|
||||
rval = true;
|
||||
}
|
||||
}
|
||||
if (binlog_mode == BinlogMode::BINLOG_OFF)
|
||||
{
|
||||
// Failure in re-enabling the session binlog doesn't really matter because we don't want the monitor
|
||||
// generating binlog events anyway.
|
||||
execute_cmd("SET @@session.sql_log_bin=1;");
|
||||
}
|
||||
return rval;
|
||||
}
|
||||
|
||||
@ -1459,36 +1476,40 @@ bool MariaDBServer::reset_all_slave_conns(json_t** error_out)
|
||||
bool MariaDBServer::promote(GeneralOpData& general, ServerOperation& promotion, OperationType type,
|
||||
const MariaDBServer* demotion_target)
|
||||
{
|
||||
mxb_assert(type == OperationType::SWITCHOVER || type == OperationType::FAILOVER);
|
||||
mxb_assert(type == OperationType::SWITCHOVER || type == OperationType::FAILOVER
|
||||
|| type == OperationType::UNDO_DEMOTION);
|
||||
json_t** const error_out = general.error_out;
|
||||
// Function should only be called for a master-slave pair.
|
||||
auto master_conn = slave_connection_status(demotion_target);
|
||||
mxb_assert(master_conn);
|
||||
if (master_conn == NULL)
|
||||
|
||||
StopWatch timer;
|
||||
bool stopped = false;
|
||||
if (type == OperationType::SWITCHOVER || type == OperationType::FAILOVER)
|
||||
{
|
||||
PRINT_MXS_JSON_ERROR(error_out,
|
||||
"'%s' is not a slave of '%s' and cannot be promoted to its place.",
|
||||
name(), demotion_target->name());
|
||||
return false;
|
||||
// In normal circumstances, this should only be called for a master-slave pair.
|
||||
auto master_conn = slave_connection_status(demotion_target);
|
||||
mxb_assert(master_conn);
|
||||
if (master_conn == NULL)
|
||||
{
|
||||
PRINT_MXS_JSON_ERROR(error_out,
|
||||
"'%s' is not a slave of '%s' and cannot be promoted to its place.",
|
||||
name(), demotion_target->name());
|
||||
return false;
|
||||
}
|
||||
|
||||
// Step 1: Stop & reset slave connections. If doing a failover, only remove the connection to demotion
|
||||
// target. In case of switchover, remove other slave connections as well since the demotion target
|
||||
// will take them over.
|
||||
if (type == OperationType::SWITCHOVER)
|
||||
{
|
||||
stopped = remove_slave_conns(general, m_slave_status);
|
||||
}
|
||||
else if (type == OperationType::FAILOVER)
|
||||
{
|
||||
stopped = remove_slave_conns(general, {*master_conn});
|
||||
}
|
||||
}
|
||||
|
||||
bool success = false;
|
||||
StopWatch timer;
|
||||
// Step 1: Stop & reset slave connections. If doing a failover, only remove the connection to demotion
|
||||
// target. In case of switchover, remove other slave connections as well since the demotion target
|
||||
// will take them over.
|
||||
bool stopped = false;
|
||||
if (type == OperationType::SWITCHOVER)
|
||||
{
|
||||
stopped = remove_slave_conns(general, m_slave_status);
|
||||
}
|
||||
else if (type == OperationType::FAILOVER)
|
||||
{
|
||||
stopped = remove_slave_conns(general, {*master_conn});
|
||||
master_conn = NULL; // The connection pointed to may no longer exist.
|
||||
}
|
||||
|
||||
if (stopped)
|
||||
if (stopped || type == OperationType::UNDO_DEMOTION)
|
||||
{
|
||||
// Step 2: If demotion target is master, meaning this server will become the master,
|
||||
// enable writing and scheduled events. Also, run promotion_sql_file.
|
||||
@ -1507,7 +1528,8 @@ bool MariaDBServer::promote(GeneralOpData& general, ServerOperation& promotion,
|
||||
if (m_settings.handle_event_scheduler)
|
||||
{
|
||||
// TODO: Add query replying to enable_events
|
||||
bool events_enabled = enable_events(promotion.events_to_enable, error_out);
|
||||
bool events_enabled = enable_events(BinlogMode::BINLOG_OFF, promotion.events_to_enable,
|
||||
error_out);
|
||||
general.time_remaining -= timer.restart();
|
||||
if (!events_enabled)
|
||||
{
|
||||
@ -1561,6 +1583,18 @@ bool MariaDBServer::promote(GeneralOpData& general, ServerOperation& promotion,
|
||||
demotion_target->name(), name());
|
||||
}
|
||||
}
|
||||
else if (type == OperationType::UNDO_DEMOTION)
|
||||
{
|
||||
if (copy_slave_conns(general, promotion.conns_to_copy, nullptr))
|
||||
{
|
||||
success = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
PRINT_MXS_JSON_ERROR(error_out, "Could not restore slave connections of '%s' when "
|
||||
"reversing demotion.", name());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return success;
|
||||
@ -2001,12 +2035,25 @@ bool MariaDBServer::copy_slave_conns(GeneralOpData& op, const SlaveStatusArray&
|
||||
{
|
||||
// Any slave connection that was going to this server itself are instead directed
|
||||
// to the replacement server.
|
||||
bool ok_to_copy = true;
|
||||
if (slave_conn.master_server_id == m_server_id)
|
||||
{
|
||||
slave_conn.master_host = replacement->m_server_base->server->address;
|
||||
slave_conn.master_port = replacement->m_server_base->server->port;
|
||||
if (replacement)
|
||||
{
|
||||
slave_conn.master_host = replacement->m_server_base->server->address;
|
||||
slave_conn.master_port = replacement->m_server_base->server->port;
|
||||
}
|
||||
else
|
||||
{
|
||||
// This is only possible if replication is configured wrong and we are
|
||||
// undoing a switchover demotion.
|
||||
MXB_WARNING("Server id:s of '%s' and [%s]:%i are identical, not copying the connection "
|
||||
"to '%s'.",
|
||||
name(), slave_conn.master_host.c_str(), slave_conn.master_port, name());
|
||||
}
|
||||
}
|
||||
if (!create_start_slave(op, slave_conn))
|
||||
|
||||
if (ok_to_copy && !create_start_slave(op, slave_conn))
|
||||
{
|
||||
start_slave_error = true;
|
||||
}
|
||||
|
@ -347,11 +347,12 @@ public:
|
||||
* Enable any "SLAVESIDE_DISABLED" or "DISABLED events. Event scheduler is not touched. Only events
|
||||
* with names matching an element in the event_names set are enabled.
|
||||
*
|
||||
* @param binlog_mode If OFF, binlog event creation is disabled for the session during method execution.
|
||||
* @param event_names Names of events that should be enabled
|
||||
* @param error_out Error output
|
||||
* @return True if all SLAVESIDE_DISABLED events were enabled
|
||||
*/
|
||||
bool enable_events(const EventNameSet& event_names, json_t** error_out);
|
||||
bool enable_events(BinlogMode binlog_mode, const EventNameSet& event_names, json_t** error_out);
|
||||
|
||||
/**
|
||||
* Disable any "ENABLED" events. Event scheduler is not touched.
|
||||
|
@ -222,7 +222,8 @@ using EventNameSet = std::unordered_set<std::string>;
|
||||
enum class OperationType
|
||||
{
|
||||
SWITCHOVER,
|
||||
FAILOVER
|
||||
FAILOVER,
|
||||
UNDO_DEMOTION // Performed when switchover fails in its first stages.
|
||||
};
|
||||
|
||||
class GeneralOpData
|
||||
|
Loading…
x
Reference in New Issue
Block a user