Improve switchover undo when new master fails
Now the monitor properly restores the old master by running promotion code on it. Also, binlog is disabled when enabling server events.
This commit is contained in:
@ -330,7 +330,8 @@ bool MariaDBMonitor::manual_reset_replication(SERVER* master_server, json_t** er
|
|||||||
{
|
{
|
||||||
if (old_master)
|
if (old_master)
|
||||||
{
|
{
|
||||||
if (!new_master->enable_events(old_master->m_enabled_events, error_out))
|
if (!new_master->enable_events(MariaDBServer::BinlogMode::BINLOG_ON,
|
||||||
|
old_master->m_enabled_events, error_out))
|
||||||
{
|
{
|
||||||
error = true;
|
error = true;
|
||||||
PRINT_MXS_JSON_ERROR(error_out, "Could not enable events on '%s': %s",
|
PRINT_MXS_JSON_ERROR(error_out, "Could not enable events on '%s': %s",
|
||||||
@ -411,36 +412,6 @@ bool MariaDBMonitor::manual_reset_replication(SERVER* master_server, json_t** er
|
|||||||
return rval;
|
return rval;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Generate a CHANGE MASTER TO-query. TODO: Use the version in MariaDBServer instead.
|
|
||||||
*
|
|
||||||
* @param master_host Master hostname/address
|
|
||||||
* @param master_port Master port
|
|
||||||
* @return Generated query
|
|
||||||
*/
|
|
||||||
string MariaDBMonitor::generate_change_master_cmd(const string& master_host, int master_port)
|
|
||||||
{
|
|
||||||
std::stringstream change_cmd;
|
|
||||||
change_cmd << "CHANGE MASTER TO MASTER_HOST = '" << master_host << "', ";
|
|
||||||
change_cmd << "MASTER_PORT = " << master_port << ", ";
|
|
||||||
change_cmd << "MASTER_USE_GTID = current_pos, ";
|
|
||||||
if (m_settings.shared.replication_ssl)
|
|
||||||
{
|
|
||||||
change_cmd << "MASTER_SSL = 1, ";
|
|
||||||
}
|
|
||||||
change_cmd << "MASTER_USER = '" << m_settings.shared.replication_user << "', ";
|
|
||||||
const char MASTER_PW[] = "MASTER_PASSWORD = '";
|
|
||||||
const char END[] = "';";
|
|
||||||
#if defined (SS_DEBUG)
|
|
||||||
std::stringstream change_cmd_nopw;
|
|
||||||
change_cmd_nopw << change_cmd.str();
|
|
||||||
change_cmd_nopw << MASTER_PW << "******" << END;
|
|
||||||
MXS_DEBUG("Change master command is '%s'.", change_cmd_nopw.str().c_str());
|
|
||||||
#endif
|
|
||||||
change_cmd << MASTER_PW << m_settings.shared.replication_password << END;
|
|
||||||
return change_cmd.str();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Redirect slave connections from the promotion target to replicate from the demotion target and vice versa.
|
* Redirect slave connections from the promotion target to replicate from the demotion target and vice versa.
|
||||||
*
|
*
|
||||||
@ -569,34 +540,6 @@ int MariaDBMonitor::redirect_slaves_ex(GeneralOpData& general, OperationType typ
|
|||||||
}
|
}
|
||||||
return successes;
|
return successes;
|
||||||
}
|
}
|
||||||
/**
|
|
||||||
* Set the new master to replicate from the cluster external master.
|
|
||||||
*
|
|
||||||
* @param new_master The server being promoted
|
|
||||||
* @param err_out Error output
|
|
||||||
* @return True if new master accepted commands
|
|
||||||
*/
|
|
||||||
bool MariaDBMonitor::start_external_replication(MariaDBServer* new_master, json_t** err_out)
|
|
||||||
{
|
|
||||||
bool rval = false;
|
|
||||||
MYSQL* new_master_conn = new_master->m_server_base->con;
|
|
||||||
string change_cmd = generate_change_master_cmd(m_external_master_host, m_external_master_port);
|
|
||||||
if (mxs_mysql_query(new_master_conn, change_cmd.c_str()) == 0
|
|
||||||
&& mxs_mysql_query(new_master_conn, "START SLAVE;") == 0)
|
|
||||||
{
|
|
||||||
MXS_NOTICE("New master starting replication from external master %s:%d.",
|
|
||||||
m_external_master_host.c_str(),
|
|
||||||
m_external_master_port);
|
|
||||||
rval = true;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
PRINT_MXS_JSON_ERROR(err_out,
|
|
||||||
"Could not start replication from external master: '%s'.",
|
|
||||||
mysql_error(new_master_conn));
|
|
||||||
}
|
|
||||||
return rval;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* (Re)join given servers to the cluster. The servers in the array are assumed to be joinable.
|
* (Re)join given servers to the cluster. The servers in the array are assumed to be joinable.
|
||||||
@ -885,25 +828,20 @@ bool MariaDBMonitor::switchover_perform(SwitchoverParams& op)
|
|||||||
|
|
||||||
if (!catchup_and_promote_success)
|
if (!catchup_and_promote_success)
|
||||||
{
|
{
|
||||||
// Step 2 or 3 failed, try to undo step 2.
|
// Step 2 or 3 failed, try to undo step 1 by promoting the demotion target back to master.
|
||||||
const char QUERY_UNDO[] = "SET GLOBAL read_only=0;";
|
// Reset the time limit since the last part may have used it all.
|
||||||
if (mxs_mysql_query(demotion_target->m_server_base->con, QUERY_UNDO) == 0)
|
MXS_NOTICE("Attempting to undo changes to '%s'.", demotion_target->name());
|
||||||
|
const mxb::Duration demotion_undo_time_limit((double)m_settings.switchover_timeout);
|
||||||
|
GeneralOpData general_undo(op.general.error_out, demotion_undo_time_limit);
|
||||||
|
if (demotion_target->promote(general_undo, op.promotion, OperationType::UNDO_DEMOTION, nullptr))
|
||||||
{
|
{
|
||||||
PRINT_MXS_JSON_ERROR(error_out, "read_only disabled on server '%s'.",
|
MXS_NOTICE("'%s' restored to original status.", demotion_target->name());
|
||||||
demotion_target->name());
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
PRINT_MXS_JSON_ERROR(error_out,
|
PRINT_MXS_JSON_ERROR(error_out,
|
||||||
"Could not disable read_only on server '%s': '%s'.",
|
"Restoring of '%s' failed, cluster may be in an invalid state.",
|
||||||
demotion_target->name(),
|
demotion_target->name());
|
||||||
mysql_error(demotion_target->m_server_base->con));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Try to reactivate external replication if any.
|
|
||||||
if (m_external_master_port != PORT_UNKNOWN)
|
|
||||||
{
|
|
||||||
start_external_replication(promotion_target, error_out);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -336,8 +336,7 @@ private:
|
|||||||
const MariaDBServer* demotion_target,
|
const MariaDBServer* demotion_target,
|
||||||
ServerArray* redirected_to_promo,
|
ServerArray* redirected_to_promo,
|
||||||
ServerArray* redirected_to_demo);
|
ServerArray* redirected_to_demo);
|
||||||
bool start_external_replication(MariaDBServer* new_master, json_t** err_out);
|
|
||||||
std::string generate_change_master_cmd(const std::string& master_host, int master_port);
|
|
||||||
void wait_cluster_stabilization(GeneralOpData& op, const ServerArray& slaves,
|
void wait_cluster_stabilization(GeneralOpData& op, const ServerArray& slaves,
|
||||||
const MariaDBServer* new_master);
|
const MariaDBServer* new_master);
|
||||||
|
|
||||||
|
@ -1229,7 +1229,7 @@ const SlaveStatus* MariaDBServer::slave_connection_status_host_port(const MariaD
|
|||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool MariaDBServer::enable_events(const EventNameSet& event_names, json_t** error_out)
|
bool MariaDBServer::enable_events(BinlogMode binlog_mode, const EventNameSet& event_names, json_t** error_out)
|
||||||
{
|
{
|
||||||
int found_disabled_events = 0;
|
int found_disabled_events = 0;
|
||||||
int events_enabled = 0;
|
int events_enabled = 0;
|
||||||
@ -1248,6 +1248,17 @@ bool MariaDBServer::enable_events(const EventNameSet& event_names, json_t** erro
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
string error_msg;
|
||||||
|
if (binlog_mode == BinlogMode::BINLOG_OFF)
|
||||||
|
{
|
||||||
|
if (!execute_cmd("SET @@session.sql_log_bin=0;", &error_msg))
|
||||||
|
{
|
||||||
|
const char FMT[] = "Could not disable session binlog on '%s': %s Server events not enabled.";
|
||||||
|
PRINT_MXS_JSON_ERROR(error_out, FMT, name(), error_msg.c_str());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
bool rval = false;
|
bool rval = false;
|
||||||
if (events_foreach(enabler, error_out))
|
if (events_foreach(enabler, error_out))
|
||||||
{
|
{
|
||||||
@ -1260,6 +1271,12 @@ bool MariaDBServer::enable_events(const EventNameSet& event_names, json_t** erro
|
|||||||
rval = true;
|
rval = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (binlog_mode == BinlogMode::BINLOG_OFF)
|
||||||
|
{
|
||||||
|
// Failure in re-enabling the session binlog doesn't really matter because we don't want the monitor
|
||||||
|
// generating binlog events anyway.
|
||||||
|
execute_cmd("SET @@session.sql_log_bin=1;");
|
||||||
|
}
|
||||||
return rval;
|
return rval;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1459,9 +1476,15 @@ bool MariaDBServer::reset_all_slave_conns(json_t** error_out)
|
|||||||
bool MariaDBServer::promote(GeneralOpData& general, ServerOperation& promotion, OperationType type,
|
bool MariaDBServer::promote(GeneralOpData& general, ServerOperation& promotion, OperationType type,
|
||||||
const MariaDBServer* demotion_target)
|
const MariaDBServer* demotion_target)
|
||||||
{
|
{
|
||||||
mxb_assert(type == OperationType::SWITCHOVER || type == OperationType::FAILOVER);
|
mxb_assert(type == OperationType::SWITCHOVER || type == OperationType::FAILOVER
|
||||||
|
|| type == OperationType::UNDO_DEMOTION);
|
||||||
json_t** const error_out = general.error_out;
|
json_t** const error_out = general.error_out;
|
||||||
// Function should only be called for a master-slave pair.
|
|
||||||
|
StopWatch timer;
|
||||||
|
bool stopped = false;
|
||||||
|
if (type == OperationType::SWITCHOVER || type == OperationType::FAILOVER)
|
||||||
|
{
|
||||||
|
// In normal circumstances, this should only be called for a master-slave pair.
|
||||||
auto master_conn = slave_connection_status(demotion_target);
|
auto master_conn = slave_connection_status(demotion_target);
|
||||||
mxb_assert(master_conn);
|
mxb_assert(master_conn);
|
||||||
if (master_conn == NULL)
|
if (master_conn == NULL)
|
||||||
@ -1472,12 +1495,9 @@ bool MariaDBServer::promote(GeneralOpData& general, ServerOperation& promotion,
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool success = false;
|
|
||||||
StopWatch timer;
|
|
||||||
// Step 1: Stop & reset slave connections. If doing a failover, only remove the connection to demotion
|
// Step 1: Stop & reset slave connections. If doing a failover, only remove the connection to demotion
|
||||||
// target. In case of switchover, remove other slave connections as well since the demotion target
|
// target. In case of switchover, remove other slave connections as well since the demotion target
|
||||||
// will take them over.
|
// will take them over.
|
||||||
bool stopped = false;
|
|
||||||
if (type == OperationType::SWITCHOVER)
|
if (type == OperationType::SWITCHOVER)
|
||||||
{
|
{
|
||||||
stopped = remove_slave_conns(general, m_slave_status);
|
stopped = remove_slave_conns(general, m_slave_status);
|
||||||
@ -1485,10 +1505,11 @@ bool MariaDBServer::promote(GeneralOpData& general, ServerOperation& promotion,
|
|||||||
else if (type == OperationType::FAILOVER)
|
else if (type == OperationType::FAILOVER)
|
||||||
{
|
{
|
||||||
stopped = remove_slave_conns(general, {*master_conn});
|
stopped = remove_slave_conns(general, {*master_conn});
|
||||||
master_conn = NULL; // The connection pointed to may no longer exist.
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (stopped)
|
bool success = false;
|
||||||
|
if (stopped || type == OperationType::UNDO_DEMOTION)
|
||||||
{
|
{
|
||||||
// Step 2: If demotion target is master, meaning this server will become the master,
|
// Step 2: If demotion target is master, meaning this server will become the master,
|
||||||
// enable writing and scheduled events. Also, run promotion_sql_file.
|
// enable writing and scheduled events. Also, run promotion_sql_file.
|
||||||
@ -1507,7 +1528,8 @@ bool MariaDBServer::promote(GeneralOpData& general, ServerOperation& promotion,
|
|||||||
if (m_settings.handle_event_scheduler)
|
if (m_settings.handle_event_scheduler)
|
||||||
{
|
{
|
||||||
// TODO: Add query replying to enable_events
|
// TODO: Add query replying to enable_events
|
||||||
bool events_enabled = enable_events(promotion.events_to_enable, error_out);
|
bool events_enabled = enable_events(BinlogMode::BINLOG_OFF, promotion.events_to_enable,
|
||||||
|
error_out);
|
||||||
general.time_remaining -= timer.restart();
|
general.time_remaining -= timer.restart();
|
||||||
if (!events_enabled)
|
if (!events_enabled)
|
||||||
{
|
{
|
||||||
@ -1561,6 +1583,18 @@ bool MariaDBServer::promote(GeneralOpData& general, ServerOperation& promotion,
|
|||||||
demotion_target->name(), name());
|
demotion_target->name(), name());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
else if (type == OperationType::UNDO_DEMOTION)
|
||||||
|
{
|
||||||
|
if (copy_slave_conns(general, promotion.conns_to_copy, nullptr))
|
||||||
|
{
|
||||||
|
success = true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
PRINT_MXS_JSON_ERROR(error_out, "Could not restore slave connections of '%s' when "
|
||||||
|
"reversing demotion.", name());
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return success;
|
return success;
|
||||||
@ -2001,12 +2035,25 @@ bool MariaDBServer::copy_slave_conns(GeneralOpData& op, const SlaveStatusArray&
|
|||||||
{
|
{
|
||||||
// Any slave connection that was going to this server itself are instead directed
|
// Any slave connection that was going to this server itself are instead directed
|
||||||
// to the replacement server.
|
// to the replacement server.
|
||||||
|
bool ok_to_copy = true;
|
||||||
if (slave_conn.master_server_id == m_server_id)
|
if (slave_conn.master_server_id == m_server_id)
|
||||||
|
{
|
||||||
|
if (replacement)
|
||||||
{
|
{
|
||||||
slave_conn.master_host = replacement->m_server_base->server->address;
|
slave_conn.master_host = replacement->m_server_base->server->address;
|
||||||
slave_conn.master_port = replacement->m_server_base->server->port;
|
slave_conn.master_port = replacement->m_server_base->server->port;
|
||||||
}
|
}
|
||||||
if (!create_start_slave(op, slave_conn))
|
else
|
||||||
|
{
|
||||||
|
// This is only possible if replication is configured wrong and we are
|
||||||
|
// undoing a switchover demotion.
|
||||||
|
MXB_WARNING("Server id:s of '%s' and [%s]:%i are identical, not copying the connection "
|
||||||
|
"to '%s'.",
|
||||||
|
name(), slave_conn.master_host.c_str(), slave_conn.master_port, name());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ok_to_copy && !create_start_slave(op, slave_conn))
|
||||||
{
|
{
|
||||||
start_slave_error = true;
|
start_slave_error = true;
|
||||||
}
|
}
|
||||||
|
@ -347,11 +347,12 @@ public:
|
|||||||
* Enable any "SLAVESIDE_DISABLED" or "DISABLED events. Event scheduler is not touched. Only events
|
* Enable any "SLAVESIDE_DISABLED" or "DISABLED events. Event scheduler is not touched. Only events
|
||||||
* with names matching an element in the event_names set are enabled.
|
* with names matching an element in the event_names set are enabled.
|
||||||
*
|
*
|
||||||
|
* @param binlog_mode If OFF, binlog event creation is disabled for the session during method execution.
|
||||||
* @param event_names Names of events that should be enabled
|
* @param event_names Names of events that should be enabled
|
||||||
* @param error_out Error output
|
* @param error_out Error output
|
||||||
* @return True if all SLAVESIDE_DISABLED events were enabled
|
* @return True if all SLAVESIDE_DISABLED events were enabled
|
||||||
*/
|
*/
|
||||||
bool enable_events(const EventNameSet& event_names, json_t** error_out);
|
bool enable_events(BinlogMode binlog_mode, const EventNameSet& event_names, json_t** error_out);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Disable any "ENABLED" events. Event scheduler is not touched.
|
* Disable any "ENABLED" events. Event scheduler is not touched.
|
||||||
|
@ -222,7 +222,8 @@ using EventNameSet = std::unordered_set<std::string>;
|
|||||||
enum class OperationType
|
enum class OperationType
|
||||||
{
|
{
|
||||||
SWITCHOVER,
|
SWITCHOVER,
|
||||||
FAILOVER
|
FAILOVER,
|
||||||
|
UNDO_DEMOTION // Performed when switchover fails in its first stages.
|
||||||
};
|
};
|
||||||
|
|
||||||
class GeneralOpData
|
class GeneralOpData
|
||||||
|
Reference in New Issue
Block a user