MXS-1944 Use time-limited methods in rejoin

Uses the switchover time limit, since a typical rejoin of a standalone server
is somewhat similar to a switchover.
This commit is contained in:
Esa Korhonen
2018-11-06 11:12:51 +02:00
parent 184e187732
commit e4e2235297
3 changed files with 43 additions and 76 deletions

View File

@ -510,7 +510,8 @@ int MariaDBMonitor::redirect_slaves_ex(GeneralOpData& general, OperationType typ
else else
{ {
// No conflict, redirect as normal. // No conflict, redirect as normal.
if (redirectable->redirect_existing_slave_conn(general, from, to)) auto old_conn = redirectable->slave_connection_status(from);
if (redirectable->redirect_existing_slave_conn(general, *old_conn, to))
{ {
successes++; successes++;
redirected->push_back(redirectable); redirected->push_back(redirectable);
@ -587,35 +588,46 @@ uint32_t MariaDBMonitor::do_rejoin(const ServerArray& joinable_servers, json_t**
uint32_t servers_joined = 0; uint32_t servers_joined = 0;
if (!joinable_servers.empty()) if (!joinable_servers.empty())
{ {
string change_cmd = generate_change_master_cmd(master_server->address, master_server->port);
for (MariaDBServer* joinable : joinable_servers) for (MariaDBServer* joinable : joinable_servers)
{ {
const char* name = joinable->name(); const char* name = joinable->name();
bool op_success = false; bool op_success = false;
// Rejoin doesn't have its own time limit setting. Use switchover time limit for now since
// the first phase of standalone rejoin is similar to switchover.
maxbase::Duration time_limit((double)m_switchover_timeout);
GeneralOpData op(m_replication_user, m_replication_password, output, time_limit);
if (joinable->m_slave_status.empty()) if (joinable->m_slave_status.empty())
{ {
if (!m_demote_sql_file.empty() && !joinable->run_sql_from_file(m_demote_sql_file, output)) // Assume that server is an old master which was failed over. Even if this is not really
// the case, the following is unlikely to do damage.
ServerOperation demotion(joinable, true, /* treat as old master */
m_handle_event_scheduler, m_demote_sql_file, {} /* unused */);
if (joinable->demote(demotion, op))
{ {
PRINT_MXS_JSON_ERROR(output, MXS_NOTICE("Directing standalone server '%s' to replicate from '%s'.", name, master_name);
"%s execution failed when attempting to rejoin server '%s'.", // A slave connection description is required. As this is the only connection, no name
CN_DEMOTION_SQL_FILE, // is required.
joinable->name()); SlaveStatus new_conn;
new_conn.master_host = master_server->address;
new_conn.master_port = master_server->port;
op_success = joinable->create_start_slave(op, new_conn);
} }
else else
{ {
MXS_NOTICE("Directing standalone server '%s' to replicate from '%s'.", name, master_name); PRINT_MXS_JSON_ERROR(output,
op_success = joinable->join_cluster(change_cmd, m_handle_event_scheduler); "Failed to prepare (demote) standalone server %s for rejoin.", name);
} }
} }
else else
{ {
MXS_NOTICE("Server '%s' is replicating from a server other than '%s', " MXS_NOTICE("Server '%s' is replicating from a server other than '%s', "
"redirecting it to '%s'.", "redirecting it to '%s'.",
name, name, master_name, master_name);
master_name, // Multisource replication does not get to this point.
master_name); mxb_assert(joinable->m_slave_status.size() == 1);
op_success = joinable->redirect_one_slave(change_cmd); op_success = joinable->redirect_existing_slave_conn(op, joinable->m_slave_status[0],
m_master);
} }
if (op_success) if (op_success)

View File

@ -750,42 +750,6 @@ bool MariaDBServer::redirect_one_slave(const string& change_cmd)
return success; return success;
} }
bool MariaDBServer::join_cluster(const string& change_cmd, bool disable_server_events)
{
/* Server does not have slave connections. This operation can fail, or the resulting
* replication may end up broken. */
bool success = false;
MYSQL* server_conn = m_server_base->con;
const char* query = "SET GLOBAL read_only=1;";
if (mxs_mysql_query(server_conn, query) == 0)
{
if (disable_server_events)
{
// This is unlikely to change anything, since a restarted server does not have event scheduler
// ON. If it were on and events were running while the server was standalone, its data would have
// diverged from the rest of the cluster.
disable_events(BinlogMode::BINLOG_OFF, NULL);
}
query = "CHANGE MASTER TO ..."; // Don't show the real query as it contains a password.
if (mxs_mysql_query(server_conn, change_cmd.c_str()) == 0)
{
query = "START SLAVE;";
if (mxs_mysql_query(server_conn, query) == 0)
{
success = true;
MXS_NOTICE("Standalone server '%s' starting replication.", name());
}
}
}
if (!success)
{
const char ERROR_MSG[] = "Standalone server '%s' failed to start replication: '%s'. Query: '%s'.";
MXS_WARNING(ERROR_MSG, name(), mysql_error(server_conn), query);
}
return success;
}
bool MariaDBServer::run_sql_from_file(const string& path, json_t** error_out) bool MariaDBServer::run_sql_from_file(const string& path, json_t** error_out)
{ {
MYSQL* conn = m_server_base->con; MYSQL* conn = m_server_base->con;
@ -1579,7 +1543,8 @@ bool MariaDBServer::demote(ServerOperation& demo_op, GeneralOpData& general)
bool demotion_error = false; bool demotion_error = false;
if (demo_op.to_from_master) if (demo_op.to_from_master)
{ {
mxb_assert(is_master()); // The server should either be the master or be a standalone being rejoined.
mxb_assert(is_master() || m_slave_status.empty());
StopWatch timer; StopWatch timer;
// Step 2a: Enabling read-only can take time if writes are on or table locks taken. // Step 2a: Enabling read-only can take time if writes are on or table locks taken.
// TODO: use max_statement_time to be safe! // TODO: use max_statement_time to be safe!
@ -2002,13 +1967,6 @@ bool MariaDBServer::copy_slave_conns(GeneralOpData& op, const SlaveStatusArray&
return !start_slave_error; return !start_slave_error;
} }
/**
* Create a new slave connection on the server and start it.
*
* @param op Operation descriptor
* @param slave_conn Existing connection to emulate
* @return True on success
*/
bool MariaDBServer::create_start_slave(GeneralOpData& op, const SlaveStatus& slave_conn) bool MariaDBServer::create_start_slave(GeneralOpData& op, const SlaveStatus& slave_conn)
{ {
maxbase::Duration& time_remaining = op.time_remaining; maxbase::Duration& time_remaining = op.time_remaining;
@ -2070,22 +2028,20 @@ string MariaDBServer::generate_change_master_cmd(GeneralOpData& op, const SlaveS
return change_cmd; return change_cmd;
} }
bool MariaDBServer::redirect_existing_slave_conn(GeneralOpData& op, const MariaDBServer* old_master, bool MariaDBServer::redirect_existing_slave_conn(GeneralOpData& op, const SlaveStatus& old_conn,
const MariaDBServer* new_master) const MariaDBServer* new_master)
{ {
auto error_out = op.error_out; auto error_out = op.error_out;
maxbase::Duration& time_remaining = op.time_remaining; maxbase::Duration& time_remaining = op.time_remaining;
StopWatch timer; StopWatch timer;
auto old_conn = slave_connection_status(old_master);
mxb_assert(old_conn);
bool success = false; bool success = false;
// First, just stop the slave connection. // First, just stop the slave connection.
bool stopped = stop_slave_conn(old_conn->name, StopMode::STOP_ONLY, time_remaining, error_out); bool stopped = stop_slave_conn(old_conn.name, StopMode::STOP_ONLY, time_remaining, error_out);
time_remaining -= timer.restart(); time_remaining -= timer.restart();
if (stopped) if (stopped)
{ {
SlaveStatus modified_conn = *old_conn; SlaveStatus modified_conn = old_conn;
SERVER* target_server = new_master->m_server_base->server; SERVER* target_server = new_master->m_server_base->server;
modified_conn.master_host = target_server->address; modified_conn.master_host = target_server->address;
modified_conn.master_port = target_server->port; modified_conn.master_port = target_server->port;
@ -2095,7 +2051,7 @@ bool MariaDBServer::redirect_existing_slave_conn(GeneralOpData& op, const MariaD
time_remaining -= timer.restart(); time_remaining -= timer.restart();
if (changed) if (changed)
{ {
string start = string_printf("START SLAVE '%s';", old_conn->name.c_str()); string start = string_printf("START SLAVE '%s';", old_conn.name.c_str());
bool started = execute_cmd_time_limit(start, time_remaining, &error_msg); bool started = execute_cmd_time_limit(start, time_remaining, &error_msg);
time_remaining -= timer.restart(); time_remaining -= timer.restart();
if (started) if (started)
@ -2114,7 +2070,7 @@ bool MariaDBServer::redirect_existing_slave_conn(GeneralOpData& op, const MariaD
// TODO: This may currently print out passwords. // TODO: This may currently print out passwords.
PRINT_MXS_JSON_ERROR(error_out, PRINT_MXS_JSON_ERROR(error_out,
"%s could not be redirected to [%s]:%i: %s", "%s could not be redirected to [%s]:%i: %s",
old_conn->to_short_string().c_str(), old_conn.to_short_string().c_str(),
modified_conn.master_host.c_str(), modified_conn.master_port, modified_conn.master_host.c_str(), modified_conn.master_port,
error_msg.c_str()); error_msg.c_str());
} }

View File

@ -278,15 +278,6 @@ public:
*/ */
bool redirect_one_slave(const std::string& change_cmd); bool redirect_one_slave(const std::string& change_cmd);
/**
 * Joins this standalone server to the cluster. The server is first set read-only, then events are
 * optionally disabled, after which the change master command is run and the slave started.
 * A warning is logged if any of the queries fails.
 *
 * @param change_cmd Change master command. Contains a password, so it is not written to the log.
 * @param disable_server_events Should events be disabled on the server
 * @return True if commands were accepted by server
 */
bool join_cluster(const std::string& change_cmd, bool disable_server_events);
/** /**
* Check if the server can be demoted by switchover. * Check if the server can be demoted by switchover.
* *
@ -372,11 +363,11 @@ public:
* Redirect the slave connection going to old master to replicate from new master. * Redirect the slave connection going to old master to replicate from new master.
* *
* @param op Operation descriptor * @param op Operation descriptor
* @param old_master The connection to this server is redirected * @param old_conn The connection which is redirected
* @param new_master The new master for the redirected connection * @param new_master The new master for the redirected connection
* @return True on success * @return True on success
*/ */
bool redirect_existing_slave_conn(GeneralOpData& op, const MariaDBServer* old_master, bool redirect_existing_slave_conn(GeneralOpData& op, const SlaveStatus& old_conn,
const MariaDBServer* new_master); const MariaDBServer* new_master);
/** /**
@ -395,6 +386,15 @@ public:
bool copy_slave_conns(GeneralOpData& op, const SlaveStatusArray& conns_to_copy, bool copy_slave_conns(GeneralOpData& op, const SlaveStatusArray& conns_to_copy,
const MariaDBServer* replacement); const MariaDBServer* replacement);
/**
 * Create a new slave connection on the server and start it.
 *
 * @param op Operation descriptor. NOTE(review): the definition binds a reference to
 * op.time_remaining, so the call presumably consumes the operation's time budget -- confirm.
 * @param slave_conn Description of the connection to create. This does not have to be an existing
 * connection: the rejoin code passes a fresh object with only master host and port set.
 * @return True on success
 */
bool create_start_slave(GeneralOpData& op, const SlaveStatus& slave_conn);
/** /**
* Is binary log on? 'update_replication_settings' should be ran before this function to query the data. * Is binary log on? 'update_replication_settings' should be ran before this function to query the data.
* *
@ -545,6 +545,5 @@ private:
bool set_read_only(ReadOnlySetting value, maxbase::Duration time_limit, json_t** error_out); bool set_read_only(ReadOnlySetting value, maxbase::Duration time_limit, json_t** error_out);
bool merge_slave_conns(GeneralOpData& op, const SlaveStatusArray& conns_to_merge); bool merge_slave_conns(GeneralOpData& op, const SlaveStatusArray& conns_to_merge);
bool create_start_slave(GeneralOpData& op, const SlaveStatus& slave_conn);
std::string generate_change_master_cmd(GeneralOpData& op, const SlaveStatus& slave_conn); std::string generate_change_master_cmd(GeneralOpData& op, const SlaveStatus& slave_conn);
}; };