MXS-1944 Use time limited methods in rejoin

Uses switchover time limit, since the typical rejoin of a standalone server
is somewhat similar to a switchover.
This commit is contained in:
Esa Korhonen
2018-11-06 11:12:51 +02:00
parent 184e187732
commit e4e2235297
3 changed files with 43 additions and 76 deletions

View File

@ -510,7 +510,8 @@ int MariaDBMonitor::redirect_slaves_ex(GeneralOpData& general, OperationType typ
else
{
// No conflict, redirect as normal.
if (redirectable->redirect_existing_slave_conn(general, from, to))
auto old_conn = redirectable->slave_connection_status(from);
if (redirectable->redirect_existing_slave_conn(general, *old_conn, to))
{
successes++;
redirected->push_back(redirectable);
@ -587,35 +588,46 @@ uint32_t MariaDBMonitor::do_rejoin(const ServerArray& joinable_servers, json_t**
uint32_t servers_joined = 0;
if (!joinable_servers.empty())
{
string change_cmd = generate_change_master_cmd(master_server->address, master_server->port);
for (MariaDBServer* joinable : joinable_servers)
{
const char* name = joinable->name();
bool op_success = false;
// Rejoin doesn't have its own time limit setting. Use switchover time limit for now since
// the first phase of standalone rejoin is similar to switchover.
maxbase::Duration time_limit((double)m_switchover_timeout);
GeneralOpData op(m_replication_user, m_replication_password, output, time_limit);
if (joinable->m_slave_status.empty())
{
if (!m_demote_sql_file.empty() && !joinable->run_sql_from_file(m_demote_sql_file, output))
// Assume that server is an old master which was failed over. Even if this is not really
// the case, the following is unlikely to do damage.
ServerOperation demotion(joinable, true, /* treat as old master */
m_handle_event_scheduler, m_demote_sql_file, {} /* unused */);
if (joinable->demote(demotion, op))
{
PRINT_MXS_JSON_ERROR(output,
"%s execution failed when attempting to rejoin server '%s'.",
CN_DEMOTION_SQL_FILE,
joinable->name());
MXS_NOTICE("Directing standalone server '%s' to replicate from '%s'.", name, master_name);
// A slave connection description is required. As this is the only connection, no name
// is required.
SlaveStatus new_conn;
new_conn.master_host = master_server->address;
new_conn.master_port = master_server->port;
op_success = joinable->create_start_slave(op, new_conn);
}
else
{
MXS_NOTICE("Directing standalone server '%s' to replicate from '%s'.", name, master_name);
op_success = joinable->join_cluster(change_cmd, m_handle_event_scheduler);
PRINT_MXS_JSON_ERROR(output,
"Failed to prepare (demote) standalone server %s for rejoin.", name);
}
}
else
{
MXS_NOTICE("Server '%s' is replicating from a server other than '%s', "
"redirecting it to '%s'.",
name,
master_name,
master_name);
op_success = joinable->redirect_one_slave(change_cmd);
name, master_name, master_name);
// Multisource replication does not get to this point.
mxb_assert(joinable->m_slave_status.size() == 1);
op_success = joinable->redirect_existing_slave_conn(op, joinable->m_slave_status[0],
m_master);
}
if (op_success)

View File

@ -750,42 +750,6 @@ bool MariaDBServer::redirect_one_slave(const string& change_cmd)
return success;
}
bool MariaDBServer::join_cluster(const string& change_cmd, bool disable_server_events)
{
/* Server does not have slave connections. This operation can fail, or the resulting
* replication may end up broken. */
bool success = false;
MYSQL* server_conn = m_server_base->con;
const char* query = "SET GLOBAL read_only=1;";
if (mxs_mysql_query(server_conn, query) == 0)
{
if (disable_server_events)
{
// This is unlikely to change anything, since a restarted server does not have event scheduler
// ON. If it were on and events were running while the server was standalone, its data would have
// diverged from the rest of the cluster.
disable_events(BinlogMode::BINLOG_OFF, NULL);
}
query = "CHANGE MASTER TO ..."; // Don't show the real query as it contains a password.
if (mxs_mysql_query(server_conn, change_cmd.c_str()) == 0)
{
query = "START SLAVE;";
if (mxs_mysql_query(server_conn, query) == 0)
{
success = true;
MXS_NOTICE("Standalone server '%s' starting replication.", name());
}
}
}
if (!success)
{
const char ERROR_MSG[] = "Standalone server '%s' failed to start replication: '%s'. Query: '%s'.";
MXS_WARNING(ERROR_MSG, name(), mysql_error(server_conn), query);
}
return success;
}
bool MariaDBServer::run_sql_from_file(const string& path, json_t** error_out)
{
MYSQL* conn = m_server_base->con;
@ -1579,7 +1543,8 @@ bool MariaDBServer::demote(ServerOperation& demo_op, GeneralOpData& general)
bool demotion_error = false;
if (demo_op.to_from_master)
{
mxb_assert(is_master());
// The server should either be the master or be a standalone being rejoined.
mxb_assert(is_master() || m_slave_status.empty());
StopWatch timer;
// Step 2a: Enabling read-only can take time if writes are on or table locks taken.
// TODO: use max_statement_time to be safe!
@ -2002,13 +1967,6 @@ bool MariaDBServer::copy_slave_conns(GeneralOpData& op, const SlaveStatusArray&
return !start_slave_error;
}
/**
* Create a new slave connection on the server and start it.
*
* @param op Operation descriptor
* @param slave_conn Existing connection to emulate
* @return True on success
*/
bool MariaDBServer::create_start_slave(GeneralOpData& op, const SlaveStatus& slave_conn)
{
maxbase::Duration& time_remaining = op.time_remaining;
@ -2070,22 +2028,20 @@ string MariaDBServer::generate_change_master_cmd(GeneralOpData& op, const SlaveS
return change_cmd;
}
bool MariaDBServer::redirect_existing_slave_conn(GeneralOpData& op, const MariaDBServer* old_master,
bool MariaDBServer::redirect_existing_slave_conn(GeneralOpData& op, const SlaveStatus& old_conn,
const MariaDBServer* new_master)
{
auto error_out = op.error_out;
maxbase::Duration& time_remaining = op.time_remaining;
StopWatch timer;
auto old_conn = slave_connection_status(old_master);
mxb_assert(old_conn);
bool success = false;
// First, just stop the slave connection.
bool stopped = stop_slave_conn(old_conn->name, StopMode::STOP_ONLY, time_remaining, error_out);
bool stopped = stop_slave_conn(old_conn.name, StopMode::STOP_ONLY, time_remaining, error_out);
time_remaining -= timer.restart();
if (stopped)
{
SlaveStatus modified_conn = *old_conn;
SlaveStatus modified_conn = old_conn;
SERVER* target_server = new_master->m_server_base->server;
modified_conn.master_host = target_server->address;
modified_conn.master_port = target_server->port;
@ -2095,7 +2051,7 @@ bool MariaDBServer::redirect_existing_slave_conn(GeneralOpData& op, const MariaD
time_remaining -= timer.restart();
if (changed)
{
string start = string_printf("START SLAVE '%s';", old_conn->name.c_str());
string start = string_printf("START SLAVE '%s';", old_conn.name.c_str());
bool started = execute_cmd_time_limit(start, time_remaining, &error_msg);
time_remaining -= timer.restart();
if (started)
@ -2114,7 +2070,7 @@ bool MariaDBServer::redirect_existing_slave_conn(GeneralOpData& op, const MariaD
// TODO: This may currently print out passwords.
PRINT_MXS_JSON_ERROR(error_out,
"%s could not be redirected to [%s]:%i: %s",
old_conn->to_short_string().c_str(),
old_conn.to_short_string().c_str(),
modified_conn.master_host.c_str(), modified_conn.master_port,
error_msg.c_str());
}

View File

@ -278,15 +278,6 @@ public:
*/
bool redirect_one_slave(const std::string& change_cmd);
/**
* Joins this standalone server to the cluster.
*
* @param change_cmd Change master command
* @param disable_server_events Should events be disabled on the server
* @return True if commands were accepted by server
*/
bool join_cluster(const std::string& change_cmd, bool disable_server_events);
/**
* Check if the server can be demoted by switchover.
*
@ -372,11 +363,11 @@ public:
* Redirect the slave connection going to old master to replicate from new master.
*
* @param op Operation descriptor
* @param old_master The connection to this server is redirected
* @param old_conn The connection which is redirected
* @param new_master The new master for the redirected connection
* @return True on success
*/
bool redirect_existing_slave_conn(GeneralOpData& op, const MariaDBServer* old_master,
bool redirect_existing_slave_conn(GeneralOpData& op, const SlaveStatus& old_conn,
const MariaDBServer* new_master);
/**
@ -395,6 +386,15 @@ public:
bool copy_slave_conns(GeneralOpData& op, const SlaveStatusArray& conns_to_copy,
const MariaDBServer* replacement);
/**
* Create a new slave connection on the server and start it.
*
* @param op Operation descriptor
* @param slave_conn Existing connection to emulate
* @return True on success
*/
bool create_start_slave(GeneralOpData& op, const SlaveStatus& slave_conn);
/**
* Is binary log on? 'update_replication_settings' should be ran before this function to query the data.
*
@ -545,6 +545,5 @@ private:
bool set_read_only(ReadOnlySetting value, maxbase::Duration time_limit, json_t** error_out);
bool merge_slave_conns(GeneralOpData& op, const SlaveStatusArray& conns_to_merge);
bool create_start_slave(GeneralOpData& op, const SlaveStatus& slave_conn);
std::string generate_change_master_cmd(GeneralOpData& op, const SlaveStatus& slave_conn);
};