MXS-1845 Add redirection code

Should work with multimaster replication.
This commit is contained in:
Esa Korhonen
2018-09-28 16:18:38 +03:00
parent 75f9921ca2
commit 1ca5d02abb
4 changed files with 112 additions and 12 deletions

View File

@ -22,6 +22,7 @@
using std::string;
using std::unique_ptr;
using maxscale::string_printf;
using maxbase::StopWatch;
static const char RE_ENABLE_FMT[] = "To re-enable automatic %s, manually set '%s' to 'true' "
"for monitor '%s' via MaxAdmin or the REST API, or restart MaxScale.";
@ -417,6 +418,46 @@ int MariaDBMonitor::redirect_slaves(MariaDBServer* new_master,
return successes;
}
/**
 * Redirect slaves to replicate from the promotion target.
 *
 * Each server in 'slaves' is asked to redirect its existing slave connection by calling
 * MariaDBServer::redirect_existing_slave_conn(). A failure on one server does not prevent
 * the remaining servers from being attempted.
 *
 * @param op Operation descriptor. Its promotion and demotion targets are used for logging
 * and are passed on to every redirected server.
 * @param slaves An array of slaves to redirect
 * @param redirected_slaves A vector where to insert successfully redirected slaves. Must not be NULL.
 * @return The number of slaves successfully redirected
 */
int MariaDBMonitor::redirect_slaves_ex(ClusterOperation& op, const ServerArray& slaves,
                                       ServerArray* redirected_slaves)
{
    mxb_assert(redirected_slaves != NULL);
    if (slaves.empty())
    {
        // This is ok, nothing to do.
        return 0;
    }

    string slave_names = monitored_servers_to_string(slaves);
    MXS_NOTICE("Redirecting %s to replicate from %s instead of %s.",
               slave_names.c_str(), op.promotion_target->name(), op.demotion_target->name());

    int successes = 0;
    for (MariaDBServer* redirectable : slaves)
    {
        if (redirectable->redirect_existing_slave_conn(op))
        {
            successes++;
            redirected_slaves->push_back(redirectable);
        }
    }

    // 'successes' can never exceed the number of slaves, so the subtraction cannot wrap.
    const size_t total = slaves.size();
    const size_t failures = total - static_cast<size_t>(successes);
    if (failures == 0)
    {
        MXS_NOTICE("All redirects successful.");
    }
    else
    {
        // size_t values are printed with %zu. %lu is only correct on platforms where
        // size_t happens to be unsigned long.
        MXS_WARNING("%zu out of %zu redirects failed.", failures, total);
    }
    return successes;
}
/**
* Set the new master to replicate from the cluster external master.
*
@ -723,13 +764,11 @@ bool MariaDBMonitor::switchover_perform(ClusterOperation& op)
{
redirected_slaves.push_back(demotion_target);
}
int redirects = redirect_slaves(promotion_target, redirectable_slaves, &redirected_slaves);
int redirects = redirect_slaves_ex(op, redirectable_slaves, &redirected_slaves);
bool success = redirectable_slaves.empty() ? start_ok : start_ok || redirects > 0;
if (success)
{
op.time_remaining -= timer.restart();
// Step 6: Finally, add an event to the new master to advance gtid and wait for the slaves
// to receive it. If using external replication, skip this step. Come up with an
// alternative later.
@ -793,7 +832,6 @@ bool MariaDBMonitor::failover_perform(ClusterOperation& op)
{
mxb_assert(op.promotion_target && op.demotion_target);
MariaDBServer* const promotion_target = op.promotion_target;
maxbase::StopWatch timer;
// Step 1: Populate a vector with all slaves not the selected master.
ServerArray redirectable_slaves = get_redirectables(promotion_target, op.demotion_target);
@ -802,17 +840,17 @@ bool MariaDBMonitor::failover_perform(ClusterOperation& op)
// Step 2: Stop and reset slave, set read-only to 0.
if (promotion_target->promote(op))
{
// Point of no return. Even if following steps fail, do not try to undo.
m_next_master = promotion_target;
m_cluster_modified = true;
// Step 3: Redirect slaves.
ServerArray redirected_slaves;
int redirects = redirect_slaves(promotion_target, redirectable_slaves, &redirected_slaves);
int redirects = redirect_slaves_ex(op, redirectable_slaves, &redirected_slaves);
bool success = redirectable_slaves.empty() ? true : redirects > 0;
if (success)
{
op.time_remaining -= timer.restart();
StopWatch timer;
// Step 4: Finally, add an event to the new master to advance gtid and wait for the slaves
// to receive it. seconds_remaining can be 0 or less at this point. Even in such a case
// wait_cluster_stabilization() may succeed if replication is fast enough. If using external
@ -1679,9 +1717,9 @@ void MariaDBMonitor::check_cluster_operations_support()
* @return The first connected slave or NULL if none found
*/
const MariaDBServer* MariaDBMonitor::slave_receiving_events(const MariaDBServer* demotion_target,
maxbase::Duration* event_age_out)
maxbase::Duration* event_age_out)
{
auto time_now = maxbase::Clock::now();
auto time_now = maxbase::Clock::now();
maxbase::Clock::time_point alive_after = time_now - std::chrono::seconds(m_master_failure_timeout);
const MariaDBServer* connected_slave = NULL;

View File

@ -273,6 +273,8 @@ private:
int redirect_slaves(MariaDBServer* new_master,
const ServerArray& slaves,
ServerArray* redirected_slaves);
/**
 * Redirect slaves to replicate from the promotion target of the operation.
 *
 * @param op Operation descriptor
 * @param slaves An array of slaves to redirect
 * @param redirected_slaves A vector where to insert successfully redirected slaves
 * @return The number of slaves successfully redirected
 */
int redirect_slaves_ex(ClusterOperation& op, const ServerArray& slaves,
ServerArray* redirected_slaves);
std::string generate_change_master_cmd(const std::string& master_host, int master_port);
bool start_external_replication(MariaDBServer* new_master, json_t** err_out);
bool wait_cluster_stabilization(MariaDBServer* new_master,

View File

@ -1824,7 +1824,7 @@ bool MariaDBServer::create_start_slave(ClusterOperation& op, const SlaveStatus&
/**
* Generate a CHANGE MASTER TO-query.
*
* @param op Operation descriptor
* @param op Operation descriptor, required for username and password
* @param slave_conn Existing slave connection to emulate
* @return Generated query
*/
@ -1832,8 +1832,8 @@ string MariaDBServer::generate_change_master_cmd(ClusterOperation& op, const Sla
{
string change_cmd;
change_cmd += string_printf("CHANGE MASTER '%s' TO MASTER_HOST = '%s', MASTER_PORT = %i, ",
slave_conn.name.c_str(), slave_conn.master_host.c_str(),
slave_conn.master_port);
slave_conn.name.c_str(),
slave_conn.master_host.c_str(), slave_conn.master_port);
change_cmd += "MASTER_USE_GTID = current_pos, ";
change_cmd += string_printf("MASTER_USER = '%s', ", op.replication_user.c_str());
const char MASTER_PW[] = "MASTER_PASSWORD = '%s';";
@ -1846,6 +1846,58 @@ string MariaDBServer::generate_change_master_cmd(ClusterOperation& op, const Sla
return change_cmd;
}
/**
 * Redirect the slave connection going to the demotion target to replicate from the promotion target.
 *
 * The existing connection is first stopped, then pointed at the promotion target with a
 * CHANGE MASTER TO-command and finally started again. The time spent on each step is
 * subtracted from op.time_remaining. Errors are written to op.error_out.
 *
 * @param op Operation descriptor
 * @return True on success
 */
bool MariaDBServer::redirect_existing_slave_conn(ClusterOperation& op)
{
    StopWatch timer;
    const MariaDBServer* old_master = op.demotion_target;
    const MariaDBServer* new_master = op.promotion_target;

    auto old_conn = slave_connection_status_mutable(old_master);
    mxb_assert(old_conn);
    if (!old_conn)
    {
        // The assert above is presumably inactive in release builds, so fail gracefully
        // instead of dereferencing a null pointer below.
        PRINT_MXS_JSON_ERROR(op.error_out,
                             "'%s' has no slave connection to '%s', cannot redirect.",
                             name(), op.demotion_target->name());
        return false;
    }

    bool success = false;
    // First, just stop the slave connection.
    bool stopped = stop_slave_conn(old_conn, StopMode::STOP_ONLY, op.time_remaining, op.error_out);
    op.time_remaining -= timer.restart();
    if (stopped)
    {
        // Copy the old connection settings and only swap in the address of the new master.
        SlaveStatus modified_conn = *old_conn;
        SERVER* target_server = new_master->m_server_base->server;
        modified_conn.master_host = target_server->address;
        modified_conn.master_port = target_server->port;
        string change_master = generate_change_master_cmd(op, modified_conn);

        string error_msg;
        bool changed = execute_cmd_time_limit(change_master, op.time_remaining, &error_msg);
        op.time_remaining -= timer.restart();
        if (changed)
        {
            string start = string_printf("START SLAVE '%s';", old_conn->name.c_str());
            bool started = execute_cmd_time_limit(start, op.time_remaining, &error_msg);
            op.time_remaining -= timer.restart();
            if (started)
            {
                success = true;
            }
            else
            {
                PRINT_MXS_JSON_ERROR(op.error_out,
                                     "%s could not be started: %s",
                                     modified_conn.to_short_string(name()).c_str(),
                                     error_msg.c_str());
            }
        }
        else
        {
            // TODO: This may currently print out passwords.
            PRINT_MXS_JSON_ERROR(op.error_out,
                                 "%s could not be redirected to [%s]:%i: %s",
                                 old_conn->to_short_string(name()).c_str(),
                                 modified_conn.master_host.c_str(), modified_conn.master_port,
                                 error_msg.c_str());
        }
    }   // 'stop_slave_conn' prints its own errors
    return success;
}
string SlaveStatus::to_string() const
{
// Print all of this on the same line to make things compact. Are the widths reasonable? The format is

View File

@ -502,6 +502,14 @@ public:
*/
bool promote(ClusterOperation& operation);
/**
 * Redirect the slave connection going to demotion target to replicate from promotion target.
 * The connection is stopped, pointed to the promotion target with a CHANGE MASTER TO-command
 * and then started again. Time spent is subtracted from the time remaining in the operation
 * descriptor, and errors are written to its error output.
 *
 * @param op Operation descriptor
 * @return True on success
 */
bool redirect_existing_slave_conn(ClusterOperation& op);
private:
class EventInfo;
typedef std::function<void (const EventInfo&, json_t** error_out)> ManipulatorFunc;