MXS-1712 Add reset replication to MariaDB Monitor
The 'reset_replication' module command deletes all slave connections and binary logs, sets the GTID sequence to 0 and restarts replication from the given master. It should only be used if the GTIDs are incompatible but the actual data is known to be in sync.
This commit is contained in:
@ -26,6 +26,7 @@ using maxscale::string_printf;
|
||||
|
||||
static const char RE_ENABLE_FMT[] = "To re-enable automatic %s, manually set '%s' to 'true' "
|
||||
"for monitor '%s' via MaxAdmin or the REST API, or restart MaxScale.";
|
||||
const char NO_SERVER[] = "Server '%s' is not monitored by '%s'.";
|
||||
|
||||
static void print_redirect_errors(MariaDBServer* first_server,
|
||||
const ServerArray& servers,
|
||||
@ -184,6 +185,150 @@ bool MariaDBMonitor::manual_rejoin(SERVER* rejoin_server, json_t** output)
|
||||
return rval;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reset replication of the cluster. Removes all slave connections and deletes binlogs. Then resets the
|
||||
* gtid sequence of the cluster to 0 and directs all servers to replicate from the given master.
|
||||
*
|
||||
* @param master_server Server to use as master
|
||||
* @param error_out Error output
|
||||
* @return True if operation was successful
|
||||
*/
|
||||
bool MariaDBMonitor::manual_reset_replication(SERVER* master_server, json_t** error_out)
|
||||
{
|
||||
// This command is a hail-mary type, so no need to be that careful. Users are only supposed to run this
|
||||
// when replication is broken and they know the cluster is in sync.
|
||||
|
||||
// If a master has been given, use that as the master. Otherwise autoselect.
|
||||
MariaDBServer* new_master = NULL;
|
||||
if (master_server)
|
||||
{
|
||||
MariaDBServer* new_master_cand = get_server(master_server);
|
||||
if (new_master_cand == NULL)
|
||||
{
|
||||
PRINT_MXS_JSON_ERROR(error_out, NO_SERVER, master_server->name, m_monitor->name);
|
||||
return false;
|
||||
}
|
||||
else if (!new_master_cand->is_usable())
|
||||
{
|
||||
PRINT_MXS_JSON_ERROR(error_out,
|
||||
"Server '%s' is down or in maintenance and cannot be used as master.",
|
||||
new_master_cand->name());
|
||||
}
|
||||
else
|
||||
{
|
||||
new_master = new_master_cand;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
const char BAD_MASTER[] = "Could not autoselect new master for replication reset because %s";
|
||||
if (m_master == NULL)
|
||||
{
|
||||
PRINT_MXS_JSON_ERROR(error_out, BAD_MASTER, "the cluster has no master.");
|
||||
}
|
||||
else if (!m_master->is_usable())
|
||||
{
|
||||
PRINT_MXS_JSON_ERROR(error_out, BAD_MASTER, "the master is down or in maintenance.");
|
||||
}
|
||||
else
|
||||
{
|
||||
new_master = m_master;
|
||||
}
|
||||
}
|
||||
|
||||
bool rval = false;
|
||||
if (new_master)
|
||||
{
|
||||
bool error = false;
|
||||
// Step 1: Gather the list of affected servers. If any operation on the servers fails,
|
||||
// the reset fails as well.
|
||||
ServerArray targets;
|
||||
for (MariaDBServer* server : m_servers)
|
||||
{
|
||||
if (server->is_usable())
|
||||
{
|
||||
targets.push_back(server);
|
||||
}
|
||||
}
|
||||
|
||||
// Helper function for running a command on all servers in the list.
|
||||
auto exec_cmd_on_array = [&error](const ServerArray& targets, const string& query,
|
||||
json_t** error_out) {
|
||||
if (!error)
|
||||
{
|
||||
for (MariaDBServer* server : targets)
|
||||
{
|
||||
string error_msg;
|
||||
if (!server->execute_cmd(query, &error_msg))
|
||||
{
|
||||
error = true;
|
||||
PRINT_MXS_JSON_ERROR(error_out, "%s", error_msg.c_str());
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Step 2: Stop and reset all slave connections, even external ones.
|
||||
for (MariaDBServer* server : targets)
|
||||
{
|
||||
if (!server->reset_all_slave_conns(error_out))
|
||||
{
|
||||
error = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// In theory, this is wrong if there are no slaves. Cluster is modified soon anyway.
|
||||
m_cluster_modified = true;
|
||||
|
||||
// Step 3: Set read_only and delete binary logs,.
|
||||
exec_cmd_on_array(targets, "SET GLOBAL read_only=1;", error_out);
|
||||
exec_cmd_on_array(targets, "RESET MASTER;", error_out);
|
||||
|
||||
// Step 4: Set gtid_slave_pos on all servers. This is also sets gtid_current_pos.
|
||||
if (!error)
|
||||
{
|
||||
string set_slave_pos = string_printf("SET GLOBAL gtid_slave_pos='%" PRIi64 "-%" PRIi64 "-0';",
|
||||
new_master->m_gtid_domain_id, new_master->m_server_id);
|
||||
exec_cmd_on_array(targets, set_slave_pos, error_out);
|
||||
}
|
||||
|
||||
// Step 5: Set all slaves to replicate from the master.
|
||||
if (!error)
|
||||
{
|
||||
m_next_master = new_master;
|
||||
// The following commands are only sent to slaves.
|
||||
std::remove_if(targets.begin(), targets.end(), [new_master](MariaDBServer* elem) {
|
||||
return elem == new_master;
|
||||
});
|
||||
// TODO: the following call does stop slave & reset slave again. Fix this later, although it
|
||||
// doesn't cause error.
|
||||
ServerArray dummy;
|
||||
if ((size_t)redirect_slaves(new_master, targets, &dummy) < targets.size())
|
||||
{
|
||||
PRINT_MXS_JSON_ERROR(error_out,
|
||||
"Some servers were not redirected to '%s'.", new_master->name());
|
||||
error = true;
|
||||
}
|
||||
// Perform this step even if previous step wasn't 100% success.
|
||||
string error_msg;
|
||||
if (!new_master->execute_cmd("SET GLOBAL read_only=0;", &error_msg))
|
||||
{
|
||||
error = true;
|
||||
PRINT_MXS_JSON_ERROR(error_out, "%s", error_msg.c_str());
|
||||
}
|
||||
}
|
||||
if (error)
|
||||
{
|
||||
PRINT_MXS_JSON_ERROR(error_out, "Replication reset failed. Servers may be in invalid state "
|
||||
"for replication.");
|
||||
}
|
||||
rval = !error;
|
||||
}
|
||||
return rval;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate a CHANGE MASTER TO-query.
|
||||
*
|
||||
@ -1587,9 +1732,8 @@ bool MariaDBMonitor::switchover_prepare(SERVER* promotion_server,
|
||||
const auto op = ClusterOperation::SWITCHOVER;
|
||||
// Check that both servers are ok if specified, or autoselect them. Demotion target must be checked
|
||||
// first since the promotion target depends on it.
|
||||
mxb_assert(promotion_target_out && demotion_target_out
|
||||
&& !*promotion_target_out && !*demotion_target_out);
|
||||
const char NO_SERVER[] = "Server '%s' is not a member of monitor '%s'.";
|
||||
mxb_assert(promotion_target_out && demotion_target_out &&
|
||||
!*promotion_target_out && !*demotion_target_out);
|
||||
|
||||
MariaDBServer* demotion_target = NULL;
|
||||
string demotion_msg;
|
||||
|
@ -822,6 +822,17 @@ bool MariaDBMonitor::run_manual_rejoin(SERVER* rejoin_server, json_t** error_out
|
||||
return send_ok && rval;
|
||||
}
|
||||
|
||||
/**
 * Run a user-requested replication reset through execute_manual_command().
 *
 * @param master_server Server to use as master, passed on to manual_reset_replication()
 * @param error_out Error output
 * @return True if execute_manual_command() reported success and the reset itself succeeded
 */
bool MariaDBMonitor::run_manual_reset_replication(SERVER* master_server, json_t** error_out)
{
    // Written by the task below once it has run.
    bool reset_ok = false;
    auto reset_task = [this, &reset_ok, master_server, error_out]() {
            reset_ok = manual_reset_replication(master_server, error_out);
        };

    const bool task_sent = execute_manual_command(reset_task, error_out);
    return task_sent && reset_ok;
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Command handler for 'switchover'
|
||||
*
|
||||
@ -908,6 +919,28 @@ bool handle_manual_rejoin(const MODULECMD_ARG* args, json_t** output)
|
||||
return rv;
|
||||
}
|
||||
|
||||
bool handle_manual_reset_replication(const MODULECMD_ARG* args, json_t** output)
|
||||
{
|
||||
mxb_assert(args->argc >= 1);
|
||||
mxb_assert(MODULECMD_GET_TYPE(&args->argv[0].type) == MODULECMD_ARG_MONITOR);
|
||||
mxb_assert(args->argc == 1 || MODULECMD_GET_TYPE(&args->argv[1].type) == MODULECMD_ARG_SERVER);
|
||||
|
||||
bool rv = false;
|
||||
if (config_get_global_options()->passive)
|
||||
{
|
||||
PRINT_MXS_JSON_ERROR(output, "Replication reset requested but not performed, as MaxScale is in "
|
||||
"passive mode.");
|
||||
}
|
||||
else
|
||||
{
|
||||
MXS_MONITOR* mon = args->argv[0].value.monitor;
|
||||
SERVER* server = args->argv[1].value.server;
|
||||
auto handle = static_cast<MariaDBMonitor*>(mon->instance);
|
||||
rv = handle->run_manual_reset_replication(server, output);
|
||||
}
|
||||
return rv;
|
||||
}
|
||||
|
||||
string monitored_servers_to_string(const ServerArray& servers)
|
||||
{
|
||||
string rval;
|
||||
@ -1000,6 +1033,23 @@ extern "C" MXS_MODULE* MXS_CREATE_MODULE()
|
||||
rejoin_argv,
|
||||
"Rejoin server to a cluster");
|
||||
|
||||
static modulecmd_arg_type_t reset_gtid_argv[] =
|
||||
{
|
||||
{
|
||||
MODULECMD_ARG_MONITOR | MODULECMD_ARG_NAME_MATCHES_DOMAIN,
|
||||
ARG_MONITOR_DESC
|
||||
},
|
||||
{MODULECMD_ARG_SERVER | MODULECMD_ARG_OPTIONAL, "Master server (optional)"}
|
||||
};
|
||||
|
||||
modulecmd_register_command(MXS_MODULE_NAME,
|
||||
"reset_replication",
|
||||
MODULECMD_TYPE_ACTIVE,
|
||||
handle_manual_reset_replication,
|
||||
MXS_ARRAY_NELEMS(reset_gtid_argv), reset_gtid_argv,
|
||||
"Delete slave connections, delete binary logs and "
|
||||
"set up replication (dangerous)");
|
||||
|
||||
static MXS_MODULE info =
|
||||
{
|
||||
MXS_MODULE_API_MONITOR,
|
||||
|
@ -97,6 +97,8 @@ public:
|
||||
*/
|
||||
bool run_manual_rejoin(SERVER* rejoin_server, json_t** error_out);
|
||||
|
||||
/**
 * Perform user-activated replication reset.
 *
 * @param master_server The server to use as master. If NULL, the monitor autoselects the master.
 * @param error_out Json error output
 * @return True if the command was executed and the reset was successful
 */
bool run_manual_reset_replication(SERVER* master_server, json_t** error_out);
|
||||
|
||||
protected:
|
||||
void pre_loop();
|
||||
void tick();
|
||||
@ -303,6 +305,7 @@ private:
|
||||
void disable_setting(const std::string& setting);
|
||||
bool check_sql_files();
|
||||
void enforce_read_only_on_slaves();
|
||||
/**
 * Reset replication of the cluster: remove all slave connections, delete binlogs, reset the gtid
 * sequence and direct the servers to replicate from the given master.
 *
 * @param master_server Server to use as master. If NULL, the current master is used.
 * @param error_out Error output
 * @return True if operation was successful
 */
bool manual_reset_replication(SERVER* master_server, json_t** error_out);
|
||||
};
|
||||
|
||||
/**
|
||||
|
@ -1301,6 +1301,29 @@ bool MariaDBServer::alter_event(const EventInfo& event, const string& target_sta
|
||||
return rval;
|
||||
}
|
||||
|
||||
/**
 * Stop and delete every slave connection listed in m_slave_status. Processing ends at the first
 * connection for which either command fails; connections handled before that stay deleted.
 *
 * @param error_out Error output
 * @return True if all connections were stopped and reset
 */
bool MariaDBServer::reset_all_slave_conns(json_t** error_out)
{
    string failure_reason;
    for (const auto& slave_conn : m_slave_status)
    {
        const char* conn_name = slave_conn.name.c_str();
        const string stop_query = string_printf("STOP SLAVE '%s';", conn_name);
        const string reset_query = string_printf("RESET SLAVE '%s' ALL;", conn_name);

        // The reset is only attempted if the stop succeeded.
        const bool conn_removed = execute_cmd(stop_query, &failure_reason)
            && execute_cmd(reset_query, &failure_reason);
        if (conn_removed)
        {
            continue;
        }

        // An empty name denotes the default (unnamed) slave connection.
        string log_message;
        if (slave_conn.name.empty())
        {
            log_message = string_printf("Error when reseting the default slave connection of '%s': %s",
                                        name(), failure_reason.c_str());
        }
        else
        {
            log_message = string_printf("Error when reseting the slave connection '%s' of '%s': %s",
                                        conn_name, name(), failure_reason.c_str());
        }
        PRINT_MXS_JSON_ERROR(error_out, "%s", log_message.c_str());
        return false;
    }
    return true;
}
|
||||
|
||||
string SlaveStatus::to_string() const
|
||||
{
|
||||
// Print all of this on the same line to make things compact. Are the widths reasonable? The format is
|
||||
|
@ -489,6 +489,14 @@ public:
|
||||
*/
|
||||
bool disable_events(BinlogMode binlog_mode, json_t** error_out);
|
||||
|
||||
/**
|
||||
* Stop and delete all slave connections.
|
||||
*
|
||||
* @param error_out Error output
|
||||
* @return True if successful. If false, some connections may have been successfully deleted.
|
||||
*/
|
||||
bool reset_all_slave_conns(json_t** error_out);
|
||||
|
||||
private:
|
||||
class EventInfo;
|
||||
typedef std::function<void (const EventInfo&, json_t** error_out)> ManipulatorFunc;
|
||||
|
Reference in New Issue
Block a user