Run manual commands without stopping the monitor

The command is saved in a function object which is read by the monitor
thread. This way, manual and automatic cluster modification commands are
run in the same step of a monitor cycle.

This update required several modifications in related code.
This commit is contained in:
Esa Korhonen
2018-06-21 18:34:27 +03:00
parent 6bf10904d7
commit 9525d3507b
5 changed files with 231 additions and 132 deletions

View File

@ -1227,24 +1227,25 @@ void MariaDBMonitor::assign_slave_and_relay_master(MariaDBServer* node)
} }
/** /**
* Should a new master server be selected? * Is the current master server still valid or should a new one be selected?
* *
* @param reason_out Output for a text description * @param reason_out Output for a text description
* @return True, if the current master has changed in a way that a new master should be selected. * @return True, if master is ok. False if the current master has changed in a way that
* a new master should be selected.
*/ */
bool MariaDBMonitor::master_no_longer_valid(std::string* reason_out) bool MariaDBMonitor::master_is_valid(std::string* reason_out)
{ {
// The master server of the cluster needs to be re-calculated in the following four cases: // The master server of the cluster needs to be re-calculated in the following four cases:
bool rval = false; bool rval = true;
// 1) There is no master. // 1) There is no master.
if (m_master == NULL) if (m_master == NULL)
{ {
rval = true; rval = false;
} }
// 2) read_only has been activated on the master. // 2) read_only has been activated on the master.
else if (m_master->is_read_only()) else if (m_master->is_read_only())
{ {
rval = true; rval = false;
*reason_out = "it is in read-only mode"; *reason_out = "it is in read-only mode";
} }
// 3) The master was a non-replicating master (not in a cycle) but now has a slave connection. // 3) The master was a non-replicating master (not in a cycle) but now has a slave connection.
@ -1253,7 +1254,7 @@ bool MariaDBMonitor::master_no_longer_valid(std::string* reason_out)
// The master should not have a master of its own. // The master should not have a master of its own.
if (!m_master->m_node.parents.empty()) if (!m_master->m_node.parents.empty())
{ {
rval = true; rval = false;
*reason_out = "it has started replicating from another server in the cluster"; *reason_out = "it has started replicating from another server in the cluster";
} }
} }
@ -1268,7 +1269,7 @@ bool MariaDBMonitor::master_no_longer_valid(std::string* reason_out)
// 4a) The master is no longer in a cycle. // 4a) The master is no longer in a cycle.
if (current_cycle_id == NodeData::CYCLE_NONE) if (current_cycle_id == NodeData::CYCLE_NONE)
{ {
rval = true; rval = false;
ServerArray& old_members = m_master_cycle_status.cycle_members; ServerArray& old_members = m_master_cycle_status.cycle_members;
string server_names_old = monitored_servers_to_string(old_members); string server_names_old = monitored_servers_to_string(old_members);
*reason_out = "it is no longer in the multimaster group (" + server_names_old + ")"; *reason_out = "it is no longer in the multimaster group (" + server_names_old + ")";
@ -1279,7 +1280,7 @@ bool MariaDBMonitor::master_no_longer_valid(std::string* reason_out)
ServerArray& current_members = m_cycles[current_cycle_id]; ServerArray& current_members = m_cycles[current_cycle_id];
if (cycle_has_master_server(current_members)) if (cycle_has_master_server(current_members))
{ {
rval = true; rval = false;
string server_names_current = monitored_servers_to_string(current_members); string server_names_current = monitored_servers_to_string(current_members);
*reason_out = "a server in the master's multimaster group (" + server_names_current + *reason_out = "a server in the master's multimaster group (" + server_names_current +
") is replicating from a server not in the group"; ") is replicating from a server not in the group";

View File

@ -25,17 +25,6 @@ static void print_redirect_errors(MariaDBServer* first_server, const ServerArray
bool MariaDBMonitor::manual_switchover(SERVER* new_master, SERVER* current_master, json_t** error_out) bool MariaDBMonitor::manual_switchover(SERVER* new_master, SERVER* current_master, json_t** error_out)
{ {
bool running = is_running();
if (running)
{
stop();
MXS_NOTICE("Stopped the monitor %s for the duration of switchover.", m_monitor->name);
}
else
{
MXS_NOTICE("Monitor %s already stopped, switchover can proceed.", m_monitor->name);
}
/* It's possible for either current_master, or both new_master & current_master to be NULL, which means /* It's possible for either current_master, or both new_master & current_master to be NULL, which means
* autoselect. Only autoselecting new_master is not possible. Autoselection will happen at the actual * autoselect. Only autoselecting new_master is not possible. Autoselection will happen at the actual
* switchover function. */ * switchover function. */
@ -70,27 +59,11 @@ bool MariaDBMonitor::manual_switchover(SERVER* new_master, SERVER* current_maste
} }
} }
if (running)
{
// TODO: What if this fails?
start(m_monitor->parameters);
}
return rval; return rval;
} }
bool MariaDBMonitor::manual_failover(json_t** output) bool MariaDBMonitor::manual_failover(json_t** output)
{ {
bool running = is_running();
if (running)
{
stop();
MXS_NOTICE("Stopped monitor %s for the duration of failover.", m_monitor->name);
}
else
{
MXS_NOTICE("Monitor %s already stopped, failover can proceed.", m_monitor->name);
}
bool rv = true; bool rv = true;
string failover_error; string failover_error;
rv = failover_check(&failover_error); rv = failover_check(&failover_error);
@ -112,27 +85,11 @@ bool MariaDBMonitor::manual_failover(json_t** output)
failover_error.c_str()); failover_error.c_str());
} }
if (running)
{
// TODO: What if this fails?
start(m_monitor->parameters);
}
return rv; return rv;
} }
bool MariaDBMonitor::manual_rejoin(SERVER* rejoin_server, json_t** output) bool MariaDBMonitor::manual_rejoin(SERVER* rejoin_server, json_t** output)
{ {
bool running = is_running();
if (running)
{
stop();
MXS_NOTICE("Stopped monitor %s for the duration of rejoin.", m_monitor->name);
}
else
{
MXS_NOTICE("Monitor %s already stopped, rejoin can proceed.", m_monitor->name);
}
bool rval = false; bool rval = false;
if (cluster_can_be_joined()) if (cluster_can_be_joined())
{ {
@ -188,11 +145,6 @@ bool MariaDBMonitor::manual_rejoin(SERVER* rejoin_server, json_t** output)
PRINT_MXS_JSON_ERROR(output, BAD_CLUSTER, m_monitor->name); PRINT_MXS_JSON_ERROR(output, BAD_CLUSTER, m_monitor->name);
} }
if (running)
{
// TODO: What if this fails?
start(m_monitor->parameters);
}
return rval; return rval;
} }
@ -351,6 +303,7 @@ uint32_t MariaDBMonitor::do_rejoin(const ServerArray& joinable_servers, json_t**
if (op_success) if (op_success)
{ {
servers_joined++; servers_joined++;
m_cluster_modified = true;
} }
} }
} }
@ -587,6 +540,7 @@ bool MariaDBMonitor::do_switchover(MariaDBServer** current_master, MariaDBServer
// Step 2: Set read-only to on, flush logs, update master gtid:s // Step 2: Set read-only to on, flush logs, update master gtid:s
if (switchover_demote_master(demotion_target, err_out)) if (switchover_demote_master(demotion_target, err_out))
{ {
m_cluster_modified = true;
bool catchup_and_promote_success = false; bool catchup_and_promote_success = false;
time_t step2_time = time(NULL); time_t step2_time = time(NULL);
seconds_remaining -= difftime(step2_time, start_time); seconds_remaining -= difftime(step2_time, start_time);
@ -606,6 +560,8 @@ bool MariaDBMonitor::do_switchover(MariaDBServer** current_master, MariaDBServer
if (promote_new_master(promotion_target, err_out)) if (promote_new_master(promotion_target, err_out))
{ {
catchup_and_promote_success = true; catchup_and_promote_success = true;
m_next_master = promotion_target;
// Step 5: Redirect slaves and start replication on old master. // Step 5: Redirect slaves and start replication on old master.
ServerArray redirected_slaves; ServerArray redirected_slaves;
bool start_ok = switchover_start_slave(demotion_target, promotion_target); bool start_ok = switchover_start_slave(demotion_target, promotion_target);
@ -706,6 +662,8 @@ bool MariaDBMonitor::do_failover(json_t** err_out)
// Step 3: Stop and reset slave, set read-only to 0. // Step 3: Stop and reset slave, set read-only to 0.
if (promote_new_master(new_master, err_out)) if (promote_new_master(new_master, err_out))
{ {
m_next_master = new_master;
m_cluster_modified = true;
// Step 4: Redirect slaves. // Step 4: Redirect slaves.
ServerArray redirected_slaves; ServerArray redirected_slaves;
int redirects = redirect_slaves(new_master, redirectable_slaves, &redirected_slaves); int redirects = redirect_slaves(new_master, redirectable_slaves, &redirected_slaves);
@ -1378,17 +1336,14 @@ bool MariaDBMonitor::failover_check(string* error_out)
* If a master failure has occurred and MaxScale is configured with failover functionality, this function * If a master failure has occurred and MaxScale is configured with failover functionality, this function
* executes failover to select and promote a new master server. This function should be called immediately * executes failover to select and promote a new master server. This function should be called immediately
* after @c mon_process_state_changes. If an error occurs, this method disables automatic failover. * after @c mon_process_state_changes. If an error occurs, this method disables automatic failover.
*
* @return True if failover was performed, or at least attempted
*/ */
bool MariaDBMonitor::handle_auto_failover() void MariaDBMonitor::handle_auto_failover()
{ {
const char RE_ENABLE_FMT[] = "%s To re-enable failover, manually set '%s' to 'true' for monitor " const char RE_ENABLE_FMT[] = "%s To re-enable failover, manually set '%s' to 'true' for monitor "
"'%s' via MaxAdmin or the REST API, or restart MaxScale."; "'%s' via MaxAdmin or the REST API, or restart MaxScale.";
bool cluster_modified = false; if (m_master && m_master->is_master())
if (config_get_global_options()->passive || (m_master && m_master->is_master()))
{ {
return cluster_modified; return;
} }
if (failover_not_possible()) if (failover_not_possible())
@ -1400,14 +1355,14 @@ bool MariaDBMonitor::handle_auto_failover()
MXS_ERROR(RE_ENABLE_FMT, PROBLEMS, CN_AUTO_FAILOVER, m_monitor->name); MXS_ERROR(RE_ENABLE_FMT, PROBLEMS, CN_AUTO_FAILOVER, m_monitor->name);
m_auto_failover = false; m_auto_failover = false;
disable_setting(CN_AUTO_FAILOVER); disable_setting(CN_AUTO_FAILOVER);
return cluster_modified; return;
} }
// If master seems to be down, check if slaves are receiving events. // If master seems to be down, check if slaves are receiving events.
if (m_verify_master_failure && m_master && m_master->is_down() && slave_receiving_events()) if (m_verify_master_failure && m_master && m_master->is_down() && slave_receiving_events())
{ {
MXS_INFO("Master failure not yet confirmed by slaves, delaying failover."); MXS_INFO("Master failure not yet confirmed by slaves, delaying failover.");
return cluster_modified; return;
} }
MariaDBServer* failed_master = NULL; MariaDBServer* failed_master = NULL;
@ -1463,7 +1418,6 @@ bool MariaDBMonitor::handle_auto_failover()
m_auto_failover = false; m_auto_failover = false;
disable_setting(CN_AUTO_FAILOVER); disable_setting(CN_AUTO_FAILOVER);
} }
cluster_modified = true;
} }
else else
{ {
@ -1482,8 +1436,6 @@ bool MariaDBMonitor::handle_auto_failover()
{ {
m_warn_failover_precond = true; m_warn_failover_precond = true;
} }
return cluster_modified;
} }
bool MariaDBMonitor::failover_not_possible() bool MariaDBMonitor::failover_not_possible()

View File

@ -55,7 +55,8 @@ MariaDBMonitor::MariaDBMonitor(MXS_MONITOR* monitor)
, m_id(config_get_global_options()->id) , m_id(config_get_global_options()->id)
, m_master_gtid_domain(GTID_DOMAIN_UNKNOWN) , m_master_gtid_domain(GTID_DOMAIN_UNKNOWN)
, m_external_master_port(PORT_UNKNOWN) , m_external_master_port(PORT_UNKNOWN)
, m_cluster_modified(true) , m_cluster_topology_changed(true)
, m_cluster_modified(false)
, m_switchover_on_low_disk_space(false) , m_switchover_on_low_disk_space(false)
, m_warn_set_standalone_master(true) , m_warn_set_standalone_master(true)
, m_log_no_master(true) , m_log_no_master(true)
@ -102,6 +103,7 @@ void MariaDBMonitor::clear_server_info()
m_servers_by_id.clear(); m_servers_by_id.clear();
m_excluded_servers.clear(); m_excluded_servers.clear();
m_master = NULL; m_master = NULL;
m_next_master = NULL;
m_master_gtid_domain = GTID_DOMAIN_UNKNOWN; m_master_gtid_domain = GTID_DOMAIN_UNKNOWN;
m_external_master_host.clear(); m_external_master_host.clear();
m_external_master_port = PORT_UNKNOWN; m_external_master_port = PORT_UNKNOWN;
@ -293,7 +295,6 @@ json_t* MariaDBMonitor::diagnostics_json() const
*/ */
void MariaDBMonitor::update_server(MariaDBServer& server) void MariaDBMonitor::update_server(MariaDBServer& server)
{ {
server.m_topology_changed = false;
MXS_MONITORED_SERVER* mon_srv = server.m_server_base; MXS_MONITORED_SERVER* mon_srv = server.m_server_base;
/* Monitor server if not in maintenance. */ /* Monitor server if not in maintenance. */
bool in_maintenance = server.is_in_maintenance(); bool in_maintenance = server.is_in_maintenance();
@ -395,18 +396,18 @@ void MariaDBMonitor::tick()
} }
// Query all servers for their status. // Query all servers for their status.
bool topology_changed = false;
for (auto iter = m_servers.begin(); iter != m_servers.end(); iter++) for (auto iter = m_servers.begin(); iter != m_servers.end(); iter++)
{ {
MariaDBServer* server = *iter; MariaDBServer* server = *iter;
update_server(*server); update_server(*server);
if (server->m_topology_changed) if (server->m_topology_changed)
{ {
topology_changed = true; m_cluster_topology_changed = true;
server->m_topology_changed = false;
} }
} }
if (topology_changed) if (m_cluster_topology_changed)
{ {
// This means that a server id or a slave connection has changed, or read_only was set. // This means that a server id or a slave connection has changed, or read_only was set.
// Update the server id array and check various things. // Update the server id array and check various things.
@ -417,6 +418,16 @@ void MariaDBMonitor::tick()
} }
build_replication_graph(); build_replication_graph();
find_graph_cycles(); find_graph_cycles();
/* Check if a failover/switchover was performed last loop and the master should change.
* In this case, update the master and its cycle info here. */
if (m_next_master)
{
m_master = m_next_master;
update_master_cycle_info();
m_next_master = NULL;
}
// Find the server that looks like it would be the best master. It does not yet overwrite the // Find the server that looks like it would be the best master. It does not yet overwrite the
// current master. // current master.
string topology_messages; string topology_messages;
@ -424,7 +435,19 @@ void MariaDBMonitor::tick()
// Check if current master is still valid. // Check if current master is still valid.
string reason; string reason;
if (master_no_longer_valid(&reason)) if (master_is_valid(&reason))
{
// Update master cycle info in case it has changed
update_master_cycle_info();
if (root_master && m_master != root_master)
{
// Master is still valid but it is no longer the best master. Print a warning.
MXS_WARNING("'%s' is a better master candidate than the current master '%s'. "
"Master will change if '%s' is no longer a valid master.",
root_master->name(), m_master->name(), m_master->name());
}
}
else
{ {
if (m_master && !reason.empty()) if (m_master && !reason.empty())
{ {
@ -444,36 +467,17 @@ void MariaDBMonitor::tick()
} }
m_master = root_master; m_master = root_master;
update_master_cycle_info();
if (m_master) if (m_master)
{ {
// A new master has been set. Save some data regarding the type of the master.
int new_cycle_id = m_master->m_node.cycle;
m_master_cycle_status.cycle_id = new_cycle_id;
if (new_cycle_id == NodeData::CYCLE_NONE)
{
m_master_cycle_status.cycle_members.clear();
}
else
{
m_master_cycle_status.cycle_members = m_cycles[new_cycle_id];
}
MXS_NOTICE("'%s' is the best master candidate.", m_master->name()); MXS_NOTICE("'%s' is the best master candidate.", m_master->name());
} }
else else
{ {
// The current master cannot be used and no proper candidate exists.
m_master_cycle_status.cycle_id = NodeData::CYCLE_NONE;
m_master_cycle_status.cycle_members.clear();
MXS_WARNING("No valid master servers found."); MXS_WARNING("No valid master servers found.");
} }
} }
else if (root_master && m_master != root_master) m_cluster_topology_changed = false;
{
// Master is still valid but it is no longer the best master. Print a warning.
MXS_WARNING("'%s' is a better master candidate than the current master '%s'. "
"Master will change if '%s' is no longer a valid master.",
root_master->name(), m_master->name(), m_master->name());
}
} }
// Always re-assign master, slave etc bits as these depend on other factors outside topology // Always re-assign master, slave etc bits as these depend on other factors outside topology
@ -542,19 +546,38 @@ void MariaDBMonitor::process_state_changes()
MonitorInstance::process_state_changes(); MonitorInstance::process_state_changes();
m_cluster_modified = false; m_cluster_modified = false;
if (m_auto_failover) // Check for manual commands
if (m_manual_cmd.command_waiting_exec)
{ {
if ((m_cluster_modified = handle_auto_failover())) // Looks like a command is waiting. Lock mutex, check again and wait for the condition variable.
std::unique_lock<std::mutex> lock(m_manual_cmd.mutex);
if (m_manual_cmd.command_waiting_exec)
{ {
// Force a master selection on next monitor loop, otherwise the old master would stay. m_manual_cmd.has_command.wait(lock, [this]{return m_manual_cmd.command_waiting_exec;});
m_master = NULL; m_manual_cmd.method();
m_manual_cmd.command_waiting_exec = false;
m_manual_cmd.result_waiting = true;
// Manual command ran, signal the sender to continue.
lock.unlock();
m_manual_cmd.has_result.notify_one();
} }
else
{
// There was no command after all.
lock.unlock();
}
}
if (!config_get_global_options()->passive)
{
if (m_auto_failover && !m_cluster_modified)
{
handle_auto_failover();
} }
// Do not auto-join servers on this monitor loop if a failover (or any other cluster modification) // Do not auto-join servers on this monitor loop if a failover (or any other cluster modification)
// has been performed, as server states have not been updated yet. It will happen next iteration. // has been performed, as server states have not been updated yet. It will happen next iteration.
if (!config_get_global_options()->passive && m_auto_rejoin && !m_cluster_modified && if (m_auto_rejoin && !m_cluster_modified && cluster_can_be_joined())
cluster_can_be_joined())
{ {
// Check if any servers should be autojoined to the cluster and try to join them. // Check if any servers should be autojoined to the cluster and try to join them.
handle_auto_rejoin(); handle_auto_rejoin();
@ -563,10 +586,37 @@ void MariaDBMonitor::process_state_changes()
/* Check if any slave servers have read-only off and turn it on if user so wishes. Again, do not /* Check if any slave servers have read-only off and turn it on if user so wishes. Again, do not
* perform this if cluster has been modified this loop since it may not be clear which server * perform this if cluster has been modified this loop since it may not be clear which server
* should be a slave. */ * should be a slave. */
if (!config_get_global_options()->passive && m_enforce_read_only_slaves && !m_cluster_modified) if (m_enforce_read_only_slaves && !m_cluster_modified)
{ {
enforce_read_only_on_slaves(); enforce_read_only_on_slaves();
} }
}
}
/**
* Save info on the master server's multimaster group, if any. This is required when checking for changes
* in the topology.
*
* Reads m_master and m_cycles, and overwrites m_master_cycle_status (cycle id and member list).
* When there is no master, the saved status is reset to "no cycle" with an empty member list.
*/
void MariaDBMonitor::update_master_cycle_info()
{
if (m_master)
{
// Remember which cycle the current master belongs to.
int new_cycle_id = m_master->m_node.cycle;
m_master_cycle_status.cycle_id = new_cycle_id;
if (new_cycle_id == NodeData::CYCLE_NONE)
{
// The master is not in a cycle, so there are no members to remember.
m_master_cycle_status.cycle_members.clear();
}
else
{
// Store a copy of the member list of the master's cycle.
m_master_cycle_status.cycle_members = m_cycles[new_cycle_id];
}
}
else
{
// No master at all: clear any previously saved cycle data.
m_master_cycle_status.cycle_id = NodeData::CYCLE_NONE;
m_master_cycle_status.cycle_members.clear();
}
} }
void MariaDBMonitor::update_gtid_domain() void MariaDBMonitor::update_gtid_domain()
@ -675,7 +725,6 @@ void MariaDBMonitor::handle_auto_rejoin()
if (joins > 0) if (joins > 0)
{ {
MXS_NOTICE("%d server(s) redirected or rejoined the cluster.", joins); MXS_NOTICE("%d server(s) redirected or rejoined the cluster.", joins);
m_cluster_modified = true;
} }
} }
else else
@ -968,6 +1017,78 @@ bool MariaDBMonitor::check_sql_files()
return rval; return rval;
} }
/**
 * Schedule a manual command for execution. It will be run during the next monitor loop. This method
 * blocks until the monitor has finished running the command.
 *
 * @param command   Function object containing the method the monitor should execute: switchover,
 *                  failover or rejoin.
 * @param error_out Json error output
 * @return True if command execution was attempted. False if monitor was in an invalid state
 *         to run the command.
 */
bool MariaDBMonitor::execute_manual_command(std::function<void (void)> command, json_t** error_out)
{
    if (state() != MXS_MONITOR_RUNNING)
    {
        PRINT_MXS_JSON_ERROR(error_out, "The monitor is not running, cannot execute manual command.");
        return false;
    }

    // Take the mutex before inspecting the guard flags: they are written under this mutex, so
    // checking them unlocked would let two callers both pass the check and overwrite each other.
    std::unique_lock<std::mutex> lock(m_manual_cmd.mutex);

    // A set result_waiting flag means an earlier caller has not yet collected its result. Accepting a
    // new command now would let this call consume that wake-up and return before its own command ran.
    if (m_manual_cmd.command_waiting_exec || m_manual_cmd.result_waiting)
    {
        PRINT_MXS_JSON_ERROR(error_out,
                             "Previous command has not been executed, cannot send another command.");
        ss_dassert(!true);
        return false;
    }

    // Write the command.
    m_manual_cmd.method = command;
    m_manual_cmd.command_waiting_exec = true;

    // Signal the monitor thread to start running the command. The mutex is released first so a
    // woken waiter does not immediately block on it.
    lock.unlock();
    m_manual_cmd.has_command.notify_one();

    // Wait for the result. The predicate guards against spurious wake-ups.
    lock.lock();
    m_manual_cmd.has_result.wait(lock, [this]{return m_manual_cmd.result_waiting;});
    m_manual_cmd.result_waiting = false;
    return true;
}
bool MariaDBMonitor::run_manual_switchover(SERVER* new_master, SERVER* current_master, json_t** error_out)
{
    // Outcome of the switchover itself. The task below writes it while this call is blocked
    // inside execute_manual_command().
    bool switchover_ok = false;
    auto task = [this, &switchover_ok, new_master, current_master, error_out]()
    {
        switchover_ok = manual_switchover(new_master, current_master, error_out);
    };

    // If the command could not even be handed over, the switchover result is irrelevant.
    if (!execute_manual_command(task, error_out))
    {
        return false;
    }
    return switchover_ok;
}
bool MariaDBMonitor::run_manual_failover(json_t** error_out)
{
    // Outcome of the failover itself. The task below writes it while this call is blocked
    // inside execute_manual_command().
    bool failover_ok = false;
    auto task = [this, &failover_ok, error_out]()
    {
        failover_ok = manual_failover(error_out);
    };

    // If the command could not even be handed over, the failover result is irrelevant.
    if (!execute_manual_command(task, error_out))
    {
        return false;
    }
    return failover_ok;
}
bool MariaDBMonitor::run_manual_rejoin(SERVER* rejoin_server, json_t** error_out)
{
    // Outcome of the rejoin itself. The task below writes it while this call is blocked
    // inside execute_manual_command().
    bool rejoin_ok = false;
    auto task = [this, &rejoin_ok, rejoin_server, error_out]()
    {
        rejoin_ok = manual_rejoin(rejoin_server, error_out);
    };

    // If the command could not even be handed over, the rejoin result is irrelevant.
    if (!execute_manual_command(task, error_out))
    {
        return false;
    }
    return rejoin_ok;
}
/** /**
* Command handler for 'switchover' * Command handler for 'switchover'
* *
@ -995,7 +1116,7 @@ bool handle_manual_switchover(const MODULECMD_ARG* args, json_t** error_out)
auto handle = static_cast<MariaDBMonitor*>(mon->instance); auto handle = static_cast<MariaDBMonitor*>(mon->instance);
SERVER* new_master = (args->argc >= 2) ? args->argv[1].value.server : NULL; SERVER* new_master = (args->argc >= 2) ? args->argv[1].value.server : NULL;
SERVER* current_master = (args->argc == 3) ? args->argv[2].value.server : NULL; SERVER* current_master = (args->argc == 3) ? args->argv[2].value.server : NULL;
rval = handle->manual_switchover(new_master, current_master, error_out); rval = handle->run_manual_switchover(new_master, current_master, error_out);
} }
return rval; return rval;
} }
@ -1021,7 +1142,7 @@ bool handle_manual_failover(const MODULECMD_ARG* args, json_t** output)
{ {
MXS_MONITOR* mon = args->argv[0].value.monitor; MXS_MONITOR* mon = args->argv[0].value.monitor;
auto handle = static_cast<MariaDBMonitor*>(mon->instance); auto handle = static_cast<MariaDBMonitor*>(mon->instance);
rv = handle->manual_failover(output); rv = handle->run_manual_failover(output);
} }
return rv; return rv;
} }
@ -1049,7 +1170,7 @@ bool handle_manual_rejoin(const MODULECMD_ARG* args, json_t** output)
MXS_MONITOR* mon = args->argv[0].value.monitor; MXS_MONITOR* mon = args->argv[0].value.monitor;
SERVER* server = args->argv[1].value.server; SERVER* server = args->argv[1].value.server;
auto handle = static_cast<MariaDBMonitor*>(mon->instance); auto handle = static_cast<MariaDBMonitor*>(mon->instance);
rv = handle->manual_rejoin(server, output); rv = handle->run_manual_rejoin(server, output);
} }
return rv; return rv;
} }

View File

@ -13,6 +13,8 @@
* Public License. * Public License.
*/ */
#include "mariadbmon_common.hh" #include "mariadbmon_common.hh"
#include <condition_variable>
#include <functional>
#include <string> #include <string>
#include <tr1/unordered_map> #include <tr1/unordered_map>
#include <vector> #include <vector>
@ -69,32 +71,31 @@ public:
static MariaDBMonitor* create(MXS_MONITOR *monitor); static MariaDBMonitor* create(MXS_MONITOR *monitor);
/** /**
* Handle switchover * Perform user-activated switchover.
* *
* @new_master The specified new master * @param new_master The specified new master. If NULL, monitor will autoselect.
* @current_master The specified current master. If NULL, monitor will autoselect. * @param current_master The specified current master. If NULL, monitor will autoselect.
* @output Pointer where to place output object * @param error_out Json error output
* * @return True if switchover was performed
* @return True, if switchover was performed, false otherwise.
*/ */
bool manual_switchover(SERVER* new_master, SERVER* current_master, json_t** error_out); bool run_manual_switchover(SERVER* new_master, SERVER* current_master, json_t** error_out);
/** /**
* Perform user-activated failover. * Perform user-activated failover.
* *
* @param output Json error output * @param error_out Json error output
* @return True on success * @return True if failover was performed
*/ */
bool manual_failover(json_t** output); bool run_manual_failover(json_t** error_out);
/** /**
* Perform user-activated rejoin * Perform user-activated rejoin
* *
* @param rejoin_server Server to join * @param rejoin_server Server to join
* @param output Json error output * @param error_out Json error output
* @return True on success * @return True if rejoin was performed
*/ */
bool manual_rejoin(SERVER* rejoin_server, json_t** output); bool run_manual_rejoin(SERVER* rejoin_server, json_t** error_out);
protected: protected:
void pre_loop(); void pre_loop();
@ -109,17 +110,35 @@ private:
ServerArray cycle_members; ServerArray cycle_members;
}; };
/**
 * Carries one manual command from the MaxAdmin thread to the monitor thread, and the completion
 * signal back again. The monitor handles a single manual command at a time; the admin thread
 * already enforces that.
 */
struct ManualCommand
{
    std::mutex                 mutex;                        /**< Mutex paired with both condition variables */
    std::condition_variable    has_command;                  /**< Signalled when a command awaits execution */
    bool                       command_waiting_exec {false}; /**< Guard flag for has_command */
    std::function<void (void)> method;                       /**< Callable to invoke when executing the command */
    std::condition_variable    has_result;                   /**< Signalled once the command has run */
    bool                       result_waiting {false};       /**< Guard flag for has_result */
};
unsigned long m_id; /**< Monitor ID */ unsigned long m_id; /**< Monitor ID */
ServerArray m_servers; /**< Servers of the monitor */ ServerArray m_servers; /**< Servers of the monitor */
ServerInfoMap m_server_info; /**< Map from server base struct to MariaDBServer */ ServerInfoMap m_server_info; /**< Map from server base struct to MariaDBServer */
ManualCommand m_manual_cmd; /**< Communicates manual commands and results */
// Values updated by monitor // Values updated by monitor
MariaDBServer* m_master; /**< Master server for Master/Slave replication */ MariaDBServer* m_master; /**< Master server for Master/Slave replication */
MariaDBServer* m_next_master; /**< When master changes because of a failover/switchover, the new
* master is written here so the next monitor loop picks it up. */
IdToServerMap m_servers_by_id; /**< Map from server id:s to MariaDBServer */ IdToServerMap m_servers_by_id; /**< Map from server id:s to MariaDBServer */
int64_t m_master_gtid_domain; /**< gtid_domain_id most recently seen on the master */ int64_t m_master_gtid_domain; /**< gtid_domain_id most recently seen on the master */
std::string m_external_master_host; /**< External master host, for fail/switchover */ std::string m_external_master_host; /**< External master host, for fail/switchover */
int m_external_master_port; /**< External master port */ int m_external_master_port; /**< External master port */
bool m_cluster_modified; /**< Has an automatic failover/rejoin been performed this loop? */ bool m_cluster_topology_changed; /**< Has cluster topology changed since last monitor loop? */
bool m_cluster_modified; /**< Has a failover/switchover/rejoin been performed this loop? */
CycleMap m_cycles; /**< Map from cycle number to cycle member servers */ CycleMap m_cycles; /**< Map from cycle number to cycle member servers */
CycleInfo m_master_cycle_status; /**< Info about master server cycle from previous round */ CycleInfo m_master_cycle_status; /**< Info about master server cycle from previous round */
@ -176,6 +195,7 @@ private:
bool set_replication_credentials(const MXS_CONFIG_PARAMETER* params); bool set_replication_credentials(const MXS_CONFIG_PARAMETER* params);
MariaDBServer* get_server_info(MXS_MONITORED_SERVER* db); MariaDBServer* get_server_info(MXS_MONITORED_SERVER* db);
MariaDBServer* get_server(int64_t id); MariaDBServer* get_server(int64_t id);
bool execute_manual_command(std::function<void (void)> command, json_t** error_out);
// Cluster discovery and status assignment methods // Cluster discovery and status assignment methods
void update_server(MariaDBServer& server); void update_server(MariaDBServer& server);
@ -206,10 +226,12 @@ private:
MariaDBServer* find_master_inside_cycle(ServerArray& cycle_servers); MariaDBServer* find_master_inside_cycle(ServerArray& cycle_servers);
void assign_master_and_slave(); void assign_master_and_slave();
void assign_slave_and_relay_master(MariaDBServer* node); void assign_slave_and_relay_master(MariaDBServer* node);
bool master_no_longer_valid(std::string* reason_out); bool master_is_valid(std::string* reason_out);
bool cycle_has_master_server(ServerArray& cycle_servers); bool cycle_has_master_server(ServerArray& cycle_servers);
void update_master_cycle_info();
// Switchover methods // Switchover methods
bool manual_switchover(SERVER* new_master, SERVER* current_master, json_t** error_out);
bool switchover_check(SERVER* new_master, SERVER* current_master, bool switchover_check(SERVER* new_master, SERVER* current_master,
MariaDBServer** new_master_out, MariaDBServer** current_master_out, MariaDBServer** new_master_out, MariaDBServer** current_master_out,
json_t** error_out); json_t** error_out);
@ -225,13 +247,15 @@ private:
bool switchover_start_slave(MariaDBServer* old_master, MariaDBServer* new_master); bool switchover_start_slave(MariaDBServer* old_master, MariaDBServer* new_master);
// Failover methods // Failover methods
bool handle_auto_failover(); bool manual_failover(json_t** output);
void handle_auto_failover();
bool failover_not_possible(); bool failover_not_possible();
bool slave_receiving_events(); bool slave_receiving_events();
bool failover_check(std::string* error_out); bool failover_check(std::string* error_out);
bool do_failover(json_t** err_out); bool do_failover(json_t** err_out);
// Rejoin methods // Rejoin methods
bool manual_rejoin(SERVER* rejoin_server, json_t** output);
bool cluster_can_be_joined(); bool cluster_can_be_joined();
void handle_auto_rejoin(); void handle_auto_rejoin();
bool get_joinable_servers(ServerArray* output); bool get_joinable_servers(ServerArray* output);

View File

@ -49,6 +49,7 @@ MariaDBServer::MariaDBServer(MXS_MONITORED_SERVER* monitored_server, int config_
, m_heartbeat_period(0) , m_heartbeat_period(0)
, m_latest_event(0) , m_latest_event(0)
, m_gtid_domain_id(GTID_DOMAIN_UNKNOWN) , m_gtid_domain_id(GTID_DOMAIN_UNKNOWN)
, m_topology_changed(true)
, m_print_update_errormsg(true) , m_print_update_errormsg(true)
{ {
ss_dassert(monitored_server); ss_dassert(monitored_server);