MXS-1905 Switchover if master is low on disk space

Required quite a bit of refactoring.
This commit is contained in:
Esa Korhonen
2018-08-01 10:39:30 +03:00
parent 84b3e4672f
commit c0bd5ca3a1
8 changed files with 577 additions and 255 deletions

View File

@ -350,9 +350,14 @@ inline bool server_is_slave_of_ext_master(const SERVER* server)
(SERVER_RUNNING | SERVER_SLAVE_OF_EXT_MASTER));
}
/**
 * Test whether a status bitfield has the disk-space-exhausted flag set.
 *
 * @param status Status bits to inspect
 * @return True if SERVER_DISK_SPACE_EXHAUSTED is set in the given bits
 */
inline bool status_is_disk_space_exhausted(uint64_t status)
{
    // Spell out the integer-to-bool conversion instead of relying on the implicit one.
    return (status & SERVER_DISK_SPACE_EXHAUSTED) != 0;
}
/**
 * Test whether a server's status has the disk-space-exhausted flag set.
 *
 * @param server Server to inspect. Dereferenced unconditionally, so it must not be null.
 * @return True if SERVER_DISK_SPACE_EXHAUSTED is set in server->status
 */
inline bool server_is_disk_space_exhausted(const SERVER* server)
{
    // Delegate to the status-based helper so the flag test lives in exactly one place.
    return status_is_disk_space_exhausted(server->status);
}
/**

View File

@ -21,45 +21,60 @@
using std::string;
using maxscale::string_printf;
static const char RE_ENABLE_FMT[] = "To re-enable automatic %s, manually set '%s' to 'true' "
"for monitor '%s' via MaxAdmin or the REST API, or restart MaxScale.";
static void print_redirect_errors(MariaDBServer* first_server, const ServerArray& servers,
json_t** err_out);
bool MariaDBMonitor::manual_switchover(SERVER* new_master, SERVER* current_master, json_t** error_out)
/**
* Run a manual switchover, promoting a new master server and demoting the existing master.
*
* @param promotion_server The server which should be promoted. If null, monitor will autoselect.
* @param demotion_server The server which should be demoted. Can be null for autoselect, in which case
* monitor will select the cluster master server. Otherwise must be a valid master server or a relay.
* @param error_out Error output
* @return True, if switchover was performed successfully
*/
bool MariaDBMonitor::manual_switchover(SERVER* promotion_server, SERVER* demotion_server, json_t** error_out)
{
/* It's possible for either current_master, or both new_master & current_master to be NULL, which means
* autoselect. Only autoselecting new_master is not possible. Autoselection will happen at the actual
* switchover function. */
MariaDBServer *found_new_master = NULL, *found_curr_master = NULL;
auto ok_to_switch = switchover_check(new_master, current_master, &found_new_master, &found_curr_master,
error_out);
/* The server parameters may be null, in which case the monitor will autoselect.
*
* Manual commands (as well as automatic ones) are run at the end of a normal monitor loop,
* so server states can be assumed to be up-to-date.
*/
MariaDBServer* promotion_target = NULL;
MariaDBServer* demotion_target = NULL;
auto ok_to_switch = switchover_prepare(promotion_server, demotion_server,
&promotion_target, &demotion_target, error_out);
bool rval = false;
if (ok_to_switch)
{
bool switched = do_switchover(&found_curr_master, &found_new_master, error_out);
const char AUTOSELECT[] = "<autoselect>";
const char* curr_master_name = found_curr_master ? found_curr_master->name() : AUTOSELECT;
const char* new_master_name = found_new_master ? found_new_master->name() : AUTOSELECT;
bool switched = do_switchover(demotion_target, promotion_target, error_out);
if (switched)
{
MXS_NOTICE("Switchover %s -> %s performed.", curr_master_name, new_master_name);
MXS_NOTICE("Switchover %s -> %s performed.", demotion_target->name(), promotion_target->name());
rval = true;
}
else
{
string format = "Switchover %s -> %s failed";
string msg = string_printf("Switchover %s -> %s failed",
demotion_target->name(), promotion_target->name());
bool failover_setting = config_get_bool(m_monitor->parameters, CN_AUTO_FAILOVER);
if (failover_setting)
{
disable_setting(CN_AUTO_FAILOVER);
format += ", automatic failover has been disabled.";
msg += ", automatic failover has been disabled";
}
format += ".";
PRINT_MXS_JSON_ERROR(error_out, format.c_str(), curr_master_name, new_master_name);
msg += ".";
PRINT_MXS_JSON_ERROR(error_out, "%s", msg.c_str());
}
}
else
{
PRINT_MXS_JSON_ERROR(error_out, "Switchover cancelled.");
}
return rval;
}
@ -446,96 +461,35 @@ bool MariaDBMonitor::server_is_rejoin_suspect(MariaDBServer* rejoin_cand, json_t
/**
* Performs switchover for a simple topology (1 master, N slaves, no intermediate masters). If an
* intermediate step fails, the cluster may be left without a master.
* intermediate step fails, the cluster may be left without a master and manual intervention is
* required to fix things.
*
* @param current_master Handle to current master server. If null, the autoselected server is written here.
* @param new_master Handle to slave which should be promoted. If null, the autoselected server is written
* here.
* @param demotion_target Server to demote
* @param promotion_target Server to promote
* @param err_out json object for error printing. Can be NULL.
* @return True if successful. If false, the cluster can be in various situations depending on which step
* failed. In practice, manual intervention is usually required on failure.
* @return True if successful. If false, replication may be broken.
*/
bool MariaDBMonitor::do_switchover(MariaDBServer** current_master, MariaDBServer** new_master,
bool MariaDBMonitor::do_switchover(MariaDBServer* demotion_target, MariaDBServer* promotion_target,
json_t** err_out)
{
ss_dassert(current_master && new_master);
MariaDBServer* demotion_target = NULL;
if (*current_master == NULL)
{
// Autoselect current master.
if (m_master && m_master->is_master())
{
demotion_target = m_master;
*current_master = demotion_target;
}
else
{
PRINT_MXS_JSON_ERROR(err_out, "Could not autoselect current master for switchover. Cluster does "
"not have a master or master is in maintenance.");
return false;
}
}
else
{
// No need to check a given current master, it has already been checked.
demotion_target = *current_master;
}
if (m_master_gtid_domain == GTID_DOMAIN_UNKNOWN)
{
PRINT_MXS_JSON_ERROR(err_out, "Cluster gtid domain is unknown. Cannot switchover.");
return false;
}
ss_dassert(demotion_target && promotion_target);
// Total time limit on how long this operation may take. Checked and modified after significant steps are
// completed.
int seconds_remaining = m_switchover_timeout;
time_t start_time = time(NULL);
// Step 1: Save all slaves except promotion target to an array. If we have a
// user-defined master candidate, check it. Otherwise, autoselect.
MariaDBServer* promotion_target = NULL;
// Step 1: Save all slaves except promotion target to an array.
ServerArray redirectable_slaves;
if (*new_master == NULL)
for (MariaDBServer* redirectable : demotion_target->m_node.children)
{
// Autoselect new master.
promotion_target = select_new_master(&redirectable_slaves, err_out);
if (promotion_target)
// TODO: Again check valid replication here
if (redirectable != promotion_target && redirectable->is_replicating_from(demotion_target) &&
redirectable->uses_gtid())
{
*new_master = promotion_target;
}
else
{
PRINT_MXS_JSON_ERROR(err_out, "Could not autoselect new master for switchover.");
return false;
redirectable_slaves.push_back(redirectable);
}
}
else
{
// Check user-given new master. Some checks have already been performed but more is needed.
if (switchover_check_preferred_master(*new_master, err_out))
{
promotion_target = *new_master;
/* User-given candidate is good. Update info on all slave servers.
* The update_slave_info()-call is not strictly necessary here, but it should be ran to keep this
* path analogous with failover_select_new_master(). The later functions can then assume that
* slave server info is up to date. If the master is replicating from external master, it is
* updated by update_slave_info() but not added to array. */
for (auto iter = m_servers.begin(); iter != m_servers.end(); iter++)
{
MariaDBServer* server = *iter;
if (server != promotion_target && server->update_slave_info() && server != demotion_target)
{
redirectable_slaves.push_back(server);
}
}
}
else
{
return false;
}
}
ss_dassert(demotion_target && promotion_target);
bool rval = false;
// Step 2: Set read-only to on, flush logs, update master gtid:s
@ -1058,11 +1012,16 @@ MariaDBServer* MariaDBMonitor::select_new_master(ServerArray* slaves_out, json_t
{
// If no new master yet, accept any valid candidate. Otherwise check.
if (current_best == NULL ||
is_candidate_better(current_best, cand, m_master_gtid_domain, &current_best_reason))
is_candidate_better(cand, current_best, m_master_gtid_domain, &current_best_reason))
{
// The server has been selected for promotion, for now.
current_best = cand;
master_vector_index = slaves_out->size() - 1;
if (!current_best_reason.empty())
{
current_best_reason = string_printf("Selected '%s' because %s", current_best->name(),
current_best_reason.c_str());
}
}
}
}
@ -1088,7 +1047,7 @@ MariaDBServer* MariaDBMonitor::select_new_master(ServerArray* slaves_out, json_t
MXS_WARNING(EXCLUDED_ONLY_CAND, excluded_name);
break;
}
else if (is_candidate_better(current_best, excluded_info, m_master_gtid_domain))
else if (is_candidate_better(excluded_info, current_best, m_master_gtid_domain))
{
// Print a warning if this server is actually a better candidate than the previous best.
const char EXCLUDED_CAND[] = "Server '%s' is superior to current best candidate '%s', "
@ -1112,6 +1071,110 @@ MariaDBServer* MariaDBMonitor::select_new_master(ServerArray* slaves_out, json_t
return current_best;
}
/**
 * Autoselect the server to promote in place of the given demotion target.
 *
 * Only the servers listed as children of the demotion target are considered. A child is a candidate
 * if can_be_promoted() accepts it and it is not excluded. The first candidate is the initial choice
 * and every later candidate replaces it only if is_candidate_better() says so.
 *
 * @param demotion_target The server being replaced
 * @param err_out Error output
 * @return The selected server, or NULL if the demotion target has no children or none of them qualified
 */
MariaDBServer* MariaDBMonitor::switchover_select_promotion(MariaDBServer* demotion_target, json_t** err_out)
{
    auto& slaves = demotion_target->m_node.children;
    if (slaves.empty())
    {
        PRINT_MXS_JSON_ERROR(err_out, "'%s' does not have any slaves to promote.", demotion_target->name());
        return NULL;
    }

    MXS_NOTICE("Selecting a server to promote and replace '%s'. Candidates are: %s.",
               demotion_target->name(),
               monitored_servers_to_string(slaves).c_str());

    // Sort the children into promotable servers and servers which would be promotable if they
    // were not excluded. Explanations for every rejected server are collected, one per line.
    ServerArray eligible;
    ServerArray excluded_but_valid;
    string rejections;
    DelimitedPrinter rejection_printer("\n");
    for (MariaDBServer* slave : slaves)
    {
        string why_not;
        if (!slave->can_be_promoted(demotion_target, &why_not))
        {
            string msg = string_printf("'%s' cannot be selected because %s", slave->name(), why_not.c_str());
            rejection_printer.cat(rejections, msg);
        }
        else if (server_is_excluded(slave))
        {
            excluded_but_valid.push_back(slave);
            string msg = string_printf("'%s' cannot be selected because it is excluded.", slave->name());
            rejection_printer.cat(rejections, msg);
        }
        else
        {
            eligible.push_back(slave);
        }
    }

    MariaDBServer* best = NULL;
    string best_reason;
    if (eligible.empty())
    {
        PRINT_MXS_JSON_ERROR(err_out, "No suitable promotion candidate found:\n%s", rejections.c_str());
    }
    else
    {
        if (!rejections.empty())
        {
            MXS_WARNING("Some servers were disqualified for promotion:\n%s", rejections.c_str());
        }

        // Start from the first eligible server and let each of the remaining ones challenge it.
        auto challenger = eligible.begin();
        best = *challenger;
        for (++challenger; challenger != eligible.end(); ++challenger)
        {
            if (is_candidate_better(*challenger, best, m_master_gtid_domain, &best_reason))
            {
                best = *challenger;
            }
        }
    }

    // Warn if an excluded server would have been a better choice than the selected one, or the only
    // viable choice. At most one such warning is printed.
    for (MariaDBServer* shut_out : excluded_but_valid)
    {
        const char* shut_out_name = shut_out->name();
        if (best == NULL)
        {
            const char EXCLUDED_ONLY_CAND[] = "Server '%s' is a viable choice for new master, "
                                              "but cannot be selected as it's excluded.";
            MXS_WARNING(EXCLUDED_ONLY_CAND, shut_out_name);
            break;
        }

        if (is_candidate_better(shut_out, best, m_master_gtid_domain))
        {
            const char EXCLUDED_CAND[] = "Server '%s' is superior to current best candidate '%s', "
                                         "but cannot be selected as it's excluded. This may lead to "
                                         "loss of data if '%s' is ahead of other servers.";
            MXS_WARNING(EXCLUDED_CAND, shut_out_name, best->name(), shut_out_name);
            break;
        }
    }

    if (best)
    {
        // The selection is always logged. The reason is appended only when a comparison produced one.
        string msg = string_printf("Selected '%s'", best->name());
        if (best_reason.empty())
        {
            msg += ".";
        }
        else
        {
            msg += " because ";
            msg += best_reason;
        }
        MXS_NOTICE("%s", msg.c_str());
    }
    return best;
}
/**
* Is the server in the excluded list
*
@ -1133,17 +1196,16 @@ bool MariaDBMonitor::server_is_excluded(const MariaDBServer* server)
/**
* Is the candidate a better choice for master than the previous best?
*
* @param current_best_info Server info of current best choice
* @param candidate_info Server info of new candidate
* @param current_best_info Server info of current best choice
* @param gtid_domain Which domain to compare
* @param reason_out Why is the candidate better than current_best
* @return True if candidate is better
*/
bool MariaDBMonitor::is_candidate_better(const MariaDBServer* current_best, const MariaDBServer* candidate,
bool MariaDBMonitor::is_candidate_better(const MariaDBServer* candidate, const MariaDBServer* current_best,
uint32_t gtid_domain, std::string* reason_out)
{
string reason = string("'") + candidate->name() + "' is the best candidate because it ";
string reason;
bool is_better = false;
uint64_t cand_io = candidate->m_slave_status[0].gtid_io_pos.get_gtid(gtid_domain).m_sequence;
uint64_t curr_io = current_best->m_slave_status[0].gtid_io_pos.get_gtid(gtid_domain).m_sequence;
@ -1151,7 +1213,7 @@ bool MariaDBMonitor::is_candidate_better(const MariaDBServer* current_best, cons
if (cand_io > curr_io)
{
is_better = true;
reason += "has received more events.";
reason = "it has received more events.";
}
// If io sequences are identical ...
else if (cand_io == curr_io)
@ -1162,7 +1224,7 @@ bool MariaDBMonitor::is_candidate_better(const MariaDBServer* current_best, cons
if (cand_processed > curr_processed)
{
is_better = true;
reason += "has processed more events.";
reason = "it has processed more events.";
}
// If gtid positions are identical ...
else if (cand_processed == curr_processed)
@ -1173,7 +1235,7 @@ bool MariaDBMonitor::is_candidate_better(const MariaDBServer* current_best, cons
if (cand_updates && !curr_updates)
{
is_better = true;
reason += "has 'log_slave_updates' on.";
reason = "it has 'log_slave_updates' on.";
}
// If both have log_slave_updates on ...
else if (cand_updates && curr_updates)
@ -1184,7 +1246,7 @@ bool MariaDBMonitor::is_candidate_better(const MariaDBServer* current_best, cons
if (cand_disk_ok && !curr_disk_ok)
{
is_better = true;
reason += "is not low on disk space.";
reason = "it is not low on disk space.";
}
}
}
@ -1197,77 +1259,6 @@ bool MariaDBMonitor::is_candidate_better(const MariaDBServer* current_best, cons
return is_better;
}
/**
* Check that the given server is a master and it's the only master.
*
* @param suggested_curr_master The server to check, given by user.
* @param error_out On output, error object if function failed.
* @return True if current master seems ok. False, if there is some error with the
* specified current master.
*/
bool MariaDBMonitor::switchover_check_current(const MariaDBServer* suggested_curr_master,
json_t** error_out) const
{
ss_dassert(suggested_curr_master);
bool server_is_master = false;
MariaDBServer* extra_master = NULL; // A master server which is not the suggested one
for (auto iter = m_servers.begin(); iter != m_servers.end() && extra_master == NULL; iter++)
{
MariaDBServer* server = *iter;
if (server->is_master())
{
if (server == suggested_curr_master)
{
server_is_master = true;
}
else
{
extra_master = server;
}
}
}
if (!server_is_master)
{
PRINT_MXS_JSON_ERROR(error_out, "Server '%s' is not the current master or it's in maintenance.",
suggested_curr_master->name());
}
else if (extra_master)
{
PRINT_MXS_JSON_ERROR(error_out, "Cluster has an additional master server '%s'.",
extra_master->name());
}
return server_is_master && !extra_master;
}
/**
* Check whether specified new master is acceptable.
*
* @param monitored_server The server to check against.
* @param error On output, error object if function failed.
*
* @return True, if suggested new master is a viable promotion candidate.
*/
bool MariaDBMonitor::switchover_check_new(const MariaDBServer* new_master_cand, json_t** error)
{
bool is_master = new_master_cand->is_master();
bool is_slave = new_master_cand->is_slave();
if (is_master)
{
const char IS_MASTER[] = "Specified new master '%s' is already the current master.";
PRINT_MXS_JSON_ERROR(error, IS_MASTER, new_master_cand->name());
}
else if (!is_slave)
{
const char NOT_SLAVE[] = "Specified new master '%s' is not a slave.";
PRINT_MXS_JSON_ERROR(error, NOT_SLAVE, new_master_cand->name());
}
return !is_master && is_slave;
}
/**
* Check that preconditions for a failover are met.
*
@ -1345,9 +1336,6 @@ bool MariaDBMonitor::failover_check(string* error_out)
*/
void MariaDBMonitor::handle_auto_failover()
{
const char RE_ENABLE_FMT[] = "To re-enable automatic failover, manually set '%s' to 'true' for monitor "
"'%s' via MaxAdmin or the REST API, or restart MaxScale.";
string cluster_issues;
// TODO: Only check this when topology has changed.
if (!cluster_supports_failover(&cluster_issues))
@ -1357,8 +1345,9 @@ void MariaDBMonitor::handle_auto_failover()
"%s\n\n"
"Automatic failover has been disabled. It should only be enabled after the "
"above issues have been resolved.";
string problems = string_printf(PROBLEMS, cluster_issues.c_str());
string total_msg = problems + " " + string_printf(RE_ENABLE_FMT, CN_AUTO_FAILOVER, m_monitor->name);
string p1 = string_printf(PROBLEMS, cluster_issues.c_str());
string p2 = string_printf(RE_ENABLE_FMT, "failover", CN_AUTO_FAILOVER, m_monitor->name);
string total_msg = p1 + " " + p2;
MXS_ERROR("%s", total_msg.c_str());
m_auto_failover = false;
disable_setting(CN_AUTO_FAILOVER);
@ -1426,12 +1415,7 @@ void MariaDBMonitor::handle_auto_failover()
failed_master->m_server_base->new_event = false;
if (!do_failover(NULL))
{
const string FAILED = "Automatic failover failed, disabling automatic failover.";
string error_msg = FAILED + " " +
string_printf(RE_ENABLE_FMT, CN_AUTO_FAILOVER, m_monitor->name);
MXS_ERROR("%s", error_msg.c_str());
m_auto_failover = false;
disable_setting(CN_AUTO_FAILOVER);
report_and_disable("failover", CN_AUTO_FAILOVER, &m_auto_failover);
}
}
else
@ -1441,7 +1425,7 @@ void MariaDBMonitor::handle_auto_failover()
if (m_warn_failover_precond)
{
MXS_WARNING("Not performing automatic failover. Will keep retrying with this message "
"suppressed. Errors: \n%s", error_msg.c_str());
"suppressed. Errors: \n%s", error_msg.c_str());
m_warn_failover_precond = false;
}
}
@ -1570,71 +1554,130 @@ static void print_redirect_errors(MariaDBServer* first_server, const ServerArray
/**
* Check cluster and parameters for suitability to switchover. Also writes found servers to output pointers.
* If a server parameter is NULL, the corresponding output parameter is not written to.
*
* @param new_master New master requested by the user. Can be null for autoselect.
* @param current_master Current master given by the user. Can be null for autoselect.
* @param new_master_out Where to write found new master.
* @param current_master_out Where to write found current master.
* @param error_out Error output, can be null.
* @return True if cluster is suitable and server parameters were valid and found.
* @param promotion_server The server which should be promoted. If null, monitor will autoselect.
* @param demotion_server The server which should be demoted. Can be null for autoselect.
* @param promotion_target_out Output for promotion target
* @param demotion_target_out Output for demotion target
* @param error_out Error output
* @return True if cluster is suitable and server parameters were valid
*/
bool MariaDBMonitor::switchover_check(SERVER* new_master, SERVER* current_master,
MariaDBServer** new_master_out, MariaDBServer** current_master_out,
json_t** error_out)
bool MariaDBMonitor::switchover_prepare(SERVER* promotion_server, SERVER* demotion_server,
MariaDBServer** promotion_target_out,
MariaDBServer** demotion_target_out,
json_t** error_out)
{
bool new_master_ok = true, current_master_ok = true;
// Check that both servers are ok if specified, or autoselect them. Demotion target must be checked
// first since the promotion target depends on it.
ss_dassert(promotion_target_out && demotion_target_out &&
!*promotion_target_out && !*demotion_target_out);
const char NO_SERVER[] = "Server '%s' is not a member of monitor '%s'.";
// Check that both servers are ok if specified. Null is a valid value.
if (new_master)
MariaDBServer* demotion_target = NULL;
string demotion_msg;
if (demotion_server)
{
auto mon_new_master = mon_get_monitored_server(m_monitor, new_master);
if (mon_new_master == NULL)
// Manual select.
MariaDBServer* demotion_candidate = get_server(demotion_server);
if (demotion_candidate == NULL)
{
new_master_ok = false;
PRINT_MXS_JSON_ERROR(error_out, NO_SERVER, new_master->name, m_monitor->name);
PRINT_MXS_JSON_ERROR(error_out, NO_SERVER, demotion_server->name, m_monitor->name);
}
else if (!demotion_candidate->can_be_demoted(&demotion_msg))
{
PRINT_MXS_JSON_ERROR(error_out, "'%s' is not a valid demotion target for switchover: %s",
demotion_candidate->name(), demotion_msg.c_str());
}
else
{
MariaDBServer* found_new_master = get_server_info(mon_new_master);
if (switchover_check_new(found_new_master, error_out))
demotion_target = demotion_candidate;
}
}
else
{
// Autoselect current master as demotion target.
if (m_master == NULL)
{
const char msg[] = "Can not autoselect a demotion target for switchover: cluster does "
"not have a master.";
PRINT_MXS_JSON_ERROR(error_out, msg);
}
else if (!m_master->can_be_demoted(&demotion_msg))
{
const char msg[] = "Can not autoselect '%s' as a demotion target for switchover: %s";
PRINT_MXS_JSON_ERROR(error_out, msg, m_master->name(), demotion_msg.c_str());
}
else
{
demotion_target = m_master;
}
}
MariaDBServer* promotion_target = NULL;
if (demotion_target)
{
string promotion_msg;
if (promotion_server)
{
// Manual select.
MariaDBServer* promotion_candidate = get_server(promotion_server);
if (promotion_candidate == NULL)
{
*new_master_out = found_new_master;
PRINT_MXS_JSON_ERROR(error_out, NO_SERVER, promotion_server->name, m_monitor->name);
}
else if (!promotion_candidate->can_be_promoted(demotion_target, &promotion_msg))
{
const char msg[] = "'%s' is not a valid promotion target for switchover: %s";
PRINT_MXS_JSON_ERROR(error_out, msg, promotion_candidate->name(), promotion_msg.c_str());
}
else
{
new_master_ok = false;
promotion_target = promotion_candidate;
}
}
else
{
// Autoselect. More involved than autoselecting the demotion target.
MariaDBServer* promotion_candidate = switchover_select_promotion(demotion_target, error_out);
if (promotion_candidate)
{
promotion_target = promotion_candidate;
}
else
{
PRINT_MXS_JSON_ERROR(error_out, "Could not autoselect promotion target for switchover.");
}
}
}
if (current_master)
bool gtid_domain_ok = false;
if (m_master_gtid_domain == GTID_DOMAIN_UNKNOWN)
{
auto mon_curr_master = mon_get_monitored_server(m_monitor, current_master);
if (mon_curr_master == NULL)
{
current_master_ok = false;
PRINT_MXS_JSON_ERROR(error_out, NO_SERVER, current_master->name, m_monitor->name);
}
else
{
MariaDBServer* found_curr_master = get_server_info(mon_curr_master);
if (switchover_check_current(found_curr_master, error_out))
{
*current_master_out = get_server_info(mon_curr_master);
}
else
{
current_master_ok = false;
}
}
PRINT_MXS_JSON_ERROR(error_out, "Cluster gtid domain is unknown. Cannot switchover.");
}
else
{
gtid_domain_ok = true;
}
// Check that all slaves are using gtid-replication.
bool gtid_ok = true;
for (auto iter = m_servers.begin(); iter != m_servers.end(); iter++)
bool gtid_ok = slaves_using_gtid(error_out);
if (demotion_target && promotion_target && gtid_domain_ok && gtid_ok)
{
*demotion_target_out = demotion_target;
*promotion_target_out = promotion_target;
return true;
}
return false;
}
bool MariaDBMonitor::slaves_using_gtid(json_t** error_out)
{
// Check that all slaves are using gtid-replication.
bool gtid_ok = true;
for (MariaDBServer* server : m_servers)
{
MariaDBServer* server = *iter;
string gtid_error;
if (server->is_slave() && !server->uses_gtid(&gtid_error))
{
@ -1642,8 +1685,7 @@ bool MariaDBMonitor::switchover_check(SERVER* new_master, SERVER* current_master
PRINT_MXS_JSON_ERROR(error_out, "%s", gtid_error.c_str());
}
}
return new_master_ok && current_master_ok && gtid_ok;
return gtid_ok;
}
void MariaDBMonitor::enforce_read_only_on_slaves()
@ -1677,6 +1719,68 @@ void MariaDBMonitor::set_low_disk_slaves_maintenance()
!server->is_master() && !server->is_relay_master())
{
server->set_status(SERVER_MAINT);
m_cluster_modified = true;
}
}
}
void MariaDBMonitor::handle_low_disk_space_master()
{
if (m_master && m_master->is_master() && m_master->is_low_on_disk_space())
{
if (m_warn_switchover_precond)
{
MXS_WARNING("Master server '%s' is low on disk space. Attempting to switch it with a slave.",
m_master->name());
}
// Looks like the master should be swapped out. Before trying it, check if there is even
// a likely valid slave to swap to.
MariaDBServer* demotion_target = NULL;
MariaDBServer* promotion_target = NULL;
auto ok_to_switch = switchover_prepare(NULL, m_master->m_server_base->server,
&promotion_target, &demotion_target, NULL);
if (ok_to_switch)
{
m_warn_switchover_precond = true;
bool switched = do_switchover(demotion_target, promotion_target, NULL);
if (switched)
{
MXS_NOTICE("Switchover %s -> %s performed.", demotion_target->name(), promotion_target->name());
}
else
{
report_and_disable("switchover", CN_SWITCHOVER_ON_LOW_DISK_SPACE,
&m_switchover_on_low_disk_space);
}
}
else
{
// Switchover was not attempted because of errors, however these errors are not permanent.
// Servers were not modified, so it's ok to try this again.
if (m_warn_switchover_precond)
{
MXS_WARNING("Not performing automatic switchover. Will keep retrying with this message "
"suppressed.");
m_warn_switchover_precond = false;
}
}
}
else
{
m_warn_switchover_precond = true;
}
}
void MariaDBMonitor::report_and_disable(const string& operation, const string& setting_name,
bool* setting_var)
{
string p1 = string_printf("Automatic %s failed, disabling automatic %s.", operation.c_str(),
operation.c_str());
string p2 = string_printf(RE_ENABLE_FMT, operation.c_str(), setting_name.c_str(), m_monitor->name);
string error_msg = p1 + " " + p2;
MXS_ERROR("%s", error_msg.c_str());
*setting_var = false;
disable_setting(setting_name.c_str());
}

View File

@ -31,9 +31,11 @@
#include "../../../core/internal/monitor.h"
using std::string;
using maxscale::string_printf;
// Config parameter names
const char * const CN_AUTO_FAILOVER = "auto_failover";
const char * const CN_SWITCHOVER_ON_LOW_DISK_SPACE = "switchover_on_low_disk_space";
const char * const CN_PROMOTION_SQL_FILE = "promotion_sql_file";
const char * const CN_DEMOTION_SQL_FILE = "demotion_sql_file";
@ -42,7 +44,6 @@ static const char CN_FAILCOUNT[] = "failcount";
static const char CN_ENFORCE_READONLY[] = "enforce_read_only_slaves";
static const char CN_NO_PROMOTE_SERVERS[] = "servers_no_promotion";
static const char CN_FAILOVER_TIMEOUT[] = "failover_timeout";
static const char CN_SWITCHOVER_ON_LOW_DISK_SPACE[] = "switchover_on_low_disk_space";
static const char CN_SWITCHOVER_TIMEOUT[] = "switchover_timeout";
static const char CN_DETECT_STANDALONE_MASTER[] = "detect_standalone_master";
static const char CN_MAINTENANCE_ON_LOW_DISK_SPACE[] = "maintenance_on_low_disk_space";
@ -63,9 +64,9 @@ MariaDBMonitor::MariaDBMonitor(MXS_MONITOR* monitor)
, m_external_master_port(PORT_UNKNOWN)
, m_cluster_topology_changed(true)
, m_cluster_modified(false)
, m_switchover_on_low_disk_space(false)
, m_log_no_master(true)
, m_warn_failover_precond(true)
, m_warn_switchover_precond(true)
, m_warn_cannot_rejoin(true)
, m_warn_current_master_invalid(true)
, m_warn_have_better_master(true)
@ -143,6 +144,23 @@ MariaDBServer* MariaDBMonitor::get_server(int64_t id)
return (found != m_servers_by_id.end()) ? (*found).second : NULL;
}
/**
 * Find the MariaDBServer which corresponds to the given SERVER.
 *
 * @param server Which server to search for
 * @return The matching MariaDBServer, or NULL if mon_get_monitored_server() does not find the server
 */
MariaDBServer* MariaDBMonitor::get_server(SERVER* server)
{
    auto mon_server = mon_get_monitored_server(m_monitor, server);
    return mon_server ? get_server_info(mon_server) : NULL;
}
bool MariaDBMonitor::set_replication_credentials(const MXS_CONFIG_PARAMETER* params)
{
bool rval = false;
@ -486,11 +504,6 @@ void MariaDBMonitor::tick()
measure_replication_lag();
}
if (m_maintenance_on_low_disk_space)
{
set_low_disk_slaves_maintenance();
}
// Update shared status. The next functions read the shared status. TODO: change the following
// functions to read "pending_status" instead.
for (auto mon_srv = m_monitor->monitored_servers; mon_srv; mon_srv = mon_srv->next)
@ -555,6 +568,19 @@ void MariaDBMonitor::process_state_changes()
{
enforce_read_only_on_slaves();
}
/* Set low disk space slaves to maintenance.
*/
if (m_maintenance_on_low_disk_space && !m_cluster_modified)
{
set_low_disk_slaves_maintenance();
}
/* Check if the master server is on low disk space and act on it. */
if (m_switchover_on_low_disk_space && !m_cluster_modified)
{
handle_low_disk_space_master();
}
}
}
@ -1037,12 +1063,13 @@ bool MariaDBMonitor::execute_manual_command(std::function<void (void)> command,
return rval;
}
bool MariaDBMonitor::run_manual_switchover(SERVER* new_master, SERVER* current_master, json_t** error_out)
bool MariaDBMonitor::run_manual_switchover(SERVER* promotion_server, SERVER* demotion_server,
json_t** error_out)
{
bool rval = false;
bool send_ok = execute_manual_command([this, &rval, new_master, current_master, error_out]()
bool send_ok = execute_manual_command([this, &rval, promotion_server, demotion_server, error_out]()
{
rval = manual_switchover(new_master, current_master, error_out);
rval = manual_switchover(promotion_server, demotion_server, error_out);
}, error_out);
return send_ok && rval;
}
@ -1092,9 +1119,9 @@ bool handle_manual_switchover(const MODULECMD_ARG* args, json_t** error_out)
{
MXS_MONITOR* mon = args->argv[0].value.monitor;
auto handle = static_cast<MariaDBMonitor*>(mon->instance);
SERVER* new_master = (args->argc >= 2) ? args->argv[1].value.server : NULL;
SERVER* current_master = (args->argc == 3) ? args->argv[2].value.server : NULL;
rval = handle->run_manual_switchover(new_master, current_master, error_out);
SERVER* promotion_server = (args->argc >= 2) ? args->argv[1].value.server : NULL;
SERVER* demotion_server = (args->argc == 3) ? args->argv[2].value.server : NULL;
rval = handle->run_manual_switchover(promotion_server, demotion_server, error_out);
}
return rval;
}

View File

@ -25,6 +25,7 @@
#include "mariadbserver.hh"
extern const char * const CN_AUTO_FAILOVER;
extern const char * const CN_SWITCHOVER_ON_LOW_DISK_SPACE;
extern const char * const CN_PROMOTION_SQL_FILE;
extern const char * const CN_DEMOTION_SQL_FILE;
@ -177,6 +178,7 @@ private:
bool m_warn_no_valid_outside_cycle; /**< Log a warning when a replication topology has no valid master
* outside of a cycle. */
bool m_warn_failover_precond; /**< Print failover preconditions error message? */
bool m_warn_switchover_precond; /**< Print switchover preconditions error message? */
bool m_warn_cannot_rejoin; /**< Print warning if auto_rejoin fails because of invalid gtid:s? */
bool m_warn_current_master_invalid; /**< Print warning if current master is not valid? */
bool m_warn_have_better_master; /**< Print warning if the current master is not the best one? */
@ -190,6 +192,7 @@ private:
bool set_replication_credentials(const MXS_CONFIG_PARAMETER* params);
MariaDBServer* get_server_info(MXS_MONITORED_SERVER* db);
MariaDBServer* get_server(int64_t id);
MariaDBServer* get_server(SERVER* server);
bool execute_manual_command(GenericFunction command, json_t** error_out);
std::string diagnostics_to_string() const;
json_t* diagnostics_to_json() const;
@ -218,21 +221,21 @@ private:
void update_master_cycle_info();
void set_low_disk_slaves_maintenance();
void assign_new_master(MariaDBServer* new_master);
bool slaves_using_gtid(json_t** error_out);
// Switchover methods
bool manual_switchover(SERVER* new_master, SERVER* current_master, json_t** error_out);
bool switchover_check(SERVER* new_master, SERVER* current_master,
bool switchover_prepare(SERVER* new_master, SERVER* current_master,
MariaDBServer** new_master_out, MariaDBServer** current_master_out,
json_t** error_out);
bool switchover_check_new(const MariaDBServer* new_master_cand, json_t** error);
bool switchover_check_current(const MariaDBServer* suggested_curr_master, json_t** error_out) const;
bool do_switchover(MariaDBServer** current_master, MariaDBServer** new_master, json_t** err_out);
bool do_switchover(MariaDBServer* demotion_target, MariaDBServer* promotion_target, json_t** error_out);
bool switchover_check_preferred_master(MariaDBServer* preferred, json_t** err_out);
bool switchover_demote_master(MariaDBServer* current_master,
json_t** err_out);
bool switchover_wait_slaves_catchup(const ServerArray& slaves, const GtidList& gtid, int total_timeout,
json_t** err_out);
bool switchover_start_slave(MariaDBServer* old_master, MariaDBServer* new_master);
void handle_low_disk_space_master();
// Failover methods
bool manual_failover(json_t** output);
@ -252,8 +255,9 @@ private:
// Methods common to failover/switchover/rejoin
MariaDBServer* select_new_master(ServerArray* slaves_out, json_t** err_out);
MariaDBServer* switchover_select_promotion(MariaDBServer* current_master, json_t** err_out);
bool server_is_excluded(const MariaDBServer* server);
bool is_candidate_better(const MariaDBServer* current_best, const MariaDBServer* candidate,
bool is_candidate_better(const MariaDBServer* candidate, const MariaDBServer* current_best,
uint32_t gtid_domain, std::string* reason_out = NULL);
bool promote_new_master(MariaDBServer* new_master, json_t** err_out);
int redirect_slaves(MariaDBServer* new_master, const ServerArray& slaves,
@ -262,6 +266,7 @@ private:
bool start_external_replication(MariaDBServer* new_master, json_t** err_out);
bool wait_cluster_stabilization(MariaDBServer* new_master, const ServerArray& slaves,
int seconds_remaining);
void report_and_disable(const std::string& operation, const std::string& setting_name, bool* setting_var);
// Other methods
void disable_setting(const std::string& setting);

View File

@ -19,3 +19,15 @@ const int64_t SERVER_ID_UNKNOWN = -1;
const int64_t GTID_DOMAIN_UNKNOWN = -1;
/** Default port */
const int PORT_UNKNOWN = 0;
using std::string;
/**
 * Construct a printer which places the given separator between concatenated items.
 * The separator in effect starts out empty, so the first item added is not preceded by anything.
 *
 * @param separator Text to insert between consecutive additions
 */
DelimitedPrinter::DelimitedPrinter(const string& separator)
    : m_separator(separator)
    , m_current_separator()
{
}
/**
 * Append an item to the target string, preceded by the separator currently in effect.
 * After the call, the configured separator is in effect for any following additions.
 *
 * @param target String to modify
 * @param addition Text to append after the separator
 */
void DelimitedPrinter::cat(string& target, const string& addition)
{
    // Build the piece first so the result is unaffected even if 'target' and 'addition' alias.
    const string piece = m_current_separator + addition;
    target.append(piece);
    m_current_separator = m_separator;
}

View File

@ -20,6 +20,8 @@
#define MXS_MODULE_NAME "mariadbmon"
#include <maxscale/cppdefs.hh>
#include <string>
#include <maxscale/json_api.h>
/** Utility macro for printing both MXS_ERROR and json error */
@ -35,3 +37,25 @@
extern const int64_t SERVER_ID_UNKNOWN;
extern const int64_t GTID_DOMAIN_UNKNOWN;
extern const int PORT_UNKNOWN;
/**
 * Helper class for concatenating strings with a delimiter. The object cannot be default-constructed,
 * copied or assigned.
 */
class DelimitedPrinter
{
public:
    /**
     * Create a printer.
     *
     * @param separator The delimiter placed between added strings
     */
    DelimitedPrinter(const std::string& separator);

    /**
     * Add to string.
     *
     * @param target String to modify
     * @param addition String to add. The delimiter is printed before the addition.
     */
    void cat(std::string& target, const std::string& addition);

private:
    const std::string m_separator;         /**< The configured delimiter */
    std::string       m_current_separator; /**< The delimiter printed before the next addition */

    // Construction without a separator, copying and assignment are all disabled.
    DelimitedPrinter() = delete;
    DelimitedPrinter(const DelimitedPrinter&) = delete;
    DelimitedPrinter& operator = (const DelimitedPrinter&) = delete;
};

View File

@ -276,11 +276,12 @@ bool MariaDBServer::update_gtids(string* errmsg_out)
return rval;
}
bool MariaDBServer::update_replication_settings()
bool MariaDBServer::update_replication_settings(std::string* error_out)
{
static const string query = "SELECT @@gtid_strict_mode, @@log_bin, @@log_slave_updates;";
string query_error;
bool rval = false;
auto result = execute_query(query);
auto result = execute_query(query, &query_error);
if (result.get() != NULL && result->next_row())
{
m_rpl_settings.gtid_strict_mode = result->get_bool(0);
@ -288,6 +289,10 @@ bool MariaDBServer::update_replication_settings()
m_rpl_settings.log_slave_updates = result->get_bool(2);
rval = true;
}
else if (error_out)
{
*error_out = query_error;
}
return rval;
}
@ -349,7 +354,7 @@ bool MariaDBServer::read_server_variables(string* errmsg_out)
return rval;
}
bool MariaDBServer::check_replication_settings(print_repl_warnings_t print_warnings)
bool MariaDBServer::check_replication_settings(print_repl_warnings_t print_warnings) const
{
bool rval = true;
const char* servername = name();
@ -433,6 +438,11 @@ bool MariaDBServer::wait_until_gtid(const GtidList& target, int timeout, json_t*
return gtid_reached;
}
/**
 * Report the stored 'log_bin' replication setting. This only reads the value saved in
 * 'm_rpl_settings'; no query is sent to the server here.
 *
 * @return The saved 'log_bin' value
 */
bool MariaDBServer::binlog_on() const
{
    const bool binary_log_enabled = m_rpl_settings.log_bin;
    return binary_log_enabled;
}
bool MariaDBServer::is_master() const
{
return status_is_master(m_server_base->pending_status);
@ -468,6 +478,11 @@ bool MariaDBServer::is_relay_master() const
return status_is_relay(m_server_base->pending_status);
}
/**
 * Check whether the disk-space-exhausted bit is set in the pending status of the server.
 *
 * @return True if the pending status marks the server as having exhausted its disk space
 */
bool MariaDBServer::is_low_on_disk_space() const
{
    const auto pending = m_server_base->pending_status;
    return status_is_disk_space_exhausted(pending);
}
bool MariaDBServer::has_status(uint64_t bits) const
{
return (m_server_base->pending_status & bits) == bits;
@ -945,6 +960,96 @@ bool MariaDBServer::sstatus_arrays_topology_equal(const SlaveStatusArray& lhs, c
return rval;
}
/**
 * Check if the server can be demoted. The checks are performed in order and the first failing one
 * determines the reported reason.
 *
 * @param reason_out Output for the reason the server cannot be demoted. Written only when the server
 * is not demotable. Can be null.
 * @return True, if this server is a viable demotion target
 */
bool MariaDBServer::can_be_demoted(string* reason_out)
{
    // Stores the explanation (if the caller wants one) and yields the failure result.
    auto reject = [reason_out](const string& why) -> bool
    {
        if (reason_out)
        {
            *reason_out = why;
        }
        return false;
    };

    // TODO: Add relay server support
    if (!is_master())
    {
        return reject("it is not the current master or it is in maintenance.");
    }

    // The replication settings must be refreshed before the binary log setting is inspected.
    string query_error;
    if (!update_replication_settings(&query_error))
    {
        return reject(string_printf("it could not be queried: '%s'.", query_error.c_str()));
    }

    if (!binlog_on())
    {
        return reject("its binary log is disabled.");
    }

    if (m_gtid_binlog_pos.empty())
    {
        return reject("it does not have a 'gtid_binlog_pos'.");
    }

    return true;
}
/**
 * Check if the server can be promoted. The checks are performed in order and the first failing one
 * determines the reported reason.
 *
 * @param demotion_target The server which would be demoted, i.e. the one this server would replace
 * @param reason_out Output for the reason the server cannot be promoted. Written only when the server
 * is not promotable. Can be null.
 * @return True, if this server is a viable promotion candidate
 */
bool MariaDBServer::can_be_promoted(const MariaDBServer* demotion_target, std::string* reason_out)
{
    bool promotable = false;
    string reason;
    string query_error;

    if (is_master())
    {
        reason = "it is already the master.";
    }
    // TODO: Check that the correct slave connection is working properly in case of switchover.
    // For failover the connection may be in CONNECTING-stage.
    else if (!is_replicating_from(demotion_target))
    {
        reason = string_printf("it is not replicating from '%s'.", demotion_target->name());
    }
    // The replication settings must be refreshed before the binary log setting is inspected.
    else if (!update_replication_settings(&query_error))
    {
        // Keep the formatted message so that the caller learns why the server was rejected.
        reason = string_printf("it could not be queried: '%s'.", query_error.c_str());
    }
    else if (!binlog_on())
    {
        reason = "its binary log is disabled.";
    }
    else
    {
        promotable = true;
    }

    if (!promotable && reason_out)
    {
        *reason_out = reason;
    }
    return promotable;
}
/**
 * Is the server replicating from the target server?
 *
 * @param target Immediate master or relay server. Currently not inspected.
 * @return True if this server is a slave
 */
bool MariaDBServer::is_replicating_from(const MariaDBServer* target)
{
    // Not properly implemented yet, TODO
    // For now the answer depends only on whether this server is a slave.
    const bool replicating = is_slave();
    return replicating;
}
string SlaveStatus::to_string() const
{
using std::setw;

View File

@ -149,7 +149,9 @@ public:
* read-only. */
NodeData m_node; /**< Replication topology data */
SlaveStatusArray m_slave_status; /**< Data returned from SHOW SLAVE STATUS */
ReplicationSettings m_rpl_settings; /**< Miscellaneous replication related settings */
ReplicationSettings m_rpl_settings; /**< Miscellaneous replication related settings. These are not
* normally queried from the server, call
* 'update_replication_settings' before use. */
bool m_print_update_errormsg;/**< Should an update error be printed. */
MariaDBServer(MXS_MONITORED_SERVER* monitored_server, int config_index);
@ -196,9 +198,10 @@ public:
/**
* Query a few miscellaneous replication settings.
*
* @param error_out Query error output
* @return True on success
*/
bool update_replication_settings();
bool update_replication_settings(std::string* error_out = NULL);
/**
* Query and save server_id, read_only and (if 10.X) gtid_domain_id.
@ -214,7 +217,7 @@ public:
* @param print_on Print warnings or not
* @return True if log_bin is on
*/
bool check_replication_settings(print_repl_warnings_t print_warnings = WARNINGS_ON);
bool check_replication_settings(print_repl_warnings_t print_warnings = WARNINGS_ON) const;
/**
* Wait until server catches up to the target gtid. Only considers gtid domains common to this server
@ -228,6 +231,21 @@ public:
*/
bool wait_until_gtid(const GtidList& target, int timeout, json_t** err_out);
/**
* Is the server replicating (or trying to) from the target server.
*
* @param target Immediate master or relay server
* @return True if replicating
*/
bool is_replicating_from(const MariaDBServer* target);
/**
* Is binary log on? 'update_replication_settings' should be run before this function to query the data.
*
* @return True if server has binary log enabled
*/
bool binlog_on() const;
/**
* Check if server is a running master.
*
@ -273,6 +291,11 @@ public:
*/
bool is_relay_master() const;
/**
* Is the server low on disk space?
*/
bool is_low_on_disk_space() const;
/**
* Check if server has the given bits on in 'pending_status'.
*
@ -370,6 +393,23 @@ public:
*/
bool failover_wait_relay_log(int seconds_remaining, json_t** err_out);
/**
* Is the server a valid demotion target?
*
* @param reason_out Output explaining why server cannot be demoted
* @return True if server can be demoted by switchover
*/
bool can_be_demoted(std::string* reason_out);
/**
* Is the server a valid promotion target?
*
* @param demotion_target Which server would be demoted
* @param reason_out Output explaining why server cannot be promoted
* @return True if server can be promoted by switchover
*/
bool can_be_promoted(const MariaDBServer* demotion_target, std::string* reason_out);
/**
* Read the file contents and send them as sql queries to the server. Any data returned by the queries is
* discarded.