MXS-1944 Store switchover parameters in an object

This commit is contained in:
Esa Korhonen
2018-09-24 15:53:34 +03:00
parent c20a17238b
commit bfb1c3f1b3
2 changed files with 49 additions and 85 deletions

View File

@ -30,6 +30,8 @@ static const char RE_ENABLE_FMT[] = "To re-enable automatic %s, manually set '%s
const char NO_SERVER[] = "Server '%s' is not monitored by '%s'."; const char NO_SERVER[] = "Server '%s' is not monitored by '%s'.";
const char FAILOVER_OK[] = "Failover '%s' -> '%s' performed."; const char FAILOVER_OK[] = "Failover '%s' -> '%s' performed.";
const char FAILOVER_FAIL[] = "Failover '%s' -> '%s' failed."; const char FAILOVER_FAIL[] = "Failover '%s' -> '%s' failed.";
const char SWITCHOVER_OK[] = "Switchover '%s' -> '%s' performed.";
const char SWITCHOVER_FAIL[] = "Switchover %s -> %s failed";
static void print_redirect_errors(MariaDBServer* first_server, static void print_redirect_errors(MariaDBServer* first_server,
const ServerArray& servers, const ServerArray& servers,
@ -52,29 +54,18 @@ bool MariaDBMonitor::manual_switchover(SERVER* promotion_server, SERVER* demotio
* so server states can be assumed to be up-to-date. * so server states can be assumed to be up-to-date.
*/ */
bool switchover_done = false; bool switchover_done = false;
MariaDBServer* promotion_target = NULL; auto op = switchover_prepare(promotion_server, demotion_server, Log::ON, error_out);
MariaDBServer* demotion_target = NULL; if (op)
auto ok_to_switch = switchover_prepare(promotion_server,
demotion_server,
Log::ON,
&promotion_target,
&demotion_target,
error_out);
if (ok_to_switch)
{ {
switchover_done = switchover_perform(promotion_target, demotion_target, error_out); switchover_done = switchover_perform(*op);
if (switchover_done) if (switchover_done)
{ {
MXS_NOTICE("Switchover '%s' -> '%s' performed.", MXS_NOTICE(SWITCHOVER_OK, op->demotion_target->name(), op->promotion_target->name());
demotion_target->name(),
promotion_target->name());
} }
else else
{ {
string msg = string_printf("Switchover %s -> %s failed", string msg = string_printf(SWITCHOVER_FAIL,
demotion_target->name(), op->demotion_target->name(), op->promotion_target->name());
promotion_target->name());
bool failover_setting = config_get_bool(m_monitor->parameters, CN_AUTO_FAILOVER); bool failover_setting = config_get_bool(m_monitor->parameters, CN_AUTO_FAILOVER);
if (failover_setting) if (failover_setting)
{ {
@ -684,21 +675,17 @@ bool MariaDBMonitor::server_is_rejoin_suspect(MariaDBServer* rejoin_cand, json_t
* intermediate step fails, the cluster may be left without a master and manual intervention is * intermediate step fails, the cluster may be left without a master and manual intervention is
* required to fix things. * required to fix things.
* *
* @param promotion_target Server to promote * @param op Operation descriptor
* @param demotion_target Server to demote
* @param error_out Error output. Can be NULL.
* @return True if successful. If false, replication may be broken. * @return True if successful. If false, replication may be broken.
*/ */
bool MariaDBMonitor::switchover_perform(MariaDBServer* promotion_target, bool MariaDBMonitor::switchover_perform(ClusterOperation& op)
MariaDBServer* demotion_target,
json_t** error_out)
{ {
MariaDBServer* const promotion_target = op.promotion_target;
MariaDBServer* const demotion_target = op.demotion_target;
json_t** const error_out = op.error_out;
mxb_assert(promotion_target && demotion_target); mxb_assert(promotion_target && demotion_target);
// Total time limit on how long this operation may take. Checked and modified after significant steps are maxbase::StopWatch timer;
// completed.
int seconds_remaining = m_switchover_timeout;
time_t start_time = time(NULL);
// Step 1: Save all slaves except promotion target to an array. // Step 1: Save all slaves except promotion target to an array.
// Try to redirect even disconnected slaves. // Try to redirect even disconnected slaves.
@ -711,21 +698,19 @@ bool MariaDBMonitor::switchover_perform(MariaDBServer* promotion_target,
{ {
m_cluster_modified = true; m_cluster_modified = true;
bool catchup_and_promote_success = false; bool catchup_and_promote_success = false;
time_t step2_time = time(NULL); op.time_remaining -= timer.restart();
seconds_remaining -= difftime(step2_time, start_time);
// Step 3: Wait for the slaves (including promotion target) to catch up with master. // Step 3: Wait for the slaves (including promotion target) to catch up with master.
ServerArray catchup_slaves = redirectable_slaves; ServerArray catchup_slaves = redirectable_slaves;
catchup_slaves.push_back(promotion_target); catchup_slaves.push_back(promotion_target);
if (switchover_wait_slaves_catchup(catchup_slaves, if (switchover_wait_slaves_catchup(catchup_slaves,
demotion_target->m_gtid_binlog_pos, demotion_target->m_gtid_binlog_pos,
seconds_remaining, op.time_remaining.secs(),
error_out)) error_out))
{ {
time_t step3_time = time(NULL); auto step3_duration = timer.restart();
int seconds_step3 = difftime(step3_time, step2_time); MXS_DEBUG("Switchover: slave catchup took %.1f seconds.", step3_duration.secs());
MXS_DEBUG("Switchover: slave catchup took %d seconds.", seconds_step3); op.time_remaining -= step3_duration;
seconds_remaining -= seconds_step3;
// Step 4: On new master STOP and RESET SLAVE, set read-only to off. // Step 4: On new master STOP and RESET SLAVE, set read-only to off.
if (promote_new_master(promotion_target, error_out)) if (promote_new_master(promotion_target, error_out))
@ -745,8 +730,7 @@ bool MariaDBMonitor::switchover_perform(MariaDBServer* promotion_target,
bool success = redirectable_slaves.empty() ? start_ok : start_ok || redirects > 0; bool success = redirectable_slaves.empty() ? start_ok : start_ok || redirects > 0;
if (success) if (success)
{ {
time_t step5_time = time(NULL); op.time_remaining -= timer.restart();
seconds_remaining -= difftime(step5_time, step3_time);
// Step 6: Finally, add an event to the new master to advance gtid and wait for the slaves // Step 6: Finally, add an event to the new master to advance gtid and wait for the slaves
// to receive it. If using external replication, skip this step. Come up with an // to receive it. If using external replication, skip this step. Come up with an
@ -758,16 +742,14 @@ bool MariaDBMonitor::switchover_perform(MariaDBServer* promotion_target,
} }
else if (wait_cluster_stabilization(promotion_target, else if (wait_cluster_stabilization(promotion_target,
redirected_slaves, redirected_slaves,
seconds_remaining)) op.time_remaining.secs()))
{ {
rval = true; rval = true;
time_t step6_time = time(NULL); auto step6_duration = timer.restart();
int seconds_step6 = difftime(step6_time, step5_time); op.time_remaining -= step6_duration;
seconds_remaining -= seconds_step6; MXS_DEBUG("Switchover: slave replication confirmation took %.1f seconds with "
MXS_DEBUG("Switchover: slave replication confirmation took %d seconds with " "%.1f seconds to spare.",
"%d seconds to spare.", step6_duration.secs(), op.time_remaining.secs());
seconds_step6,
seconds_remaining);
} }
} }
else else
@ -1762,24 +1744,16 @@ static void print_redirect_errors(MariaDBServer* first_server,
* @param promotion_server The server which should be promoted. If null, monitor will autoselect. * @param promotion_server The server which should be promoted. If null, monitor will autoselect.
* @param demotion_server The server which should be demoted. Can be null for autoselect. * @param demotion_server The server which should be demoted. Can be null for autoselect.
* @param log_mode Logging mode * @param log_mode Logging mode
* @param promotion_target_out Output for promotion target
* @param demotion_target_out Output for demotion target
* @param error_out Error output * @param error_out Error output
* @return True if cluster is suitable and server parameters were valid * @return Operation object if cluster is suitable and switchover may proceed, or NULL on error
*/ */
bool MariaDBMonitor::switchover_prepare(SERVER* promotion_server, unique_ptr<ClusterOperation> MariaDBMonitor::switchover_prepare(SERVER* promotion_server,
SERVER* demotion_server, SERVER* demotion_server,
Log log_mode, Log log_mode,
MariaDBServer** promotion_target_out, json_t** error_out)
MariaDBServer** demotion_target_out,
json_t** error_out)
{ {
const auto op = OperationType::SWITCHOVER;
// Check that both servers are ok if specified, or autoselect them. Demotion target must be checked // Check that both servers are ok if specified, or autoselect them. Demotion target must be checked
// first since the promotion target depends on it. // first since the promotion target depends on it.
mxb_assert(promotion_target_out && demotion_target_out
&& !*promotion_target_out && !*demotion_target_out);
MariaDBServer* demotion_target = NULL; MariaDBServer* demotion_target = NULL;
string demotion_msg; string demotion_msg;
if (demotion_server) if (demotion_server)
@ -1823,6 +1797,7 @@ bool MariaDBMonitor::switchover_prepare(SERVER* promotion_server,
} }
} }
const auto op_type = OperationType::SWITCHOVER;
MariaDBServer* promotion_target = NULL; MariaDBServer* promotion_target = NULL;
if (demotion_target) if (demotion_target)
{ {
@ -1835,7 +1810,7 @@ bool MariaDBMonitor::switchover_prepare(SERVER* promotion_server,
{ {
PRINT_ERROR_IF(log_mode, error_out, NO_SERVER, promotion_server->name, m_monitor->name); PRINT_ERROR_IF(log_mode, error_out, NO_SERVER, promotion_server->name, m_monitor->name);
} }
else if (!promotion_candidate->can_be_promoted(op, demotion_target, &promotion_msg)) else if (!promotion_candidate->can_be_promoted(op_type, demotion_target, &promotion_msg))
{ {
const char msg[] = "'%s' is not a valid promotion target for switchover because %s"; const char msg[] = "'%s' is not a valid promotion target for switchover because %s";
PRINT_ERROR_IF(log_mode, error_out, msg, promotion_candidate->name(), promotion_msg.c_str()); PRINT_ERROR_IF(log_mode, error_out, msg, promotion_candidate->name(), promotion_msg.c_str());
@ -1849,7 +1824,7 @@ bool MariaDBMonitor::switchover_prepare(SERVER* promotion_server,
{ {
// Autoselect. More involved than autoselecting the demotion target.
MariaDBServer* promotion_candidate = select_promotion_target(demotion_target, MariaDBServer* promotion_candidate = select_promotion_target(demotion_target,
op, op_type,
log_mode, log_mode,
error_out); error_out);
if (promotion_candidate) if (promotion_candidate)
@ -1869,13 +1844,16 @@ bool MariaDBMonitor::switchover_prepare(SERVER* promotion_server,
gtid_ok = check_gtid_replication(log_mode, demotion_target, error_out); gtid_ok = check_gtid_replication(log_mode, demotion_target, error_out);
} }
unique_ptr<ClusterOperation> rval;
if (promotion_target && demotion_target && gtid_ok) if (promotion_target && demotion_target && gtid_ok)
{ {
*demotion_target_out = demotion_target; maxbase::Duration time_limit((double)m_switchover_timeout);
*promotion_target_out = promotion_target; rval.reset(new ClusterOperation(op_type,
return true; promotion_target, demotion_target,
demotion_target == m_master, m_handle_event_scheduler,
error_out, time_limit));
} }
return false; return rval;
} }
void MariaDBMonitor::enforce_read_only_on_slaves() void MariaDBMonitor::enforce_read_only_on_slaves()
@ -1925,27 +1903,19 @@ void MariaDBMonitor::handle_low_disk_space_master()
// Looks like the master should be swapped out. Before trying it, check if there is even // Looks like the master should be swapped out. Before trying it, check if there is even
// a likely valid slave to swap to. // a likely valid slave to swap to.
MariaDBServer* demotion_target = NULL;
MariaDBServer* promotion_target = NULL;
Log log_mode = m_warn_switchover_precond ? Log::ON : Log::OFF; Log log_mode = m_warn_switchover_precond ? Log::ON : Log::OFF;
auto ok_to_switch = switchover_prepare(NULL, auto op = switchover_prepare(NULL, m_master->m_server_base->server, log_mode, NULL);
m_master->m_server_base->server, if (op)
log_mode,
&promotion_target,
&demotion_target,
NULL);
if (ok_to_switch)
{ {
m_warn_switchover_precond = true; m_warn_switchover_precond = true;
bool switched = switchover_perform(promotion_target, demotion_target, NULL); bool switched = switchover_perform(*op);
if (switched) if (switched)
{ {
MXS_NOTICE("Switchover %s -> %s performed.", MXS_NOTICE(SWITCHOVER_OK, op->demotion_target->name(), op->promotion_target->name());
demotion_target->name(),
promotion_target->name());
} }
else else
{ {
MXS_ERROR(SWITCHOVER_FAIL, op->demotion_target->name(), op->promotion_target->name());
report_and_disable("switchover", report_and_disable("switchover",
CN_SWITCHOVER_ON_LOW_DISK_SPACE, CN_SWITCHOVER_ON_LOW_DISK_SPACE,
&m_switchover_on_low_disk_space); &m_switchover_on_low_disk_space);

View File

@ -234,15 +234,9 @@ private:
void check_cluster_operations_support(); void check_cluster_operations_support();
// Switchover methods // Switchover methods
bool switchover_prepare(SERVER* new_master, std::unique_ptr<ClusterOperation> switchover_prepare(SERVER* new_master, SERVER* current_master,
SERVER* current_master, Log log_mode, json_t** error_out);
Log log_mode, bool switchover_perform(ClusterOperation& operation);
MariaDBServer** promotion_target_out,
MariaDBServer** demotion_target_out,
json_t** error_out);
bool switchover_perform(MariaDBServer* promotion_target,
MariaDBServer* demotion_target,
json_t** error_out);
bool switchover_demote_master(MariaDBServer* current_master, json_t** err_out); bool switchover_demote_master(MariaDBServer* current_master, json_t** err_out);
bool switchover_wait_slaves_catchup(const ServerArray& slaves, bool switchover_wait_slaves_catchup(const ServerArray& slaves,
const GtidList& gtid, const GtidList& gtid,