Cleanup failover/switchover

Replication settings warnings are printed once more. Changed some
parameter names to be more consistent within the monitor.
This commit is contained in:
Esa Korhonen
2018-08-21 18:16:49 +03:00
parent f93578c089
commit 61bb172033
4 changed files with 64 additions and 69 deletions

View File

@ -53,7 +53,7 @@ bool MariaDBMonitor::manual_switchover(SERVER* promotion_server, SERVER* demotio
error_out); error_out);
if (ok_to_switch) if (ok_to_switch)
{ {
switchover_done = do_switchover(demotion_target, promotion_target, error_out); switchover_done = switchover_perform(promotion_target, demotion_target, error_out);
if (switchover_done) if (switchover_done)
{ {
MXS_NOTICE("Switchover '%s' -> '%s' performed.", MXS_NOTICE("Switchover '%s' -> '%s' performed.",
@ -89,7 +89,7 @@ bool MariaDBMonitor::manual_failover(json_t** output)
bool ok_to_failover = failover_prepare(Log::ON, &promotion_target, &demotion_target, output); bool ok_to_failover = failover_prepare(Log::ON, &promotion_target, &demotion_target, output);
if (ok_to_failover) if (ok_to_failover)
{ {
failover_done = do_failover(promotion_target, demotion_target, output); failover_done = failover_perform(promotion_target, demotion_target, output);
if (failover_done) if (failover_done)
{ {
MXS_NOTICE("Failover '%s' -> '%s' performed.", MXS_NOTICE("Failover '%s' -> '%s' performed.",
@ -467,15 +467,15 @@ bool MariaDBMonitor::server_is_rejoin_suspect(MariaDBServer* rejoin_cand, json_t
* intermediate step fails, the cluster may be left without a master and manual intervention is * intermediate step fails, the cluster may be left without a master and manual intervention is
* required to fix things. * required to fix things.
* *
* @param demotion_target Server to demote
* @param promotion_target Server to promote * @param promotion_target Server to promote
* @param err_out json object for error printing. Can be NULL. * @param demotion_target Server to demote
* @param error_out Error output. Can be NULL.
* @return True if successful. If false, replication may be broken. * @return True if successful. If false, replication may be broken.
*/ */
bool MariaDBMonitor::do_switchover(MariaDBServer* demotion_target, MariaDBServer* promotion_target, bool MariaDBMonitor::switchover_perform(MariaDBServer* promotion_target, MariaDBServer* demotion_target,
json_t** err_out) json_t** error_out)
{ {
mxb_assert(demotion_target && promotion_target); mxb_assert(promotion_target && demotion_target);
// Total time limit on how long this operation may take. Checked and modified after significant steps are // Total time limit on how long this operation may take. Checked and modified after significant steps are
// completed. // completed.
@ -489,7 +489,7 @@ bool MariaDBMonitor::do_switchover(MariaDBServer* demotion_target, MariaDBServer
bool rval = false; bool rval = false;
// Step 2: Set read-only to on, flush logs, update master gtid:s // Step 2: Set read-only to on, flush logs, update master gtid:s
if (switchover_demote_master(demotion_target, err_out)) if (switchover_demote_master(demotion_target, error_out))
{ {
m_cluster_modified = true; m_cluster_modified = true;
bool catchup_and_promote_success = false; bool catchup_and_promote_success = false;
@ -500,7 +500,7 @@ bool MariaDBMonitor::do_switchover(MariaDBServer* demotion_target, MariaDBServer
ServerArray catchup_slaves = redirectable_slaves; ServerArray catchup_slaves = redirectable_slaves;
catchup_slaves.push_back(promotion_target); catchup_slaves.push_back(promotion_target);
if (switchover_wait_slaves_catchup(catchup_slaves, demotion_target->m_gtid_binlog_pos, if (switchover_wait_slaves_catchup(catchup_slaves, demotion_target->m_gtid_binlog_pos,
seconds_remaining, err_out)) seconds_remaining, error_out))
{ {
time_t step3_time = time(NULL); time_t step3_time = time(NULL);
int seconds_step3 = difftime(step3_time, step2_time); int seconds_step3 = difftime(step3_time, step2_time);
@ -508,7 +508,7 @@ bool MariaDBMonitor::do_switchover(MariaDBServer* demotion_target, MariaDBServer
seconds_remaining -= seconds_step3; seconds_remaining -= seconds_step3;
// Step 4: On new master STOP and RESET SLAVE, set read-only to off. // Step 4: On new master STOP and RESET SLAVE, set read-only to off.
if (promote_new_master(promotion_target, err_out)) if (promote_new_master(promotion_target, error_out))
{ {
catchup_and_promote_success = true; catchup_and_promote_success = true;
m_next_master = promotion_target; m_next_master = promotion_target;
@ -549,7 +549,7 @@ bool MariaDBMonitor::do_switchover(MariaDBServer* demotion_target, MariaDBServer
} }
else else
{ {
print_redirect_errors(demotion_target, redirectable_slaves, err_out); print_redirect_errors(demotion_target, redirectable_slaves, error_out);
} }
} }
} }
@ -560,11 +560,11 @@ bool MariaDBMonitor::do_switchover(MariaDBServer* demotion_target, MariaDBServer
const char QUERY_UNDO[] = "SET GLOBAL read_only=0;"; const char QUERY_UNDO[] = "SET GLOBAL read_only=0;";
if (mxs_mysql_query(demotion_target->m_server_base->con, QUERY_UNDO) == 0) if (mxs_mysql_query(demotion_target->m_server_base->con, QUERY_UNDO) == 0)
{ {
PRINT_MXS_JSON_ERROR(err_out, "read_only disabled on server %s.", demotion_target->name()); PRINT_MXS_JSON_ERROR(error_out, "read_only disabled on server %s.", demotion_target->name());
} }
else else
{ {
PRINT_MXS_JSON_ERROR(err_out, "Could not disable read_only on server %s: '%s'.", PRINT_MXS_JSON_ERROR(error_out, "Could not disable read_only on server %s: '%s'.",
demotion_target->name(), demotion_target->name(),
mysql_error(demotion_target->m_server_base->con)); mysql_error(demotion_target->m_server_base->con));
} }
@ -572,7 +572,7 @@ bool MariaDBMonitor::do_switchover(MariaDBServer* demotion_target, MariaDBServer
// Try to reactivate external replication if any. // Try to reactivate external replication if any.
if (m_external_master_port != PORT_UNKNOWN) if (m_external_master_port != PORT_UNKNOWN)
{ {
start_external_replication(promotion_target, err_out); start_external_replication(promotion_target, error_out);
} }
} }
} }
@ -582,14 +582,16 @@ bool MariaDBMonitor::do_switchover(MariaDBServer* demotion_target, MariaDBServer
/** /**
* Performs failover for a simple topology (1 master, N slaves, no intermediate masters). * Performs failover for a simple topology (1 master, N slaves, no intermediate masters).
* *
* @param demotion_target Server to demote
* @param promotion_target Server to promote * @param promotion_target Server to promote
* @param err_out Error output * @param demotion_target Server to demote
* @param error_out Error output. Can be NULL.
* @return True if successful * @return True if successful
*/ */
bool MariaDBMonitor::do_failover(MariaDBServer* promotion_target, MariaDBServer* demotion_target, bool MariaDBMonitor::failover_perform(MariaDBServer* promotion_target, MariaDBServer* demotion_target,
json_t** error_out) json_t** error_out)
{ {
mxb_assert(promotion_target && demotion_target);
// Total time limit on how long this operation may take. Checked and modified after significant steps are // Total time limit on how long this operation may take. Checked and modified after significant steps are
// completed. // completed.
int seconds_remaining = m_failover_timeout; int seconds_remaining = m_failover_timeout;
@ -946,6 +948,17 @@ bool MariaDBMonitor::promote_new_master(MariaDBServer* new_master, json_t** err_
return success; return success;
} }
/**
* Select a promotion target for failover/switchover. Looks at the slaves of 'demotion_target' and selects
* the server with the most up-do-date event or, if events are equal, the one with the best settings and
* status.
*
* @param demotion_target The former master server/relay
* @param op Switchover or failover
* @param log_mode Print log or operate silently
* @param error_out Error output
* @return The selected promotion target or NULL if no valid candidates
*/
MariaDBServer* MariaDBMonitor::select_promotion_target(MariaDBServer* demotion_target, MariaDBServer* MariaDBMonitor::select_promotion_target(MariaDBServer* demotion_target,
ClusterOperation op, Log log_mode, ClusterOperation op, Log log_mode,
json_t** error_out) json_t** error_out)
@ -993,6 +1006,11 @@ MariaDBServer* MariaDBMonitor::select_promotion_target(MariaDBServer* demotion_t
else else
{ {
candidates.push_back(cand); candidates.push_back(cand);
// Print some warnings about the candidate server.
if (log_mode == Log::ON)
{
cand->warn_replication_settings();
}
} }
} }
@ -1297,7 +1315,7 @@ void MariaDBMonitor::handle_auto_failover()
MXS_NOTICE("Performing automatic failover to replace failed master '%s'.", MXS_NOTICE("Performing automatic failover to replace failed master '%s'.",
failed_master->name()); failed_master->name());
failed_master->m_server_base->new_event = false; failed_master->m_server_base->new_event = false;
if (!do_failover(promotion_target, demotion_target, NULL)) if (!failover_perform(promotion_target, demotion_target, NULL))
{ {
report_and_disable("failover", CN_AUTO_FAILOVER, &m_auto_failover); report_and_disable("failover", CN_AUTO_FAILOVER, &m_auto_failover);
} }
@ -1611,7 +1629,7 @@ void MariaDBMonitor::handle_low_disk_space_master()
if (ok_to_switch) if (ok_to_switch)
{ {
m_warn_switchover_precond = true; m_warn_switchover_precond = true;
bool switched = do_switchover(demotion_target, promotion_target, NULL); bool switched = switchover_perform(promotion_target, demotion_target, NULL);
if (switched) if (switched)
{ {
MXS_NOTICE("Switchover %s -> %s performed.", MXS_NOTICE("Switchover %s -> %s performed.",

View File

@ -213,26 +213,26 @@ private:
void assign_new_master(MariaDBServer* new_master); void assign_new_master(MariaDBServer* new_master);
// Switchover methods // Switchover methods
bool manual_switchover(SERVER* new_master, SERVER* current_master, json_t** error_out);
bool switchover_prepare(SERVER* new_master, SERVER* current_master, Log log_mode, bool switchover_prepare(SERVER* new_master, SERVER* current_master, Log log_mode,
MariaDBServer** new_master_out, MariaDBServer** current_master_out, MariaDBServer** promotion_target_out, MariaDBServer** demotion_target_out,
json_t** error_out); json_t** error_out);
bool do_switchover(MariaDBServer* demotion_target, MariaDBServer* promotion_target, json_t** error_out); bool switchover_perform(MariaDBServer* promotion_target, MariaDBServer* demotion_target, json_t** error_out);
bool switchover_demote_master(MariaDBServer* current_master, bool switchover_demote_master(MariaDBServer* current_master, json_t** err_out);
json_t** err_out);
bool switchover_wait_slaves_catchup(const ServerArray& slaves, const GtidList& gtid, int total_timeout, bool switchover_wait_slaves_catchup(const ServerArray& slaves, const GtidList& gtid, int total_timeout,
json_t** err_out); json_t** err_out);
bool switchover_start_slave(MariaDBServer* old_master, MariaDBServer* new_master); bool switchover_start_slave(MariaDBServer* old_master, MariaDBServer* new_master);
bool manual_switchover(SERVER* new_master, SERVER* current_master, json_t** error_out);
void handle_low_disk_space_master(); void handle_low_disk_space_master();
// Failover methods // Failover methods
bool manual_failover(json_t** output);
void handle_auto_failover();
bool cluster_supports_failover(std::string* reasons_out);
bool slave_receiving_events();
bool failover_prepare(Log log_mode, MariaDBServer** promotion_target_out, bool failover_prepare(Log log_mode, MariaDBServer** promotion_target_out,
MariaDBServer** demotion_target_out, json_t** error_out); MariaDBServer** demotion_target_out, json_t** error_out);
bool do_failover(MariaDBServer* promotion_target, MariaDBServer* demotion_target, json_t** err_out); bool failover_perform(MariaDBServer* promotion_target, MariaDBServer* demotion_target,
json_t** error_out);
bool cluster_supports_failover(std::string* reasons_out);
bool slave_receiving_events();
bool manual_failover(json_t** output);
void handle_auto_failover();
// Rejoin methods // Rejoin methods
bool manual_rejoin(SERVER* rejoin_server, json_t** output); bool manual_rejoin(SERVER* rejoin_server, json_t** output);

View File

@ -353,38 +353,23 @@ bool MariaDBServer::read_server_variables(string* errmsg_out)
return rval; return rval;
} }
bool MariaDBServer::check_replication_settings(print_repl_warnings_t print_warnings) const void MariaDBServer::warn_replication_settings() const
{ {
bool rval = true;
const char* servername = name(); const char* servername = name();
if (m_rpl_settings.log_bin == false) if (m_rpl_settings.gtid_strict_mode == false)
{ {
if (print_warnings == WARNINGS_ON) const char NO_STRICT[] =
{ "Slave '%s' has gtid_strict_mode disabled. Enabling this setting is recommended. "
const char NO_BINLOG[] = "For more information, see https://mariadb.com/kb/en/library/gtid/#gtid_strict_mode";
"Slave '%s' has binary log disabled and is not a valid promotion candidate."; MXS_WARNING(NO_STRICT, servername);
MXS_WARNING(NO_BINLOG, servername);
}
rval = false;
} }
else if (print_warnings == WARNINGS_ON) if (m_rpl_settings.log_slave_updates == false)
{ {
if (m_rpl_settings.gtid_strict_mode == false) const char NO_SLAVE_UPDATES[] =
{ "Slave '%s' has log_slave_updates disabled. It is a valid candidate but replication "
const char NO_STRICT[] = "will break for lagging slaves if '%s' is promoted.";
"Slave '%s' has gtid_strict_mode disabled. Enabling this setting is recommended. " MXS_WARNING(NO_SLAVE_UPDATES, servername, servername);
"For more information, see https://mariadb.com/kb/en/library/gtid/#gtid_strict_mode";
MXS_WARNING(NO_STRICT, servername);
}
if (m_rpl_settings.log_slave_updates == false)
{
const char NO_SLAVE_UPDATES[] =
"Slave '%s' has log_slave_updates disabled. It is a valid candidate but replication "
"will break for lagging slaves if '%s' is promoted.";
MXS_WARNING(NO_SLAVE_UPDATES, servername, servername);
}
} }
return rval;
} }
bool MariaDBServer::wait_until_gtid(const GtidList& target, int timeout, json_t** err_out) bool MariaDBServer::wait_until_gtid(const GtidList& target, int timeout, json_t** err_out)
@ -1006,7 +991,7 @@ bool MariaDBServer::can_be_demoted_switchover(string* reason_out)
} }
else if (!update_replication_settings(&query_error)) else if (!update_replication_settings(&query_error))
{ {
reason = string_printf("it could not be queried: '%s'.", query_error.c_str()); reason = string_printf("it could not be queried: %s", query_error.c_str());
} }
else if (!binlog_on()) else if (!binlog_on())
{ {
@ -1084,7 +1069,7 @@ bool MariaDBServer::can_be_promoted(ClusterOperation op,
} }
else if (!update_replication_settings(&query_error)) else if (!update_replication_settings(&query_error))
{ {
reason = string_printf("it could not be queried: '%s'.", query_error.c_str()); reason = string_printf("it could not be queried: %s", query_error.c_str());
} }
else if (!binlog_on()) else if (!binlog_on())
{ {

View File

@ -17,12 +17,6 @@
#include <maxscale/monitor.h> #include <maxscale/monitor.h>
#include "gtid.hh" #include "gtid.hh"
enum print_repl_warnings_t
{
WARNINGS_ON,
WARNINGS_OFF
};
class QueryResult; class QueryResult;
class MariaDBServer; class MariaDBServer;
// Server pointer array // Server pointer array
@ -230,12 +224,10 @@ public:
bool read_server_variables(std::string* errmsg_out = NULL); bool read_server_variables(std::string* errmsg_out = NULL);
/** /**
* Check if server has binary log enabled. Print warnings if gtid_strict_mode or log_slave_updates is off. * Print warnings if gtid_strict_mode or log_slave_updates is off. Does not query the server,
* * so 'update_replication_settings' should have been called recently to update the values.
* @param print_on Print warnings or not
* @return True if log_bin is on
*/ */
bool check_replication_settings(print_repl_warnings_t print_warnings = WARNINGS_ON) const; void warn_replication_settings() const;
/** /**
* Wait until server catches up to the target gtid. Only considers gtid domains common to this server * Wait until server catches up to the target gtid. Only considers gtid domains common to this server