Cleanup failover/switchover

Replication settings warnings are printed once more. Changed some
parameter names to be more consistent within the monitor.
This commit is contained in:
Esa Korhonen 2018-08-21 18:16:49 +03:00
parent f93578c089
commit 61bb172033
4 changed files with 64 additions and 69 deletions

View File

@ -53,7 +53,7 @@ bool MariaDBMonitor::manual_switchover(SERVER* promotion_server, SERVER* demotio
error_out);
if (ok_to_switch)
{
switchover_done = do_switchover(demotion_target, promotion_target, error_out);
switchover_done = switchover_perform(promotion_target, demotion_target, error_out);
if (switchover_done)
{
MXS_NOTICE("Switchover '%s' -> '%s' performed.",
@ -89,7 +89,7 @@ bool MariaDBMonitor::manual_failover(json_t** output)
bool ok_to_failover = failover_prepare(Log::ON, &promotion_target, &demotion_target, output);
if (ok_to_failover)
{
failover_done = do_failover(promotion_target, demotion_target, output);
failover_done = failover_perform(promotion_target, demotion_target, output);
if (failover_done)
{
MXS_NOTICE("Failover '%s' -> '%s' performed.",
@ -467,15 +467,15 @@ bool MariaDBMonitor::server_is_rejoin_suspect(MariaDBServer* rejoin_cand, json_t
* intermediate step fails, the cluster may be left without a master and manual intervention is
* required to fix things.
*
* @param demotion_target Server to demote
* @param promotion_target Server to promote
* @param err_out json object for error printing. Can be NULL.
* @param demotion_target Server to demote
* @param error_out Error output. Can be NULL.
* @return True if successful. If false, replication may be broken.
*/
bool MariaDBMonitor::do_switchover(MariaDBServer* demotion_target, MariaDBServer* promotion_target,
json_t** err_out)
bool MariaDBMonitor::switchover_perform(MariaDBServer* promotion_target, MariaDBServer* demotion_target,
json_t** error_out)
{
mxb_assert(demotion_target && promotion_target);
mxb_assert(promotion_target && demotion_target);
// Total time limit on how long this operation may take. Checked and modified after significant steps are
// completed.
@ -489,7 +489,7 @@ bool MariaDBMonitor::do_switchover(MariaDBServer* demotion_target, MariaDBServer
bool rval = false;
// Step 2: Set read-only to on, flush logs, update master gtid:s
if (switchover_demote_master(demotion_target, err_out))
if (switchover_demote_master(demotion_target, error_out))
{
m_cluster_modified = true;
bool catchup_and_promote_success = false;
@ -500,7 +500,7 @@ bool MariaDBMonitor::do_switchover(MariaDBServer* demotion_target, MariaDBServer
ServerArray catchup_slaves = redirectable_slaves;
catchup_slaves.push_back(promotion_target);
if (switchover_wait_slaves_catchup(catchup_slaves, demotion_target->m_gtid_binlog_pos,
seconds_remaining, err_out))
seconds_remaining, error_out))
{
time_t step3_time = time(NULL);
int seconds_step3 = difftime(step3_time, step2_time);
@ -508,7 +508,7 @@ bool MariaDBMonitor::do_switchover(MariaDBServer* demotion_target, MariaDBServer
seconds_remaining -= seconds_step3;
// Step 4: On new master STOP and RESET SLAVE, set read-only to off.
if (promote_new_master(promotion_target, err_out))
if (promote_new_master(promotion_target, error_out))
{
catchup_and_promote_success = true;
m_next_master = promotion_target;
@ -549,7 +549,7 @@ bool MariaDBMonitor::do_switchover(MariaDBServer* demotion_target, MariaDBServer
}
else
{
print_redirect_errors(demotion_target, redirectable_slaves, err_out);
print_redirect_errors(demotion_target, redirectable_slaves, error_out);
}
}
}
@ -560,11 +560,11 @@ bool MariaDBMonitor::do_switchover(MariaDBServer* demotion_target, MariaDBServer
const char QUERY_UNDO[] = "SET GLOBAL read_only=0;";
if (mxs_mysql_query(demotion_target->m_server_base->con, QUERY_UNDO) == 0)
{
PRINT_MXS_JSON_ERROR(err_out, "read_only disabled on server %s.", demotion_target->name());
PRINT_MXS_JSON_ERROR(error_out, "read_only disabled on server %s.", demotion_target->name());
}
else
{
PRINT_MXS_JSON_ERROR(err_out, "Could not disable read_only on server %s: '%s'.",
PRINT_MXS_JSON_ERROR(error_out, "Could not disable read_only on server %s: '%s'.",
demotion_target->name(),
mysql_error(demotion_target->m_server_base->con));
}
@ -572,7 +572,7 @@ bool MariaDBMonitor::do_switchover(MariaDBServer* demotion_target, MariaDBServer
// Try to reactivate external replication if any.
if (m_external_master_port != PORT_UNKNOWN)
{
start_external_replication(promotion_target, err_out);
start_external_replication(promotion_target, error_out);
}
}
}
@ -582,14 +582,16 @@ bool MariaDBMonitor::do_switchover(MariaDBServer* demotion_target, MariaDBServer
/**
* Performs failover for a simple topology (1 master, N slaves, no intermediate masters).
*
* @param demotion_target Server to demote
* @param promotion_target Server to promote
* @param err_out Error output
* @param demotion_target Server to demote
* @param error_out Error output. Can be NULL.
* @return True if successful
*/
bool MariaDBMonitor::do_failover(MariaDBServer* promotion_target, MariaDBServer* demotion_target,
json_t** error_out)
bool MariaDBMonitor::failover_perform(MariaDBServer* promotion_target, MariaDBServer* demotion_target,
json_t** error_out)
{
mxb_assert(promotion_target && demotion_target);
// Total time limit on how long this operation may take. Checked and modified after significant steps are
// completed.
int seconds_remaining = m_failover_timeout;
@ -946,6 +948,17 @@ bool MariaDBMonitor::promote_new_master(MariaDBServer* new_master, json_t** err_
return success;
}
/**
* Select a promotion target for failover/switchover. Looks at the slaves of 'demotion_target' and selects
* the server with the most up-do-date event or, if events are equal, the one with the best settings and
* status.
*
* @param demotion_target The former master server/relay
* @param op Switchover or failover
* @param log_mode Print log or operate silently
* @param error_out Error output
* @return The selected promotion target or NULL if no valid candidates
*/
MariaDBServer* MariaDBMonitor::select_promotion_target(MariaDBServer* demotion_target,
ClusterOperation op, Log log_mode,
json_t** error_out)
@ -993,6 +1006,11 @@ MariaDBServer* MariaDBMonitor::select_promotion_target(MariaDBServer* demotion_t
else
{
candidates.push_back(cand);
// Print some warnings about the candidate server.
if (log_mode == Log::ON)
{
cand->warn_replication_settings();
}
}
}
@ -1297,7 +1315,7 @@ void MariaDBMonitor::handle_auto_failover()
MXS_NOTICE("Performing automatic failover to replace failed master '%s'.",
failed_master->name());
failed_master->m_server_base->new_event = false;
if (!do_failover(promotion_target, demotion_target, NULL))
if (!failover_perform(promotion_target, demotion_target, NULL))
{
report_and_disable("failover", CN_AUTO_FAILOVER, &m_auto_failover);
}
@ -1611,7 +1629,7 @@ void MariaDBMonitor::handle_low_disk_space_master()
if (ok_to_switch)
{
m_warn_switchover_precond = true;
bool switched = do_switchover(demotion_target, promotion_target, NULL);
bool switched = switchover_perform(promotion_target, demotion_target, NULL);
if (switched)
{
MXS_NOTICE("Switchover %s -> %s performed.",

View File

@ -213,26 +213,26 @@ private:
void assign_new_master(MariaDBServer* new_master);
// Switchover methods
bool manual_switchover(SERVER* new_master, SERVER* current_master, json_t** error_out);
bool switchover_prepare(SERVER* new_master, SERVER* current_master, Log log_mode,
MariaDBServer** new_master_out, MariaDBServer** current_master_out,
MariaDBServer** promotion_target_out, MariaDBServer** demotion_target_out,
json_t** error_out);
bool do_switchover(MariaDBServer* demotion_target, MariaDBServer* promotion_target, json_t** error_out);
bool switchover_demote_master(MariaDBServer* current_master,
json_t** err_out);
bool switchover_perform(MariaDBServer* promotion_target, MariaDBServer* demotion_target, json_t** error_out);
bool switchover_demote_master(MariaDBServer* current_master, json_t** err_out);
bool switchover_wait_slaves_catchup(const ServerArray& slaves, const GtidList& gtid, int total_timeout,
json_t** err_out);
bool switchover_start_slave(MariaDBServer* old_master, MariaDBServer* new_master);
bool manual_switchover(SERVER* new_master, SERVER* current_master, json_t** error_out);
void handle_low_disk_space_master();
// Failover methods
bool manual_failover(json_t** output);
void handle_auto_failover();
bool cluster_supports_failover(std::string* reasons_out);
bool slave_receiving_events();
bool failover_prepare(Log log_mode, MariaDBServer** promotion_target_out,
MariaDBServer** demotion_target_out, json_t** error_out);
bool do_failover(MariaDBServer* promotion_target, MariaDBServer* demotion_target, json_t** err_out);
bool failover_perform(MariaDBServer* promotion_target, MariaDBServer* demotion_target,
json_t** error_out);
bool cluster_supports_failover(std::string* reasons_out);
bool slave_receiving_events();
bool manual_failover(json_t** output);
void handle_auto_failover();
// Rejoin methods
bool manual_rejoin(SERVER* rejoin_server, json_t** output);

View File

@ -353,38 +353,23 @@ bool MariaDBServer::read_server_variables(string* errmsg_out)
return rval;
}
bool MariaDBServer::check_replication_settings(print_repl_warnings_t print_warnings) const
void MariaDBServer::warn_replication_settings() const
{
bool rval = true;
const char* servername = name();
if (m_rpl_settings.log_bin == false)
if (m_rpl_settings.gtid_strict_mode == false)
{
if (print_warnings == WARNINGS_ON)
{
const char NO_BINLOG[] =
"Slave '%s' has binary log disabled and is not a valid promotion candidate.";
MXS_WARNING(NO_BINLOG, servername);
}
rval = false;
const char NO_STRICT[] =
"Slave '%s' has gtid_strict_mode disabled. Enabling this setting is recommended. "
"For more information, see https://mariadb.com/kb/en/library/gtid/#gtid_strict_mode";
MXS_WARNING(NO_STRICT, servername);
}
else if (print_warnings == WARNINGS_ON)
if (m_rpl_settings.log_slave_updates == false)
{
if (m_rpl_settings.gtid_strict_mode == false)
{
const char NO_STRICT[] =
"Slave '%s' has gtid_strict_mode disabled. Enabling this setting is recommended. "
"For more information, see https://mariadb.com/kb/en/library/gtid/#gtid_strict_mode";
MXS_WARNING(NO_STRICT, servername);
}
if (m_rpl_settings.log_slave_updates == false)
{
const char NO_SLAVE_UPDATES[] =
"Slave '%s' has log_slave_updates disabled. It is a valid candidate but replication "
"will break for lagging slaves if '%s' is promoted.";
MXS_WARNING(NO_SLAVE_UPDATES, servername, servername);
}
const char NO_SLAVE_UPDATES[] =
"Slave '%s' has log_slave_updates disabled. It is a valid candidate but replication "
"will break for lagging slaves if '%s' is promoted.";
MXS_WARNING(NO_SLAVE_UPDATES, servername, servername);
}
return rval;
}
bool MariaDBServer::wait_until_gtid(const GtidList& target, int timeout, json_t** err_out)
@ -1006,7 +991,7 @@ bool MariaDBServer::can_be_demoted_switchover(string* reason_out)
}
else if (!update_replication_settings(&query_error))
{
reason = string_printf("it could not be queried: '%s'.", query_error.c_str());
reason = string_printf("it could not be queried: %s", query_error.c_str());
}
else if (!binlog_on())
{
@ -1084,7 +1069,7 @@ bool MariaDBServer::can_be_promoted(ClusterOperation op,
}
else if (!update_replication_settings(&query_error))
{
reason = string_printf("it could not be queried: '%s'.", query_error.c_str());
reason = string_printf("it could not be queried: %s", query_error.c_str());
}
else if (!binlog_on())
{

View File

@ -17,12 +17,6 @@
#include <maxscale/monitor.h>
#include "gtid.hh"
enum print_repl_warnings_t
{
WARNINGS_ON,
WARNINGS_OFF
};
class QueryResult;
class MariaDBServer;
// Server pointer array
@ -230,12 +224,10 @@ public:
bool read_server_variables(std::string* errmsg_out = NULL);
/**
* Check if server has binary log enabled. Print warnings if gtid_strict_mode or log_slave_updates is off.
*
* @param print_on Print warnings or not
* @return True if log_bin is on
* Print warnings if gtid_strict_mode or log_slave_updates is off. Does not query the server,
* so 'update_replication_settings' should have been called recently to update the values.
*/
bool check_replication_settings(print_repl_warnings_t print_warnings = WARNINGS_ON) const;
void warn_replication_settings() const;
/**
* Wait until server catches up to the target gtid. Only considers gtid domains common to this server