diff --git a/server/modules/monitor/mariadbmon/cluster_manipulation.cc b/server/modules/monitor/mariadbmon/cluster_manipulation.cc index 8e5b730b3..ab253bbb6 100644 --- a/server/modules/monitor/mariadbmon/cluster_manipulation.cc +++ b/server/modules/monitor/mariadbmon/cluster_manipulation.cc @@ -510,7 +510,7 @@ int MariaDBMonitor::redirect_slaves_ex(ClusterOperation& op, else { // No conflict, redirect as normal. - if (redirectable->redirect_existing_slave_conn(op, from, to)) + if (redirectable->redirect_existing_slave_conn(op.general, from, to)) { successes++; redirected->push_back(redirectable); @@ -786,11 +786,11 @@ bool MariaDBMonitor::switchover_perform(ClusterOperation& op) // Step 2: Wait for the promotion target to catch up with the demotion target. Disregard the other // slaves of the promotion target to avoid needless waiting. // The gtid:s of the demotion target were updated at the end of demotion. - if (promotion_target->catchup_to_master(op)) + if (promotion_target->catchup_to_master(op.general, demotion_target->m_gtid_binlog_pos)) { MXS_INFO("Switchover: Catchup took %.1f seconds.", timer.lap().secs()); // Step 3: On new master: remove slave connections, set read-only to OFF etc. - if (promotion_target->promote(op)) + if (promotion_target->promote(op.general, *op.promotion, op.type, demotion_target)) { // Point of no return. Even if following steps fail, do not try to undo. // Switchover considered at least partially successful. @@ -804,7 +804,8 @@ bool MariaDBMonitor::switchover_perform(ClusterOperation& op) // Step 4: Start replication on old master and redirect slaves. ServerArray redirected_to_promo_target; - if (demotion_target->copy_slave_conns(op, op.demotion->conns_to_copy, promotion_target)) + if (demotion_target->copy_slave_conns(op.general, op.demotion->conns_to_copy, + promotion_target)) { redirected_to_promo_target.push_back(demotion_target); } @@ -869,7 +870,7 @@ bool MariaDBMonitor::failover_perform(ClusterOperation& op) bool rval = false; // Step 1: Stop and reset slave, set read-only to OFF. - if (promotion_target->promote(op)) + if (promotion_target->promote(op.general, *op.promotion, op.type, op.demotion_target)) { // Point of no return. Even if following steps fail, do not try to undo. Failover considered // at least partially successful. @@ -1827,3 +1828,22 @@ ServerArray MariaDBMonitor::get_redirectables(const MariaDBServer* old_master, } return redirectable_slaves; } + +ClusterOperation::ClusterOperation(OperationType type, ServerOperation* dem_op, ServerOperation* prom_op, + MariaDBServer* promotion_target, MariaDBServer* demotion_target, + string& replication_user, string& replication_password, + json_t** error, maxbase::Duration time_remaining) + : type(type) + , demotion(dem_op) + , promotion(prom_op) + , general(type, replication_user, replication_password, error, time_remaining) + , promotion_target(promotion_target) + , demotion_target(demotion_target) +{ +} + +ClusterOperation::~ClusterOperation() +{ + delete demotion; + delete promotion; +} diff --git a/server/modules/monitor/mariadbmon/mariadbmon.hh b/server/modules/monitor/mariadbmon/mariadbmon.hh index f8fd9a0d8..e24cc5b79 100644 --- a/server/modules/monitor/mariadbmon/mariadbmon.hh +++ b/server/modules/monitor/mariadbmon/mariadbmon.hh @@ -36,6 +36,33 @@ typedef std::unordered_map IdToServerMap; // Map of cycle number to cycle members. The elements should be ordered for predictability when iterating. typedef std::map CycleMap; +/** + * Class which encapsulates many settings and status descriptors for a failover/switchover. + * Is more convenient to pass around than the separate elements. Most fields are constants or constant + * pointers since they should not change during an operation. + */ +class ClusterOperation +{ +private: + ClusterOperation(const ClusterOperation&) = delete; + ClusterOperation& operator=(const ClusterOperation&) = delete; + +public: + const OperationType type; // Failover or switchover + ServerOperation* const demotion; // Required by MariaDBServer->demote() + ServerOperation* const promotion; // Required by MariaDBServer->promote() + GeneralOpData general; // General operation data + + MariaDBServer* const promotion_target; // Which server will be promoted + MariaDBServer* const demotion_target; // Which server will be demoted + + ClusterOperation(OperationType type, ServerOperation* dem_op, ServerOperation* prom_op, + MariaDBServer* promotion_target, MariaDBServer* demotion_target, + std::string& replication_user, std::string& replication_password, + json_t** error, maxbase::Duration time_remaining); + ~ClusterOperation(); +}; + // MariaDB Monitor instance data class MariaDBMonitor : public maxscale::MonitorInstance { @@ -120,6 +147,25 @@ private: ON }; + class FailoverParams + { + public: + const MariaDBServer* const demotion_target; + ServerOperation promotion; // Required by MariaDBServer->promote() + GeneralOpData general; + + FailoverParams(const MariaDBServer* demotion_target, ServerOperation promotion, + GeneralOpData general); + }; + + class SwitchoverParams + { + public: + ServerOperation demotion; // Required by MariaDBServer->demote() + ServerOperation promotion; // Required by MariaDBServer->promote() + GeneralOpData general; + }; + // Information about a multimaster group (replication cycle) struct CycleInfo { diff --git a/server/modules/monitor/mariadbmon/mariadbserver.cc b/server/modules/monitor/mariadbmon/mariadbserver.cc index 9c400b29a..9fe304037 100644 --- a/server/modules/monitor/mariadbmon/mariadbserver.cc +++ b/server/modules/monitor/mariadbmon/mariadbserver.cc @@ -498,7 +498,7 @@ void MariaDBServer::warn_replication_settings() const } } -bool MariaDBServer::catchup_to_master(ClusterOperation& op) +bool MariaDBServer::catchup_to_master(GeneralOpData& op, const GtidList& target) { /* Prefer to use gtid_binlog_pos, as that is more reliable. But if log_slave_updates is not on, * use gtid_current_pos. */ @@ -506,8 +506,7 @@ bool MariaDBServer::catchup_to_master(ClusterOperation& op) bool time_is_up = false; // Check at least once. bool gtid_reached = false; bool error = false; - const GtidList& target = op.demotion_target->m_gtid_binlog_pos; - json_t** error_out = op.general.error_out; + json_t** error_out = op.error_out; Duration sleep_time(0.2); // How long to sleep before next iteration. Incremented slowly. StopWatch timer; @@ -525,11 +524,11 @@ bool MariaDBServer::catchup_to_master(ClusterOperation& op) else { // Query was successful but target gtid not yet reached. Check how much time left. - op.general.time_remaining -= timer.lap(); - if (op.general.time_remaining.secs() > 0) + op.time_remaining -= timer.lap(); + if (op.time_remaining.secs() > 0) { // Sleep for a moment, then try again. - Duration this_sleep = MXS_MIN(sleep_time, op.general.time_remaining); + Duration this_sleep = MXS_MIN(sleep_time, op.time_remaining); std::this_thread::sleep_for(this_sleep); sleep_time += Duration(0.1); // Sleep a bit more next iteration. } @@ -1448,18 +1447,19 @@ bool MariaDBServer::reset_all_slave_conns(json_t** error_out) return !error; } -bool MariaDBServer::promote(ClusterOperation& op) +bool MariaDBServer::promote(GeneralOpData& general, ServerOperation& promotion, OperationType type, + const MariaDBServer* demotion_target) { - mxb_assert(op.type == OperationType::SWITCHOVER || op.type == OperationType::FAILOVER); - json_t** const error_out = op.general.error_out; + mxb_assert(type == OperationType::SWITCHOVER || type == OperationType::FAILOVER); + json_t** const error_out = general.error_out; // Function should only be called for a master-slave pair. - auto master_conn = slave_connection_status(op.demotion_target); + auto master_conn = slave_connection_status(demotion_target); mxb_assert(master_conn); if (master_conn == NULL) { PRINT_MXS_JSON_ERROR(error_out, "'%s' is not a slave of '%s' and cannot be promoted to its place.", - name(), op.demotion_target->name()); + name(), demotion_target->name()); return false; } @@ -1469,13 +1469,13 @@ bool MariaDBServer::promote(ClusterOperation& op) // target. In case of switchover, remove other slave connections as well since the demotion target // will take them over. bool stopped = false; - if (op.type == OperationType::SWITCHOVER) + if (type == OperationType::SWITCHOVER) { - stopped = remove_slave_conns(op.general, m_slave_status); + stopped = remove_slave_conns(general, m_slave_status); } - else if (op.type == OperationType::FAILOVER) + else if (type == OperationType::FAILOVER) { - stopped = remove_slave_conns(op.general, {*master_conn}); + stopped = remove_slave_conns(general, {*master_conn}); master_conn = NULL; // The connection pointed to may no longer exist. } @@ -1484,22 +1484,22 @@ bool MariaDBServer::promote(ClusterOperation& op) // Step 2: If demotion target is master, meaning this server will become the master, // enable writing and scheduled events. Also, run promotion_sql_file. bool promotion_error = false; - if (op.demotion->to_from_master) + if (promotion.to_from_master) { // Disabling read-only should be quick. - bool ro_disabled = set_read_only(ReadOnlySetting::DISABLE, op.general.time_remaining, error_out); - op.general.time_remaining -= timer.restart(); + bool ro_disabled = set_read_only(ReadOnlySetting::DISABLE, general.time_remaining, error_out); + general.time_remaining -= timer.restart(); if (!ro_disabled) { promotion_error = true; } else { - if (op.promotion->handle_events) + if (promotion.handle_events) { // TODO: Add query replying to enable_events bool events_enabled = enable_events(error_out); - op.general.time_remaining -= timer.restart(); + general.time_remaining -= timer.restart(); if (!events_enabled) { promotion_error = true; @@ -1508,11 +1508,11 @@ bool MariaDBServer::promote(ClusterOperation& op) } // Run promotion_sql_file if no errors so far. - const string& sql_file = op.promotion->sql_file; + const string& sql_file = promotion.sql_file; if (!promotion_error && !sql_file.empty()) { bool file_ran_ok = run_sql_from_file(sql_file, error_out); - op.general.time_remaining -= timer.restart(); + general.time_remaining -= timer.restart(); if (!file_ran_ok) { promotion_error = true; @@ -1528,9 +1528,9 @@ bool MariaDBServer::promote(ClusterOperation& op) // operation. if (!promotion_error) { - if (op.type == OperationType::SWITCHOVER) + if (type == OperationType::SWITCHOVER) { - if (copy_slave_conns(op, op.promotion->conns_to_copy, op.demotion_target)) + if (copy_slave_conns(general, promotion.conns_to_copy, demotion_target)) { success = true; } @@ -1538,12 +1538,12 @@ bool MariaDBServer::promote(ClusterOperation& op) { PRINT_MXS_JSON_ERROR(error_out, "Could not copy slave connections from %s to %s.", - op.demotion_target->name(), name()); + demotion_target->name(), name()); } } - else if (op.type == OperationType::FAILOVER) + else if (type == OperationType::FAILOVER) { - if (merge_slave_conns(op, op.promotion->conns_to_copy)) + if (merge_slave_conns(general, promotion.conns_to_copy)) { success = true; } @@ -1551,7 +1551,7 @@ bool MariaDBServer::promote(ClusterOperation& op) { PRINT_MXS_JSON_ERROR(error_out, "Could not merge slave connections from %s to %s.", - op.demotion_target->name(), name()); + demotion_target->name(), name()); } } } @@ -1829,11 +1829,8 @@ bool MariaDBServer::set_read_only(ReadOnlySetting setting, maxbase::Duration tim * @param conns_to_merge Connections which should be merged * @return True on success */ -bool MariaDBServer::merge_slave_conns(ClusterOperation& op, const SlaveStatusArray& conns_to_merge) +bool MariaDBServer::merge_slave_conns(GeneralOpData& op, const SlaveStatusArray& conns_to_merge) { - mxb_assert(op.promotion_target == this && op.type == OperationType::FAILOVER - && slave_connection_status(op.demotion_target) == NULL); - /* When promoting a server during failover, the situation is more complicated than in switchover. * Connections cannot be moved to the demotion target (= failed server) as it is off. This means * that the promoting server must combine the roles of both itself and the failed server. Only the @@ -1973,10 +1970,10 @@ bool MariaDBServer::merge_slave_conns(ClusterOperation& op, const SlaveStatusArr return !error; } -bool MariaDBServer::copy_slave_conns(ClusterOperation& op, const SlaveStatusArray& conns_to_copy, +bool MariaDBServer::copy_slave_conns(GeneralOpData& op, const SlaveStatusArray& conns_to_copy, const MariaDBServer* replacement) { - mxb_assert(op.type == OperationType::SWITCHOVER && m_slave_status.empty()); + mxb_assert(m_slave_status.empty()); bool start_slave_error = false; for (size_t i = 0; i < conns_to_copy.size() && !start_slave_error; i++) { @@ -2012,21 +2009,22 @@ bool MariaDBServer::copy_slave_conns(ClusterOperation& op, const SlaveStatusArra * @param slave_conn Existing connection to emulate * @return True on success */ -bool MariaDBServer::create_start_slave(ClusterOperation& op, const SlaveStatus& slave_conn) +bool MariaDBServer::create_start_slave(GeneralOpData& op, const SlaveStatus& slave_conn) { + maxbase::Duration& time_remaining = op.time_remaining; StopWatch timer; string error_msg; bool success = false; SlaveStatus new_conn = slave_conn; new_conn.owning_server = name(); string change_master = generate_change_master_cmd(op, new_conn); - bool conn_created = execute_cmd_time_limit(change_master, op.general.time_remaining, &error_msg); - op.general.time_remaining -= timer.restart(); + bool conn_created = execute_cmd_time_limit(change_master, time_remaining, &error_msg); + time_remaining -= timer.restart(); if (conn_created) { string start_slave = string_printf("START SLAVE '%s';", new_conn.name.c_str()); - bool slave_started = execute_cmd_time_limit(start_slave, op.general.time_remaining, &error_msg); - op.general.time_remaining -= timer.restart(); + bool slave_started = execute_cmd_time_limit(start_slave, time_remaining, &error_msg); + time_remaining -= timer.restart(); if (slave_started) { success = true; @@ -2054,36 +2052,37 @@ bool MariaDBServer::create_start_slave(ClusterOperation& op, const SlaveStatus& * @param slave_conn Existing slave connection to emulate * @return Generated query */ -string MariaDBServer::generate_change_master_cmd(ClusterOperation& op, const SlaveStatus& slave_conn) +string MariaDBServer::generate_change_master_cmd(GeneralOpData& op, const SlaveStatus& slave_conn) { string change_cmd; change_cmd += string_printf("CHANGE MASTER '%s' TO MASTER_HOST = '%s', MASTER_PORT = %i, ", slave_conn.name.c_str(), slave_conn.master_host.c_str(), slave_conn.master_port); change_cmd += "MASTER_USE_GTID = current_pos, "; - change_cmd += string_printf("MASTER_USER = '%s', ", op.general.replication_user.c_str()); + change_cmd += string_printf("MASTER_USER = '%s', ", op.replication_user.c_str()); const char MASTER_PW[] = "MASTER_PASSWORD = '%s';"; #if defined (SS_DEBUG) string change_cmd_nopw = change_cmd; change_cmd_nopw += string_printf(MASTER_PW, "******"); MXS_DEBUG("Change master command is '%s'.", change_cmd_nopw.c_str()); #endif - change_cmd += string_printf(MASTER_PW, op.general.replication_password.c_str()); + change_cmd += string_printf(MASTER_PW, op.replication_password.c_str()); return change_cmd; } -bool MariaDBServer::redirect_existing_slave_conn(ClusterOperation& op, const MariaDBServer* old_master, +bool MariaDBServer::redirect_existing_slave_conn(GeneralOpData& op, const MariaDBServer* old_master, const MariaDBServer* new_master) { + auto error_out = op.error_out; + maxbase::Duration& time_remaining = op.time_remaining; StopWatch timer; auto old_conn = slave_connection_status(old_master); mxb_assert(old_conn); bool success = false; // First, just stop the slave connection. - bool stopped = stop_slave_conn(old_conn->name, StopMode::STOP_ONLY, op.general.time_remaining, - op.general.error_out); - op.general.time_remaining -= timer.restart(); + bool stopped = stop_slave_conn(old_conn->name, StopMode::STOP_ONLY, time_remaining, error_out); + time_remaining -= timer.restart(); if (stopped) { SlaveStatus modified_conn = *old_conn; @@ -2092,20 +2091,20 @@ bool MariaDBServer::redirect_existing_slave_conn(ClusterOperation& op, const Mar modified_conn.master_port = target_server->port; string change_master = generate_change_master_cmd(op, modified_conn); string error_msg; - bool changed = execute_cmd_time_limit(change_master, op.general.time_remaining, &error_msg); - op.general.time_remaining -= timer.restart(); + bool changed = execute_cmd_time_limit(change_master, time_remaining, &error_msg); + time_remaining -= timer.restart(); if (changed) { string start = string_printf("START SLAVE '%s';", old_conn->name.c_str()); - bool started = execute_cmd_time_limit(start, op.general.time_remaining, &error_msg); - op.general.time_remaining -= timer.restart(); + bool started = execute_cmd_time_limit(start, time_remaining, &error_msg); + time_remaining -= timer.restart(); if (started) { success = true; } else { - PRINT_MXS_JSON_ERROR(op.general.error_out, + PRINT_MXS_JSON_ERROR(error_out, "%s could not be started: %s", modified_conn.to_short_string().c_str(), error_msg.c_str()); } @@ -2113,7 +2112,7 @@ bool MariaDBServer::redirect_existing_slave_conn(ClusterOperation& op, const Mar else { // TODO: This may currently print out passwords. - PRINT_MXS_JSON_ERROR(op.general.error_out, + PRINT_MXS_JSON_ERROR(error_out, "%s could not be redirected to [%s]:%i: %s", old_conn->to_short_string().c_str(), modified_conn.master_host.c_str(), modified_conn.master_port, diff --git a/server/modules/monitor/mariadbmon/mariadbserver.hh b/server/modules/monitor/mariadbmon/mariadbserver.hh index 4a2776bec..1d084d675 100644 --- a/server/modules/monitor/mariadbmon/mariadbserver.hh +++ b/server/modules/monitor/mariadbmon/mariadbserver.hh @@ -239,7 +239,7 @@ public: * * @return True, if target server gtid was reached within allotted time */ - bool catchup_to_master(ClusterOperation& op); + bool catchup_to_master(GeneralOpData& op, const GtidList& target); /** * Find slave connection to the target server. If the IO thread is trying to connect @@ -357,7 +357,8 @@ public: * @param op Cluster operation descriptor * @return True if successful */ - bool promote(ClusterOperation& op); + bool promote(GeneralOpData& op, ServerOperation& promotion, OperationType type, + const MariaDBServer* demotion_target); /** * Demote this server. Removes all slave connections. If server was master, sets read_only. @@ -375,7 +376,7 @@ public: * @param new_master The new master for the redirected connection * @return True on success */ - bool redirect_existing_slave_conn(ClusterOperation& op, const MariaDBServer* old_master, + bool redirect_existing_slave_conn(GeneralOpData& op, const MariaDBServer* old_master, const MariaDBServer* new_master); /** @@ -391,7 +392,7 @@ public: * @params replacement Which server should rep * @return True on success */ - bool copy_slave_conns(ClusterOperation& op, const SlaveStatusArray& conns_to_copy, + bool copy_slave_conns(GeneralOpData& op, const SlaveStatusArray& conns_to_copy, const MariaDBServer* replacement); /** @@ -543,7 +544,7 @@ private: std::string* errmsg_out); bool set_read_only(ReadOnlySetting value, maxbase::Duration time_limit, json_t** error_out); - bool merge_slave_conns(ClusterOperation& op, const SlaveStatusArray& conns_to_merge); - bool create_start_slave(ClusterOperation& op, const SlaveStatus& slave_conn); - std::string generate_change_master_cmd(ClusterOperation& op, const SlaveStatus& slave_conn); + bool merge_slave_conns(GeneralOpData& op, const SlaveStatusArray& conns_to_merge); + bool create_start_slave(GeneralOpData& op, const SlaveStatus& slave_conn); + std::string generate_change_master_cmd(GeneralOpData& op, const SlaveStatus& slave_conn); }; diff --git a/server/modules/monitor/mariadbmon/server_utils.cc b/server/modules/monitor/mariadbmon/server_utils.cc index 5cfdb068b..a299b3b8b 100644 --- a/server/modules/monitor/mariadbmon/server_utils.cc +++ b/server/modules/monitor/mariadbmon/server_utils.cc @@ -155,25 +155,6 @@ bool SlaveStatus::should_be_copied(string* ignore_reason_out) const return accepted; } -ClusterOperation::ClusterOperation(OperationType type, ServerOperation* dem_op, ServerOperation* prom_op, - MariaDBServer* promotion_target, MariaDBServer* demotion_target, - string& replication_user, string& replication_password, - json_t** error, maxbase::Duration time_remaining) - : type(type) - , demotion(dem_op) - , promotion(prom_op) - , general(type, replication_user, replication_password, error, time_remaining) - , promotion_target(promotion_target) - , demotion_target(demotion_target) -{ -} - -ClusterOperation::~ClusterOperation() -{ - delete demotion; - delete promotion; -} - ServerOperation::ServerOperation(MariaDBServer* target, bool was_is_master, bool handle_events, const std::string& sql_file, const SlaveStatusArray& conns_to_copy) diff --git a/server/modules/monitor/mariadbmon/server_utils.hh b/server/modules/monitor/mariadbmon/server_utils.hh index 44d5aa66b..3db01686c 100644 --- a/server/modules/monitor/mariadbmon/server_utils.hh +++ b/server/modules/monitor/mariadbmon/server_utils.hh @@ -232,33 +232,6 @@ public: json_t** error, maxbase::Duration time_remaining); }; -/** - * Class which encapsulates many settings and status descriptors for a failover/switchover. - * Is more convenient to pass around than the separate elements. Most fields are constants or constant - * pointers since they should not change during an operation. - */ -class ClusterOperation -{ -private: - ClusterOperation(const ClusterOperation&) = delete; - ClusterOperation& operator=(const ClusterOperation&) = delete; - -public: - const OperationType type; // Failover or switchover - ServerOperation* const demotion; // Required by MariaDBServer->demote() - ServerOperation* const promotion; // Required by MariaDBServer->promote() - GeneralOpData general; // General operation data - - MariaDBServer* const promotion_target; // Which server will be promoted - MariaDBServer* const demotion_target; // Which server will be demoted - - ClusterOperation(OperationType type, ServerOperation* dem_op, ServerOperation* prom_op, - MariaDBServer* promotion_target, MariaDBServer* demotion_target, - std::string& replication_user, std::string& replication_password, - json_t** error, maxbase::Duration time_remaining); - ~ClusterOperation(); -}; - // Operation data which concerns a single server class ServerOperation {