diff --git a/server/modules/monitor/mariadbmon/cluster_manipulation.cc b/server/modules/monitor/mariadbmon/cluster_manipulation.cc index f935e333e..22d03a050 100644 --- a/server/modules/monitor/mariadbmon/cluster_manipulation.cc +++ b/server/modules/monitor/mariadbmon/cluster_manipulation.cc @@ -733,29 +733,31 @@ bool MariaDBMonitor::switchover_perform(ClusterOperation& op) ServerArray redirectable_slaves = get_redirectables(promotion_target, demotion_target); bool rval = false; - // Step 2: Set read-only to on, flush logs, update master gtid:s + // Step 2: Set read-only to on, flush logs, update gtid:s. if (demotion_target->demote(op)) { m_cluster_modified = true; bool catchup_and_promote_success = false; - maxbase::StopWatch timer; - // Step 3: Wait for the slaves (including promotion target) to catch up with master. - ServerArray catchup_slaves = redirectable_slaves; - catchup_slaves.push_back(promotion_target); - if (switchover_wait_slaves_catchup(catchup_slaves, - demotion_target->m_gtid_binlog_pos, - op.time_remaining.secs(), - error_out)) + StopWatch timer; + // Step 3: Wait for the promotion target to catch up with the demotion target. Disregard the other + // slaves of the promotion target to avoid needless waiting. + // The gtid:s of the demotion target were updated at the end of demotion. + if (promotion_target->catchup_to_master(op)) { - auto step3_duration = timer.lap(); - MXS_DEBUG("Switchover: slave catchup took %.1f seconds.", step3_duration.secs()); - op.time_remaining -= step3_duration; - - // Step 4: On new master STOP and RESET SLAVE, set read-only to off. + MXS_INFO("Switchover: Catchup took %.1f seconds.", timer.lap().secs()); + // Step 4: On new master: remove slave connections, set read-only to OFF etc. if (promotion_target->promote(op)) { + // Point of no return. Even if following steps fail, do not try to undo. + // Switchover considered at least partially successful. catchup_and_promote_success = true; - m_next_master = promotion_target; + rval = true; + if (op.demotion_target_is_master) + { + // Force a master swap on next tick. + m_next_master = promotion_target; + } + timer.restart(); // Step 5: Redirect slaves and start replication on old master. ServerArray redirected_slaves; @@ -764,35 +766,20 @@ bool MariaDBMonitor::switchover_perform(ClusterOperation& op) { redirected_slaves.push_back(demotion_target); } + op.time_remaining -= timer.lap(); + int redirects = redirect_slaves_ex(op, redirectable_slaves, &redirected_slaves); bool success = redirectable_slaves.empty() ? start_ok : start_ok || redirects > 0; if (success) { - op.time_remaining -= timer.lap(); - // Step 6: Finally, add an event to the new master to advance gtid and wait for the slaves - // to receive it. If using external replication, skip this step. Come up with an - // alternative later. - if (m_external_master_port != PORT_UNKNOWN) - { - MXS_WARNING("Replicating from external master, skipping final check."); - rval = true; - } - else if (wait_cluster_stabilization(promotion_target, - redirected_slaves, - op.time_remaining.secs())) - { - rval = true; - auto step6_duration = timer.lap(); - op.time_remaining -= step6_duration; - MXS_DEBUG("Switchover: slave replication confirmation took %.1f seconds with " - "%.1f seconds to spare.", - step6_duration.secs(), op.time_remaining.secs()); - } - } - else - { - print_redirect_errors(demotion_target, redirectable_slaves, error_out); + timer.restart(); + // Step 6: Finally, check that slaves are replicating. + wait_cluster_stabilization_ex(op, redirected_slaves); + auto step6_duration = timer.lap(); + MXS_INFO("Switchover: slave replication confirmation took %.1f seconds with " + "%.1f seconds to spare.", + step6_duration.secs(), op.time_remaining.secs()); } } } @@ -994,46 +981,6 @@ bool MariaDBMonitor::switchover_demote_master(MariaDBServer* current_master, jso return !any_error(); } -/** - * Wait until slave replication catches up with the master gtid for all slaves in the vector. - * - * @param slave Slaves to wait on - * @param gtid Which gtid must be reached - * @param total_timeout Maximum wait time in seconds - * @param err_out json object for error printing. Can be NULL. - * @return True, if target gtid was reached within allotted time for all servers - */ -bool MariaDBMonitor::switchover_wait_slaves_catchup(const ServerArray& slaves, - const GtidList& gtid, - int total_timeout, - json_t** err_out) -{ - bool success = true; - int seconds_remaining = total_timeout; - - for (auto iter = slaves.begin(); iter != slaves.end() && success; iter++) - { - if (seconds_remaining <= 0) - { - success = false; - } - else - { - time_t begin = time(NULL); - MariaDBServer* slave_server = *iter; - if (slave_server->wait_until_gtid(gtid, seconds_remaining, err_out)) - { - seconds_remaining -= difftime(time(NULL), begin); - } - else - { - success = false; - } - } - } - return success; -} - /** * Send an event to new master and wait for slaves to get the event. * diff --git a/server/modules/monitor/mariadbmon/mariadbmon.hh b/server/modules/monitor/mariadbmon/mariadbmon.hh index 96f88b8de..4079c7134 100644 --- a/server/modules/monitor/mariadbmon/mariadbmon.hh +++ b/server/modules/monitor/mariadbmon/mariadbmon.hh @@ -234,10 +234,6 @@ private: Log log_mode, json_t** error_out); bool switchover_perform(ClusterOperation& operation); bool switchover_demote_master(MariaDBServer* current_master, json_t** err_out); - bool switchover_wait_slaves_catchup(const ServerArray& slaves, - const GtidList& gtid, - int total_timeout, - json_t** err_out); bool switchover_start_slave(MariaDBServer* old_master, MariaDBServer* new_master); bool manual_switchover(SERVER* new_master, SERVER* current_master, json_t** error_out); void handle_low_disk_space_master(); diff --git a/server/modules/monitor/mariadbmon/mariadbserver.cc b/server/modules/monitor/mariadbmon/mariadbserver.cc index b71761489..aba27eeaa 100644 --- a/server/modules/monitor/mariadbmon/mariadbserver.cc +++ b/server/modules/monitor/mariadbmon/mariadbserver.cc @@ -502,20 +502,24 @@ void MariaDBServer::warn_replication_settings() const } } -bool MariaDBServer::wait_until_gtid(const GtidList& target, int timeout, json_t** err_out) +bool MariaDBServer::catchup_to_master(ClusterOperation& op) { - bool gtid_reached = false; - bool error = false; /* Prefer to use gtid_binlog_pos, as that is more reliable. But if log_slave_updates is not on, * use gtid_current_pos. */ const bool use_binlog_pos = m_rpl_settings.log_bin && m_rpl_settings.log_slave_updates; + bool time_is_up = false; // Check at least once. + bool gtid_reached = false; + bool error = false; + const GtidList& target = op.demotion_target->m_gtid_binlog_pos; + json_t** error_out = op.error_out; - int seconds_remaining = 1; // Cheat a bit here to allow at least one iteration. - int sleep_ms = 200; // How long to sleep on next iteration. Incremented slowly. - time_t start_time = time(NULL); - while (seconds_remaining > 0 && !gtid_reached && !error) + Duration sleep_time(0.2); // How long to sleep before next iteration. Incremented slowly. + StopWatch timer; + + while (!time_is_up && !gtid_reached && !error) { - if (update_gtids()) + string error_msg; + if (update_gtids(&error_msg)) { const GtidList& compare_to = use_binlog_pos ? m_gtid_binlog_pos : m_gtid_current_pos; if (target.events_ahead(compare_to, GtidList::MISSING_DOMAIN_IGNORE) == 0) @@ -524,31 +528,33 @@ bool MariaDBServer::wait_until_gtid(const GtidList& target, int timeout, json_t* } else { - // Query was successful but target gtid not yet reached. Check elapsed time. - seconds_remaining = timeout - difftime(time(NULL), start_time); - if (seconds_remaining > 0) + // Query was successful but target gtid not yet reached. Check how much time left. + op.time_remaining -= timer.lap(); + if (op.time_remaining.secs() > 0) { // Sleep for a moment, then try again. - std::this_thread::sleep_for(std::chrono::milliseconds(sleep_ms)); - sleep_ms += 100; // Sleep a bit more next iteration. + Duration this_sleep = MXS_MIN(sleep_time, op.time_remaining); + std::this_thread::sleep_for(this_sleep); + sleep_time += Duration(0.1); // Sleep a bit more next iteration. + } + else + { + time_is_up = true; } } } else { error = true; + PRINT_MXS_JSON_ERROR(error_out, + "Failed to update gtid on %s while waiting for catchup: %s", + name(), error_msg.c_str()); } } - if (error) + if (!error && !gtid_reached) { - PRINT_MXS_JSON_ERROR(err_out, - "Failed to update gtid on server '%s' while waiting for catchup.", - name()); - } - else if (!gtid_reached) - { - PRINT_MXS_JSON_ERROR(err_out, "Slave catchup timed out on slave '%s'.", name()); + PRINT_MXS_JSON_ERROR(error_out, "Slave catchup timed out on slave '%s'.", name()); } return gtid_reached; } @@ -1568,13 +1574,19 @@ bool MariaDBServer::demote(ClusterOperation& op) if (!stop_slave_error) { - // Step 2: If this server is master, disable writes and scheduled events. - // Flush logs, update gtid:s, run demotion_sql_file. + // Step 2: If this server is master, disable writes and scheduled events, flush logs, + // update gtid:s, run demotion_sql_file. + + // In theory, this part should be ran in the opposite order so it would "reverse" + // the promotion code. However, it's probably better to run the most + // likely part to fail, setting read_only=1, first to make undoing easier. Setting + // read_only may fail if another session has table locks or is doing long writes. bool demotion_error = false; if (op.demotion_target_is_master) { mxb_assert(is_master()); // Step 2a: Enabling read-only can take time if writes are on or table locks taken. + // TODO: use max_statement_time to be safe! bool ro_enabled = set_read_only(ReadOnlySetting::ENABLE, op.time_remaining, error_out); op.time_remaining -= timer.lap(); if (!ro_enabled) @@ -1597,33 +1609,7 @@ bool MariaDBServer::demote(ClusterOperation& op) } } - if (!demotion_error) - { - // Step 2c: FLUSH LOGS to ensure that all events have been written to binlog, - // then update gtid:s. - string error_msg; - bool logs_flushed = execute_cmd_time_limit("FLUSH LOGS;", op.time_remaining, &error_msg); - op.time_remaining -= timer.lap(); - if (logs_flushed) - { - if (!update_gtids(&error_msg)) - { - demotion_error = true; - PRINT_MXS_JSON_ERROR(error_out, - "Failed to update gtid:s of %s during demotion: %s.", - name(), error_msg.c_str()); - } - } - else - { - demotion_error = true; - PRINT_MXS_JSON_ERROR(error_out, - "Failed to flush binary logs of %s during demotion: %s.", - name(), error_msg.c_str()); - } - } - - // Step 2d: Run demotion_sql_file if no errors so far. + // Step 2c: Run demotion_sql_file if no errors so far. if (!demotion_error && !op.demotion_sql_file.empty()) { bool file_ran_ok = run_sql_from_file(op.demotion_sql_file, error_out); @@ -1637,19 +1623,48 @@ bool MariaDBServer::demote(ClusterOperation& op) } } - if (demotion_error) + if (!demotion_error) { - // Read_only was enabled but a later step failed. Disable read_only. Connection is - // likely broken so use a short time limit. - // TODO: add smarter undo - set_read_only(ReadOnlySetting::DISABLE, Duration((double)0), NULL); + // Step 2d: FLUSH LOGS to ensure that all events have been written to binlog. + string error_msg; + bool logs_flushed = execute_cmd_time_limit("FLUSH LOGS;", op.time_remaining, &error_msg); + op.time_remaining -= timer.lap(); + if (!logs_flushed) + { + demotion_error = true; + PRINT_MXS_JSON_ERROR(error_out, + "Failed to flush binary logs of %s during demotion: %s.", + name(), error_msg.c_str()); + } } } } if (!demotion_error) { - success = true; + // Finally, update gtid:s. + string error_msg; + if (update_gtids(&error_msg)) + { + success = true; + } + else + { + demotion_error = true; + PRINT_MXS_JSON_ERROR(error_out, + "Failed to update gtid:s of %s during demotion: %s.", + name(), error_msg.c_str()); + } + } + + if (demotion_error && op.demotion_target_is_master) + { + // Read_only was enabled (or tried to be enabled) but a later step failed. + // Disable read_only. Connection is likely broken so use a short time limit. + // Even this is insufficient, because the server may still be executing the old + // 'SET GLOBAL read_only=1' query. + // TODO: add smarter undo, KILL QUERY etc. + set_read_only(ReadOnlySetting::DISABLE, Duration((double)0), NULL); } } return success; diff --git a/server/modules/monitor/mariadbmon/mariadbserver.hh b/server/modules/monitor/mariadbmon/mariadbserver.hh index 9d19c1805..f02f74f81 100644 --- a/server/modules/monitor/mariadbmon/mariadbserver.hh +++ b/server/modules/monitor/mariadbmon/mariadbserver.hh @@ -260,16 +260,17 @@ public: void warn_replication_settings() const; /** - * Wait until server catches up to the target gtid. Only considers gtid domains common to this server - * and the target gtid. The gtid compared is the gtid_binlog_pos if this server has both log_bin and - * log_slave_updates on, and gtid_current_pos otherwise. + * Wait until server catches up to demotion target. Only considers gtid domains common + * to this server and the target. The gtid compared to on the demotion target is 'gtid_binlog_pos'. + * It is not updated during this method. * - * @param target Which gtid must be reached - * @param timeout Maximum wait time in seconds - * @param err_out json object for error printing. Can be NULL. - * @return True, if target gtid was reached within allotted time + * The gtid used for comparison on this server is 'gtid_binlog_pos' if this server has both 'log_bin' + * and 'log_slave_updates' on, and 'gtid_current_pos' otherwise. This server is updated during the + * method. + * + * @return True, if target server gtid was reached within allotted time */ - bool wait_until_gtid(const GtidList& target, int timeout, json_t** err_out); + bool catchup_to_master(ClusterOperation& op); /** * Find slave connection to the target server. If the IO thread is trying to connect