MXS-1744 Use gtid querying instead of MASTER_GTID_WAIT when waiting for catchup
MASTER_GTID_WAIT uses gtid_slave_pos when comparing to the target gtid. This creates problems with multi-domain gtids. It's simpler to just query the server for its gtids repeatedly. Also, the method is now in MariaDBServer.
This commit is contained in:
@ -979,7 +979,8 @@ bool MariaDBMonitor::switchover_wait_slaves_catchup(const ServerVector& slaves,
|
||||
{
|
||||
time_t begin = time(NULL);
|
||||
MXS_MONITORED_SERVER* slave = *iter;
|
||||
if (switchover_wait_slave_catchup(slave, gtid, seconds_remaining, read_timeout, err_out))
|
||||
auto slave_server = get_server_info(slave);
|
||||
if (slave_server->wait_until_gtid(gtid, seconds_remaining, err_out))
|
||||
{
|
||||
seconds_remaining -= difftime(time(NULL), begin);
|
||||
}
|
||||
@ -992,67 +993,6 @@ bool MariaDBMonitor::switchover_wait_slaves_catchup(const ServerVector& slaves,
|
||||
return success;
|
||||
}
|
||||
|
||||
/**
|
||||
* Wait until slave replication catches up with the master gtid
|
||||
*
|
||||
* @param slave Slave to wait on
|
||||
* @param gtid Which gtid must be reached
|
||||
* @param total_timeout Maximum wait time in seconds TODO: timeouts
|
||||
* @param read_timeout The value of read_timeout for the connection
|
||||
* @param err_out json object for error printing. Can be NULL.
|
||||
* @return True, if target gtid was reached within allotted time
|
||||
*/
|
||||
bool MariaDBMonitor::switchover_wait_slave_catchup(MXS_MONITORED_SERVER* slave, const GtidList& gtid,
                                                   int total_timeout, int read_timeout,
                                                   json_t** err_out)
{
    ss_dassert(read_timeout > 0);

    // Each MASTER_GTID_WAIT call waits slightly less than the connection read timeout.
    const double wait_per_query = double(read_timeout) - 0.5;
    double time_left = total_timeout;
    string wait_query = gtid.generate_master_gtid_wait_cmd(wait_per_query);

    StringVector row;
    bool caught_up = false;
    bool query_failed = false;

    while (time_left > 0)
    {
        if (wait_per_query > time_left)
        {
            // Final round: only wait for whatever time is still left.
            wait_query = gtid.generate_master_gtid_wait_cmd(time_left);
        }
        // Time accounting assumes every round consumes the full per-query wait.
        time_left -= wait_per_query;

        if (!query_one_row(slave, wait_query.c_str(), 1, &row))
        {
            query_failed = true;
            break;
        }

        // A result of "0" is treated as "target gtid reached".
        caught_up = (row[0] == "0");
        row.clear();
        if (caught_up)
        {
            break;
        }
    }

    if (query_failed)
    {
        PRINT_MXS_JSON_ERROR(err_out, "MASTER_GTID_WAIT() query error on slave '%s'.",
                             slave->server->unique_name);
    }
    else if (!caught_up)
    {
        PRINT_MXS_JSON_ERROR(err_out, "MASTER_GTID_WAIT() timed out on slave '%s'.",
                             slave->server->unique_name);
    }
    return caught_up;
}
|
||||
|
||||
/**
|
||||
* Send an event to new master and wait for slaves to get the event.
|
||||
*
|
||||
|
@ -217,11 +217,6 @@ string Gtid::to_string() const
|
||||
return rval;
|
||||
}
|
||||
|
||||
string GtidList::generate_master_gtid_wait_cmd(double timeout) const
{
    // Assemble: SELECT MASTER_GTID_WAIT("<this gtid list>", <timeout>);
    string query = "SELECT MASTER_GTID_WAIT(\"";
    query += to_string();
    query += "\", ";
    query += std::to_string(timeout);
    query += ");";
    return query;
}
|
||||
|
||||
Gtid GtidList::get_gtid(uint32_t domain) const
|
||||
{
|
||||
Gtid rval;
|
||||
|
@ -147,13 +147,11 @@ public:
|
||||
substraction_mode_t domain_substraction_mode);
|
||||
|
||||
/**
|
||||
* Generate a MASTER_GTID_WAIT()-query to this gtid.
|
||||
* Return an individual gtid with the given domain.
|
||||
*
|
||||
* @param timeout Maximum wait time in seconds
|
||||
* @return The query
|
||||
* @param domain Which domain to search for
|
||||
* @return The gtid within the list. If domain is not found, an invalid gtid is returned.
|
||||
*/
|
||||
std::string generate_master_gtid_wait_cmd(double timeout) const;
|
||||
|
||||
Gtid get_gtid(uint32_t domain) const;
|
||||
|
||||
private:
|
||||
|
@ -199,8 +199,6 @@ private:
|
||||
json_t** err_out);
|
||||
bool switchover_wait_slaves_catchup(const ServerVector& slaves, const GtidList& gtid, int total_timeout,
|
||||
int read_timeout, json_t** err_out);
|
||||
bool switchover_wait_slave_catchup(MXS_MONITORED_SERVER* slave, const GtidList& gtid,
|
||||
int total_timeout, int read_timeout, json_t** err_out);
|
||||
bool wait_cluster_stabilization(MXS_MONITORED_SERVER* new_master, const ServerVector& slaves,
|
||||
int seconds_remaining);
|
||||
bool switchover_check_preferred_master(MXS_MONITORED_SERVER* preferred, json_t** err_out);
|
||||
|
@ -16,6 +16,7 @@
|
||||
#include <inttypes.h>
|
||||
#include <sstream>
|
||||
#include <maxscale/mysql_utils.h>
|
||||
#include <maxscale/thread.h>
|
||||
#include "utilities.hh"
|
||||
|
||||
SlaveStatusInfo::SlaveStatusInfo()
|
||||
@ -347,3 +348,54 @@ bool MariaDBServer::check_replication_settings(print_repl_warnings_t print_warni
|
||||
}
|
||||
return rval;
|
||||
}
|
||||
|
||||
bool MariaDBServer::wait_until_gtid(const GtidList& target, int timeout, json_t** err_out)
|
||||
{
|
||||
bool gtid_reached = false;
|
||||
bool error = false;
|
||||
/* Prefer to use gtid_binlog_pos, as that is more reliable. But if log_slave_updates is not on,
|
||||
* use gtid_current_pos. */
|
||||
const bool use_binlog_pos = rpl_settings.log_bin && rpl_settings.log_slave_updates;
|
||||
|
||||
int seconds_remaining = 1; // Cheat a bit here to allow at least one iteration.
|
||||
int sleep_ms = 200; // How long to sleep on next iteration. Incremented slowly.
|
||||
time_t start_time = time(NULL);
|
||||
while (seconds_remaining > 0 && !gtid_reached && !error)
|
||||
{
|
||||
if (update_gtids())
|
||||
{
|
||||
const GtidList& compare_to = use_binlog_pos ? gtid_binlog_pos : gtid_current_pos;
|
||||
if (GtidList::events_ahead(target, compare_to, GtidList::MISSING_DOMAIN_IGNORE) == 0)
|
||||
{
|
||||
gtid_reached = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Query was successful but target gtid not yet reached. Check elapsed time.
|
||||
seconds_remaining = timeout - difftime(time(NULL), start_time);
|
||||
if (seconds_remaining > 0)
|
||||
{
|
||||
// Sleep for a moment, then try again.
|
||||
thread_millisleep(sleep_ms);
|
||||
sleep_ms += 100; // Sleep a bit more next iteration.
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
error = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (error)
|
||||
{
|
||||
PRINT_MXS_JSON_ERROR(err_out, "Failed to update gtid on server '%s' while waiting for catchup.",
|
||||
server_base->server->unique_name);
|
||||
}
|
||||
else if (!gtid_reached)
|
||||
{
|
||||
PRINT_MXS_JSON_ERROR(err_out, "Slave catchup timed out on slave '%s'.",
|
||||
server_base->server->unique_name);
|
||||
}
|
||||
return gtid_reached;
|
||||
}
|
||||
|
@ -150,4 +150,16 @@ public:
|
||||
* @return True if log_bin is on
|
||||
*/
|
||||
bool check_replication_settings(print_repl_warnings_t print_warnings = WARNINGS_ON);
|
||||
|
||||
/**
|
||||
* Wait until server catches up to the target gtid. Only considers gtid domains common to this server
|
||||
* and the target gtid. The gtid compared is the gtid_binlog_pos if this server has both log_bin and
|
||||
* log_slave_updates on, and gtid_current_pos otherwise.
|
||||
*
|
||||
* @param target Which gtid must be reached
|
||||
* @param timeout Maximum wait time in seconds
|
||||
* @param err_out json object for error printing. Can be NULL.
|
||||
* @return True, if target gtid was reached within allotted time
|
||||
*/
|
||||
bool wait_until_gtid(const GtidList& target, int timeout, json_t** err_out);
|
||||
};
|
||||
|
@ -16,6 +16,7 @@
|
||||
#include <maxscale/cppdefs.hh>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <maxscale/json_api.h>
|
||||
#include <maxscale/monitor.h>
|
||||
|
||||
/** Utility macro for printing both MXS_ERROR and json error */
|
||||
|
Reference in New Issue
Block a user