MXS-1744 Use gtid querying instead of MASTER_GTID_WAIT when waiting for catchup

MASTER_GTID_WAIT uses gtid_slave_pos when comparing to the target gtid. This creates
problems with multi-domain gtids. It's simpler to just query the server for its
gtids repeatedly. Also, the method is now in MariaDBServer.
This commit is contained in:
Esa Korhonen
2018-04-05 14:32:34 +03:00
parent 174db469f3
commit 147355bbdb
7 changed files with 71 additions and 75 deletions

View File

@ -979,7 +979,8 @@ bool MariaDBMonitor::switchover_wait_slaves_catchup(const ServerVector& slaves,
{
time_t begin = time(NULL);
MXS_MONITORED_SERVER* slave = *iter;
if (switchover_wait_slave_catchup(slave, gtid, seconds_remaining, read_timeout, err_out))
auto slave_server = get_server_info(slave);
if (slave_server->wait_until_gtid(gtid, seconds_remaining, err_out))
{
seconds_remaining -= difftime(time(NULL), begin);
}
@ -992,67 +993,6 @@ bool MariaDBMonitor::switchover_wait_slaves_catchup(const ServerVector& slaves,
return success;
}
/**
 * Block until the replication of a single slave has reached the given gtid. The wait is
 * performed by running MASTER_GTID_WAIT()-queries on the slave until one of them reports
 * success, a query fails or the time runs out.
 *
 * @param slave Slave to wait on
 * @param gtid Which gtid must be reached
 * @param total_timeout Maximum wait time in seconds TODO: timeouts
 * @param read_timeout The value of read_timeout for the connection. Must be positive.
 * @param err_out json object for error printing. Can be NULL.
 * @return True, if target gtid was reached within allotted time
 */
bool MariaDBMonitor::switchover_wait_slave_catchup(MXS_MONITORED_SERVER* slave, const GtidList& gtid,
                                                   int total_timeout, int read_timeout,
                                                   json_t** err_out)
{
    ss_dassert(read_timeout > 0);

    // A single MASTER_GTID_WAIT() call is kept slightly shorter than the backend read timeout
    // (which should be >= 1).
    const double wait_per_query = double(read_timeout) - 0.5;
    double time_left = total_timeout;
    string wait_query = gtid.generate_master_gtid_wait_cmd(wait_per_query);

    StringVector row;
    bool caught_up = false;
    bool query_failed = false;

    while (time_left > 0 && !caught_up && !query_failed)
    {
        if (wait_per_query > time_left)
        {
            // Final round: only wait for the time that is still left.
            wait_query = gtid.generate_master_gtid_wait_cmd(time_left);
        }
        time_left -= wait_per_query;

        if (!query_one_row(slave, wait_query.c_str(), 1, &row))
        {
            query_failed = true;
        }
        else
        {
            // The loop only runs while caught_up is false, so a plain assignment is enough.
            // A result of "0" is treated as the target gtid having been reached.
            caught_up = (row[0] == "0");
            row.clear();
        }
    }

    if (query_failed)
    {
        PRINT_MXS_JSON_ERROR(err_out, "MASTER_GTID_WAIT() query error on slave '%s'.",
                             slave->server->unique_name);
    }
    else if (!caught_up)
    {
        PRINT_MXS_JSON_ERROR(err_out, "MASTER_GTID_WAIT() timed out on slave '%s'.",
                             slave->server->unique_name);
    }
    return caught_up;
}
/**
* Send an event to new master and wait for slaves to get the event.
*

View File

@ -217,11 +217,6 @@ string Gtid::to_string() const
return rval;
}
// Build the "SELECT MASTER_GTID_WAIT(...)" query text for this gtid list, using the given
// timeout (formatted with std::to_string) as the second argument of the function.
string GtidList::generate_master_gtid_wait_cmd(double timeout) const
{
    string query("SELECT MASTER_GTID_WAIT(\"");
    query += to_string();
    query += "\", ";
    query += std::to_string(timeout);
    query += ");";
    return query;
}
Gtid GtidList::get_gtid(uint32_t domain) const
{
Gtid rval;

View File

@ -147,13 +147,11 @@ public:
substraction_mode_t domain_substraction_mode);
/**
* Generate a MASTER_GTID_WAIT()-query to this gtid.
* Return an individual gtid with the given domain.
*
* @param timeout Maximum wait time in seconds
* @return The query
* @param domain Which domain to search for
* @return The gtid within the list. If domain is not found, an invalid gtid is returned.
*/
std::string generate_master_gtid_wait_cmd(double timeout) const;
Gtid get_gtid(uint32_t domain) const;
private:

View File

@ -199,8 +199,6 @@ private:
json_t** err_out);
bool switchover_wait_slaves_catchup(const ServerVector& slaves, const GtidList& gtid, int total_timeout,
int read_timeout, json_t** err_out);
bool switchover_wait_slave_catchup(MXS_MONITORED_SERVER* slave, const GtidList& gtid,
int total_timeout, int read_timeout, json_t** err_out);
bool wait_cluster_stabilization(MXS_MONITORED_SERVER* new_master, const ServerVector& slaves,
int seconds_remaining);
bool switchover_check_preferred_master(MXS_MONITORED_SERVER* preferred, json_t** err_out);

View File

@ -16,6 +16,7 @@
#include <inttypes.h>
#include <sstream>
#include <maxscale/mysql_utils.h>
#include <maxscale/thread.h>
#include "utilities.hh"
SlaveStatusInfo::SlaveStatusInfo()
@ -347,3 +348,54 @@ bool MariaDBServer::check_replication_settings(print_repl_warnings_t print_warni
}
return rval;
}
/**
 * Poll the gtids of this server until the target gtid has been reached, the time runs out or
 * updating the gtids fails. At least one check is always performed, even with a non-positive
 * timeout. Between checks the thread sleeps for a slowly growing interval, which is never
 * longer than the time still remaining.
 *
 * @param target Which gtid must be reached
 * @param timeout Maximum wait time in seconds
 * @param err_out json object for error printing. Can be NULL.
 * @return True, if target gtid was reached within allotted time
 */
bool MariaDBServer::wait_until_gtid(const GtidList& target, int timeout, json_t** err_out)
{
    bool gtid_reached = false;
    bool error = false;
    /* Prefer to use gtid_binlog_pos, as that is more reliable. But if log_slave_updates is not on,
     * use gtid_current_pos. */
    const bool use_binlog_pos = rpl_settings.log_bin && rpl_settings.log_slave_updates;

    int seconds_remaining = 1; // Cheat a bit here to allow at least one iteration.
    int sleep_ms = 200;        // How long to sleep on next iteration. Incremented slowly.
    const time_t start_time = time(NULL);
    while (seconds_remaining > 0 && !gtid_reached && !error)
    {
        if (update_gtids())
        {
            const GtidList& compare_to = use_binlog_pos ? gtid_binlog_pos : gtid_current_pos;
            if (GtidList::events_ahead(target, compare_to, GtidList::MISSING_DOMAIN_IGNORE) == 0)
            {
                gtid_reached = true;
            }
            else
            {
                // Query was successful but target gtid not yet reached. Check elapsed time.
                // The remaining time is deliberately truncated to whole seconds.
                seconds_remaining = static_cast<int>(timeout - difftime(time(NULL), start_time));
                if (seconds_remaining > 0)
                {
                    // Sleep for a moment, then try again. The sleep is clamped to the time left,
                    // as the growing interval could otherwise overrun the timeout by up to one
                    // full sleep. The wider type avoids overflow with very large timeouts.
                    const int64_t remaining_ms = static_cast<int64_t>(seconds_remaining) * 1000;
                    const int this_sleep_ms = (sleep_ms < remaining_ms) ?
                        sleep_ms : static_cast<int>(remaining_ms);
                    thread_millisleep(this_sleep_ms);
                    sleep_ms += 100; // Sleep a bit more next iteration.
                }
            }
        }
        else
        {
            error = true;
        }
    }

    if (error)
    {
        PRINT_MXS_JSON_ERROR(err_out, "Failed to update gtid on server '%s' while waiting for catchup.",
                             server_base->server->unique_name);
    }
    else if (!gtid_reached)
    {
        PRINT_MXS_JSON_ERROR(err_out, "Slave catchup timed out on slave '%s'.",
                             server_base->server->unique_name);
    }
    return gtid_reached;
}

View File

@ -150,4 +150,16 @@ public:
* @return True if log_bin is on
*/
bool check_replication_settings(print_repl_warnings_t print_warnings = WARNINGS_ON);
/**
* Wait until server catches up to the target gtid. Only considers gtid domains common to this server
* and the target gtid. The gtid compared is the gtid_binlog_pos if this server has both log_bin and
* log_slave_updates on, and gtid_current_pos otherwise.
*
* The wait is implemented by polling: the gtids of the server are updated repeatedly, and the
* calling thread sleeps between the attempts. The gtids are checked at least once regardless of
* the timeout value. If updating the gtids fails or the target is not reached in time, an error
* message is printed and false is returned.
*
* @param target Which gtid must be reached
* @param timeout Maximum wait time in seconds
* @param err_out json object for error printing. Can be NULL.
* @return True, if target gtid was reached within allotted time
*/
bool wait_until_gtid(const GtidList& target, int timeout, json_t** err_out);
};

View File

@ -16,6 +16,7 @@
#include <maxscale/cppdefs.hh>
#include <string>
#include <vector>
#include <maxscale/json_api.h>
#include <maxscale/monitor.h>
/** Utility macro for printing both MXS_ERROR and json error */