MXS-1744 Use gtid querying instead of MASTER_GTID_WAIT when waiting for catchup
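MASTER_GTID_WAIT uses gtid_slave_pos when comparing to the target gtid, which creates problems with multi-domain gtids. It is simpler to just query the server for its gtids repeatedly and compare them to the target (gtid_binlog_pos if the server has both log_bin and log_slave_updates on, gtid_current_pos otherwise). Also, the waiting method is now MariaDBServer::wait_until_gtid(), replacing MariaDBMonitor::switchover_wait_slave_catchup().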
This commit is contained in:
@ -979,7 +979,8 @@ bool MariaDBMonitor::switchover_wait_slaves_catchup(const ServerVector& slaves,
|
|||||||
{
|
{
|
||||||
time_t begin = time(NULL);
|
time_t begin = time(NULL);
|
||||||
MXS_MONITORED_SERVER* slave = *iter;
|
MXS_MONITORED_SERVER* slave = *iter;
|
||||||
if (switchover_wait_slave_catchup(slave, gtid, seconds_remaining, read_timeout, err_out))
|
auto slave_server = get_server_info(slave);
|
||||||
|
if (slave_server->wait_until_gtid(gtid, seconds_remaining, err_out))
|
||||||
{
|
{
|
||||||
seconds_remaining -= difftime(time(NULL), begin);
|
seconds_remaining -= difftime(time(NULL), begin);
|
||||||
}
|
}
|
||||||
@ -992,67 +993,6 @@ bool MariaDBMonitor::switchover_wait_slaves_catchup(const ServerVector& slaves,
|
|||||||
return success;
|
return success;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Wait until slave replication catches up with the master gtid
|
|
||||||
*
|
|
||||||
* @param slave Slave to wait on
|
|
||||||
* @param gtid Which gtid must be reached
|
|
||||||
* @param total_timeout Maximum wait time in seconds TODO: timeouts
|
|
||||||
* @param read_timeout The value of read_timeout for the connection
|
|
||||||
* @param err_out json object for error printing. Can be NULL.
|
|
||||||
* @return True, if target gtid was reached within allotted time
|
|
||||||
*/
|
|
||||||
bool MariaDBMonitor::switchover_wait_slave_catchup(MXS_MONITORED_SERVER* slave, const GtidList& gtid,
|
|
||||||
int total_timeout, int read_timeout,
|
|
||||||
json_t** err_out)
|
|
||||||
{
|
|
||||||
ss_dassert(read_timeout > 0);
|
|
||||||
StringVector output;
|
|
||||||
bool gtid_reached = false;
|
|
||||||
bool error = false;
|
|
||||||
double seconds_remaining = total_timeout;
|
|
||||||
|
|
||||||
// Determine a reasonable timeout for the MASTER_GTID_WAIT-function depending on the
|
|
||||||
// backend_read_timeout setting (should be >= 1) and time remaining.
|
|
||||||
double loop_timeout = double(read_timeout) - 0.5;
|
|
||||||
string cmd = gtid.generate_master_gtid_wait_cmd(loop_timeout);
|
|
||||||
|
|
||||||
while (seconds_remaining > 0 && !gtid_reached && !error)
|
|
||||||
{
|
|
||||||
if (loop_timeout > seconds_remaining)
|
|
||||||
{
|
|
||||||
// For the last iteration, change the wait timeout.
|
|
||||||
cmd = gtid.generate_master_gtid_wait_cmd(seconds_remaining);
|
|
||||||
}
|
|
||||||
seconds_remaining -= loop_timeout;
|
|
||||||
|
|
||||||
if (query_one_row(slave, cmd.c_str(), 1, &output))
|
|
||||||
{
|
|
||||||
if (output[0] == "0")
|
|
||||||
{
|
|
||||||
gtid_reached = true;
|
|
||||||
}
|
|
||||||
output.clear();
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
error = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (error)
|
|
||||||
{
|
|
||||||
PRINT_MXS_JSON_ERROR(err_out, "MASTER_GTID_WAIT() query error on slave '%s'.",
|
|
||||||
slave->server->unique_name);
|
|
||||||
}
|
|
||||||
else if (!gtid_reached)
|
|
||||||
{
|
|
||||||
PRINT_MXS_JSON_ERROR(err_out, "MASTER_GTID_WAIT() timed out on slave '%s'.",
|
|
||||||
slave->server->unique_name);
|
|
||||||
}
|
|
||||||
return gtid_reached;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Send an event to new master and wait for slaves to get the event.
|
* Send an event to new master and wait for slaves to get the event.
|
||||||
*
|
*
|
||||||
|
|||||||
@ -217,11 +217,6 @@ string Gtid::to_string() const
|
|||||||
return rval;
|
return rval;
|
||||||
}
|
}
|
||||||
|
|
||||||
string GtidList::generate_master_gtid_wait_cmd(double timeout) const
|
|
||||||
{
|
|
||||||
return "SELECT MASTER_GTID_WAIT(\"" + to_string() + "\", " + std::to_string(timeout) + ");";
|
|
||||||
}
|
|
||||||
|
|
||||||
Gtid GtidList::get_gtid(uint32_t domain) const
|
Gtid GtidList::get_gtid(uint32_t domain) const
|
||||||
{
|
{
|
||||||
Gtid rval;
|
Gtid rval;
|
||||||
|
|||||||
@ -147,13 +147,11 @@ public:
|
|||||||
substraction_mode_t domain_substraction_mode);
|
substraction_mode_t domain_substraction_mode);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Generate a MASTER_GTID_WAIT()-query to this gtid.
|
* Return an individual gtid with the given domain.
|
||||||
*
|
*
|
||||||
* @param timeout Maximum wait time in seconds
|
* @param domain Which domain to search for
|
||||||
* @return The query
|
* @return The gtid within the list. If domain is not found, an invalid gtid is returned.
|
||||||
*/
|
*/
|
||||||
std::string generate_master_gtid_wait_cmd(double timeout) const;
|
|
||||||
|
|
||||||
Gtid get_gtid(uint32_t domain) const;
|
Gtid get_gtid(uint32_t domain) const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|||||||
@ -199,8 +199,6 @@ private:
|
|||||||
json_t** err_out);
|
json_t** err_out);
|
||||||
bool switchover_wait_slaves_catchup(const ServerVector& slaves, const GtidList& gtid, int total_timeout,
|
bool switchover_wait_slaves_catchup(const ServerVector& slaves, const GtidList& gtid, int total_timeout,
|
||||||
int read_timeout, json_t** err_out);
|
int read_timeout, json_t** err_out);
|
||||||
bool switchover_wait_slave_catchup(MXS_MONITORED_SERVER* slave, const GtidList& gtid,
|
|
||||||
int total_timeout, int read_timeout, json_t** err_out);
|
|
||||||
bool wait_cluster_stabilization(MXS_MONITORED_SERVER* new_master, const ServerVector& slaves,
|
bool wait_cluster_stabilization(MXS_MONITORED_SERVER* new_master, const ServerVector& slaves,
|
||||||
int seconds_remaining);
|
int seconds_remaining);
|
||||||
bool switchover_check_preferred_master(MXS_MONITORED_SERVER* preferred, json_t** err_out);
|
bool switchover_check_preferred_master(MXS_MONITORED_SERVER* preferred, json_t** err_out);
|
||||||
|
|||||||
@ -16,6 +16,7 @@
|
|||||||
#include <inttypes.h>
|
#include <inttypes.h>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
#include <maxscale/mysql_utils.h>
|
#include <maxscale/mysql_utils.h>
|
||||||
|
#include <maxscale/thread.h>
|
||||||
#include "utilities.hh"
|
#include "utilities.hh"
|
||||||
|
|
||||||
SlaveStatusInfo::SlaveStatusInfo()
|
SlaveStatusInfo::SlaveStatusInfo()
|
||||||
@ -346,4 +347,55 @@ bool MariaDBServer::check_replication_settings(print_repl_warnings_t print_warni
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
return rval;
|
return rval;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool MariaDBServer::wait_until_gtid(const GtidList& target, int timeout, json_t** err_out)
|
||||||
|
{
|
||||||
|
bool gtid_reached = false;
|
||||||
|
bool error = false;
|
||||||
|
/* Prefer to use gtid_binlog_pos, as that is more reliable. But if log_slave_updates is not on,
|
||||||
|
* use gtid_current_pos. */
|
||||||
|
const bool use_binlog_pos = rpl_settings.log_bin && rpl_settings.log_slave_updates;
|
||||||
|
|
||||||
|
int seconds_remaining = 1; // Cheat a bit here to allow at least one iteration.
|
||||||
|
int sleep_ms = 200; // How long to sleep on next iteration. Incremented slowly.
|
||||||
|
time_t start_time = time(NULL);
|
||||||
|
while (seconds_remaining > 0 && !gtid_reached && !error)
|
||||||
|
{
|
||||||
|
if (update_gtids())
|
||||||
|
{
|
||||||
|
const GtidList& compare_to = use_binlog_pos ? gtid_binlog_pos : gtid_current_pos;
|
||||||
|
if (GtidList::events_ahead(target, compare_to, GtidList::MISSING_DOMAIN_IGNORE) == 0)
|
||||||
|
{
|
||||||
|
gtid_reached = true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Query was successful but target gtid not yet reached. Check elapsed time.
|
||||||
|
seconds_remaining = timeout - difftime(time(NULL), start_time);
|
||||||
|
if (seconds_remaining > 0)
|
||||||
|
{
|
||||||
|
// Sleep for a moment, then try again.
|
||||||
|
thread_millisleep(sleep_ms);
|
||||||
|
sleep_ms += 100; // Sleep a bit more next iteration.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
error = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (error)
|
||||||
|
{
|
||||||
|
PRINT_MXS_JSON_ERROR(err_out, "Failed to update gtid on server '%s' while waiting for catchup.",
|
||||||
|
server_base->server->unique_name);
|
||||||
|
}
|
||||||
|
else if (!gtid_reached)
|
||||||
|
{
|
||||||
|
PRINT_MXS_JSON_ERROR(err_out, "Slave catchup timed out on slave '%s'.",
|
||||||
|
server_base->server->unique_name);
|
||||||
|
}
|
||||||
|
return gtid_reached;
|
||||||
|
}
|
||||||
|
|||||||
@ -150,4 +150,16 @@ public:
|
|||||||
* @return True if log_bin is on
|
* @return True if log_bin is on
|
||||||
*/
|
*/
|
||||||
bool check_replication_settings(print_repl_warnings_t print_warnings = WARNINGS_ON);
|
bool check_replication_settings(print_repl_warnings_t print_warnings = WARNINGS_ON);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Wait until server catches up to the target gtid. Only considers gtid domains common to this server
|
||||||
|
* and the target gtid. The gtid compared is the gtid_binlog_pos if this server has both log_bin and
|
||||||
|
* log_slave_updates on, and gtid_current_pos otherwise.
|
||||||
|
*
|
||||||
|
* @param target Which gtid must be reached
|
||||||
|
* @param timeout Maximum wait time in seconds
|
||||||
|
* @param err_out json object for error printing. Can be NULL.
|
||||||
|
* @return True, if target gtid was reached within allotted time
|
||||||
|
*/
|
||||||
|
bool wait_until_gtid(const GtidList& target, int timeout, json_t** err_out);
|
||||||
};
|
};
|
||||||
|
|||||||
@ -16,6 +16,7 @@
|
|||||||
#include <maxscale/cppdefs.hh>
|
#include <maxscale/cppdefs.hh>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
#include <maxscale/json_api.h>
|
||||||
#include <maxscale/monitor.h>
|
#include <maxscale/monitor.h>
|
||||||
|
|
||||||
/** Utility macro for printing both MXS_ERROR and json error */
|
/** Utility macro for printing both MXS_ERROR and json error */
|
||||||
|
|||||||
Reference in New Issue
Block a user