MXS-1513: Work around the backend_read_timeout-setting

The setting limits the maximum time a MASTER_GTID_WAIT-function
can wait. To work around this limitation, the function is now called
in a loop such that the total timeout is approximately equal to
the requested timeout.
This commit is contained in:
Esa Korhonen
2017-11-17 11:12:52 +02:00
parent 59616b5f3e
commit 8077d97e25

View File

@ -3346,49 +3346,72 @@ static bool switchover_demote_master(MXS_MONITORED_SERVER* current_master,
return rval; return rval;
} }
static string generate_master_gtid_wait_cmd(const Gtid& gtid, double timeout)
{
std::stringstream query_ss;
query_ss << "SELECT MASTER_GTID_WAIT(\"" << gtid.to_string() << "\", " << timeout << ");";
return query_ss.str();
}
/** /**
* Wait until slave replication catches up with the master gtid * Wait until slave replication catches up with the master gtid
* *
* @param slave Slave to wait on * @param slave Slave to wait on
* @param master_binlog_pos Which gtid must be reached * @param gtid Which gtid must be reached
* @param timeout Maximum wait time in seconds * @param total_timeout Maximum wait time in seconds
* @param read_timeout The value of read_timeout for the connection
* @param err_out json object for error printing. Can be NULL. * @param err_out json object for error printing. Can be NULL.
* @return True, if target gtid was reached within allotted time * @return True, if target gtid was reached within allotted time
*/ */
static bool switchover_wait_slave_catchup(MXS_MONITORED_SERVER* slave, static bool switchover_wait_slave_catchup(MXS_MONITORED_SERVER* slave, const Gtid& gtid,
const Gtid& master_binlog_pos, int timeout, int total_timeout, int read_timeout,
json_t** err_out) json_t** err_out)
{ {
/* ss_dassert(read_timeout > 0);
* TODO: The MASTER_GTID_WAIT()-call is currently buggy, as the connection read timeout prematurely
* stops the wait in an error. Possibilities: wait in a loop for small intervals (how small?) or modify
* connection settings (can they be modified on a living connection? Seems so) to remove the timeout for
* the call, then set it back.
*/
std::stringstream query_ss;
query_ss << "SELECT MASTER_GTID_WAIT(\"" << master_binlog_pos.to_string() << "\", " << timeout << ");";
string query = query_ss.str();
bool rval = false;
StringVector output; StringVector output;
if (query_one_row(slave, query.c_str(), 1, &output)) bool gtid_reached = false;
bool error = false;
double seconds_remaining = total_timeout > 0 ? total_timeout : 0.01;
// Determine a reasonable timeout for the MASTER_GTID_WAIT-function depending on the
// backend_read_timeout setting (should be >= 1) and time remaining.
double loop_timeout = double(read_timeout) - 0.5;
string cmd = generate_master_gtid_wait_cmd(gtid, loop_timeout);
while (seconds_remaining > 0 && !gtid_reached && !error)
{ {
long int result = strtol(output[0].c_str(), NULL, 0); if (loop_timeout > seconds_remaining)
if (result == 0)
{ {
rval = true; // For the last iteration, change the wait timeout.
cmd = generate_master_gtid_wait_cmd(gtid, seconds_remaining);
}
seconds_remaining -= loop_timeout;
if (query_one_row(slave, cmd.c_str(), 1, &output))
{
if (output[0] == "0")
{
gtid_reached = true;
}
output.clear();
} }
else else
{ {
PRINT_MXS_JSON_ERROR(err_out, "MASTER_GTID_WAIT() timed out on slave '%s'.", error = true;
slave->server->unique_name);
} }
} }
else
if (error)
{ {
PRINT_MXS_JSON_ERROR(err_out, "MASTER_GTID_WAIT() query error on slave '%s'.", PRINT_MXS_JSON_ERROR(err_out, "MASTER_GTID_WAIT() query error on slave '%s'.",
slave->server->unique_name); slave->server->unique_name);
} }
return rval; else if (!gtid_reached)
{
PRINT_MXS_JSON_ERROR(err_out, "MASTER_GTID_WAIT() timed out on slave '%s'.",
slave->server->unique_name);
}
return gtid_reached;
} }
/** /**
@ -3480,7 +3503,7 @@ static bool do_switchover(MYSQL_MONITOR* mon, json_t** err_out)
if (switchover_demote_master(demotion_target, curr_master_info, repl_domain, err_out) && if (switchover_demote_master(demotion_target, curr_master_info, repl_domain, err_out) &&
// Step 3: Wait for the selected slave to catch up with master. // Step 3: Wait for the selected slave to catch up with master.
switchover_wait_slave_catchup(promotion_target, curr_master_info->gtid_binlog_pos, switchover_wait_slave_catchup(promotion_target, curr_master_info->gtid_binlog_pos,
mon->switchover_timeout, err_out) && mon->switchover_timeout, mon->monitor->read_timeout, err_out) &&
// Step 4: Stop and reset slave, set read-only to 0. // Step 4: Stop and reset slave, set read-only to 0.
failover_promote_new_master(mon, promotion_target, err_out)) failover_promote_new_master(mon, promotion_target, err_out))
{ {