MXS-1703 Move more methods to MariaDBServer

These methods only modify or update a single server.
This commit is contained in:
Esa Korhonen 2018-04-16 17:41:37 +03:00
parent 50bc43e4bf
commit 02c57c98e4
5 changed files with 216 additions and 223 deletions

View File

@ -712,18 +712,6 @@ void MariaDBMonitor::monitor_mysql_db(MariaDBServer* serv_info)
}
}
/**
 * Refresh the replication settings, gtid:s and slave status of a server.
 *
 * The steps run in order and the first failing step ends the update; later steps are skipped.
 *
 * @param server Slave to update
 * @return True on success. False on error, or if server is not a slave (slave SQL not running).
 */
bool MariaDBMonitor::update_slave_info(MariaDBServer* server)
{
    // A server whose slave SQL thread is not running is not handled as a slave.
    if (!server->slave_status.slave_sql_running)
    {
        return false;
    }
    if (!server->update_replication_settings())
    {
        return false;
    }
    if (!server->update_gtids())
    {
        return false;
    }
    return server->do_show_slave_status();
}
/**
* Check if the maxscale_schema.replication_heartbeat table is replicated on all
* servers and log a warning if problems were found.

View File

@ -131,11 +131,11 @@ bool MariaDBMonitor::manual_rejoin(SERVER* rejoin_server, json_t** output)
{
MariaDBServer* slave_cand = get_server_info(mon_slave_cand);
if (server_is_rejoin_suspect(slave_cand, m_master, output))
if (server_is_rejoin_suspect(slave_cand, output))
{
if (m_master->update_gtids())
{
if (can_replicate_from(slave_cand, m_master))
if (slave_cand->can_replicate_from(m_master))
{
ServerArray joinable_server;
joinable_server.push_back(slave_cand);
@ -227,7 +227,7 @@ int MariaDBMonitor::redirect_slaves(MariaDBServer* new_master, const ServerArray
int successes = 0;
for (auto iter = slaves.begin(); iter != slaves.end(); iter++)
{
if (redirect_one_slave(*iter, change_cmd))
if ((*iter)->redirect_one_slave(change_cmd))
{
successes++;
redirected_slaves->push_back(*iter);
@ -293,44 +293,6 @@ bool MariaDBMonitor::switchover_start_slave(MariaDBServer* old_master, MariaDBSe
return rval;
}
/**
 * Point one slave server at another master.
 *
 * Sends "STOP SLAVE", "RESET SLAVE", the given change master command and "START SLAVE" in that order,
 * stopping at the first command that fails.
 *
 * @param slave Server to redirect
 * @param change_cmd Change master command, usually generated by generate_change_master_cmd()
 * @return True if slave accepted all commands
 */
bool MariaDBMonitor::redirect_one_slave(MariaDBServer* slave, const string& change_cmd)
{
    MYSQL* conn = slave->server_base->con;

    // Printable form of the most recently attempted command, used in the failure message.
    const char* last_cmd = "STOP SLAVE;";
    bool accepted = (mxs_mysql_query(conn, last_cmd) == 0);
    if (accepted)
    {
        last_cmd = "RESET SLAVE;"; // To erase any old I/O or SQL errors
        accepted = (mxs_mysql_query(conn, last_cmd) == 0);
    }
    if (accepted)
    {
        last_cmd = "CHANGE MASTER TO ..."; // Don't show the real query as it contains a password.
        accepted = (mxs_mysql_query(conn, change_cmd.c_str()) == 0);
    }
    if (accepted)
    {
        last_cmd = "START SLAVE;";
        accepted = (mxs_mysql_query(conn, last_cmd) == 0);
    }

    if (accepted)
    {
        MXS_NOTICE("Slave '%s' redirected to new master.", slave->name());
    }
    else
    {
        MXS_WARNING("Slave '%s' redirection failed: '%s'. Query: '%s'.", slave->name(),
                    mysql_error(conn), last_cmd);
    }
    return accepted;
}
/**
* (Re)join given servers to the cluster. The servers in the array are assumed to be joinable.
* Usually the list is created by get_joinable_servers().
@ -355,13 +317,13 @@ uint32_t MariaDBMonitor::do_rejoin(const ServerArray& joinable_servers)
if (joinable->n_slaves_configured == 0)
{
MXS_NOTICE("Directing standalone server '%s' to replicate from '%s'.", name, master_name);
op_success = join_cluster(joinable, change_cmd);
op_success = joinable->join_cluster(change_cmd);
}
else
{
MXS_NOTICE("Server '%s' is replicating from a server other than '%s', "
"redirecting it to '%s'.", name, master_name, master_name);
op_success = redirect_one_slave(joinable, change_cmd);
op_success = joinable->redirect_one_slave(change_cmd);
}
if (op_success)
@ -401,7 +363,7 @@ bool MariaDBMonitor::get_joinable_servers(ServerArray* output)
ServerArray suspects;
for (auto iter = m_servers.begin(); iter != m_servers.end(); iter++)
{
if (server_is_rejoin_suspect(*iter, m_master, NULL))
if (server_is_rejoin_suspect(*iter, NULL))
{
suspects.push_back(*iter);
}
@ -415,7 +377,7 @@ bool MariaDBMonitor::get_joinable_servers(ServerArray* output)
{
for (size_t i = 0; i < suspects.size(); i++)
{
if (can_replicate_from(suspects[i], m_master))
if (suspects[i]->can_replicate_from(m_master))
{
output->push_back(suspects[i]);
}
@ -429,65 +391,15 @@ bool MariaDBMonitor::get_joinable_servers(ServerArray* output)
return comm_ok;
}
/**
 * Joins a standalone server to the cluster.
 *
 * The server is first set read-only, then given the change master command and finally told to
 * start replicating. If a step after enabling read_only fails, read_only is switched back off.
 *
 * @param server Server to join
 * @param change_cmd Change master command
 * @return True if commands were accepted by server
 */
bool MariaDBMonitor::join_cluster(MariaDBServer* server, const string& change_cmd)
{
    /* Server does not have slave connections. This operation can fail, or the resulting
     * replication may end up broken. */
    MYSQL* conn = server->server_base->con;
    bool joined = false;
    string failure_text;

    // Printable form of the most recently attempted command, used in the failure message.
    const char* last_cmd = "SET GLOBAL read_only=1;";
    if (mxs_mysql_query(conn, last_cmd) == 0)
    {
        last_cmd = "CHANGE MASTER TO ..."; // Don't show the real query as it contains a password.
        if (mxs_mysql_query(conn, change_cmd.c_str()) == 0)
        {
            last_cmd = "START SLAVE;";
            joined = (mxs_mysql_query(conn, last_cmd) == 0);
        }

        if (joined)
        {
            MXS_NOTICE("Standalone server '%s' starting replication.", server->name());
        }
        else
        {
            // A step after "SET GLOBAL read_only=1" failed, try to undo. The error message is
            // saved first, since it is read from the same connection the undo query uses.
            failure_text = mysql_error(conn);
            mxs_mysql_query(conn, "SET GLOBAL read_only=0;");
        }
    }

    if (!joined)
    {
        if (failure_text.empty())
        {
            failure_text = mysql_error(conn);
        }
        MXS_WARNING("Standalone server '%s' failed to start replication: '%s'. Query: '%s'.",
                    server->name(), failure_text.c_str(), last_cmd);
    }
    return joined;
}
/**
* Checks if a server is a possible rejoin candidate. A true result from this function is not yet sufficient
* criteria and another call to can_replicate_from() should be made.
*
* @param rejoin_cand Server to check
* @param master Master server info
* @param output Error output. If NULL, no error is printed to log.
* @return True, if server is a rejoin suspect.
*/
bool MariaDBMonitor::server_is_rejoin_suspect(MariaDBServer* rejoin_cand, MariaDBServer* master,
json_t** output)
bool MariaDBMonitor::server_is_rejoin_suspect(MariaDBServer* rejoin_cand, json_t** output)
{
bool is_suspect = false;
if (rejoin_cand->is_running() && !rejoin_cand->is_master())
@ -502,8 +414,7 @@ bool MariaDBMonitor::server_is_rejoin_suspect(MariaDBServer* rejoin_cand, MariaD
else if (rejoin_cand->n_slaves_configured == 1)
{
// which is connected to master but it's the wrong one
if (slave_status->slave_io_running &&
slave_status->master_server_id != master->server_id)
if (slave_status->slave_io_running && slave_status->master_server_id != m_master->server_id)
{
is_suspect = true;
}
@ -621,7 +532,7 @@ bool MariaDBMonitor::do_switchover(MariaDBServer** current_master, MariaDBServer
for (auto iter = m_servers.begin(); iter != m_servers.end(); iter++)
{
MariaDBServer* server = *iter;
if (server != promotion_target && update_slave_info(server) && server != demotion_target)
if (server != promotion_target && server->update_slave_info() && server != demotion_target)
{
redirectable_slaves.push_back(server);
}
@ -753,7 +664,7 @@ bool MariaDBMonitor::do_failover(json_t** err_out)
bool rval = false;
// Step 2: Wait until relay log consumed.
if (failover_wait_relay_log(new_master, seconds_remaining, err_out))
if (new_master->failover_wait_relay_log(seconds_remaining, err_out))
{
time_t step2_time = time(NULL);
int seconds_step2 = difftime(step2_time, step1_time);
@ -807,64 +718,6 @@ bool MariaDBMonitor::do_failover(json_t** err_out)
return rval;
}
/**
 * Waits until the new master has processed all its relay log, or time is up.
 *
 * The wait also ends early if a status query fails or if the Gtid_IO_Pos of the server changes
 * between two polls.
 *
 * @param new_master The new master
 * @param seconds_remaining How much time left
 * @param err_out Json error output
 * @return True if relay log was processed within time limit, or false if time ran out or an error occurred.
 */
bool MariaDBMonitor::failover_wait_relay_log(MariaDBServer* new_master, int seconds_remaining,
                                             json_t** err_out)
{
    const time_t wait_start = time(NULL);
    bool queries_ok = true;
    bool io_pos_unchanged = true;

    while (new_master->relay_log_events() > 0 &&
           queries_ok &&
           io_pos_unchanged &&
           difftime(time(NULL), wait_start) < seconds_remaining)
    {
        MXS_INFO("Relay log of server '%s' not yet empty, waiting to clear %" PRId64 " events.",
                 new_master->name(), new_master->relay_log_events());
        thread_millisleep(1000); // Sleep for a while before querying server again.
        // Todo: check server version before entering failover.
        GtidList io_pos_before = new_master->slave_status.gtid_io_pos;
        // Update gtid:s first to make sure Gtid_IO_Pos is the more recent value.
        // It doesn't matter here, but is a general rule.
        queries_ok = new_master->update_gtids() && new_master->do_show_slave_status();
        io_pos_unchanged = (io_pos_before == new_master->slave_status.gtid_io_pos);
    }

    if (new_master->relay_log_events() == 0)
    {
        return true;
    }

    // The relay log was not cleared. Pick the most specific explanation available.
    string reason;
    if (!queries_ok)
    {
        reason = "Query error";
    }
    else if (!io_pos_unchanged)
    {
        reason = "Old master sent new event(s)";
    }
    else if (new_master->relay_log_events() < 0) // TODO: This is currently impossible
    {
        reason = "Invalid Gtid(s) (current_pos: " + new_master->gtid_current_pos.to_string() +
                 ", io_pos: " + new_master->slave_status.gtid_io_pos.to_string() + ")";
    }
    else
    {
        reason = "Timeout";
    }
    PRINT_MXS_JSON_ERROR(err_out, "Failover: %s while waiting for server '%s' to process relay log. "
                         "Cancelling failover.", reason.c_str(), new_master->name());
    return false;
}
/**
* Demotes the current master server, preparing it for replicating from another server. This step can take a
* while if long writes are running on the server.
@ -1107,7 +960,7 @@ bool MariaDBMonitor::switchover_check_preferred_master(MariaDBServer* preferred,
{
ss_dassert(preferred);
bool rval = true;
if (!update_slave_info(preferred) || !preferred->check_replication_settings())
if (!preferred->update_slave_info() || !preferred->check_replication_settings())
{
PRINT_MXS_JSON_ERROR(err_out, "The requested server '%s' is not a valid promotion candidate.",
preferred->name());
@ -1181,7 +1034,7 @@ MariaDBServer* MariaDBMonitor::select_new_master(ServerArray* slaves_out, json_t
* If master is replicating from external master, it is updated by update_slave_info()
* but not added to array. */
MariaDBServer* cand = *iter;
if (update_slave_info(cand) && cand != m_master)
if (cand->update_slave_info() && cand != m_master)
{
slaves_out->push_back(cand);
// Check that server is not in the exclusion list while still being a valid choice.
@ -1405,7 +1258,7 @@ bool MariaDBMonitor::failover_check(json_t** error_out)
}
else if (server->is_slave())
{
if (uses_gtid(server, error_out))
if (server->uses_gtid(error_out))
{
slaves++;
}
@ -1427,24 +1280,6 @@ bool MariaDBMonitor::failover_check(json_t** error_out)
return !error && slaves > 0;
}
/**
 * Checks if slave candidate can replicate from master. Only considers gtid:s and only detects obvious errors.
 * The non-detected errors will mostly be detected once the slave tries to start replicating.
 *
 * The gtid:s of the candidate are refreshed before comparing; the master's gtid_binlog_pos is used as is.
 *
 * @param slave_cand Slave candidate server
 * @param master Master server
 * @return True if slave can replicate from master. False also if the gtid update of the candidate failed.
 */
bool MariaDBMonitor::can_replicate_from(MariaDBServer* slave_cand, MariaDBServer* master)
{
    if (!slave_cand->update_gtids())
    {
        return false;
    }
    return slave_cand->gtid_current_pos.can_replicate_from(master->gtid_binlog_pos);
}
/**
* @brief Process possible failover event
*
@ -1540,30 +1375,6 @@ bool MariaDBMonitor::handle_auto_failover()
return cluster_modified;
}
/**
 * Check if server is using gtid replication.
 *
 * @param server Server to check
 * @param error_out Error output
 * @return True if using gtid-replication. False if not, or if server is not a slave or otherwise does
 * not have a gtid_IO_Pos.
 */
bool MariaDBMonitor::uses_gtid(MariaDBServer* server, json_t** error_out)
{
    // A non-empty Gtid_IO_Pos is taken as the sign of gtid replication.
    if (!server->slave_status.gtid_io_pos.empty())
    {
        return true;
    }

    string not_gtid_msg = string("Slave server ") + server->name() +
        " is not using gtid replication.";
    PRINT_MXS_JSON_ERROR(error_out, "%s", not_gtid_msg.c_str());
    return false;
}
bool MariaDBMonitor::failover_not_possible()
{
bool rval = false;
@ -1703,7 +1514,7 @@ bool MariaDBMonitor::switchover_check(SERVER* new_master, SERVER* current_master
bool gtid_ok = true;
for (auto iter = m_servers.begin(); iter != m_servers.end(); iter++)
{
if ((*iter)->is_slave() && !uses_gtid(*iter, error_out))
if ((*iter)->is_slave() && !(*iter)->uses_gtid(error_out))
{
gtid_ok = false;
}

View File

@ -204,28 +204,22 @@ private:
bool slave_receiving_events();
bool failover_check(json_t** error_out);
bool do_failover(json_t** err_out);
bool failover_wait_relay_log(MariaDBServer* new_master, int seconds_remaining, json_t** err_out);
// Rejoin methods
bool cluster_can_be_joined();
void handle_auto_rejoin();
bool get_joinable_servers(ServerArray* output);
bool server_is_rejoin_suspect(MariaDBServer* rejoin_cand, MariaDBServer* master, json_t** output);
bool can_replicate_from(MariaDBServer* slave_cand, MariaDBServer* master);
bool server_is_rejoin_suspect(MariaDBServer* rejoin_cand, json_t** output);
uint32_t do_rejoin(const ServerArray& joinable_servers);
bool join_cluster(MariaDBServer* server, const std::string& change_cmd);
// Methods common to failover/switchover/rejoin
bool uses_gtid(MariaDBServer* mon_server, json_t** error_out);
MariaDBServer* select_new_master(ServerArray* slaves_out, json_t** err_out);
bool update_slave_info(MariaDBServer* server);
bool server_is_excluded(const MariaDBServer* server);
bool is_candidate_better(const MariaDBServer* current_best, const MariaDBServer* candidate,
uint32_t gtid_domain);
bool promote_new_master(MariaDBServer* new_master, json_t** err_out);
int redirect_slaves(MariaDBServer* new_master, const ServerArray& slaves,
ServerArray* redirected_slaves);
bool redirect_one_slave(MariaDBServer* slave, const std::string& change_cmd);
std::string generate_change_master_cmd(const std::string& master_host, int master_port);
bool start_external_replication(MariaDBServer* new_master, json_t** err_out);
bool wait_cluster_stabilization(MariaDBServer* new_master, const ServerArray& slaves,

View File

@ -489,6 +489,154 @@ json_t* MariaDBServer::diagnostics_json(bool multimaster) const
return srv;
}
/**
 * Check if this server is using gtid replication, judged by whether its Gtid_IO_Pos is non-empty.
 *
 * @param error_out Error output, passed to PRINT_MXS_JSON_ERROR when the check fails
 * @return True if the server has a non-empty gtid_io_pos
 */
bool MariaDBServer::uses_gtid(json_t** error_out)
{
    if (slave_status.gtid_io_pos.empty())
    {
        string not_gtid_msg = string("Slave server ") + name() + " is not using gtid replication.";
        PRINT_MXS_JSON_ERROR(error_out, "%s", not_gtid_msg.c_str());
        return false;
    }
    return true;
}
/**
 * Refresh the replication settings, gtid:s and slave status of this server.
 *
 * The steps run in order and the first failing step ends the update; later steps are skipped.
 *
 * @return True on success. False if slave SQL is not running or if any update step failed.
 */
bool MariaDBServer::update_slave_info()
{
    // Nothing is refreshed unless the slave SQL thread is running.
    if (!slave_status.slave_sql_running)
    {
        return false;
    }
    if (!update_replication_settings())
    {
        return false;
    }
    if (!update_gtids())
    {
        return false;
    }
    return do_show_slave_status();
}
/**
 * Check if this server could replicate from the given master, looking only at gtid:s.
 *
 * The gtid:s of this server are refreshed first; the master's gtid_binlog_pos is used as is.
 *
 * @param master Master server
 * @return True if the gtid update succeeded and gtid_current_pos can replicate from the
 * master's gtid_binlog_pos
 */
bool MariaDBServer::can_replicate_from(MariaDBServer* master)
{
    if (!update_gtids())
    {
        return false;
    }
    return gtid_current_pos.can_replicate_from(master->gtid_binlog_pos);
}
bool MariaDBServer::redirect_one_slave(const string& change_cmd)
{
bool success = false;
MYSQL* slave_conn = server_base->con;
const char* query = "STOP SLAVE;";
if (mxs_mysql_query(slave_conn, query) == 0)
{
query = "RESET SLAVE;"; // To erase any old I/O or SQL errors
if (mxs_mysql_query(slave_conn, query) == 0)
{
query = "CHANGE MASTER TO ..."; // Don't show the real query as it contains a password.
if (mxs_mysql_query(slave_conn, change_cmd.c_str()) == 0)
{
query = "START SLAVE;";
if (mxs_mysql_query(slave_conn, query) == 0)
{
success = true;
MXS_NOTICE("Slave '%s' redirected to new master.", name());
}
}
}
}
if (!success)
{
MXS_WARNING("Slave '%s' redirection failed: '%s'. Query: '%s'.", name(),
mysql_error(slave_conn), query);
}
return success;
}
bool MariaDBServer::join_cluster(const string& change_cmd)
{
/* Server does not have slave connections. This operation can fail, or the resulting
* replication may end up broken. */
bool success = false;
string error_msg;
MYSQL* server_conn = server_base->con;
const char* query = "SET GLOBAL read_only=1;";
if (mxs_mysql_query(server_conn, query) == 0)
{
query = "CHANGE MASTER TO ..."; // Don't show the real query as it contains a password.
if (mxs_mysql_query(server_conn, change_cmd.c_str()) == 0)
{
query = "START SLAVE;";
if (mxs_mysql_query(server_conn, query) == 0)
{
success = true;
MXS_NOTICE("Standalone server '%s' starting replication.", name());
}
}
if (!success)
{
// A step after "SET GLOBAL read_only=1" failed, try to undo. First, backup error message.
error_msg = mysql_error(server_conn);
mxs_mysql_query(server_conn, "SET GLOBAL read_only=0;");
}
}
if (!success)
{
if (error_msg.empty())
{
error_msg = mysql_error(server_conn);
}
MXS_WARNING("Standalone server '%s' failed to start replication: '%s'. Query: '%s'.",
name(), error_msg.c_str(), query);
}
return success;
}
bool MariaDBServer::failover_wait_relay_log(int seconds_remaining, json_t** err_out)
{
time_t begin = time(NULL);
bool query_ok = true;
bool io_pos_stable = true;
while (relay_log_events() > 0 &&
query_ok &&
io_pos_stable &&
difftime(time(NULL), begin) < seconds_remaining)
{
MXS_INFO("Relay log of server '%s' not yet empty, waiting to clear %" PRId64 " events.",
name(), relay_log_events());
thread_millisleep(1000); // Sleep for a while before querying server again.
// Todo: check server version before entering failover.
GtidList old_gtid_io_pos = slave_status.gtid_io_pos;
// Update gtid:s first to make sure Gtid_IO_Pos is the more recent value.
// It doesn't matter here, but is a general rule.
query_ok = update_gtids() && do_show_slave_status();
io_pos_stable = (old_gtid_io_pos == slave_status.gtid_io_pos);
}
bool rval = false;
if (relay_log_events() == 0)
{
rval = true;
}
else
{
string reason = "Timeout";
if (!query_ok)
{
reason = "Query error";
}
else if (!io_pos_stable)
{
reason = "Old master sent new event(s)";
}
else if (relay_log_events() < 0) // TODO: This is currently impossible
{
reason = "Invalid Gtid(s) (current_pos: " + gtid_current_pos.to_string() +
", io_pos: " + slave_status.gtid_io_pos.to_string() + ")";
}
PRINT_MXS_JSON_ERROR(err_out, "Failover: %s while waiting for server '%s' to process relay log. "
"Cancelling failover.", reason.c_str(), name());
rval = false;
}
return rval;
}
QueryResult::QueryResult(MYSQL_RES* resultset)
: m_resultset(resultset)
, m_columns(-1)

View File

@ -212,6 +212,58 @@ public:
* @return Diagnostics string
*/
std::string diagnostics(bool multimaster) const;
/**
 * Check if server is using gtid replication. The check looks at whether the Gtid_IO_Pos
 * of the server's slave status is empty.
 *
 * @param error_out Error output. An error message is printed here if the server is not using gtid.
 * @return True if using gtid-replication. False if not, or if server is not a slave or otherwise does
 * not have a gtid_IO_Pos.
 */
bool uses_gtid(json_t** error_out);
/**
 * Update replication settings, gtid:s and slave status of this server.
 *
 * @return True on success. False on error, or if server is not a slave (slave SQL not running).
 */
bool update_slave_info();
/**
 * Checks if this server can replicate from master. Only considers gtid:s and only detects obvious errors.
 * The non-detected errors will mostly be detected once the slave tries to start replicating.
 * The gtid:s of this server are updated as part of the check.
 *
 * @param master Master server
 * @return True if slave can replicate from master
 */
bool can_replicate_from(MariaDBServer* master);
/**
 * Redirect this slave server to another master. Replication is stopped and reset, the change
 * master command is run and replication is started again.
 *
 * @param change_cmd Change master command, usually generated by generate_change_master_cmd()
 * @return True if slave accepted all commands
 */
bool redirect_one_slave(const std::string& change_cmd);
/**
 * Joins this standalone server to the cluster. The server is set read-only before the change
 * master command is run; if a later step fails, read_only is turned back off.
 *
 * @param change_cmd Change master command
 * @return True if commands were accepted by server
 */
bool join_cluster(const std::string& change_cmd);
/**
 * Waits until this server has processed all its relay log, or time is up. The server is polled
 * roughly once per second. Waiting also stops if a status query fails or if the Gtid_IO_Pos
 * changes between polls.
 *
 * @param seconds_remaining How much time left
 * @param err_out Json error output
 * @return True if relay log was processed within time limit, or false if time ran out
 * or an error occurred.
 */
bool failover_wait_relay_log(int seconds_remaining, json_t** err_out);
};
/**