MXS-1703 Clean up monitor main loop function

Several blocks have been moved to their own functions to shorten
the main function.
This commit is contained in:
Esa Korhonen
2018-03-21 14:55:22 +02:00
parent aa035e4623
commit e5dddf5f74
2 changed files with 394 additions and 335 deletions

View File

@ -446,10 +446,8 @@ bool MariaDBMonitor::set_standalone_master(MXS_MONITORED_SERVER *db)
void MariaDBMonitor::main_loop()
{
m_status = MXS_MONITOR_RUNNING;
MXS_MONITORED_SERVER *ptr;
bool replication_heartbeat;
bool detect_stale_master;
int num_servers = 0;
MXS_MONITORED_SERVER *root_master = NULL;
size_t nrounds = 0;
int log_no_master = 1;
@ -499,19 +497,136 @@ void MariaDBMonitor::main_loop()
continue;
}
nrounds += 1;
/* reset num_servers */
num_servers = 0;
lock_monitor_servers(m_monitor_base);
servers_status_pending_to_current(m_monitor_base);
/* start from the first server in the list */
ptr = m_monitor_base->monitored_servers;
while (ptr)
// Query all servers for their status.
for (auto iter = m_servers.begin(); iter != m_servers.end(); iter++)
{
ptr->mon_prev_status = ptr->server->status;
monitor_one_server(*iter);
}
// Use the information to find the so far best master server.
find_root_master(&root_master);
if (m_master != NULL && SERVER_IS_MASTER(m_master->server))
{
// Update cluster-wide values dependent on the current master.
update_gtid_domain();
update_external_master();
}
// Assign relay masters, clear SERVER_SLAVE from binlog relays
for (auto iter = m_servers.begin(); iter != m_servers.end(); iter++)
{
assign_relay_master(*iter);
/* Remove SLAVE status if this server is a Binlog Server relay */
if (iter->binlog_relay)
{
monitor_clear_pending_status(iter->server_base, SERVER_SLAVE);
}
}
/* Update server status from monitor pending status on that server*/
for (auto iter = m_servers.begin(); iter != m_servers.end(); iter++)
{
update_server_states(*iter, root_master, detect_stale_master);
}
/** Now that all servers have their status correctly set, we can check
if we need to use standalone master. */
if (m_detect_standalone_master)
{
if (standalone_master_required(m_monitor_base->monitored_servers))
{
// Other servers have died, set last remaining server as master
if (set_standalone_master(m_monitor_base->monitored_servers))
{
// Update the root_master to point to the standalone master
root_master = m_master;
}
}
else
{
m_warn_set_standalone_master = true;
}
}
if (root_master && SERVER_IS_MASTER(root_master->server))
{
// Clear slave and stale slave status bits from current master
server_clear_status_nolock(root_master->server, SERVER_SLAVE | SERVER_STALE_SLAVE);
monitor_clear_pending_status(root_master, SERVER_SLAVE | SERVER_STALE_SLAVE);
/**
* Clear external slave status from master if configured to do so.
* This allows parts of a multi-tiered replication setup to be used
* in MaxScale.
*/
if (m_ignore_external_masters)
{
monitor_clear_pending_status(root_master, SERVER_SLAVE_OF_EXTERNAL_MASTER);
server_clear_status_nolock(root_master->server, SERVER_SLAVE_OF_EXTERNAL_MASTER);
}
}
ss_dassert(root_master == NULL || m_master == root_master);
ss_dassert(!root_master ||
((root_master->server->status & (SERVER_SLAVE | SERVER_MASTER))
!= (SERVER_SLAVE | SERVER_MASTER)));
/**
* After updating the status of all servers, check if monitor events
* need to be launched.
*/
mon_process_state_changes(m_monitor_base, m_script.c_str(), m_events);
bool failover_performed = false; // Has an automatic failover been performed this loop?
if (m_auto_failover)
{
handle_auto_failover(&failover_performed);
}
/* Log a master detection failure, or the first master becoming available after a failure */
log_master_changes(root_master, &log_no_master);
/* Generate the replication heartbeat event by performing an update */
if (replication_heartbeat &&
root_master &&
(SERVER_IS_MASTER(root_master->server) ||
SERVER_IS_RELAY_SERVER(root_master->server)))
{
measure_replication_lag(root_master);
}
// Do not auto-join servers on this monitor loop if a failover (or any other cluster modification)
// has been performed, as server states have not been updated yet. It will happen next iteration.
if (!config_get_global_options()->passive && m_auto_rejoin &&
!failover_performed && cluster_can_be_joined())
{
// Check if any servers should be autojoined to the cluster and try to join them.
handle_auto_rejoin();
}
mon_hangup_failed_servers(m_monitor_base);
servers_status_current_to_pending(m_monitor_base);
store_server_journal(m_monitor_base, m_master);
release_monitor_servers(m_monitor_base);
} /*< while (1) */
}
/**
* Monitor a server. Should be moved to the server class later on.
*
* @param server The server
*/
void MariaDBMonitor::monitor_one_server(MariaDBServer& server)
{
MXS_MONITORED_SERVER* ptr = server.server_base;
ptr->mon_prev_status = ptr->server->status;
/* copy server status into monitor pending_status */
ptr->pending_status = ptr->server->status;
@ -521,8 +636,6 @@ void MariaDBMonitor::main_loop()
/* reset the slave list of current node */
memset(&ptr->server->slaves, 0, sizeof(ptr->server->slaves));
num_servers++;
if (mon_status_changed(ptr))
{
if (SRV_MASTER_STATUS(ptr->mon_prev_status))
@ -559,26 +672,32 @@ void MariaDBMonitor::main_loop()
/** Reset this server's error count */
ptr->mon_err_count = 0;
}
ptr = ptr->next;
}
ptr = m_monitor_base->monitored_servers;
/**
* Compute replication tree, assign root master.
*
* @param root_master Handle to master server
*/
void MariaDBMonitor::find_root_master(MXS_MONITORED_SERVER** root_master)
{
const int num_servers = m_servers.size();
/* If only one server is configured, that server is the Master */
if (num_servers == 1)
{
if (SERVER_IS_RUNNING(ptr->server))
auto only_server = m_servers[0];
MXS_MONITORED_SERVER* mon_server = only_server.server_base;
if (SERVER_IS_RUNNING(mon_server->server))
{
ptr->server->depth = 0;
mon_server->server->depth = 0;
/* status cleanup */
monitor_clear_pending_status(ptr, SERVER_SLAVE);
monitor_clear_pending_status(mon_server, SERVER_SLAVE);
/* master status set */
monitor_set_pending_status(ptr, SERVER_MASTER);
monitor_set_pending_status(mon_server, SERVER_MASTER);
ptr->server->depth = 0;
m_master = ptr;
root_master = ptr;
mon_server->server->depth = 0;
m_master = mon_server;
*root_master = mon_server;
}
}
else
@ -586,11 +705,11 @@ void MariaDBMonitor::main_loop()
/* Compute the replication tree */
if (m_mysql51_replication)
{
root_master = build_mysql51_replication_tree();
*root_master = build_mysql51_replication_tree();
}
else
{
root_master = get_replication_tree(num_servers);
*root_master = get_replication_tree(num_servers);
}
}
@ -601,11 +720,11 @@ void MariaDBMonitor::main_loop()
variable set to ON will be assigned the slave status. */
find_graph_cycles(this, m_monitor_base->monitored_servers, num_servers);
}
}
if (m_master != NULL && SERVER_IS_MASTER(m_master->server))
void MariaDBMonitor::update_gtid_domain()
{
MariaDBServer* master_info = get_server_info(m_master);
// Update cluster gtid domain
int64_t domain = master_info->gtid_domain_id;
if (m_master_gtid_domain >= 0 && domain != m_master_gtid_domain)
{
@ -613,8 +732,11 @@ void MariaDBMonitor::main_loop()
m_master_gtid_domain, domain);
}
m_master_gtid_domain = domain;
}
// Update cluster external master
void MariaDBMonitor::update_external_master()
{
MariaDBServer* master_info = get_server_info(m_master);
if (SERVER_IS_SLAVE_OF_EXTERNAL_MASTER(m_master->server))
{
if (master_info->slave_status.master_host != m_external_master_host ||
@ -648,36 +770,29 @@ void MariaDBMonitor::main_loop()
}
}
ptr = m_monitor_base->monitored_servers;
while (ptr)
/**
* TODO: Move to MariaDBServer.
*
* @param serv_info The server to check; if it is both a slave and a master, it is marked as a relay master
*/
void MariaDBMonitor::assign_relay_master(MariaDBServer& serv_info)
{
MariaDBServer *serv_info = get_server_info(ptr);
ss_dassert(serv_info);
MXS_MONITORED_SERVER* ptr = serv_info.server_base;
if (ptr->server->node_id > 0 && ptr->server->master_id > 0 &&
getSlaveOfNodeId(m_monitor_base->monitored_servers, ptr->server->node_id, REJECT_DOWN) &&
getServerByNodeId(m_monitor_base->monitored_servers, ptr->server->master_id) &&
(!m_detect_multimaster || serv_info->group == 0))
(!m_detect_multimaster || serv_info.group == 0))
{
/** This server is both a slave and a master i.e. a relay master */
monitor_set_pending_status(ptr, SERVER_RELAY_MASTER);
monitor_clear_pending_status(ptr, SERVER_MASTER);
}
/* Remove SLAVE status if this server is a Binlog Server relay */
if (serv_info->binlog_relay)
{
monitor_clear_pending_status(ptr, SERVER_SLAVE);
}
ptr = ptr->next;
}
/* Update server status from monitor pending status on that server*/
ptr = m_monitor_base->monitored_servers;
while (ptr)
void MariaDBMonitor::update_server_states(MariaDBServer& db_server, MXS_MONITORED_SERVER* root_master,
bool detect_stale_master)
{
MXS_MONITORED_SERVER* ptr = db_server.server_base;
if (!SERVER_IN_MAINT(ptr->server))
{
MariaDBServer *serv_info = get_server_info(ptr);
@ -755,59 +870,28 @@ void MariaDBMonitor::main_loop()
ptr->server->status = ptr->pending_status;
}
ptr = ptr->next;
}
/** Now that all servers have their status correctly set, we can check
if we need to use standalone master. */
if (m_detect_standalone_master)
void MariaDBMonitor::measure_replication_lag(MXS_MONITORED_SERVER* root_master)
{
if (standalone_master_required(m_monitor_base->monitored_servers))
set_master_heartbeat(root_master);
for (auto iter = m_servers.begin(); iter != m_servers.end(); iter++)
{
// Other servers have died, set last remaining server as master
if (set_standalone_master(m_monitor_base->monitored_servers))
MXS_MONITORED_SERVER* ptr = iter->server_base;
if ((!SERVER_IN_MAINT(ptr->server)) && SERVER_IS_RUNNING(ptr->server))
{
// Update the root_master to point to the standalone master
root_master = m_master;
if (ptr->server->node_id != root_master->server->node_id &&
(SERVER_IS_SLAVE(ptr->server) ||
SERVER_IS_RELAY_SERVER(ptr->server)) &&
!iter->binlog_relay) // No select lag for Binlog Server
{
set_slave_heartbeat(ptr);
}
}
else
{
m_warn_set_standalone_master = true;
}
}
if (root_master && SERVER_IS_MASTER(root_master->server))
{
// Clear slave and stale slave status bits from current master
server_clear_status_nolock(root_master->server, SERVER_SLAVE | SERVER_STALE_SLAVE);
monitor_clear_pending_status(root_master, SERVER_SLAVE | SERVER_STALE_SLAVE);
/**
* Clear external slave status from master if configured to do so.
* This allows parts of a multi-tiered replication setup to be used
* in MaxScale.
*/
if (m_ignore_external_masters)
{
monitor_clear_pending_status(root_master, SERVER_SLAVE_OF_EXTERNAL_MASTER);
server_clear_status_nolock(root_master->server, SERVER_SLAVE_OF_EXTERNAL_MASTER);
}
}
ss_dassert(root_master == NULL || m_master == root_master);
ss_dassert(!root_master ||
((root_master->server->status & (SERVER_SLAVE | SERVER_MASTER))
!= (SERVER_SLAVE | SERVER_MASTER)));
/**
* After updating the status of all servers, check if monitor events
* need to be launched.
*/
mon_process_state_changes(m_monitor_base, m_script.c_str(), m_events);
bool failover_performed = false; // Has an automatic failover been performed this loop?
if (m_auto_failover)
void MariaDBMonitor::handle_auto_failover(bool* failover_performed)
{
const char RE_ENABLE_FMT[] = "%s To re-enable failover, manually set '%s' to 'true' for monitor "
"'%s' via MaxAdmin or the REST API, or restart MaxScale.";
@ -827,7 +911,7 @@ void MariaDBMonitor::main_loop()
{
MXS_INFO("Master failure not yet confirmed by slaves, delaying failover.");
}
else if (!mon_process_failover(&failover_performed))
else if (!mon_process_failover(failover_performed))
{
const char FAILED[] = "Failed to perform failover, disabling automatic failover.";
MXS_ERROR(RE_ENABLE_FMT, FAILED, CN_AUTO_FAILOVER, m_monitor_base->name);
@ -836,7 +920,8 @@ void MariaDBMonitor::main_loop()
}
}
/* Log a master detection failure, or the first master becoming available after a failure */
void MariaDBMonitor::log_master_changes(MXS_MONITORED_SERVER* root_master, int* log_no_master)
{
if (root_master &&
mon_status_changed(root_master) &&
!(root_master->server->status & SERVER_STALE_STATUS))
@ -857,50 +942,20 @@ void MariaDBMonitor::main_loop()
root_master->server->name,
root_master->server->port);
}
log_no_master = 1;
*log_no_master = 1;
}
else
{
if (!root_master && log_no_master)
if (!root_master && *log_no_master)
{
MXS_ERROR("No Master can be determined");
log_no_master = 0;
*log_no_master = 0;
}
}
}
/* Generate the replication heartbeat event by performing an update */
if (replication_heartbeat &&
root_master &&
(SERVER_IS_MASTER(root_master->server) ||
SERVER_IS_RELAY_SERVER(root_master->server)))
void MariaDBMonitor::handle_auto_rejoin()
{
set_master_heartbeat(root_master);
ptr = m_monitor_base->monitored_servers;
while (ptr)
{
MariaDBServer *serv_info = get_server_info(ptr);
if ((!SERVER_IN_MAINT(ptr->server)) && SERVER_IS_RUNNING(ptr->server))
{
if (ptr->server->node_id != root_master->server->node_id &&
(SERVER_IS_SLAVE(ptr->server) ||
SERVER_IS_RELAY_SERVER(ptr->server)) &&
!serv_info->binlog_relay) // No select lag for Binlog Server
{
set_slave_heartbeat(ptr);
}
}
ptr = ptr->next;
}
}
// Do not auto-join servers on this monitor loop if a failover (or any other cluster modification)
// has been performed, as server states have not been updated yet. It will happen next iteration.
if (!config_get_global_options()->passive && m_auto_rejoin &&
!failover_performed && cluster_can_be_joined())
{
// Check if any servers should be autojoined to the cluster
ServerVector joinable_servers;
if (get_joinable_servers(&joinable_servers))
{
@ -925,13 +980,6 @@ void MariaDBMonitor::main_loop()
}
}
mon_hangup_failed_servers(m_monitor_base);
servers_status_current_to_pending(m_monitor_base);
store_server_journal(m_monitor_base, m_master);
release_monitor_servers(m_monitor_base);
} /*< while (1) */
}
/**
* The entry point for the monitoring module thread
*

View File

@ -254,4 +254,15 @@ private:
json_t** error_out) const;
bool can_replicate_from(MXS_MONITORED_SERVER* slave, MariaDBServer* slave_info,
MariaDBServer* master_info);
void monitor_one_server(MariaDBServer& server);
void find_root_master(MXS_MONITORED_SERVER** root_master);
void update_gtid_domain();
void update_external_master();
void assign_relay_master(MariaDBServer& serv_info);
void update_server_states(MariaDBServer& db_server, MXS_MONITORED_SERVER* root_master,
bool detect_stale_master);
void log_master_changes(MXS_MONITORED_SERVER* root_master, int* log_no_master);
void handle_auto_failover(bool* failover_performed);
void measure_replication_lag(MXS_MONITORED_SERVER* root_master);
void handle_auto_rejoin();
};