MXS-1883 Remove locking during monitor loop
Since the admin cannot modify server states any more, the locks are not required.
This commit is contained in:
@ -259,9 +259,7 @@ struct mxs_monitor
|
|||||||
char *module_name; /**< Name of the monitor module */
|
char *module_name; /**< Name of the monitor module */
|
||||||
MXS_MONITOR_INSTANCE *instance; /**< Instance returned from startMonitor */
|
MXS_MONITOR_INSTANCE *instance; /**< Instance returned from startMonitor */
|
||||||
size_t interval; /**< The monitor interval */
|
size_t interval; /**< The monitor interval */
|
||||||
volatile bool server_pending_changes;
|
int check_maintenance_flag; /**< Set when admin requests a maintenance status change. */
|
||||||
/**< Are there any pending changes to a server?
|
|
||||||
* If yes, the next monitor loop starts early. */
|
|
||||||
bool active; /**< True if monitor is active */
|
bool active; /**< True if monitor is active */
|
||||||
time_t journal_max_age; /**< Maximum age of journal file */
|
time_t journal_max_age; /**< Maximum age of journal file */
|
||||||
uint32_t script_timeout; /**< Timeout in seconds for the monitor scripts */
|
uint32_t script_timeout; /**< Timeout in seconds for the monitor scripts */
|
||||||
@ -322,8 +320,7 @@ bool check_monitor_permissions(MXS_MONITOR* monitor, const char* query);
|
|||||||
|
|
||||||
void monitor_clear_pending_status(MXS_MONITORED_SERVER *ptr, uint64_t bit);
|
void monitor_clear_pending_status(MXS_MONITORED_SERVER *ptr, uint64_t bit);
|
||||||
void monitor_set_pending_status(MXS_MONITORED_SERVER *ptr, uint64_t bit);
|
void monitor_set_pending_status(MXS_MONITORED_SERVER *ptr, uint64_t bit);
|
||||||
void servers_status_pending_to_current(MXS_MONITOR *monitor);
|
void monitor_check_maintenance_requests(MXS_MONITOR *monitor);
|
||||||
void servers_status_current_to_pending(MXS_MONITOR *monitor);
|
|
||||||
|
|
||||||
bool mon_status_changed(MXS_MONITORED_SERVER* mon_srv);
|
bool mon_status_changed(MXS_MONITORED_SERVER* mon_srv);
|
||||||
bool mon_print_fail_status(MXS_MONITORED_SERVER* mon_srv);
|
bool mon_print_fail_status(MXS_MONITORED_SERVER* mon_srv);
|
||||||
@ -333,9 +330,6 @@ bool mon_connection_is_ok(mxs_connect_result_t connect_result);
|
|||||||
void mon_log_connect_error(MXS_MONITORED_SERVER* database, mxs_connect_result_t rval);
|
void mon_log_connect_error(MXS_MONITORED_SERVER* database, mxs_connect_result_t rval);
|
||||||
const char* mon_get_event_name(mxs_monitor_event_t event);
|
const char* mon_get_event_name(mxs_monitor_event_t event);
|
||||||
|
|
||||||
void lock_monitor_servers(MXS_MONITOR *monitor);
|
|
||||||
void release_monitor_servers(MXS_MONITOR *monitor);
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Alter monitor parameters
|
* Alter monitor parameters
|
||||||
*
|
*
|
||||||
|
@ -48,7 +48,8 @@ extern const char CN_PROXY_PROTOCOL[];
|
|||||||
const int MAINTENANCE_OFF = -100;
|
const int MAINTENANCE_OFF = -100;
|
||||||
const int MAINTENANCE_NO_CHANGE = 0;
|
const int MAINTENANCE_NO_CHANGE = 0;
|
||||||
const int MAINTENANCE_ON = 100;
|
const int MAINTENANCE_ON = 100;
|
||||||
|
const int MAINTENANCE_FLAG_NOCHECK = 0;
|
||||||
|
const int MAINTENANCE_FLAG_CHECK = -1;
|
||||||
/**
|
/**
|
||||||
* The server parameters used for weighting routing decissions
|
* The server parameters used for weighting routing decissions
|
||||||
*/
|
*/
|
||||||
|
@ -136,7 +136,7 @@ MXS_MONITOR* monitor_create(const char *name, const char *module)
|
|||||||
mon->journal_max_age = DEFAULT_JOURNAL_MAX_AGE;
|
mon->journal_max_age = DEFAULT_JOURNAL_MAX_AGE;
|
||||||
mon->script_timeout = DEFAULT_SCRIPT_TIMEOUT;
|
mon->script_timeout = DEFAULT_SCRIPT_TIMEOUT;
|
||||||
mon->parameters = NULL;
|
mon->parameters = NULL;
|
||||||
mon->server_pending_changes = false;
|
mon->check_maintenance_flag = MAINTENANCE_FLAG_NOCHECK;
|
||||||
memset(mon->journal_hash, 0, sizeof(mon->journal_hash));
|
memset(mon->journal_hash, 0, sizeof(mon->journal_hash));
|
||||||
mon->disk_space_threshold = NULL;
|
mon->disk_space_threshold = NULL;
|
||||||
mon->disk_space_check_interval = 0;
|
mon->disk_space_check_interval = 0;
|
||||||
@ -1712,73 +1712,35 @@ void mon_report_query_error(MXS_MONITORED_SERVER* db)
|
|||||||
db->server->port, mysql_error(db->con));
|
db->server->port, mysql_error(db->con));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Acquire locks on all servers monitored by this monitor. There should
|
|
||||||
* only be max 1 monitor per server.
|
|
||||||
* @param monitor The target monitor
|
|
||||||
*/
|
|
||||||
void lock_monitor_servers(MXS_MONITOR *monitor)
|
|
||||||
{
|
|
||||||
MXS_MONITORED_SERVER *ptr = monitor->monitored_servers;
|
|
||||||
while (ptr)
|
|
||||||
{
|
|
||||||
spinlock_acquire(&ptr->server->lock);
|
|
||||||
ptr = ptr->next;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
/**
|
|
||||||
* Release locks on all servers monitored by this monitor. There should
|
|
||||||
* only be max 1 monitor per server.
|
|
||||||
* @param monitor The target monitor
|
|
||||||
*/
|
|
||||||
void release_monitor_servers(MXS_MONITOR *monitor)
|
|
||||||
{
|
|
||||||
MXS_MONITORED_SERVER *ptr = monitor->monitored_servers;
|
|
||||||
while (ptr)
|
|
||||||
{
|
|
||||||
spinlock_release(&ptr->server->lock);
|
|
||||||
ptr = ptr->next;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
/**
|
/**
|
||||||
* Check if admin is requesting setting or clearing maintenance status on the server and act accordingly.
|
* Check if admin is requesting setting or clearing maintenance status on the server and act accordingly.
|
||||||
* Should be called at the beginning of a monitor loop.
|
* Should be called at the beginning of a monitor loop.
|
||||||
*
|
*
|
||||||
* @param monitor The target monitor
|
* @param monitor The target monitor
|
||||||
*/
|
*/
|
||||||
void servers_status_pending_to_current(MXS_MONITOR *monitor)
|
void monitor_check_maintenance_requests(MXS_MONITOR *monitor)
|
||||||
{
|
{
|
||||||
MXS_MONITORED_SERVER *ptr = monitor->monitored_servers;
|
/* In theory, the admin may be modifying the server maintenance status during this function. The overall
|
||||||
while (ptr)
|
* maintenance flag should be read-written atomically to prevent missing a value. */
|
||||||
|
int flags_changed = atomic_exchange_int(&monitor->check_maintenance_flag, MAINTENANCE_FLAG_NOCHECK);
|
||||||
|
if (flags_changed != MAINTENANCE_FLAG_NOCHECK)
|
||||||
{
|
{
|
||||||
// The only server status bit the admin may change is the [Maintenance] bit.
|
MXS_MONITORED_SERVER *ptr = monitor->monitored_servers;
|
||||||
int admin_msg = atomic_exchange_int(&ptr->server->maint_request, MAINTENANCE_NO_CHANGE);
|
while (ptr)
|
||||||
if (admin_msg == MAINTENANCE_ON)
|
|
||||||
{
|
{
|
||||||
// TODO: Change to writing MONITORED_SERVER->pending status instead once cleanup done.
|
// The only server status bit the admin may change is the [Maintenance] bit.
|
||||||
server_set_status_nolock(ptr->server, SERVER_MAINT);
|
int admin_msg = atomic_exchange_int(&ptr->server->maint_request, MAINTENANCE_NO_CHANGE);
|
||||||
|
if (admin_msg == MAINTENANCE_ON)
|
||||||
|
{
|
||||||
|
// TODO: Change to writing MONITORED_SERVER->pending status instead once cleanup done.
|
||||||
|
server_set_status_nolock(ptr->server, SERVER_MAINT);
|
||||||
|
}
|
||||||
|
else if (admin_msg == MAINTENANCE_OFF)
|
||||||
|
{
|
||||||
|
server_clear_status_nolock(ptr->server, SERVER_MAINT);
|
||||||
|
}
|
||||||
|
ptr = ptr->next;
|
||||||
}
|
}
|
||||||
else if (admin_msg == MAINTENANCE_OFF)
|
|
||||||
{
|
|
||||||
server_clear_status_nolock(ptr->server, SERVER_MAINT);
|
|
||||||
}
|
|
||||||
ptr = ptr->next;
|
|
||||||
}
|
|
||||||
monitor->server_pending_changes = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Sets the pending status of all servers monitored by this monitor to
|
|
||||||
* the current status. This should only be called at the end of
|
|
||||||
* a monitor loop, before the servers are released.
|
|
||||||
* @param monitor The target monitor
|
|
||||||
*/
|
|
||||||
void servers_status_current_to_pending(MXS_MONITOR *monitor)
|
|
||||||
{
|
|
||||||
MXS_MONITORED_SERVER *ptr = monitor->monitored_servers;
|
|
||||||
while (ptr)
|
|
||||||
{
|
|
||||||
ptr = ptr->next;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2778,8 +2740,7 @@ void MonitorInstance::main()
|
|||||||
|
|
||||||
while (!m_shutdown)
|
while (!m_shutdown)
|
||||||
{
|
{
|
||||||
lock_monitor_servers(m_monitor);
|
monitor_check_maintenance_requests(m_monitor);
|
||||||
servers_status_pending_to_current(m_monitor);
|
|
||||||
|
|
||||||
tick();
|
tick();
|
||||||
|
|
||||||
@ -2790,15 +2751,13 @@ void MonitorInstance::main()
|
|||||||
mon_process_state_changes(m_monitor, m_script.empty() ? NULL : m_script.c_str(), m_events);
|
mon_process_state_changes(m_monitor, m_script.empty() ? NULL : m_script.c_str(), m_events);
|
||||||
|
|
||||||
mon_hangup_failed_servers(m_monitor);
|
mon_hangup_failed_servers(m_monitor);
|
||||||
servers_status_current_to_pending(m_monitor);
|
|
||||||
store_server_journal(m_monitor, m_master);
|
store_server_journal(m_monitor, m_master);
|
||||||
release_monitor_servers(m_monitor);
|
|
||||||
|
|
||||||
/** Sleep until the next monitoring interval */
|
/** Sleep until the next monitoring interval */
|
||||||
unsigned int ms = 0;
|
unsigned int ms = 0;
|
||||||
while (ms < m_monitor->interval && !m_shutdown)
|
while (ms < m_monitor->interval && !m_shutdown)
|
||||||
{
|
{
|
||||||
if (m_monitor->server_pending_changes)
|
if (atomic_load_int(&m_monitor->check_maintenance_flag) != MAINTENANCE_FLAG_NOCHECK)
|
||||||
{
|
{
|
||||||
// Admin has changed something, skip sleep
|
// Admin has changed something, skip sleep
|
||||||
break;
|
break;
|
||||||
|
@ -1342,7 +1342,7 @@ bool mxs::server_set_status(SERVER *server, int bit, string* errmsg_out)
|
|||||||
{
|
{
|
||||||
MXS_WARNING(WRN_REQUEST_OVERWRITTEN);
|
MXS_WARNING(WRN_REQUEST_OVERWRITTEN);
|
||||||
}
|
}
|
||||||
mon->server_pending_changes = true;
|
atomic_store_int(&mon->check_maintenance_flag, MAINTENANCE_FLAG_CHECK);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@ -1387,7 +1387,7 @@ bool mxs::server_clear_status(SERVER *server, int bit, string* errmsg_out)
|
|||||||
{
|
{
|
||||||
MXS_WARNING(WRN_REQUEST_OVERWRITTEN);
|
MXS_WARNING(WRN_REQUEST_OVERWRITTEN);
|
||||||
}
|
}
|
||||||
mon->server_pending_changes = true;
|
atomic_store_int(&mon->check_maintenance_flag, MAINTENANCE_FLAG_CHECK);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -308,12 +308,8 @@ void MariaDBMonitor::main()
|
|||||||
/* Coarse time has resolution ~1ms (as opposed to 1ns) but this is enough. */
|
/* Coarse time has resolution ~1ms (as opposed to 1ns) but this is enough. */
|
||||||
clock_gettime(CLOCK_MONOTONIC_COARSE, &loop_start);
|
clock_gettime(CLOCK_MONOTONIC_COARSE, &loop_start);
|
||||||
|
|
||||||
lock_monitor_servers(m_monitor);
|
|
||||||
/* Read any admin status changes from SERVER->status_pending. */
|
/* Read any admin status changes from SERVER->status_pending. */
|
||||||
if (m_monitor->server_pending_changes)
|
monitor_check_maintenance_requests(m_monitor);
|
||||||
{
|
|
||||||
servers_status_pending_to_current(m_monitor);
|
|
||||||
}
|
|
||||||
/* Update MXS_MONITORED_SERVER->pending_status. This is where the monitor loop writes it's findings.
|
/* Update MXS_MONITORED_SERVER->pending_status. This is where the monitor loop writes it's findings.
|
||||||
* Also, backup current status so that it can be compared to any deduced state. */
|
* Also, backup current status so that it can be compared to any deduced state. */
|
||||||
for (auto mon_srv = m_monitor->monitored_servers; mon_srv; mon_srv = mon_srv->next)
|
for (auto mon_srv = m_monitor->monitored_servers; mon_srv; mon_srv = mon_srv->next)
|
||||||
@ -322,9 +318,6 @@ void MariaDBMonitor::main()
|
|||||||
mon_srv->pending_status = status;
|
mon_srv->pending_status = status;
|
||||||
mon_srv->mon_prev_status = status;
|
mon_srv->mon_prev_status = status;
|
||||||
}
|
}
|
||||||
/* Avoid reading/writing SERVER->status while servers are not locked. Routers read the state
|
|
||||||
* all the time. */
|
|
||||||
release_monitor_servers(m_monitor);
|
|
||||||
|
|
||||||
// Query all servers for their status. Update the server id array.
|
// Query all servers for their status. Update the server id array.
|
||||||
m_servers_by_id.clear();
|
m_servers_by_id.clear();
|
||||||
@ -415,67 +408,44 @@ void MariaDBMonitor::main()
|
|||||||
measure_replication_lag(root_master);
|
measure_replication_lag(root_master);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Servers have been queried and states deduced. Lock servers for a short while again while we update
|
// Update shared status. The next functions read the shared status. TODO: change the following
|
||||||
* the shared server states and make the changes visible. There is a problem though: the admin could
|
// functions to read "pending_status" instead.
|
||||||
* have changed the shared state while the queries were running. In this rare case, the deduced
|
for (auto mon_srv = m_monitor->monitored_servers; mon_srv; mon_srv = mon_srv->next)
|
||||||
* server states no longer apply and may be inconsistent. The most straightforward solution to this
|
|
||||||
* is to discard the results of the latest monitor loop and start another immediately. Cluster
|
|
||||||
* modifications are not performed until the next loop. The discarding of results is not perfect as
|
|
||||||
* some data has already been written. But this data is only used by the monitor itself and will be
|
|
||||||
* rewritten next loop. TODO: make versions of log_master_changes() and mon_process_state_changes()
|
|
||||||
* which read from MXS_MONITORED_SERVER->pending_status instead of SERVER->status. Then this locking
|
|
||||||
* and releasing can be moved after failover etc and the user changes can be applied gracefully. */
|
|
||||||
lock_monitor_servers(m_monitor);
|
|
||||||
if (m_monitor->server_pending_changes)
|
|
||||||
{
|
{
|
||||||
MXS_WARNING("MaxAdmin or REST API modified the state of a server monitored by '%s' during a "
|
mon_srv->server->status = mon_srv->pending_status;
|
||||||
"monitor iteration. Discarding the monitor results and retrying.",
|
|
||||||
m_monitor->name);
|
|
||||||
release_monitor_servers(m_monitor);
|
|
||||||
}
|
}
|
||||||
else
|
|
||||||
|
/* Check if monitor events need to be launched. */
|
||||||
|
mon_process_state_changes(m_monitor, m_script.c_str(), m_events);
|
||||||
|
m_cluster_modified = false;
|
||||||
|
if (m_auto_failover)
|
||||||
{
|
{
|
||||||
// Update shared status.
|
m_cluster_modified = handle_auto_failover();
|
||||||
for (auto mon_srv = m_monitor->monitored_servers; mon_srv; mon_srv = mon_srv->next)
|
|
||||||
{
|
|
||||||
auto new_status = mon_srv->pending_status;
|
|
||||||
auto srv = mon_srv->server;
|
|
||||||
srv->status = new_status;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Check if monitor events need to be launched. */
|
|
||||||
mon_process_state_changes(m_monitor, m_script.c_str(), m_events);
|
|
||||||
m_cluster_modified = false;
|
|
||||||
if (m_auto_failover)
|
|
||||||
{
|
|
||||||
m_cluster_modified = handle_auto_failover();
|
|
||||||
}
|
|
||||||
|
|
||||||
/* log master detection failure of first master becomes available after failure */
|
|
||||||
log_master_changes(root_master, &log_no_master);
|
|
||||||
|
|
||||||
// Do not auto-join servers on this monitor loop if a failover (or any other cluster modification)
|
|
||||||
// has been performed, as server states have not been updated yet. It will happen next iteration.
|
|
||||||
if (!config_get_global_options()->passive && m_auto_rejoin && !m_cluster_modified &&
|
|
||||||
cluster_can_be_joined())
|
|
||||||
{
|
|
||||||
// Check if any servers should be autojoined to the cluster and try to join them.
|
|
||||||
handle_auto_rejoin();
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Check if any slave servers have read-only off and turn it on if user so wishes. Again, do not
|
|
||||||
* perform this if cluster has been modified this loop since it may not be clear which server
|
|
||||||
* should be a slave. */
|
|
||||||
if (!config_get_global_options()->passive && m_enforce_read_only_slaves && !m_cluster_modified)
|
|
||||||
{
|
|
||||||
enforce_read_only_on_slaves();
|
|
||||||
}
|
|
||||||
|
|
||||||
mon_hangup_failed_servers(m_monitor);
|
|
||||||
store_server_journal(m_monitor, m_master ? m_master->m_server_base : NULL);
|
|
||||||
release_monitor_servers(m_monitor);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* log master detection failure of first master becomes available after failure */
|
||||||
|
log_master_changes(root_master, &log_no_master);
|
||||||
|
|
||||||
|
// Do not auto-join servers on this monitor loop if a failover (or any other cluster modification)
|
||||||
|
// has been performed, as server states have not been updated yet. It will happen next iteration.
|
||||||
|
if (!config_get_global_options()->passive && m_auto_rejoin && !m_cluster_modified &&
|
||||||
|
cluster_can_be_joined())
|
||||||
|
{
|
||||||
|
// Check if any servers should be autojoined to the cluster and try to join them.
|
||||||
|
handle_auto_rejoin();
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Check if any slave servers have read-only off and turn it on if user so wishes. Again, do not
|
||||||
|
* perform this if cluster has been modified this loop since it may not be clear which server
|
||||||
|
* should be a slave. */
|
||||||
|
if (!config_get_global_options()->passive && m_enforce_read_only_slaves && !m_cluster_modified)
|
||||||
|
{
|
||||||
|
enforce_read_only_on_slaves();
|
||||||
|
}
|
||||||
|
|
||||||
|
mon_hangup_failed_servers(m_monitor);
|
||||||
|
store_server_journal(m_monitor, m_master ? m_master->m_server_base : NULL);
|
||||||
|
|
||||||
// Check how much the monitor should sleep to get one full monitor interval.
|
// Check how much the monitor should sleep to get one full monitor interval.
|
||||||
timespec loop_end;
|
timespec loop_end;
|
||||||
clock_gettime(CLOCK_MONOTONIC_COARSE, &loop_end);
|
clock_gettime(CLOCK_MONOTONIC_COARSE, &loop_end);
|
||||||
@ -486,7 +456,8 @@ void MariaDBMonitor::main()
|
|||||||
sleep_time_remaining = MXS_MAX(MXS_MON_BASE_INTERVAL_MS, sleep_time_remaining);
|
sleep_time_remaining = MXS_MAX(MXS_MON_BASE_INTERVAL_MS, sleep_time_remaining);
|
||||||
/* Sleep in small increments to react faster to some events. This should ideally use some type of
|
/* Sleep in small increments to react faster to some events. This should ideally use some type of
|
||||||
* notification mechanism. */
|
* notification mechanism. */
|
||||||
while (sleep_time_remaining > 0 && !should_shutdown() && !m_monitor->server_pending_changes)
|
while (sleep_time_remaining > 0 && !should_shutdown() &&
|
||||||
|
atomic_load_int(&m_monitor->check_maintenance_flag) == MAINTENANCE_FLAG_NOCHECK)
|
||||||
{
|
{
|
||||||
int small_sleep_ms = (sleep_time_remaining >= MXS_MON_BASE_INTERVAL_MS) ?
|
int small_sleep_ms = (sleep_time_remaining >= MXS_MON_BASE_INTERVAL_MS) ?
|
||||||
MXS_MON_BASE_INTERVAL_MS : sleep_time_remaining;
|
MXS_MON_BASE_INTERVAL_MS : sleep_time_remaining;
|
||||||
|
Reference in New Issue
Block a user