Have server status updates applied during monitor loop

Previously, server status changes from MaxAdmin would be set immediately
as long as the server lock could be acquired. This meant that it might take
several seconds until the next monitor pass is executed. Usually, this was
fine but in some situations we would want the monitor to run immediately
after the change (MXS-740 and Galera). This patch changes the logic of
setting and clearing status bits to a delayed mode: changes are first applied
to a "status_pending"-variable, and only once the monitor runs will the
setting be applied. To reduce the delay, the monitor now has a flag
which is checked during sleep (between short 0.1s naps). If set, the
sleep is cut short.

If a server is not monitored, the status bits are set directly.

There is a small possibility of a race condition: If a monitor is stopped or
destroyed before the pending change is applied, the change is forgotten.
This commit is contained in:
Esa Korhonen
2016-12-14 17:50:17 +02:00
parent ac36f04f93
commit 7e9db7ed0c
9 changed files with 112 additions and 18 deletions

View File

@ -200,6 +200,9 @@ struct monitor
void *handle; /**< Handle returned from startMonitor */
size_t interval; /**< The monitor interval */
bool created_online; /**< Whether this monitor was created at runtime */
volatile bool server_pending_changes;
/**< Are there any pending changes to a server?
* If yes, the next monitor loop starts early. */
struct monitor *next; /**< Next monitor in the linked list */
};
@ -237,7 +240,8 @@ void mon_log_connect_error(MONITOR_SERVERS* database, connect_result_t rval);
void mon_log_state_change(MONITOR_SERVERS *ptr);
void lock_monitor_servers(MONITOR *monitor);
void release_monitor_servers(MONITOR *monitor);
void servers_status_pending_to_current(MONITOR *monitor);
void servers_status_current_to_pending(MONITOR *monitor);
/**
* @brief Hangup connections to failed servers
*
@ -274,10 +278,10 @@ bool monitor_serialize_servers(const MONITOR *monitor);
bool monitor_serialize(const MONITOR *monitor);
/**
* Check if a monitor uses @c servers
* Check if a server is being monitored and return the monitor.
* @param server Server that is queried
* @return True if server is used by at least one monitor
* @return The monitor watching this server, or NULL if not monitored
*/
bool monitor_server_in_use(const SERVER *server);
MONITOR* monitor_server_in_use(const SERVER *server);
MXS_END_DECLS

View File

@ -96,6 +96,7 @@ typedef struct server
char *auth_options; /**< Authenticator options */
SSL_LISTENER *server_ssl; /**< SSL data structure for server, if any */
unsigned int status; /**< Status flag bitmap for the server */
unsigned int status_pending; /**< Pending status flag bitmap for the server */
char monuser[MAX_SERVER_MONUSER_LEN]; /**< User name to use to monitor the db */
char monpw[MAX_SERVER_MONPW_LEN]; /**< Password to use to monitor the db */
SERVER_STATS stats; /**< The server statistics */

View File

@ -105,6 +105,7 @@ monitor_alloc(char *name, char *module)
mon->interval = MONITOR_INTERVAL;
mon->parameters = NULL;
mon->created_online = false;
mon->server_pending_changes = false;
spinlock_init(&mon->lock);
spinlock_acquire(&monLock);
mon->next = allMonitors;
@ -1248,9 +1249,9 @@ void mon_log_state_change(MONITOR_SERVERS *ptr)
MXS_FREE(next);
}
bool monitor_server_in_use(const SERVER *server)
MONITOR* monitor_server_in_use(const SERVER *server)
{
bool rval = false;
MONITOR *rval = NULL;
spinlock_acquire(&monLock);
@ -1262,7 +1263,7 @@ bool monitor_server_in_use(const SERVER *server)
{
if (db->server == server)
{
rval = true;
rval = mon;
}
}
@ -1439,7 +1440,7 @@ void mon_hangup_failed_servers(MONITOR *monitor)
}
}
/**
* Acquire locks on all servers monitored my this monitor. There should
* Acquire locks on all servers monitored by this monitor. There should
* only be max 1 monitor per server.
* @param monitor The target monitor
*/
@ -1452,6 +1453,11 @@ void lock_monitor_servers(MONITOR *monitor)
ptr = ptr->next;
}
}
/**
* Release locks on all servers monitored by this monitor. There should
* only be max 1 monitor per server.
* @param monitor The target monitor
*/
void release_monitor_servers(MONITOR *monitor)
{
MONITOR_SERVERS *ptr = monitor->databases;
@ -1461,3 +1467,34 @@ void release_monitor_servers(MONITOR *monitor)
ptr = ptr->next;
}
}
/**
* Sets the current status of all servers monitored by this monitor to
* the pending status. This should only be called at the beginning of
* a monitor loop, after the servers are locked.
* @param monitor The target monitor
*/
void servers_status_pending_to_current(MONITOR *monitor)
{
MONITOR_SERVERS *ptr = monitor->databases;
while (ptr)
{
ptr->server->status = ptr->server->status_pending;
ptr = ptr->next;
}
monitor->server_pending_changes = false;
}
/**
* Sets the pending status of all servers monitored by this monitor to
* the current status. This should only be called at the end of
* a monitor loop, before the servers are released.
* @param monitor The target monitor
*/
void servers_status_current_to_pending(MONITOR *monitor)
{
MONITOR_SERVERS *ptr = monitor->databases;
while (ptr)
{
ptr->server->status_pending = ptr->server->status;
ptr = ptr->next;
}
}

View File

@ -121,6 +121,7 @@ SERVER* server_alloc(const char *name, const char *address, unsigned short port,
server->auth_options = my_auth_options;
server->port = port;
server->status = SERVER_RUNNING;
server->status_pending = SERVER_RUNNING;
server->node_id = -1;
server->rlag = -2;
server->master_id = -1;
@ -1264,28 +1265,58 @@ SERVER* server_find_destroyed(const char *name, const char *protocol,
/**
* Set a status bit in the server under a lock. This ensures synchronization
* with the server monitor thread. Calling this inside the monitor will likely
* cause a deadlock.
* cause a deadlock. If the server is monitored, only set the pending bit.
*
* @param server The server to update
* @param bit The bit to set for the server
*/
void server_set_status(SERVER *server, int bit)
{
/* First check if the server is monitored. This isn't done under a lock
* but the race condition cannot cause significant harm. Monitors are never
* freed so the pointer stays valid.
*/
MONITOR *mon = monitor_server_in_use(server);
spinlock_acquire(&server->lock);
server_set_status_nolock(server, bit);
if (mon)
{
/* Set a pending status bit. It will be activated on the next monitor
* loop. Also set a flag so the next loop happens sooner.
*/
server->status_pending |= bit;
mon->server_pending_changes = true;
}
else
{
/* Set the bit directly */
server_set_status_nolock(server, bit);
}
spinlock_release(&server->lock);
}
/**
* Clear a status bit in the server under a lock. This ensures synchronization
* with the server monitor thread. Calling this inside the monitor will likely
* cause a deadlock.
* cause a deadlock. If the server is monitored, only clear the pending bit.
*
* @param server The server to update
* @param bit The bit to clear for the server
*/
void server_clear_status(SERVER *server, int bit)
{
MONITOR *mon = monitor_server_in_use(server);
spinlock_acquire(&server->lock);
server_clear_status_nolock(server, bit);
if (mon)
{
/* Clear a pending status bit. It will be activated on the next monitor
* loop. Also set a flag so the next loop happens sooner.
*/
server->status_pending &= ~bit;
mon->server_pending_changes = true;
}
else
{
/* Clear bit directly */
server_clear_status_nolock(server, bit);
}
spinlock_release(&server->lock);
}

View File

@ -188,6 +188,8 @@ monitorMain(void *arg)
while (!handle->shutdown)
{
lock_monitor_servers(monitor);
servers_status_pending_to_current(monitor);
for (MONITOR_SERVERS *ptr = monitor->databases; ptr; ptr = ptr->next)
{
update_server_status(monitor, ptr);
@ -221,12 +223,18 @@ monitorMain(void *arg)
}
}
}
servers_status_current_to_pending(monitor);
release_monitor_servers(monitor);
/** Sleep until the next monitoring interval */
int ms = 0;
while (ms < monitor->interval && !handle->shutdown)
{
if (monitor->server_pending_changes)
{
// Admin has changed something, skip sleep
break;
}
thread_millisleep(MON_BASE_INTERVAL_MS);
ms += MON_BASE_INTERVAL_MS;
}

View File

@ -482,7 +482,9 @@ monitorMain(void *arg)
* interval, then skip monitoring checks. Excluding the first
* round.
*/
if (nrounds != 0 && ((nrounds * MON_BASE_INTERVAL_MS) % mon->interval) >= MON_BASE_INTERVAL_MS)
if (nrounds != 0 &&
(((nrounds * MON_BASE_INTERVAL_MS) % mon->interval) >=
MON_BASE_INTERVAL_MS) && (!mon->server_pending_changes))
{
nrounds += 1;
continue;
@ -494,6 +496,7 @@ monitorMain(void *arg)
is_cluster = 0;
lock_monitor_servers(mon);
servers_status_pending_to_current(mon);
ptr = mon->databases;
while (ptr)
@ -619,6 +622,7 @@ monitorMain(void *arg)
}
mon_hangup_failed_servers(mon);
servers_status_current_to_pending(mon);
release_monitor_servers(mon);
}
}

View File

@ -550,8 +550,8 @@ monitorMain(void *arg)
* round.
*/
if (nrounds != 0 &&
((nrounds * MON_BASE_INTERVAL_MS) % mon->interval) >=
MON_BASE_INTERVAL_MS)
(((nrounds * MON_BASE_INTERVAL_MS) % mon->interval) >=
MON_BASE_INTERVAL_MS) && (!mon->server_pending_changes))
{
nrounds += 1;
continue;
@ -559,6 +559,8 @@ monitorMain(void *arg)
nrounds += 1;
lock_monitor_servers(mon);
servers_status_pending_to_current(mon);
/* start from the first server in the list */
ptr = mon->databases;
@ -643,6 +645,7 @@ monitorMain(void *arg)
}
mon_hangup_failed_servers(mon);
servers_status_current_to_pending(mon);
release_monitor_servers(mon);
}
}

View File

@ -1151,8 +1151,8 @@ monitorMain(void *arg)
* round.
*/
if (nrounds != 0 &&
((nrounds * MON_BASE_INTERVAL_MS) % mon->interval) >=
MON_BASE_INTERVAL_MS)
(((nrounds * MON_BASE_INTERVAL_MS) % mon->interval) >=
MON_BASE_INTERVAL_MS) && (!mon->server_pending_changes))
{
nrounds += 1;
continue;
@ -1162,6 +1162,8 @@ monitorMain(void *arg)
num_servers = 0;
lock_monitor_servers(mon);
servers_status_pending_to_current(mon);
/* start from the first server in the list */
ptr = mon->databases;
@ -1452,6 +1454,7 @@ monitorMain(void *arg)
}
mon_hangup_failed_servers(mon);
servers_status_current_to_pending(mon);
release_monitor_servers(mon);
} /*< while (1) */
}

View File

@ -375,6 +375,8 @@ monitorMain(void *arg)
nrounds += 1;
lock_monitor_servers(mon);
servers_status_pending_to_current(mon);
ptr = mon->databases;
while (ptr)
{
@ -415,6 +417,7 @@ monitorMain(void *arg)
}
mon_hangup_failed_servers(mon);
servers_status_current_to_pending(mon);
release_monitor_servers(mon);
}
}