MaxScale/server/core/monitor.c
Markus Makela fec1ebe925 Removed restrictions on monitor timeouts
The monitor timeouts can now be larger than the monitor interval. This will
allow the combination of low monitoring intervals and large network timeouts.
If a network experiences some periodic lag, it is desirable to allow large
timeout values.
2016-02-19 09:57:15 +02:00

971 lines
25 KiB
C

/*
* This file is distributed as part of the MariaDB Corporation MaxScale. It is free
* software: you can redistribute it and/or modify it under the terms of the
* GNU General Public License as published by the Free Software Foundation,
* version 2.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along with
* this program; if not, write to the Free Software Foundation, Inc., 51
* Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Copyright MariaDB Corporation Ab 2013-2014
*/
/**
* @file monitor.c - The monitor module management routines
*
* @verbatim
* Revision History
*
* Date Who Description
* 08/07/13 Mark Riddoch Initial implementation
* 23/05/14 Massimiliano Pinto Addition of monitor_interval parameter
* and monitor id
* 30/10/14 Massimiliano Pinto Addition of disable_master_failback parameter
* 07/11/14 Massimiliano Pinto Addition of monitor network timeouts
* 08/05/15 Markus Makela Moved common monitor variables to MONITOR struct
*
* @endverbatim
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <monitor.h>
#include <spinlock.h>
#include <modules.h>
#include <skygw_utils.h>
#include <log_manager.h>
#include <secrets.h>
#include <maxscale_pcre2.h>
#include <externcmd.h>
#include <mysql/mysqld_error.h>
/*
* Create declarations of the enum for monitor events and also the array of
* structs containing the matching names. The data is taken from def_monitor_event.h
*
*/
#undef ADDITEM
#define ADDITEM( _event_type, _event_name ) { #_event_name }
const monitor_def_t monitor_event_definitions[MAX_MONITOR_EVENT] =
{
#include "def_monitor_event.h"
};
#undef ADDITEM
static MONITOR *allMonitors = NULL;
static SPINLOCK monLock = SPINLOCK_INIT;
static void monitor_servers_free(MONITOR_SERVERS *servers);
/**
* Allocate a new monitor, load the associated module for the monitor
* and start execution on the monitor.
*
* @param name The name of the monitor module to load
* @param module The module to load
* @return The newly created monitor
*/
MONITOR *
monitor_alloc(char *name, char *module)
{
MONITOR *mon;
if ((mon = (MONITOR *)malloc(sizeof(MONITOR))) == NULL)
{
return NULL;
}
if ((mon->module = load_module(module, MODULE_MONITOR)) == NULL)
{
MXS_ERROR("Unable to load monitor module '%s'.", name);
free(mon);
return NULL;
}
mon->state = MONITOR_STATE_ALLOC;
mon->name = strdup(name);
mon->handle = NULL;
mon->databases = NULL;
mon->password = NULL;
mon->user = NULL;
mon->password = NULL;
mon->read_timeout = DEFAULT_READ_TIMEOUT;
mon->write_timeout = DEFAULT_WRITE_TIMEOUT;
mon->connect_timeout = DEFAULT_CONNECT_TIMEOUT;
mon->interval = MONITOR_INTERVAL;
mon->parameters = NULL;
spinlock_init(&mon->lock);
spinlock_acquire(&monLock);
mon->next = allMonitors;
allMonitors = mon;
spinlock_release(&monLock);
return mon;
}
/**
* Free a monitor, first stop the monitor and then remove the monitor from
* the chain of monitors and free the memory.
*
* @param mon The monitor to free
*/
void
monitor_free(MONITOR *mon)
{
MONITOR *ptr;
mon->module->stopMonitor(mon);
mon->state = MONITOR_STATE_FREED;
spinlock_acquire(&monLock);
if (allMonitors == mon)
{
allMonitors = mon->next;
}
else
{
ptr = allMonitors;
while (ptr->next && ptr->next != mon)
{
ptr = ptr->next;
}
if (ptr->next)
{
ptr->next = mon->next;
}
}
spinlock_release(&monLock);
free_config_parameter(mon->parameters);
monitor_servers_free(mon->databases);
free(mon->name);
free(mon);
}
/**
* Start an individual monitor that has previoulsy been stopped.
*
* @param monitor The Monitor that should be started
*/
void
monitorStart(MONITOR *monitor, void* params)
{
spinlock_acquire(&monitor->lock);
monitor->handle = (*monitor->module->startMonitor)(monitor,params);
monitor->state = MONITOR_STATE_RUNNING;
spinlock_release(&monitor->lock);
}
/**
* Start all monitors
*/
void monitorStartAll()
{
MONITOR *ptr;
spinlock_acquire(&monLock);
ptr = allMonitors;
while (ptr)
{
monitorStart(ptr, ptr->parameters);
ptr = ptr->next;
}
spinlock_release(&monLock);
}
/**
* Stop a given monitor
*
* @param monitor The monitor to stop
*/
void
monitorStop(MONITOR *monitor)
{
spinlock_acquire(&monitor->lock);
if (monitor->state != MONITOR_STATE_STOPPED)
{
monitor->state = MONITOR_STATE_STOPPING;
monitor->module->stopMonitor(monitor);
monitor->state = MONITOR_STATE_STOPPED;
MONITOR_SERVERS* db = monitor->databases;
while (db)
{
mysql_close(db->con);
db->con = NULL;
db = db->next;
}
}
spinlock_release(&monitor->lock);
}
/**
* Shutdown all running monitors
*/
void
monitorStopAll()
{
MONITOR *ptr;
spinlock_acquire(&monLock);
ptr = allMonitors;
while (ptr)
{
monitorStop(ptr);
ptr = ptr->next;
}
spinlock_release(&monLock);
}
/**
* Add a server to a monitor. Simply register the server that needs to be
* monitored to the running monitor module.
*
* @param mon The Monitor instance
* @param server The Server to add to the monitoring
*/
void
monitorAddServer(MONITOR *mon, SERVER *server)
{
MONITOR_SERVERS *ptr, *db;
if ((db = (MONITOR_SERVERS *)malloc(sizeof(MONITOR_SERVERS))) == NULL)
{
return;
}
db->server = server;
db->con = NULL;
db->next = NULL;
db->mon_err_count = 0;
db->log_version_err = true;
/** Server status is uninitialized */
db->mon_prev_status = -1;
/* pending status is updated by get_replication_tree */
db->pending_status = 0;
spinlock_acquire(&mon->lock);
if (mon->databases == NULL)
{
mon->databases = db;
}
else
{
ptr = mon->databases;
while (ptr->next != NULL)
{
ptr = ptr->next;
}
ptr->next = db;
}
spinlock_release(&mon->lock);
}
/**
* Free monitor server list
* @param servers Servers to free
*/
static void monitor_servers_free(MONITOR_SERVERS *servers)
{
while (servers)
{
MONITOR_SERVERS *tofree = servers;
servers = servers->next;
if (tofree->con)
{
mysql_close(tofree->con);
}
free(tofree);
}
}
/**
* Add a default user to the monitor. This user is used to connect to the
* monitored databases but may be overriden on a per server basis.
*
* @param mon The monitor instance
* @param user The default username to use when connecting
* @param passwd The default password associated to the default user.
*/
void
monitorAddUser(MONITOR *mon, char *user, char *passwd)
{
mon->user = strdup(user);
mon->password = strdup(passwd);
}
/**
* Show all monitors
*
* @param dcb DCB for printing output
*/
void
monitorShowAll(DCB *dcb)
{
MONITOR *ptr;
spinlock_acquire(&monLock);
ptr = allMonitors;
while (ptr)
{
dcb_printf(dcb, "Monitor: %p\n", ptr);
dcb_printf(dcb, "\tName: %s\n", ptr->name);
if (ptr->module->diagnostics)
{
ptr->module->diagnostics(dcb, ptr);
}
ptr = ptr->next;
}
spinlock_release(&monLock);
}
/**
* Show a single monitor
*
* @param dcb DCB for printing output
*/
void
monitorShow(DCB *dcb, MONITOR *monitor)
{
dcb_printf(dcb, "Monitor: %p\n", monitor);
dcb_printf(dcb, "\tName: %s\n", monitor->name);
if (monitor->module->diagnostics)
{
monitor->module->diagnostics(dcb, monitor);
}
}
/**
* List all the monitors
*
* @param dcb DCB for printing output
*/
void
monitorList(DCB *dcb)
{
MONITOR *ptr;
spinlock_acquire(&monLock);
ptr = allMonitors;
dcb_printf(dcb, "---------------------+---------------------\n");
dcb_printf(dcb, "%-20s | Status\n", "Monitor");
dcb_printf(dcb, "---------------------+---------------------\n");
while (ptr)
{
dcb_printf(dcb, "%-20s | %s\n", ptr->name,
ptr->state & MONITOR_STATE_RUNNING
? "Running" : "Stopped");
ptr = ptr->next;
}
dcb_printf(dcb, "---------------------+---------------------\n");
spinlock_release(&monLock);
}
/**
* Find a monitor by name
*
* @param name The name of the monitor
* @return Pointer to the monitor or NULL
*/
MONITOR *
monitor_find(char *name)
{
MONITOR *ptr;
spinlock_acquire(&monLock);
ptr = allMonitors;
while (ptr)
{
if (!strcmp(ptr->name, name))
{
break;
}
ptr = ptr->next;
}
spinlock_release(&monLock);
return ptr;
}
/**
* Set the monitor sampling interval.
*
* @param mon The monitor instance
* @param interval The sampling interval in milliseconds
*/
void
monitorSetInterval(MONITOR *mon, unsigned long interval)
{
mon->interval = interval;
}
/**
* Set Monitor timeouts for connect/read/write
*
* @param mon The monitor instance
* @param type The timeout handling type
* @param value The timeout to set
*/
bool
monitorSetNetworkTimeout(MONITOR *mon, int type, int value)
{
bool rval = true;
if (value > 0)
{
switch (type)
{
case MONITOR_CONNECT_TIMEOUT:
mon->connect_timeout = value;
break;
case MONITOR_READ_TIMEOUT:
mon->read_timeout = value;
break;
case MONITOR_WRITE_TIMEOUT:
mon->write_timeout = value;
break;
default:
MXS_ERROR("Monitor setNetworkTimeout received an unsupported action type %i", type);
rval = false;
break;
}
}
else
{
MXS_ERROR("Negative value for monitor timeout.");
rval = false;
}
return rval;
}
/**
* Provide a row to the result set that defines the set of monitors
*
* @param set The result set
* @param data The index of the row to send
* @return The next row or NULL
*/
static RESULT_ROW *
monitorRowCallback(RESULTSET *set, void *data)
{
int *rowno = (int *)data;
int i = 0;;
char buf[20];
RESULT_ROW *row;
MONITOR *ptr;
spinlock_acquire(&monLock);
ptr = allMonitors;
while (i < *rowno && ptr)
{
i++;
ptr = ptr->next;
}
if (ptr == NULL)
{
spinlock_release(&monLock);
free(data);
return NULL;
}
(*rowno)++;
row = resultset_make_row(set);
resultset_row_set(row, 0, ptr->name);
resultset_row_set(row, 1, ptr->state & MONITOR_STATE_RUNNING
? "Running" : "Stopped");
spinlock_release(&monLock);
return row;
}
/**
* Return a resultset that has the current set of monitors in it
*
* @return A Result set
*/
RESULTSET *
monitorGetList()
{
RESULTSET *set;
int *data;
if ((data = (int *)malloc(sizeof(int))) == NULL)
{
return NULL;
}
*data = 0;
if ((set = resultset_create(monitorRowCallback, data)) == NULL)
{
free(data);
return NULL;
}
resultset_add_column(set, "Monitor", 20, COL_TYPE_VARCHAR);
resultset_add_column(set, "Status", 10, COL_TYPE_VARCHAR);
return set;
}
/**
* Check if the monitor user has all required permissions to operate properly.
* this checks for REPLICATION CLIENT permissions
* @param service Monitor to inspect
* @return False if an error with monitor permissions was detected or if an
* error occurred. True if permissions are correct.
*/
bool check_monitor_permissions(MONITOR* monitor)
{
MYSQL* mysql;
MYSQL_RES* res;
char *user,*dpasswd;
SERVER* server;
int conn_timeout = 1;
bool rval = true;
user = monitor->user;
dpasswd = decryptPassword(monitor->password);
if ((mysql = mysql_init(NULL)) == NULL)
{
MXS_ERROR("[%s] Error: MySQL connection initialization failed.", __FUNCTION__);
free(dpasswd);
return false;
}
server = monitor->databases->server;
mysql_options(mysql,MYSQL_OPT_USE_REMOTE_CONNECTION,NULL);
mysql_options(mysql,MYSQL_OPT_CONNECT_TIMEOUT,&conn_timeout);
/** Connect to the first server. This assumes all servers have identical
* user permissions. */
if (mysql_real_connect(mysql, server->name, user, dpasswd, NULL, server->port, NULL, 0) == NULL)
{
MXS_ERROR("%s: Failed to connect to server %s(%s:%d) when"
" checking monitor user credentials and permissions.",
monitor->name,
server->unique_name,
server->name,
server->port);
mysql_close(mysql);
free(dpasswd);
return false;
}
if (mysql_query(mysql,"show slave status") != 0)
{
if (mysql_errno(mysql) == ER_SPECIFIC_ACCESS_DENIED_ERROR)
{
MXS_ERROR("%s: User '%s' is missing REPLICATION CLIENT privileges. MySQL error message: %s",
monitor->name,user,mysql_error(mysql));
}
else
{
MXS_ERROR("%s: Monitor failed to query for slave status. MySQL error message: %s",
monitor->name,mysql_error(mysql));
}
rval = false;
}
else
{
if ((res = mysql_use_result(mysql)) == NULL)
{
MXS_ERROR("%s: Result retrieval failed when checking for REPLICATION CLIENT permissions: %s",
monitor->name,mysql_error(mysql));
rval = false;
}
else
{
mysql_free_result(res);
}
}
mysql_close(mysql);
free(dpasswd);
return rval;
}
/**
* Add parameters to the monitor
* @param monitor Monitor
* @param params Config parameters
*/
void monitorAddParameters(MONITOR *monitor, CONFIG_PARAMETER *params)
{
while (params)
{
CONFIG_PARAMETER* clone = config_clone_param(params);
if (clone)
{
clone->next = monitor->parameters;
monitor->parameters = clone;
}
params = params->next;
}
}
/**
* Set a pending status bit in the monitor server
*
* @param server The server to update
* @param bit The bit to clear for the server
*/
void
monitor_set_pending_status(MONITOR_SERVERS *ptr, int bit)
{
ptr->pending_status |= bit;
}
/**
* Clear a pending status bit in the monitor server
*
* @param server The server to update
* @param bit The bit to clear for the server
*/
void
monitor_clear_pending_status(MONITOR_SERVERS *ptr, int bit)
{
ptr->pending_status &= ~bit;
}
/*
* Determine a monitor event, defined by the difference between the old
* status of a server and the new status.
*
* @param node The monitor server data for a particular server
* @result monitor_event_t A monitor event (enum)
*/
monitor_event_t
mon_get_event_type(MONITOR_SERVERS* node)
{
typedef enum
{
DOWN_EVENT,
UP_EVENT,
LOSS_EVENT,
NEW_EVENT,
UNSUPPORTED_EVENT
} general_event_type;
general_event_type event_type = UNSUPPORTED_EVENT;
unsigned int prev = node->mon_prev_status
& (SERVER_RUNNING|SERVER_MASTER|SERVER_SLAVE|SERVER_JOINED|SERVER_NDB);
unsigned int present = node->server->status
& (SERVER_RUNNING|SERVER_MASTER|SERVER_SLAVE|SERVER_JOINED|SERVER_NDB);
if (prev == present)
{
/* No change in the bits we're interested in */
return UNDEFINED_MONITOR_EVENT;
}
if ((prev & SERVER_RUNNING) == 0)
{
/* The server was not running previously */
if ((present & SERVER_RUNNING) != 0)
{
event_type = UP_EVENT;
}
/* Otherwise, was not running and still is not running */
/* - this is not a recognised event */
}
else
{
/* Previous state must have been running */
if ((present & SERVER_RUNNING) == 0)
{
event_type = DOWN_EVENT;
}
else
{
/* Was running and still is */
if (prev & (SERVER_MASTER|SERVER_SLAVE|SERVER_JOINED|SERVER_NDB))
{
/* We used to know what kind of server it was */
event_type = LOSS_EVENT;
}
else
{
/* We didn't know what kind of server it was, now we do*/
event_type = NEW_EVENT;
}
}
}
switch (event_type)
{
case UP_EVENT:
return (present & SERVER_MASTER) ? MASTER_UP_EVENT :
(present & SERVER_SLAVE) ? SLAVE_UP_EVENT :
(present & SERVER_JOINED) ? SYNCED_UP_EVENT :
(present & SERVER_NDB) ? NDB_UP_EVENT :
SERVER_UP_EVENT;
case DOWN_EVENT:
return (prev & SERVER_MASTER) ? MASTER_DOWN_EVENT :
(prev & SERVER_SLAVE) ? SLAVE_DOWN_EVENT :
(prev & SERVER_JOINED) ? SYNCED_DOWN_EVENT :
(prev & SERVER_NDB) ? NDB_DOWN_EVENT :
SERVER_DOWN_EVENT;
case LOSS_EVENT:
return (prev & SERVER_MASTER) ? LOST_MASTER_EVENT :
(prev & SERVER_SLAVE) ? LOST_SLAVE_EVENT :
(prev & SERVER_JOINED) ? LOST_SYNCED_EVENT :
LOST_NDB_EVENT;
case NEW_EVENT:
return (present & SERVER_MASTER) ? NEW_MASTER_EVENT :
(present & SERVER_SLAVE) ? NEW_SLAVE_EVENT :
(present & SERVER_JOINED) ? NEW_SYNCED_EVENT :
NEW_NDB_EVENT;
default:
return UNDEFINED_MONITOR_EVENT;
}
}
/*
* Given a monitor event (enum) provide a text string equivalent
* @param node The monitor server data whose event is wanted
* @result string The name of the monitor event for the server
*/
const char*
mon_get_event_name(MONITOR_SERVERS* node)
{
return monitor_event_definitions[mon_get_event_type(node)].name;
}
/*
* Given the text version of a monitor event, determine the event (enum)
*
* @param event_name String containing the event name
* @result monitor_event_t Monitor event corresponding to name
*/
monitor_event_t
mon_name_to_event(const char *event_name)
{
monitor_event_t event;
for (event = 0; event < MAX_MONITOR_EVENT; event++)
{
if (0 == strcasecmp(monitor_event_definitions[event].name, event_name))
{
return event;
}
}
return UNDEFINED_MONITOR_EVENT;
}
/**
* Create a list of running servers
*
* @param servers Monitored servers
* @param dest Destination where the string is appended, must be null terminated
* @param len Length of @c dest
*/
void
mon_append_node_names(MONITOR_SERVERS* servers, char* dest, int len)
{
char *separator = "";
char arr[MAX_SERVER_NAME_LEN + 32]; // Some extra space for port
while (servers && strlen(dest) < (len - strlen(separator)))
{
if (SERVER_IS_RUNNING(servers->server))
{
strcat(dest, separator);
separator = ",";
snprintf(arr, sizeof(arr), "%s:%d", servers->server->name, servers->server->port);
strncat(dest, arr, len - strlen(dest) - 1);
}
servers = servers->next;
}
}
/**
* Check if current monitored server status has changed
*
* @param mon_srv The monitored server
* @return true if status has changed or false
*/
bool
mon_status_changed(MONITOR_SERVERS* mon_srv)
{
/* Previous status is -1 if not yet set */
return (mon_srv->mon_prev_status != -1
&& mon_srv->mon_prev_status != mon_srv->server->status);
}
/**
* Check if current monitored server has a loggable failure status
*
* @param mon_srv The monitored server
* @return true if failed status can be logged or false
*/
bool
mon_print_fail_status(MONITOR_SERVERS* mon_srv)
{
return (SERVER_IS_DOWN(mon_srv->server) && mon_srv->mon_err_count == 0);
}
/**
* Launch a script
* @param mon Owning monitor
* @param ptr The server which has changed state
* @param script Script to execute
*/
void
monitor_launch_script(MONITOR* mon, MONITOR_SERVERS* ptr, char* script)
{
char nodelist[PATH_MAX + MON_ARG_MAX + 1] = {'\0'};
char initiator[strlen(ptr->server->name) + 24]; // Extra space for port
snprintf(initiator, sizeof(initiator), "%s:%d", ptr->server->name, ptr->server->port);
mon_append_node_names(mon->databases, nodelist, PATH_MAX + MON_ARG_MAX);
EXTERNCMD* cmd = externcmd_allocate(script);
if (cmd == NULL)
{
MXS_ERROR("Failed to initialize script '%s'. See previous errors for the "
"cause of this failure.", script);
return;
}
externcmd_substitute_arg(cmd, "[$]INITIATOR", initiator);
externcmd_substitute_arg(cmd, "[$]EVENT", mon_get_event_name(ptr));
externcmd_substitute_arg(cmd, "[$]NODELIST", nodelist);
if (externcmd_execute(cmd))
{
MXS_ERROR("Failed to execute script '%s' on server state change event %s.",
script, mon_get_event_name(ptr));
}
externcmd_free(cmd);
}
/**
* Parse a string of event names to an array with enabled events.
* @param events Pointer to an array of boolean values
* @param count Size of the array
* @param string String to parse
* @return 0 on success. 1 when an error has occurred or an unexpected event was
* found.
*/
int
mon_parse_event_string(bool* events, size_t count, char* given_string)
{
char *tok, *saved, *string = strdup(given_string);
monitor_event_t event;
tok = strtok_r(string, ",| ", &saved);
if (tok == NULL)
{
free(string);
return -1;
}
while (tok)
{
event = mon_name_to_event(tok);
if (event == UNDEFINED_MONITOR_EVENT)
{
MXS_ERROR("Invalid event name %s", tok);
free(string);
return -1;
}
if (event < count)
{
events[event] = true;
tok = strtok_r(NULL, ",| ", &saved);
}
}
free(string);
return 0;
}
/**
* Connect to a database. This will always leave a valid database handle in the
* database->con pointer. This allows the user to call MySQL C API functions to
* find out the reason of the failure.
* @param mon Monitor
* @param database Monitored database
* @return MONITOR_CONN_OK if the connection is OK else the reason for the failure
*/
connect_result_t
mon_connect_to_db(MONITOR* mon, MONITOR_SERVERS *database)
{
connect_result_t rval = MONITOR_CONN_OK;
/** Return if the connection is OK */
if (database->con && mysql_ping(database->con) == 0)
{
return rval;
}
int connect_timeout = mon->connect_timeout;
int read_timeout = mon->read_timeout;
int write_timeout = mon->write_timeout;
char *uname = database->server->monuser ? database->server->monuser : mon->user;
char *passwd = database->server->monpw ? database->server->monpw : mon->password;
char *dpwd = decryptPassword(passwd);
if (database->con)
{
mysql_close(database->con);
}
database->con = mysql_init(NULL);
mysql_options(database->con, MYSQL_OPT_CONNECT_TIMEOUT, (void *) &connect_timeout);
mysql_options(database->con, MYSQL_OPT_READ_TIMEOUT, (void *) &read_timeout);
mysql_options(database->con, MYSQL_OPT_WRITE_TIMEOUT, (void *) &write_timeout);
time_t start = time(NULL);
bool result = (mysql_real_connect(database->con,
database->server->name,
uname,
dpwd,
NULL,
database->server->port,
NULL,
0) != NULL);
time_t end = time(NULL);
if (!result)
{
if ((int) difftime(end, start) >= connect_timeout)
{
rval = MONITOR_CONN_TIMEOUT;
}
else
{
rval = MONITOR_CONN_REFUSED;
}
}
free(dpwd);
return rval;
}
/**
* Log an error about the failure to connect to a backend server
* and why it happened.
* @param database Backend database
* @param rval Return value of mon_connect_to_db
*/
void
mon_log_connect_error(MONITOR_SERVERS* database, connect_result_t rval)
{
MXS_ERROR(rval == MONITOR_CONN_TIMEOUT ?
"Monitor timed out when connecting to "
"server %s:%d : \"%s\"" :
"Monitor was unable to connect to "
"server %s:%d : \"%s\"",
database->server->name,
database->server->port,
mysql_error(database->con));
}