MySQL monitor detects broken replication and a new option 'detect_stale_master' has been added.

- Monitor automatically builds the replication topology tree if at least Slave_IO thread is running in slaves and the master server will be available.

- If the replication is stopped or configuration removed the previos master server may be still available with detect_stale_master=1
	The SERVER_STALE_STATUS bit is added to the server->status field

Monitor diagnostic routine print all new status descriptions
This commit is contained in:
MassimilianoPinto 2014-09-04 09:29:31 +02:00
commit f20a224801
9 changed files with 129 additions and 33 deletions

View File

@ -34,6 +34,7 @@
* 29/05/14 Mark Riddoch Addition of filter definition
* 23/05/14 Massimiliano Pinto Added automatic set of maxscale-id: first listening ipv4_raw + port + pid
* 28/05/14 Massimiliano Pinto Added detect_replication_lag parameter
* 28/08/14 Massimiliano Pinto Added detect_stale_master parameter
*
* @endverbatim
*/
@ -650,6 +651,7 @@ int error_count = 0;
char *passwd;
unsigned long interval = 0;
int replication_heartbeat = 0;
int detect_stale_master = 0;
module = config_get_value(obj->parameters, "module");
servers = config_get_value(obj->parameters, "servers");
@ -663,6 +665,10 @@ int error_count = 0;
replication_heartbeat = atoi(config_get_value(obj->parameters, "detect_replication_lag"));
}
if (config_get_value(obj->parameters, "detect_stale_master")) {
detect_stale_master = atoi(config_get_value(obj->parameters, "detect_stale_master"));
}
if (module)
{
obj->element = monitor_alloc(obj->object, module);
@ -686,6 +692,10 @@ int error_count = 0;
if(replication_heartbeat == 1)
monitorSetReplicationHeartbeat(obj->element, replication_heartbeat);
/* detect stale master */
if(detect_stale_master == 1)
monitorDetectStaleMaster(obj->element, detect_stale_master);
/* get the servers to monitor */
s = strtok(servers, ",");
while (s)
@ -1346,6 +1356,7 @@ static char *monitor_params[] =
"passwd",
"monitor_interval",
"detect_replication_lag",
"detect_stale_master",
NULL
};
/**

View File

@ -207,8 +207,7 @@ MONITOR *ptr;
/**
* Show a single monitor
*
* @param dcb DCB for printing output
* @param monitor The monitor to print information regarding
* @param dcb DCB for printing output
*/
void
monitorShow(DCB *dcb, MONITOR *monitor)
@ -304,12 +303,26 @@ monitorSetInterval (MONITOR *mon, unsigned long interval)
* Enable Replication Heartbeat support in monitor.
*
* @param mon The monitor instance
* @param replication_heartbeat The replication heartbeat
* @param enable The enabling value is 1, 0 turns it off
*/
void
monitorSetReplicationHeartbeat(MONITOR *mon, int replication_heartbeat)
monitorSetReplicationHeartbeat(MONITOR *mon, int enable)
{
if (mon->module->replicationHeartbeat != NULL) {
mon->module->replicationHeartbeat(mon->handle, replication_heartbeat);
mon->module->replicationHeartbeat(mon->handle, enable);
}
}
/**
* Enable Stale Master assignement.
*
* @param mon The monitor instance
* @param enable The enabling value is 1, 0 turns it off
*/
void
monitorDetectStaleMaster(MONITOR *mon, int enable)
{
if (mon->module->detectStaleMaster != NULL) {
mon->module->detectStaleMaster(mon->handle, enable);
}
}

View File

@ -30,6 +30,7 @@
* 28/05/14 Massimiliano Pinto Addition of rlagd and node_ts fields
* 20/06/14 Massimiliano Pinto Addition of master_id, depth, slaves fields
* 26/06/14 Mark Riddoch Addition of server parameters
* 30/08/14 Massimiliano Pinto Addition of new service status description
*
* @endverbatim
*/
@ -148,7 +149,8 @@ server_set_unique_name(SERVER *server, char *name)
* Find an existing server using the unique section name in
* configuration file
*
* @param name The Server name defined in the header file
* @param servname The Server name or address
* @param port The server port
* @return The server or NULL if not found
*/
SERVER *
@ -405,7 +407,7 @@ server_status(SERVER *server)
{
char *status = NULL;
if ((status = (char *)malloc(200)) == NULL)
if ((status = (char *)malloc(256)) == NULL)
return NULL;
status[0] = 0;
if (server->status & SERVER_MAINT)
@ -418,6 +420,10 @@ char *status = NULL;
strcat(status, "Synced, ");
if (server->status & SERVER_NDB)
strcat(status, "NDB, ");
if (server->status & SERVER_SLAVE_OF_EXTERNAL_MASTER)
strcat(status, "Slave of External Server, ");
if (server->status & SERVER_STALE_STATUS)
strcat(status, "Stale Status, ");
if (server->status & SERVER_RUNNING)
strcat(status, "Running");
else

View File

@ -31,6 +31,8 @@
* 25/07/13 Mark Riddoch Addition of diagnotics
* 23/05/14 Mark Riddoch Addition of routine to find monitors by name
* 23/05/14 Massimiliano Pinto Addition of defaultId and setInterval
* 23/06/14 Massimiliano Pinto Addition of replicationHeartbeat
* 28/08/14 Massimiliano Pinto Addition of detectStaleMaster
*
* @endverbatim
*/
@ -70,6 +72,7 @@ typedef struct {
void (*setInterval)(void *, unsigned long);
void (*defaultId)(void *, unsigned long);
void (*replicationHeartbeat)(void *, int);
void (*detectStaleMaster)(void *, int);
} MONITOR_OBJECT;
/**
@ -110,4 +113,5 @@ extern void monitorList(DCB *);
extern void monitorSetId(MONITOR *, unsigned long);
extern void monitorSetInterval (MONITOR *, unsigned long);
extern void monitorSetReplicationHeartbeat(MONITOR *, int);
extern void monitorDetectStaleMaster(MONITOR *, int);
#endif

View File

@ -39,6 +39,7 @@
* 20/06/14 Massimiliano Pinto Addition of master_id, depth, slaves fields
* 26/06/14 Mark Riddoch Adidtion of server parameters
* 30/07/14 Massimiliano Pinto Addition of NDB status for MySQL Cluster
* 30/08/14 Massimiliano Pinto Addition of SERVER_STALE_STATUS
*
* @endverbatim
*/
@ -103,6 +104,7 @@ typedef struct server {
#define SERVER_NDB 0x0010 /**<< The server is part of a MySQL cluster setup */
#define SERVER_MAINT 0x1000 /**<< Server is in maintenance mode */
#define SERVER_SLAVE_OF_EXTERNAL_MASTER 0x0080 /**<< Server is slave of a Master outside the provided replication topology */
#define SERVER_STALE_STATUS 0x2000 /**<< Server stale status, monitor didn't update it */
/**
* Is the server running - the macro returns true if the server is marked as running

View File

@ -69,7 +69,7 @@ static void defaultUsers(void *, char *, char *);
static void diagnostics(DCB *, void *);
static void setInterval(void *, unsigned long);
static MONITOR_OBJECT MyObject = { startMonitor, stopMonitor, registerServer, unregisterServer, defaultUsers, diagnostics, setInterval, NULL, NULL };
static MONITOR_OBJECT MyObject = { startMonitor, stopMonitor, registerServer, unregisterServer, defaultUsers, diagnostics, setInterval, NULL, NULL, NULL };
/**
* Implementation of the mandatory version entry point

View File

@ -40,6 +40,9 @@
* the status to update in server status field before
* starting the replication consistency check.
* This will also give routers a consistent "status" of all servers
* 28/08/14 Massimiliano Pinto Added detectStaleMaster feature: previous detected master will be used again, even if the replication is stopped.
* This means both IO and SQL threads are not working on slaves.
* This option is not enabled by default.
*
* @endverbatim
*/
@ -62,7 +65,7 @@ extern int lm_enabled_logfiles_bitmask;
static void monitorMain(void *);
static char *version_str = "V1.2.0";
static char *version_str = "V1.3.0";
MODULE_INFO info = {
MODULE_API_MONITOR,
@ -80,6 +83,7 @@ static void diagnostics(DCB *, void *);
static void setInterval(void *, unsigned long);
static void defaultId(void *, unsigned long);
static void replicationHeartbeat(void *, int);
static void detectStaleMaster(void *, int);
static bool mon_status_changed(MONITOR_SERVERS* mon_srv);
static bool mon_print_fail_status(MONITOR_SERVERS* mon_srv);
static MONITOR_SERVERS *getServerByNodeId(MONITOR_SERVERS *, long);
@ -91,7 +95,7 @@ static int add_slave_to_master(long *, int, long);
static void monitor_set_pending_status(MONITOR_SERVERS *, int);
static void monitor_clear_pending_status(MONITOR_SERVERS *, int);
static MONITOR_OBJECT MyObject = { startMonitor, stopMonitor, registerServer, unregisterServer, defaultUser, diagnostics, setInterval, defaultId, replicationHeartbeat };
static MONITOR_OBJECT MyObject = { startMonitor, stopMonitor, registerServer, unregisterServer, defaultUser, diagnostics, setInterval, defaultId, replicationHeartbeat, detectStaleMaster };
/**
* Implementation of the mandatory version entry point
@ -160,6 +164,7 @@ MYSQL_MONITOR *handle;
handle->id = MONITOR_DEFAULT_ID;
handle->interval = MONITOR_INTERVAL;
handle->replicationHeartbeat = 0;
handle->detectStaleMaster = 0;
handle->master = NULL;
spinlock_init(&handle->lock);
}
@ -306,6 +311,7 @@ char *sep;
dcb_printf(dcb,"\tSampling interval:\t%lu milliseconds\n", handle->interval);
dcb_printf(dcb,"\tMaxScale MonitorId:\t%lu\n", handle->id);
dcb_printf(dcb,"\tReplication lag:\t%s\n", (handle->replicationHeartbeat == 1) ? "enabled" : "disabled");
dcb_printf(dcb,"\tDetect Stale Master:\t%s\n", (handle->detectStaleMaster == 1) ? "enabled" : "disabled");
dcb_printf(dcb, "\tMonitored servers: ");
db = handle->databases;
@ -403,6 +409,12 @@ char *server_string;
monitor_clear_pending_status(database, SERVER_SLAVE);
monitor_clear_pending_status(database, SERVER_MASTER);
/* Clean addition status too */
server_clear_status(database->server, SERVER_SLAVE_OF_EXTERNAL_MASTER);
server_clear_status(database->server, SERVER_STALE_STATUS);
monitor_clear_pending_status(database, SERVER_SLAVE_OF_EXTERNAL_MASTER);
monitor_clear_pending_status(database, SERVER_STALE_STATUS);
return;
}
free(dpwd);
@ -458,12 +470,20 @@ char *server_string;
if (strncmp(row[12], "Yes", 3) == 0
&& strncmp(row[13], "Yes", 3) == 0) {
isslave += 1;
}
/* If Slave_IO_Running = Yes, assign the master_id to current server: this allows building
* the replication tree, slaves ids will be added to master(s) and we will have at least the
* root master server.
* Please note, there could be no slaves at all if Slave_SQL_Running == 'No'
*/
if (strncmp(row[12], "Yes", 3) == 0) {
/* get Master_Server_Id values */
master_id = atol(row[41]);
if (master_id == 0)
master_id = -1;
}
i++;
}
/* store master_id of current node */
@ -489,7 +509,14 @@ char *server_string;
if (strncmp(row[10], "Yes", 3) == 0
&& strncmp(row[11], "Yes", 3) == 0) {
isslave = 1;
}
/* If Slave_IO_Running = Yes, assign the master_id to current server: this allows building
* the replication tree, slaves ids will be added to master(s) and we will have at least the
* root master server.
* Please note, there could be no slaves at all if Slave_SQL_Running == 'No'
*/
if (strncmp(row[10], "Yes", 3) == 0) {
/* get Master_Server_Id values */
master_id = atol(row[39]);
if (master_id == 0)
@ -505,6 +532,7 @@ char *server_string;
/* Remove addition info */
monitor_clear_pending_status(database, SERVER_SLAVE_OF_EXTERNAL_MASTER);
monitor_clear_pending_status(database, SERVER_STALE_STATUS);
/* Please note, the MASTER status and SERVER_SLAVE_OF_EXTERNAL_MASTER
* will be assigned in the monitorMain() via get_replication_tree() routine
@ -534,6 +562,7 @@ monitorMain(void *arg)
MYSQL_MONITOR *handle = (MYSQL_MONITOR *)arg;
MONITOR_SERVERS *ptr;
int replication_heartbeat = handle->replicationHeartbeat;
int detect_stale_master = handle->detectStaleMaster;
int num_servers=0;
MONITOR_SERVERS *root_master;
@ -545,6 +574,7 @@ MONITOR_SERVERS *root_master;
"module. Exiting.\n")));
return;
}
handle->status = MONITOR_RUNNING;
while (1)
{
@ -616,10 +646,19 @@ MONITOR_SERVERS *root_master;
while (ptr)
{
if (! SERVER_IN_MAINT(ptr->server)) {
ptr->server->status = ptr->pending_status;
/* If "detect_stale_master" option is On, let's use the previus master */
if (detect_stale_master && root_master && (!strcmp(ptr->server->name, root_master->server->name) && ptr->server->port == root_master->server->port) && (ptr->server->status & SERVER_MASTER) && !(ptr->pending_status & SERVER_MASTER)) {
/* in this case server->status will not be updated from pending_status */
LOGIF(LM, (skygw_log_write_flush(
LOGFILE_MESSAGE, "[mysql_mon]: root server [%s:%i] is no longer Master, let's use it again even if it could be a stale master, you have been warned!", ptr->server->name, ptr->server->port)));
/* Set the STALE bit for this server in server struct */
server_set_status(ptr->server, SERVER_STALE_STATUS);
} else {
ptr->server->status = ptr->pending_status;
}
}
ptr = ptr->next;
}
ptr = ptr->next;
}
/* Do now the heartbeat replication set/get for MySQL Replication Consistency */
if (replication_heartbeat && root_master && (SERVER_IS_MASTER(root_master->server) || SERVER_IS_RELAY_SERVER(root_master->server))) {
@ -665,19 +704,34 @@ setInterval(void *arg, unsigned long interval)
{
MYSQL_MONITOR *handle = (MYSQL_MONITOR *)arg;
memcpy(&handle->interval, &interval, sizeof(unsigned long));
}
}
/**
* Enable/Disable the MySQL Replication hearbeat, detecting slave lag behind master.
*
* @param arg The handle allocated by startMonitor
* @param replicationHeartbeat To enable it 1, disable it with 0
* @param arg The handle allocated by startMonitor
* @param enable To enable it 1, disable it with 0
*/
static void
replicationHeartbeat(void *arg, int replicationHeartbeat)
replicationHeartbeat(void *arg, int enable)
{
MYSQL_MONITOR *handle = (MYSQL_MONITOR *)arg;
memcpy(&handle->replicationHeartbeat, &replicationHeartbeat, sizeof(int));
memcpy(&handle->replicationHeartbeat, &enable, sizeof(int));
}
/**
* Enable/Disable the MySQL Replication Stale Master dectection, allowing a previouvsly detected master to still act as a Master.
* This option must be enabled in order to keep the Master when the replication is stopped or removed from slaves.
* If the replication is still stopped when MaxSclale is restarted no Master will be available.
*
* @param arg The handle allocated by startMonitor
* @param enable To enable it 1, disable it with 0
*/
static void
detectStaleMaster(void *arg, int enable)
{
MYSQL_MONITOR *handle = (MYSQL_MONITOR *)arg;
memcpy(&handle->detectStaleMaster, &enable, sizeof(int));
}
static bool mon_status_changed(
@ -1038,6 +1092,10 @@ static MONITOR_SERVERS *get_replication_tree(MYSQL_MONITOR *handle, int num_serv
monitor_set_pending_status(master, SERVER_MASTER);
} else {
if (current->master_id > 0) {
/* this server is slave of another server not in MaxScale configuration
* we cannot use it as a real slave.
*/
monitor_clear_pending_status(ptr, SERVER_SLAVE);
monitor_set_pending_status(ptr, SERVER_SLAVE_OF_EXTERNAL_MASTER);
}
}

View File

@ -32,6 +32,7 @@
* 26/05/14 Massimiliano Pinto Default values for MONITOR_INTERVAL
* 28/05/14 Massimiliano Pinto Addition of new fields in MYSQL_MONITOR struct
* 24/06/14 Massimiliano Pinto Addition of master field in MYSQL_MONITOR struct and MONITOR_MAX_NUM_SLAVES
* 28/08/14 Massimiliano Pinto Addition of detectStaleMaster
*
* @endverbatim
*/
@ -43,9 +44,9 @@
typedef struct monitor_servers {
SERVER *server; /**< The server being monitored */
MYSQL *con; /**< The MySQL connection */
int mon_err_count;
unsigned int mon_prev_status;
unsigned int pending_status; /**< Pending Status flag bitmap */
int mon_err_count;
unsigned int mon_prev_status;
unsigned int pending_status; /**< Pending Status flag bitmap */
struct monitor_servers
*next; /**< The next server in the list */
} MONITOR_SERVERS;
@ -54,17 +55,18 @@ typedef struct monitor_servers {
* The handle for an instance of a MySQL Monitor module
*/
typedef struct {
SPINLOCK lock; /**< The monitor spinlock */
pthread_t tid; /**< id of monitor thread */
int shutdown; /**< Flag to shutdown the monitor thread */
int status; /**< Monitor status */
char *defaultUser; /**< Default username for monitoring */
char *defaultPasswd; /**< Default password for monitoring */
unsigned long interval; /**< Monitor sampling interval */
unsigned long id; /**< Monitor ID */
SPINLOCK lock; /**< The monitor spinlock */
pthread_t tid; /**< id of monitor thread */
int shutdown; /**< Flag to shutdown the monitor thread */
int status; /**< Monitor status */
char *defaultUser; /**< Default username for monitoring */
char *defaultPasswd; /**< Default password for monitoring */
unsigned long interval; /**< Monitor sampling interval */
unsigned long id; /**< Monitor ID */
int replicationHeartbeat; /**< Monitor flag for MySQL replication heartbeat */
MONITOR_SERVERS *master; /**< Master server for MySQL Master/Slave replication */
MONITOR_SERVERS *databases; /**< Linked list of servers to monitor */
int detectStaleMaster; /**< Monitor flag for MySQL replication Stale Master detection */
MONITOR_SERVERS *master; /**< Master server for MySQL Master/Slave replication */
MONITOR_SERVERS *databases; /**< Linked list of servers to monitor */
} MYSQL_MONITOR;
#define MONITOR_RUNNING 1

View File

@ -63,7 +63,7 @@ static void defaultUsers(void *, char *, char *);
static void diagnostics(DCB *, void *);
static void setInterval(void *, unsigned long);
static MONITOR_OBJECT MyObject = { startMonitor, stopMonitor, registerServer, unregisterServer, defaultUsers, diagnostics, setInterval, NULL, NULL };
static MONITOR_OBJECT MyObject = { startMonitor, stopMonitor, registerServer, unregisterServer, defaultUsers, diagnostics, setInterval, NULL, NULL, NULL };
/**
* Implementation of the mandatory version entry point