MySQL monitor detects broken replication and a new option 'detect_stale_master' has been added.
- Monitor automatically builds the replication topology tree if at least Slave_IO thread is running in slaves and the master server will be available. - If the replication is stopped or configuration removed the previos master server may be still available with detect_stale_master=1 The SERVER_STALE_STATUS bit is added to the server->status field Monitor diagnostic routine print all new status descriptions
This commit is contained in:
commit
f20a224801
@ -34,6 +34,7 @@
|
||||
* 29/05/14 Mark Riddoch Addition of filter definition
|
||||
* 23/05/14 Massimiliano Pinto Added automatic set of maxscale-id: first listening ipv4_raw + port + pid
|
||||
* 28/05/14 Massimiliano Pinto Added detect_replication_lag parameter
|
||||
* 28/08/14 Massimiliano Pinto Added detect_stale_master parameter
|
||||
*
|
||||
* @endverbatim
|
||||
*/
|
||||
@ -650,6 +651,7 @@ int error_count = 0;
|
||||
char *passwd;
|
||||
unsigned long interval = 0;
|
||||
int replication_heartbeat = 0;
|
||||
int detect_stale_master = 0;
|
||||
|
||||
module = config_get_value(obj->parameters, "module");
|
||||
servers = config_get_value(obj->parameters, "servers");
|
||||
@ -663,6 +665,10 @@ int error_count = 0;
|
||||
replication_heartbeat = atoi(config_get_value(obj->parameters, "detect_replication_lag"));
|
||||
}
|
||||
|
||||
if (config_get_value(obj->parameters, "detect_stale_master")) {
|
||||
detect_stale_master = atoi(config_get_value(obj->parameters, "detect_stale_master"));
|
||||
}
|
||||
|
||||
if (module)
|
||||
{
|
||||
obj->element = monitor_alloc(obj->object, module);
|
||||
@ -686,6 +692,10 @@ int error_count = 0;
|
||||
if(replication_heartbeat == 1)
|
||||
monitorSetReplicationHeartbeat(obj->element, replication_heartbeat);
|
||||
|
||||
/* detect stale master */
|
||||
if(detect_stale_master == 1)
|
||||
monitorDetectStaleMaster(obj->element, detect_stale_master);
|
||||
|
||||
/* get the servers to monitor */
|
||||
s = strtok(servers, ",");
|
||||
while (s)
|
||||
@ -1346,6 +1356,7 @@ static char *monitor_params[] =
|
||||
"passwd",
|
||||
"monitor_interval",
|
||||
"detect_replication_lag",
|
||||
"detect_stale_master",
|
||||
NULL
|
||||
};
|
||||
/**
|
||||
|
@ -207,8 +207,7 @@ MONITOR *ptr;
|
||||
/**
|
||||
* Show a single monitor
|
||||
*
|
||||
* @param dcb DCB for printing output
|
||||
* @param monitor The monitor to print information regarding
|
||||
* @param dcb DCB for printing output
|
||||
*/
|
||||
void
|
||||
monitorShow(DCB *dcb, MONITOR *monitor)
|
||||
@ -304,12 +303,26 @@ monitorSetInterval (MONITOR *mon, unsigned long interval)
|
||||
* Enable Replication Heartbeat support in monitor.
|
||||
*
|
||||
* @param mon The monitor instance
|
||||
* @param replication_heartbeat The replication heartbeat
|
||||
* @param enable The enabling value is 1, 0 turns it off
|
||||
*/
|
||||
void
|
||||
monitorSetReplicationHeartbeat(MONITOR *mon, int replication_heartbeat)
|
||||
monitorSetReplicationHeartbeat(MONITOR *mon, int enable)
|
||||
{
|
||||
if (mon->module->replicationHeartbeat != NULL) {
|
||||
mon->module->replicationHeartbeat(mon->handle, replication_heartbeat);
|
||||
mon->module->replicationHeartbeat(mon->handle, enable);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Enable Stale Master assignement.
|
||||
*
|
||||
* @param mon The monitor instance
|
||||
* @param enable The enabling value is 1, 0 turns it off
|
||||
*/
|
||||
void
|
||||
monitorDetectStaleMaster(MONITOR *mon, int enable)
|
||||
{
|
||||
if (mon->module->detectStaleMaster != NULL) {
|
||||
mon->module->detectStaleMaster(mon->handle, enable);
|
||||
}
|
||||
}
|
||||
|
@ -30,6 +30,7 @@
|
||||
* 28/05/14 Massimiliano Pinto Addition of rlagd and node_ts fields
|
||||
* 20/06/14 Massimiliano Pinto Addition of master_id, depth, slaves fields
|
||||
* 26/06/14 Mark Riddoch Addition of server parameters
|
||||
* 30/08/14 Massimiliano Pinto Addition of new service status description
|
||||
*
|
||||
* @endverbatim
|
||||
*/
|
||||
@ -148,7 +149,8 @@ server_set_unique_name(SERVER *server, char *name)
|
||||
* Find an existing server using the unique section name in
|
||||
* configuration file
|
||||
*
|
||||
* @param name The Server name defined in the header file
|
||||
* @param servname The Server name or address
|
||||
* @param port The server port
|
||||
* @return The server or NULL if not found
|
||||
*/
|
||||
SERVER *
|
||||
@ -405,7 +407,7 @@ server_status(SERVER *server)
|
||||
{
|
||||
char *status = NULL;
|
||||
|
||||
if ((status = (char *)malloc(200)) == NULL)
|
||||
if ((status = (char *)malloc(256)) == NULL)
|
||||
return NULL;
|
||||
status[0] = 0;
|
||||
if (server->status & SERVER_MAINT)
|
||||
@ -418,6 +420,10 @@ char *status = NULL;
|
||||
strcat(status, "Synced, ");
|
||||
if (server->status & SERVER_NDB)
|
||||
strcat(status, "NDB, ");
|
||||
if (server->status & SERVER_SLAVE_OF_EXTERNAL_MASTER)
|
||||
strcat(status, "Slave of External Server, ");
|
||||
if (server->status & SERVER_STALE_STATUS)
|
||||
strcat(status, "Stale Status, ");
|
||||
if (server->status & SERVER_RUNNING)
|
||||
strcat(status, "Running");
|
||||
else
|
||||
|
@ -31,6 +31,8 @@
|
||||
* 25/07/13 Mark Riddoch Addition of diagnotics
|
||||
* 23/05/14 Mark Riddoch Addition of routine to find monitors by name
|
||||
* 23/05/14 Massimiliano Pinto Addition of defaultId and setInterval
|
||||
* 23/06/14 Massimiliano Pinto Addition of replicationHeartbeat
|
||||
* 28/08/14 Massimiliano Pinto Addition of detectStaleMaster
|
||||
*
|
||||
* @endverbatim
|
||||
*/
|
||||
@ -70,6 +72,7 @@ typedef struct {
|
||||
void (*setInterval)(void *, unsigned long);
|
||||
void (*defaultId)(void *, unsigned long);
|
||||
void (*replicationHeartbeat)(void *, int);
|
||||
void (*detectStaleMaster)(void *, int);
|
||||
} MONITOR_OBJECT;
|
||||
|
||||
/**
|
||||
@ -110,4 +113,5 @@ extern void monitorList(DCB *);
|
||||
extern void monitorSetId(MONITOR *, unsigned long);
|
||||
extern void monitorSetInterval (MONITOR *, unsigned long);
|
||||
extern void monitorSetReplicationHeartbeat(MONITOR *, int);
|
||||
extern void monitorDetectStaleMaster(MONITOR *, int);
|
||||
#endif
|
||||
|
@ -39,6 +39,7 @@
|
||||
* 20/06/14 Massimiliano Pinto Addition of master_id, depth, slaves fields
|
||||
* 26/06/14 Mark Riddoch Adidtion of server parameters
|
||||
* 30/07/14 Massimiliano Pinto Addition of NDB status for MySQL Cluster
|
||||
* 30/08/14 Massimiliano Pinto Addition of SERVER_STALE_STATUS
|
||||
*
|
||||
* @endverbatim
|
||||
*/
|
||||
@ -103,6 +104,7 @@ typedef struct server {
|
||||
#define SERVER_NDB 0x0010 /**<< The server is part of a MySQL cluster setup */
|
||||
#define SERVER_MAINT 0x1000 /**<< Server is in maintenance mode */
|
||||
#define SERVER_SLAVE_OF_EXTERNAL_MASTER 0x0080 /**<< Server is slave of a Master outside the provided replication topology */
|
||||
#define SERVER_STALE_STATUS 0x2000 /**<< Server stale status, monitor didn't update it */
|
||||
|
||||
/**
|
||||
* Is the server running - the macro returns true if the server is marked as running
|
||||
|
@ -69,7 +69,7 @@ static void defaultUsers(void *, char *, char *);
|
||||
static void diagnostics(DCB *, void *);
|
||||
static void setInterval(void *, unsigned long);
|
||||
|
||||
static MONITOR_OBJECT MyObject = { startMonitor, stopMonitor, registerServer, unregisterServer, defaultUsers, diagnostics, setInterval, NULL, NULL };
|
||||
static MONITOR_OBJECT MyObject = { startMonitor, stopMonitor, registerServer, unregisterServer, defaultUsers, diagnostics, setInterval, NULL, NULL, NULL };
|
||||
|
||||
/**
|
||||
* Implementation of the mandatory version entry point
|
||||
|
@ -40,6 +40,9 @@
|
||||
* the status to update in server status field before
|
||||
* starting the replication consistency check.
|
||||
* This will also give routers a consistent "status" of all servers
|
||||
* 28/08/14 Massimiliano Pinto Added detectStaleMaster feature: previous detected master will be used again, even if the replication is stopped.
|
||||
* This means both IO and SQL threads are not working on slaves.
|
||||
* This option is not enabled by default.
|
||||
*
|
||||
* @endverbatim
|
||||
*/
|
||||
@ -62,7 +65,7 @@ extern int lm_enabled_logfiles_bitmask;
|
||||
|
||||
static void monitorMain(void *);
|
||||
|
||||
static char *version_str = "V1.2.0";
|
||||
static char *version_str = "V1.3.0";
|
||||
|
||||
MODULE_INFO info = {
|
||||
MODULE_API_MONITOR,
|
||||
@ -80,6 +83,7 @@ static void diagnostics(DCB *, void *);
|
||||
static void setInterval(void *, unsigned long);
|
||||
static void defaultId(void *, unsigned long);
|
||||
static void replicationHeartbeat(void *, int);
|
||||
static void detectStaleMaster(void *, int);
|
||||
static bool mon_status_changed(MONITOR_SERVERS* mon_srv);
|
||||
static bool mon_print_fail_status(MONITOR_SERVERS* mon_srv);
|
||||
static MONITOR_SERVERS *getServerByNodeId(MONITOR_SERVERS *, long);
|
||||
@ -91,7 +95,7 @@ static int add_slave_to_master(long *, int, long);
|
||||
static void monitor_set_pending_status(MONITOR_SERVERS *, int);
|
||||
static void monitor_clear_pending_status(MONITOR_SERVERS *, int);
|
||||
|
||||
static MONITOR_OBJECT MyObject = { startMonitor, stopMonitor, registerServer, unregisterServer, defaultUser, diagnostics, setInterval, defaultId, replicationHeartbeat };
|
||||
static MONITOR_OBJECT MyObject = { startMonitor, stopMonitor, registerServer, unregisterServer, defaultUser, diagnostics, setInterval, defaultId, replicationHeartbeat, detectStaleMaster };
|
||||
|
||||
/**
|
||||
* Implementation of the mandatory version entry point
|
||||
@ -160,6 +164,7 @@ MYSQL_MONITOR *handle;
|
||||
handle->id = MONITOR_DEFAULT_ID;
|
||||
handle->interval = MONITOR_INTERVAL;
|
||||
handle->replicationHeartbeat = 0;
|
||||
handle->detectStaleMaster = 0;
|
||||
handle->master = NULL;
|
||||
spinlock_init(&handle->lock);
|
||||
}
|
||||
@ -306,6 +311,7 @@ char *sep;
|
||||
dcb_printf(dcb,"\tSampling interval:\t%lu milliseconds\n", handle->interval);
|
||||
dcb_printf(dcb,"\tMaxScale MonitorId:\t%lu\n", handle->id);
|
||||
dcb_printf(dcb,"\tReplication lag:\t%s\n", (handle->replicationHeartbeat == 1) ? "enabled" : "disabled");
|
||||
dcb_printf(dcb,"\tDetect Stale Master:\t%s\n", (handle->detectStaleMaster == 1) ? "enabled" : "disabled");
|
||||
dcb_printf(dcb, "\tMonitored servers: ");
|
||||
|
||||
db = handle->databases;
|
||||
@ -403,6 +409,12 @@ char *server_string;
|
||||
monitor_clear_pending_status(database, SERVER_SLAVE);
|
||||
monitor_clear_pending_status(database, SERVER_MASTER);
|
||||
|
||||
/* Clean addition status too */
|
||||
server_clear_status(database->server, SERVER_SLAVE_OF_EXTERNAL_MASTER);
|
||||
server_clear_status(database->server, SERVER_STALE_STATUS);
|
||||
monitor_clear_pending_status(database, SERVER_SLAVE_OF_EXTERNAL_MASTER);
|
||||
monitor_clear_pending_status(database, SERVER_STALE_STATUS);
|
||||
|
||||
return;
|
||||
}
|
||||
free(dpwd);
|
||||
@ -458,12 +470,20 @@ char *server_string;
|
||||
if (strncmp(row[12], "Yes", 3) == 0
|
||||
&& strncmp(row[13], "Yes", 3) == 0) {
|
||||
isslave += 1;
|
||||
|
||||
}
|
||||
|
||||
/* If Slave_IO_Running = Yes, assign the master_id to current server: this allows building
|
||||
* the replication tree, slaves ids will be added to master(s) and we will have at least the
|
||||
* root master server.
|
||||
* Please note, there could be no slaves at all if Slave_SQL_Running == 'No'
|
||||
*/
|
||||
if (strncmp(row[12], "Yes", 3) == 0) {
|
||||
/* get Master_Server_Id values */
|
||||
master_id = atol(row[41]);
|
||||
if (master_id == 0)
|
||||
master_id = -1;
|
||||
}
|
||||
|
||||
i++;
|
||||
}
|
||||
/* store master_id of current node */
|
||||
@ -489,7 +509,14 @@ char *server_string;
|
||||
if (strncmp(row[10], "Yes", 3) == 0
|
||||
&& strncmp(row[11], "Yes", 3) == 0) {
|
||||
isslave = 1;
|
||||
}
|
||||
|
||||
/* If Slave_IO_Running = Yes, assign the master_id to current server: this allows building
|
||||
* the replication tree, slaves ids will be added to master(s) and we will have at least the
|
||||
* root master server.
|
||||
* Please note, there could be no slaves at all if Slave_SQL_Running == 'No'
|
||||
*/
|
||||
if (strncmp(row[10], "Yes", 3) == 0) {
|
||||
/* get Master_Server_Id values */
|
||||
master_id = atol(row[39]);
|
||||
if (master_id == 0)
|
||||
@ -505,6 +532,7 @@ char *server_string;
|
||||
|
||||
/* Remove addition info */
|
||||
monitor_clear_pending_status(database, SERVER_SLAVE_OF_EXTERNAL_MASTER);
|
||||
monitor_clear_pending_status(database, SERVER_STALE_STATUS);
|
||||
|
||||
/* Please note, the MASTER status and SERVER_SLAVE_OF_EXTERNAL_MASTER
|
||||
* will be assigned in the monitorMain() via get_replication_tree() routine
|
||||
@ -534,6 +562,7 @@ monitorMain(void *arg)
|
||||
MYSQL_MONITOR *handle = (MYSQL_MONITOR *)arg;
|
||||
MONITOR_SERVERS *ptr;
|
||||
int replication_heartbeat = handle->replicationHeartbeat;
|
||||
int detect_stale_master = handle->detectStaleMaster;
|
||||
int num_servers=0;
|
||||
MONITOR_SERVERS *root_master;
|
||||
|
||||
@ -545,6 +574,7 @@ MONITOR_SERVERS *root_master;
|
||||
"module. Exiting.\n")));
|
||||
return;
|
||||
}
|
||||
|
||||
handle->status = MONITOR_RUNNING;
|
||||
while (1)
|
||||
{
|
||||
@ -616,10 +646,19 @@ MONITOR_SERVERS *root_master;
|
||||
while (ptr)
|
||||
{
|
||||
if (! SERVER_IN_MAINT(ptr->server)) {
|
||||
ptr->server->status = ptr->pending_status;
|
||||
/* If "detect_stale_master" option is On, let's use the previus master */
|
||||
if (detect_stale_master && root_master && (!strcmp(ptr->server->name, root_master->server->name) && ptr->server->port == root_master->server->port) && (ptr->server->status & SERVER_MASTER) && !(ptr->pending_status & SERVER_MASTER)) {
|
||||
/* in this case server->status will not be updated from pending_status */
|
||||
LOGIF(LM, (skygw_log_write_flush(
|
||||
LOGFILE_MESSAGE, "[mysql_mon]: root server [%s:%i] is no longer Master, let's use it again even if it could be a stale master, you have been warned!", ptr->server->name, ptr->server->port)));
|
||||
/* Set the STALE bit for this server in server struct */
|
||||
server_set_status(ptr->server, SERVER_STALE_STATUS);
|
||||
} else {
|
||||
ptr->server->status = ptr->pending_status;
|
||||
}
|
||||
}
|
||||
ptr = ptr->next;
|
||||
}
|
||||
ptr = ptr->next;
|
||||
}
|
||||
|
||||
/* Do now the heartbeat replication set/get for MySQL Replication Consistency */
|
||||
if (replication_heartbeat && root_master && (SERVER_IS_MASTER(root_master->server) || SERVER_IS_RELAY_SERVER(root_master->server))) {
|
||||
@ -665,19 +704,34 @@ setInterval(void *arg, unsigned long interval)
|
||||
{
|
||||
MYSQL_MONITOR *handle = (MYSQL_MONITOR *)arg;
|
||||
memcpy(&handle->interval, &interval, sizeof(unsigned long));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Enable/Disable the MySQL Replication hearbeat, detecting slave lag behind master.
|
||||
*
|
||||
* @param arg The handle allocated by startMonitor
|
||||
* @param replicationHeartbeat To enable it 1, disable it with 0
|
||||
* @param arg The handle allocated by startMonitor
|
||||
* @param enable To enable it 1, disable it with 0
|
||||
*/
|
||||
static void
|
||||
replicationHeartbeat(void *arg, int replicationHeartbeat)
|
||||
replicationHeartbeat(void *arg, int enable)
|
||||
{
|
||||
MYSQL_MONITOR *handle = (MYSQL_MONITOR *)arg;
|
||||
memcpy(&handle->replicationHeartbeat, &replicationHeartbeat, sizeof(int));
|
||||
memcpy(&handle->replicationHeartbeat, &enable, sizeof(int));
|
||||
}
|
||||
|
||||
/**
|
||||
* Enable/Disable the MySQL Replication Stale Master dectection, allowing a previouvsly detected master to still act as a Master.
|
||||
* This option must be enabled in order to keep the Master when the replication is stopped or removed from slaves.
|
||||
* If the replication is still stopped when MaxSclale is restarted no Master will be available.
|
||||
*
|
||||
* @param arg The handle allocated by startMonitor
|
||||
* @param enable To enable it 1, disable it with 0
|
||||
*/
|
||||
static void
|
||||
detectStaleMaster(void *arg, int enable)
|
||||
{
|
||||
MYSQL_MONITOR *handle = (MYSQL_MONITOR *)arg;
|
||||
memcpy(&handle->detectStaleMaster, &enable, sizeof(int));
|
||||
}
|
||||
|
||||
static bool mon_status_changed(
|
||||
@ -1038,6 +1092,10 @@ static MONITOR_SERVERS *get_replication_tree(MYSQL_MONITOR *handle, int num_serv
|
||||
monitor_set_pending_status(master, SERVER_MASTER);
|
||||
} else {
|
||||
if (current->master_id > 0) {
|
||||
/* this server is slave of another server not in MaxScale configuration
|
||||
* we cannot use it as a real slave.
|
||||
*/
|
||||
monitor_clear_pending_status(ptr, SERVER_SLAVE);
|
||||
monitor_set_pending_status(ptr, SERVER_SLAVE_OF_EXTERNAL_MASTER);
|
||||
}
|
||||
}
|
||||
|
@ -32,6 +32,7 @@
|
||||
* 26/05/14 Massimiliano Pinto Default values for MONITOR_INTERVAL
|
||||
* 28/05/14 Massimiliano Pinto Addition of new fields in MYSQL_MONITOR struct
|
||||
* 24/06/14 Massimiliano Pinto Addition of master field in MYSQL_MONITOR struct and MONITOR_MAX_NUM_SLAVES
|
||||
* 28/08/14 Massimiliano Pinto Addition of detectStaleMaster
|
||||
*
|
||||
* @endverbatim
|
||||
*/
|
||||
@ -43,9 +44,9 @@
|
||||
typedef struct monitor_servers {
|
||||
SERVER *server; /**< The server being monitored */
|
||||
MYSQL *con; /**< The MySQL connection */
|
||||
int mon_err_count;
|
||||
unsigned int mon_prev_status;
|
||||
unsigned int pending_status; /**< Pending Status flag bitmap */
|
||||
int mon_err_count;
|
||||
unsigned int mon_prev_status;
|
||||
unsigned int pending_status; /**< Pending Status flag bitmap */
|
||||
struct monitor_servers
|
||||
*next; /**< The next server in the list */
|
||||
} MONITOR_SERVERS;
|
||||
@ -54,17 +55,18 @@ typedef struct monitor_servers {
|
||||
* The handle for an instance of a MySQL Monitor module
|
||||
*/
|
||||
typedef struct {
|
||||
SPINLOCK lock; /**< The monitor spinlock */
|
||||
pthread_t tid; /**< id of monitor thread */
|
||||
int shutdown; /**< Flag to shutdown the monitor thread */
|
||||
int status; /**< Monitor status */
|
||||
char *defaultUser; /**< Default username for monitoring */
|
||||
char *defaultPasswd; /**< Default password for monitoring */
|
||||
unsigned long interval; /**< Monitor sampling interval */
|
||||
unsigned long id; /**< Monitor ID */
|
||||
SPINLOCK lock; /**< The monitor spinlock */
|
||||
pthread_t tid; /**< id of monitor thread */
|
||||
int shutdown; /**< Flag to shutdown the monitor thread */
|
||||
int status; /**< Monitor status */
|
||||
char *defaultUser; /**< Default username for monitoring */
|
||||
char *defaultPasswd; /**< Default password for monitoring */
|
||||
unsigned long interval; /**< Monitor sampling interval */
|
||||
unsigned long id; /**< Monitor ID */
|
||||
int replicationHeartbeat; /**< Monitor flag for MySQL replication heartbeat */
|
||||
MONITOR_SERVERS *master; /**< Master server for MySQL Master/Slave replication */
|
||||
MONITOR_SERVERS *databases; /**< Linked list of servers to monitor */
|
||||
int detectStaleMaster; /**< Monitor flag for MySQL replication Stale Master detection */
|
||||
MONITOR_SERVERS *master; /**< Master server for MySQL Master/Slave replication */
|
||||
MONITOR_SERVERS *databases; /**< Linked list of servers to monitor */
|
||||
} MYSQL_MONITOR;
|
||||
|
||||
#define MONITOR_RUNNING 1
|
||||
|
@ -63,7 +63,7 @@ static void defaultUsers(void *, char *, char *);
|
||||
static void diagnostics(DCB *, void *);
|
||||
static void setInterval(void *, unsigned long);
|
||||
|
||||
static MONITOR_OBJECT MyObject = { startMonitor, stopMonitor, registerServer, unregisterServer, defaultUsers, diagnostics, setInterval, NULL, NULL };
|
||||
static MONITOR_OBJECT MyObject = { startMonitor, stopMonitor, registerServer, unregisterServer, defaultUsers, diagnostics, setInterval, NULL, NULL, NULL };
|
||||
|
||||
/**
|
||||
* Implementation of the mandatory version entry point
|
||||
|
Loading…
x
Reference in New Issue
Block a user