Fix for broken replication
Fix for broken replication has been added to mysql_monitor. Both Slave_IO and Slave_SQL threads must be running in order to assign the SERVER_SLAVE status but If only Slave_IO is running let’s assign the master_id to current server and continue building the replication tree; if no slaves at all the master will be still available. The “detect_stale_master” option has been added, its default is 0. If set to 1 the monitor will keep the last detected master even if the replication setup is completely not working, i.e. both Slave_IO and Slave_SQL threads are not running: this applies only to the server that was master before. After monitor or MaxScale are restarted and the replication is still stopped or not configured there will be no master because it’s not possible to compute the replication topology tree.
This commit is contained in:
parent
493feb49ba
commit
63d267e5ef
@ -34,6 +34,7 @@
|
||||
* 29/05/14 Mark Riddoch Addition of filter definition
|
||||
* 23/05/14 Massimiliano Pinto Added automatic set of maxscale-id: first listening ipv4_raw + port + pid
|
||||
* 28/05/14 Massimiliano Pinto Added detect_replication_lag parameter
|
||||
* 28/08/14 Massimiliano Pinto Added detect_stale_master parameter
|
||||
*
|
||||
* @endverbatim
|
||||
*/
|
||||
@ -650,6 +651,7 @@ int error_count = 0;
|
||||
char *passwd;
|
||||
unsigned long interval = 0;
|
||||
int replication_heartbeat = 0;
|
||||
int detect_stale_master = 0;
|
||||
|
||||
module = config_get_value(obj->parameters, "module");
|
||||
servers = config_get_value(obj->parameters, "servers");
|
||||
@ -663,6 +665,10 @@ int error_count = 0;
|
||||
replication_heartbeat = atoi(config_get_value(obj->parameters, "detect_replication_lag"));
|
||||
}
|
||||
|
||||
if (config_get_value(obj->parameters, "detect_stale_master")) {
|
||||
detect_stale_master = atoi(config_get_value(obj->parameters, "detect_stale_master"));
|
||||
}
|
||||
|
||||
if (module)
|
||||
{
|
||||
obj->element = monitor_alloc(obj->object, module);
|
||||
@ -686,6 +692,10 @@ int error_count = 0;
|
||||
if(replication_heartbeat == 1)
|
||||
monitorSetReplicationHeartbeat(obj->element, replication_heartbeat);
|
||||
|
||||
/* detect stale master */
|
||||
if(detect_stale_master == 1)
|
||||
monitorDetectStaleMaster(obj->element, detect_stale_master);
|
||||
|
||||
/* get the servers to monitor */
|
||||
s = strtok(servers, ",");
|
||||
while (s)
|
||||
@ -1346,6 +1356,7 @@ static char *monitor_params[] =
|
||||
"passwd",
|
||||
"monitor_interval",
|
||||
"detect_replication_lag",
|
||||
"detect_stale_master",
|
||||
NULL
|
||||
};
|
||||
/**
|
||||
|
@ -207,8 +207,7 @@ MONITOR *ptr;
|
||||
/**
|
||||
* Show a single monitor
|
||||
*
|
||||
* @param dcb DCB for printing output
|
||||
* @param monitor The monitor to print information regarding
|
||||
* @param dcb DCB for printing output
|
||||
*/
|
||||
void
|
||||
monitorShow(DCB *dcb, MONITOR *monitor)
|
||||
@ -304,12 +303,26 @@ monitorSetInterval (MONITOR *mon, unsigned long interval)
|
||||
* Enable Replication Heartbeat support in monitor.
|
||||
*
|
||||
* @param mon The monitor instance
|
||||
* @param replication_heartbeat The replication heartbeat
|
||||
* @param enable The enabling value is 1, 0 turns it off
|
||||
*/
|
||||
void
|
||||
monitorSetReplicationHeartbeat(MONITOR *mon, int replication_heartbeat)
|
||||
monitorSetReplicationHeartbeat(MONITOR *mon, int enable)
|
||||
{
|
||||
if (mon->module->replicationHeartbeat != NULL) {
|
||||
mon->module->replicationHeartbeat(mon->handle, replication_heartbeat);
|
||||
mon->module->replicationHeartbeat(mon->handle, enable);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Enable Stale Master assignement.
|
||||
*
|
||||
* @param mon The monitor instance
|
||||
* @param enable The enabling value is 1, 0 turns it off
|
||||
*/
|
||||
void
|
||||
monitorDetectStaleMaster(MONITOR *mon, int enable)
|
||||
{
|
||||
if (mon->module->detectStaleMaster != NULL) {
|
||||
mon->module->detectStaleMaster(mon->handle, enable);
|
||||
}
|
||||
}
|
||||
|
@ -148,7 +148,8 @@ server_set_unique_name(SERVER *server, char *name)
|
||||
* Find an existing server using the unique section name in
|
||||
* configuration file
|
||||
*
|
||||
* @param name The Server name defined in the header file
|
||||
* @param servname The Server name or address
|
||||
* @param port The server port
|
||||
* @return The server or NULL if not found
|
||||
*/
|
||||
SERVER *
|
||||
@ -405,7 +406,7 @@ server_status(SERVER *server)
|
||||
{
|
||||
char *status = NULL;
|
||||
|
||||
if ((status = (char *)malloc(200)) == NULL)
|
||||
if ((status = (char *)malloc(256)) == NULL)
|
||||
return NULL;
|
||||
status[0] = 0;
|
||||
if (server->status & SERVER_MAINT)
|
||||
@ -416,6 +417,10 @@ char *status = NULL;
|
||||
strcat(status, "Slave, ");
|
||||
if (server->status & SERVER_JOINED)
|
||||
strcat(status, "Synced, ");
|
||||
if (server->status & SERVER_SLAVE_OF_EXTERNAL_MASTER)
|
||||
strcat(status, "Slave of External Server, ");
|
||||
if (server->status & SERVER_STALE_STATUS)
|
||||
strcat(status, "Stale Status, ");
|
||||
if (server->status & SERVER_RUNNING)
|
||||
strcat(status, "Running");
|
||||
else
|
||||
|
@ -31,6 +31,8 @@
|
||||
* 25/07/13 Mark Riddoch Addition of diagnotics
|
||||
* 23/05/14 Mark Riddoch Addition of routine to find monitors by name
|
||||
* 23/05/14 Massimiliano Pinto Addition of defaultId and setInterval
|
||||
* 23/06/14 Massimiliano Pinto Addition of replicationHeartbeat
|
||||
* 28/08/14 Massimiliano Pinto Addition of detectStaleMaster
|
||||
*
|
||||
* @endverbatim
|
||||
*/
|
||||
@ -70,6 +72,7 @@ typedef struct {
|
||||
void (*setInterval)(void *, unsigned long);
|
||||
void (*defaultId)(void *, unsigned long);
|
||||
void (*replicationHeartbeat)(void *, int);
|
||||
void (*detectStaleMaster)(void *, int);
|
||||
} MONITOR_OBJECT;
|
||||
|
||||
/**
|
||||
@ -110,4 +113,5 @@ extern void monitorList(DCB *);
|
||||
extern void monitorSetId(MONITOR *, unsigned long);
|
||||
extern void monitorSetInterval (MONITOR *, unsigned long);
|
||||
extern void monitorSetReplicationHeartbeat(MONITOR *, int);
|
||||
extern void monitorDetectStaleMaster(MONITOR *, int);
|
||||
#endif
|
||||
|
@ -38,6 +38,7 @@
|
||||
* 03/06/14 Mark Riddoch Addition of maintainance mode
|
||||
* 20/06/14 Massimiliano Pinto Addition of master_id, depth, slaves fields
|
||||
* 26/06/14 Mark Riddoch Adidtion of server parameters
|
||||
* 30/08/14 Massimiliano Pinto Addition of SERVER_STALE_STATUS
|
||||
*
|
||||
* @endverbatim
|
||||
*/
|
||||
@ -101,6 +102,7 @@ typedef struct server {
|
||||
#define SERVER_JOINED 0x0008 /**<< The server is joined in a Galera cluster */
|
||||
#define SERVER_MAINT 0x1000 /**<< Server is in maintenance mode */
|
||||
#define SERVER_SLAVE_OF_EXTERNAL_MASTER 0x0080 /**<< Server is slave of a Master outside the provided replication topology */
|
||||
#define SERVER_STALE_STATUS 0x2000 /**<< Server stale status, monitor didn't update it */
|
||||
|
||||
/**
|
||||
* Is the server running - the macro returns true if the server is marked as running
|
||||
|
@ -69,7 +69,7 @@ static void defaultUsers(void *, char *, char *);
|
||||
static void diagnostics(DCB *, void *);
|
||||
static void setInterval(void *, unsigned long);
|
||||
|
||||
static MONITOR_OBJECT MyObject = { startMonitor, stopMonitor, registerServer, unregisterServer, defaultUsers, diagnostics, setInterval, NULL, NULL };
|
||||
static MONITOR_OBJECT MyObject = { startMonitor, stopMonitor, registerServer, unregisterServer, defaultUsers, diagnostics, setInterval, NULL, NULL, NULL };
|
||||
|
||||
/**
|
||||
* Implementation of the mandatory version entry point
|
||||
|
@ -40,6 +40,9 @@
|
||||
* the status to update in server status field before
|
||||
* starting the replication consistency check.
|
||||
* This will also give routers a consistent "status" of all servers
|
||||
* 28/08/14 Massimiliano Pinto Added detectStaleMaster feature: previous detected master will be used again, even if the replication is stopped.
|
||||
* This means both IO and SQL threads are not working on slaves.
|
||||
* This option is not enabled by default.
|
||||
*
|
||||
* @endverbatim
|
||||
*/
|
||||
@ -62,7 +65,7 @@ extern int lm_enabled_logfiles_bitmask;
|
||||
|
||||
static void monitorMain(void *);
|
||||
|
||||
static char *version_str = "V1.2.0";
|
||||
static char *version_str = "V1.3.0";
|
||||
|
||||
MODULE_INFO info = {
|
||||
MODULE_API_MONITOR,
|
||||
@ -80,6 +83,7 @@ static void diagnostics(DCB *, void *);
|
||||
static void setInterval(void *, unsigned long);
|
||||
static void defaultId(void *, unsigned long);
|
||||
static void replicationHeartbeat(void *, int);
|
||||
static void detectStaleMaster(void *, int);
|
||||
static bool mon_status_changed(MONITOR_SERVERS* mon_srv);
|
||||
static bool mon_print_fail_status(MONITOR_SERVERS* mon_srv);
|
||||
static MONITOR_SERVERS *getServerByNodeId(MONITOR_SERVERS *, long);
|
||||
@ -91,7 +95,7 @@ static int add_slave_to_master(long *, int, long);
|
||||
static void monitor_set_pending_status(MONITOR_SERVERS *, int);
|
||||
static void monitor_clear_pending_status(MONITOR_SERVERS *, int);
|
||||
|
||||
static MONITOR_OBJECT MyObject = { startMonitor, stopMonitor, registerServer, unregisterServer, defaultUser, diagnostics, setInterval, defaultId, replicationHeartbeat };
|
||||
static MONITOR_OBJECT MyObject = { startMonitor, stopMonitor, registerServer, unregisterServer, defaultUser, diagnostics, setInterval, defaultId, replicationHeartbeat, detectStaleMaster };
|
||||
|
||||
/**
|
||||
* Implementation of the mandatory version entry point
|
||||
@ -160,6 +164,7 @@ MYSQL_MONITOR *handle;
|
||||
handle->id = MONITOR_DEFAULT_ID;
|
||||
handle->interval = MONITOR_INTERVAL;
|
||||
handle->replicationHeartbeat = 0;
|
||||
handle->detectStaleMaster = 0;
|
||||
handle->master = NULL;
|
||||
spinlock_init(&handle->lock);
|
||||
}
|
||||
@ -306,6 +311,7 @@ char *sep;
|
||||
dcb_printf(dcb,"\tSampling interval:\t%lu milliseconds\n", handle->interval);
|
||||
dcb_printf(dcb,"\tMaxScale MonitorId:\t%lu\n", handle->id);
|
||||
dcb_printf(dcb,"\tReplication lag:\t%s\n", (handle->replicationHeartbeat == 1) ? "enabled" : "disabled");
|
||||
dcb_printf(dcb,"\tDetect Stale Master:\t%s\n", (handle->detectStaleMaster == 1) ? "enabled" : "disabled");
|
||||
dcb_printf(dcb, "\tMonitored servers: ");
|
||||
|
||||
db = handle->databases;
|
||||
@ -403,6 +409,12 @@ char *server_string;
|
||||
monitor_clear_pending_status(database, SERVER_SLAVE);
|
||||
monitor_clear_pending_status(database, SERVER_MASTER);
|
||||
|
||||
/* Clean addition status too */
|
||||
server_clear_status(database->server, SERVER_SLAVE_OF_EXTERNAL_MASTER);
|
||||
server_clear_status(database->server, SERVER_STALE_STATUS);
|
||||
monitor_clear_pending_status(database, SERVER_SLAVE_OF_EXTERNAL_MASTER);
|
||||
monitor_clear_pending_status(database, SERVER_STALE_STATUS);
|
||||
|
||||
return;
|
||||
}
|
||||
free(dpwd);
|
||||
@ -458,12 +470,20 @@ char *server_string;
|
||||
if (strncmp(row[12], "Yes", 3) == 0
|
||||
&& strncmp(row[13], "Yes", 3) == 0) {
|
||||
isslave += 1;
|
||||
|
||||
}
|
||||
|
||||
/* If Slave_IO_Running = Yes, assign the master_id to current server: this allows building
|
||||
* the replication tree, slaves ids will be added to master(s) and we will have at least the
|
||||
* root master server.
|
||||
* Please note, there could be no slaves at all if Slave_SQL_Running == 'No'
|
||||
*/
|
||||
if (strncmp(row[12], "Yes", 3) == 0) {
|
||||
/* get Master_Server_Id values */
|
||||
master_id = atol(row[41]);
|
||||
if (master_id == 0)
|
||||
master_id = -1;
|
||||
}
|
||||
|
||||
i++;
|
||||
}
|
||||
/* store master_id of current node */
|
||||
@ -489,7 +509,14 @@ char *server_string;
|
||||
if (strncmp(row[10], "Yes", 3) == 0
|
||||
&& strncmp(row[11], "Yes", 3) == 0) {
|
||||
isslave = 1;
|
||||
}
|
||||
|
||||
/* If Slave_IO_Running = Yes, assign the master_id to current server: this allows building
|
||||
* the replication tree, slaves ids will be added to master(s) and we will have at least the
|
||||
* root master server.
|
||||
* Please note, there could be no slaves at all if Slave_SQL_Running == 'No'
|
||||
*/
|
||||
if (strncmp(row[10], "Yes", 3) == 0) {
|
||||
/* get Master_Server_Id values */
|
||||
master_id = atol(row[39]);
|
||||
if (master_id == 0)
|
||||
@ -505,6 +532,7 @@ char *server_string;
|
||||
|
||||
/* Remove addition info */
|
||||
monitor_clear_pending_status(database, SERVER_SLAVE_OF_EXTERNAL_MASTER);
|
||||
monitor_clear_pending_status(database, SERVER_STALE_STATUS);
|
||||
|
||||
/* Please note, the MASTER status and SERVER_SLAVE_OF_EXTERNAL_MASTER
|
||||
* will be assigned in the monitorMain() via get_replication_tree() routine
|
||||
@ -534,6 +562,7 @@ monitorMain(void *arg)
|
||||
MYSQL_MONITOR *handle = (MYSQL_MONITOR *)arg;
|
||||
MONITOR_SERVERS *ptr;
|
||||
int replication_heartbeat = handle->replicationHeartbeat;
|
||||
int detect_stale_master = handle->detectStaleMaster;
|
||||
int num_servers=0;
|
||||
MONITOR_SERVERS *root_master;
|
||||
|
||||
@ -545,6 +574,7 @@ MONITOR_SERVERS *root_master;
|
||||
"module. Exiting.\n")));
|
||||
return;
|
||||
}
|
||||
|
||||
handle->status = MONITOR_RUNNING;
|
||||
while (1)
|
||||
{
|
||||
@ -616,10 +646,19 @@ MONITOR_SERVERS *root_master;
|
||||
while (ptr)
|
||||
{
|
||||
if (! SERVER_IN_MAINT(ptr->server)) {
|
||||
ptr->server->status = ptr->pending_status;
|
||||
/* If "detect_stale_master" option is On, let's use the previus master */
|
||||
if (detect_stale_master && root_master && (!strcmp(ptr->server->name, root_master->server->name) && ptr->server->port == root_master->server->port) && (ptr->server->status & SERVER_MASTER) && !(ptr->pending_status & SERVER_MASTER)) {
|
||||
/* in this case server->status will not be updated from pending_status */
|
||||
LOGIF(LM, (skygw_log_write_flush(
|
||||
LOGFILE_MESSAGE, "[mysql_mon]: root server [%s:%i] is no longer Master, let's use it again even if it could be a stale master, you have been warned!", ptr->server->name, ptr->server->port)));
|
||||
/* Set the STALE bit for this server in server struct */
|
||||
server_set_status(ptr->server, SERVER_STALE_STATUS);
|
||||
} else {
|
||||
ptr->server->status = ptr->pending_status;
|
||||
}
|
||||
}
|
||||
ptr = ptr->next;
|
||||
}
|
||||
ptr = ptr->next;
|
||||
}
|
||||
|
||||
/* Do now the heartbeat replication set/get for MySQL Replication Consistency */
|
||||
if (replication_heartbeat && root_master && (SERVER_IS_MASTER(root_master->server) || SERVER_IS_RELAY_SERVER(root_master->server))) {
|
||||
@ -665,19 +704,34 @@ setInterval(void *arg, unsigned long interval)
|
||||
{
|
||||
MYSQL_MONITOR *handle = (MYSQL_MONITOR *)arg;
|
||||
memcpy(&handle->interval, &interval, sizeof(unsigned long));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Enable/Disable the MySQL Replication hearbeat, detecting slave lag behind master.
|
||||
*
|
||||
* @param arg The handle allocated by startMonitor
|
||||
* @param replicationHeartbeat To enable it 1, disable it with 0
|
||||
* @param arg The handle allocated by startMonitor
|
||||
* @param enable To enable it 1, disable it with 0
|
||||
*/
|
||||
static void
|
||||
replicationHeartbeat(void *arg, int replicationHeartbeat)
|
||||
replicationHeartbeat(void *arg, int enable)
|
||||
{
|
||||
MYSQL_MONITOR *handle = (MYSQL_MONITOR *)arg;
|
||||
memcpy(&handle->replicationHeartbeat, &replicationHeartbeat, sizeof(int));
|
||||
memcpy(&handle->replicationHeartbeat, &enable, sizeof(int));
|
||||
}
|
||||
|
||||
/**
|
||||
* Enable/Disable the MySQL Replication Stale Master dectection, allowing a previouvsly detected master to still act as a Master.
|
||||
* This option must be enabled in order to keep the Master when the replication is stopped or removed from slaves.
|
||||
* If the replication is still stopped when MaxSclale is restarted no Master will be available.
|
||||
*
|
||||
* @param arg The handle allocated by startMonitor
|
||||
* @param enable To enable it 1, disable it with 0
|
||||
*/
|
||||
static void
|
||||
detectStaleMaster(void *arg, int enable)
|
||||
{
|
||||
MYSQL_MONITOR *handle = (MYSQL_MONITOR *)arg;
|
||||
memcpy(&handle->detectStaleMaster, &enable, sizeof(int));
|
||||
}
|
||||
|
||||
static bool mon_status_changed(
|
||||
@ -1038,6 +1092,10 @@ static MONITOR_SERVERS *get_replication_tree(MYSQL_MONITOR *handle, int num_serv
|
||||
monitor_set_pending_status(master, SERVER_MASTER);
|
||||
} else {
|
||||
if (current->master_id > 0) {
|
||||
/* this server is slave of another server not in MaxScale configuration
|
||||
* we cannot use it as a real slave.
|
||||
*/
|
||||
monitor_clear_pending_status(ptr, SERVER_SLAVE);
|
||||
monitor_set_pending_status(ptr, SERVER_SLAVE_OF_EXTERNAL_MASTER);
|
||||
}
|
||||
}
|
||||
|
@ -32,6 +32,7 @@
|
||||
* 26/05/14 Massimiliano Pinto Default values for MONITOR_INTERVAL
|
||||
* 28/05/14 Massimiliano Pinto Addition of new fields in MYSQL_MONITOR struct
|
||||
* 24/06/14 Massimiliano Pinto Addition of master field in MYSQL_MONITOR struct and MONITOR_MAX_NUM_SLAVES
|
||||
* 28/08/14 Massimiliano Pinto Addition of detectStaleMaster
|
||||
*
|
||||
* @endverbatim
|
||||
*/
|
||||
@ -43,9 +44,9 @@
|
||||
typedef struct monitor_servers {
|
||||
SERVER *server; /**< The server being monitored */
|
||||
MYSQL *con; /**< The MySQL connection */
|
||||
int mon_err_count;
|
||||
unsigned int mon_prev_status;
|
||||
unsigned int pending_status; /**< Pending Status flag bitmap */
|
||||
int mon_err_count;
|
||||
unsigned int mon_prev_status;
|
||||
unsigned int pending_status; /**< Pending Status flag bitmap */
|
||||
struct monitor_servers
|
||||
*next; /**< The next server in the list */
|
||||
} MONITOR_SERVERS;
|
||||
@ -54,17 +55,18 @@ typedef struct monitor_servers {
|
||||
* The handle for an instance of a MySQL Monitor module
|
||||
*/
|
||||
typedef struct {
|
||||
SPINLOCK lock; /**< The monitor spinlock */
|
||||
pthread_t tid; /**< id of monitor thread */
|
||||
int shutdown; /**< Flag to shutdown the monitor thread */
|
||||
int status; /**< Monitor status */
|
||||
char *defaultUser; /**< Default username for monitoring */
|
||||
char *defaultPasswd; /**< Default password for monitoring */
|
||||
unsigned long interval; /**< Monitor sampling interval */
|
||||
unsigned long id; /**< Monitor ID */
|
||||
SPINLOCK lock; /**< The monitor spinlock */
|
||||
pthread_t tid; /**< id of monitor thread */
|
||||
int shutdown; /**< Flag to shutdown the monitor thread */
|
||||
int status; /**< Monitor status */
|
||||
char *defaultUser; /**< Default username for monitoring */
|
||||
char *defaultPasswd; /**< Default password for monitoring */
|
||||
unsigned long interval; /**< Monitor sampling interval */
|
||||
unsigned long id; /**< Monitor ID */
|
||||
int replicationHeartbeat; /**< Monitor flag for MySQL replication heartbeat */
|
||||
MONITOR_SERVERS *master; /**< Master server for MySQL Master/Slave replication */
|
||||
MONITOR_SERVERS *databases; /**< Linked list of servers to monitor */
|
||||
int detectStaleMaster; /**< Monitor flag for MySQL replication Stale Master detection */
|
||||
MONITOR_SERVERS *master; /**< Master server for MySQL Master/Slave replication */
|
||||
MONITOR_SERVERS *databases; /**< Linked list of servers to monitor */
|
||||
} MYSQL_MONITOR;
|
||||
|
||||
#define MONITOR_RUNNING 1
|
||||
|
Loading…
x
Reference in New Issue
Block a user