Addition of backoff process for master reconnect

Housekeeper task display
This commit is contained in:
Mark Riddoch
2014-10-23 10:46:55 +01:00
parent 932fc5dc2c
commit 70672e43a1
9 changed files with 326 additions and 30 deletions

View File

@ -57,7 +57,16 @@
*/
#define DEF_SHORT_BURST 15
#define DEF_LONG_BURST 500
#define DEF_BURST_SIZE 1024000 /* 1024 * 1000, i.e. "1M" as the burstsize option scales an M suffix */
/**
 * Master reconnect backoff constants
 *
 * BLR_MASTER_BACKOFF_TIME	The increments of the backoff time (seconds)
 * BLR_MAX_BACKOFF		Maximum number of increments to back off to
 *
 * A failed connection attempt to the master schedules a retry after
 * BLR_MASTER_BACKOFF_TIME * retry_backoff and then increments
 * retry_backoff. Once retry_backoff exceeds BLR_MAX_BACKOFF it is reset
 * to 1, so the longest delay between attempts is
 * BLR_MASTER_BACKOFF_TIME * BLR_MAX_BACKOFF.
 */
#define BLR_MASTER_BACKOFF_TIME 5
#define BLR_MAX_BACKOFF 60
/**
* Some useful macros for examining the MySQL Response packets
*/
@ -255,9 +264,11 @@ typedef struct router_instance {
unsigned int high_water; /*< High water mark for client DCB */
unsigned int short_burst; /*< Short burst for slave catchup */
unsigned int long_burst; /*< Long burst for slave catchup */
unsigned long burst_size; /*< Maximum size of burst to send */
ROUTER_STATS stats; /*< Statistics for this router */
int active_logs;
int reconnect_pending;
int retry_backoff;
int handling_threads;
struct router_instance
*next;

View File

@ -39,6 +39,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <time.h>
#include <service.h>
#include <server.h>
@ -160,7 +161,7 @@ static ROUTER *
createInstance(SERVICE *service, char **options)
{
ROUTER_INSTANCE *inst;
char *value;
char *value, *name;
int i;
if ((inst = calloc(1, sizeof(ROUTER_INSTANCE))) == NULL) {
@ -183,6 +184,8 @@ int i;
inst->initbinlog = 0;
inst->short_burst = DEF_SHORT_BURST;
inst->long_burst = DEF_LONG_BURST;
inst->burst_size = DEF_BURST_SIZE;
inst->retry_backoff = 1;
/*
* We only support one server behind this router, since the server is
@ -279,6 +282,30 @@ int i;
{
inst->long_burst = atoi(value);
}
else if (strcmp(options[i], "burstsize") == 0)
{
unsigned long size = atoi(value);
char *ptr = value;
while (*ptr && isdigit(*ptr))
ptr++;
switch (*ptr)
{
case 'G':
case 'g':
size = size * 1024 * 1000 * 1000;
break;
case 'M':
case 'm':
size = size * 1024 * 1000;
break;
case 'K':
case 'k':
size = size * 1024;
break;
}
inst->burst_size = size;
}
else
{
LOGIF(LE, (skygw_log_write(
@ -325,7 +352,11 @@ int i;
*/
blr_init_cache(inst);
hktask_add("Binlog Router", stats_func, inst, BLR_STATS_FREQ);
if ((name = (char *)malloc(80)) != NULL)
{
sprintf(name, "%s stats", service->name);
hktask_add(name, stats_func, inst, BLR_STATS_FREQ);
}
/*
* Now start the replication from the master to MaxScale
@ -822,9 +853,9 @@ char msg[85];
strcpy(msg, "");
LOGIF(LE, (skygw_log_write_flush(
LOGFILE_ERROR, "Erorr Reply '%s', %sattempting reconnect to master",
LOGFILE_ERROR, "Master connection '%s', %sattempting reconnect to master",
message, msg)));
*succp = false;
*succp = true;
blr_master_reconnect(router);
}

View File

@ -47,6 +47,7 @@
#include <blr.h>
#include <dcb.h>
#include <spinlock.h>
#include <housekeeper.h>
#include <sys/types.h>
#include <sys/socket.h>
@ -107,6 +108,12 @@ GWBUF *buf;
client->session = router->session;
if ((router->master = dcb_connect(router->service->databases, router->session, BLR_PROTOCOL)) == NULL)
{
char *name = malloc(strlen(router->service->name) + strlen(" Master") + 1);
sprintf(name, "%s Master", router->service->name);
hktask_oneshot(name, blr_start_master, router,
BLR_MASTER_BACKOFF_TIME * router->retry_backoff++);
if (router->retry_backoff > BLR_MAX_BACKOFF)
router->retry_backoff = 1;
LOGIF(LE, (skygw_log_write_flush(LOGFILE_ERROR,
"Binlog router: failed to connect to master server '%s'",
router->service->databases->unique_name)));
@ -122,6 +129,7 @@ perror("setsockopt");
router->master_state = BLRM_TIMESTAMP;
router->stats.n_masterstarts++;
router->retry_backoff = 1;
}
/**
@ -137,10 +145,7 @@ blr_restart_master(ROUTER_INSTANCE *router)
{
GWBUF *ptr;
dcb_close(router->master);
dcb_close(router->client);
dcb_free(router->master);
dcb_free(router->client);
/* Discard the queued residual data */
ptr = router->residual;
@ -696,9 +701,10 @@ static REP_HEADER phdr;
}
else if (hdr.flags != LOG_EVENT_ARTIFICIAL_F)
{
router->rotating = 1;
ptr = ptr + 5; // We don't put the first byte of the payload
// into the binlog file
if (hdr.event_type == ROTATE_EVENT)
router->rotating = 1;
blr_write_binlog_record(router, &hdr, ptr);
if (hdr.event_type == ROTATE_EVENT)
{
@ -719,10 +725,10 @@ static REP_HEADER phdr;
hdr.event_size,
router->binlog_name,
router->binlog_position)));
router->rotating = 1;
ptr += 5;
if (hdr.event_type == ROTATE_EVENT)
{
router->rotating = 1;
blr_rotate_event(router, ptr, &hdr);
}
}
@ -800,7 +806,7 @@ blr_extract_header(register uint8_t *ptr, register REP_HEADER *hdr)
* Extract a numeric field from a packet of the specified number of bits
*
* @param src The raw packet source
* @param birs The number of bits to extract (multiple of 8)
* @param bits The number of bits to extract (multiple of 8)
*/
inline uint32_t
extract_field(register uint8_t *src, int bits)

View File

@ -47,6 +47,7 @@
#include <blr.h>
#include <dcb.h>
#include <spinlock.h>
#include <housekeeper.h>
#include <skygw_types.h>
#include <skygw_utils.h>
@ -63,6 +64,7 @@ static int blr_slave_binlog_dump(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, G
int blr_slave_catchup(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, bool large);
uint8_t *blr_build_header(GWBUF *pkt, REP_HEADER *hdr);
int blr_slave_callback(DCB *dcb, DCB_REASON reason, void *data);
static int blr_slave_fake_rotate(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave);
extern int lm_enabled_logfiles_bitmask;
@ -501,6 +503,7 @@ uint32_t chksum;
slave->seqno = 1;
if (slave->nocrc)
len = 19 + 8 + binlognamelen;
else
@ -694,12 +697,15 @@ blr_slave_catchup(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, bool large)
GWBUF *head, *record;
REP_HEADER hdr;
int written, rval = 1, burst;
int rotating;
unsigned long burst_size;
uint8_t *ptr;
if (large)
burst = router->long_burst;
else
burst = router->short_burst;
burst_size = router->burst_size;
spinlock_acquire(&slave->catch_lock);
if (slave->cstate & CS_BUSY)
{
@ -711,19 +717,30 @@ uint8_t *ptr;
if (slave->file == NULL)
{
rotating = router->rotating;
if ((slave->file = blr_open_binlog(router, slave->binlogfile)) == NULL)
{
if (rotating)
{
spinlock_acquire(&slave->catch_lock);
slave->cstate |= CS_EXPECTCB;
slave->cstate &= ~CS_BUSY;
spinlock_release(&slave->catch_lock);
poll_fake_write_event(slave->dcb);
return rval;
}
LOGIF(LE, (skygw_log_write(
LOGFILE_ERROR,
"blr_slave_catchup failed to open binlog file %s",
slave->binlogfile)));
slave->cstate &= ~CS_BUSY;
slave->state = BLRS_ERRORED;
dcb_close(slave->dcb);
return 0;
}
}
slave->stats.n_bursts++;
while (burst-- &&
while (burst-- && burst_size > 0 &&
(record = blr_read_binlog(router, slave->file, slave->binlog_pos, &hdr)) != NULL)
{
head = gwbuf_alloc(5);
@ -735,17 +752,35 @@ uint8_t *ptr;
head = gwbuf_append(head, record);
if (hdr.event_type == ROTATE_EVENT)
{
unsigned long beat1 = hkheartbeat;
blr_close_binlog(router, slave->file);
if (hkheartbeat - beat1 > 1) LOGIF(LE, (skygw_log_write(
LOGFILE_ERROR, "blr_close_binlog took %d beats",
hkheartbeat - beat1)));
blr_slave_rotate(slave, GWBUF_DATA(record));
beat1 = hkheartbeat;
if ((slave->file = blr_open_binlog(router, slave->binlogfile)) == NULL)
{
if (rotating)
{
spinlock_acquire(&slave->catch_lock);
slave->cstate |= CS_EXPECTCB;
slave->cstate &= ~CS_BUSY;
spinlock_release(&slave->catch_lock);
poll_fake_write_event(slave->dcb);
return rval;
}
LOGIF(LE, (skygw_log_write(
LOGFILE_ERROR,
"blr_slave_catchup failed to open binlog file %s",
slave->binlogfile)));
slave->state = BLRS_ERRORED;
dcb_close(slave->dcb);
break;
}
if (hkheartbeat - beat1 > 1) LOGIF(LE, (skygw_log_write(
LOGFILE_ERROR, "blr_open_binlog took %d beats",
hkheartbeat - beat1)));
}
written = slave->dcb->func.write(slave->dcb, head);
if (written && hdr.event_type != ROTATE_EVENT)
@ -754,6 +789,7 @@ uint8_t *ptr;
}
rval = written;
slave->stats.n_events++;
burst_size -= hdr.event_size;
}
if (record == NULL)
slave->stats.n_failed_read++;
@ -785,6 +821,8 @@ uint8_t *ptr;
{
slave->cstate &= ~CS_UPTODATE;
slave->cstate |= CS_EXPECTCB;
spinlock_release(&slave->catch_lock);
spinlock_release(&router->binlog_lock);
poll_fake_write_event(slave->dcb);
}
else
@ -793,11 +831,11 @@ uint8_t *ptr;
{
slave->stats.n_upd++;
slave->cstate |= CS_UPTODATE;
spinlock_release(&slave->catch_lock);
spinlock_release(&router->binlog_lock);
state_change = 1;
}
}
spinlock_release(&slave->catch_lock);
spinlock_release(&router->binlog_lock);
if (state_change)
{
@ -819,7 +857,7 @@ uint8_t *ptr;
* binlog file.
*
* Note if the master is rotating there is a window during
* whch the rotate event has been written to the old binlog
* which the rotate event has been written to the old binlog
* but the new binlog file has not yet been created. Therefore
* we ignore these issues during the rotate processing.
*/
@ -829,7 +867,18 @@ uint8_t *ptr;
"Master binlog is %s, %lu.",
slave->binlogfile, slave->binlog_pos,
router->binlog_name, router->binlog_position)));
slave->state = BLRS_ERRORED;
if (blr_slave_fake_rotate(router, slave))
{
spinlock_acquire(&slave->catch_lock);
slave->cstate |= CS_EXPECTCB;
spinlock_release(&slave->catch_lock);
poll_fake_write_event(slave->dcb);
}
else
{
slave->state = BLRS_ERRORED;
dcb_close(slave->dcb);
}
}
else
{
@ -911,3 +960,74 @@ int len = EXTRACT24(ptr + 9); // Extract the event length
memcpy(slave->binlogfile, ptr + 8, len);
slave->binlogfile[len] = 0;
}
/**
 * Generate an internal rotate event that we can use to cause the slave to move beyond
 * a binlog file that is missing the rotate event at the end.
 *
 * The name of the next binlog file is formed by adding one to the numeric
 * suffix (the text after the last '.') of the slave's current binlog file
 * name. The slave's current file is closed, the next file is opened with the
 * slave position set to 4, and a rotate event naming the new file is built
 * and written to the slave DCB.
 *
 * Note that the slave's binlog file name, position and file handle may
 * already have been updated when this function returns 0.
 *
 * @param router	The router instance
 * @param slave		The slave to rotate
 * @return		Non-zero if the rotate took place, 0 if the current
 *			binlog name has no '.' suffix, the next binlog file
 *			could not be opened, or no buffer could be allocated
 *			for the event
 */
static int
blr_slave_fake_rotate(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave)
{
char		*sptr;
int		filenum;
GWBUF		*resp;
uint8_t		*ptr;
int		len, binlognamelen;
REP_HEADER	hdr;
uint32_t	chksum;

	if ((sptr = strrchr(slave->binlogfile, '.')) == NULL)
		return 0;

	/* Move the slave on to the next binlog file in the sequence */
	blr_close_binlog(router, slave->file);
	filenum = atoi(sptr + 1);
	sprintf(slave->binlogfile, BINLOG_NAMEFMT, router->fileroot, filenum + 1);
	slave->binlog_pos = 4;
	if ((slave->file = blr_open_binlog(router, slave->binlogfile)) == NULL)
		return 0;

	binlognamelen = strlen(slave->binlogfile);

	/*
	 * Event length: 8 bytes of position, the file name and, unless the
	 * slave has checksums disabled, a 4 byte CRC. The leading 19 is
	 * presumably the size of the event header written by
	 * blr_build_header - TODO confirm.
	 */
	if (slave->nocrc)
		len = 19 + 8 + binlognamelen;
	else
		len = 19 + 8 + 4 + binlognamelen;

	// Build a fake rotate event
	if ((resp = gwbuf_alloc(len + 5)) == NULL)
	{
		/*
		 * Without a buffer the event cannot be sent. Report failure
		 * rather than dereferencing a NULL buffer; the sequence
		 * number has not yet been consumed at this point.
		 */
		return 0;
	}
	hdr.payload_len = len + 1;
	hdr.seqno = slave->seqno++;
	hdr.ok = 0;
	hdr.timestamp = 0L;
	hdr.event_type = ROTATE_EVENT;
	hdr.serverid = router->masterid;
	hdr.event_size = len;
	hdr.next_pos = 0;
	hdr.flags = 0x20;	/* NOTE(review): presumably LOG_EVENT_ARTIFICIAL_F - confirm */
	ptr = blr_build_header(resp, &hdr);

	/* Rotate payload: 64 bit position followed by the new file name */
	encode_value(ptr, slave->binlog_pos, 64);
	ptr += 8;
	memcpy(ptr, slave->binlogfile, binlognamelen);
	ptr += binlognamelen;

	if (!slave->nocrc)
	{
		/*
		 * Now add the CRC to the fake binlog rotate event.
		 *
		 * The algorithm is first to compute the checksum of an empty buffer
		 * and then the checksum of the event portion of the message, ie we do not
		 * include the length, sequence number and ok byte that makes up the first
		 * 5 bytes of the message. We also do not include the 4 byte checksum itself.
		 */
		chksum = crc32(0L, NULL, 0);
		chksum = crc32(chksum, GWBUF_DATA(resp) + 5, hdr.event_size - 4);
		encode_value(ptr, chksum, 32);
	}

	slave->dcb->func.write(slave->dcb, resp);
	return 1;
}

View File

@ -67,6 +67,7 @@
#include <monitor.h>
#include <debugcli.h>
#include <poll.h>
#include <housekeeper.h>
#include <skygw_utils.h>
#include <log_manager.h>
@ -167,6 +168,10 @@ struct subcommand showoptions[] = {
"Show all active sessions in MaxScale",
"Show all active sessions in MaxScale",
{0, 0, 0} },
{ "tasks", 0, hkshow_tasks,
"Show all active housekeeper tasks in MaxScale",
"Show all active housekeeper tasks in MaxScale",
{0, 0, 0} },
{ "threads", 0, dShowThreads,
"Show the status of the polling threads in MaxScale",
"Show the status of the polling threads in MaxScale",