Updates to slave catchup mode to use fake events

Addition of fake EPOLLOUT event mechanism

New memlog feature for debugging purposes
This commit is contained in:
Mark Riddoch
2014-09-30 13:25:45 +01:00
parent 3430fc99d2
commit 0ef87e3cc1
13 changed files with 951 additions and 253 deletions

View File

@ -52,7 +52,6 @@
#include <skygw_utils.h>
#include <log_manager.h>
static uint32_t extract_field(uint8_t *src, int bits);
static void encode_value(unsigned char *data, unsigned int value, int len);
static int blr_slave_query(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, GWBUF *queue);
@ -61,7 +60,7 @@ static void blr_slave_send_error(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, c
static int blr_slave_send_timestamp(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave);
static int blr_slave_register(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, GWBUF *queue);
static int blr_slave_binlog_dump(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, GWBUF *queue);
int blr_slave_catchup(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave);
int blr_slave_catchup(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, bool large);
static uint8_t *blr_build_header(GWBUF *pkt, REP_HEADER *hdr);
static int blr_slave_callback(DCB *dcb, DCB_REASON reason, void *data);
@ -567,7 +566,7 @@ uint32_t chksum;
slave->dcb->low_water = router->low_water;
slave->dcb->high_water = router->high_water;
dcb_add_callback(slave->dcb, DCB_REASON_LOW_WATER, blr_slave_callback, slave);
// dcb_add_callback(slave->dcb, DCB_REASON_LOW_WATER, blr_slave_callback, slave);
dcb_add_callback(slave->dcb, DCB_REASON_DRAINED, blr_slave_callback, slave);
if (slave->binlog_pos != router->binlog_position ||
@ -576,7 +575,7 @@ uint32_t chksum;
spinlock_acquire(&slave->catch_lock);
slave->cstate &= ~CS_UPTODATE;
spinlock_release(&slave->catch_lock);
rval = blr_slave_catchup(router, slave);
rval = blr_slave_catchup(router, slave, true);
}
return rval;
@ -660,187 +659,107 @@ uint8_t *ptr;
* We have a registered slave that is behind the current leading edge of the
* binlog. We must replay the log entries to bring this node up to speed.
*
* There may be a large numebr of records to send to the slave, the process
* There may be a large number of records to send to the slave, the process
* is triggered by the slave COM_BINLOG_DUMP message and all the events must
* be sent without receiving any new event. This means there is no trigger into
* MaxScale other than this initial message. However, if we simply send all the
* events we end up with an extremely long write queue on the DCB and risk running
* the server out of resources.
* events we end up with an extremely long write queue on the DCB and risk
* running the server out of resources.
*
* To resolve this the concept of high and low water marks within the DCB has been
* added, with the ability for the DCB code to call user defined callbacks when the
* write queue is completely drained, when it crosses above the high water mark and
* when it crosses below the low water mark.
*
* The blr_slave_catchup routine will send binlog events to the slave until the high
* water mark is reached, at which point it will return. Later, when a low water mark
* callback is generated by the code that drains the DCB of data the blr_slave_catchup
* routine will again be called to write more events. The process is repeated until
* the slave has caught up with the master.
* The slave catchup routine will send a burst of replication events per single
* call. The parameter "large" controls the number of events in the burst. The
* short burst is intended to be used when the master receives an event and
* needs to put the slave into catchup mode. This prevents the slave taking
* too much time away from the thread that is processing the master events.
*
* Note: an additional check that the DCB is still above the low water mark is done
* prior to the return from this function to allow for any delays due to the call to
* the close system call, since this may cause thread rescheduling.
* At the end of the burst a fake EPOLLOUT event is added to the poll event
* queue. This ensures that the slave callback for processing DCB write drain
* will be called and future catchup requests will be handled on another thread.
*
* @param router The binlog router
* @param slave The slave that is behind
* @param large Send a long or short burst of events
* @return The number of bytes written
*/
int
blr_slave_catchup(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave)
blr_slave_catchup(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, bool large)
{
GWBUF *head, *record;
REP_HEADER hdr;
int written, fd, rval = 1, burst = 0;
int written, fd, rval = 1, burst;
uint8_t *ptr;
struct timespec req;
if (large)
burst = router->long_burst;
else
burst = router->short_burst;
spinlock_acquire(&slave->catch_lock);
slave->cstate &= ~CS_EXPECTCB;
spinlock_release(&slave->catch_lock);
doitagain:
/*
* We have a slightly complex syncronisation mechansim here,
* we need to make sure that we do not have multiple threads
* running the catchup loop, but we need to be very careful
* that we do not loose a call that is coming via a callback
* call as this will stall the binlog catchup process.
*
* We don't want to simply use a traditional mutex here for
* the loop, since this would block a MaxScale thread for
* an unacceptable length of time.
*
* We have two status bits, the CS_READING that says we are
* in the outer loop and the CS_INNERLOOP, to say we are in
* the inner loop.
*
* If just CS_READING is set the other thread may be about to
* enter the inner loop or may be about to exit the function
* completely. Therefore we have to wait to see if CS_READING
* is cleared or CS_INNERLOOP is set.
*
* If CS_READING gets cleared then this thread should proceed
* into the loop.
*
* If CS_INNERLOOP get's set then this thread does not need to
* proceed.
*
* If CS_READING is not set then this thread simply enters the
* loop.
*/
req.tv_sec = 0;
req.tv_nsec = 1000;
spinlock_acquire(&slave->catch_lock);
if (slave->cstate & CS_UPTODATE)
while ((slave->cstate & (CS_HOLD|CS_BUSY)) == (CS_HOLD|CS_BUSY))
{
LOGIF(LM, (skygw_log_write(LOGFILE_MESSAGE,
"blr_slave_catchup called with up to date slave %d at "
"%s@%d. Reading position %s@%d\n",
slave->serverid, slave->binlogfile,
slave->binlog_pos, router->binlog_name,
router->binlog_position)));
slave->stats.n_alreadyupd++;
spinlock_release(&slave->catch_lock);
return 1;
}
while (slave->cstate & CS_READING)
{
// Wait until we know what the other thread is doing
while ((slave->cstate & (CS_READING|CS_INNERLOOP)) == CS_READING)
{
spinlock_release(&slave->catch_lock);
nanosleep(&req, NULL);
spinlock_acquire(&slave->catch_lock);
}
// Other thread is in the innerloop
if ((slave->cstate & (CS_READING|CS_INNERLOOP)) == (CS_READING|CS_INNERLOOP))
{
spinlock_release(&slave->catch_lock);
LOGIF(LM, (skygw_log_write(
LOGFILE_MESSAGE,
"blr_slave_catchup thread returning due to "
"lock being held by another thread. %s@%d\n",
slave->binlogfile,
slave->binlog_pos)));
slave->stats.n_catchupnr++;
return 1; // We cheat here and return 1 because otherwise
// an error would be sent and we do not want that
}
/* Release the lock for a short time to allow the other
* thread to exit the outer reading loop.
*/
spinlock_release(&slave->catch_lock);
req.tv_sec = 0;
req.tv_nsec = 100;
nanosleep(&req, NULL);
spinlock_acquire(&slave->catch_lock);
}
if (slave->pthread)
LOGIF(LD, (skygw_log_write(LOGFILE_DEBUG, "Multiple threads sending to same thread.\n")));
slave->pthread = pthread_self();
slave->cstate |= CS_READING;
slave->cstate |= (CS_HOLD|CS_BUSY);
spinlock_release(&slave->catch_lock);
if (DCB_ABOVE_HIGH_WATER(slave->dcb))
LOGIF(LT, (skygw_log_write(LOGFILE_TRACE, "blr_slave_catchup above high water on entry.\n")));
do {
if ((fd = blr_open_binlog(router, slave->binlogfile)) == -1)
if ((fd = blr_open_binlog(router, slave->binlogfile)) == -1)
{
LOGIF(LE, (skygw_log_write(
LOGFILE_ERROR,
"blr_slave_catchup failed to open binlog file %s\n",
slave->binlogfile)));
return 0;
}
slave->stats.n_bursts++;
while (burst-- &&
(record = blr_read_binlog(fd, slave->binlog_pos, &hdr)) != NULL)
{
spinlock_acquire(&slave->catch_lock);
slave->cstate &= ~CS_HOLD;
spinlock_release(&slave->catch_lock);
head = gwbuf_alloc(5);
ptr = GWBUF_DATA(head);
encode_value(ptr, hdr.event_size + 1, 24);
ptr += 3;
*ptr++ = slave->seqno++;
*ptr++ = 0; // OK
head = gwbuf_append(head, record);
if (hdr.event_type == ROTATE_EVENT)
{
spinlock_acquire(&slave->catch_lock);
slave->cstate &= ~CS_READING;
spinlock_release(&slave->catch_lock);
LOGIF(LE, (skygw_log_write(
LOGFILE_ERROR,
"blr_slave_catchup failed to open binlog file %s\n",
close(fd);
blr_slave_rotate(slave, GWBUF_DATA(record));
if ((fd = blr_open_binlog(router, slave->binlogfile)) == -1)
{
LOGIF(LE, (skygw_log_write(
LOGFILE_ERROR,
"blr_slave_catchup failed to open binlog file %s\n",
slave->binlogfile)));
return 0;
break;
}
}
slave->stats.n_bursts++;
spinlock_acquire(&slave->catch_lock);
slave->cstate |= CS_INNERLOOP;
slave->cstate |= CS_HOLD;
spinlock_release(&slave->catch_lock);
while ((!DCB_ABOVE_HIGH_WATER(slave->dcb)) &&
(record = blr_read_binlog(fd, slave->binlog_pos, &hdr)) != NULL)
written = slave->dcb->func.write(slave->dcb, head);
if (written && hdr.event_type != ROTATE_EVENT)
{
if (hdr.event_size > DEF_HIGH_WATER) slave->stats.n_above++;
head = gwbuf_alloc(5);
ptr = GWBUF_DATA(head);
encode_value(ptr, hdr.event_size + 1, 24);
ptr += 3;
*ptr++ = slave->seqno++;
*ptr++ = 0; // OK
head = gwbuf_append(head, record);
if (hdr.event_type == ROTATE_EVENT)
{
close(fd);
blr_slave_rotate(slave, GWBUF_DATA(record));
if ((fd = blr_open_binlog(router, slave->binlogfile)) == -1)
{
LOGIF(LE, (skygw_log_write(
LOGFILE_ERROR,
"blr_slave_catchup failed to open binlog file %s\n",
slave->binlogfile)));
break;
}
}
written = slave->dcb->func.write(slave->dcb, head);
if (written && hdr.event_type != ROTATE_EVENT)
{
slave->binlog_pos = hdr.next_pos;
}
rval = written;
slave->stats.n_events++;
burst++;
slave->binlog_pos = hdr.next_pos;
}
if (record == NULL)
slave->stats.n_failed_read++;
spinlock_acquire(&slave->catch_lock);
slave->cstate &= ~CS_INNERLOOP;
spinlock_release(&slave->catch_lock);
rval = written;
slave->stats.n_events++;
}
if (record == NULL)
slave->stats.n_failed_read++;
spinlock_acquire(&slave->catch_lock);
slave->cstate &= ~CS_BUSY;
spinlock_release(&slave->catch_lock);
close(fd);
} while (record && DCB_BELOW_LOW_WATER(slave->dcb));
close(fd);
poll_fake_write_event(slave->dcb);
if (record)
{
slave->stats.n_flows++;
@ -864,25 +783,6 @@ if (hdr.event_size > DEF_HIGH_WATER) slave->stats.n_above++;
"blr_slave_catchup slave is up to date %s, %u\n",
slave->binlogfile, slave->binlog_pos)));
}
spinlock_acquire(&slave->catch_lock);
#if 0
if (slave->pthread != pthread_self())
{
LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, "Multple threads in catchup for same slave: %x and %x\n", slave->pthread, pthread_self())));
abort();
}
#endif
slave->pthread = 0;
#if 0
if (DCB_BELOW_LOW_WATER(slave->dcb) && slave->binlog_pos != router->binlog_position) abort();
#endif
slave->cstate &= ~CS_READING;
spinlock_release(&slave->catch_lock);
if (DCB_BELOW_LOW_WATER(slave->dcb) && slave->binlog_pos != router->binlog_position)
{
LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, "Expected to be above low water\n")));
goto doitagain;
}
return rval;
}
@ -908,7 +808,7 @@ ROUTER_INSTANCE *router = slave->router;
slave->binlog_pos != router->binlog_position)
{
slave->stats.n_dcb++;
blr_slave_catchup(router, slave);
blr_slave_catchup(router, slave, true);
}
}
@ -917,7 +817,7 @@ ROUTER_INSTANCE *router = slave->router;
if (slave->state == BLRS_DUMPING)
{
slave->stats.n_cb++;
blr_slave_catchup(router, slave);
blr_slave_catchup(router, slave, true);
}
else
{