Reconnect to master on error

Refine locking in blr_slave_catchup and add tracing
This commit is contained in:
Mark Riddoch 2014-10-02 17:19:43 +01:00
parent 829a4bcbfa
commit 501d0bcae6
2 changed files with 27 additions and 8 deletions

View File

@ -797,9 +797,13 @@ ROUTER_INSTANCE *router = (ROUTER_INSTANCE *)instance;
static void
errorReply(ROUTER *instance, void *router_session, GWBUF *message, DCB *backend_dcb, error_action_t action, bool *succp)
{
ROUTER_INSTANCE *router = (ROUTER_INSTANCE *)instance;
LOGIF(LE, (skygw_log_write_flush(
LOGFILE_ERROR, "Erorr Reply '%s'", message)));
LOGFILE_ERROR, "Erorr Reply '%s', attempting reconnect to master",
message)));
*succp = false;
blr_master_reconnect(router);
}
/** to be inline'd */

View File

@ -674,9 +674,9 @@ uint8_t *ptr;
* needs to put the slave into catchup mode. This prevents the slave taking
* too much tiem away from the thread that is processing the master events.
*
* At the end ofthe burst a fake EPOLLOUT event is added to the poll event
* At the end of the burst a fake EPOLLOUT event is added to the poll event
* queue. This ensures that the slave callback for processing DCB write drain
* will be called and future catchup requests will be handle on another thread.
* will be called and future catchup requests will be handled on another thread.
*
* @param router The binlog router
* @param slave The slave that is behind
@ -692,6 +692,10 @@ int written, fd, rval = 1, burst;
uint8_t *ptr;
struct timespec req;
extern unsigned long hkheartbeat;
unsigned long beat;
beat = hkheartbeat;
if (large)
burst = router->long_burst;
else
@ -705,8 +709,19 @@ struct timespec req;
nanosleep(&req, NULL);
spinlock_acquire(&slave->catch_lock);
}
slave->cstate |= (CS_HOLD|CS_BUSY);
if (slave->cstate & CS_BUSY)
{
spinlock_release(&slave->catch_lock);
if (hkheartbeat - beat > 5) LOGIF(LE, (skygw_log_write(LOGFILE_ERROR,
"Long wait in blr_salve_catchup %ld00ms with %s burst, return without write records.\n",
hkheartbeat - beat, large ? "long" : "short")));
return 0;
}
slave->cstate |= CS_BUSY;
spinlock_release(&slave->catch_lock);
if (hkheartbeat - beat > 5) LOGIF(LE, (skygw_log_write(LOGFILE_ERROR,
"Long wait in blr_salve_catchup %ld00ms with %s burst.\n",
hkheartbeat - beat, large ? "long" : "short")));
if ((fd = blr_open_binlog(router, slave->binlogfile)) == -1)
{
@ -743,9 +758,6 @@ struct timespec req;
break;
}
}
spinlock_acquire(&slave->catch_lock);
slave->cstate |= CS_HOLD;
spinlock_release(&slave->catch_lock);
written = slave->dcb->func.write(slave->dcb, head);
if (written && hdr.event_type != ROTATE_EVENT)
{
@ -753,6 +765,9 @@ struct timespec req;
}
rval = written;
slave->stats.n_events++;
spinlock_acquire(&slave->catch_lock);
slave->cstate |= CS_HOLD;
spinlock_release(&slave->catch_lock);
}
if (record == NULL)
slave->stats.n_failed_read++;
@ -762,13 +777,13 @@ struct timespec req;
if (fd != -1)
close(fd);
poll_fake_write_event(slave->dcb);
if (record)
{
slave->stats.n_flows++;
spinlock_acquire(&slave->catch_lock);
slave->cstate |= CS_EXPECTCB;
spinlock_release(&slave->catch_lock);
poll_fake_write_event(slave->dcb);
}
else
{