From 501d0bcae6f8d8ec5fefc28e9ef0257ada07f903 Mon Sep 17 00:00:00 2001
From: Mark Riddoch
Date: Thu, 2 Oct 2014 17:19:43 +0100
Subject: [PATCH] Reconnect to master on error

Refine locking in blr_slave_catchup and add tracing
---
 server/modules/routing/binlog/blr.c       |  6 ++++-
 server/modules/routing/binlog/blr_slave.c | 29 +++++++++++++++++------
 2 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/server/modules/routing/binlog/blr.c b/server/modules/routing/binlog/blr.c
index f41462468..6dcff344a 100644
--- a/server/modules/routing/binlog/blr.c
+++ b/server/modules/routing/binlog/blr.c
@@ -797,9 +797,13 @@ ROUTER_INSTANCE *router = (ROUTER_INSTANCE *)instance;
 static void
 errorReply(ROUTER *instance, void *router_session, GWBUF *message, DCB *backend_dcb, error_action_t action, bool *succp)
 {
+ROUTER_INSTANCE *router = (ROUTER_INSTANCE *)instance;
+
 	LOGIF(LE, (skygw_log_write_flush(
-		LOGFILE_ERROR, "Erorr Reply '%s'", message)));
+		LOGFILE_ERROR, "Erorr Reply '%s', attempting reconnect to master",
+		message)));
 	*succp = false;
+	blr_master_reconnect(router);
 }
 
 /** to be inline'd */
diff --git a/server/modules/routing/binlog/blr_slave.c b/server/modules/routing/binlog/blr_slave.c
index 2afcda7cf..01e39739e 100644
--- a/server/modules/routing/binlog/blr_slave.c
+++ b/server/modules/routing/binlog/blr_slave.c
@@ -674,9 +674,9 @@ uint8_t *ptr;
  * needs to put the slave into catchup mode. This prevents the slave taking
  * too much tiem away from the thread that is processing the master events.
  *
- * At the end ofthe burst a fake EPOLLOUT event is added to the poll event
+ * At the end of the burst a fake EPOLLOUT event is added to the poll event
  * queue. This ensures that the slave callback for processing DCB write drain
- * will be called and future catchup requests will be handle on another thread.
+ * will be called and future catchup requests will be handled on another thread.
  *
  * @param router The binlog router
  * @param slave The slave that is behind
@@ -692,6 +692,10 @@ int written, fd, rval = 1, burst;
 uint8_t *ptr;
 struct timespec req;
 
+extern unsigned long hkheartbeat;
+unsigned long beat;
+	beat = hkheartbeat;
+
 	if (large)
 		burst = router->long_burst;
 	else
@@ -705,8 +709,19 @@ struct timespec req;
 		nanosleep(&req, NULL);
 		spinlock_acquire(&slave->catch_lock);
 	}
-	slave->cstate |= (CS_HOLD|CS_BUSY);
+	if (slave->cstate & CS_BUSY)
+	{
+		spinlock_release(&slave->catch_lock);
+if (hkheartbeat - beat > 5) LOGIF(LE, (skygw_log_write(LOGFILE_ERROR,
+	"Long wait in blr_salve_catchup %ld00ms with %s burst, return without write records.\n",
+hkheartbeat - beat, large ? "long" : "short")));
+		return 0;
+	}
+	slave->cstate |= CS_BUSY;
 	spinlock_release(&slave->catch_lock);
+if (hkheartbeat - beat > 5) LOGIF(LE, (skygw_log_write(LOGFILE_ERROR,
+	"Long wait in blr_salve_catchup %ld00ms with %s burst.\n",
+hkheartbeat - beat, large ? "long" : "short")));
 
 	if ((fd = blr_open_binlog(router, slave->binlogfile)) == -1)
 	{
@@ -743,9 +758,6 @@ struct timespec req;
 				break;
 			}
 		}
-		spinlock_acquire(&slave->catch_lock);
-		slave->cstate |= CS_HOLD;
-		spinlock_release(&slave->catch_lock);
 		written = slave->dcb->func.write(slave->dcb, head);
 		if (written && hdr.event_type != ROTATE_EVENT)
 		{
@@ -753,6 +765,9 @@ struct timespec req;
 		}
 		rval = written;
 		slave->stats.n_events++;
+		spinlock_acquire(&slave->catch_lock);
+		slave->cstate |= CS_HOLD;
+		spinlock_release(&slave->catch_lock);
 	}
 	if (record == NULL)
 		slave->stats.n_failed_read++;
@@ -762,13 +777,13 @@ struct timespec req;
 	if (fd != -1)
 		close(fd);
-	poll_fake_write_event(slave->dcb);
 	if (record)
 	{
 		slave->stats.n_flows++;
 		spinlock_acquire(&slave->catch_lock);
 		slave->cstate |= CS_EXPECTCB;
 		spinlock_release(&slave->catch_lock);
+		poll_fake_write_event(slave->dcb);
 	}
 	else
 	{