Reconnect to master on error

Refine locking in blr_slave_catchup and add tracing
2014-10-02 17:19:43 +01:00 · 2014-10-02 17:19:43 +01:00 · 501d0bcae6
commit 501d0bcae6
parent 829a4bcbfa
2 changed files with 27 additions and 8 deletions
--- a/server/modules/routing/binlog/blr.c
+++ b/server/modules/routing/binlog/blr.c
@ -797,9 +797,13 @@ ROUTER_INSTANCE	*router = (ROUTER_INSTANCE *)instance;
 static  void
 errorReply(ROUTER *instance, void *router_session, GWBUF *message, DCB *backend_dcb, error_action_t action, bool *succp)
 {
+ROUTER_INSTANCE	*router = (ROUTER_INSTANCE *)instance;
+
       	LOGIF(LE, (skygw_log_write_flush(
-		LOGFILE_ERROR, "Erorr Reply '%s'", message)));
+		LOGFILE_ERROR, "Erorr Reply '%s', attempting reconnect to master",
+			message)));
 	*succp = false;
+	blr_master_reconnect(router);
 }

 /** to be inline'd */
--- a/server/modules/routing/binlog/blr_slave.c
+++ b/server/modules/routing/binlog/blr_slave.c
@ -674,9 +674,9 @@ uint8_t	*ptr;
 * needs to put the slave into catchup mode. This prevents the slave taking
 * too much tiem away from the thread that is processing the master events.
 *
- * At the end ofthe burst a fake EPOLLOUT event is added to the poll event
+ * At the end of the burst a fake EPOLLOUT event is added to the poll event
 * queue. This ensures that the slave callback for processing DCB write drain
- * will be called and future catchup requests will be handle on another thread.
+ * will be called and future catchup requests will be handled on another thread.
 *
 * @param	router		The binlog router
 * @param	slave		The slave that is behind
@ -692,6 +692,10 @@ int		written, fd, rval = 1, burst;
 uint8_t		*ptr;
 struct timespec	req;

+extern unsigned long	hkheartbeat;
+unsigned long		beat;
+	beat = hkheartbeat;
+
 	if (large)
 		burst = router->long_burst;
 	else
@ -705,8 +709,19 @@ struct timespec	req;
 		nanosleep(&req, NULL);
 		spinlock_acquire(&slave->catch_lock);
 	}
-	slave->cstate |= (CS_HOLD|CS_BUSY);
+	if (slave->cstate & CS_BUSY)
+	{
+		spinlock_release(&slave->catch_lock);
+if (hkheartbeat - beat > 5) LOGIF(LE, (skygw_log_write(LOGFILE_ERROR,
+	"Long wait in blr_salve_catchup %ld00ms with %s burst, return without write records.\n",
+hkheartbeat - beat, large ? "long" : "short")));
+		return 0;
+	}
+	slave->cstate |= CS_BUSY;
 	spinlock_release(&slave->catch_lock);
+if (hkheartbeat - beat > 5) LOGIF(LE, (skygw_log_write(LOGFILE_ERROR,
+	"Long wait in blr_salve_catchup %ld00ms with %s burst.\n",
+hkheartbeat - beat, large ? "long" : "short")));

 	if ((fd = blr_open_binlog(router, slave->binlogfile)) == -1)
 	{
@ -743,9 +758,6 @@ struct timespec	req;
 				break;
 			}
 		}
-		spinlock_acquire(&slave->catch_lock);
-		slave->cstate |= CS_HOLD;
-		spinlock_release(&slave->catch_lock);
 		written = slave->dcb->func.write(slave->dcb, head);
 		if (written && hdr.event_type != ROTATE_EVENT)
 		{
@ -753,6 +765,9 @@ struct timespec	req;
 		}
 		rval = written;
 		slave->stats.n_events++;
+		spinlock_acquire(&slave->catch_lock);
+		slave->cstate |= CS_HOLD;
+		spinlock_release(&slave->catch_lock);
 	}
 	if (record == NULL)
 		slave->stats.n_failed_read++;
@ -762,13 +777,13 @@ struct timespec	req;

 	if (fd != -1)
 		close(fd);
-	poll_fake_write_event(slave->dcb);
 	if (record)
 	{
 		slave->stats.n_flows++;
 		spinlock_acquire(&slave->catch_lock);
 		slave->cstate |= CS_EXPECTCB;
 		spinlock_release(&slave->catch_lock);
+		poll_fake_write_event(slave->dcb);
 	}
 	else
 	{