From e8c1c82265a7d2349089ca85885e45b61e66d740 Mon Sep 17 00:00:00 2001 From: MassimilianoPinto Date: Wed, 4 Nov 2015 17:54:41 +0100 Subject: [PATCH 01/11] Removed failed fstat message Removed failed fstat message --- server/modules/routing/binlog/blr_file.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/server/modules/routing/binlog/blr_file.c b/server/modules/routing/binlog/blr_file.c index a7ae413ce..2a28c1ca1 100644 --- a/server/modules/routing/binlog/blr_file.c +++ b/server/modules/routing/binlog/blr_file.c @@ -428,11 +428,6 @@ struct stat statb; } if (fstat(file->fd, &statb) == 0) filelen = statb.st_size; - else - { - snprintf(errmsg, BINLOG_ERROR_MSG_LEN, "Invalid size of binlog file, pos %lu", pos); - return NULL; - } if (pos > filelen) { From 00c506b029c88ca8100bd9a97497952135f319b4 Mon Sep 17 00:00:00 2001 From: MassimilianoPinto Date: Fri, 6 Nov 2015 18:14:52 +0100 Subject: [PATCH 02/11] Fix for rotate and catchup with transaction_safety = 1 Fix for rotate and catchup with transaction_safety = 1 --- server/modules/include/blr.h | 3 +- server/modules/routing/binlog/blr_file.c | 36 +++++++++-- server/modules/routing/binlog/blr_master.c | 72 ++++++++++------------ server/modules/routing/binlog/blr_slave.c | 44 +++++++++++-- 4 files changed, 105 insertions(+), 50 deletions(-) diff --git a/server/modules/include/blr.h b/server/modules/include/blr.h index 53f6525f1..934c315e0 100644 --- a/server/modules/include/blr.h +++ b/server/modules/include/blr.h @@ -181,9 +181,10 @@ #define BLRM_MASTER_REGITRATION_QUERY_LEN 255 /* Read Binlog position states */ -#define SLAVE_POS_READ_OK 0x0 +#define SLAVE_POS_READ_OK 0x00 #define SLAVE_POS_READ_ERR 0xff #define SLAVE_POS_READ_UNSAFE 0xfe + /** * Some useful macros for examining the MySQL Response packets */ diff --git a/server/modules/routing/binlog/blr_file.c b/server/modules/routing/binlog/blr_file.c index 2a28c1ca1..0299a83a4 100644 --- a/server/modules/routing/binlog/blr_file.c +++ b/server/modules/routing/binlog/blr_file.c @@ -208,6 +208,7 @@ unsigned char magic[] = BINLOG_MAGIC; router->current_pos = 4; /* Initial position after the magic number */ router->binlog_position = 4; /* Initial position after the magic number */ router->current_safe_event = 4; + router->last_written = 0; } @@ -426,30 +427,54 @@ struct stat statb; snprintf(errmsg, BINLOG_ERROR_MSG_LEN, "Invalid file pointer for requested binlog at position %lu", pos); return NULL; } + + spinlock_acquire(&file->lock); if (fstat(file->fd, &statb) == 0) filelen = statb.st_size; + else { + if (file->fd == -1) { + hdr->ok = SLAVE_POS_READ_OK; + snprintf(errmsg, BINLOG_ERROR_MSG_LEN, "blr_read_binlog called with invalid file->fd, pos %lu", pos); + spinlock_release(&file->lock); + LOGIF(LE, (skygw_log_write_flush(LOGFILE_ERROR, + "Slave has failed fstat %s", errmsg))); + return NULL; + } + } + spinlock_release(&file->lock); if (pos > filelen) { - snprintf(errmsg, BINLOG_ERROR_MSG_LEN, "Requested position %lu is beyond end of the binlog file '%s', size %lu", pos, file->binlogname, filelen); + snprintf(errmsg, BINLOG_ERROR_MSG_LEN, "Requested position %lu is beyond end of the binlog file '%s', size %lu", + pos, file->binlogname, filelen); return NULL; } + spinlock_acquire(&router->binlog_lock); + spinlock_acquire(&file->lock); + if (strcmp(router->binlog_name, file->binlogname) == 0 && pos >= router->binlog_position) { - if (pos > router->binlog_position) + if (pos > router->binlog_position && !router->rotating) { /* Unsafe position, slave will be disconnected by the calling routine */ - snprintf(errmsg, BINLOG_ERROR_MSG_LEN, "Requested binlog position %lu. Position is unsafe so disconnecting. Latest safe position %lu, end of binlog file %lu", pos, router->binlog_position, router->current_pos); + snprintf(errmsg, BINLOG_ERROR_MSG_LEN, "Requested position %lu is not available. " + "Latest safe position %lu, end of binlog '%s' is %lu", + pos, router->binlog_position, file->binlogname, router->current_pos); hdr->ok = SLAVE_POS_READ_UNSAFE; } else { /* accessing last position is ok */ hdr->ok = SLAVE_POS_READ_OK; } + spinlock_release(&file->lock); + spinlock_release(&router->binlog_lock); + return NULL; } + spinlock_release(&file->lock); + spinlock_release(&router->binlog_lock); /* Read the header information from the file */ if ((n = pread(file->fd, hdbuf, BINLOG_EVENT_HDR_LEN, pos)) != BINLOG_EVENT_HDR_LEN) @@ -960,9 +985,8 @@ int fde_seen = 0; pending_transaction = 0; LOGIF(LE, (skygw_log_write_flush(LOGFILE_ERROR, - "Warning : pending transaction has been found. " - "Setting safe pos to %lu, current pos %lu", - router->binlog_position, router->current_pos))); + "Binlog '%s' ends at position %lu and has an incomplete transaction at %lu. ", + router->binlog_name, router->current_pos, router->binlog_position))); return 0; } else { diff --git a/server/modules/routing/binlog/blr_master.c b/server/modules/routing/binlog/blr_master.c index 3ac8086d5..d13b87613 100644 --- a/server/modules/routing/binlog/blr_master.c +++ b/server/modules/routing/binlog/blr_master.c @@ -96,7 +96,7 @@ void blr_master_close(ROUTER_INSTANCE *); char *blr_extract_column(GWBUF *buf, int col); void blr_cache_response(ROUTER_INSTANCE *router, char *response, GWBUF *buf); void poll_fake_write_event(DCB *dcb); -GWBUF *blr_read_events_from_pos(ROUTER_INSTANCE *router, unsigned long long pos, REP_HEADER *hdr); +GWBUF *blr_read_events_from_pos(ROUTER_INSTANCE *router, unsigned long long pos, REP_HEADER *hdr, unsigned long long pos_end); static void blr_check_last_master_event(void *inst); extern int blr_check_heartbeat(ROUTER_INSTANCE *router); extern char * blr_last_event_description(ROUTER_INSTANCE *router); @@ -1059,16 +1059,13 @@ int n_bufs = -1, pn_bufs = -1; * won't be updated to router->current_pos */ + spinlock_acquire(&router->binlog_lock); if (router->trx_safe == 0 || (router->trx_safe && router->pending_transaction == 0)) { /* no pending transaction: set current_pos to binlog_position */ - - spinlock_acquire(&router->lock); - router->binlog_position = router->current_pos; router->current_safe_event = router->current_pos; - - spinlock_release(&router->lock); } + spinlock_release(&router->binlog_lock); /** * Detect transactions in events @@ -1091,6 +1088,8 @@ int n_bufs = -1, pn_bufs = -1; flags = *(ptr+4+20 + 8 + 4); if (flags == 0) { + spinlock_acquire(&router->binlog_lock); + if (router->pending_transaction > 0) { LOGIF(LE,(skygw_log_write_flush(LOGFILE_ERROR, "Error: a MariaDB 10 transaction " @@ -1104,11 +1103,9 @@ int n_bufs = -1, pn_bufs = -1; // An action should be taken here } - spinlock_acquire(&router->lock); - router->pending_transaction = 1; - spinlock_release(&router->lock); + spinlock_release(&router->binlog_lock); } } } @@ -1127,9 +1124,10 @@ int n_bufs = -1, pn_bufs = -1; statement_sql = calloc(1, statement_len+1); strncpy(statement_sql, (char *)ptr+4+20+4+4+1+2+2+var_block_len+1+db_name_len, statement_len); + spinlock_acquire(&router->binlog_lock); + /* Check for BEGIN (it comes for START TRANSACTION too) */ if (strncmp(statement_sql, "BEGIN", 5) == 0) { - if (router->pending_transaction > 0) { LOGIF(LE,(skygw_log_write_flush(LOGFILE_ERROR, "Error: a transaction is already open " @@ -1140,34 +1138,28 @@ int n_bufs = -1, pn_bufs = -1; // An action should be taken here } - spinlock_acquire(&router->lock); - router->pending_transaction = 1; - - spinlock_release(&router->lock); } /* Check for COMMIT in non transactional store engines */ if (strncmp(statement_sql, "COMMIT", 6) == 0) { - spinlock_acquire(&router->lock); router->pending_transaction = 2; - - spinlock_release(&router->lock); } + spinlock_release(&router->binlog_lock); + free(statement_sql); } /* Check for COMMIT in Transactional engines, i.e InnoDB */ if(hdr.event_type == XID_EVENT) { + spinlock_acquire(&router->binlog_lock); + if (router->pending_transaction) { - spinlock_acquire(&router->lock); - router->pending_transaction = 3; - - spinlock_release(&router->lock); } + spinlock_release(&router->binlog_lock); } } @@ -1281,13 +1273,14 @@ int n_bufs = -1, pn_bufs = -1; * may depend on pending transaction */ + spinlock_acquire(&router->binlog_lock); + if (router->trx_safe == 0 || (router->trx_safe && router->pending_transaction == 0)) { - spinlock_acquire(&router->lock); router->binlog_position = router->current_pos; router->current_safe_event = router->current_pos; - spinlock_release(&router->lock); + spinlock_release(&router->binlog_lock); /* Now distribute events */ blr_distribute_binlog_record(router, &hdr, ptr); @@ -1307,26 +1300,25 @@ int n_bufs = -1, pn_bufs = -1; if (router->pending_transaction > 1) { unsigned long long pos; + unsigned long long end_pos; GWBUF *record; uint8_t *raw_data; REP_HEADER new_hdr; int i=0; - spinlock_acquire(&router->lock); - pos = router->binlog_position; + end_pos = router->current_pos; - spinlock_release(&router->lock); - + spinlock_release(&router->binlog_lock); - while ((record = blr_read_events_from_pos(router, pos, &new_hdr)) != NULL) { + while ((record = blr_read_events_from_pos(router, pos, &new_hdr, end_pos)) != NULL) { i++; raw_data = GWBUF_DATA(record); /* distribute event */ blr_distribute_binlog_record(router, &new_hdr, raw_data); - spinlock_acquire(&router->lock); + spinlock_acquire(&router->binlog_lock); /** The current safe position is only updated * if it points to the event we just distributed. */ @@ -1337,13 +1329,12 @@ int n_bufs = -1, pn_bufs = -1; pos = new_hdr.next_pos; - spinlock_release(&router->lock); + spinlock_release(&router->binlog_lock); gwbuf_free(record); } /* Check whether binlog records has been read in previous loop */ - if (pos < router->current_pos) { char err_message[BINLOG_ERROR_MSG_LEN+1]; @@ -1376,16 +1367,17 @@ int n_bufs = -1, pn_bufs = -1; blr_distribute_error_message(router, err_message, "HY000", 1236); } - spinlock_acquire(&router->lock); + /* update binlog_position and set pending to 0 */ + spinlock_acquire(&router->binlog_lock); router->binlog_position = router->current_pos; router->pending_transaction = 0; - spinlock_release(&router->lock); + spinlock_release(&router->binlog_lock); + } else { + spinlock_release(&router->binlog_lock); } - } - } else { @@ -1654,6 +1646,8 @@ int action; if (action == 1) { + spinlock_acquire(&router->binlog_lock); + slave_event_action_t slave_action = SLAVE_FORCE_CATCHUP; if(router->trx_safe && slave->binlog_pos == router->current_safe_event && @@ -1702,6 +1696,8 @@ int action; hdr->next_pos - hdr->event_size))); } + spinlock_release(&router->binlog_lock); + /* * If slave_action is SLAVE_FORCE_CATCHUP then * the slave is not at the position it should be. Force it into @@ -1738,11 +1734,11 @@ int action; slave->stats.n_bytes += gwbuf_length(pkt); slave->stats.n_events++; slave->dcb->func.write(slave->dcb, pkt); + spinlock_acquire(&slave->catch_lock); if (hdr->event_type != ROTATE_EVENT) { slave->binlog_pos = hdr->next_pos; } - spinlock_acquire(&slave->catch_lock); if (slave->overrun) { slave->stats.n_overrun++; @@ -1908,7 +1904,7 @@ char *rval; * @return The binlog record wrapped in a GWBUF structure */ GWBUF -*blr_read_events_from_pos(ROUTER_INSTANCE *router, unsigned long long pos, REP_HEADER *hdr) { +*blr_read_events_from_pos(ROUTER_INSTANCE *router, unsigned long long pos, REP_HEADER *hdr, unsigned long long pos_end) { unsigned long long end_pos = 0; struct stat statb; uint8_t hdbuf[19]; @@ -1918,7 +1914,7 @@ int n; int event_limit; /* Get current binnlog position */ - end_pos = router->current_pos; + end_pos = pos_end; /* end of file reached, we're done */ if (pos == end_pos) { diff --git a/server/modules/routing/binlog/blr_slave.c b/server/modules/routing/binlog/blr_slave.c index b12b909db..c988531d4 100644 --- a/server/modules/routing/binlog/blr_slave.c +++ b/server/modules/routing/binlog/blr_slave.c @@ -1813,8 +1813,9 @@ uint32_t chksum; LOGIF(LM, (skygw_log_write( LOGFILE_MESSAGE, - "%s: Slave %s, server id %d requested binlog file %s from position %lu", + "%s: Slave %s:%d, server id %d requested binlog file %s from position %lu", router->service->name, slave->dcb->remote, + slave->port, slave->serverid, slave->binlogfile, (unsigned long)slave->binlog_pos))); @@ -2050,8 +2051,7 @@ char read_errmsg[BINLOG_ERROR_MSG_LEN+1]; if (hdr.ok == SLAVE_POS_READ_ERR) { LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, - "%s Slave %s:%i, server-id %d, binlog '%s', %s", - router->service->name, + "Slave %s:%i, server-id %d, binlog '%s', blr_read_binlog failure: %s", slave->dcb->remote, slave->port, slave->serverid, @@ -2076,7 +2076,13 @@ char read_errmsg[BINLOG_ERROR_MSG_LEN+1]; if (hdr.ok == SLAVE_POS_READ_UNSAFE) { - ROUTER_OBJECT *router_obj= router->service->router; + ROUTER_OBJECT *router_obj; + + spinlock_acquire(&router->lock); + + router_obj = router->service->router; + + spinlock_release(&router->lock); LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, "%s: Slave %s:%i, server-id %d, binlog '%s', %s", @@ -2146,7 +2152,7 @@ char read_errmsg[BINLOG_ERROR_MSG_LEN+1]; if (slave->stats.n_caughtup == 1) { LOGIF(LM, (skygw_log_write(LOGFILE_MESSAGE, - "%s: Slave %s:%d, server-id %d is up to date '%s', position %lu.", + "%s: Slave %s:%d, server-id %d is now up to date '%s', position %lu.", router->service->name, slave->dcb->remote, slave->port, @@ -2232,6 +2238,34 @@ ROUTER_INSTANCE *router = slave->router; { if (slave->state == BLRS_DUMPING) { + int do_return; + + spinlock_acquire(&router->binlog_lock); + + do_return = 0; + + /* check for a pending transaction and not rotating */ + if (router->pending_transaction && strcmp(router->binlog_name, slave->binlogfile) == 0 && + (slave->binlog_pos > router->binlog_position) && !router->rotating) { + do_return = 1; + } + + spinlock_release(&router->binlog_lock); + + if (do_return) { + LOGIF(LE, (skygw_log_write_flush(LOGFILE_ERROR, + "Slave %s:%d, server-id %d, binlog '%s', blr_slave_callback is not calling blr_slave_catchup, pos %lu", + slave->dcb->remote, + slave->port, + slave->serverid, + slave->binlogfile, + (unsigned long)slave->binlog_pos))); + + slave->cstate |= CS_EXPECTCB; + poll_fake_write_event(slave->dcb); + return 0; + } + spinlock_acquire(&slave->catch_lock); slave->cstate &= ~(CS_UPTODATE|CS_EXPECTCB); spinlock_release(&slave->catch_lock); From 5e71ff093f9edae7e03bae9c608374df377ccfa2 Mon Sep 17 00:00:00 2001 From: MassimilianoPinto Date: Fri, 6 Nov 2015 18:28:41 +0100 Subject: [PATCH 03/11] Log message updates Log message updates --- server/modules/routing/binlog/blr.c | 4 ++-- server/modules/routing/binlog/blr_file.c | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/server/modules/routing/binlog/blr.c b/server/modules/routing/binlog/blr.c index c2b5e5c1c..0858706c7 100644 --- a/server/modules/routing/binlog/blr.c +++ b/server/modules/routing/binlog/blr.c @@ -912,9 +912,9 @@ ROUTER_SLAVE *slave = (ROUTER_SLAVE *)router_session; if (slave->state > 0) { LOGIF(LM, (skygw_log_write_flush( LOGFILE_MESSAGE, - "%s: Slave %s, server id %d, disconnected after %ld seconds. " + "%s: Slave %s:%d, server id %d, disconnected after %ld seconds. " "%d SQL commands, %d events sent (%lu bytes), binlog '%s', last position %lu", - router->service->name, slave->dcb->remote, + router->service->name, slave->dcb->remote, slave->port, slave->serverid, time(0) - slave->connect_time, slave->stats.n_queries, diff --git a/server/modules/routing/binlog/blr_file.c b/server/modules/routing/binlog/blr_file.c index 0299a83a4..3310c92e7 100644 --- a/server/modules/routing/binlog/blr_file.c +++ b/server/modules/routing/binlog/blr_file.c @@ -459,9 +459,10 @@ struct stat statb; if (pos > router->binlog_position && !router->rotating) { /* Unsafe position, slave will be disconnected by the calling routine */ - snprintf(errmsg, BINLOG_ERROR_MSG_LEN, "Requested position %lu is not available. " - "Latest safe position %lu, end of binlog '%s' is %lu", - pos, router->binlog_position, file->binlogname, router->current_pos); + snprintf(errmsg, BINLOG_ERROR_MSG_LEN, "Requested binlog position %lu. Position is unsafe so disconnecting. " + "Latest safe position %lu, end of binlog file %lu", + pos, router->binlog_position, router->current_pos); + hdr->ok = SLAVE_POS_READ_UNSAFE; } else { /* accessing last position is ok */ From e7dbc605af2536a11dc9adc1980e41d7579df431 Mon Sep 17 00:00:00 2001 From: MassimilianoPinto Date: Mon, 9 Nov 2015 09:59:36 +0100 Subject: [PATCH 04/11] Spinlock added for slave state set Spinlock added for slave state set --- server/modules/routing/binlog/blr_slave.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/server/modules/routing/binlog/blr_slave.c b/server/modules/routing/binlog/blr_slave.c index c988531d4..b1a9a50e3 100644 --- a/server/modules/routing/binlog/blr_slave.c +++ b/server/modules/routing/binlog/blr_slave.c @@ -2261,9 +2261,12 @@ ROUTER_INSTANCE *router = slave->router; slave->binlogfile, (unsigned long)slave->binlog_pos))); + spinlock_acquire(&slave->catch_lock); slave->cstate |= CS_EXPECTCB; + spinlock_release(&slave->catch_lock); poll_fake_write_event(slave->dcb); - return 0; + + return 0; } spinlock_acquire(&slave->catch_lock); From 23b3c9d2b201fa0f01ed35c5f30ec4456e8152c6 Mon Sep 17 00:00:00 2001 From: MassimilianoPinto Date: Mon, 9 Nov 2015 11:53:27 +0100 Subject: [PATCH 05/11] Log message removed Log message removed --- server/modules/routing/binlog/blr_slave.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/server/modules/routing/binlog/blr_slave.c b/server/modules/routing/binlog/blr_slave.c index b1a9a50e3..12dba3b86 100644 --- a/server/modules/routing/binlog/blr_slave.c +++ b/server/modules/routing/binlog/blr_slave.c @@ -2253,19 +2253,10 @@ ROUTER_INSTANCE *router = slave->router; spinlock_release(&router->binlog_lock); if (do_return) { - LOGIF(LE, (skygw_log_write_flush(LOGFILE_ERROR, - "Slave %s:%d, server-id %d, binlog '%s', blr_slave_callback is not calling blr_slave_catchup, pos %lu", - slave->dcb->remote, - slave->port, - slave->serverid, - slave->binlogfile, - (unsigned long)slave->binlog_pos))); - spinlock_acquire(&slave->catch_lock); slave->cstate |= CS_EXPECTCB; spinlock_release(&slave->catch_lock); poll_fake_write_event(slave->dcb); - return 0; } From 8395bad394c5fe564f84d7f50179492ce5cf20e9 Mon Sep 17 00:00:00 2001 From: MassimilianoPinto Date: Wed, 11 Nov 2015 12:19:58 +0100 Subject: [PATCH 06/11] Added slave remote port in log messages Added slave remote port in log messages --- server/modules/routing/binlog/blr.c | 2 +- server/modules/routing/binlog/blr_slave.c | 17 +++++++++-------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/server/modules/routing/binlog/blr.c b/server/modules/routing/binlog/blr.c index 0858706c7..866019d5d 100644 --- a/server/modules/routing/binlog/blr.c +++ b/server/modules/routing/binlog/blr.c @@ -914,7 +914,7 @@ ROUTER_SLAVE *slave = (ROUTER_SLAVE *)router_session; LOGFILE_MESSAGE, "%s: Slave %s:%d, server id %d, disconnected after %ld seconds. " "%d SQL commands, %d events sent (%lu bytes), binlog '%s', last position %lu", - router->service->name, slave->dcb->remote, slave->port, + router->service->name, slave->dcb->remote, ntohs((slave->dcb->ipv4).sin_port), slave->serverid, time(0) - slave->connect_time, slave->stats.n_queries, diff --git a/server/modules/routing/binlog/blr_slave.c b/server/modules/routing/binlog/blr_slave.c index 12dba3b86..9eac1e6c5 100644 --- a/server/modules/routing/binlog/blr_slave.c +++ b/server/modules/routing/binlog/blr_slave.c @@ -1815,7 +1815,7 @@ uint32_t chksum; LOGFILE_MESSAGE, "%s: Slave %s:%d, server id %d requested binlog file %s from position %lu", router->service->name, slave->dcb->remote, - slave->port, + ntohs((slave->dcb->ipv4).sin_port), slave->serverid, slave->binlogfile, (unsigned long)slave->binlog_pos))); @@ -1956,7 +1956,7 @@ char read_errmsg[BINLOG_ERROR_MSG_LEN+1]; LOGIF(LE, (skygw_log_write( LOGFILE_ERROR, "Slave %s:%i, server-id %d, binlog '%s': blr_slave_catchup failed to open binlog file", - slave->dcb->remote, slave->port, slave->serverid, + slave->dcb->remote, ntohs((slave->dcb->ipv4).sin_port), slave->serverid, slave->binlogfile))); slave->cstate &= ~CS_BUSY; @@ -2013,7 +2013,7 @@ char read_errmsg[BINLOG_ERROR_MSG_LEN+1]; LOGFILE_ERROR, "Slave %s:%i, server-id %d, binlog '%s': blr_slave_catchup failed to open binlog file in rotate event", slave->dcb->remote, - slave->port, + ntohs((slave->dcb->ipv4).sin_port), slave->serverid, slave->binlogfile))); @@ -2053,7 +2053,7 @@ char read_errmsg[BINLOG_ERROR_MSG_LEN+1]; LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, "Slave %s:%i, server-id %d, binlog '%s', blr_read_binlog failure: %s", slave->dcb->remote, - slave->port, + ntohs((slave->dcb->ipv4).sin_port), slave->serverid, slave->binlogfile, read_errmsg))); @@ -2088,7 +2088,7 @@ char read_errmsg[BINLOG_ERROR_MSG_LEN+1]; "%s: Slave %s:%i, server-id %d, binlog '%s', %s", router->service->name, slave->dcb->remote, - slave->port, + ntohs((slave->dcb->ipv4).sin_port), slave->serverid, slave->binlogfile, read_errmsg))); @@ -2155,7 +2155,7 @@ char read_errmsg[BINLOG_ERROR_MSG_LEN+1]; "%s: Slave %s:%d, server-id %d is now up to date '%s', position %lu.", router->service->name, slave->dcb->remote, - slave->port, + ntohs((slave->dcb->ipv4).sin_port), slave->serverid, slave->binlogfile, (unsigned long)slave->binlog_pos))); } @@ -2165,7 +2165,7 @@ char read_errmsg[BINLOG_ERROR_MSG_LEN+1]; "%s: Slave %s:%d, server-id %d is up to date '%s', position %lu.", router->service->name, slave->dcb->remote, - slave->port, + ntohs((slave->dcb->ipv4).sin_port), slave->serverid, slave->binlogfile, (unsigned long)slave->binlog_pos))); } @@ -2257,6 +2257,7 @@ ROUTER_INSTANCE *router = slave->router; slave->cstate |= CS_EXPECTCB; spinlock_release(&slave->catch_lock); poll_fake_write_event(slave->dcb); + return 0; } @@ -2410,7 +2411,7 @@ char err_msg[BINLOG_ERROR_MSG_LEN+1]; LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, "Slave %s:%i, server-id %d, binlog '%s', blr_read_binlog failure: %s", slave->dcb->remote, - slave->port, + ntohs((slave->dcb->ipv4).sin_port), slave->serverid, slave->binlogfile, err_msg))); From 3346c9c30d6c14004a2cd5d8ed0a7ef30cea1af1 Mon Sep 17 00:00:00 2001 From: MassimilianoPinto Date: Wed, 11 Nov 2015 15:18:20 +0100 Subject: [PATCH 07/11] Removed '' around slave_uuid Removed '' around slave_uuid --- server/modules/routing/binlog/blr_slave.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/server/modules/routing/binlog/blr_slave.c b/server/modules/routing/binlog/blr_slave.c index 9eac1e6c5..649e4e832 100644 --- a/server/modules/routing/binlog/blr_slave.c +++ b/server/modules/routing/binlog/blr_slave.c @@ -688,8 +688,19 @@ extern char *strcasestr(); } else if (strcasecmp(word, "@slave_uuid") == 0) { - if ((word = strtok_r(NULL, sep, &brkb)) != NULL) - slave->uuid = strdup(word); + if ((word = strtok_r(NULL, sep, &brkb)) != NULL) { + int len = strlen(word); + char *word_ptr = word; + if (len) { + if (word[len-1] == '\'') + word[len-1] = '\0'; + if (word[0] == '\'') { + word[0] = '\0'; + word_ptr++; + } + } + slave->uuid = strdup(word_ptr); + } free(query_text); return blr_slave_replay(router, slave, router->saved_master.setslaveuuid); } From a90b336cc4d5f065e332de3b56fc28713d2fc543 Mon Sep 17 00:00:00 2001 From: MassimilianoPinto Date: Mon, 16 Nov 2015 15:55:08 +0100 Subject: [PATCH 08/11] blr_master_reconnect instead of blr_start_master blr_master_reconnect instead of blr_start_master in blr_stop_start_master --- server/modules/routing/binlog/blr_master.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/server/modules/routing/binlog/blr_master.c b/server/modules/routing/binlog/blr_master.c index d13b87613..d27bfd047 100644 --- a/server/modules/routing/binlog/blr_master.c +++ b/server/modules/routing/binlog/blr_master.c @@ -2079,14 +2079,10 @@ blr_stop_start_master(ROUTER_INSTANCE *router) { } router->residual = NULL; - /* Now it is safe to unleash other threads on this router instance */ - router->reconnect_pending = 0; - router->active_logs = 0; - router->master_state = BLRM_UNCONNECTED; spinlock_release(&router->lock); - blr_start_master(router); + blr_master_reconnect(router); } /** From 82dc3ff3c5d8c632b0b11636d0d0aabd37f14ae5 Mon Sep 17 00:00:00 2001 From: MassimilianoPinto Date: Tue, 17 Nov 2015 09:47:21 +0100 Subject: [PATCH 09/11] Update lastReply with heartbeat event Update lastReply when heartbeat event is seen and a transaction is still open --- server/modules/routing/binlog/blr_master.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/server/modules/routing/binlog/blr_master.c b/server/modules/routing/binlog/blr_master.c index d27bfd047..43fa9b9ea 100644 --- a/server/modules/routing/binlog/blr_master.c +++ b/server/modules/routing/binlog/blr_master.c @@ -1225,6 +1225,8 @@ int n_bufs = -1, pn_bufs = -1; router->stats.n_heartbeats++; + if (router->pending_transaction) + router->stats.lastReply = time(0); } else if (hdr.flags != LOG_EVENT_ARTIFICIAL_F) { From 99fdf9cdecca460b77b0fc2143ebac55dc9f96dc Mon Sep 17 00:00:00 2001 From: MassimilianoPinto Date: Thu, 19 Nov 2015 17:54:23 +0100 Subject: [PATCH 10/11] Fixed reference to LOGIF macro Fixed reference to LOGIF macro --- server/modules/routing/binlog/blr.c | 23 +++++----- server/modules/routing/binlog/blr_file.c | 8 ++-- server/modules/routing/binlog/blr_slave.c | 52 ++++++++++------------- 3 files changed, 36 insertions(+), 47 deletions(-) diff --git a/server/modules/routing/binlog/blr.c b/server/modules/routing/binlog/blr.c index 7a2484164..930379c12 100644 --- a/server/modules/routing/binlog/blr.c +++ b/server/modules/routing/binlog/blr.c @@ -866,18 +866,17 @@ ROUTER_SLAVE *slave = (ROUTER_SLAVE *)router_session; atomic_add(&router->stats.n_registered, -1); if (slave->state > 0) { - LOGIF(LM, (skygw_log_write_flush( - LOGFILE_MESSAGE, - "%s: Slave %s:%d, server id %d, disconnected after %ld seconds. " - "%d SQL commands, %d events sent (%lu bytes), binlog '%s', last position %lu", - router->service->name, slave->dcb->remote, ntohs((slave->dcb->ipv4).sin_port), - slave->serverid, - time(0) - slave->connect_time, - slave->stats.n_queries, - slave->stats.n_events, - slave->stats.n_bytes, - slave->binlogfile, - (unsigned long)slave->binlog_pos))); + MXS_NOTICE("%s: Slave %s:%d, server id %d, disconnected after %ld seconds. " + "%d SQL commands, %d events sent (%lu bytes), binlog '%s', " + "last position %lu", + router->service->name, slave->dcb->remote, ntohs((slave->dcb->ipv4).sin_port), + slave->serverid, + time(0) - slave->connect_time, + slave->stats.n_queries, + slave->stats.n_events, + slave->stats.n_bytes, + slave->binlogfile, + (unsigned long)slave->binlog_pos); } else { MXS_NOTICE("%s: Slave %s, server id %d, disconnected after %ld seconds. " "%d SQL commands", diff --git a/server/modules/routing/binlog/blr_file.c b/server/modules/routing/binlog/blr_file.c index 7c2cad30f..e1132fa4f 100644 --- a/server/modules/routing/binlog/blr_file.c +++ b/server/modules/routing/binlog/blr_file.c @@ -425,8 +425,7 @@ struct stat statb; hdr->ok = SLAVE_POS_READ_OK; snprintf(errmsg, BINLOG_ERROR_MSG_LEN, "blr_read_binlog called with invalid file->fd, pos %lu", pos); spinlock_release(&file->lock); - LOGIF(LE, (skygw_log_write_flush(LOGFILE_ERROR, - "Slave has failed fstat %s", errmsg))); + MXS_ERROR("Slave has failed fstat %s", errmsg); return NULL; } } @@ -965,9 +964,8 @@ int fde_seen = 0; router->pending_transaction = 1; pending_transaction = 0; - LOGIF(LE, (skygw_log_write_flush(LOGFILE_ERROR, - "Binlog '%s' ends at position %lu and has an incomplete transaction at %lu. ", - router->binlog_name, router->current_pos, router->binlog_position))); + MXS_ERROR("Binlog '%s' ends at position %lu and has an incomplete transaction at %lu. ", + router->binlog_name, router->current_pos, router->binlog_position); return 0; } else { diff --git a/server/modules/routing/binlog/blr_slave.c b/server/modules/routing/binlog/blr_slave.c index 77ba7e83b..43543ac5d 100644 --- a/server/modules/routing/binlog/blr_slave.c +++ b/server/modules/routing/binlog/blr_slave.c @@ -1792,13 +1792,11 @@ uint32_t chksum; slave->state = BLRS_DUMPING; - LOGIF(LM, (skygw_log_write( - LOGFILE_MESSAGE, - "%s: Slave %s:%d, server id %d requested binlog file %s from position %lu", - router->service->name, slave->dcb->remote, - ntohs((slave->dcb->ipv4).sin_port), - slave->serverid, - slave->binlogfile, (unsigned long)slave->binlog_pos))); + MXS_NOTICE("%s: Slave %s:%d, server id %d requested binlog file %s from position %lu", + router->service->name, slave->dcb->remote, + ntohs((slave->dcb->ipv4).sin_port), + slave->serverid, + slave->binlogfile, (unsigned long)slave->binlog_pos); if (slave->binlog_pos != router->binlog_position || strcmp(slave->binlogfile, router->binlog_name) != 0) @@ -1934,11 +1932,10 @@ char read_errmsg[BINLOG_ERROR_MSG_LEN+1]; poll_fake_write_event(slave->dcb); return rval; } - LOGIF(LE, (skygw_log_write( - LOGFILE_ERROR, - "Slave %s:%i, server-id %d, binlog '%s': blr_slave_catchup failed to open binlog file", + MXS_ERROR("Slave %s:%i, server-id %d, binlog '%s': blr_slave_catchup " + "failed to open binlog file", slave->dcb->remote, ntohs((slave->dcb->ipv4).sin_port), slave->serverid, - slave->binlogfile))); + slave->binlogfile); slave->cstate &= ~CS_BUSY; slave->state = BLRS_ERRORED; @@ -1989,13 +1986,12 @@ char read_errmsg[BINLOG_ERROR_MSG_LEN+1]; poll_fake_write_event(slave->dcb); return rval; } - LOGIF(LE, (skygw_log_write( - LOGFILE_ERROR, - "Slave %s:%i, server-id %d, binlog '%s': blr_slave_catchup failed to open binlog file in rotate event", + MXS_ERROR("Slave %s:%i, server-id %d, binlog '%s': blr_slave_catchup " + "failed to open binlog file in rotate event", slave->dcb->remote, ntohs((slave->dcb->ipv4).sin_port), slave->serverid, - slave->binlogfile))); + slave->binlogfile); slave->state = BLRS_ERRORED; @@ -2029,13 +2025,13 @@ char read_errmsg[BINLOG_ERROR_MSG_LEN+1]; slave->stats.n_failed_read++; if (hdr.ok == SLAVE_POS_READ_ERR) { - LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, - "Slave %s:%i, server-id %d, binlog '%s', blr_read_binlog failure: %s", + MXS_ERROR("%s Slave %s:%i, server-id %d, binlog '%s', %s", + router->service->name, slave->dcb->remote, ntohs((slave->dcb->ipv4).sin_port), slave->serverid, slave->binlogfile, - read_errmsg))); + read_errmsg); spinlock_acquire(&slave->catch_lock); @@ -2063,14 +2059,13 @@ char read_errmsg[BINLOG_ERROR_MSG_LEN+1]; spinlock_release(&router->lock); - LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, - "%s: Slave %s:%i, server-id %d, binlog '%s', %s", + MXS_ERROR("%s: Slave %s:%i, server-id %d, binlog '%s', %s", router->service->name, slave->dcb->remote, ntohs((slave->dcb->ipv4).sin_port), slave->serverid, slave->binlogfile, - read_errmsg))); + read_errmsg); /* * Close the slave session and socket @@ -2133,23 +2128,21 @@ char read_errmsg[BINLOG_ERROR_MSG_LEN+1]; slave->stats.n_caughtup++; if (slave->stats.n_caughtup == 1) { - LOGIF(LM, (skygw_log_write(LOGFILE_MESSAGE, - "%s: Slave %s:%d, server-id %d is now up to date '%s', position %lu.", + MXS_ERROR("%s: Slave %s:%d, server-id %d is now up to date '%s', position %lu.", router->service->name, slave->dcb->remote, ntohs((slave->dcb->ipv4).sin_port), slave->serverid, - slave->binlogfile, (unsigned long)slave->binlog_pos))); + slave->binlogfile, (unsigned long)slave->binlog_pos); } else if ((slave->stats.n_caughtup % 50) == 0) { - LOGIF(LM, (skygw_log_write(LOGFILE_MESSAGE, - "%s: Slave %s:%d, server-id %d is up to date '%s', position %lu.", + MXS_ERROR("%s: Slave %s:%d, server-id %d is up to date '%s', position %lu.", router->service->name, slave->dcb->remote, ntohs((slave->dcb->ipv4).sin_port), slave->serverid, - slave->binlogfile, (unsigned long)slave->binlog_pos))); + slave->binlogfile, (unsigned long)slave->binlog_pos); } } } @@ -2397,13 +2390,12 @@ char err_msg[BINLOG_ERROR_MSG_LEN+1]; if ((record = blr_read_binlog(router, file, 4, &hdr, err_msg)) == NULL) { if (hdr.ok != SLAVE_POS_READ_OK) { - LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, - "Slave %s:%i, server-id %d, binlog '%s', blr_read_binlog failure: %s", + MXS_ERROR("Slave %s:%i, server-id %d, binlog '%s', blr_read_binlog failure: %s", slave->dcb->remote, ntohs((slave->dcb->ipv4).sin_port), slave->serverid, slave->binlogfile, - err_msg))); + err_msg); } blr_close_binlog(router, file); From c167499c7b756340dd0e8fb89602806edf99e7a4 Mon Sep 17 00:00:00 2001 From: Johan Wikman Date: Mon, 23 Nov 2015 09:25:50 +0200 Subject: [PATCH 11/11] Add notice about previous failure to unlock. In blr_slave.c under certain conditions, two locks were not released. That was fixed in another change, and with this change a notice will be logged if that branch is entered. That way it will be possible to find out whether this may have been the cause of earlier lock-ups. --- server/modules/routing/binlog/blr_slave.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/server/modules/routing/binlog/blr_slave.c b/server/modules/routing/binlog/blr_slave.c index 43543ac5d..ae470b050 100644 --- a/server/modules/routing/binlog/blr_slave.c +++ b/server/modules/routing/binlog/blr_slave.c @@ -2117,7 +2117,11 @@ char read_errmsg[BINLOG_ERROR_MSG_LEN+1]; spinlock_release(&slave->catch_lock); spinlock_release(&router->binlog_lock); state_change = 1; - } else { + } + else + { + MXS_NOTICE("Execution entered branch were locks previously were NOT " + "released. Previously this would have caused a lock-up."); spinlock_release(&slave->catch_lock); spinlock_release(&router->binlog_lock); }