From d31e081e450506ebdf0e6887b98d473af56749aa Mon Sep 17 00:00:00 2001 From: MassimilianoPinto Date: Thu, 1 Oct 2015 17:03:48 +0200 Subject: [PATCH] Changed log messages for blr_read_binlog, added slave send error feaure Changed log messages for blr_read_binlog, added slave send error feaure. When blr_read_binlog detects an error an error message with 1236 code is sent to the slave and it will stop the replication to binlogrouter --- server/modules/include/blr.h | 2 +- server/modules/routing/binlog/blr_file.c | 137 +++++++++++---------- server/modules/routing/binlog/blr_master.c | 19 +++ server/modules/routing/binlog/blr_slave.c | 72 ++++++++++- 4 files changed, 158 insertions(+), 72 deletions(-) diff --git a/server/modules/include/blr.h b/server/modules/include/blr.h index 51e967650..d6b3e49b4 100644 --- a/server/modules/include/blr.h +++ b/server/modules/include/blr.h @@ -547,7 +547,7 @@ extern int blr_write_binlog_record(ROUTER_INSTANCE *, REP_HEADER *,uint8_t *); extern int blr_file_rotate(ROUTER_INSTANCE *, char *, uint64_t); extern void blr_file_flush(ROUTER_INSTANCE *); extern BLFILE *blr_open_binlog(ROUTER_INSTANCE *, char *); -extern GWBUF *blr_read_binlog(ROUTER_INSTANCE *, BLFILE *, unsigned long, REP_HEADER *); +extern GWBUF *blr_read_binlog(ROUTER_INSTANCE *, BLFILE *, unsigned long, REP_HEADER *, char *); extern void blr_close_binlog(ROUTER_INSTANCE *, BLFILE *); extern unsigned long blr_file_size(BLFILE *); extern int blr_statistics(ROUTER_INSTANCE *, ROUTER_SLAVE *, GWBUF *); diff --git a/server/modules/routing/binlog/blr_file.c b/server/modules/routing/binlog/blr_file.c index 77f73b312..0075f40d7 100644 --- a/server/modules/routing/binlog/blr_file.c +++ b/server/modules/routing/binlog/blr_file.c @@ -395,22 +395,27 @@ BLFILE *file; * @param file File record * @param pos Position of binlog record to read * @param hdr Binlog header to populate + * @param errmsg Allocated BINLOG_ERROR_MSG_LEN bytes message error buffer * @return The binlog record wrapped in a GWBUF structure */ GWBUF * -blr_read_binlog(ROUTER_INSTANCE *router, BLFILE *file, unsigned long pos, REP_HEADER *hdr) +blr_read_binlog(ROUTER_INSTANCE *router, BLFILE *file, unsigned long pos, REP_HEADER *hdr, char *errmsg) { uint8_t hdbuf[BINLOG_EVENT_HDR_LEN]; GWBUF *result; unsigned char *data; -int n; +int n; unsigned long filelen = 0; struct stat statb; memset(&hdbuf, '\0', BINLOG_EVENT_HDR_LEN); + /* set error indicator */ + hdr->ok = 0xff; + if (!file) { + snprintf(errmsg, BINLOG_ERROR_MSG_LEN, "Invalid file pointer for requested binlog at position %lu", pos); return NULL; } if (fstat(file->fd, &statb) == 0) @@ -418,9 +423,7 @@ struct stat statb; if (pos > filelen) { - LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, - "Attempting to read off the end of the binlog file %s, " - "event at %lu.", file->binlogname, pos))); + snprintf(errmsg, BINLOG_ERROR_MSG_LEN, "Attempting to read off the end of the binlog file, size %lu, event at %lu", filelen, pos); return NULL; } @@ -429,9 +432,11 @@ struct stat statb; { if (pos > router->binlog_position) { - LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, - "Attempting to read off the binlog pos in file %s, " - "event at %lu.", file->binlogname, pos))); + snprintf(errmsg, BINLOG_ERROR_MSG_LEN, "Attempting to read off the binlog file pos %lu, event at %lu", + router->binlog_position, pos); + } else { + /* accessing last position is ok */ + hdr->ok = 0x0; } return NULL; @@ -446,32 +451,33 @@ struct stat statb; LOGIF(LD, (skygw_log_write(LOGFILE_DEBUG, "Reached end of binlog file at %d.", pos))); + + /* set ok indicator */ + hdr->ok = 0x0; + break; case -1: { char err_msg[BLRM_STRERROR_R_MSG_SIZE+1] = ""; strerror_r(errno, err_msg, BLRM_STRERROR_R_MSG_SIZE); - LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, - "Failed to read binlog file %s at position %d" - " (%s).", file->binlogname, - pos, err_msg))); + + snprintf(errmsg, BINLOG_ERROR_MSG_LEN, "Failed to read binlog file; (%s), event at %lu", + err_msg, pos); + if (errno == EBADF) - LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, - "Bad file descriptor in read binlog for file %s" - ", reference count is %d, descriptor %d.", - file->binlogname, file->refcnt, file->fd))); + snprintf(errmsg, BINLOG_ERROR_MSG_LEN, "Bad file descriptor for binlog file, refcount %d, descriptor %d, event at %lu", + file->refcnt, file->fd, pos); break; } default: - LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, - "Short read when reading the header. " - "Expected 19 bytes but got %d bytes. " - "Binlog file is %s, position %d", - n, file->binlogname, pos))); + snprintf(errmsg, BINLOG_ERROR_MSG_LEN, "Bogus data in log event header; " + "expected %d bytes but read %d, position %lu", + BINLOG_EVENT_HDR_LEN, n, pos); break; } return NULL; } + hdr->timestamp = EXTRACT32(hdbuf); hdr->event_type = hdbuf[4]; hdr->serverid = EXTRACT32(&hdbuf[5]); @@ -479,23 +485,21 @@ struct stat statb; hdr->next_pos = EXTRACT32(&hdbuf[13]); hdr->flags = EXTRACT16(&hdbuf[17]); + /* event pos & size checks */ + if (hdr->next_pos < 0 || hdr->event_size <= 0 || ((hdr->next_pos != (pos + hdr->event_size)) && (hdr->event_type != ROTATE_EVENT))) { + snprintf(errmsg, BINLOG_ERROR_MSG_LEN, "Client requested master to start replication from invalid position %lu", pos); + return NULL; + } + + /* event type checks */ if (router->mariadb10_compat) { if (hdr->event_type > MAX_EVENT_TYPE_MARIADB10) { - LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, - "Invalid MariaDB 10 event type 0x%x. " - "Binlog file is %s, position %d", - hdr->event_type, - file->binlogname, pos))); + snprintf(errmsg, BINLOG_ERROR_MSG_LEN, "Invalid MariaDB 10 event type 0x%x at %lu", hdr->event_type, pos); return NULL; } } else { if (hdr->event_type > MAX_EVENT_TYPE) { - LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, - "Invalid event type 0x%x. " - "Binlog file is %s, position %d", - hdr->event_type, - file->binlogname, pos))); - + snprintf(errmsg, BINLOG_ERROR_MSG_LEN, "Invalid event type 0x%x at %lu", hdr->event_type, pos); return NULL; } } @@ -508,6 +512,7 @@ struct stat statb; "file size is %lu. Master will write %lu in %s next.", pos, file->binlogname, filelen, router->binlog_position, router->binlog_name))); + if ((n = pread(file->fd, hdbuf, BINLOG_EVENT_HDR_LEN, pos)) != BINLOG_EVENT_HDR_LEN) { switch (n) @@ -516,33 +521,34 @@ struct stat statb; LOGIF(LD, (skygw_log_write(LOGFILE_DEBUG, "Reached end of binlog file at %d.", pos))); + + /* set ok indicator */ + hdr->ok = 0x0; + break; case -1: { char err_msg[BLRM_STRERROR_R_MSG_SIZE+1] = ""; strerror_r(errno, err_msg, BLRM_STRERROR_R_MSG_SIZE); - LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, - "Failed to read binlog file %s at position %d" - " (%s).", file->binlogname, - pos, err_msg))); + + snprintf(errmsg, BINLOG_ERROR_MSG_LEN, "Failed to reread header in binlog file; (%s), event at %lu", + err_msg, pos); if (errno == EBADF) - LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, - "Bad file descriptor in read binlog for file %s" - ", reference count is %d, descriptor %d.", - file->binlogname, file->refcnt, file->fd))); + snprintf(errmsg, BINLOG_ERROR_MSG_LEN, "Bad file descriptor rereading header for binlog file, " + "refcount %d, descriptor %d, event at %lu", + file->refcnt, file->fd, pos); break; } default: - LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, - "Short read when reading the header. " - "Expected 19 bytes but got %d bytes. " - "Binlog file is %s, position %d", - file->binlogname, pos, n))); + snprintf(errmsg, BINLOG_ERROR_MSG_LEN, "Bogus data rereading log event header; " + "expected %d bytes but read %d, position %lu", + BINLOG_EVENT_HDR_LEN, n, pos); break; } return NULL; } + hdr->timestamp = EXTRACT32(hdbuf); hdr->event_type = hdbuf[4]; hdr->serverid = EXTRACT32(&hdbuf[5]); @@ -552,9 +558,7 @@ struct stat statb; if (hdr->next_pos < pos && hdr->event_type != ROTATE_EVENT) { - LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, - "Next position still incorrect after " - "rereading"))); + snprintf(errmsg, BINLOG_ERROR_MSG_LEN, "Next event position still incorrect after rereading, event at %lu", pos); return NULL; } else @@ -566,14 +570,15 @@ struct stat statb; } if ((result = gwbuf_alloc(hdr->event_size)) == NULL) { - LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, - "Failed to allocate memory for binlog entry, " - "size %d at %d.", - hdr->event_size, pos))); + snprintf(errmsg, BINLOG_ERROR_MSG_LEN, "Failed to allocate memory for binlog entry, size %d, event at %lu", + hdr->event_size, pos); return NULL; } + data = GWBUF_DATA(result); + memcpy(data, hdbuf, BINLOG_EVENT_HDR_LEN); // Copy the header in + if ((n = pread(file->fd, &data[BINLOG_EVENT_HDR_LEN], hdr->event_size - BINLOG_EVENT_HDR_LEN, pos + BINLOG_EVENT_HDR_LEN)) != hdr->event_size - BINLOG_EVENT_HDR_LEN) // Read the balance { @@ -581,24 +586,22 @@ struct stat statb; { char err_msg[BLRM_STRERROR_R_MSG_SIZE+1] = ""; strerror_r(errno, err_msg, BLRM_STRERROR_R_MSG_SIZE); - LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, - "Error reading the event at %ld in %s. " - "%s, expected %d bytes.", - pos, file->binlogname, - err_msg, hdr->event_size - BINLOG_EVENT_HDR_LEN))); + + snprintf(errmsg, BINLOG_ERROR_MSG_LEN, "Error reading the binlog event at %lu;" + "(%s), expected %d bytes.", + pos, err_msg, hdr->event_size - BINLOG_EVENT_HDR_LEN); } else { - LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, - "Short read when reading the event at %ld in %s. " - "Expected %d bytes got %d bytes.", - pos, file->binlogname, hdr->event_size - BINLOG_EVENT_HDR_LEN, n))); + snprintf(errmsg, BINLOG_ERROR_MSG_LEN, "Bogus data in log event entry; " + "expected %d bytes but got %d, position %lu", + hdr->event_size - BINLOG_EVENT_HDR_LEN, n, pos); + if (filelen != 0 && filelen - pos < hdr->event_size) { - LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, - "Binlog event is close to the end of the binlog file, " - "current file size is %u.", - filelen))); + snprintf(errmsg, BINLOG_ERROR_MSG_LEN, "Binlog event is close to the end of the binlog file; " + "current file size is %lu, event at %lu", + filelen, pos); } blr_log_header(LOGFILE_ERROR, "Possible malformed event header", hdbuf); } @@ -607,6 +610,10 @@ struct stat statb; return NULL; } + + /* set OK indicator */ + hdr->ok = 0x0; + return result; } diff --git a/server/modules/routing/binlog/blr_master.c b/server/modules/routing/binlog/blr_master.c index 1a61469bc..91ba5e370 100644 --- a/server/modules/routing/binlog/blr_master.c +++ b/server/modules/routing/binlog/blr_master.c @@ -1329,6 +1329,25 @@ int n_bufs = -1, pn_bufs = -1; gwbuf_free(record); } + /* Log whether no event has been sent */ + if (pos == router->binlog_position) { + LOGIF(LE,(skygw_log_write(LOGFILE_ERROR, + "No events distributed to slaves for a pending transaction in %s at %lu." + " Last event from master at %lu", + router->binlog_name, + router->binlog_position, + router->current_pos))); + } + if (pos < router->current_pos) { + LOGIF(LE,(skygw_log_write(LOGFILE_ERROR, + "Some events were not distributed to slaves for a pending transaction " + "in %s at %lu. Last distributed even at %lu, last event from master at %lu", + router->binlog_name, + router->binlog_position, + pos, + router->current_pos))); + } + spinlock_acquire(&router->lock); router->binlog_position = router->current_pos; diff --git a/server/modules/routing/binlog/blr_slave.c b/server/modules/routing/binlog/blr_slave.c index e27d150ab..ffddddf7d 100644 --- a/server/modules/routing/binlog/blr_slave.c +++ b/server/modules/routing/binlog/blr_slave.c @@ -1917,6 +1917,9 @@ int written, rval = 1, burst; int rotating = 0; unsigned long burst_size; uint8_t *ptr; +char read_errmsg[BINLOG_ERROR_MSG_LEN+1]; + + read_errmsg[BINLOG_ERROR_MSG_LEN] = '\0'; if (large) burst = router->long_burst; @@ -1937,6 +1940,9 @@ uint8_t *ptr; rotating = router->rotating; if ((slave->file = blr_open_binlog(router, slave->binlogfile)) == NULL) { + char err_msg[BINLOG_ERROR_MSG_LEN+1]; + err_msg[BINLOG_ERROR_MSG_LEN] = '\0'; + if (rotating) { spinlock_acquire(&slave->catch_lock); @@ -1948,10 +1954,18 @@ uint8_t *ptr; } LOGIF(LE, (skygw_log_write( LOGFILE_ERROR, - "blr_slave_catchup failed to open binlog file %s", - slave->binlogfile))); + "Slave %s:%i, server-id %d, binlog '%s': blr_slave_catchup failed to open binlog file", + slave->dcb->remote, slave->port, slave->serverid, + slave->binlogfile))); + slave->cstate &= ~CS_BUSY; slave->state = BLRS_ERRORED; + + snprintf(err_msg, BINLOG_ERROR_MSG_LEN, "Failed to open binlog '%s'", slave->binlogfile); + + /* Send error that stops slave replication */ + blr_send_custom_error(slave->dcb, slave->seqno++, 0, err_msg, "HY000", 1236); + dcb_close(slave->dcb); return 0; } @@ -1959,7 +1973,7 @@ uint8_t *ptr; slave->stats.n_bursts++; while (burst-- && burst_size > 0 && - (record = blr_read_binlog(router, slave->file, slave->binlog_pos, &hdr)) != NULL) + (record = blr_read_binlog(router, slave->file, slave->binlog_pos, &hdr, read_errmsg)) != NULL) { head = gwbuf_alloc(5); ptr = GWBUF_DATA(head); @@ -1983,6 +1997,8 @@ uint8_t *ptr; beat1 = hkheartbeat; if ((slave->file = blr_open_binlog(router, slave->binlogfile)) == NULL) { + char err_msg[BINLOG_ERROR_MSG_LEN+1]; + err_msg[BINLOG_ERROR_MSG_LEN] = '\0'; if (rotating) { spinlock_acquire(&slave->catch_lock); @@ -1994,9 +2010,19 @@ uint8_t *ptr; } LOGIF(LE, (skygw_log_write( LOGFILE_ERROR, - "blr_slave_catchup failed to open binlog file %s", + "Slave %s:%i, server-id %d, binlog '%s': blr_slave_catchup failed to open binlog file in rotate event", + slave->dcb->remote, + slave->port, + slave->serverid, slave->binlogfile))); + slave->state = BLRS_ERRORED; + + snprintf(err_msg, BINLOG_ERROR_MSG_LEN, "Failed to open binlog '%s' in rotate event", slave->binlogfile); + + /* Send error that stops slave replication */ + blr_send_custom_error(slave->dcb, (slave->seqno - 1), 0, err_msg, "HY000", 1236); + dcb_close(slave->dcb); break; } @@ -2019,8 +2045,31 @@ uint8_t *ptr; if (router->send_slave_heartbeat) slave->lastReply = time(0); } - if (record == NULL) + if (record == NULL) { slave->stats.n_failed_read++; + + if (hdr.ok == 0xff) { + LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, + "Slave %s:%i, server-id %d, binlog '%s', blr_read_binlog failure: %s", + slave->dcb->remote, + slave->port, + slave->serverid, + slave->binlogfile, + read_errmsg))); + + spinlock_acquire(&slave->catch_lock); + + slave->state = BLRS_ERRORED; + + spinlock_release(&slave->catch_lock); + + blr_send_custom_error(slave->dcb, slave->seqno++, 0, read_errmsg, "HY000", 1236); + + dcb_close(slave->dcb); + + return 0; + } + } spinlock_acquire(&slave->catch_lock); slave->cstate &= ~CS_BUSY; spinlock_release(&slave->catch_lock); @@ -2289,13 +2338,24 @@ REP_HEADER hdr; GWBUF *record, *head; uint8_t *ptr; uint32_t chksum; +char err_msg[BINLOG_ERROR_MSG_LEN+1]; + + err_msg[BINLOG_ERROR_MSG_LEN] = '\0'; memset(&hdr, 0, BINLOG_EVENT_HDR_LEN); if ((file = blr_open_binlog(router, slave->binlogfile)) == NULL) return; - if ((record = blr_read_binlog(router, file, 4, &hdr)) == NULL) + if ((record = blr_read_binlog(router, file, 4, &hdr, err_msg)) == NULL) { + LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, + "Slave %s:%i, server-id %d, binlog '%s', blr_read_binlog failure: %s", + slave->dcb->remote, + slave->port, + slave->serverid, + slave->binlogfile, + err_msg))); + blr_close_binlog(router, file); return; }