diff --git a/build_gateway.inc b/build_gateway.inc index 0e1d6b75f..87ad45398 100644 --- a/build_gateway.inc +++ b/build_gateway.inc @@ -52,3 +52,12 @@ endif # ERRMSG := $(HOME)/usr/share/mysql +# +# Build a binary that produces profile data +# +PROFILE := N + +# +# Build a binary that produces code coverage data +# +GCOV := N diff --git a/makefile.inc b/makefile.inc index f2d93bf84..279cea6a4 100644 --- a/makefile.inc +++ b/makefile.inc @@ -41,3 +41,13 @@ endif ifdef PROF CFLAGS := $(CFLAGS) -DSS_PROF endif + +ifeq "$(PROFILE)" "Y" + CFLAGS += -pg + LDFLAGS += -pg +endif + +ifeq "$(GCOV)" "Y" + CFLAGS += -fprofile-arcs -ftest-coverage + LIBS += -lgcov +endif diff --git a/server/core/Makefile b/server/core/Makefile index 9a807a925..19df8a476 100644 --- a/server/core/Makefile +++ b/server/core/Makefile @@ -34,6 +34,7 @@ # gateway needs mysql client lib, not qc. # 24/07/13 Mark Ridoch Addition of encryption routines # 30/05/14 Mark Ridoch Filter API added +# 29/08/14 Mark Riddoch Added housekeeper include ../../build_gateway.inc @@ -47,17 +48,23 @@ CFLAGS=-c -I/usr/include -I../include -I../modules/include -I../inih \ -I$(LOGPATH) -I$(UTILSPATH) \ -Wall -g -include ../../makefile.inc - LDFLAGS=-rdynamic -L$(LOGPATH) \ -Wl,-rpath,$(DEST)/lib \ -Wl,-rpath,$(LOGPATH) -Wl,-rpath,$(UTILSPATH) \ -Wl,-rpath,$(EMBEDDED_LIB) + +LIBS=-L$(EMBEDDED_LIB) \ + -lmysqld \ + -lz -lm -lcrypt -lcrypto -ldl -laio -lrt -pthread -llog_manager \ + -L../inih/extra -linih -lssl -lstdc++ + +include ../../makefile.inc + SRCS= atomic.c buffer.c spinlock.c gateway.c \ gw_utils.c utils.c dcb.c load_utils.c session.c service.c server.c \ poll.c config.c users.c hashtable.c dbusers.c thread.c gwbitmask.c \ - monitor.c adminusers.c secrets.c filter.c modutil.c + monitor.c adminusers.c secrets.c filter.c modutil.c housekeeper.c HDRS= ../include/atomic.h ../include/buffer.h ../include/dcb.h \ ../include/gw.h ../modules/include/mysql_client_server_protocol.h \ @@ -65,18 +72,13 @@ HDRS= 
../include/atomic.h ../include/buffer.h ../include/dcb.h \ ../include/modules.h ../include/poll.h ../include/config.h \ ../include/users.h ../include/hashtable.h ../include/gwbitmask.h \ ../include/adminusers.h ../include/version.h ../include/maxscale.h \ - ../include/filter.h modutil.h + ../include/filter.h ../include/modutil.h ../include/housekeeper.h OBJ=$(SRCS:.c=.o) KOBJS=maxkeys.o secrets.o utils.o POBJS=maxpasswd.o secrets.o utils.o -LIBS=-L$(EMBEDDED_LIB) \ - -lmysqld \ - -lz -lm -lcrypt -lcrypto -ldl -laio -lrt -pthread -llog_manager \ - -L../inih/extra -linih -lssl -lstdc++ - all: maxscale maxkeys maxpasswd cleantests: diff --git a/server/core/buffer.c b/server/core/buffer.c index 26c2c1439..a61fdc2fb 100644 --- a/server/core/buffer.c +++ b/server/core/buffer.c @@ -32,6 +32,8 @@ * 11/07/13 Mark Riddoch Add reference count mechanism * 16/07/2013 Massimiliano Pinto Added command type to gwbuf struct * 24/06/2014 Mark Riddoch Addition of gwbuf_trim + * 28/08/2014 Mark Riddoch Adition of tail pointer to speed + * the gwbuf_append process * * @endverbatim */ @@ -82,6 +84,7 @@ SHARED_BUF *sbuf; sbuf->refcount = 1; rval->sbuf = sbuf; rval->next = NULL; + rval->tail = rval; rval->gwbuf_type = GWBUF_TYPE_UNDEFINED; rval->command = 0; CHK_GWBUF(rval); @@ -131,6 +134,7 @@ GWBUF *rval; rval->end = buf->end; rval->gwbuf_type = buf->gwbuf_type; rval->next = NULL; + rval->tail = rval; CHK_GWBUF(rval); return rval; } @@ -157,6 +161,7 @@ GWBUF *gwbuf_clone_portion( clonebuf->end = (void *)((char *)clonebuf->start)+length; clonebuf->gwbuf_type = buf->gwbuf_type; /*< clone the type for now */ clonebuf->next = NULL; + clonebuf->tail = clonebuf; CHK_GWBUF(clonebuf); return clonebuf; @@ -233,11 +238,8 @@ GWBUF *ptr = head; if (!head) return tail; CHK_GWBUF(head); - while (ptr->next) - { - ptr = ptr->next; - } - ptr->next = tail; + head->tail->next = tail; + head->tail = tail->tail; return head; } @@ -262,6 +264,7 @@ GWBUF * gwbuf_consume(GWBUF *head, unsigned int length) { 
GWBUF *rval = head; + CHK_GWBUF(head); GWBUF_CONSUME(head, length); CHK_GWBUF(head); @@ -269,8 +272,13 @@ GWBUF *rval = head; if (GWBUF_EMPTY(head)) { rval = head->next; + if (head->next) + head->next->tail = head->tail; gwbuf_free(head); } + + ss_dassert(rval->end > rval->start); + return rval; } @@ -302,6 +310,8 @@ int rval = 0; * buffer has n_bytes or less then it will be freed and * NULL will be returned. * + * This routine assumes the buffer is not part of a chain + * * @param buf The buffer to trim * @param n_bytes The number of bytes to trim off * @return The buffer chain or NULL if buffer has <= n_bytes @@ -309,6 +319,8 @@ int rval = 0; GWBUF * gwbuf_trim(GWBUF *buf, unsigned int n_bytes) { + ss_dassert(buf->next == NULL); + if (GWBUF_LENGTH(buf) <= n_bytes) { gwbuf_consume(buf, GWBUF_LENGTH(buf)); diff --git a/server/core/config.c b/server/core/config.c index 0dfc980dc..fe639bcd7 100644 --- a/server/core/config.c +++ b/server/core/config.c @@ -923,6 +923,15 @@ config_threadcount() return gateway.n_threads; } +static struct { + char *logname; + logfile_id_t logfile; +} lognames[] = { + { "log_messages", LOGFILE_MESSAGE }, + { "log_trace", LOGFILE_TRACE }, + { "log_debug", LOGFILE_DEBUG }, + { NULL, 0 } +}; /** * Configuration handler for items in the global [MaxScale] section * @@ -933,10 +942,20 @@ config_threadcount() static int handle_global_item(const char *name, const char *value) { +int i; if (strcmp(name, "threads") == 0) { gateway.n_threads = atoi(value); } else { - return 0; + for (i = 0; lognames[i].logname; i++) + { + if (strcasecmp(name, lognames[i].logname) == 0) + { + if (atoi(value)) + skygw_log_enable(lognames[i].logfile); + else + skygw_log_disable(lognames[i].logfile); + } + } } return 1; } diff --git a/server/core/dcb.c b/server/core/dcb.c index 382940626..5afaca11c 100644 --- a/server/core/dcb.c +++ b/server/core/dcb.c @@ -89,7 +89,13 @@ static int dcb_null_write(DCB *dcb, GWBUF *buf); static int dcb_null_close(DCB *dcb); static int 
dcb_null_auth(DCB *dcb, SERVER *server, SESSION *session, GWBUF *buf); -DCB* dcb_get_zombies(void) +/** + * Return the pointer to the lsit of zombie DCB's + * + * @return Zombies DCB list + */ +DCB * +dcb_get_zombies(void) { return zombies; } @@ -128,6 +134,12 @@ DCB *rval; spinlock_init(&rval->delayqlock); spinlock_init(&rval->authlock); spinlock_init(&rval->cb_lock); + spinlock_init(&rval->pollinlock); + spinlock_init(&rval->polloutlock); + rval->pollinbusy = 0; + rval->readcheck = 0; + rval->polloutbusy = 0; + rval->writecheck = 0; rval->fd = -1; memset(&rval->stats, 0, sizeof(DCBSTATS)); // Zero the statistics rval->state = DCB_STATE_ALLOC; @@ -376,11 +388,6 @@ DCB_CALLBACK *cb; } spinlock_release(&dcb->cb_lock); - if (dcb->dcb_readqueue) - { - GWBUF* queue = dcb->dcb_readqueue; - while ((queue = gwbuf_consume(queue, GWBUF_LENGTH(queue))) != NULL); - } bitmask_free(&dcb->memdata.bitmask); simple_mutex_done(&dcb->dcb_read_lock); simple_mutex_done(&dcb->dcb_write_lock); @@ -399,7 +406,7 @@ DCB_CALLBACK *cb; * * @param threadid The thread ID of the caller */ -DCB* +DCB * dcb_process_zombies(int threadid) { DCB *ptr, *lptr; @@ -1187,7 +1194,7 @@ printDCB(DCB *dcb) if (dcb->remote) printf("\tConnected to: %s\n", dcb->remote); if (dcb->user) - printf("\tUsername to: %s\n", dcb->user); + printf("\tUsername to: %s\n", dcb->user); if (dcb->writeq) printf("\tQueued write data: %d\n",gwbuf_length(dcb->writeq)); printf("\tStatistics:\n"); @@ -1204,6 +1211,19 @@ printDCB(DCB *dcb) printf("\t\tNo. 
of Low Water Events: %d\n", dcb->stats.n_low_water); } +/** + * Display an entry from the spinlock statistics data + * + * @param dcb The DCB to print to + * @param desc Description of the statistic + * @param value The statistic value + */ +static void +spin_reporter(void *dcb, char *desc, int value) +{ + dcb_printf((DCB *)dcb, "\t\t%-35s %d\n", desc, value); +} + /** * Diagnostic to print all DCB allocated in the system @@ -1233,6 +1253,12 @@ void dprintAllDCBs(DCB *pdcb) DCB *dcb; spinlock_acquire(&dcbspin); +#if SPINLOCK_PROFILE + dcb_printf(pdcb, "DCB List Spinlock Statistics:\n"); + spinlock_stats(&dcbspin, spin_reporter, pdcb); + dcb_printf(pdcb, "Zombie Queue Lock Statistics:\n"); + spinlock_stats(&zombiespin, spin_reporter, pdcb); +#endif dcb = allDCBs; while (dcb) { @@ -1252,12 +1278,16 @@ DCB *dcb; dcb_printf(pdcb, "\tQueued write data: %d\n", gwbuf_length(dcb->writeq)); dcb_printf(pdcb, "\tStatistics:\n"); - dcb_printf(pdcb, "\t\tNo. of Reads: %d\n", dcb->stats.n_reads); - dcb_printf(pdcb, "\t\tNo. of Writes: %d\n", dcb->stats.n_writes); - dcb_printf(pdcb, "\t\tNo. of Buffered Writes: %d\n", dcb->stats.n_buffered); - dcb_printf(pdcb, "\t\tNo. of Accepts: %d\n", dcb->stats.n_accepts); - dcb_printf(pdcb, "\t\tNo. of High Water Events: %d\n", dcb->stats.n_high_water); - dcb_printf(pdcb, "\t\tNo. of Low Water Events: %d\n", dcb->stats.n_low_water); + dcb_printf(pdcb, "\t\tNo. of Reads: %d\n", dcb->stats.n_reads); + dcb_printf(pdcb, "\t\tNo. of Writes: %d\n", dcb->stats.n_writes); + dcb_printf(pdcb, "\t\tNo. of Buffered Writes: %d\n", dcb->stats.n_buffered); + dcb_printf(pdcb, "\t\tNo. of Accepts: %d\n", dcb->stats.n_accepts); + dcb_printf(pdcb, "\t\tNo. of busy polls: %d\n", dcb->stats.n_busypolls); + dcb_printf(pdcb, "\t\tNo. of read rechecks: %d\n", dcb->stats.n_readrechecks); + dcb_printf(pdcb, "\t\tNo. of busy write polls: %d\n", dcb->stats.n_busywrpolls); + dcb_printf(pdcb, "\t\tNo. 
of write rechecks: %d\n", dcb->stats.n_writerechecks); + dcb_printf(pdcb, "\t\tNo. of High Water Events: %d\n", dcb->stats.n_high_water); + dcb_printf(pdcb, "\t\tNo. of Low Water Events: %d\n", dcb->stats.n_low_water); if (dcb->flags & DCBF_CLONE) dcb_printf(pdcb, "\t\tDCB is a clone.\n"); dcb = dcb->next; @@ -1278,20 +1308,20 @@ DCB *dcb; spinlock_acquire(&dcbspin); dcb = allDCBs; dcb_printf(pdcb, "Descriptor Control Blocks\n"); - dcb_printf(pdcb, "------------+----------------------------+----------------------+----------\n"); - dcb_printf(pdcb, " %-10s | %-26s | %-20s | %s\n", + dcb_printf(pdcb, "------------------+----------------------------+--------------------+----------\n"); + dcb_printf(pdcb, " %-16s | %-26s | %-18s | %s\n", "DCB", "State", "Service", "Remote"); - dcb_printf(pdcb, "------------+----------------------------+----------------------+----------\n"); + dcb_printf(pdcb, "------------------+----------------------------+--------------------+----------\n"); while (dcb) { - dcb_printf(pdcb, " %10p | %-26s | %-20s | %s\n", + dcb_printf(pdcb, " %-16p | %-26s | %-18s | %s\n", dcb, gw_dcb_state2string(dcb->state), - (dcb->session->service ? - dcb->session->service->name : ""), + + ((dcb->session && dcb->session->service) ? dcb->session->service->name : ""), (dcb->remote ? 
dcb->remote : "")); dcb = dcb->next; } - dcb_printf(pdcb, "------------+----------------------------+----------------------+----------\n\n"); + dcb_printf(pdcb, "------------------+----------------------------+--------------------+----------\n\n"); spinlock_release(&dcbspin); } @@ -1308,16 +1338,16 @@ DCB *dcb; spinlock_acquire(&dcbspin); dcb = allDCBs; dcb_printf(pdcb, "Client Connections\n"); - dcb_printf(pdcb, "-----------------+------------+----------------------+------------\n"); - dcb_printf(pdcb, " %-15s | %-10s | %-20s | %s\n", + dcb_printf(pdcb, "-----------------+------------------+----------------------+------------\n"); + dcb_printf(pdcb, " %-15s | %-16s | %-20s | %s\n", "Client", "DCB", "Service", "Session"); - dcb_printf(pdcb, "-----------------+------------+----------------------+------------\n"); + dcb_printf(pdcb, "-----------------+------------------+----------------------+------------\n"); while (dcb) { if (dcb_isclient(dcb) && dcb->dcb_role == DCB_ROLE_REQUEST_HANDLER) { - dcb_printf(pdcb, " %-15s | %10p | %-20s | %10p\n", + dcb_printf(pdcb, " %-15s | %16p | %-20s | %10p\n", (dcb->remote ? dcb->remote : ""), dcb, (dcb->session->service ? 
dcb->session->service->name : ""), @@ -1325,7 +1355,7 @@ DCB *dcb; } dcb = dcb->next; } - dcb_printf(pdcb, "-----------------+------------+----------------------+------------\n\n"); + dcb_printf(pdcb, "-----------------+------------------+----------------------+------------\n\n"); spinlock_release(&dcbspin); } @@ -1342,16 +1372,18 @@ dprintDCB(DCB *pdcb, DCB *dcb) dcb_printf(pdcb, "DCB: %p\n", (void *)dcb); dcb_printf(pdcb, "\tDCB state: %s\n", gw_dcb_state2string(dcb->state)); if (dcb->session && dcb->session->service) - dcb_printf(pdcb, "\tService: %s\n", + dcb_printf(pdcb, "\tService: %s\n", dcb->session->service->name); if (dcb->remote) dcb_printf(pdcb, "\tConnected to: %s\n", dcb->remote); if (dcb->user) - dcb_printf(pdcb, "\tUsername: %s\n", + dcb_printf(pdcb, "\tUsername: %s\n", dcb->user); dcb_printf(pdcb, "\tOwning Session: %p\n", dcb->session); if (dcb->writeq) dcb_printf(pdcb, "\tQueued write data: %d\n", gwbuf_length(dcb->writeq)); + if (dcb->delayq) + dcb_printf(pdcb, "\tDelayed write data: %d\n", gwbuf_length(dcb->delayq)); dcb_printf(pdcb, "\tStatistics:\n"); dcb_printf(pdcb, "\t\tNo. of Reads: %d\n", dcb->stats.n_reads); @@ -1361,12 +1393,30 @@ dprintDCB(DCB *pdcb, DCB *dcb) dcb->stats.n_buffered); dcb_printf(pdcb, "\t\tNo. of Accepts: %d\n", dcb->stats.n_accepts); + dcb_printf(pdcb, "\t\tNo. of busy polls: %d\n", dcb->stats.n_busypolls); + dcb_printf(pdcb, "\t\tNo. of read rechecks: %d\n", dcb->stats.n_readrechecks); + dcb_printf(pdcb, "\t\tNo. of busy write polls: %d\n", dcb->stats.n_busywrpolls); + dcb_printf(pdcb, "\t\tNo. of write rechecks: %d\n", dcb->stats.n_writerechecks); dcb_printf(pdcb, "\t\tNo. of High Water Events: %d\n", dcb->stats.n_high_water); dcb_printf(pdcb, "\t\tNo. 
of Low Water Events: %d\n", dcb->stats.n_low_water); if (dcb->flags & DCBF_CLONE) dcb_printf(pdcb, "\t\tDCB is a clone.\n"); +#if SPINLOCK_PROFILE + dcb_printf(pdcb, "\tInitlock Statistics:\n"); + spinlock_stats(&dcb->dcb_initlock, spin_reporter, pdcb); + dcb_printf(pdcb, "\tWrite Queue Lock Statistics:\n"); + spinlock_stats(&dcb->writeqlock, spin_reporter, pdcb); + dcb_printf(pdcb, "\tDelay Queue Lock Statistics:\n"); + spinlock_stats(&dcb->delayqlock, spin_reporter, pdcb); + dcb_printf(pdcb, "\tPollin Lock Statistics:\n"); + spinlock_stats(&dcb->pollinlock, spin_reporter, pdcb); + dcb_printf(pdcb, "\tPollout Lock Statistics:\n"); + spinlock_stats(&dcb->polloutlock, spin_reporter, pdcb); + dcb_printf(pdcb, "\tCallback Lock Statistics:\n"); + spinlock_stats(&dcb->cb_lock, spin_reporter, pdcb); +#endif } /** @@ -1719,10 +1769,7 @@ int gw_write( * @return Non-zero (true) if the callback was added */ int -dcb_add_callback( - DCB *dcb, - DCB_REASON reason, - int (*callback)(struct dcb *, DCB_REASON, void *), void *userdata) +dcb_add_callback(DCB *dcb, DCB_REASON reason, int (*callback)(struct dcb *, DCB_REASON, void *), void *userdata) { DCB_CALLBACK *cb, *ptr; int rval = 1; @@ -1754,7 +1801,10 @@ int rval = 1; return 0; } if (cb->next == NULL) + { cb->next = ptr; + break; + } cb = cb->next; } spinlock_release(&dcb->cb_lock); @@ -1775,7 +1825,7 @@ int rval = 1; * @return Non-zero (true) if the callback was removed */ int -dcb_remove_callback(DCB *dcb, DCB_REASON reason, int (*callback)(struct dcb *, DCB_REASON), void *userdata) +dcb_remove_callback(DCB *dcb, DCB_REASON reason, int (*callback)(struct dcb *, DCB_REASON, void *), void *userdata) { DCB_CALLBACK *cb, *pcb = NULL; int rval = 0; @@ -1868,8 +1918,102 @@ int rval = 0; return rval; } -static DCB* dcb_get_next ( - DCB* dcb) +/** + * Called by the EPOLLIN event. Take care of calling the protocol + * read entry point and managing multiple threads competing for the DCB + * without blocking those threads. 
+ * + * This mechanism does away with the need for a mutex on the EPOLLIN event + * and instead implements a queuing mechanism in which nested events are + * queued on the DCB such that when the thread processing the first event + * returns it will read the queued event and process it. This allows the + * thread that would otherwise have to wait to process the nested event + * to return immediately and process other events. + * @param dcb The DCB that has data available + * @param thread_id The ID of the calling thread, passed to dcb_process_zombies + */ +void +dcb_pollin(DCB *dcb, int thread_id) +{ + + spinlock_acquire(&dcb->pollinlock); + if (dcb->pollinbusy == 0) + { + dcb->pollinbusy = 1; + do { + if (dcb->readcheck) + { + dcb->stats.n_readrechecks++; + dcb_process_zombies(thread_id); + } + dcb->readcheck = 0; + spinlock_release(&dcb->pollinlock); /* pollinlock is not held while the protocol read runs */ + dcb->func.read(dcb); + spinlock_acquire(&dcb->pollinlock); + } while (dcb->readcheck); + dcb->pollinbusy = 0; + } + else + { + dcb->stats.n_busypolls++; + dcb->readcheck = 1; /* seen by the while (dcb->readcheck) test of the thread that set pollinbusy */ + } + spinlock_release(&dcb->pollinlock); +} + + +/** + * Called by the EPOLLOUT event. Take care of calling the protocol + * write_ready entry point and managing multiple threads competing for the DCB + * without blocking those threads. + * + * This mechanism does away with the need for a mutex on the EPOLLOUT event + * and instead implements a queuing mechanism in which nested events are + * queued on the DCB such that when the thread processing the first event + * returns it will read the queued event and process it. This allows the + * thread that would otherwise have to wait to process the nested event + * to return immediately and process other events. 
+ * @param dcb The DCB that is available for writes + * @param thread_id The ID of the calling thread, passed to dcb_process_zombies + */ +void +dcb_pollout(DCB *dcb, int thread_id) +{ + + spinlock_acquire(&dcb->polloutlock); + if (dcb->polloutbusy == 0) + { + dcb->polloutbusy = 1; + do { + if (dcb->writecheck) + { + dcb_process_zombies(thread_id); + dcb->stats.n_writerechecks++; + } + dcb->writecheck = 0; + spinlock_release(&dcb->polloutlock); /* polloutlock is not held while write_ready runs */ + dcb->func.write_ready(dcb); + spinlock_acquire(&dcb->polloutlock); + } while (dcb->writecheck); + dcb->polloutbusy = 0; + } + else + { + dcb->stats.n_busywrpolls++; + dcb->writecheck = 1; /* seen by the while (dcb->writecheck) test of the thread that set polloutbusy */ + } + spinlock_release(&dcb->polloutlock); +} + + +/** + * Get the next DCB in the list of all DCB's + * + * @param dcb The current DCB + * @return The pointer to the next DCB or NULL if this is the last + */ +static DCB * +dcb_get_next (DCB* dcb) { DCB* p; @@ -1903,8 +2047,13 @@ static DCB* dcb_get_next ( return dcb; } -void dcb_call_foreach ( - DCB_REASON reason) +/** + * Call all the callbacks on all DCB's that match the reason given + * + * @param reason The DCB_REASON that triggers the callback + */ +void +dcb_call_foreach(DCB_REASON reason) { switch (reason) { case DCB_REASON_CLOSE: diff --git a/server/core/gateway.c b/server/core/gateway.c index dc174577b..f2f54d6aa 100644 --- a/server/core/gateway.c +++ b/server/core/gateway.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include @@ -1510,6 +1511,12 @@ int main(int argc, char **argv) log_flush_thr = thread_start( log_flush_cb, (void *)&log_flush_timeout_ms); + + /* + * Start the housekeeper thread + */ + hkinit(); + /*< * Start the polling threads, note this is one less than is * configured as the main thread will also poll. diff --git a/server/core/hashtable.c b/server/core/hashtable.c index 4e1a2f4e1..14b1b8002 100644 --- a/server/core/hashtable.c +++ b/server/core/hashtable.c @@ -28,7 +28,7 @@ * and value and to free them. 
* * The hashtable is arrange as a set of linked lists, the number of linked - * lists beign the hashsize as requested by the user. Entries are hashed by + * lists being the hashsize as requested by the user. Entries are hashed by * calling the hash function that is passed in by the user, this is used as * an index into the array of linked lists, usign modulo hashsize. *
diff --git a/server/core/housekeeper.c b/server/core/housekeeper.c
new file mode 100644
index 000000000..6180f24a5
--- /dev/null
+++ b/server/core/housekeeper.c
@@ -0,0 +1,212 @@
+/*
+ * This file is distributed as part of the SkySQL Gateway. It is free
+ * software: you can redistribute it and/or modify it under the terms of the
+ * GNU General Public License as published by the Free Software Foundation,
+ * version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Copyright SkySQL Ab 2014
+ */
+#include
+#include
+#include
+#include
+
+/**
+ * @file housekeeper.c Provide a mechanism to run periodic tasks
+ *
+ * @verbatim
+ * Revision History
+ *
+ * Date Who Description
+ * 29/08/14 Mark Riddoch Initial implementation
+ *
+ * @endverbatim
+ */
+
+/**
+ * List of all tasks that need to be run
+ */
+static HKTASK *tasks = NULL;
+/**
+ * Spinlock to protect the tasks list
+ */
+static SPINLOCK tasklock = SPINLOCK_INIT;
+
+static void hkthread(void *);
+
+/**
+ * Initialise the housekeeper thread
+ */
+void
+hkinit()
+{
+	thread_start(hkthread, NULL);
+}
+
+/**
+ * Add a new task to the housekeepers lists of tasks that should be
+ * run periodically.
+ *
+ * The task will be first run frequency seconds after this call is
+ * made and will then be executed repeatedly every frequency seconds
+ * until the task is removed.
+ *
+ * Task names must be unique. The whole task list, including its final
+ * entry, is searched for the name before the new task is appended.
+ *
+ * A NULL name, a NULL task function or a frequency that is not positive
+ * is rejected. A task whose frequency is zero or negative would always
+ * be due, so hkthread would never get past it to sleep again.
+ *
+ * @param name		The unique name for this housekeeper task
+ * @param taskfn	The function to call for the task
+ * @param data		Data to pass to the task function
+ * @param frequency	How often to run the task, expressed in seconds
+ * @return Return the time in seconds when the task will be first run if the task was added, otherwise 0
+ */
+int
+hktask_add(char *name, void (*taskfn)(void *), void *data, int frequency)
+{
+HKTASK	*task, *ptr;
+time_t	nextdue;
+
+	if (name == NULL || taskfn == NULL || frequency <= 0)
+	{
+		return 0;
+	}
+	if ((task = (HKTASK *)malloc(sizeof(HKTASK))) == NULL)
+	{
+		return 0;
+	}
+	if ((task->name = strdup(name)) == NULL)
+	{
+		free(task);
+		return 0;
+	}
+	task->task = taskfn;
+	task->data = data;
+	task->frequency = frequency;
+	task->nextdue = time(0) + frequency;
+	task->next = NULL;
+	/*
+	 * Remember the due time now: once the task is on the list and the
+	 * lock is released another thread may remove and free it.
+	 */
+	nextdue = task->nextdue;
+	spinlock_acquire(&tasklock);
+	ptr = tasks;
+	while (ptr)
+	{
+		if (strcmp(ptr->name, name) == 0)
+		{
+			spinlock_release(&tasklock);
+			free(task->name);
+			free(task);
+			return 0;
+		}
+		if (ptr->next == NULL)
+			break;		/* ptr is the last entry, append after it */
+		ptr = ptr->next;
+	}
+	if (ptr)
+		ptr->next = task;
+	else
+		tasks = task;
+	spinlock_release(&tasklock);
+
+	return nextdue;
+}
+
+/**
+ * Remove a named task from the housekeepers task list
+ *
+ * @param name		The task name to remove
+ * @return Returns 0 if the task could not be removed
+ */
+int
+hktask_remove(char *name)
+{
+HKTASK	*ptr, *lptr = NULL;
+
+	spinlock_acquire(&tasklock);
+	ptr = tasks;
+	while (ptr && strcmp(ptr->name, name) != 0)
+	{
+		lptr = ptr;
+		ptr = ptr->next;
+	}
+	if (ptr && lptr)
+		lptr->next = ptr->next;
+	else if (ptr)
+		tasks = ptr->next;
+	spinlock_release(&tasklock);
+
+	if (ptr)
+	{
+		free(ptr->name);
+		free(ptr);
+		return 1;
+	}
+	else
+	{
+		return 0;
+	}
+}
+
+
+/**
+ * The housekeeper thread implementation.
+ *
+ * This function is responsible for executing the housekeeper tasks.
+ *
+ * The implementation of the calling of the task functions is such that
+ * the tasks are called without the tasklock spinlock being held. This
+ * allows manipulation of the housekeeper task list during execution of
+ * one of the tasks. The result is that upon completion of a task the
+ * search for tasks to run must restart from the start of the queue.
+ * It is vital that the task->nextdue time is updated before the task
+ * is run.
+ *
+ * @param data	Unused, here to satisfy the thread system
+ */
+void
+hkthread(void *data)
+{
+HKTASK	*ptr;
+time_t	now;
+void	(*taskfn)(void *);
+void	*taskdata;
+
+	for (;;)
+	{
+		thread_millisleep(1000);
+		now = time(0);
+		spinlock_acquire(&tasklock);
+		ptr = tasks;
+		while (ptr)
+		{
+			if (ptr->nextdue <= now)
+			{
+				ptr->nextdue = now + ptr->frequency;
+				taskfn = ptr->task;
+				taskdata = ptr->data;
+				spinlock_release(&tasklock);
+				(*taskfn)(taskdata);
+				spinlock_acquire(&tasklock);
+				ptr = tasks;	/* list may have changed while unlocked, rescan */
+			}
+			else
+				ptr = ptr->next;
+		}
+		spinlock_release(&tasklock);
+	}
+}
diff --git a/server/core/poll.c b/server/core/poll.c index 87d3640f0..3d11b6421 100644 --- a/server/core/poll.c +++ b/server/core/poll.c @@ -28,6 +28,8 @@ #include #include #include +#include +#include extern int lm_enabled_logfiles_bitmask; @@ -41,14 +43,63 @@ extern int lm_enabled_logfiles_bitmask; * 19/06/13 Mark Riddoch Initial implementation * 28/06/13 Mark Riddoch Added poll mask support and DCB * zombie management + * 29/08/14 Mark Riddoch Addition of thread status data, load average + * etc. * * @endverbatim */ static int epoll_fd = -1; /*< The epoll file descriptor */ -static int do_shutdown = 0; /*< Flag the shutdown of the poll subsystem */ +static int do_shutdown = 0; /*< Flag the shutdown of the poll subsystem */ static GWBITMASK poll_mask; static simple_mutex_t epoll_wait_mutex; /*< serializes calls to epoll_wait */ +static int n_waiting = 0; /*< No.
of threads in epoll_wait */ + +/** + * Thread load average, this is the average number of descriptors in each + * poll completion, a value of 1 or less is the ideal. + */ +static double load_average = 0.0; +static int load_samples = 0; +static int load_nfds = 0; +static double current_avg = 0.0; +static double *avg_samples = NULL; +static int next_sample = 0; +static int n_avg_samples; + +/* Thread statistics data */ +static int n_threads; /*< No. of threads */ + +/** + * Internal MaxScale thread states + */ +typedef enum { THREAD_STOPPED, THREAD_IDLE, + THREAD_POLLING, THREAD_PROCESSING, + THREAD_ZPROCESSING } THREAD_STATE; + +/** + * Thread data used to report the current state and activity related to + * a thread + */ +typedef struct { + THREAD_STATE state; /*< Current thread state */ + int n_fds; /*< No. of descriptors thread is processing */ + DCB *cur_dcb; /*< Current DCB being processed */ + uint32_t event; /*< Current event being processed */ +} THREAD_DATA; + +static THREAD_DATA *thread_data = NULL; /*< Status of each thread */ + +/** + * The number of buckets used to gather statistics about how many + * descriptors where processed on each epoll completion. + * + * An array of wakeup counts is created, with the number of descriptors used + * to index that array. Each time a completion occurs the n_fds - 1 value is + * used to index this array and increment the count held there. + * If n_fds - 1 >= MAXFDS then the count at MAXFDS -1 is incremented. + */ +#define MAXNFDS 10 /** * The polling statistics @@ -60,8 +111,20 @@ static struct { int n_hup; /*< Number of hangup events */ int n_accept; /*< Number of accept events */ int n_polls; /*< Number of poll cycles */ + int n_nothreads; /*< Number of times no threads are polling */ + int n_fds[MAXNFDS]; /*< Number of wakeups with particular + n_fds value */ } pollStats; +/** + * How frequently to call the poll_loadav function used to monitor the load + * average of the poll subsystem. 
+ */ +#define POLL_LOAD_FREQ 10 +/** + * Periodic function to collect load data for average calculations + */ +static void poll_loadav(void *); /** * Initialise the polling system we are using for the gateway. @@ -71,6 +134,8 @@ static struct { void poll_init() { +int i; + if (epoll_fd != -1) return; if ((epoll_fd = epoll_create(MAX_EVENTS)) == -1) @@ -80,7 +145,23 @@ poll_init() } memset(&pollStats, 0, sizeof(pollStats)); bitmask_init(&poll_mask); + n_threads = config_threadcount(); + if ((thread_data = + (THREAD_DATA *)malloc(n_threads * sizeof(THREAD_DATA))) != NULL) + { + for (i = 0; i < n_threads; i++) + { + thread_data[i].state = THREAD_STOPPED; + } + } simple_mutex_init(&epoll_wait_mutex, "epoll_wait_mutex"); + + hktask_add("Load Average", poll_loadav, NULL, POLL_LOAD_FREQ); + n_avg_samples = 15 * 60 / POLL_LOAD_FREQ; + avg_samples = (double *)malloc(sizeof(double *) * n_avg_samples); + for (i = 0; i < n_avg_samples; i++) + avg_samples[i] = 0.0; + } /** @@ -100,7 +181,7 @@ poll_add_dcb(DCB *dcb) CHK_DCB(dcb); - ev.events = EPOLLIN | EPOLLOUT | EPOLLET; + ev.events = EPOLLIN | EPOLLOUT | EPOLLRDHUP | EPOLLHUP | EPOLLET; ev.data.ptr = dcb; /*< @@ -245,20 +326,26 @@ return_rc: void poll_waitevents(void *arg) { - struct epoll_event events[MAX_EVENTS]; - int i, nfds; - int thread_id = (int)arg; - bool no_op = false; - static bool process_zombies_only = false; /*< flag for all threads */ - DCB *zombies = NULL; +struct epoll_event events[MAX_EVENTS]; +int i, nfds; +int thread_id = (int)arg; +bool no_op = false; +static bool process_zombies_only = false; /*< flag for all threads */ +DCB *zombies = NULL; /* Add this thread to the bitmask of running polling threads */ bitmask_set(&poll_mask, thread_id); + if (thread_data) + { + thread_data[thread_id].state = THREAD_IDLE; + } while (1) { + atomic_add(&n_waiting, 1); #if BLOCKINGPOLL nfds = epoll_wait(epoll_fd, events, MAX_EVENTS, -1); + atomic_add(&n_waiting, -1); #else /* BLOCKINGPOLL */ if (!no_op) { LOGIF(LD, 
(skygw_log_write( @@ -272,9 +359,14 @@ poll_waitevents(void *arg) #if 0 simple_mutex_lock(&epoll_wait_mutex, TRUE); #endif + if (thread_data) + { + thread_data[thread_id].state = THREAD_POLLING; + } if ((nfds = epoll_wait(epoll_fd, events, MAX_EVENTS, 0)) == -1) { + atomic_add(&n_waiting, -1); int eno = errno; errno = 0; LOGIF(LD, (skygw_log_write( @@ -288,6 +380,7 @@ poll_waitevents(void *arg) } else if (nfds == 0) { + atomic_add(&n_waiting, -1); if (process_zombies_only) { #if 0 simple_mutex_unlock(&epoll_wait_mutex); @@ -310,6 +403,13 @@ poll_waitevents(void *arg) } } } + else + { + atomic_add(&n_waiting, -1); + } + + if (n_waiting == 0) + atomic_add(&pollStats.n_nothreads, 1); #if 0 simple_mutex_unlock(&epoll_wait_mutex); #endif @@ -322,6 +422,20 @@ poll_waitevents(void *arg) pthread_self(), nfds))); atomic_add(&pollStats.n_polls, 1); + if (thread_data) + { + thread_data[thread_id].n_fds = nfds; + thread_data[thread_id].cur_dcb = NULL; + thread_data[thread_id].event = 0; + thread_data[thread_id].state = THREAD_PROCESSING; + } + + pollStats.n_fds[(nfds < MAXNFDS ? 
(nfds - 1) : MAXNFDS - 1)]++; + + load_average = (load_average * load_samples + nfds) + / (load_samples + 1); + atomic_add(&load_samples, 1); + atomic_add(&load_nfds, nfds); for (i = 0; i < nfds; i++) { @@ -329,6 +443,11 @@ poll_waitevents(void *arg) __uint32_t ev = events[i].events; CHK_DCB(dcb); + if (thread_data) + { + thread_data[thread_id].cur_dcb = dcb; + thread_data[thread_id].event = ev; + } #if defined(SS_DEBUG) if (dcb_fake_write_ev[dcb->fd] != 0) { @@ -364,6 +483,7 @@ poll_waitevents(void *arg) eno = gw_getsockerrno(dcb->fd); if (eno == 0) { +#if MUTEX_BLOCK simple_mutex_lock( &dcb->dcb_write_lock, true); @@ -378,6 +498,11 @@ poll_waitevents(void *arg) dcb->dcb_write_active = FALSE; simple_mutex_unlock( &dcb->dcb_write_lock); +#else + atomic_add(&pollStats.n_write, + 1); + dcb_pollout(dcb, thread_id); +#endif } else { LOGIF(LD, (skygw_log_write( LOGFILE_DEBUG, @@ -393,11 +518,13 @@ poll_waitevents(void *arg) } if (ev & EPOLLIN) { +#if MUTEX_BLOCK simple_mutex_lock(&dcb->dcb_read_lock, true); ss_info_dassert(!dcb->dcb_read_active, "Read already active"); dcb->dcb_read_active = TRUE; +#endif if (dcb->state == DCB_STATE_LISTENING) { @@ -421,11 +548,17 @@ poll_waitevents(void *arg) dcb, dcb->fd))); atomic_add(&pollStats.n_read, 1); +#if MUTEX_BLOCK dcb->func.read(dcb); +#else + dcb_pollin(dcb, thread_id); +#endif } +#if MUTEX_BLOCK dcb->dcb_read_active = FALSE; simple_mutex_unlock( &dcb->dcb_read_lock); +#endif } if (ev & EPOLLERR) { @@ -475,10 +608,33 @@ poll_waitevents(void *arg) atomic_add(&pollStats.n_hup, 1); dcb->func.hangup(dcb); } + + if (ev & EPOLLRDHUP) + { + int eno = 0; + eno = gw_getsockerrno(dcb->fd); + + LOGIF(LD, (skygw_log_write( + LOGFILE_DEBUG, + "%lu [poll_waitevents] " + "EPOLLRDHUP on dcb %p, fd %d. 
" + "Errno %d, %s.", + pthread_self(), + dcb, + dcb->fd, + eno, + strerror(eno)))); + atomic_add(&pollStats.n_hup, 1); + dcb->func.hangup(dcb); + } } /*< for */ no_op = FALSE; } process_zombies: + if (thread_data) + { + thread_data[thread_id].state = THREAD_ZPROCESSING; + } zombies = dcb_process_zombies(thread_id); if (zombies == NULL) { @@ -491,9 +647,17 @@ poll_waitevents(void *arg) * Remove the thread from the bitmask of running * polling threads. */ + if (thread_data) + { + thread_data[thread_id].state = THREAD_STOPPED; + } bitmask_clear(&poll_mask, thread_id); return; } + if (thread_data) + { + thread_data[thread_id].state = THREAD_IDLE; + } } /*< while(1) */ } @@ -525,10 +689,194 @@ poll_bitmask() void dprintPollStats(DCB *dcb) { - dcb_printf(dcb, "Number of epoll cycles: %d\n", pollStats.n_polls); - dcb_printf(dcb, "Number of read events: %d\n", pollStats.n_read); - dcb_printf(dcb, "Number of write events: %d\n", pollStats.n_write); - dcb_printf(dcb, "Number of error events: %d\n", pollStats.n_error); - dcb_printf(dcb, "Number of hangup events: %d\n", pollStats.n_hup); - dcb_printf(dcb, "Number of accept events: %d\n", pollStats.n_accept); +int i; + + dcb_printf(dcb, "Number of epoll cycles: %d\n", + pollStats.n_polls); + dcb_printf(dcb, "Number of read events: %d\n", + pollStats.n_read); + dcb_printf(dcb, "Number of write events: %d\n", + pollStats.n_write); + dcb_printf(dcb, "Number of error events: %d\n", + pollStats.n_error); + dcb_printf(dcb, "Number of hangup events: %d\n", + pollStats.n_hup); + dcb_printf(dcb, "Number of accept events: %d\n", + pollStats.n_accept); + dcb_printf(dcb, "Number of times no threads polling: %d\n", + pollStats.n_nothreads); + + dcb_printf(dcb, "No of poll completions with descriptors\n"); + dcb_printf(dcb, "\tNo. of descriptors\tNo. 
of poll completions.\n"); + for (i = 0; i < MAXNFDS - 1; i++) + { + dcb_printf(dcb, "\t%2d\t\t\t%d\n", i + 1, pollStats.n_fds[i]); + } + dcb_printf(dcb, "\t>= %d\t\t\t%d\n", MAXNFDS, + pollStats.n_fds[MAXNFDS-1]); +} + +/** + * Convert an EPOLL event mask into a printable string + * + * @param event The event mask + * @return A string representation, the caller must free the string + */ +static char * +event_to_string(uint32_t event) +{ +char *str; + + str = malloc(22); // 22 is max returned string length + if (str == NULL) + return NULL; + *str = 0; + if (event & EPOLLIN) + { + strcat(str, "IN"); + } + if (event & EPOLLOUT) + { + if (*str) + strcat(str, "|"); + strcat(str, "OUT"); + } + if (event & EPOLLERR) + { + if (*str) + strcat(str, "|"); + strcat(str, "ERR"); + } + if (event & EPOLLHUP) + { + if (*str) + strcat(str, "|"); + strcat(str, "HUP"); + } + if (event & EPOLLRDHUP) + { + if (*str) + strcat(str, "|"); + strcat(str, "RDHUP"); + } + + return str; +} + +/** + * Print the thread status for all the polling threads + * + * @param dcb The DCB to send the thread status data + */ +void +dShowThreads(DCB *dcb) +{ +int i, j, n; +char *state; +double avg1 = 0.0, avg5 = 0.0, avg15 = 0.0; + + + dcb_printf(dcb, "Polling Threads.\n\n"); + dcb_printf(dcb, "Historic Thread Load Average: %.2f.\n", load_average); + dcb_printf(dcb, "Current Thread Load Average: %.2f.\n", current_avg); + + /* Average all the samples to get the 15 minute average */ + for (i = 0; i < n_avg_samples; i++) + avg15 += avg_samples[i]; + avg15 = avg15 / n_avg_samples; + + /* Average the last third of the samples to get the 5 minute average */ + n = 5 * 60 / POLL_LOAD_FREQ; + i = next_sample - (n + 1); + if (i < 0) + i += n_avg_samples; + for (j = i; j < i + n; j++) + avg5 += avg_samples[j % n_avg_samples]; + avg5 = (3 * avg5) / (n_avg_samples); + + /* Average the last 15th of the samples to get the 1 minute average */ + n = 60 / POLL_LOAD_FREQ; + i = next_sample - (n + 1); + if (i < 0) + i += 
n_avg_samples; + for (j = i; j < i + n; j++) + avg1 += avg_samples[j % n_avg_samples]; + avg1 = (15 * avg1) / (n_avg_samples); + + dcb_printf(dcb, "15 Minute Average: %.2f, 5 Minute Average: %.2f, " + "1 Minute Average: %.2f\n\n", avg15, avg5, avg1); + + if (thread_data == NULL) + return; + dcb_printf(dcb, " ID | State | # fds | Descriptor | Event\n"); + dcb_printf(dcb, "----+------------+--------+------------------+---------------\n"); + for (i = 0; i < n_threads; i++) + { + switch (thread_data[i].state) + { + case THREAD_STOPPED: + state = "Stopped"; + break; + case THREAD_IDLE: + state = "Idle"; + break; + case THREAD_POLLING: + state = "Polling"; + break; + case THREAD_PROCESSING: + state = "Processing"; + break; + case THREAD_ZPROCESSING: + state = "Collecting"; + break; + } + if (thread_data[i].state != THREAD_PROCESSING) + dcb_printf(dcb, + " %2d | %-10s | | |\n", + i, state); + else if (thread_data[i].cur_dcb == NULL) + dcb_printf(dcb, + " %2d | %-10s | %6d | |\n", + i, state, thread_data[i].n_fds); + else + { + char *event_string + = event_to_string(thread_data[i].event); + if (event_string == NULL) + event_string = "??"; + dcb_printf(dcb, + " %2d | %-10s | %6d | %-16p | %s\n", + i, state, thread_data[i].n_fds, + thread_data[i].cur_dcb, event_string); + free(event_string); + } + } +} + +/** + * The function used to calculate time based load data. This is called by the + * housekeeper every POLL_LOAD_FREQ seconds. + * + * @param data Argument required by the housekeeper but not used here + */ +static void +poll_loadav(void *data) +{ +static int last_samples = 0, last_nfds = 0; +int new_samples, new_nfds; + + new_samples = load_samples - last_samples; + new_nfds = load_nfds - last_nfds; + last_samples = load_samples; + last_nfds = load_nfds; + + /* POLL_LOAD_FREQ average is... 
*/ + if (new_samples) + current_avg = new_nfds / new_samples; + else + current_avg = 0.0; + avg_samples[next_sample] = current_avg; + next_sample++; + if (next_sample >= n_avg_samples) + next_sample = 0; } diff --git a/server/core/spinlock.c b/server/core/spinlock.c index c859f726e..7b35163f3 100644 --- a/server/core/spinlock.c +++ b/server/core/spinlock.c @@ -40,9 +40,12 @@ void spinlock_init(SPINLOCK *lock) { lock->lock = 0; -#ifdef DEBUG +#ifdef SPINLOCK_PROFILE lock->spins = 0; lock->acquired = 0; + lock->waiting = 0; + lock->max_waiting = 0; + lock->contended = 0; #endif } @@ -54,16 +57,29 @@ spinlock_init(SPINLOCK *lock) void spinlock_acquire(SPINLOCK *lock) { +#ifdef SPINLOCK_PROFILE +int spins = 0; + + atomic_add(&(lock->waiting), 1); +#endif while (atomic_add(&(lock->lock), 1) != 0) { atomic_add(&(lock->lock), -1); -#ifdef DEBUG +#ifdef SPINLOCK_PROFILE atomic_add(&(lock->spins), 1); + spins++; #endif } -#ifdef DEBUG +#ifdef SPINLOCK_PROFILE + if (spins) + { + lock->contended++; + if (lock->maxspins < spins) + lock->maxspins = spins; + } lock->acquired++; lock->owner = THREAD_SHELF(); + atomic_add(&(lock->waiting), -1); #endif } @@ -71,7 +87,7 @@ spinlock_acquire(SPINLOCK *lock) * Acquire a spinlock if it is not already locked. * * @param lock The spinlock to acquire - * @return True ifthe spinlock was acquired, otherwise false + * @return True if the spinlock was acquired, otherwise false */ int spinlock_acquire_nowait(SPINLOCK *lock) @@ -81,7 +97,7 @@ spinlock_acquire_nowait(SPINLOCK *lock) atomic_add(&(lock->lock), -1); return FALSE; } -#ifdef DEBUG +#ifdef SPINLOCK_PROFILE lock->acquired++; lock->owner = THREAD_SHELF(); #endif @@ -96,5 +112,45 @@ spinlock_acquire_nowait(SPINLOCK *lock) void spinlock_release(SPINLOCK *lock) { +#ifdef SPINLOCK_PROFILE + if (lock->waiting > lock->max_waiting) + lock->max_waiting = lock->waiting; +#endif atomic_add(&(lock->lock), -1); } + +/** + * Report statistics on a spinlock. 
This only has an effect if the + * spinlock code has been compiled with the SPINLOCK_PROFILE option set. + * + * NB A callback function is used to return the data rather than + * merely printing to a DCB in order to avoid a dependency on the DCB + * from the spinlock code and also to facilitate other uses of the + * statistics reporting. + * + * @param lock The spinlock to report on + * @param reporter The callback function to pass the statistics to + * @param hdl A handle that is passed to the reporter function + */ +void +spinlock_stats(SPINLOCK *lock, void (*reporter)(void *, char *, int), void *hdl) +{ +#ifdef SPINLOCK_PROFILE + reporter(hdl, "Spinlock acquired", lock->acquired); + if (lock->acquired) + { + reporter(hdl, "Total no. of spins", lock->spins); + reporter(hdl, "Average no. of spins (overall)", + lock->spins / lock->acquired); + if (lock->contended) + reporter(hdl, "Average no. of spins (when contended)", + lock->spins / lock->contended); + reporter(hdl, "Maximum no. of spins", lock->maxspins); + reporter(hdl, "Maximim no. 
of blocked threads", + lock->max_waiting); + reporter(hdl, "Contended locks", lock->contended); + reporter(hdl, "Contention percentage", + (lock->contended * 100) / lock->acquired); + } +#endif +} diff --git a/server/include/buffer.h b/server/include/buffer.h index 9729c538c..ec2e91d01 100644 --- a/server/include/buffer.h +++ b/server/include/buffer.h @@ -83,6 +83,7 @@ typedef struct { */ typedef struct gwbuf { struct gwbuf *next; /*< Next buffer in a linked chain of buffers */ + struct gwbuf *tail; /*< Last buffer in a linked chain of buffers */ void *start; /*< Start of the valid data */ void *end; /*< First byte after the valid data */ SHARED_BUF *sbuf; /*< The shared buffer with the real data */ diff --git a/server/include/dcb.h b/server/include/dcb.h index 4f8be99d4..723acec5d 100644 --- a/server/include/dcb.h +++ b/server/include/dcb.h @@ -53,6 +53,7 @@ struct service; * 07/02/2014 Massimiliano Pinto Added ipv4 data struct into for dcb * 07/05/2014 Mark Riddoch Addition of callback mechanism * 08/05/2014 Mark Riddoch Addition of writeq high and low watermarks + * 27/08/2014 Mark Ridddoch Addition of write event queuing * * @endverbatim */ @@ -107,12 +108,16 @@ typedef struct gw_protocol { * The statitics gathered on a descriptor control block */ typedef struct dcbstats { - int n_reads; /*< Number of reads on this descriptor */ - int n_writes; /*< Number of writes on this descriptor */ - int n_accepts; /*< Number of accepts on this descriptor */ - int n_buffered; /*< Number of buffered writes */ - int n_high_water; /*< Number of crosses of high water mark */ - int n_low_water; /*< Number of crosses of low water mark */ + int n_reads; /*< Number of reads on this descriptor */ + int n_writes; /*< Number of writes on this descriptor */ + int n_accepts; /*< Number of accepts on this descriptor */ + int n_buffered; /*< Number of buffered writes */ + int n_high_water; /*< Number of crosses of high water mark */ + int n_low_water; /*< Number of crosses of low water 
mark */ + int n_busypolls; /*< Number of read polls whiel reading */ + int n_readrechecks; /*< Number of rechecks for reads */ + int n_busywrpolls; /*< Number of write polls while writing */ + int n_writerechecks;/*< Number of rechecks for writes */ } DCBSTATS; /** @@ -231,6 +236,13 @@ typedef struct dcb { DCBMM memdata; /**< The data related to DCB memory management */ SPINLOCK cb_lock; /**< The lock for the callbacks linked list */ DCB_CALLBACK *callbacks; /**< The list of callbacks for the DCB */ + SPINLOCK pollinlock; + int pollinbusy; + int readcheck; + + SPINLOCK polloutlock; + int polloutbusy; + int writecheck; unsigned int high_water; /**< High water mark */ unsigned int low_water; /**< Low water mark */ @@ -259,6 +271,8 @@ int fail_accept_errno; #define DCB_BELOW_LOW_WATER(x) ((x)->low_water && (x)->writeqlen < (x)->low_water) #define DCB_ABOVE_HIGH_WATER(x) ((x)->high_water && (x)->writeqlen > (x)->high_water) +void dcb_pollin(DCB *, int); +void dcb_pollout(DCB *, int); DCB *dcb_get_zombies(void); int gw_write( #if defined(SS_DEBUG) @@ -289,7 +303,7 @@ void dcb_hashtable_stats(DCB *, void *); /**< Print statisitics */ void dcb_add_to_zombieslist(DCB* dcb); int dcb_add_callback(DCB *, DCB_REASON, int (*)(struct dcb *, DCB_REASON, void *), void *); -int dcb_remove_callback(DCB *, DCB_REASON, int (*)(struct dcb *, DCB_REASON), +int dcb_remove_callback(DCB *, DCB_REASON, int (*)(struct dcb *, DCB_REASON, void *), void *); int dcb_isvalid(DCB *); /* Check the DCB is in the linked list */ diff --git a/server/include/housekeeper.h b/server/include/housekeeper.h new file mode 100644 index 000000000..597f19a91 --- /dev/null +++ b/server/include/housekeeper.h @@ -0,0 +1,50 @@ +#ifndef _HOUSEKEEPER_H +#define _HOUSEKEEPER_H +/* + * This file is distributed as part of the SkySQL Gateway. It is free + * software: you can redistribute it and/or modify it under the terms of the + * GNU General Public License as published by the Free Software Foundation, + * version 2. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Copyright SkySQL Ab 2014 + */ +#include + +/** + * @file housekeeper.h A mechanism to have task run periodically + * + * @verbatim + * Revision History + * + * Date Who Description + * 29/08/14 Mark Riddoch Initial implementation + * + * @endverbatim + */ + +/** + * The housekeeper task list + */ +typedef struct hktask { + char *name; /*< A simple task name */ + void (*task)(void *data); /*< The task to call */ + void *data; /*< Data to pass the task */ + int frequency; /*< How often to call the tasks (seconds) */ + time_t nextdue; /*< When the task should be next run */ + struct hktask + *next; /*< Next task in the list */ +} HKTASK; + +extern void hkinit(); +extern int hktask_add(char *name, void (*task)(void *), void *data, int frequency); +extern int hktask_remove(char *name); +#endif diff --git a/server/include/poll.h b/server/include/poll.h index e19be9c94..6524f1bbb 100644 --- a/server/include/poll.h +++ b/server/include/poll.h @@ -41,4 +41,5 @@ extern void poll_waitevents(void *); extern void poll_shutdown(); extern GWBITMASK *poll_bitmask(); extern void dprintPollStats(DCB *); +extern void dShowThreads(DCB *dcb); #endif diff --git a/server/include/spinlock.h b/server/include/spinlock.h index 42f7b5c2e..43192da3f 100644 --- a/server/include/spinlock.h +++ b/server/include/spinlock.h @@ -21,7 +21,7 @@ /** * @file spinlock.h * - * Spinlock implementation for ther gateway. + * Spinlock implementation for MaxScale. 
* * Spinlocks are cheap locks that can be used to protect short code blocks, they are * generally wasteful as any blocked threads will spin, consuming CPU cycles, waiting @@ -31,12 +31,28 @@ #include #include +#define SPINLOCK_PROFILE 1 + +/** + * The spinlock structure. + * + * In normal builds the structure merely contains a lock value which + * is 0 if the spinlock is not taken and greater than zero if it is held. + * + * In builds with the SPINLOCK_PROFILE option set this structure also holds + * a number of profile related fields that count the number of spins, number + * of waiting threads and the number of times the lock has been acquired. + */ typedef struct spinlock { - int lock; -#if DEBUG - int spins; - int acquired; - THREAD owner; + int lock; /*< Is the lock held? */ +#if SPINLOCK_PROFILE + int spins; /*< Number of spins on this lock */ + int maxspins; /*< Max no of spins to acquire lock */ + int acquired; /*< No. of times lock was acquired */ + int waiting; /*< No. of threads acquiring this lock */ + int max_waiting; /*< Max no of threads waiting for lock */ + int contended; /*< No. 
of times acquire was contended */ + THREAD owner; /*< Last owner of this lock */ #endif } SPINLOCK; @@ -47,8 +63,8 @@ typedef struct spinlock { #define FALSE false #endif -#if DEBUG -#define SPINLOCK_INIT { 0, 0, 0, NULL } +#if SPINLOCK_PROFILE +#define SPINLOCK_INIT { 0, 0, 0, 0, 0, 0, 0, 0 } #else #define SPINLOCK_INIT { 0 } #endif @@ -59,4 +75,6 @@ extern void spinlock_init(SPINLOCK *lock); extern void spinlock_acquire(SPINLOCK *lock); extern int spinlock_acquire_nowait(SPINLOCK *lock); extern void spinlock_release(SPINLOCK *lock); +extern void spinlock_stats(SPINLOCK *lock, + void (*reporter)(void *, char *, int), void *hdl); #endif diff --git a/server/modules/include/blr.h b/server/modules/include/blr.h new file mode 100644 index 000000000..f493ec715 --- /dev/null +++ b/server/modules/include/blr.h @@ -0,0 +1,367 @@ +#ifndef _BLR_H +#define _BLR_H +/* + * This file is distributed as part of MaxScale. It is free + * software: you can redistribute it and/or modify it under the terms of the + * GNU General Public License as published by the Free Software Foundation, + * version 2. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Copyright SkySQL Ab 2014 + */ + +/** + * @file blr.h - The binlog router header file + * + * @verbatim + * Revision History + * + * Date Who Description + * 02/04/14 Mark Riddoch Initial implementation + * + * @endverbatim + */ +#include +#include +#include + +#define BINLOG_FNAMELEN 16 +#define BLR_PROTOCOL "MySQLBackend" +#define BINLOG_MAGIC { 0xfe, 0x62, 0x69, 0x6e } +#define BINLOG_NAMEFMT "%s.%06d" +#define BINLOG_NAME_ROOT "mysql-bin" + +/** + * High and Low water marks for the slave dcb. These values can be overriden + * by the router options highwater and lowwater. + */ +#define DEF_LOW_WATER 20000 +#define DEF_HIGH_WATER 300000 + +/** + * Some useful macros for examining the MySQL Response packets + */ +#define MYSQL_RESPONSE_OK(buf) (*((uint8_t *)GWBUF_DATA(buf) + 4) == 0x00) +#define MYSQL_RESPONSE_EOF(buf) (*((uint8_t *)GWBUF_DATA(buf) + 4) == 0xfe) +#define MYSQL_RESPONSE_ERR(buf) (*((uint8_t *)GWBUF_DATA(buf) + 4) == 0xff) +#define MYSQL_ERROR_CODE(buf) (*((uint8_t *)GWBUF_DATA(buf) + 5)) +#define MYSQL_ERROR_MSG(buf) ((uint8_t *)GWBUF_DATA(buf) + 6) +#define MYSQL_COMMAND(buf) (*((uint8_t *)GWBUF_DATA(buf) + 4)) + +/** + * Slave statistics + */ +typedef struct { + int n_events; /*< Number of events sent */ + int n_bursts; /*< Number of bursts sent */ + int n_requests; /*< Number of requests received */ + int n_flows; /*< Number of flow control restarts */ + int n_catchupnr; /*< No. of times catchup resulted in not entering loop */ + int n_alreadyupd; + int n_upd; + int n_cb; + int n_cbna; + int n_dcb; + int n_above; + int n_failed_read; + int n_overrun; + int n_actions[3]; +} SLAVE_STATS; + +/** + * The client session structure used within this router. This represents + * the slaves that are replicating binlogs from MaxScale. 
+ */ +typedef struct router_slave { +#if defined(SS_DEBUG) + skygw_chk_t rses_chk_top; +#endif + DCB *dcb; /*< The slave server DCB */ + int state; /*< The state of this slave */ + int binlog_pos; /*< Binlog position for this slave */ + char binlogfile[BINLOG_FNAMELEN+1]; + /*< Current binlog file for this slave */ + int serverid; /*< Server-id of the slave */ + char *hostname; /*< Hostname of the slave, if known */ + char *user; /*< Username if given */ + char *passwd; /*< Password if given */ + short port; /*< MySQL port */ + int nocrc; /*< Disable CRC */ + int overrun; + uint32_t rank; /*< Replication rank */ + uint8_t seqno; /*< Replication dump sequence no */ + SPINLOCK catch_lock; /*< Event catchup lock */ + unsigned int cstate; /*< Catch up state */ + SPINLOCK rses_lock; /*< Protects rses_deleted */ + pthread_t pthread; + struct router_instance + *router; /*< Pointer to the owning router */ + struct router_slave *next; + SLAVE_STATS stats; /*< Slave statistics */ +#if defined(SS_DEBUG) + skygw_chk_t rses_chk_tail; +#endif +} ROUTER_SLAVE; + + +/** + * The statistics for this router instance + */ +typedef struct { + int n_slaves; /*< Number slave sessions created */ + int n_reads; /*< Number of record reads */ + uint64_t n_binlogs; /*< Number of binlog records from master */ + uint64_t n_binlog_errors;/*< Number of binlog records from master */ + uint64_t n_rotates; /*< Number of binlog rotate events */ + uint64_t n_cachehits; /*< Number of hits on the binlog cache */ + uint64_t n_cachemisses; /*< Number of misses on the binlog cache */ + int n_registered; /*< Number of registered slaves */ + int n_masterstarts; /*< Number of times connection restarted */ + int n_delayedreconnects; + int n_residuals; /*< Number of times residual data was buffered */ + int n_heartbeats; /*< Number of heartbeat messages */ + time_t lastReply; + uint64_t n_fakeevents; /*< Fake events not written to disk */ + uint64_t n_artificial; /*< Artificial events not written to disk */ + 
uint64_t events[0x24]; /*< Per event counters */ +} ROUTER_STATS; + +/** + * Saved responses from the master that will be forwarded to slaves + */ +typedef struct { + GWBUF *server_id; /*< Master server id */ + GWBUF *heartbeat; /*< Heartbeat period */ + GWBUF *chksum1; /*< Binlog checksum 1st response */ + GWBUF *chksum2; /*< Binlog checksum 2nd response */ + GWBUF *gtid_mode; /*< GTID Mode response */ + GWBUF *uuid; /*< Master UUID */ + GWBUF *setslaveuuid; /*< Set Slave UUID */ + GWBUF *setnames; /*< Set NAMES latin1 */ + GWBUF *utf8; /*< Set NAMES utf8 */ + GWBUF *select1; /*< select 1 */ + GWBUF *selectver; /*< select version() */ + uint8_t *fde_event; /*< Format Description Event */ + int fde_len; /*< Length of fde_event */ +} MASTER_RESPONSES; + +/** + * The binlog record structure. This contains the actual packet received from the + * master, the binlog position of the data in the packet, a point to the data and + * the length of the binlog record. + * + * This allows requests for binlog records in the cache to be serviced by simply + * sending the exact same packet as was received by MaxScale from the master. + * Items are written to the backing file as soon as they are received. The binlog + * cache is flushed of old records periodically, releasing the GWBUF's back to the + * free memory pool. + */ +typedef struct { + unsigned long position; /*< binlog record position for this cache entry */ + GWBUF *pkt; /*< The packet received from the master */ + unsigned char *data; /*< Pointer to the data within the packet */ + unsigned int record_len; /*< Binlog record length */ +} BLCACHE_RECORD; + +/** + * The binlog cache. A cache exists for each file that hold cached bin log records. + * Typically the router will hold two binlog caches, one for the current file and one + * for the previous file. 
+ */ +typedef struct { + char filename[BINLOG_FNAMELEN+1]; + BLCACHE_RECORD *first; + BLCACHE_RECORD *current; + int cnt; +} BLCACHE; + + +/** + * The per instance data for the router. + */ +typedef struct router_instance { + SERVICE *service; /*< Pointer to the service using this router */ + ROUTER_SLAVE *slaves; /*< Link list of all the slave connections */ + SPINLOCK lock; /*< Spinlock for the instance data */ + char *uuid; /*< UUID for the router to use w/master */ + int masterid; /*< Server ID of the master */ + int serverid; /*< Server ID to use with master */ + char *user; /*< User name to use with master */ + char *password; /*< Password to use with master */ + char *fileroot; /*< Root of binlog filename */ + DCB *master; /*< DCB for master connection */ + DCB *client; /*< DCB for dummy client */ + SESSION *session; /*< Fake session for master connection */ + unsigned int master_state; /*< State of the master FSM */ + uint8_t lastEventReceived; + GWBUF *residual; /*< Any residual binlog event */ + MASTER_RESPONSES saved_master; /*< Saved master responses */ + char binlog_name[BINLOG_FNAMELEN+1]; + /*< Name of the current binlog file */ + uint64_t binlog_position; + /*< Current binlog position */ + int binlog_fd; /*< File descriptor of the binlog + * file being written + */ + unsigned int low_water; /*< Low water mark for client DCB */ + unsigned int high_water; /*< High water mark for client DCB */ + BLCACHE *cache[2]; + ROUTER_STATS stats; /*< Statistics for this router */ + int active_logs; + int reconnect_pending; + int handling_threads; + struct router_instance + *next; +} ROUTER_INSTANCE; + +/** + * Packet header for replication messages + */ +typedef struct rep_header { + int payload_len; /*< Payload length (24 bits) */ + uint8_t seqno; /*< Response sequence number */ + uint8_t ok; /*< OK Byte from packet */ + uint32_t timestamp; /*< Timestamp - start of binlog record */ + uint8_t event_type; /*< Binlog event type */ + uint32_t serverid; /*< Server id 
of master */ + uint32_t event_size; /*< Size of header, post-header and body */ + uint32_t next_pos; /*< Position of next event */ + uint16_t flags; /*< Event flags */ +} REP_HEADER; + +/** + * State machine for the master to MaxScale replication + */ +#define BLRM_UNCONNECTED 0x0000 +#define BLRM_AUTHENTICATED 0x0001 +#define BLRM_TIMESTAMP 0x0002 +#define BLRM_SERVERID 0x0003 +#define BLRM_HBPERIOD 0x0004 +#define BLRM_CHKSUM1 0x0005 +#define BLRM_CHKSUM2 0x0006 +#define BLRM_GTIDMODE 0x0007 +#define BLRM_MUUID 0x0008 +#define BLRM_SUUID 0x0009 +#define BLRM_LATIN1 0x000A +#define BLRM_UTF8 0x000B +#define BLRM_SELECT1 0x000C +#define BLRM_SELECTVER 0x000D +#define BLRM_REGISTER 0x000E +#define BLRM_BINLOGDUMP 0x000F + +#define BLRM_MAXSTATE 0x000F + +static char *blrm_states[] = { "Unconnected", "Authenticated", "Timestamp retrieval", + "Server ID retrieval", "HeartBeat Period setup", "binlog checksum config", + "binlog checksum rerieval", "GTID Mode retrieval", "Master UUID retrieval", + "Set Slave UUID", "Set Names latin1", "Set Names utf8", "select 1", + "select version()", "Register slave", "Binlog Dump" }; + +#define BLRS_CREATED 0x0000 +#define BLRS_UNREGISTERED 0x0001 +#define BLRS_REGISTERED 0x0002 +#define BLRS_DUMPING 0x0003 + +#define BLRS_MAXSTATE 0x0003 + +static char *blrs_states[] = { "Created", "Unregistered", "Registered", + "Sending binlogs" }; + +/** + * Slave catch-up status + */ +#define CS_READING 0x0001 +#define CS_INNERLOOP 0x0002 +#define CS_UPTODATE 0x0004 +#define CS_EXPECTCB 0x0008 +#define CS_DIST 0x0010 +#define CS_DISTLATCH 0x0020 + +/** + * MySQL protocol OpCodes needed for replication + */ +#define COM_QUIT 0x01 +#define COM_QUERY 0x03 +#define COM_REGISTER_SLAVE 0x15 +#define COM_BINLOG_DUMP 0x12 + +/** + * Binlog event types + */ +#define START_EVENT_V3 0x01 +#define QUERY_EVENT 0x02 +#define STOP_EVENT 0x03 +#define ROTATE_EVENT 0x04 +#define INTVAR_EVENT 0x05 +#define LOAD_EVENT 0x06 +#define SLAVE_EVENT 0x07 +#define 
CREATE_FILE_EVENT 0x08 +#define APPEND_BLOCK_EVENT 0x09 +#define EXEC_LOAD_EVENT 0x0A +#define DELETE_FILE_EVENT 0x0B +#define NEW_LOAD_EVENT 0x0C +#define RAND_EVENT 0x0D +#define USER_VAR_EVENT 0x0E +#define FORMAT_DESCRIPTION_EVENT 0x0F +#define XID_EVENT 0x10 +#define BEGIN_LOAD_QUERY_EVENT 0x11 +#define EXECUTE_LOAD_QUERY_EVENT 0x12 +#define TABLE_MAP_EVENT 0x13 +#define WRITE_ROWS_EVENTv0 0x14 +#define UPDATE_ROWS_EVENTv0 0x15 +#define DELETE_ROWS_EVENTv0 0x16 +#define WRITE_ROWS_EVENTv1 0x17 +#define UPDATE_ROWS_EVENTv1 0x18 +#define DELETE_ROWS_EVENTv1 0x19 +#define INCIDENT_EVENT 0x1A +#define HEARTBEAT_EVENT 0x1B +#define IGNORABLE_EVENT 0x1C +#define ROWS_QUERY_EVENT 0x1D +#define WRITE_ROWS_EVENTv2 0x1E +#define UPDATE_ROWS_EVENTv2 0x1F +#define DELETE_ROWS_EVENTv2 0x20 +#define GTID_EVENT 0x21 +#define ANONYMOUS_GTID_EVENT 0x22 +#define PREVIOUS_GTIDS_EVENT 0x23 + +/** + * Binlog event flags + */ +#define LOG_EVENT_BINLOG_IN_USE_F 0x0001 +#define LOG_EVENT_FORCED_ROTATE_F 0x0002 +#define LOG_EVENT_THREAD_SPECIFIC_F 0x0004 +#define LOG_EVENT_SUPPRESS_USE_F 0x0008 +#define LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F 0x0010 +#define LOG_EVENT_ARTIFICIAL_F 0x0020 +#define LOG_EVENT_RELAY_LOG_F 0x0040 +#define LOG_EVENT_IGNORABLE_F 0x0080 +#define LOG_EVENT_NO_FILTER_F 0x0100 +#define LOG_EVENT_MTS_ISOLATE_F 0x0200 + +/* + * Externals within the router + */ +extern void blr_start_master(ROUTER_INSTANCE *); +extern void blr_master_response(ROUTER_INSTANCE *, GWBUF *); +extern void blr_master_reconnect(ROUTER_INSTANCE *); + +extern int blr_slave_request(ROUTER_INSTANCE *, ROUTER_SLAVE *, GWBUF *); +extern void blr_slave_rotate(ROUTER_SLAVE *slave, uint8_t *ptr); +extern int blr_slave_catchup(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave); +extern void blr_init_cache(ROUTER_INSTANCE *); + +extern void blr_file_init(ROUTER_INSTANCE *); +extern int blr_open_binlog(ROUTER_INSTANCE *, char *); +extern void blr_write_binlog_record(ROUTER_INSTANCE *, REP_HEADER *,uint8_t 
*); +extern void blr_file_rotate(ROUTER_INSTANCE *, char *, uint64_t); +extern void blr_file_flush(ROUTER_INSTANCE *); +extern GWBUF *blr_read_binlog(int, unsigned int, REP_HEADER *); +#endif diff --git a/server/modules/protocol/maxscaled.c b/server/modules/protocol/maxscaled.c index 738c78111..f580764f8 100644 --- a/server/modules/protocol/maxscaled.c +++ b/server/modules/protocol/maxscaled.c @@ -235,6 +235,7 @@ maxscaled_error(DCB *dcb) static int maxscaled_hangup(DCB *dcb) { + dcb_close(dcb); return 0; } @@ -313,9 +314,11 @@ maxscaled_close(DCB *dcb) MAXSCALED *maxscaled = dcb->protocol; if (maxscaled && maxscaled->username) + { free(maxscaled->username); + maxscaled->username = NULL; + } - dcb_close(dcb); return 0; } diff --git a/server/modules/protocol/mysql_backend.c b/server/modules/protocol/mysql_backend.c index ce7b6ef97..50843a66c 100644 --- a/server/modules/protocol/mysql_backend.c +++ b/server/modules/protocol/mysql_backend.c @@ -497,7 +497,7 @@ static int gw_read_backend_event(DCB *dcb) { { if (nbytes_read < 5) { - gwbuf_append(dcb->dcb_readqueue, read_buffer); + dcb->dcb_readqueue = gwbuf_append(dcb->dcb_readqueue, read_buffer); rc = 0; goto return_rc; } diff --git a/server/modules/protocol/mysql_client.c b/server/modules/protocol/mysql_client.c index 6ffe4e56f..f72402847 100644 --- a/server/modules/protocol/mysql_client.c +++ b/server/modules/protocol/mysql_client.c @@ -798,7 +798,7 @@ int gw_read_client_event( } /** succeed */ - if (rc == 1) { + if (rc) { rc = 0; /**< here '0' means success */ } else { GWBUF* errbuf; diff --git a/server/modules/routing/Makefile b/server/modules/routing/Makefile index 8287bdaea..5dba4efae 100644 --- a/server/modules/routing/Makefile +++ b/server/modules/routing/Makefile @@ -51,6 +51,8 @@ MODULES= libdebugcli.so libreadconnroute.so libtestroute.so libcli.so all: $(MODULES) + (cd readwritesplit; make) + (cd binlog; make) libtestroute.so: $(TESTOBJ) $(CC) $(LDFLAGS) $(TESTOBJ) $(LIBS) -o $@ @@ -73,19 +75,23 @@ 
libreadwritesplit.so: clean: $(DEL) $(OBJ) $(MODULES) (cd readwritesplit; touch depend.mk; make clean) + (cd binlog; touch depend.mk; make clean) tags: ctags $(SRCS) $(HDRS) (cd readwritesplit; make tags) + (cd binlog; make tags) depend: @$(DEL) depend.mk cc -M $(CFLAGS) $(SRCS) > depend.mk (cd readwritesplit; touch depend.mk ; make depend) + (cd binlog; touch depend.mk ; make depend) install: $(MODULES) install -D $(MODULES) $(DEST)/modules (cd readwritesplit; make DEST=$(DEST) install) + (cd binlog; make DEST=$(DEST) install) cleantests: $(MAKE) -C readwritesplit/test cleantests diff --git a/server/modules/routing/binlog/Makefile b/server/modules/routing/binlog/Makefile new file mode 100644 index 000000000..6e9282ea1 --- /dev/null +++ b/server/modules/routing/binlog/Makefile @@ -0,0 +1,65 @@ +# This file is distributed as part of the SkySQL Gateway. It is free +# software: you can redistribute it and/or modify it under the terms of the +# GNU General Public License as published by the Free Software Foundation, +# version 2. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +# details. +# +# You should have received a copy of the GNU General Public License along with +# this program; if not, write to the Free Software Foundation, Inc., 51 +# Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+# +# Copyright SkySQL Ab 2013 +# +# Revision History +# Date Who Description +# 2/04/14 Mark Riddoch Initial framework put in place + +include ../../../../build_gateway.inc + +LOGPATH := $(ROOT_PATH)/log_manager +UTILSPATH := $(ROOT_PATH)/utils +QCLASSPATH := $(ROOT_PATH)/query_classifier + +CC=cc +CFLAGS=-c -fPIC -I/usr/include -I../../include -I../../../include \ + -I$(LOGPATH) -I$(UTILSPATH) -I$(QCLASSPATH) \ + $(MYSQL_HEADERS) -Wall -g + +include ../../../../makefile.inc + +LDFLAGS=-shared -L$(LOGPATH) -L$(QCLASSPATH) -L$(EMBEDDED_LIB) \ + -Wl,-rpath,$(DEST)/lib \ + -Wl,-rpath,$(LOGPATH) -Wl,-rpath,$(UTILSPATH) -Wl,-rpath,$(QCLASSPATH) \ + -Wl,-rpath,$(EMBEDDED_LIB) + +SRCS=blr.c blr_master.c blr_cache.c blr_slave.c blr_file.c +OBJ=$(SRCS:.c=.o) +LIBS=-lssl -pthread -llog_manager -lmysqld +MODULES=libbinlogrouter.so + +all: $(MODULES) + +$(MODULES): $(OBJ) + $(CC) $(LDFLAGS) $(OBJ) $(UTILSPATH)/skygw_utils.o $(LIBS) -o $@ + +.c.o: + $(CC) $(CFLAGS) $< -o $@ + +clean: + rm -f $(OBJ) $(MODULES) + +tags: + ctags $(SRCS) $(HDRS) + +depend: + @rm -f depend.mk + cc -M $(CFLAGS) $(SRCS) > depend.mk + +install: $(MODULES) + install -D $(MODULES) $(DEST)/MaxScale/modules + +include depend.mk diff --git a/server/modules/routing/binlog/README b/server/modules/routing/binlog/README new file mode 100644 index 000000000..514b48341 --- /dev/null +++ b/server/modules/routing/binlog/README @@ -0,0 +1,53 @@ +The binlog router is not a "normal" MaxScale router, it is not +designed to be used to route client requests to a database in the +usual proxy fashion. Rather it is designed to allow MaxScale to be +used as a relay server in a MySQL replication environment. + +In this environment MaxScale sits between a master MySQL server and +a set of slave servers. The slave servers execute a change master +to the MaxScale server, otherwise they are configured in exactly +the same way as a normal MySQL slave server. 
+ +The master server configuration is unaltered, it simply sees a +single slave server. + +MaxScale is configured as usual, with a service definition that +references the binlog router. The major configuration option to +consider is the router_options parameter, in the binlog router this +provides the binlog specific configuration parameters. + + uuid= + This is the UUID that MaxScale uses when it connects + to the real master. It will report the master's + UUID to slaves that connect to it. + + server-id= + The server-id that MaxScale uses when it connects + to the real master server. Again it will report + the master's server-id to the slaves that connect + to it. + user= + The user that MaxScale uses to login to the real + master + password= + The password that MaxScale uses to login to the + real master + master-id= + The server-id of the real master. MaxScale should + get this by sending a query, but at the moment it + is in the configuration file for ease of implementation + + +An example binlog service configuration is shown below: + +[Binlog Service] +type=service +router=binlogrouter +servers=master +router_options=uuid=f12fcb7f-b97b-11e3-bc5e-0401152c4c22,server-id=3,user=repl,password=slavepass,master-id=1 +user=maxscale +passwd=Mhu87p2D + +The servers list for a binlog router service should contain just +the master server. In future a list will be given and the monitor +used to determine which server is the current master server. diff --git a/server/modules/routing/binlog/STATUS b/server/modules/routing/binlog/STATUS new file mode 100644 index 000000000..db3a190f5 --- /dev/null +++ b/server/modules/routing/binlog/STATUS @@ -0,0 +1,13 @@ +The binlog router contained here is a prototype implementation and +should not be considered production ready.
+ +The router has been written and tested with MySQL 5.6 as a reference +for the replication behaviour, more investigation and implementation +is likely to be needed in order to use other versions of MySQL, +MariaDB or Percona Server. + +To Do List: + +1. The router does not implement the replication heartbeat mechanism. + +2. Performance measurements have yet to be made. diff --git a/server/modules/routing/binlog/blr.c b/server/modules/routing/binlog/blr.c new file mode 100644 index 000000000..dec20f8b4 --- /dev/null +++ b/server/modules/routing/binlog/blr.c @@ -0,0 +1,770 @@ +/* + * This file is distributed as part of MaxScale. It is free + * software: you can redistribute it and/or modify it under the terms of the + * GNU General Public License as published by the Free Software Foundation, + * version 2. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Copyright SkySQL Ab 2014 + */ + +/** + * @file blr.c - binlog router, allows MaxScale to act as an intermediary for replication + * + * The binlog router is designed to be used in replication environments to + * increase the replication fanout of a master server. It provides a transparent + * mechanism to read the binlog entries for multiple slaves while requiring + * only a single connection to the actual master to support the slaves. + * + * The current prototype implementation is designed to support MySQL 5.6 and has + * a number of limitations. This prototype is merely a proof of concept and + * should not be considered production ready.
+ * + * @verbatim + * Revision History + * + * Date Who Description + * 02/04/2014 Mark Riddoch Initial implementation + * + * @endverbatim + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +extern int lm_enabled_logfiles_bitmask; + +static char *version_str = "V1.0.6"; + +/* The router entry points */ +static ROUTER *createInstance(SERVICE *service, char **options); +static void *newSession(ROUTER *instance, SESSION *session); +static void closeSession(ROUTER *instance, void *router_session); +static void freeSession(ROUTER *instance, void *router_session); +static int routeQuery(ROUTER *instance, void *router_session, GWBUF *queue); +static void diagnostics(ROUTER *instance, DCB *dcb); +static void clientReply( + ROUTER *instance, + void *router_session, + GWBUF *queue, + DCB *backend_dcb); +static void errorReply( + ROUTER *instance, + void *router_session, + GWBUF *message, + DCB *backend_dcb, + error_action_t action, + bool *succp); +static uint8_t getCapabilities (ROUTER* inst, void* router_session); + + +/** The module object definition: the table of entry points handed out by GetModuleObject(). The initialisers are positional, so their order must match the field order of ROUTER_OBJECT. */ +static ROUTER_OBJECT MyObject = { + createInstance, + newSession, + closeSession, + freeSession, + routeQuery, + diagnostics, + clientReply, + errorReply, + getCapabilities +}; + +static bool rses_begin_locked_router_action(ROUTER_SLAVE *); +static void rses_end_locked_router_action(ROUTER_SLAVE *); + +/* instlock is held by createInstance() while it links a new instance into the instances list; ModuleInit() initialises both. */ +static SPINLOCK instlock; +static ROUTER_INSTANCE *instances; + +/** + * Implementation of the mandatory version entry point + * + * @return version string of the module + */ +char * +version() +{ + return version_str; +} + +/** + * The module initialisation routine, called when the module + * is first loaded.
+ */ +void +ModuleInit() +{ + LOGIF(LM, (skygw_log_write( + LOGFILE_MESSAGE, + "Initialise binlog router module %s.\n", version_str))); + spinlock_init(&instlock); + instances = NULL; +} + +/** + * The module entry point routine. It is this routine that + * must populate the structure that is referred to as the + * "module object", this is a structure with the set of + * external entry points for this module. + * + * @return The module object + */ +ROUTER_OBJECT * +GetModuleObject() +{ + return &MyObject; +} + +/** + * Create an instance of the router for a particular service + * within MaxScale. + * + * The process of creating the instance causes the router to register + * with the master server and begin replication of the binlogs from + * the master server to MaxScale. + * + * @param service The service this router is being create for + * @param options An array of options for this query router + * + * @return The instance data for this new instance + */ +static ROUTER * +createInstance(SERVICE *service, char **options) +{ +ROUTER_INSTANCE *inst; +char *value; +int i; + + if ((inst = calloc(1, sizeof(ROUTER_INSTANCE))) == NULL) { + return NULL; + } + + memset(&inst->stats, 0, sizeof(ROUTER_STATS)); + memset(&inst->saved_master, 0, sizeof(MASTER_RESPONSES)); + + inst->service = service; + spinlock_init(&inst->lock); + + inst->low_water = DEF_LOW_WATER; + inst->high_water = DEF_HIGH_WATER; + + /* + * We only support one server behind this router, since the server is + * the master from which we replicate binlog records. Therefore check + * that only one server has been defined. + * + * A later improvement will be to define multiple servers and have the + * router use the information that is supplied by the monitor to find + * which of these servers is currently the master and replicate from + * that server. 
+ */ + if (service->databases == NULL || service->databases->nextdb != NULL) + { + LOGIF(LE, (skygw_log_write( + LOGFILE_ERROR, + "Error : Exactly one database server may be " + "for use with the binlog router."))); + } + + + /* + * Process the options. + * We have an array of attrbute values passed to us that we must + * examine. Supported attributes are: + * uuid= + * server-id= + * user= + * password= + * master-id= + * filestem= + * lowwater= + * highwater= + */ + if (options) + { + for (i = 0; options[i]; i++) + { + if ((value = strchr(options[i], '=')) == NULL) + { + LOGIF(LE, (skygw_log_write( + LOGFILE_ERROR, "Warning : Unsupported router " + "option %s for binlog router.", + options[i]))); + } + else + { + *value = 0; + value++; + if (strcmp(options[i], "uuid") == 0) + { + inst->uuid = strdup(value); + } + else if (strcmp(options[i], "server-id") == 0) + { + inst->serverid = atoi(value); + } + else if (strcmp(options[i], "user") == 0) + { + inst->user = strdup(value); + } + else if (strcmp(options[i], "password") == 0) + { + inst->password = strdup(value); + } + else if (strcmp(options[i], "master-id") == 0) + { + inst->masterid = atoi(value); + } + else if (strcmp(options[i], "filestem") == 0) + { + inst->fileroot = strdup(value); + } + else if (strcmp(options[i], "lowwater") == 0) + { + inst->low_water = atoi(value); + } + else if (strcmp(options[i], "highwater") == 0) + { + inst->high_water = atoi(value); + } + else + { + LOGIF(LE, (skygw_log_write( + LOGFILE_ERROR, + "Warning : Unsupported router " + "option %s for binlog router.", + options[i]))); + } + } + } + if (inst->fileroot == NULL) + inst->fileroot = strdup(BINLOG_NAME_ROOT); + } + + /* + * We have completed the creation of the instance data, so now + * insert this router instance into the linked list of routers + * that have been created with this module. 
+ */ + spinlock_acquire(&instlock); + inst->next = instances; + instances = inst; + spinlock_release(&instlock); + + inst->active_logs = 0; + inst->reconnect_pending = 0; + inst->handling_threads = 0; + inst->residual = NULL; + inst->slaves = NULL; + inst->next = NULL; + + /* + * Initialise the binlog file and position + */ + blr_file_init(inst); + LOGIF(LT, (skygw_log_write( + LOGFILE_TRACE, + "Binlog router: current binlog file is: %s, current position %u\n", + inst->binlog_name, inst->binlog_position))); + + /* + * Initialise the binlog cache for this router instance + */ + blr_init_cache(inst); + + /* + * Now start the replication from the master to MaxScale + */ + blr_start_master(inst); + + return (ROUTER *)inst; +} + +/** + * Associate a new session with this instance of the router. + * + * In the case of the binlog router a new session equates to a new slave + * connecting to MaxScale and requesting binlog records. We need to go + * through the slave registration process for this new slave. 
+ * + * @param instance The router instance data + * @param session The session itself + * @return Session specific data for this session + */ +static void * +newSession(ROUTER *instance, SESSION *session) +{ +ROUTER_INSTANCE *inst = (ROUTER_INSTANCE *)instance; +ROUTER_SLAVE *slave; + + LOGIF(LD, (skygw_log_write_flush( + LOGFILE_DEBUG, + "binlog router: %lu [newSession] new router session with " + "session %p, and inst %p.", + pthread_self(), + session, + inst))); + + + if ((slave = (ROUTER_SLAVE *)calloc(1, sizeof(ROUTER_SLAVE))) == NULL) + { + LOGIF(LD, (skygw_log_write_flush( + LOGFILE_ERROR, + "Insufficient memory to create new slave session for binlog router"))); + return NULL; + } + +#if defined(SS_DEBUG) + slave->rses_chk_top = CHK_NUM_ROUTER_SES; + slave->rses_chk_tail = CHK_NUM_ROUTER_SES; +#endif + + memset(&slave->stats, 0, sizeof(SLAVE_STATS)); + atomic_add(&inst->stats.n_slaves, 1); + slave->state = BLRS_CREATED; /* Set initial state of the slave */ + slave->cstate = 0; + slave->pthread = 0; + slave->overrun = 0; + spinlock_init(&slave->catch_lock); + slave->dcb = session->client; + slave->router = inst; + + /** + * Add this session to the list of active sessions. + */ + spinlock_acquire(&inst->lock); + slave->next = inst->slaves; + inst->slaves = slave; + spinlock_release(&inst->lock); + + CHK_CLIENT_RSES(slave); + + return (void *)slave; +} + +/** + * The session is no longer required. Shutdown all operation and free memory + * associated with this session. In this case a single session is associated + * to a slave of MaxScale. Therefore this is called when that slave is no + * longer active and should remove of reference to that slave, free memory + * and prevent any further forwarding of binlog records to that slave. 
+ * + * Parameters: + * @param router_instance The instance of the router + * @param router_cli_ses The particular session to free + * + */ +static void freeSession( + ROUTER* router_instance, + void* router_client_ses) +{ +ROUTER_INSTANCE *router = (ROUTER_INSTANCE *)router_instance; +ROUTER_SLAVE *slave = (ROUTER_SLAVE *)router_client_ses; +int prev_val; + + prev_val = atomic_add(&router->stats.n_slaves, -1); + ss_dassert(prev_val > 0); + + /* + * Remove the slave session form the list of slaves that are using the + * router currently. + */ + spinlock_acquire(&router->lock); + if (router->slaves == slave) { + router->slaves = slave->next; + } else { + ROUTER_SLAVE *ptr = router->slaves; + + while (ptr != NULL && ptr->next != slave) { + ptr = ptr->next; + } + + if (ptr != NULL) { + ptr->next = slave->next; + } + } + spinlock_release(&router->lock); + + LOGIF(LD, (skygw_log_write_flush( + LOGFILE_DEBUG, + "%lu [freeSession] Unlinked router_client_session %p from " + "router %p. Connections : %d. ", + pthread_self(), + slave, + router, + prev_val-1))); + + if (slave->hostname) + free(slave->hostname); + if (slave->user) + free(slave->user); + if (slave->passwd) + free(slave->passwd); + free(slave); +} + + +/** + * Close a session with the router, this is the mechanism + * by which a router may cleanup data structure etc. + * + * @param instance The router instance data + * @param router_session The session being closed + */ +static void +closeSession(ROUTER *instance, void *router_session) +{ +ROUTER_INSTANCE *router = (ROUTER_INSTANCE *)instance; +ROUTER_SLAVE *slave = (ROUTER_SLAVE *)router_session; + + if (slave == NULL) + { + /* + * We must be closing the master session. + * + * TODO: Handle closure of master session + */ + LOGIF(LE, (skygw_log_write_flush( + LOGFILE_ERROR, "Binlog router close session with master"))); + blr_master_reconnect(router); + return; + } + CHK_CLIENT_RSES(slave); + /** + * Lock router client session for secure read and update. 
+ */ + if (rses_begin_locked_router_action(slave)) + { + /* decrease server registered slaves counter */ + atomic_add(&router->stats.n_registered, -1); + + /* + * Mark the slave as unregistered to prevent the forwarding + * of any more binlog records to this slave. + */ + slave->state = BLRS_UNREGISTERED; + + /* Unlock */ + rses_end_locked_router_action(slave); + } +} + +/** + * We have data from the client, this is likely to be packets related to + * the registration of the slave to receive binlog records. Unlike most + * MaxScale routers there is no forwarding to the backend database, merely + * the return of either predefined server responses that have been cached + * or binlog records. + * + * @param instance The router instance + * @param router_session The router session returned from the newSession call + * @param queue The queue of data buffers to route + * @return The number of bytes sent + */ +static int +routeQuery(ROUTER *instance, void *router_session, GWBUF *queue) +{ +ROUTER_INSTANCE *router = (ROUTER_INSTANCE *)instance; +ROUTER_SLAVE *slave = (ROUTER_SLAVE *)router_session; + + return blr_slave_request(router, slave, queue); +} + +static char *event_names[] = { + "Invalid", "Start Event V3", "Query Event", "Stop Event", "Rotate Event", + "Integer Session Variable", "Load Event", "Slave Event", "Create File Event", + "Append Block Event", "Exec Load Event", "Delete File Event", + "New Load Event", "Rand Event", "User Variable Event", "Format Description Event", + "Transaction ID Event (2 Phase Commit)", "Begin Load Query Event", + "Execute Load Query Event", "Table Map Event", "Write Rows Event (v0)", + "Update Rows Event (v0)", "Delete Rows Event (v0)", "Write Rows Event (v1)", + "Update Rows Event (v1)", "Delete Rows Event (v1)", "Incident Event", + "Heartbeat Event", "Ignorable Event", "Rows Query Event", "Write Rows Event (v2)", + "Update Rows Event (v2)", "Delete Rows Event (v2)", "GTID Event", + "Anonymous GTID Event", "Previous GTIDS Event" 
+}; + +/** + * Display an entry from the spinlock statistics data + * + * @param dcb The DCB to print to + * @param desc Description of the statistic + * @param value The statistic value + */ +static void +spin_reporter(void *dcb, char *desc, int value) +{ + dcb_printf((DCB *)dcb, "\t\t%-35s %d\n", desc, value); +} + +/** + * Display router diagnostics + * + * @param instance Instance of the router + * @param dcb DCB to send diagnostics to + */ +static void +diagnostics(ROUTER *router, DCB *dcb) +{ +ROUTER_INSTANCE *router_inst = (ROUTER_INSTANCE *)router; +ROUTER_SLAVE *session; +int i = 0; +char buf[40]; +struct tm tm; + + spinlock_acquire(&router_inst->lock); + session = router_inst->slaves; + while (session) + { + i++; + session = session->next; + } + spinlock_release(&router_inst->lock); + + dcb_printf(dcb, "\tMaster connection DCB: %p\n", + router_inst->master); + dcb_printf(dcb, "\tMaster connection state: %s\n", + blrm_states[router_inst->master_state]); + + localtime_r(&router_inst->stats.lastReply, &tm); + asctime_r(&tm, buf); + + dcb_printf(dcb, "\tNumber of master connects: %d\n", + router_inst->stats.n_masterstarts); + dcb_printf(dcb, "\tNumber of delayed reconnects: %d\n", + router_inst->stats.n_delayedreconnects); + dcb_printf(dcb, "\tCurrent binlog file: %s\n", + router_inst->binlog_name); + dcb_printf(dcb, "\tCurrent binlog position: %u\n", + router_inst->binlog_position); + dcb_printf(dcb, "\tNumber of slave servers: %u\n", + router_inst->stats.n_slaves); + dcb_printf(dcb, "\tNumber of binlog events received: %u\n", + router_inst->stats.n_binlogs); + dcb_printf(dcb, "\tNumber of fake binlog events: %u\n", + router_inst->stats.n_fakeevents); + dcb_printf(dcb, "\tNumber of artificial binlog events: %u\n", + router_inst->stats.n_artificial); + dcb_printf(dcb, "\tNumber of binlog events in error: %u\n", + router_inst->stats.n_binlog_errors); + dcb_printf(dcb, "\tNumber of binlog rotate events: %u\n", + router_inst->stats.n_rotates); + dcb_printf(dcb, 
"\tNumber of binlog cache hits: %u\n", + router_inst->stats.n_cachehits); + dcb_printf(dcb, "\tNumber of binlog cache misses: %u\n", + router_inst->stats.n_cachemisses); + dcb_printf(dcb, "\tNumber of heartbeat events: %u\n", + router_inst->stats.n_heartbeats); + dcb_printf(dcb, "\tNumber of packets received: %u\n", + router_inst->stats.n_reads); + dcb_printf(dcb, "\tNumber of residual data packets: %u\n", + router_inst->stats.n_residuals); + dcb_printf(dcb, "\tAverage events per packet %.1f\n", + (double)router_inst->stats.n_binlogs / router_inst->stats.n_reads); + dcb_printf(dcb, "\tLast event from master at: %s", + buf); + dcb_printf(dcb, "\t (%d seconds ago)\n", + time(0) - router_inst->stats.lastReply); + dcb_printf(dcb, "\tLast event from master: 0x%x\n", + router_inst->lastEventReceived); + if (router_inst->active_logs) + dcb_printf(dcb, "\tRouter processing binlog records\n"); + if (router_inst->reconnect_pending) + dcb_printf(dcb, "\tRouter pending reconnect to master\n"); + dcb_printf(dcb, "\tEvents received:\n"); + for (i = 0; i < 0x24; i++) + { + dcb_printf(dcb, "\t\t%-38s: %u\n", event_names[i], router_inst->stats.events[i]); + } + +#if SPINLOCK_PROFILE + dcb_printf(dcb, "\tSpinlock statistics (instlock):\n"); + spinlock_stats(&instlock, spin_reporter, dcb); + dcb_printf(dcb, "\tSpinlock statistics (instance lock):\n"); + spinlock_stats(&router_inst->lock, spin_reporter, dcb); +#endif + + if (router_inst->slaves) + { + dcb_printf(dcb, "\tSlaves:\n"); + spinlock_acquire(&router_inst->lock); + session = router_inst->slaves; + while (session) + { + dcb_printf(dcb, "\t\tServer-id: %d\n", session->serverid); + if (session->hostname) + dcb_printf(dcb, "\t\tHostname: %s\n", session->hostname); + dcb_printf(dcb, "\t\tSlave DCB: %p\n", session->dcb); + dcb_printf(dcb, "\t\tNext Sequence No: %d\n", session->seqno); + dcb_printf(dcb, "\t\tState: %s\n", blrs_states[session->state]); + dcb_printf(dcb, "\t\tBinlog file: %s\n", session->binlogfile); + dcb_printf(dcb, 
"\t\tBinlog position: %u\n", session->binlog_pos); + if (session->nocrc) + dcb_printf(dcb, "\t\tMaster Binlog CRC: None\n"); + dcb_printf(dcb, "\t\tNo. requests: %u\n", session->stats.n_requests); + dcb_printf(dcb, "\t\tNo. events sent: %u\n", session->stats.n_events); + dcb_printf(dcb, "\t\tNo. bursts sent: %u\n", session->stats.n_bursts); + dcb_printf(dcb, "\t\tNo. flow control: %u\n", session->stats.n_flows); + dcb_printf(dcb, "\t\tNo. catchup NRs: %u\n", session->stats.n_catchupnr); + dcb_printf(dcb, "\t\tNo. already up to date: %u\n", session->stats.n_alreadyupd); + dcb_printf(dcb, "\t\tNo. up to date: %u\n", session->stats.n_upd); + dcb_printf(dcb, "\t\tNo. of low water cbs %u\n", session->stats.n_cb); + dcb_printf(dcb, "\t\tNo. of drained cbs %u\n", session->stats.n_dcb); + dcb_printf(dcb, "\t\tNo. of low water cbs N/A %u\n", session->stats.n_cbna); + dcb_printf(dcb, "\t\tNo. of events > high water %u\n", session->stats.n_above); + dcb_printf(dcb, "\t\tNo. of failed reads %u\n", session->stats.n_failed_read); + dcb_printf(dcb, "\t\tNo. of nested distribute events %u\n", session->stats.n_overrun); + dcb_printf(dcb, "\t\tNo. of distribute action 1 %u\n", session->stats.n_actions[0]); + dcb_printf(dcb, "\t\tNo. of distribute action 2 %u\n", session->stats.n_actions[1]); + dcb_printf(dcb, "\t\tNo. of distribute action 3 %u\n", session->stats.n_actions[2]); + if ((session->cstate & CS_UPTODATE) == 0) + { + dcb_printf(dcb, "\t\tSlave is in catchup mode. %s\n", + ((session->cstate & CS_EXPECTCB) == 0 ? 
"" : + "Waiting for DCB queue to drain.")); + + } + else + { + dcb_printf(dcb, "\t\tSlave is in normal mode.\n"); + if (session->binlog_pos != router_inst->binlog_position) + { + dcb_printf(dcb, "\t\tSlave reports up to date however " + "the slave binlog position does not match the master\n"); + } + } +#if SPINLOCK_PROFILE + dcb_printf(dcb, "\tSpinlock statistics (catch_lock):\n"); + spinlock_stats(&session->catch_lock, spin_reporter, dcb); + dcb_printf(dcb, "\tSpinlock statistics (rses_lock):\n"); + spinlock_stats(&session->rses_lock, spin_reporter, dcb); +#endif + + session = session->next; + } + spinlock_release(&router_inst->lock); + } +} + +/** + * Client Reply routine - in this case this is a message from the + * master server, It should be sent to the state machine that manages + * master packets as it may be binlog records or part of the registration + * handshake that takes part during connection establishment. + * + * + * @param instance The router instance + * @param router_session The router session + * @param master_dcb The DCB for the connection to the master + * @param queue The GWBUF with reply data + */ +static void +clientReply(ROUTER *instance, void *router_session, GWBUF *queue, DCB *backend_dcb) +{ +ROUTER_INSTANCE *router = (ROUTER_INSTANCE *)instance; + + atomic_add(&router->stats.n_reads, 1); + blr_master_response(router, queue); + router->stats.lastReply = time(0); +} + +/** + * Error Reply routine + * + * The routine will reply to client errors and/or closing the session + * or try to open a new backend connection. 
+ * + * @param instance The router instance + * @param router_session The router session + * @param message The error message to reply + * @param backend_dcb The backend DCB + * @param action The action: REPLY, REPLY_AND_CLOSE, NEW_CONNECTION + * @param succp Result of action + * + */ +static void +errorReply(ROUTER *instance, void *router_session, GWBUF *message, DCB *backend_dcb, error_action_t action, bool *succp) +{ + LOGIF(LE, (skygw_log_write_flush( + LOGFILE_ERROR, "Erorr Reply '%s'", message))); + *succp = false; +} + +/** to be inline'd */ +/** + * @node Acquires lock to router client session if it is not closed. + * + * Parameters: + * @param rses - in, use + * + * + * @return true if router session was not closed. If return value is true + * it means that router is locked, and must be unlocked later. False, if + * router was closed before lock was acquired. + * + * + * @details (write detailed description here) + * + */ +static bool rses_begin_locked_router_action(ROUTER_SLAVE *rses) +{ + bool succp = false; + + CHK_CLIENT_RSES(rses); + + spinlock_acquire(&rses->rses_lock); + succp = true; + + return succp; +} + +/** to be inline'd */ +/** + * @node Releases router client session lock. + * + * Parameters: + * @param rses - + * + * + * @return void + * + * + * @details (write detailed description here) + * + */ +static void rses_end_locked_router_action(ROUTER_SLAVE * rses) +{ + CHK_CLIENT_RSES(rses); + spinlock_release(&rses->rses_lock); +} + + +static uint8_t getCapabilities(ROUTER *inst, void *router_session) +{ + return 0; +} diff --git a/server/modules/routing/binlog/blr_cache.c b/server/modules/routing/binlog/blr_cache.c new file mode 100644 index 000000000..5bc46f036 --- /dev/null +++ b/server/modules/routing/binlog/blr_cache.c @@ -0,0 +1,69 @@ +/* + * This file is distributed as part of MaxScale. 
It is free + * software: you can redistribute it and/or modify it under the terms of the + * GNU General Public License as published by the Free Software Foundation, + * version 2. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Copyright SkySQL Ab 2014 + */ + +/** + * @file blr_cache.c - binlog router cache, manage the binlog cache + * + * The binlog router is designed to be used in replication environments to + * increase the replication fanout of a master server. It provides a transparent + * mechanism to read the binlog entries for multiple slaves while requiring + * only a single connection to the actual master to support the slaves. + * + * The current prototype implementation is designed to support MySQL 5.6 and has + * a number of limitations. This prototype is merely a proof of concept and + * should not be considered production ready. + * + * @verbatim + * Revision History + * + * Date Who Description + * 07/04/2014 Mark Riddoch Initial implementation + * + * @endverbatim + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + + +extern int lm_enabled_logfiles_bitmask; + + +/** + * Initialise the cache for this instance of the binlog router. As a side + * effect also determine the binlog file to read and the position to read + * from.
+ * + * @param router The router instance + */ +void +blr_init_cache(ROUTER_INSTANCE *router) +{ + /* The body is currently empty: nothing is initialised and router is not used. */ +} diff --git a/server/modules/routing/binlog/blr_file.c b/server/modules/routing/binlog/blr_file.c new file mode 100644 index 000000000..4f7232e64 --- /dev/null +++ b/server/modules/routing/binlog/blr_file.c @@ -0,0 +1,346 @@ +/* + * This file is distributed as part of MaxScale. It is free + * software: you can redistribute it and/or modify it under the terms of the + * GNU General Public License as published by the Free Software Foundation, + * version 2. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Copyright SkySQL Ab 2014 + */ + +/** + * @file blr_file.c - contains code for the router binlog file management + * + * + * @verbatim + * Revision History + * + * Date Who Description + * 14/04/2014 Mark Riddoch Initial implementation + * + * @endverbatim + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + + +extern int lm_enabled_logfiles_bitmask; + +static void blr_file_create(ROUTER_INSTANCE *router, char *file); +static void blr_file_append(ROUTER_INSTANCE *router, char *file); +static uint32_t extract_field(uint8_t *src, int bits); + +/** + * Initialise the binlog file for this instance.
MaxScale will look + * for all the binlogs that it has on local disk, determien the next + * binlog to use and initialise it for writing, determining the + * next record to be fetched from the real master. + * + * @param router The router instance this defines the master for this replication chain + */ +void +blr_file_init(ROUTER_INSTANCE *router) +{ +char *ptr, path[1024], filename[1050]; +int file_found, n = 1; +int root_len, i; +DIR *dirp; +struct dirent *dp; + + strcpy(path, "/usr/local/skysql/MaxScale"); + if ((ptr = getenv("MAXSCALE_HOME")) != NULL) + { + strcpy(path, ptr); + } + strcat(path, "/"); + strcat(path, router->service->name); + + if (access(path, R_OK) == -1) + mkdir(path, 0777); + + /* First try to find a binlog file number by reading the directory */ + root_len = strlen(router->fileroot); + dirp = opendir(path); + while ((dp = readdir(dirp)) != NULL) + { + if (strncmp(dp->d_name, router->fileroot, root_len) == 0) + { + i = atoi(dp->d_name + root_len + 1); + if (i > n) + n = i; + } + } + closedir(dirp); + + + file_found = 0; + do { + sprintf(filename, "%s/" BINLOG_NAMEFMT, path, router->fileroot, n); + if (access(filename, R_OK) != -1) + { + file_found = 1; + n++; + } + else + file_found = 0; + } while (file_found); + n--; + + if (n == 0) // No binlog files found + { + sprintf(filename, BINLOG_NAMEFMT, router->fileroot, 1); + blr_file_create(router, filename); + } + else + { + sprintf(filename, BINLOG_NAMEFMT, router->fileroot, n); + blr_file_append(router, filename); + } + +} + +void +blr_file_rotate(ROUTER_INSTANCE *router, char *file, uint64_t pos) +{ + blr_file_create(router, file); +} + + +/** + * Create a new binlog file for the router to use. 
+ * + * @param router The router instance + * @param file The binlog file name + */ +static void +blr_file_create(ROUTER_INSTANCE *router, char *file) +{ +char *ptr, path[1024]; +int fd; +unsigned char magic[] = BINLOG_MAGIC; + + strcpy(path, "/usr/local/skysql/MaxScale"); + if ((ptr = getenv("MAXSCALE_HOME")) != NULL) + { + strcpy(path, ptr); + } + strcat(path, "/"); + strcat(path, router->service->name); + strcat(path, "/"); + strcat(path, file); + + if ((fd = open(path, O_RDWR|O_CREAT, 0666)) != -1) + { + write(fd, magic, 4); + } + else + { + LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, + "Failed to create binlog file %s\n", path))); + } + fsync(fd); + close(router->binlog_fd); + strcpy(router->binlog_name, file); + router->binlog_position = 4; /* Initial position after the magic number */ + router->binlog_fd = fd; +} + + +/** + * Prepare an existing binlog file to be appened to. + * + * @param router The router instance + * @param file The binlog file name + */ +static void +blr_file_append(ROUTER_INSTANCE *router, char *file) +{ +char *ptr, path[1024]; +int fd; + + strcpy(path, "/usr/local/skysql/MaxScale"); + if ((ptr = getenv("MAXSCALE_HOME")) != NULL) + { + strcpy(path, ptr); + } + strcat(path, "/"); + strcat(path, router->service->name); + strcat(path, "/"); + strcat(path, file); + + if ((fd = open(path, O_RDWR|O_APPEND, 0666)) == -1) + { + LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, + "Failed to open binlog file %s for append.\n", + path))); + return; + } + fsync(fd); + close(router->binlog_fd); + strcpy(router->binlog_name, file); + router->binlog_position = lseek(fd, 0L, SEEK_END); + router->binlog_fd = fd; +} + +/** + * Write a binlog entry to disk. 
+ * + * @param router The router instance + * @param buf The binlog record + * @param len The length of the binlog record + */ +void +blr_write_binlog_record(ROUTER_INSTANCE *router, REP_HEADER *hdr, uint8_t *buf) +{ + pwrite(router->binlog_fd, buf, hdr->event_size, hdr->next_pos - hdr->event_size); + router->binlog_position = hdr->next_pos; +} + +/** + * Flush the content of the binlog file to disk. + * + * @param router The binlog router + */ +void +blr_file_flush(ROUTER_INSTANCE *router) +{ + fsync(router->binlog_fd); +} + +int +blr_open_binlog(ROUTER_INSTANCE *router, char *binlog) +{ +char *ptr, path[1024]; +int rval; + + strcpy(path, "/usr/local/skysql/MaxScale"); + if ((ptr = getenv("MAXSCALE_HOME")) != NULL) + { + strcpy(path, ptr); + } + strcat(path, "/"); + strcat(path, router->service->name); + strcat(path, "/"); + strcat(path, binlog); + + if ((rval = open(path, O_RDONLY, 0666)) == -1) + { + LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, + "Failed to open binlog file %s\n", path))); + } + + return rval; +} + +/** + * Read a replication event into a GWBUF structure. + * + * @param fd File descriptor of the binlog file + * @param pos Position of binlog record to read + * @param hdr Binlog header to populate + * @return The binlog record wrapped in a GWBUF structure + */ +GWBUF * +blr_read_binlog(int fd, unsigned int pos, REP_HEADER *hdr) +{ +uint8_t hdbuf[19]; +GWBUF *result; +unsigned char *data; +int n; + + if (lseek(fd, pos, SEEK_SET) != pos) + { + LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, + "Failed to seek for binlog entry, " + "at %d.\n", pos))); + return NULL; + } + + /* Read the header information from the file */ + if ((n = read(fd, hdbuf, 19)) != 19) + { + LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, + "Failed to read header for binlog entry, " + "at %d (%s).\n", pos, strerror(errno)))); + if (n> 0 && n < 19) + LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, + "Short read when reading the header. 
" + "Expected 19 bytes got %d bytes.\n", + n))); + return NULL; + } + hdr->timestamp = extract_field(hdbuf, 32); + hdr->event_type = hdbuf[4]; + hdr->serverid = extract_field(&hdbuf[5], 32); + hdr->event_size = extract_field(&hdbuf[9], 32); + hdr->next_pos = extract_field(&hdbuf[13], 32); + hdr->flags = extract_field(&hdbuf[17], 16); + if ((result = gwbuf_alloc(hdr->event_size)) == NULL) + { + LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, + "Failed to allocate memory for binlog entry, " + "size %d at %d.\n", + hdr->event_size, pos))); + return NULL; + } + data = GWBUF_DATA(result); + memcpy(data, hdbuf, 19); // Copy the header in + if ((n = read(fd, &data[19], hdr->event_size - 19)) + != hdr->event_size - 19) // Read the balance + { + LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, + "Short read when reading the event at %d. " + "Expected %d bytes got %d bytes.\n", + pos, n))); + gwbuf_consume(result, hdr->event_size); + return NULL; + } + return result; +} + +/** + * Extract a numeric field from a packet of the specified number of bits + * + * @param src The raw packet source + * @param birs The number of bits to extract (multiple of 8) + */ +static uint32_t +extract_field(uint8_t *src, int bits) +{ +uint32_t rval = 0, shift = 0; + + while (bits > 0) + { + rval |= (*src++) << shift; + shift += 8; + bits -= 8; + } + return rval; +} diff --git a/server/modules/routing/binlog/blr_master.c b/server/modules/routing/binlog/blr_master.c new file mode 100644 index 000000000..412276e48 --- /dev/null +++ b/server/modules/routing/binlog/blr_master.c @@ -0,0 +1,1024 @@ +/* + * This file is distributed as part of MaxScale. It is free + * software: you can redistribute it and/or modify it under the terms of the + * GNU General Public License as published by the Free Software Foundation, + * version 2. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Copyright SkySQL Ab 2014 + */ + +/** + * @file blr_master.c - contains code for the router to master communication + * + * The binlog router is designed to be used in replication environments to + * increase the replication fanout of a master server. It provides a transparent + * mechanism to read the binlog entries for multiple slaves while requiring + * only a single connection to the actual master to support the slaves. + * + * The current prototype implementation is designed to support MySQL 5.6 and has + * a number of limitations. This prototype is merely a proof of concept and + * should not be considered production ready. 
+ * + * @verbatim + * Revision History + * + * Date Who Description + * 02/04/2014 Mark Riddoch Initial implementation + * + * @endverbatim + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +/* Temporary requirement for auth data */ +#include + +extern int lm_enabled_logfiles_bitmask; + +static GWBUF *blr_make_query(char *statement); +static GWBUF *blr_make_registration(ROUTER_INSTANCE *router); +static GWBUF *blr_make_binlog_dump(ROUTER_INSTANCE *router); +static void encode_value(unsigned char *data, unsigned int value, int len); +static void blr_handle_binlog_record(ROUTER_INSTANCE *router, GWBUF *pkt); +static void blr_rotate_event(ROUTER_INSTANCE *router, uint8_t *pkt, REP_HEADER *hdr); +static void blr_distribute_binlog_record(ROUTER_INSTANCE *router, REP_HEADER *hdr, uint8_t *ptr); +static void *CreateMySQLAuthData(char *username, char *password, char *database); +static void blr_extract_header(uint8_t *pkt, REP_HEADER *hdr); +static uint32_t extract_field(uint8_t *src, int bits); +static void blr_log_packet(logfile_id_t file, char *msg, uint8_t *ptr, int len); + +static int keepalive = 1; + +/** + * blr_start_master - controls the connection of the binlog router to the + * master MySQL server and triggers the slave registration process for + * the router. 
+ * + * @param router The router instance + */ +void +blr_start_master(ROUTER_INSTANCE *router) +{ +DCB *client; +GWBUF *buf; + + if ((client = dcb_alloc(DCB_ROLE_INTERNAL)) == NULL) + { + LOGIF(LE, (skygw_log_write_flush(LOGFILE_ERROR, + "Binlog router: failed to create DCB for dummy client\n"))); + return; + } + router->client = client; + client->data = CreateMySQLAuthData(router->user, router->password, ""); + if ((router->session = session_alloc(router->service, client)) == NULL) + { + LOGIF(LE, (skygw_log_write_flush(LOGFILE_ERROR, + "Binlog router: failed to create session for connection to master\n"))); + return; + } + client->session = router->session; + if ((router->master = dcb_connect(router->service->databases, router->session, BLR_PROTOCOL)) == NULL) + { + LOGIF(LE, (skygw_log_write_flush(LOGFILE_ERROR, + "Binlog router: failed to connect to master\n"))); + return; + } + +if (setsockopt(router->master->fd, SOL_SOCKET, SO_KEEPALIVE, &keepalive , sizeof(keepalive ))) +perror("setsockopt"); + + router->master_state = BLRM_AUTHENTICATED; + buf = blr_make_query("SELECT UNIX_TIMESTAMP()"); + router->master->func.write(router->master, buf); + router->master_state = BLRM_TIMESTAMP; + + router->stats.n_masterstarts++; +} + +/** + * Reconnect to the master server. + * + * IMPORTANT - must be called with router->active_logs set by the + * thread that set active_logs. 
+ * + * @param router The router instance + */ +static void +blr_restart_master(ROUTER_INSTANCE *router) +{ +GWBUF *ptr; + + dcb_close(router->master); + dcb_free(router->master); + dcb_free(router->client); + + /* Discard the queued residual data */ + ptr = router->residual; + while (ptr) + { + ptr = gwbuf_consume(ptr, GWBUF_LENGTH(ptr)); + } + router->residual = NULL; + + /* Now it is safe to unleash other threads on this router instance */ + spinlock_acquire(&router->lock); + router->reconnect_pending = 0; + router->active_logs = 0; + spinlock_release(&router->lock); + blr_start_master(router); +} + +/** + * Request a reconnect to the master. + * + * If another thread is active processing messages from the master + * then merely set a flag for that thread to do the restart. If no + * threads are active then directly call the restart routine to + * reconnect to the master. + * + * @param router The router instance + */ +void +blr_master_reconnect(ROUTER_INSTANCE *router) +{ +int do_reconnect = 0; + + spinlock_acquire(&router->lock); + if (router->active_logs) + { + /* Currently processing a response, set a flag + * and get the thread that is process a response + * to deal with the reconnect. + */ + router->reconnect_pending = 1; + router->stats.n_delayedreconnects++; + } + else + { + router->active_logs = 1; + do_reconnect = 1; + } + spinlock_release(&router->lock); + if (do_reconnect) + { + blr_restart_master(router); + spinlock_acquire(&router->lock); + router->active_logs = 0; + spinlock_release(&router->lock); + } +} + +/** + * Binlog router master side state machine event handler. + * + * Handles an incoming response from the master server to the binlog + * router. 
+ * + * @param router The router instance + * @param buf The incoming packet + */ +void +blr_master_response(ROUTER_INSTANCE *router, GWBUF *buf) +{ +char query[128]; + + atomic_add(&router->handling_threads, 1); + ss_dassert(router->handling_threads == 1); + spinlock_acquire(&router->lock); + router->active_logs = 1; + spinlock_release(&router->lock); + if (router->master_state < 0 || router->master_state > BLRM_MAXSTATE) + { + LOGIF(LE, (skygw_log_write( + LOGFILE_ERROR, "Invalid master state machine state (%d) for binlog router.\n", + router->master_state))); + gwbuf_consume(buf, gwbuf_length(buf)); + spinlock_acquire(&router->lock); + if (router->reconnect_pending) + { + router->active_logs = 0; + spinlock_release(&router->lock); + atomic_add(&router->handling_threads, -1); + blr_restart_master(router); + return; + } + router->active_logs = 0; + spinlock_release(&router->lock); + atomic_add(&router->handling_threads, -1); + return; + } + + if (router->master_state != BLRM_BINLOGDUMP && MYSQL_RESPONSE_ERR(buf)) + { + LOGIF(LE, (skygw_log_write( + LOGFILE_ERROR, + "Received error: %d, %s from master during %s phase of the master state machine.\n", + MYSQL_ERROR_CODE(buf), MYSQL_ERROR_MSG(buf), blrm_states[router->master_state] + ))); + gwbuf_consume(buf, gwbuf_length(buf)); + spinlock_acquire(&router->lock); + router->active_logs = 0; + if (router->reconnect_pending) + { + spinlock_release(&router->lock); + atomic_add(&router->handling_threads, -1); + blr_restart_master(router); + return; + } + spinlock_release(&router->lock); + atomic_add(&router->handling_threads, -1); + return; + } + switch (router->master_state) + { + case BLRM_TIMESTAMP: + // Response to a timestamp message, no need to save this. 
+ gwbuf_consume(buf, GWBUF_LENGTH(buf)); + buf = blr_make_query("SHOW VARIABLES LIKE 'SERVER_ID'"); + router->master_state = BLRM_SERVERID; + router->master->func.write(router->master, buf); + break; + case BLRM_SERVERID: + // Response to fetch of master's server-id + router->saved_master.server_id = buf; + // TODO: Extract the value of server-id and place in router->master_id + buf = blr_make_query("SET @master_heartbeat_period = 1799999979520"); + router->master_state = BLRM_HBPERIOD; + router->master->func.write(router->master, buf); + break; + case BLRM_HBPERIOD: + // Response to set the heartbeat period + router->saved_master.heartbeat = buf; + buf = blr_make_query("SET @master_binlog_checksum = @@global.binlog_checksum"); + router->master_state = BLRM_CHKSUM1; + router->master->func.write(router->master, buf); + break; + case BLRM_CHKSUM1: + // Response to set the master binlog checksum + router->saved_master.chksum1 = buf; + buf = blr_make_query("SELECT @master_binlog_checksum"); + router->master_state = BLRM_CHKSUM2; + router->master->func.write(router->master, buf); + break; + case BLRM_CHKSUM2: + // Response to the master_binlog_checksum, should be stored + router->saved_master.chksum2 = buf; + buf = blr_make_query("SELECT @@GLOBAL.GTID_MODE"); + router->master_state = BLRM_GTIDMODE; + router->master->func.write(router->master, buf); + break; + case BLRM_GTIDMODE: + // Response to the GTID_MODE, should be stored + router->saved_master.gtid_mode = buf; + buf = blr_make_query("SHOW VARIABLES LIKE 'SERVER_UUID'"); + router->master_state = BLRM_MUUID; + router->master->func.write(router->master, buf); + break; + case BLRM_MUUID: + // Response to the SERVER_UUID, should be stored + router->saved_master.uuid = buf; + sprintf(query, "SET @slave_uuid='%s'", router->uuid); + buf = blr_make_query(query); + router->master_state = BLRM_SUUID; + router->master->func.write(router->master, buf); + break; + case BLRM_SUUID: + // Response to the SET @server_uuid, should 
be stored + router->saved_master.setslaveuuid = buf; + buf = blr_make_query("SET NAMES latin1"); + router->master_state = BLRM_LATIN1; + router->master->func.write(router->master, buf); + break; + case BLRM_LATIN1: + // Response to the SET NAMES latin1, should be stored + router->saved_master.setnames = buf; + buf = blr_make_query("SET NAMES utf8"); + router->master_state = BLRM_UTF8; + router->master->func.write(router->master, buf); + break; + case BLRM_UTF8: + // Response to the SET NAMES utf8, should be stored + router->saved_master.utf8 = buf; + buf = blr_make_query("SELECT 1"); + router->master_state = BLRM_SELECT1; + router->master->func.write(router->master, buf); + break; + case BLRM_SELECT1: + // Response to the SELECT 1, should be stored + router->saved_master.select1 = buf; + buf = blr_make_query("SELECT VERSION();"); + router->master_state = BLRM_SELECTVER; + router->master->func.write(router->master, buf); + break; + case BLRM_SELECTVER: + // Response to SELECT VERSION should be stored + router->saved_master.selectver = buf; + buf = blr_make_registration(router); + router->master_state = BLRM_REGISTER; + router->master->func.write(router->master, buf); + break; + case BLRM_REGISTER: + // Request a dump of the binlog file + buf = blr_make_binlog_dump(router); + router->master_state = BLRM_BINLOGDUMP; + router->master->func.write(router->master, buf); + break; + case BLRM_BINLOGDUMP: + // Main body, we have received a binlog record from the master + blr_handle_binlog_record(router, buf); + break; + } + + if (router->reconnect_pending) + blr_restart_master(router); + spinlock_acquire(&router->lock); + router->active_logs = 0; + spinlock_release(&router->lock); + atomic_add(&router->handling_threads, -1); +} + +/** + * Build a MySQL query into a GWBUF that we can send to the master database + * + * @param query The text of the query to send + */ +static GWBUF * +blr_make_query(char *query) +{ +GWBUF *buf; +unsigned char *data; +int len; + + if ((buf = 
gwbuf_alloc(strlen(query) + 5)) == NULL) + return NULL; + data = GWBUF_DATA(buf); + len = strlen(query) + 1; + encode_value(&data[0], len, 24); // Payload length + data[3] = 0; // Sequence id + // Payload + data[4] = COM_QUERY; // Command + memcpy(&data[5], query, strlen(query)); + + return buf; +} + +/** + * Build a MySQL slave registration into a GWBUF that we can send to the + * master database + * + * @param router The router instance + * @return A MySQL Replication registration message in a GWBUF structure + */ +static GWBUF * +blr_make_registration(ROUTER_INSTANCE *router) +{ +GWBUF *buf; +unsigned char *data; +int len = 18; + + if ((buf = gwbuf_alloc(len + 4)) == NULL) + return NULL; + data = GWBUF_DATA(buf); + encode_value(&data[0], len, 24); // Payload length + data[3] = 0; // Sequence ID + data[4] = COM_REGISTER_SLAVE; // Command + encode_value(&data[5], router->serverid, 32); // Slave Server ID + data[9] = 0; // Slave hostname length + data[10] = 0; // Slave username length + data[11] = 0; // Slave password length + encode_value(&data[12], + router->service->ports->port, 16); // Slave master port + encode_value(&data[14], 0, 32); // Replication rank + encode_value(&data[18], router->masterid, 32); // Master server-id + + return buf; +} + + +/** + * Build a Binlog dump command into a GWBUF that we can send to the + * master database + * + * @param router The router instance + * @return A MySQL Replication COM_BINLOG_DUMP message in a GWBUF structure + */ +static GWBUF * +blr_make_binlog_dump(ROUTER_INSTANCE *router) +{ +GWBUF *buf; +unsigned char *data; +int len = 0x1b; + + if ((buf = gwbuf_alloc(len + 4)) == NULL) + return NULL; + data = GWBUF_DATA(buf); + + encode_value(&data[0], len,24); // Payload length + data[3] = 0; // Sequence ID + data[4] = COM_BINLOG_DUMP; // Command + encode_value(&data[5], + router->binlog_position, 32); // binlog position + encode_value(&data[9], 0, 16); // Flags + encode_value(&data[11], + router->serverid, 32); // Server-id 
of MaxScale + strncpy((char *)&data[15], router->binlog_name, + BINLOG_FNAMELEN); // binlog filename + return buf; +} + + +/** + * Encode a value into a number of bits in a MySQL packet + * + * @param data Point to location in target packet + * @param value The value to pack + * @param len Number of bits to encode value into + */ +static void +encode_value(unsigned char *data, unsigned int value, int len) +{ + while (len > 0) + { + *data++ = value & 0xff; + value >>= 8; + len -= 8; + } +} + +/** + * blr_handle_binlog_record - we have received binlog records from + * the master and we must now work out what to do with them. + * + * @param router The router instance + * @param pkt The binlog records + */ +static void +blr_handle_binlog_record(ROUTER_INSTANCE *router, GWBUF *pkt) +{ +uint8_t *msg = NULL, *ptr, *pdata; +REP_HEADER hdr; +unsigned int len, reslen; +unsigned int pkt_length; +int no_residual = 1; +int preslen = -1; +int prev_length = -1; +int n_bufs = -1, pn_bufs = -1; +static REP_HEADER phdr; + + /* + * Prepend any residual buffer to the buffer chain we have + * been called with. 
+ */ + if (router->residual) + { + pkt = gwbuf_append(router->residual, pkt); + router->residual = NULL; + no_residual = 0; + } + + pkt_length = gwbuf_length(pkt); + while (pkt && pkt_length > 24) + { + reslen = GWBUF_LENGTH(pkt); + pdata = GWBUF_DATA(pkt); + if (reslen < 3) // Payload length straddles buffers + { + /* Get the length of the packet from the residual and new packet */ + if (reslen >= 3) + { + len = extract_field(pdata, 24); + } + else if (reslen == 2) + { + len = extract_field(pdata, 16); + len |= (extract_field(GWBUF_DATA(pkt->next), 8) << 16); + } + else if (reslen == 1) + { + len = extract_field(pdata, 8); + len |= (extract_field(GWBUF_DATA(pkt->next), 16) << 8); + } + len += 4; // Allow space for the header + } + else + { + len = extract_field(pdata, 24) + 4; + } + + if (reslen < len && pkt_length >= len) + { + /* + * The message is contained in more than the current + * buffer, however we have the complete messasge in + * this buffer and the chain of remaining buffers. + * + * Allocate a contiguous buffer for the binlog message + * and copy the complete message into this buffer. + */ + int remainder = len; + GWBUF *p = pkt; + + if ((msg = malloc(len)) == NULL) + { + LOGIF(LE,(skygw_log_write( + LOGFILE_ERROR, + "Insufficient memory to buffer event " + "of %d bytes. Binlog %s @ %d\n.", + len, router->binlog_name, + router->binlog_position))); + break; + } + + n_bufs = 0; + ptr = msg; + while (p && remainder > 0) + { + int plen = GWBUF_LENGTH(p); + int n = (remainder > plen ? plen : remainder); + memcpy(ptr, GWBUF_DATA(p), n); + remainder -= n; + ptr += n; + if (remainder > 0) + p = p->next; + n_bufs++; + } + if (remainder) + { + LOGIF(LE,(skygw_log_write( + LOGFILE_ERROR, + "Expected entire message in buffer " + "chain, but failed to create complete " + "message as expected. 
%s @ %d\n", + router->binlog_name, + router->binlog_position))); + free(msg); + msg = NULL; + break; + } + + ptr = msg; + } + else if (reslen < len) + { + /* + * The message is not fully contained in the current + * and we do not have the complete message in the + * buffer chain. Therefore we must stop processing + * until we receive the next buffer. + */ + router->stats.n_residuals++; + LOGIF(LD,(skygw_log_write( + LOGFILE_DEBUG, + "Residual data left after %d records. %s @ %d\n", + router->stats.n_binlogs, + router->binlog_name, router->binlog_position))); + break; + } + else + { + /* + * The message is fully contained in the current buffer + */ + ptr = pdata; + n_bufs = 1; + } + + blr_extract_header(ptr, &hdr); + + if (hdr.event_size != len - 5) + { + LOGIF(LE,(skygw_log_write( + LOGFILE_ERROR, + "Packet length is %d, but event size is %d, " + "binlog file %s position %d" + "reslen is %d and preslen is %d, " + "length of previous event %d. %s", + len, hdr.event_size, + router->binlog_name, + router->binlog_position, + reslen, preslen, prev_length, + (prev_length == -1 ? + (no_residual ? 
"No residual data from previous call" : "Residual data from previous call") : "") + ))); + blr_log_packet(LOGFILE_ERROR, "Packet:", ptr, len); + LOGIF(LE,(skygw_log_write( + LOGFILE_ERROR, + "This event (0x%x) was contained in %d GWBUFs, " + "the previous events was contained in %d GWBUFs", + router->lastEventReceived, n_bufs, pn_bufs))); + if (msg) + { + free(msg); + msg = NULL; + } + break; + } + phdr = hdr; + if (hdr.ok == 0) + { + router->stats.n_binlogs++; + router->lastEventReceived = hdr.event_type; + +// #define SHOW_EVENTS +#ifdef SHOW_EVENTS + printf("blr: event type 0x%02x, flags 0x%04x, event size %d\n", hdr.event_type, hdr.flags, hdr.event_size); +#endif + if (hdr.event_type >= 0 && hdr.event_type < 0x24) + router->stats.events[hdr.event_type]++; + if (hdr.event_type == FORMAT_DESCRIPTION_EVENT && hdr.next_pos == 0) + { + // Fake format description message + LOGIF(LD,(skygw_log_write(LOGFILE_DEBUG, + "Replication fake event. " + "Binlog %s @ %d.\n", + router->binlog_name, + router->binlog_position))); + router->stats.n_fakeevents++; + if (hdr.event_type == FORMAT_DESCRIPTION_EVENT) + { + /* + * We need to save this to replay to new + * slaves that attach later. + */ + if (router->saved_master.fde_event) + free(router->saved_master.fde_event); + router->saved_master.fde_len = hdr.event_size; + router->saved_master.fde_event = malloc(hdr.event_size); + if (router->saved_master.fde_event) + memcpy(router->saved_master.fde_event, + ptr + 5, hdr.event_size); + } + } + else + { + if (hdr.event_type == HEARTBEAT_EVENT) + { +#ifdef SHOW_EVENTS + printf("Replication heartbeat\n"); +#endif + LOGIF(LD,(skygw_log_write( + LOGFILE_DEBUG, + "Replication heartbeat. 
" + "Binlog %s @ %d.\n", + router->binlog_name, + router->binlog_position))); + router->stats.n_heartbeats++; + } + else if (hdr.flags != LOG_EVENT_ARTIFICIAL_F) + { + ptr = ptr + 5; // We don't put the first byte of the payload + // into the binlog file + blr_write_binlog_record(router, &hdr, ptr); + if (hdr.event_type == ROTATE_EVENT) + { + blr_rotate_event(router, ptr, &hdr); + } + blr_distribute_binlog_record(router, &hdr, ptr); + } + else + { + router->stats.n_artificial++; + LOGIF(LD,(skygw_log_write( + LOGFILE_DEBUG, + "Artificial event not written " + "to disk or distributed. " + "Type 0x%x, Length %d, Binlog " + "%s @ %d\n.", + hdr.event_type, + hdr.event_size, + router->binlog_name, + router->binlog_position))); + ptr += 5; + if (hdr.event_type == ROTATE_EVENT) + { + blr_rotate_event(router, ptr, &hdr); + } + } + } + } + else + { + printf("Binlog router error: %s\n", &ptr[7]); + LOGIF(LE,(skygw_log_write(LOGFILE_ERROR, + "Error packet in binlog stream.%s @ %d\n.", + router->binlog_name, + router->binlog_position))); + blr_log_packet(LOGFILE_ERROR, "Error Packet:", + ptr, len); + router->stats.n_binlog_errors++; + } + + if (msg) + { + free(msg); + msg = NULL; + } + prev_length = len; + while (len > 0) + { + int n, plen; + plen = GWBUF_LENGTH(pkt); + n = (plen < len ? plen : len); + pkt = gwbuf_consume(pkt, n); + len -= n; + pkt_length -= n; + } + preslen = reslen; + pn_bufs = n_bufs; + } + + /* + * Check if we have a residual, part binlog message to deal with. + * Just simply store the GWBUF for next time + */ + if (pkt) + { + router->residual = pkt; + ss_dassert(pkt_length != 0); + } + else + { + ss_dassert(pkt_length == 0); + } + blr_file_flush(router); +} + +/** + * Populate a header structure for a replication message from a GWBUF structure. 
+ * + * @param pkt The incoming packet in a GWBUF chain + * @param hdr The packet header to populate + */ +static void +blr_extract_header(uint8_t *ptr, REP_HEADER *hdr) +{ + + hdr->payload_len = extract_field(ptr, 24); + hdr->seqno = ptr[3]; + hdr->ok = ptr[4]; + hdr->timestamp = extract_field(&ptr[5], 32); + hdr->event_type = ptr[9]; + hdr->serverid = extract_field(&ptr[10], 32); + hdr->event_size = extract_field(&ptr[14], 32); + hdr->next_pos = extract_field(&ptr[18], 32); + hdr->flags = extract_field(&ptr[22], 16); +} + +/** + * Extract a numeric field from a packet of the specified number of bits + * + * @param src The raw packet source + * @param birs The number of bits to extract (multiple of 8) + */ +static uint32_t +extract_field(uint8_t *src, int bits) +{ +uint32_t rval = 0, shift = 0; + + while (bits > 0) + { + rval |= (*src++) << shift; + shift += 8; + bits -= 8; + } + return rval; +} + +/** + * Process a binlog rotate event. + * + * @param router The instance of the router + * @param ptr The packet containing the rotate event + * @param hdr The replication message header + */ +static void +blr_rotate_event(ROUTER_INSTANCE *router, uint8_t *ptr, REP_HEADER *hdr) +{ +int len, slen; +uint64_t pos; +char file[BINLOG_FNAMELEN+1]; + + ptr += 19; // Skip event header + len = hdr->event_size - 19; // Event size minus header + pos = extract_field(ptr+4, 32); + pos <<= 32; + pos |= extract_field(ptr, 32); + slen = len - 8; + if (slen > BINLOG_FNAMELEN) + slen = BINLOG_FNAMELEN; + memcpy(file, ptr + 8, slen); + file[slen] = 0; + +#ifdef VERBOSE_ROTATE + printf("binlog rotate: "); + while (len--) + printf("0x%02x ", *ptr++); + printf("\n"); + printf("New file: %s @ %ld\n", file, pos); +#endif + + if (strncmp(router->binlog_name, file, slen) != 0) + { + router->stats.n_rotates++; + blr_file_rotate(router, file, pos); + } +} + +/** + * Create the auth data needed to be able to call dcb_connect. 
+ * + * This doesn't really belong here and should be moved at some stage. + */ +static void * +CreateMySQLAuthData(char *username, char *password, char *database) +{ +MYSQL_session *auth_info; + + if (username == NULL || password == NULL) + { + LOGIF(LE,(skygw_log_write( + LOGFILE_ERROR, + "You must specify both username and password for the binlog router.\n"))); + return NULL; + } + + if ((auth_info = calloc(1, sizeof(MYSQL_session))) == NULL) + return NULL; + strcpy(auth_info->user, username); + strcpy(auth_info->db, database); + gw_sha1_str((const uint8_t *)password, strlen(password), auth_info->client_sha1); + + return auth_info; +} + +/** + * Distribute the binlog record we have just received to all the registered slaves. + * + * @param router The router instance + * @param hdr The replication event header + * @param ptr The raw replication event data + */ +static void +blr_distribute_binlog_record(ROUTER_INSTANCE *router, REP_HEADER *hdr, uint8_t *ptr) +{ +GWBUF *pkt; +uint8_t *buf; +ROUTER_SLAVE *slave; +int action; + + spinlock_acquire(&router->lock); + slave = router->slaves; + while (slave) + { + spinlock_acquire(&slave->catch_lock); + if ((slave->cstate & (CS_UPTODATE|CS_DIST)) == CS_UPTODATE) + { + /* Slave is up to date with the binlog and no distribute is + * running on this slave. + */ + action = 1; + slave->cstate |= CS_DIST; + } + else if ((slave->cstate & (CS_UPTODATE|CS_DIST)) == (CS_UPTODATE|CS_DIST)) + { + /* Slave is up to date with the binlog and a distribute is + * running on this slave. 
+ */ + slave->overrun = 1; + action = 2; + } + else if ((slave->cstate & CS_UPTODATE) == 0) + { + /* Slave is in catchup mode */ + action = 3; + } + slave->stats.n_actions[action-1]++; + spinlock_release(&slave->catch_lock); + if (action == 1) + { + if ((slave->binlog_pos == hdr->next_pos - hdr->event_size) + && (strcmp(slave->binlogfile, router->binlog_name) == 0 || + hdr->event_type == ROTATE_EVENT)) + { + pkt = gwbuf_alloc(hdr->event_size + 5); + buf = GWBUF_DATA(pkt); + encode_value(buf, hdr->event_size + 1, 24); + buf += 3; + *buf++ = slave->seqno++; + *buf++ = 0; // OK + memcpy(buf, ptr, hdr->event_size); + if (hdr->event_type == ROTATE_EVENT) + { + blr_slave_rotate(slave, ptr); + } + slave->dcb->func.write(slave->dcb, pkt); + if (hdr->event_type != ROTATE_EVENT) + { + slave->binlog_pos = hdr->next_pos; + } + spinlock_acquire(&slave->catch_lock); + if (slave->overrun) + { + slave->stats.n_overrun++; + slave->overrun = 0; + spinlock_release(&router->lock); + slave->cstate &= ~(CS_UPTODATE|CS_DIST); + spinlock_release(&slave->catch_lock); + blr_slave_catchup(router, slave); + spinlock_acquire(&router->lock); + slave = router->slaves; + if (slave) + continue; + else + break; + } + else + { + slave->cstate &= ~CS_DIST; + } + spinlock_release(&slave->catch_lock); + } + else if ((slave->binlog_pos > hdr->next_pos - hdr->event_size) + && strcmp(slave->binlogfile, router->binlog_name) == 0) + { + LOGIF(LE, (skygw_log_write_flush(LOGFILE_ERROR, + "Slave %d is ahead of expected position %s@%d. " + "Expected position %d", + slave->serverid, slave->binlogfile, + slave->binlog_pos, + hdr->next_pos - hdr->event_size))); + } + else if ((hdr->event_type != ROTATE_EVENT) + && (slave->binlog_pos != hdr->next_pos - hdr->event_size || + strcmp(slave->binlogfile, router->binlog_name) != 0)) + { + /* Check slave is in catchup mode and if not + * force it to go into catchup mode. 
+ */ + if (slave->cstate & CS_UPTODATE) + { + spinlock_release(&router->lock); + LOGIF(LD, (skygw_log_write_flush(LOGFILE_DEBUG, + "Force slave %d into catchup mode %s@%d\n", + slave->serverid, slave->binlogfile, + slave->binlog_pos))); + spinlock_acquire(&slave->catch_lock); + slave->cstate &= ~(CS_UPTODATE|CS_DIST); + spinlock_release(&slave->catch_lock); + blr_slave_catchup(router, slave); + spinlock_acquire(&router->lock); + slave = router->slaves; + if (slave) + continue; + else + break; + } + } + } + + slave = slave->next; + } + spinlock_release(&router->lock); +} + +static void +blr_log_packet(logfile_id_t file, char *msg, uint8_t *ptr, int len) +{ +char buf[400], *bufp; +int i; + + bufp = buf; + bufp += sprintf(bufp, "%s length = %d: ", msg, len); + for (i = 0; i < len && i < 40; i++) + bufp += sprintf(bufp, "0x%02x ", ptr[i]); + if (i < len) + skygw_log_write_flush(file, "%s...\n", buf); + else + skygw_log_write_flush(file, "%s\n", buf); + +} diff --git a/server/modules/routing/binlog/blr_slave.c b/server/modules/routing/binlog/blr_slave.c new file mode 100644 index 000000000..176efbe4c --- /dev/null +++ b/server/modules/routing/binlog/blr_slave.c @@ -0,0 +1,944 @@ +/* + * This file is distributed as part of MaxScale. It is free + * software: you can redistribute it and/or modify it under the terms of the + * GNU General Public License as published by the Free Software Foundation, + * version 2. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Copyright SkySQL Ab 2014 + */ + +/** + * @file blr_slave.c - contains code for the router to slave communication + * + * The binlog router is designed to be used in replication environments to + * increase the replication fanout of a master server. It provides a transparant + * mechanism to read the binlog entries for multiple slaves while requiring + * only a single connection to the actual master to support the slaves. + * + * The current prototype implement is designed to support MySQL 5.6 and has + * a number of limitations. This prototype is merely a proof of concept and + * should not be considered production ready. + * + * @verbatim + * Revision History + * + * Date Who Description + * 14/04/2014 Mark Riddoch Initial implementation + * + * @endverbatim + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + + +static uint32_t extract_field(uint8_t *src, int bits); +static void encode_value(unsigned char *data, unsigned int value, int len); +static int blr_slave_query(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, GWBUF *queue); +static int blr_slave_replay(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, GWBUF *master); +static void blr_slave_send_error(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, char *msg); +static int blr_slave_send_timestamp(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave); +static int blr_slave_register(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, GWBUF *queue); +static int blr_slave_binlog_dump(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, GWBUF *queue); +int blr_slave_catchup(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave); +static uint8_t *blr_build_header(GWBUF *pkt, REP_HEADER *hdr); +static int blr_slave_callback(DCB *dcb, DCB_REASON reason, void *data); + +extern int lm_enabled_logfiles_bitmask; + +/** + * Process a request packet from the slave server. 
+ * + * The router can handle a limited subset of requests from the slave, these + * include a subset of general SQL queries, a slave registeration command and + * the binlog dump command. + * + * The strategy for responding to these commands is to use caches responses + * for the the same commands that have previously been made to the real master + * if this is possible, if it is not then the router itself will synthesize a + * response. + * + * @param router The router instance this defines the master for this replication chain + * @param slave The slave specific data + * @param queue The incoming request packet + */ +int +blr_slave_request(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, GWBUF *queue) +{ + if (slave->state < 0 || slave->state > BLRS_MAXSTATE) + { + LOGIF(LE, (skygw_log_write( + LOGFILE_ERROR, "Invalid slave state machine state (%d) for binlog router.\n", + slave->state))); + gwbuf_consume(queue, gwbuf_length(queue)); + return 0; + } + + atomic_add(&slave->stats.n_requests, 1); + switch (MYSQL_COMMAND(queue)) + { + case COM_QUERY: + return blr_slave_query(router, slave, queue); + break; + case COM_REGISTER_SLAVE: + return blr_slave_register(router, slave, queue); + break; + case COM_BINLOG_DUMP: + return blr_slave_binlog_dump(router, slave, queue); + break; + case COM_QUIT: + LOGIF(LD, (skygw_log_write(LOGFILE_DEBUG, + "COM_QUIT received from slave with server_id %d\n", + slave->serverid))); + break; + default: + LOGIF(LE, (skygw_log_write( + LOGFILE_ERROR, + "Unexpected MySQL Command (%d) received from slave\n", + MYSQL_COMMAND(queue)))); + break; + } + return 0; +} + +/** + * Handle a query from the slave. This is expected to be one of the "standard" + * queries we expect as part of the registraton process. Most of these can + * be dealt with by replying the stored responses we got from the master + * when MaxScale registered as a slave. The exception to the rule is the + * request to obtain the current timestamp value of the server. 
+ *
+ * Five select statements are currently supported:
+ *      SELECT UNIX_TIMESTAMP();
+ *      SELECT @master_binlog_checksum
+ *      SELECT @@GLOBAL.GTID_MODE
+ *      SELECT VERSION()
+ *      SELECT 1
+ *
+ * Two show commands are supported:
+ *      SHOW VARIABLES LIKE 'SERVER_ID'
+ *      SHOW VARIABLES LIKE 'SERVER_UUID'
+ *
+ * Five set commands are supported:
+ *      SET @master_binlog_checksum = @@global.binlog_checksum
+ *      SET @master_heartbeat_period=...
+ *      SET @slave_uuid=...
+ *      SET NAMES latin1
+ *      SET NAMES utf8
+ *
+ * Any other statement, including one that is too short to contain the words
+ * being matched, is logged and answered with an error packet.
+ *
+ * @param router    The router instance, this defines the master for this replication chain
+ * @param slave     The slave specific data
+ * @param queue     The incoming request packet
+ * @return Non-zero if data has been sent
+ */
+static int
+blr_slave_query(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, GWBUF *queue)
+{
+    enum { MAX_WORDS = 4 };     /* Longest match: SHOW VARIABLES LIKE '<name>' */
+    char *qtext, *query_text;
+    char *sep = " ,=";
+    char *brkb = NULL;
+    const char *word[MAX_WORDS];
+    GWBUF *reply = NULL;
+    int query_len, i;
+    int matched = 0, want_timestamp = 0;
+
+    qtext = GWBUF_DATA(queue);
+    query_len = extract_field((uint8_t *)qtext, 24) - 1;
+    if (query_len < 0)
+        query_len = 0;          /* Zero length payload: there is no statement text */
+    qtext += 5;                 // Skip header and first byte of the payload
+    if ((query_text = strndup(qtext, query_len)) == NULL)
+    {
+        LOGIF(LE, (skygw_log_write(LOGFILE_ERROR,
+            "Failed to allocate memory to parse query from slave.\n")));
+        return 0;
+    }
+
+    LOGIF(LT, (skygw_log_write(
+        LOGFILE_TRACE, "Execute statement from the slave '%s'\n", query_text)));
+    /*
+     * Implement a very rudimentary "parsing" of the query text by extracting
+     * the words from the statement and matching them against the subset of
+     * queries we are expecting from the slave. We already have responses to
+     * these commands, except for the select of UNIX_TIMESTAMP(), that we have
+     * saved from MaxScale's own interaction with the real master. We simply
+     * replay these saved responses to the slave.
+     *
+     * strtok_r returns NULL once the text is exhausted. Missing words are
+     * stored as an empty string so that every comparison below is safe and
+     * simply fails to match.
+     */
+    for (i = 0; i < MAX_WORDS; i++)
+    {
+        char *tok = NULL;
+
+        /* Tokens are never empty, so an empty previous word means "exhausted" */
+        if (i == 0 || word[i - 1][0] != '\0')
+            tok = strtok_r(i == 0 ? query_text : NULL, sep, &brkb);
+        word[i] = tok ? tok : "";
+    }
+
+    if (strcasecmp(word[0], "SELECT") == 0)
+    {
+        if (strcasecmp(word[1], "UNIX_TIMESTAMP()") == 0)
+        {
+            matched = 1;
+            want_timestamp = 1;
+        }
+        else if (strcasecmp(word[1], "@master_binlog_checksum") == 0)
+        {
+            matched = 1;
+            reply = router->saved_master.chksum2;
+        }
+        else if (strcasecmp(word[1], "@@GLOBAL.GTID_MODE") == 0)
+        {
+            matched = 1;
+            reply = router->saved_master.gtid_mode;
+        }
+        else if (strcasecmp(word[1], "1") == 0)
+        {
+            matched = 1;
+            reply = router->saved_master.select1;
+        }
+        else if (strcasecmp(word[1], "VERSION()") == 0)
+        {
+            matched = 1;
+            reply = router->saved_master.selectver;
+        }
+    }
+    else if (strcasecmp(word[0], "SHOW") == 0)
+    {
+        if (strcasecmp(word[1], "VARIABLES") == 0 &&
+            strcasecmp(word[2], "LIKE") == 0)
+        {
+            if (strcasecmp(word[3], "'SERVER_ID'") == 0)
+            {
+                matched = 1;
+                reply = router->saved_master.server_id;
+            }
+            else if (strcasecmp(word[3], "'SERVER_UUID'") == 0)
+            {
+                matched = 1;
+                reply = router->saved_master.uuid;
+            }
+        }
+    }
+    else if (strcasecmp(word[0], "SET") == 0)
+    {
+        if (strcasecmp(word[1], "@master_heartbeat_period") == 0)
+        {
+            matched = 1;
+            reply = router->saved_master.heartbeat;
+        }
+        else if (strcasecmp(word[1], "@master_binlog_checksum") == 0)
+        {
+            /* Remember whether the slave asked for events without a CRC */
+            slave->nocrc = (strcasecmp(word[2], "'none'") == 0) ? 1 : 0;
+            matched = 1;
+            reply = router->saved_master.chksum1;
+        }
+        else if (strcasecmp(word[1], "@slave_uuid") == 0)
+        {
+            matched = 1;
+            reply = router->saved_master.setslaveuuid;
+        }
+        else if (strcasecmp(word[1], "NAMES") == 0)
+        {
+            if (strcasecmp(word[2], "latin1") == 0)
+            {
+                matched = 1;
+                reply = router->saved_master.setnames;
+            }
+            else if (strcasecmp(word[2], "utf8") == 0)
+            {
+                matched = 1;
+                reply = router->saved_master.utf8;
+            }
+        }
+    }
+    free(query_text);       /* word[] points into this buffer, it is dead from here on */
+
+    if (want_timestamp)
+        return blr_slave_send_timestamp(router, slave);
+    if (matched)
+        return blr_slave_replay(router, slave, reply);
+
+    /* Tokenising destroyed the first copy, take a fresh one for the log */
+    query_text = strndup(qtext, query_len);
+    LOGIF(LE, (skygw_log_write(
+        LOGFILE_ERROR, "Unexpected query from slave server %s\n",
+        query_text ? query_text : "")));
+    free(query_text);
+    blr_slave_send_error(router, slave, "Unexpected SQL query received from slave.");
+    return 0;
+}
+
+
+/**
+ * Send a reply to a command we have received from the slave. The reply itself
+ * is merely a copy of a previous message we received from the master when we
+ * registered as a slave. Hence we just replay this saved reply.
+ *
+ * The saved response is cloned so that the original stays available for
+ * other slaves; nothing is sent if there is no saved response.
+ *
+ * @param router    The binlog router instance
+ * @param slave     The slave server to which we are sending the response
+ * @param master    The saved master response, may be NULL
+ * @return Non-zero if data was sent
+ */
+static int
+blr_slave_replay(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, GWBUF *master)
+{
+    GWBUF *clone;
+
+    if (!master)
+        return 0;
+    if ((clone = gwbuf_clone(master)) == NULL)
+    {
+        LOGIF(LE, (skygw_log_write(LOGFILE_ERROR,
+            "Failed to clone server response to send to slave.\n")));
+        return 0;
+    }
+    return slave->dcb->func.write(slave->dcb, clone);
+}
+
+/**
+ * Construct an error response and send it to the slave.
+ *
+ * The packet is a 4 byte header (3 byte payload length, 1 byte sequence id)
+ * followed by the payload: the 0xff error marker, a two byte error code,
+ * the six byte SQL state marker "#00000" and the message text.
+ *
+ * @param router    The router instance
+ * @param slave     The slave server instance
+ * @param msg       The error message to send
+ */
+static void
+blr_slave_send_error(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, char *msg)
+{
+    GWBUF *pkt;
+    unsigned char *data;
+    size_t msglen = strlen(msg);
+
+    /* 4 bytes of header + 9 bytes of fixed payload + the message itself */
+    if ((pkt = gwbuf_alloc(msglen + 13)) == NULL)
+        return;
+    data = GWBUF_DATA(pkt);
+    encode_value(&data[0], msglen + 9, 24);     // Payload length (header excluded)
+    data[3] = 1;                                // Sequence id, the request was packet 0
+                                                // Payload
+    data[4] = 0xff;                             // Error indicator
+    data[5] = 0;                                // Error Code
+    data[6] = 0;                                // Error Code
+    memcpy(&data[7], "#00000", 6);              // SQL state marker and state
+    memcpy(&data[13], msg, msglen);             // Error Message
+    slave->dcb->func.write(slave->dcb, pkt);
+}
+
+/*
+ * Some standard packets that have been captured from a network trace of server
+ * interactions. These packets are the schema definition sent in response to
+ * a SELECT UNIX_TIMESTAMP() statement and the EOF packet that marks the end
+ * of transmission of the result set.
+ */
+static uint8_t timestamp_def[] = {
+    0x01, 0x00, 0x00, 0x01, 0x01, 0x26, 0x00, 0x00, 0x02, 0x03, 0x64, 0x65, 0x66, 0x00, 0x00, 0x00,
+    0x10, 0x55, 0x4e, 0x49, 0x58, 0x5f, 0x54, 0x49, 0x4d, 0x45, 0x53, 0x54, 0x41, 0x4d, 0x50, 0x28,
+    0x29, 0x00, 0x0c, 0x3f, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x08, 0x81, 0x00, 0x00, 0x00, 0x00, 0x05,
+    0x00, 0x00, 0x03, 0xfe, 0x00, 0x00, 0x02, 0x00
+};
+static uint8_t timestamp_eof[] = { 0x05, 0x00, 0x00, 0x05, 0xfe, 0x00, 0x00, 0x02, 0x00 };
+
+/**
+ * Send a response to a "SELECT UNIX_TIMESTAMP()" request. This differs from the other
+ * requests since we do not save a copy of the original interaction with the master
+ * and simply replay it. We want to always send the current time. We have stored a typical
+ * response, which gives us the schema information normally returned. This is sent to the
+ * client and then we add a dynamic part that will insert the current timestamp data.
+ * Finally we send a pre-prepared EOF packet to end the response stream.
+ *
+ * The single buffer that is written holds, in order: the captured column
+ * definition packets, one row packet carrying the current time as a decimal
+ * string, and the captured EOF packet.
+ *
+ * @param router    The binlog router instance
+ * @param slave     The slave server to which we are sending the response
+ * @return Non-zero if data was sent
+ */
+static int
+blr_slave_send_timestamp(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave)
+{
+    GWBUF *pkt;
+    char timestamp[32];
+    uint8_t *ptr;
+    int len, ts_len;
+
+    /* time_t is not necessarily a long, convert explicitly to match "%ld" */
+    snprintf(timestamp, sizeof(timestamp), "%ld", (long)time(NULL));
+    ts_len = strlen(timestamp);
+    /* Row packet = 3 byte length + 1 byte sequence + 1 byte string length + text */
+    len = sizeof(timestamp_def) + sizeof(timestamp_eof) + 5 + ts_len;
+    if ((pkt = gwbuf_alloc(len)) == NULL)
+        return 0;
+    ptr = GWBUF_DATA(pkt);
+    memcpy(ptr, timestamp_def, sizeof(timestamp_def));  // Fixed preamble
+    ptr += sizeof(timestamp_def);
+    encode_value(ptr, ts_len + 1, 24);                  // Add length of data packet
+    ptr += 3;
+    *ptr++ = 0x04;                                      // Sequence number in response
+    *ptr++ = ts_len;                                    // Length of result string
+    memcpy(ptr, timestamp, ts_len);                     // Result string, no terminator on the wire
+    ptr += ts_len;
+    memcpy(ptr, timestamp_eof, sizeof(timestamp_eof));  // EOF packet to terminate result
+    return slave->dcb->func.write(slave->dcb, pkt);
+}
+
+/**
+ * Process a slave replication registration message.
+ *
+ * We store the various bits of information the slave gives us and generate
+ * a reply message.
+ *
+ * The payload that follows the command byte is read as: a 4 byte server id,
+ * three strings (hostname, user, password) each preceded by a one byte
+ * length, a 2 byte port and a 4 byte rank. The strings are copied into
+ * freshly allocated memory; an empty string is stored as NULL.
+ *
+ * @param router    The router instance
+ * @param slave     The slave server
+ * @param queue     The COM_REGISTER_SLAVE packet
+ * @return Non-zero if data was sent
+ */
+static int
+blr_slave_register(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, GWBUF *queue)
+{
+    GWBUF *resp;
+    uint8_t *ptr;
+    int slen;
+
+    ptr = GWBUF_DATA(queue);
+    ptr += 4;       // Skip length and sequence number
+    if (*ptr++ != COM_REGISTER_SLAVE)
+        return 0;
+    slave->serverid = extract_field(ptr, 32);
+    ptr += 4;
+
+    /*
+     * For each string: read the length byte, copy the text that starts at
+     * the current position and only then step over it.
+     */
+    slen = *ptr++;
+    slave->hostname = (slen != 0) ? strndup((char *)ptr, slen) : NULL;
+    ptr += slen;
+
+    slen = *ptr++;
+    slave->user = (slen != 0) ? strndup((char *)ptr, slen) : NULL;
+    ptr += slen;
+
+    slen = *ptr++;
+    slave->passwd = (slen != 0) ? strndup((char *)ptr, slen) : NULL;
+    ptr += slen;
+
+    slave->port = extract_field(ptr, 16);
+    ptr += 2;
+    slave->rank = extract_field(ptr, 32);
+
+    /*
+     * Now construct a response: 4 byte header plus a 7 byte payload
+     */
+    if ((resp = gwbuf_alloc(11)) == NULL)
+        return 0;
+    ptr = GWBUF_DATA(resp);
+    encode_value(ptr, 7, 24);       // Payload length
+    ptr += 3;
+    *ptr++ = 1;                     // Sequence number
+    encode_value(ptr, 0, 24);
+    ptr += 3;
+    encode_value(ptr, slave->serverid, 32);
+    slave->state = BLRS_REGISTERED;
+    return slave->dcb->func.write(slave->dcb, resp);
+}
+
+/**
+ * Process a COM_BINLOG_DUMP message from the slave. This is the
+ * final step in the process of registration. The new master, MaxScale
+ * must send a response packet and generate a fake BINLOG_ROTATE event
+ * with the binlog file requested by the slave. And then send a
+ * FORMAT_DESCRIPTION_EVENT that has been saved from the real master.
+ *
+ * Once sent MaxScale must continue to send binlog events to the slave.
+ *
+ * The payload that follows the command byte is read as a 4 byte binlog
+ * position, 2 bytes of flags, a 4 byte server id and the binlog file name,
+ * which occupies the remainder of the packet and carries no terminator.
+ *
+ * @param router    The router instance
+ * @param slave     The slave server
+ * @param queue     The BINLOG_DUMP packet
+ * @return 0 on a malformed request or allocation failure, otherwise the
+ *         result of the last write to the slave or of blr_slave_catchup
+ */
+static int
+blr_slave_binlog_dump(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, GWBUF *queue)
+{
+    GWBUF *resp;
+    uint8_t *ptr;
+    int len, namelen, rval;
+    REP_HEADER hdr;
+    uint32_t chksum;
+
+    ptr = GWBUF_DATA(queue);
+    len = extract_field(ptr, 24);
+    ptr += 4;       // Skip length and sequence number
+    if (*ptr++ != COM_BINLOG_DUMP)
+    {
+        LOGIF(LE, (skygw_log_write(
+            LOGFILE_ERROR,
+            "blr_slave_binlog_dump expected a COM_BINLOG_DUMP but received %d\n",
+            *(ptr-1))));
+        return 0;
+    }
+
+    slave->binlog_pos = extract_field(ptr, 32);
+    ptr += 4;
+    ptr += 2;       // Flags, not used
+    ptr += 4;       // Server id of the slave, not used
+
+    /*
+     * Whatever is left of the payload is the file name: the payload length
+     * less the command byte (1), position (4), flags (2) and server id (4).
+     * Clear the field first so that the name is always terminated and the
+     * fixed size copy into the rotate event below is zero padded.
+     * Assumes binlogfile holds at least BINLOG_FNAMELEN + 1 bytes, as
+     * blr_slave_rotate also relies on.
+     */
+    namelen = len - 11;
+    if (namelen < 0)
+        namelen = 0;
+    if (namelen > BINLOG_FNAMELEN)
+        namelen = BINLOG_FNAMELEN;
+    memset(slave->binlogfile, 0, BINLOG_FNAMELEN + 1);
+    memcpy(slave->binlogfile, ptr, namelen);
+
+    slave->state = BLRS_DUMPING;
+    slave->seqno = 1;
+
+    /* Size of the fake rotate event, with or without the trailing 4 byte CRC */
+    if (slave->nocrc)
+        len = 0x2b;
+    else
+        len = 0x2f;
+
+    // Build a fake rotate event
+    if ((resp = gwbuf_alloc(len + 5)) == NULL)
+        return 0;
+    hdr.payload_len = len + 1;
+    hdr.seqno = slave->seqno++;
+    hdr.ok = 0;
+    hdr.timestamp = 0L;
+    hdr.event_type = ROTATE_EVENT;
+    hdr.serverid = router->masterid;
+    hdr.event_size = len;
+    hdr.next_pos = 0;
+    hdr.flags = 0x20;
+    ptr = blr_build_header(resp, &hdr);
+    encode_value(ptr, slave->binlog_pos, 64);
+    ptr += 8;
+    memcpy(ptr, slave->binlogfile, BINLOG_FNAMELEN);
+    ptr += BINLOG_FNAMELEN;
+
+    if (!slave->nocrc)
+    {
+        /*
+         * Now add the CRC to the fake binlog rotate event.
+         *
+         * The algorithm is first to compute the checksum of an empty buffer
+         * and then the checksum of the event portion of the message, ie we do not
+         * include the length, sequence number and ok byte that makes up the first
+         * 5 bytes of the message. We also do not include the 4 byte checksum itself.
+         */
+        chksum = crc32(0L, NULL, 0);
+        chksum = crc32(chksum, GWBUF_DATA(resp) + 5, hdr.event_size - 4);
+        encode_value(ptr, chksum, 32);
+    }
+
+    rval = slave->dcb->func.write(slave->dcb, resp);
+
+    /* Send the FORMAT_DESCRIPTION_EVENT */
+    if (router->saved_master.fde_event)
+    {
+        if ((resp = gwbuf_alloc(router->saved_master.fde_len + 5)) == NULL)
+            return 0;
+        ptr = GWBUF_DATA(resp);
+        encode_value(ptr, router->saved_master.fde_len + 1, 24); // Payload length
+        ptr += 3;
+        *ptr++ = slave->seqno++;
+        *ptr++ = 0;     // OK
+        memcpy(ptr, router->saved_master.fde_event, router->saved_master.fde_len);
+        encode_value(ptr, time(0), 32);     // Overwrite timestamp
+        /*
+         * Since we have changed the timestamp we must recalculate the CRC
+         *
+         * Position ptr at the last four bytes of the event,
+         * calculate a new checksum
+         * and write it there
+         *
+         * NOTE(review): this is done even when slave->nocrc is set - confirm
+         * the saved event always ends in a checksum.
+         */
+        ptr = GWBUF_DATA(resp) + 5 + router->saved_master.fde_len - 4;
+        chksum = crc32(0L, NULL, 0);
+        chksum = crc32(chksum, GWBUF_DATA(resp) + 5, router->saved_master.fde_len - 4);
+        encode_value(ptr, chksum, 32);
+        rval = slave->dcb->func.write(slave->dcb, resp);
+    }
+
+    /* Arm the flow control callbacks used while the slave catches up */
+    slave->dcb->low_water = router->low_water;
+    slave->dcb->high_water = router->high_water;
+    dcb_add_callback(slave->dcb, DCB_REASON_LOW_WATER, blr_slave_callback, slave);
+    dcb_add_callback(slave->dcb, DCB_REASON_DRAINED, blr_slave_callback, slave);
+
+    /* A slave that is not at the router's current position must be caught up */
+    if (slave->binlog_pos != router->binlog_position ||
+        strcmp(slave->binlogfile, router->binlog_name) != 0)
+    {
+        spinlock_acquire(&slave->catch_lock);
+        slave->cstate &= ~CS_UPTODATE;
+        spinlock_release(&slave->catch_lock);
+        rval = blr_slave_catchup(router, slave);
+    }
+
+    return rval;
+}
+
+/**
+ * Extract a numeric field from a packet of the specified number of bits,
+ * the number of bits must be a multiple of 8.
+ *
+ * The field is read least significant byte first. At most 32 bits are
+ * extracted since that is all the return type can hold.
+ *
+ * @param src       The raw packet source
+ * @param bits      The number of bits to extract (multiple of 8, at most 32)
+ * @return          The extracted value
+ */
+static uint32_t
+extract_field(uint8_t *src, int bits)
+{
+    uint32_t rval = 0, shift = 0;
+
+    while (bits > 0 && shift < 32)
+    {
+        /*
+         * Widen before shifting: a uint8_t operand is promoted to a signed
+         * int and shifting a byte >= 0x80 left by 24 would overflow it.
+         */
+        rval |= (uint32_t)(*src++) << shift;
+        shift += 8;
+        bits -= 8;
+    }
+    return rval;
+}
+
+/**
+ * Encode a value into a number of bits in a MySQL packet
+ *
+ * The value is written least significant byte first. If more bits are
+ * requested than the value holds the remaining bytes are written as zero.
+ *
+ * @param data      Pointer to location in target packet
+ * @param value     The value to encode into the buffer
+ * @param len       Number of bits to encode value into (multiple of 8)
+ */
+static void
+encode_value(unsigned char *data, unsigned int value, int len)
+{
+    while (len > 0)
+    {
+        *data++ = value & 0xff;
+        value >>= 8;
+        len -= 8;
+    }
+}
+
+
+/**
+ * Write a replication message header into the start of a GWBUF.
+ *
+ * The bytes written are the 3 byte payload length, the sequence number and
+ * the OK byte, followed by the 19 byte event header: timestamp, event type,
+ * server id, event size, next position and flags.
+ *
+ * @param pkt       The packet buffer to write the header into
+ * @param hdr       The header values to encode
+ * @return          A pointer to the first byte following the event header
+ */
+static uint8_t *
+blr_build_header(GWBUF *pkt, REP_HEADER *hdr)
+{
+    uint8_t *ptr;
+
+    ptr = GWBUF_DATA(pkt);
+
+    encode_value(ptr, hdr->payload_len, 24);
+    ptr += 3;
+    *ptr++ = hdr->seqno;
+    *ptr++ = hdr->ok;
+    encode_value(ptr, hdr->timestamp, 32);
+    ptr += 4;
+    *ptr++ = hdr->event_type;
+    encode_value(ptr, hdr->serverid, 32);
+    ptr += 4;
+    encode_value(ptr, hdr->event_size, 32);
+    ptr += 4;
+    encode_value(ptr, hdr->next_pos, 32);
+    ptr += 4;
+    encode_value(ptr, hdr->flags, 16);
+    ptr += 2;
+
+    return ptr;
+}
+
+/**
+ * We have a registered slave that is behind the current leading edge of the
+ * binlog. We must replay the log entries to bring this node up to speed.
+ *
+ * There may be a large number of records to send to the slave, the process
+ * is triggered by the slave COM_BINLOG_DUMP message and all the events must
+ * be sent without receiving any new event. This means there is no trigger into
+ * MaxScale other than this initial message.
+ * However, if we simply send all the
+ * events we end up with an extremely long write queue on the DCB and risk running
+ * the server out of resources.
+ *
+ * To resolve this the concept of high and low water marks within the DCB has been
+ * added, with the ability for the DCB code to call user defined callbacks when the
+ * write queue is completely drained, when it crosses above the high water mark and
+ * when it crosses below the low water mark.
+ *
+ * The blr_slave_catchup routine will send binlog events to the slave until the high
+ * water mark is reached, at which point it will return. Later, when a low water mark
+ * callback is generated by the code that drains the DCB of data the blr_slave_catchup
+ * routine will again be called to write more events. The process is repeated until
+ * the slave has caught up with the master.
+ *
+ * Note: an additional check that the DCB is still above the low water mark is done
+ * prior to the return from this function to allow for any delays due to the call to
+ * the close system call, since this may cause thread rescheduling.
+ *
+ * @param router    The binlog router
+ * @param slave     The slave that is behind
+ * @return          0 if the binlog file could not be opened on entry to a burst,
+ *                  1 if this call returned without sending anything, otherwise
+ *                  the result of the last write to the slave DCB
+ */
+int
+blr_slave_catchup(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave)
+{
+GWBUF *head, *record;
+REP_HEADER hdr;
+int written, fd, rval = 1, burst = 0;   /* NOTE(review): burst is counted but never read */
+uint8_t *ptr;
+struct timespec req;
+
+
+    /* We are running now, so no callback is outstanding any more */
+    spinlock_acquire(&slave->catch_lock);
+    slave->cstate &= ~CS_EXPECTCB;
+    spinlock_release(&slave->catch_lock);
+doitagain:
+    /*
+     * We have a slightly complex synchronisation mechanism here,
+     * we need to make sure that we do not have multiple threads
+     * running the catchup loop, but we need to be very careful
+     * that we do not lose a call that is coming via a callback
+     * call as this will stall the binlog catchup process.
+     *
+     * We don't want to simply use a traditional mutex here for
+     * the loop, since this would block a MaxScale thread for
+     * an unacceptable length of time.
+     *
+     * We have two status bits, the CS_READING that says we are
+     * in the outer loop and the CS_INNERLOOP, to say we are in
+     * the inner loop.
+     *
+     * If just CS_READING is set the other thread may be about to
+     * enter the inner loop or may be about to exit the function
+     * completely. Therefore we have to wait to see if CS_READING
+     * is cleared or CS_INNERLOOP is set.
+     *
+     * If CS_READING gets cleared then this thread should proceed
+     * into the loop.
+     *
+     * If CS_INNERLOOP gets set then this thread does not need to
+     * proceed.
+     *
+     * If CS_READING is not set then this thread simply enters the
+     * loop.
+     */
+    req.tv_sec = 0;
+    req.tv_nsec = 1000;     /* Poll interval while waiting on the other thread */
+    spinlock_acquire(&slave->catch_lock);
+    if (slave->cstate & CS_UPTODATE)
+    {
+        /* Nothing to do, the slave has already been marked as up to date */
+        LOGIF(LM, (skygw_log_write(LOGFILE_MESSAGE,
+            "blr_slave_catchup called with up to date slave %d at "
+            "%s@%d. Reading position %s@%d\n",
+            slave->serverid, slave->binlogfile,
+            slave->binlog_pos, router->binlog_name,
+            router->binlog_position)));
+        slave->stats.n_alreadyupd++;
+        spinlock_release(&slave->catch_lock);
+        return 1;
+    }
+    while (slave->cstate & CS_READING)
+    {
+        // Wait until we know what the other thread is doing
+        while ((slave->cstate & (CS_READING|CS_INNERLOOP)) == CS_READING)
+        {
+            spinlock_release(&slave->catch_lock);
+            nanosleep(&req, NULL);
+            spinlock_acquire(&slave->catch_lock);
+        }
+        // Other thread is in the innerloop
+        if ((slave->cstate & (CS_READING|CS_INNERLOOP)) == (CS_READING|CS_INNERLOOP))
+        {
+            spinlock_release(&slave->catch_lock);
+            LOGIF(LM, (skygw_log_write(
+                LOGFILE_MESSAGE,
+                "blr_slave_catchup thread returning due to "
+                "lock being held by another thread. %s@%d\n",
+                slave->binlogfile,
+                slave->binlog_pos)));
+            /* NOTE(review): this counter is updated after catch_lock was released */
+            slave->stats.n_catchupnr++;
+            return 1;   // We cheat here and return 1 because otherwise
+                        // an error would be sent and we do not want that
+        }
+
+        /* Release the lock for a short time to allow the other
+         * thread to exit the outer reading loop.
+         */
+        spinlock_release(&slave->catch_lock);
+        nanosleep(&req, NULL);
+        spinlock_acquire(&slave->catch_lock);
+    }
+    /* CS_READING is clear and catch_lock is held: claim the reader role */
+    if (slave->pthread)
+        LOGIF(LD, (skygw_log_write(LOGFILE_DEBUG, "Multiple threads sending to same thread.\n")));
+    slave->pthread = pthread_self();
+    slave->cstate |= CS_READING;
+    spinlock_release(&slave->catch_lock);
+
+    if (DCB_ABOVE_HIGH_WATER(slave->dcb))
+        LOGIF(LT, (skygw_log_write(LOGFILE_TRACE, "blr_slave_catchup above high water on entry.\n")));
+
+    /* Outer loop: one pass per burst of events written to the slave */
+    do {
+        if ((fd = blr_open_binlog(router, slave->binlogfile)) == -1)
+        {
+            spinlock_acquire(&slave->catch_lock);
+            slave->cstate &= ~CS_READING;
+            spinlock_release(&slave->catch_lock);
+            LOGIF(LE, (skygw_log_write(
+                LOGFILE_ERROR,
+                "blr_slave_catchup failed to open binlog file %s\n",
+                slave->binlogfile)));
+            return 0;
+        }
+        atomic_add(&slave->stats.n_bursts, 1);
+        spinlock_acquire(&slave->catch_lock);
+        slave->cstate |= CS_INNERLOOP;
+        spinlock_release(&slave->catch_lock);
+        /*
+         * Inner loop: send events until the write queue crosses the high
+         * water mark or no further event can be read.
+         *
+         * NOTE(review): if the DCB is already above high water on the very
+         * first test, record is never assigned yet it is read after the loop.
+         */
+        while ((!DCB_ABOVE_HIGH_WATER(slave->dcb)) &&
+            (record = blr_read_binlog(fd, slave->binlog_pos, &hdr)) != NULL)
+        {
+if (hdr.event_size > DEF_HIGH_WATER) slave->stats.n_above++;
+            /*
+             * Prefix the event with the 5 byte packet header.
+             * NOTE(review): the result of gwbuf_alloc is not checked.
+             */
+            head = gwbuf_alloc(5);
+            ptr = GWBUF_DATA(head);
+            encode_value(ptr, hdr.event_size + 1, 24);
+            ptr += 3;
+            *ptr++ = slave->seqno++;
+            *ptr++ = 0;     // OK
+            head = gwbuf_append(head, record);
+            if (hdr.event_type == ROTATE_EVENT)
+            {
+                /* Switch to the file named in the rotate event */
+                close(fd);
+                blr_slave_rotate(slave, GWBUF_DATA(record));
+                if ((fd = blr_open_binlog(router, slave->binlogfile)) == -1)
+                {
+                    LOGIF(LE, (skygw_log_write(
+                        LOGFILE_ERROR,
+                        "blr_slave_catchup failed to open binlog file %s\n",
+                        slave->binlogfile)));
+                    /*
+                     * NOTE(review): head is neither written nor released on
+                     * this path, and fd is -1 when close(fd) is reached below.
+                     */
+                    break;
+                }
+            }
+            written = slave->dcb->func.write(slave->dcb, head);
+            /* A rotate event has already set the new position via blr_slave_rotate */
+            if (written && hdr.event_type != ROTATE_EVENT)
+            {
+                slave->binlog_pos = hdr.next_pos;
+            }
+            rval = written;
+            atomic_add(&slave->stats.n_events, 1);
+            burst++;
+        }
+        if (record == NULL)
+            slave->stats.n_failed_read++;
+        spinlock_acquire(&slave->catch_lock);
+        slave->cstate &= ~CS_INNERLOOP;
+        spinlock_release(&slave->catch_lock);
+
+        close(fd);
+    } while (record && DCB_BELOW_LOW_WATER(slave->dcb));
+    if (record)
+    {
+        /* Stopped with an event in hand: note that a callback is now expected */
+        atomic_add(&slave->stats.n_flows, 1);
+        spinlock_acquire(&slave->catch_lock);
+        slave->cstate |= CS_EXPECTCB;
+        spinlock_release(&slave->catch_lock);
+    }
+    else
+    {
+        /* No more events could be read: mark the slave as up to date */
+        int state_change = 0;
+        spinlock_acquire(&slave->catch_lock);
+        if ((slave->cstate & CS_UPTODATE) == 0)
+        {
+            atomic_add(&slave->stats.n_upd, 1);
+            slave->cstate |= CS_UPTODATE;
+            state_change = 1;
+        }
+        spinlock_release(&slave->catch_lock);
+        if (state_change)
+            LOGIF(LM, (skygw_log_write(LOGFILE_MESSAGE,
+                "blr_slave_catchup slave is up to date %s, %u\n",
+                slave->binlogfile, slave->binlog_pos)));
+    }
+    /* Give up the reader role */
+    spinlock_acquire(&slave->catch_lock);
+#if 0
+if (slave->pthread != pthread_self())
+{
+    LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, "Multple threads in catchup for same slave: %x and %x\n", slave->pthread, pthread_self())));
+abort();
+}
+#endif
+    slave->pthread = 0;
+#if 0
+if (DCB_BELOW_LOW_WATER(slave->dcb) && slave->binlog_pos != router->binlog_position) abort();
+#endif
+    slave->cstate &= ~CS_READING;
+    spinlock_release(&slave->catch_lock);
+/*
+ * The queue is below low water again while the slave position still differs
+ * from the router position, so go round again rather than return.
+ */
+if (DCB_BELOW_LOW_WATER(slave->dcb) && slave->binlog_pos != router->binlog_position)
+{
+    LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, "Expected to be above low water\n")));
+goto doitagain;
+}
+    return rval;
+}
+
+/**
+ * The DCB callback used by the slave to obtain DCB_REASON_LOW_WATER callbacks
+ * when the server sends all the queued data for a DCB. This is the mechanism
+ * that is used to implement the flow control mechanism for the sending of
+ * large quantities of binlog records during the catchup process.
+ *
+ * A DCB_REASON_DRAINED callback restarts the catchup only while the slave
+ * position differs from the router position; a DCB_REASON_LOW_WATER callback
+ * restarts it whenever the slave is in the dumping state.
+ *
+ * @param dcb       The DCB of the slave connection
+ * @param reason    The reason the callback was called
+ * @param data      The user data, in this case the ROUTER_SLAVE structure
+ * @return          Always 0
+ */
+static int
+blr_slave_callback(DCB *dcb, DCB_REASON reason, void *data)
+{
+    ROUTER_SLAVE *slave = (ROUTER_SLAVE *)data;
+    ROUTER_INSTANCE *router = slave->router;
+
+    if (reason == DCB_REASON_DRAINED)
+    {
+        if (slave->state == BLRS_DUMPING &&
+            slave->binlog_pos != router->binlog_position)
+        {
+            atomic_add(&slave->stats.n_dcb, 1);
+            blr_slave_catchup(router, slave);
+        }
+    }
+
+    if (reason == DCB_REASON_LOW_WATER)
+    {
+        if (slave->state == BLRS_DUMPING)
+        {
+            atomic_add(&slave->stats.n_cb, 1);
+            blr_slave_catchup(router, slave);
+        }
+        else
+        {
+            /* Low water callback for a slave that is not being sent binlogs */
+            atomic_add(&slave->stats.n_cbna, 1);
+        }
+    }
+    return 0;
+}
+
+/**
+ * Rotate the slave to the new binlog file
+ *
+ * The body of a rotate event is an 8 byte position, least significant byte
+ * first, followed by the name of the new binlog file.
+ *
+ * @param slave     The slave instance
+ * @param ptr       The rotate event (minus the packet header and OK byte)
+ */
+void
+blr_slave_rotate(ROUTER_SLAVE *slave, uint8_t *ptr)
+{
+    uint64_t pos;
+
+    ptr += 19;      // Skip event header
+    /*
+     * Widen to 64 bits before shifting: shifting a 32 bit value by 32 is
+     * undefined behaviour. The assignment narrows again if binlog_pos is
+     * a smaller type.
+     */
+    pos = extract_field(ptr, 32);
+    pos |= (uint64_t)extract_field(ptr + 4, 32) << 32;
+    slave->binlog_pos = pos;
+    /* NOTE(review): always copies BINLOG_FNAMELEN bytes whatever the real name length */
+    memcpy(slave->binlogfile, ptr + 8, BINLOG_FNAMELEN);
+    slave->binlogfile[BINLOG_FNAMELEN] = 0;
+}
diff --git a/server/modules/routing/debugcmd.c b/server/modules/routing/debugcmd.c index a9a5da12a..0a8fd2897 100644 --- a/server/modules/routing/debugcmd.c +++ b/server/modules/routing/debugcmd.c @@ -160,6 +160,10 @@ struct subcommand showoptions[] = { "Show all active sessions in MaxScale", "Show all active sessions in MaxScale", {0, 0, 0} }, + { "threads", 0, dShowThreads, + "Show the status of the polling threads in MaxScale", + "Show the status of the polling threads in MaxScale", + {0, 0, 0} }, { "users", 0, telnetdShowUsers, "Show statistics and user names for the debug interface", "Show statistics and user names for the debug interface", @@ -208,6 +212,10 @@ struct subcommand listoptions[] = { "List all the active sessions within MaxScale", "List
all the active sessions within MaxScale", {0, 0, 0} }, + { "threads", 0, dShowThreads, + "List the status of the polling threads in MaxScale", + "List the status of the polling threads in MaxScale", + {0, 0, 0} }, { NULL, 0, NULL, NULL, NULL, {0, 0, 0} } };